#!/usr/bin/env python3
"""
通用书籍合并工具 (Generic Book Combiner)
功能:
1. 自动扫描当前或指定目录。
2. 解析 SUMMARY.md 获取章节结构。
3. 解析 README.md 获取书籍标题和简介信息。
4. 生成 single-page.md 和 single-page.html。
"""
import re
import html
import argparse
import sys
from pathlib import Path
from datetime import datetime
# HTML page template for the generated single-page book.
# Placeholders are filled via str.format: {title}, {subtitle}, {content}.
# NOTE(review): the original markup was lost to tag stripping; this is a
# minimal valid reconstruction — confirm against the intended template.
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title}</title>
</head>
<body>
<header>
<h1>{title}</h1>
<p class="subtitle">{subtitle}</p>
</header>
<main>
{content}
</main>
</body>
</html>
"""
def extract_book_info(project_dir: Path) -> tuple[str, str]:
    """Extract the book title and subtitle from README.md or SUMMARY.md.

    README.md is preferred: its first H1 (within the first 10 lines) becomes
    the title and its first blockquote (within the first 20 lines) the
    subtitle.  If README.md is missing, unreadable, or has no H1, the first
    H1 in SUMMARY.md's first 5 lines is used as the title instead.

    Args:
        project_dir: Directory containing README.md / SUMMARY.md.

    Returns:
        (title, subtitle); defaults are ("Untitled Book", "Generated Book").
    """
    title = "Untitled Book"
    subtitle = "Generated Book"

    # Preferred source: README.md.
    readme_path = project_dir / 'README.md'
    if readme_path.exists():
        try:
            lines = readme_path.read_text(encoding='utf-8').split('\n')
            for line in lines[:10]:  # Only scan the first 10 lines.
                match = re.match(r'^#\s+(.+)$', line)
                if match:
                    title = match.group(1).strip()
                    break
            # A leading blockquote serves as the subtitle.
            for line in lines[:20]:
                match = re.match(r'^>\s+(.+)$', line)
                if match:
                    subtitle = match.group(1).strip()
                    break
            # Only short-circuit when README actually yielded a title;
            # otherwise fall through so SUMMARY.md can supply one.
            if title != "Untitled Book":
                return title, subtitle
        except (OSError, UnicodeDecodeError):
            pass  # Best-effort: fall back to SUMMARY.md / defaults.

    # Fallback source: SUMMARY.md.
    summary_path = project_dir / 'SUMMARY.md'
    if summary_path.exists():
        try:
            content = summary_path.read_text(encoding='utf-8')
            for line in content.split('\n')[:5]:
                match = re.match(r'^#\s+(.+)$', line)
                if match:
                    return match.group(1).strip(), subtitle
        except (OSError, UnicodeDecodeError):
            pass  # Best-effort: keep defaults.

    return title, subtitle
def parse_summary(summary_path: Path) -> list[tuple[str, str, int]]:
    """Parse SUMMARY.md and collect the chapter links it declares.

    Args:
        summary_path: Path to the SUMMARY.md file.

    Returns:
        A list of (title, file_path, indent_level) tuples in document order.
        External (http) links are excluded; indent_level is the number of
        leading whitespace characters, reflecting nesting depth.
    """
    if not summary_path.exists():
        return []

    # Matches "* [Title](path)" / "- [Title](path)", capturing the indent.
    link_re = re.compile(r'^(\s*)[\*\-]\s*\[([^\]]+)\]\(([^)]+)\)')

    chapters: list[tuple[str, str, int]] = []
    for raw_line in summary_path.read_text(encoding='utf-8').split('\n'):
        hit = link_re.match(raw_line)
        if hit is None:
            continue
        indent_str, chapter_title, target = hit.groups()
        # External links are not part of the book body.
        if target.startswith('http'):
            continue
        chapters.append((chapter_title, target, len(indent_str)))
    return chapters
def convert_internal_links_to_anchors(content: str, file_to_anchor_map: dict[str, str]) -> str:
    """Convert internal markdown file links to anchor links for single-page output.

    Examples:
        [Title](1.2_xxx.md)          -> [Title](#anchor-id)
        [Title](../04_mcp/README.md) -> [Title](#anchor-id)
        [Title](file.md#section)     -> [Title](#section)

    Args:
        content: The markdown content to process.
        file_to_anchor_map: Mapping from file paths to their anchor IDs.

    Returns:
        Content with internal links converted to anchors.  External URLs,
        mailto links, anchor-only links, and image links are left untouched.
    """
    from pathlib import PurePosixPath

    def replace_link(match):
        link_text = match.group(1)
        link_target = match.group(2)

        # Leave external URLs and mailto links alone.
        if link_target.startswith(('http://', 'https://', 'mailto:')):
            return match.group(0)
        # Anchor-only links already point into the single page.
        if link_target.startswith('#'):
            return match.group(0)
        # Image links are excluded by the regex lookbehind (no leading '!').

        # Split target into file path and anchor fragment.
        if '#' in link_target:
            file_path, anchor = link_target.split('#', 1)
            # A specific anchor can be used directly.
            return f'[{link_text}](#{anchor})'
        file_path = link_target

        # Normalize separators for matching against the map.
        normalized_path = file_path.replace('\\', '/').strip()

        # First try an exact match against the map.
        if normalized_path in file_to_anchor_map:
            return f'[{link_text}](#{file_to_anchor_map[normalized_path]})'

        # Fall back to matching by filename only (handles ../dir/file.md).
        filename = PurePosixPath(normalized_path).name
        for path, anchor in file_to_anchor_map.items():
            if PurePosixPath(path).name == filename:
                # README.md is ambiguous — disambiguate by parent directory.
                if filename == 'README.md':
                    parts = normalized_path.replace('../', '').replace('./', '').split('/')
                    if len(parts) >= 2:
                        parent_dir = parts[-2]
                        path_parts = path.split('/')
                        if len(path_parts) >= 2 and path_parts[-2] == parent_dir:
                            return f'[{link_text}](#{anchor})'
                    continue
                return f'[{link_text}](#{anchor})'

        # No match: derive an anchor from the link text (covers files that
        # never made it into the map).
        fallback_anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', link_text.lower()).strip('-')
        return f'[{link_text}](#{fallback_anchor})'

    # Match markdown links [text](target); the negative lookbehind on '!'
    # excludes image links ![alt](src).
    pattern = r'(?<!!)\[([^\]]+)\]\(([^)]+)\)'
    return re.sub(pattern, replace_link, content)
"""
Fix relative image paths for single-page output.
When combining files from different directories, relative image paths like
`_images/xxx.png` need to be prefixed with the source file's directory.
Examples:
If file is from 07_coding/7.4_ide.md:
 -> 
Args:
content: The markdown content to process
file_path: The relative path of the source file (e.g., "07_coding/7.4_ide.md")
Returns:
Content with fixed image paths
"""
from pathlib import PurePosixPath
# Get the directory of the source file
source_dir = str(PurePosixPath(file_path).parent)
# If the file is in the root directory, no path fixing needed
if source_dir == '.':
return content
def replace_image(match):
alt_text = match.group(1)
image_path = match.group(2)
# Skip external URLs
if image_path.startswith('http://') or image_path.startswith('https://'):
return match.group(0)
# Skip absolute paths
if image_path.startswith('/'):
return match.group(0)
# Skip paths that already have a directory prefix (not starting with _images/)
if not image_path.startswith('_images/') and not image_path.startswith('./_images/'):
# Check if it's already a full path like 07_coding/_images/
if '/_images/' in image_path or image_path.startswith('../'):
return match.group(0)
# Remove leading ./ if present
clean_path = image_path.lstrip('./')
# Prepend the source directory
new_path = f"{source_dir}/{clean_path}"
return f''
# Match markdown image syntax: 
pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
return re.sub(pattern, replace_image, content)
def clean_navigation_links(content: str) -> str:
    """Strip trailing navigation lines from the content.

    Removes Next/Previous links, arrow indicators, and separator rules that
    commonly close out a chapter, working upward from the last line until a
    non-navigation line is found.
    """
    remaining = content.rstrip().split('\n')

    # Navigation line shapes (checked case-insensitively).
    nav_res = [re.compile(p, re.IGNORECASE) for p in (
        r'^\s*[-=]{3,}\s*$',                                              # separator rules
        r'^\s*(\*\*|__)?(Next|Previous|下一[节章页]|上一[节章页])(\*\*|__)?.*$',  # text based
        r'^\s*(➡️|→|=>|==>|Example|Download)\s*.*$',                      # arrow/indicator based
        r'^\s*\[(Next|Previous|下一[节章]|上一[节章]).*?\]\(.*?\)\s*$',     # link with nav text
    )]
    # "Arrow + [Link](Url)" combination, common in this book (case-sensitive,
    # matching the original check).
    arrow_link_re = re.compile(r'^\s*(➡️|→|=>|==>)\s*\[.+?\]\(.+?\)\s*$')

    while remaining:
        tail = remaining[-1].strip()
        if not tail:
            remaining.pop()
            continue
        if any(rx.match(tail) for rx in nav_res) or arrow_link_re.match(tail):
            remaining.pop()
            continue
        # First non-navigation line from the bottom: stop trimming.
        break

    return '\n'.join(remaining)
def clean_redundant_header(content: str, title: str, subtitle: str) -> str:
    """Remove a leading title/subtitle that duplicates the book header.

    Drops, in order: a leading H1 matching *title* (case-insensitive), a
    blockquote overlapping *subtitle*, and a trailing "---" separator, each
    followed by any blank lines.
    """
    lines = content.split('\n')

    def skip_blanks() -> None:
        # Drop blank lines from the front of the buffer.
        while lines and not lines[0].strip():
            lines.pop(0)

    skip_blanks()
    if not lines:
        return content

    # Leading H1 matching the book title (whitespace-tolerant).
    title_re = re.compile(r'^#\s+' + re.escape(title) + r'\s*$', re.IGNORECASE)
    if title_re.match(lines[0].strip()):
        lines.pop(0)
        skip_blanks()

    # Leading blockquote that overlaps the subtitle either direction
    # (tolerates punctuation differences).
    if subtitle and lines and lines[0].strip().startswith(">"):
        quoted = lines[0].strip().lstrip('>').strip()
        if subtitle in quoted or quoted in subtitle:
            lines.pop(0)
            skip_blanks()

    # A "---" separator often follows the header; drop it too.
    if lines and lines[0].strip().replace(' ', '') == '---':
        lines.pop(0)
        skip_blanks()

    return '\n'.join(lines)
def markdown_to_html(md_content: str) -> str:
"""
将 Markdown 转换为 HTML。
简单实现,不依赖外部库。
"""
lines = md_content.split('\n')
html_lines = []
in_code_block = False
code_lang = ''
code_content = []
in_list = False
in_table = False
table_rows = []
def process_inline(text: str) -> str:
"""处理行内格式"""
# 代码
text = re.sub(r'`([^`]+)`', r'\1', text)
# 粗体
text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
# 斜体
text = re.sub(r'\*([^*]+)\*', r'\1', text)
# 链接
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text)
return text
i = 0
while i < len(lines):
line = lines[i]
# 代码块
if line.startswith('```'):
if in_code_block:
html_lines.append(f'
{html.escape(chr(10).join(code_content))}
')
code_content = []
in_code_block = False
else:
code_lang = line[3:].strip() or 'text'
in_code_block = True
i += 1
continue
if in_code_block:
code_content.append(line)
i += 1
continue
# 表格
if '|' in line and not line.strip().startswith('```'):
cells = [c.strip() for c in line.split('|')]
cells = [c for c in cells if c] # 移除空单元格
if cells and not all(re.match(r'^[-:]+$', c) for c in cells):
if not in_table:
in_table = True
table_rows = []
table_rows.append(cells)
elif in_table and all(re.match(r'^[-:]+$', c) for c in cells):
pass # 跳过分隔行
# 检查下一行是否还是表格
if i + 1 >= len(lines) or '|' not in lines[i + 1]:
if in_table and table_rows:
html_lines.append('
')
for j, row in enumerate(table_rows):
tag = 'th' if j == 0 else 'td'
html_lines.append('
')
for cell in row:
html_lines.append(f'<{tag}>{process_inline(cell)}{tag}>')
html_lines.append('
')
html_lines.append('
')
table_rows = []
in_table = False
i += 1
continue
# 标题
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
if heading_match:
level = len(heading_match.group(1))
text = process_inline(heading_match.group(2))
# 生成简单的 ID
anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', heading_match.group(2).lower()).strip('-')
html_lines.append(f'{text}')
i += 1
continue
# 引用
if line.startswith('>'):
quote_text = process_inline(line[1:].strip())
html_lines.append(f'
{quote_text}
')
i += 1
continue
# 水平线
if re.match(r'^-{3,}$|^\*{3,}$|^_{3,}$', line.strip()):
html_lines.append('')
i += 1
continue
# 无序列表
list_match = re.match(r'^(\s*)[\*\-]\s+(.+)$', line)
if list_match:
if not in_list:
html_lines.append('
')
in_list = True
html_lines.append(f'
{process_inline(list_match.group(2))}
')
# 检查下一行
if i + 1 >= len(lines) or not re.match(r'^\s*[\*\-]\s+', lines[i + 1]):
html_lines.append('
')
in_list = False
i += 1
continue
# 有序列表
ol_match = re.match(r'^(\s*)\d+\.\s+(.+)$', line)
if ol_match:
if not in_list:
html_lines.append('')
in_list = True
html_lines.append(f'