#!/usr/bin/env python3
"""
Generic Book Combiner.

Features:
1. Scan the current (or a specified) directory.
2. Parse SUMMARY.md to obtain the chapter structure.
3. Parse README.md to obtain the book title and subtitle.
4. Generate single-page.md and single-page.html.
"""

import argparse
import html
import re
import sys
from datetime import datetime
from pathlib import Path, PurePosixPath

# HTML page template. CSS/HTML braces are doubled so that str.format() leaves
# them intact; {title}, {subtitle}, {content} and {date} are filled in by
# combine_book().
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title}</title>
<style>
body {{ max-width: 900px; margin: 0 auto; padding: 2rem; font-family: -apple-system, "Segoe UI", "Helvetica Neue", sans-serif; line-height: 1.7; color: #24292e; }}
pre {{ background: #f6f8fa; padding: 1em; overflow-x: auto; border-radius: 6px; }}
code {{ background: #f6f8fa; padding: 0.15em 0.3em; border-radius: 3px; }}
pre code {{ padding: 0; }}
blockquote {{ border-left: 4px solid #dfe2e5; margin-left: 0; padding-left: 1em; color: #6a737d; }}
table {{ border-collapse: collapse; margin: 1em 0; }}
th, td {{ border: 1px solid #dfe2e5; padding: 0.4em 0.8em; }}
img {{ max-width: 100%; }}
header {{ border-bottom: 1px solid #eaecef; margin-bottom: 2rem; }}
.subtitle {{ color: #6a737d; }}
.date {{ color: #959da5; font-size: 0.85em; }}
</style>
</head>
<body>
<header>
<h1>{title}</h1>
<p class="subtitle">{subtitle}</p>
<p class="date">Generated: {date}</p>
</header>
<main>
{content}
</main>
</body>
</html>
"""


def extract_book_info(project_dir: Path) -> tuple[str, str]:
    """
    Extract the book title and subtitle from README.md or SUMMARY.md.

    README.md is preferred: its first H1 (within the first 10 lines) becomes
    the title and its first blockquote (within the first 20 lines) becomes the
    subtitle. SUMMARY.md is used as a fallback for the title only.

    Returns:
        (title, subtitle) — defaults are used when nothing is found.
    """
    title = "Untitled Book"
    subtitle = "Generated Book"

    # Try README.md first.
    readme_path = project_dir / 'README.md'
    if readme_path.exists():
        try:
            content = readme_path.read_text(encoding='utf-8')
            lines = content.split('\n')
            for line in lines[:10]:  # Only inspect the first 10 lines.
                match = re.match(r'^#\s+(.+)$', line)
                if match:
                    title = match.group(1).strip()
                    break
            # Look for a blockquote to use as the subtitle.
            for line in lines[:20]:
                match = re.match(r'^>\s+(.+)$', line)
                if match:
                    subtitle = match.group(1).strip()
                    break
            return title, subtitle
        except Exception:
            # Best-effort: fall through to SUMMARY.md / defaults.
            pass

    # Fall back to SUMMARY.md.
    summary_path = project_dir / 'SUMMARY.md'
    if summary_path.exists():
        try:
            content = summary_path.read_text(encoding='utf-8')
            lines = content.split('\n')
            for line in lines[:5]:
                match = re.match(r'^#\s+(.+)$', line)
                if match:
                    title = match.group(1).strip()
            return title, subtitle
        except Exception:
            pass

    return title, subtitle


def parse_summary(summary_path: Path) -> list[tuple[str, str, int]]:
    """
    Parse SUMMARY.md and extract all chapter links.

    External (http...) links are skipped; nested indentation is preserved as
    a raw leading-whitespace count.

    Returns:
        list of (title, file_path, indent_level)
    """
    entries = []
    if not summary_path.exists():
        return entries

    content = summary_path.read_text(encoding='utf-8')
    # Match Markdown list links: * [Title](path) or - [Title](path),
    # with arbitrary indentation for nested levels.
    pattern = r'^(\s*)[\*\-]\s*\[([^\]]+)\]\(([^)]+)\)'
    for line in content.split('\n'):
        match = re.match(pattern, line)
        if match:
            indent = len(match.group(1))
            title = match.group(2)
            file_path = match.group(3)
            # Skip external links.
            if file_path.startswith('http'):
                continue
            entries.append((title, file_path, indent))
    return entries


def convert_internal_links_to_anchors(content: str, file_to_anchor_map: dict[str, str]) -> str:
    """
    Convert internal markdown file links to anchor links for single-page output.

    Examples:
        [Title](1.2_xxx.md)          -> [Title](#anchor-id)
        [Title](../04_mcp/README.md) -> [Title](#anchor-id)
        [Title](file.md#section)     -> [Title](#section)

    Args:
        content: The markdown content to process.
        file_to_anchor_map: Mapping from file paths to their anchor IDs.

    Returns:
        Content with internal links converted to anchors.
    """
    def replace_link(match):
        link_text = match.group(1)
        link_target = match.group(2)

        # Skip external URLs and mailto links.
        if link_target.startswith('http://') or link_target.startswith('https://') or link_target.startswith('mailto:'):
            return match.group(0)

        # Image links (![...](...)) are excluded by the regex lookbehind below.

        # Anchor-only links are already correct for a single page.
        if link_target.startswith('#'):
            return match.group(0)

        # Split target into file path and anchor.
        if '#' in link_target:
            file_path, anchor = link_target.split('#', 1)
            # A specific in-file anchor can be used directly.
            return f'[{link_text}](#{anchor})'
        else:
            file_path = link_target

        # Normalize separators for matching against the map keys.
        normalized_path = file_path.replace('\\', '/').strip()

        # Exact match first.
        if normalized_path in file_to_anchor_map:
            return f'[{link_text}](#{file_to_anchor_map[normalized_path]})'

        # Fall back to matching by filename only (handles ../04_mcp/README.md).
        filename = PurePosixPath(normalized_path).name
        for path, anchor in file_to_anchor_map.items():
            if PurePosixPath(path).name == filename:
                if filename == 'README.md':
                    # Many chapters have a README.md; disambiguate by the
                    # parent directory name.
                    parts = normalized_path.replace('../', '').replace('./', '').split('/')
                    if len(parts) >= 2:
                        parent_dir = parts[-2]
                        path_parts = path.split('/')
                        if len(path_parts) >= 2 and path_parts[-2] == parent_dir:
                            return f'[{link_text}](#{anchor})'
                    continue
                return f'[{link_text}](#{anchor})'

        # No match found: derive an anchor from the link text itself.
        # This handles files that are not listed in the map.
        fallback_anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', link_text.lower()).strip('-')
        return f'[{link_text}](#{fallback_anchor})'

    # Match markdown links [text](target) but not image links ![text](target):
    # the negative lookbehind excludes a preceding '!'.
    pattern = r'(?<!\!)\[([^\]]+)\]\(([^)]+)\)'
    return re.sub(pattern, replace_link, content)


def fix_image_paths(content: str, file_path: str) -> str:
    """
    Fix relative image paths for single-page output.

    When combining files from different directories, relative image paths like
    `_images/xxx.png` need to be prefixed with the source file's directory.

    Example:
        If file is from 07_coding/7.4_ide.md:
            ![alt](_images/cursor.png) -> ![alt](07_coding/_images/cursor.png)

    Args:
        content: The markdown content to process.
        file_path: The relative path of the source file
            (e.g., "07_coding/7.4_ide.md").

    Returns:
        Content with fixed image paths.
    """
    # Directory of the source file, in POSIX form.
    source_dir = str(PurePosixPath(file_path).parent)

    # Root-level files need no fixing.
    if source_dir == '.':
        return content

    def replace_image(match):
        alt_text = match.group(1)
        image_path = match.group(2)

        # Skip external URLs.
        if image_path.startswith('http://') or image_path.startswith('https://'):
            return match.group(0)

        # Skip absolute paths.
        if image_path.startswith('/'):
            return match.group(0)

        # Skip paths that already carry a directory prefix
        # (e.g. 07_coding/_images/...) or explicitly climb out (../).
        if not image_path.startswith('_images/') and not image_path.startswith('./_images/'):
            if '/_images/' in image_path or image_path.startswith('../'):
                return match.group(0)

        # Strip a leading "./" prefix only (NOT lstrip, which would eat
        # arbitrary leading '.' and '/' characters).
        clean_path = image_path.removeprefix('./')

        # Prepend the source directory.
        new_path = f"{source_dir}/{clean_path}"
        return f'![{alt_text}]({new_path})'

    # Match markdown image syntax: ![alt](path)
    pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    return re.sub(pattern, replace_image, content)


def clean_navigation_links(content: str) -> str:
    """
    Remove navigation links (Next/Previous, arrows) from the end of the content.
    """
    lines = content.rstrip().split('\n')

    # Navigation line patterns.
    nav_patterns = [
        r'^\s*[-=]{3,}\s*$',  # Separator lines
        r'^\s*(\*\*|__)?(Next|Previous|下一[节章页]|上一[节章页])(\*\*|__)?.*$',  # Text based
        r'^\s*(➡️|→|=>|==>|Example|Download)\s*.*$',  # Arrow/Indicator based
        r'^\s*\[(Next|Previous|下一[节章]|上一[节章]).*?\]\(.*?\)\s*$',  # Link with nav text
    ]

    # Trailing "Arrow [Link](Url)" lines are also treated as navigation below.
    while lines:
        last_line = lines[-1].strip()
        if not last_line:
            lines.pop()
            continue

        is_nav = False
        # Check explicit patterns.
        for pattern in nav_patterns:
            if re.match(pattern, last_line, re.IGNORECASE):
                is_nav = True
                break

        # Check "Arrow + Link" specifically (common in this book),
        # e.g. "➡️ [Title](Link)".
        if not is_nav:
            if re.match(r'^\s*(➡️|→|=>|==>)\s*\[.+?\]\(.+?\)\s*$', last_line):
                is_nav = True

        if is_nav:
            lines.pop()
        else:
            # Found a non-nav line: stop trimming.
            break

    return '\n'.join(lines)


def clean_redundant_header(content: str, title: str, subtitle: str) -> str:
    """
    Remove the title and subtitle from the beginning of the content if they
    match the book info (avoids duplicating the book header per chapter).
    """
    lines = content.split('\n')

    # Drop leading blank lines.
    while lines and not lines[0].strip():
        lines.pop(0)
    if not lines:
        return content

    # Check for the title as an H1, tolerating surrounding whitespace and case.
    if re.match(r'^#\s+' + re.escape(title) + r'\s*$', lines[0].strip(), re.IGNORECASE):
        lines.pop(0)
        while lines and not lines[0].strip():
            lines.pop(0)

        # Check for the subtitle as a blockquote.
        if subtitle and lines and lines[0].strip().startswith(">"):
            line_text = lines[0].strip().lstrip('>').strip()
            # Substring comparison in both directions tolerates punctuation.
            if subtitle in line_text or line_text in subtitle:
                lines.pop(0)
                while lines and not lines[0].strip():
                    lines.pop(0)

        # Also remove a "---" separator that often follows the header.
        if lines and lines[0].strip().replace(' ', '') == '---':
            lines.pop(0)
            while lines and not lines[0].strip():
                lines.pop(0)

    return '\n'.join(lines)


def markdown_to_html(md_content: str) -> str:
    """
    Convert Markdown to HTML.

    A minimal line-oriented implementation with no external dependencies.
    Supports fenced code blocks, tables, headings (with generated IDs),
    blockquotes, horizontal rules, unordered/ordered lists, HTML comments
    (skipped), and paragraphs with inline code/bold/italic/links.
    """
    lines = md_content.split('\n')
    html_lines: list[str] = []
    in_code_block = False
    code_lang = ''
    code_content: list[str] = []
    in_list = False
    list_tag = ''  # 'ul' or 'ol' while in_list is True
    in_table = False
    table_rows: list[list[str]] = []

    def process_inline(text: str) -> str:
        """Render inline markdown (code, bold, italic, links)."""
        text = re.sub(r'`([^`]+)`', r'<code>\1</code>', text)
        text = re.sub(r'\*\*([^*]+)\*\*', r'<strong>\1</strong>', text)
        text = re.sub(r'\*([^*]+)\*', r'<em>\1</em>', text)
        text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)
        return text

    i = 0
    while i < len(lines):
        line = lines[i]

        # Fenced code blocks.
        if line.startswith('```'):
            if in_code_block:
                html_lines.append(
                    f'<pre><code class="language-{code_lang}">'
                    f'{html.escape(chr(10).join(code_content))}</code></pre>'
                )
                code_content = []
                in_code_block = False
            else:
                code_lang = line[3:].strip() or 'text'
                in_code_block = True
            i += 1
            continue

        if in_code_block:
            code_content.append(line)
            i += 1
            continue

        # Tables.
        if '|' in line and not line.strip().startswith('```'):
            cells = [c.strip() for c in line.split('|')]
            cells = [c for c in cells if c]  # Drop empty edge cells.
            if cells and not all(re.match(r'^[-:]+$', c) for c in cells):
                if not in_table:
                    in_table = True
                    table_rows = []
                table_rows.append(cells)
            elif in_table and all(re.match(r'^[-:]+$', c) for c in cells):
                pass  # Skip the separator row.
            # Close the table when the next line is not part of it.
            if i + 1 >= len(lines) or '|' not in lines[i + 1]:
                if in_table and table_rows:
                    html_lines.append('<table>')
                    for j, row in enumerate(table_rows):
                        tag = 'th' if j == 0 else 'td'
                        html_lines.append('<tr>')
                        for cell in row:
                            html_lines.append(f'<{tag}>{process_inline(cell)}</{tag}>')
                        html_lines.append('</tr>')
                    html_lines.append('</table>')
                table_rows = []
                in_table = False
            i += 1
            continue

        # Headings (with a generated id for anchor links).
        heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
        if heading_match:
            level = len(heading_match.group(1))
            text = process_inline(heading_match.group(2))
            # Simple slug: word chars and CJK kept, everything else -> '-'.
            anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', heading_match.group(2).lower()).strip('-')
            html_lines.append(f'<h{level} id="{anchor}">{text}</h{level}>')
            i += 1
            continue

        # Blockquotes.
        if line.startswith('>'):
            quote_text = process_inline(line[1:].strip())
            html_lines.append(f'<blockquote><p>{quote_text}</p></blockquote>')
            i += 1
            continue

        # Horizontal rules.
        if re.match(r'^-{3,}$|^\*{3,}$|^_{3,}$', line.strip()):
            html_lines.append('<hr>')
            i += 1
            continue

        # Unordered lists.
        list_match = re.match(r'^(\s*)[\*\-]\s+(.+)$', line)
        if list_match:
            if not in_list:
                html_lines.append('<ul>')
                in_list = True
                list_tag = 'ul'
            html_lines.append(f'<li>{process_inline(list_match.group(2))}</li>')
            # Close the list when the next line is not a list item.
            if i + 1 >= len(lines) or not re.match(r'^\s*[\*\-]\s+', lines[i + 1]):
                html_lines.append('</ul>')
                in_list = False
            i += 1
            continue

        # Ordered lists.
        ol_match = re.match(r'^(\s*)\d+\.\s+(.+)$', line)
        if ol_match:
            if not in_list:
                html_lines.append('<ol>')
                in_list = True
                list_tag = 'ol'
            html_lines.append(f'<li>{process_inline(ol_match.group(2))}</li>')
            if i + 1 >= len(lines) or not re.match(r'^\s*\d+\.\s+', lines[i + 1]):
                html_lines.append('</ol>')
                in_list = False
            i += 1
            continue

        # Blank lines.
        if not line.strip():
            i += 1
            continue

        # HTML comments (skipped, possibly multi-line).
        if line.strip().startswith('<!--'):
            while i < len(lines) and '-->' not in lines[i]:
                i += 1
            i += 1
            continue

        # Plain paragraph.
        html_lines.append(f'<p>{process_inline(line)}</p>')
        i += 1

    # Close any structure left open by truncated input.
    if in_code_block and code_content:
        html_lines.append(
            f'<pre><code class="language-{code_lang}">'
            f'{html.escape(chr(10).join(code_content))}</code></pre>'
        )
    if in_list:
        html_lines.append(f'</{list_tag}>')

    return '\n'.join(html_lines)


def combine_book(project_dir: Path, output_md: str, output_html: str) -> None:
    """
    Combine all chapters listed in SUMMARY.md into a single Markdown file
    and a single HTML file.

    Args:
        project_dir: Book root directory (contains SUMMARY.md).
        output_md: Output Markdown file name (written into project_dir).
        output_html: Output HTML file name (written into project_dir).
    """
    book_title, book_subtitle = extract_book_info(project_dir)
    entries = parse_summary(project_dir / 'SUMMARY.md')
    if not entries:
        print("❌ 错误: 未找到 SUMMARY.md 或其中没有章节链接")
        return

    # First pass: map every chapter file to a stable anchor id derived from
    # its title, so cross-chapter links can be rewritten.
    file_to_anchor_map: dict[str, str] = {}
    for title, file_path, _indent in entries:
        anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', title.lower()).strip('-')
        file_to_anchor_map[file_path] = anchor

    # Book-level header (Markdown output only; the HTML template has its own).
    md_header = [f'# {book_title}', '', f'> {book_subtitle}', '', '---', '']

    md_body: list[str] = []
    processed_count = 0
    skipped_count = 0

    print(f"📚 {book_title}")
    for title, file_path, _indent in entries:
        try:
            content = (project_dir / file_path).read_text(encoding='utf-8')
            # Strip duplicated book header / trailing navigation, then fix
            # relative paths and internal links for the single-page layout.
            content = clean_redundant_header(content, book_title, book_subtitle)
            content = clean_navigation_links(content)
            content = fix_image_paths(content, file_path)
            content = convert_internal_links_to_anchors(content, file_to_anchor_map)

            anchor = file_to_anchor_map[file_path]
            md_body.append(f'<a id="{anchor}"></a>\n')  # Anchor target for links.
            md_body.append(content)
            md_body.append("\n")
            processed_count += 1
            print(f"  ✅ {title}")
        except Exception as e:
            print(f"  ❌ 读取失败 {file_path}: {e}")
            skipped_count += 1

    # Write the Markdown file (with the book header).
    final_md = '\n'.join(md_header + md_body)
    # Collapse runs of blank lines.
    final_md = re.sub(r'\n{4,}', '\n\n\n', final_md)
    md_path = project_dir / output_md
    md_path.write_text(final_md, encoding='utf-8')

    # Write the HTML file (body only: the template already has a header).
    print("\n🔄 正在生成 HTML...")
    final_html_md = '\n'.join(md_body)
    html_content = markdown_to_html(final_html_md)

    current_date = datetime.now().strftime("%Y-%m-%d")
    final_html = HTML_TEMPLATE.format(
        title=book_title,
        subtitle=book_subtitle,
        content=html_content,
        date=current_date,
    )
    html_path = project_dir / output_html
    html_path.write_text(final_html, encoding='utf-8')

    # Summary statistics.
    md_size = md_path.stat().st_size
    html_size = html_path.stat().st_size
    print(f"\n{'=' * 50}")
    print(f"📖 合并完成!")
    print(f"   输出目录: {project_dir}")
    print(f"   Markdown: {md_path.name} ({md_size / 1024:.1f} KB)")
    print(f"   HTML:     {html_path.name} ({html_size / 1024:.1f} KB)")
    print(f"   处理章节: {processed_count}")
    print(f"   跳过文件: {skipped_count}")
    print(f"{'=' * 50}")


def main():
    """CLI entry point: parse arguments and run the combiner."""
    parser = argparse.ArgumentParser(description="GitBook 书籍自动合并工具")
    parser.add_argument("path", nargs="?", default=".", help="书籍根目录路径 (包含 SUMMARY.md 的目录)")
    parser.add_argument("--output-md", default="single-page.md", help="输出 Markdown 文件名")
    parser.add_argument("--output-html", default="single-page.html", help="输出 HTML 文件名")
    args = parser.parse_args()

    target_dir = Path(args.path).resolve()
    if not target_dir.is_dir():
        print(f"❌ 错误: 目录不存在: {target_dir}")
        sys.exit(1)

    print("=" * 50)
    print("通用书籍合并工具 v2.0")
    print("=" * 50)
    print(f"工作目录: {target_dir}\n")

    combine_book(target_dir, args.output_md, args.output_html)


if __name__ == '__main__':
    main()