From e58bd39de520db502b9a2f8c503f09f2bf1c01dd Mon Sep 17 00:00:00 2001 From: Baohua Yang Date: Sun, 15 Feb 2026 09:12:40 -0800 Subject: [PATCH] chore: remove combine.py and ignore it --- .gitignore | 2 +- combine.py | 1001 ---------------------------------------------------- 2 files changed, 1 insertion(+), 1002 deletions(-) delete mode 100644 combine.py diff --git a/.gitignore b/.gitignore index 94eaf02..7ced625 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,4 @@ docker-compose.override.yml .obsidian/ .vscode/ -.agent/ \ No newline at end of file +.agent/combine.py diff --git a/combine.py b/combine.py deleted file mode 100644 index 14075b0..0000000 --- a/combine.py +++ /dev/null @@ -1,1001 +0,0 @@ -#!/usr/bin/env python3 -""" -通用书籍合并工具 (Generic Book Combiner) - -功能: -1. 自动扫描当前或指定目录。 -2. 解析 SUMMARY.md 获取章节结构。 -3. 解析 README.md 获取书籍标题和简介信息。 -4. 生成 single-page.md 和 single-page.html。 -""" - -import re -import html -import argparse -import sys -from pathlib import Path -from datetime import datetime - -# HTML 模板 -HTML_TEMPLATE = """ - - - - - {title} - - - - -
- - - -
- -
-

{title}

-

{subtitle}

-
- -{content} - -
- - - - - -""" - - -def extract_book_info(project_dir: Path) -> tuple[str, str]: - """ - 从 README.md 或 SUMMARY.md 中提取书籍标题和副标题。 - - Returns: - (title, subtitle) - """ - title = "Untitled Book" - subtitle = "Generated Book" - - # 优先尝试 README.md - readme_path = project_dir / 'README.md' - if readme_path.exists(): - try: - content = readme_path.read_text(encoding='utf-8') - lines = content.split('\n') - for line in lines[:10]: # 只看前10行 - match = re.match(r'^#\s+(.+)$', line) - if match: - title = match.group(1).strip() - break - - # 尝试查找引用块作为副标题 - for line in lines[:20]: - match = re.match(r'^>\s+(.+)$', line) - if match: - subtitle = match.group(1).strip() - break - return title, subtitle - except Exception: - pass - - # 其次尝试 SUMMARY.md - summary_path = project_dir / 'SUMMARY.md' - if summary_path.exists(): - try: - content = summary_path.read_text(encoding='utf-8') - lines = content.split('\n') - for line in lines[:5]: - match = re.match(r'^#\s+(.+)$', line) - if match: - title = match.group(1).strip() - return title, subtitle - except Exception: - pass - - return title, subtitle - - -def parse_summary(summary_path: Path) -> list[tuple[str, str, int]]: - """ - 解析 SUMMARY.md,提取所有章节链接。 - - Returns: - list of (title, file_path, indent_level) - """ - entries = [] - if not summary_path.exists(): - return entries - - content = summary_path.read_text(encoding='utf-8') - - # 匹配 Markdown 链接格式: * [标题](文件路径) 或 - [标题](文件路径) - # 支持多级缩进 - pattern = r'^(\s*)[\*\-]\s*\[([^\]]+)\]\(([^)]+)\)' - - for line in content.split('\n'): - match = re.match(pattern, line) - if match: - indent = len(match.group(1)) - title = match.group(2) - file_path = match.group(3) - - # 跳过外部链接 - if file_path.startswith('http'): - continue - - entries.append((title, file_path, indent)) - - return entries - - -def convert_internal_links_to_anchors(content: str, file_to_anchor_map: dict[str, str]) -> str: - """ - Convert internal markdown file links to anchor links for single-page output. - - Examples: - [Title](1.2_xxx.md) -> [Title](#anchor-id) - [Title](../04_mcp/README.md) -> [Title](#anchor-id) - [Title](file.md#section) -> [Title](#section) - - Args: - content: The markdown content to process - file_to_anchor_map: Mapping from file paths to their anchor IDs - - Returns: - Content with internal links converted to anchors - """ - def replace_link(match): - link_text = match.group(1) - link_target = match.group(2) - - # Skip external URLs and mailto links - if link_target.startswith('http://') or link_target.startswith('https://') or link_target.startswith('mailto:'): - return match.group(0) - - # Skip image links (they start with !) - # Check the character before the match - this is handled by the regex not matching ![]() - - # Handle anchor-only links - if link_target.startswith('#'): - return match.group(0) - - # Split target into file path and anchor - if '#' in link_target: - file_path, anchor = link_target.split('#', 1) - # If there's a specific anchor, use it directly - return f'[{link_text}](#{anchor})' - else: - file_path = link_target - - # Normalize the file path (remove ./, ../ prefixes and get the basename for matching) - # Extract just the filename for simple matching - normalized_path = file_path.replace('\\', '/').strip() - - # Try to find a matching anchor in the map - # First try exact match - if normalized_path in file_to_anchor_map: - return f'[{link_text}](#{file_to_anchor_map[normalized_path]})' - - # Try matching by filename only (for links like ../04_mcp/README.md) - from pathlib import PurePosixPath - filename = PurePosixPath(normalized_path).name - - # Search for matching file in the map - for path, anchor in file_to_anchor_map.items(): - if PurePosixPath(path).name == filename: - # For README.md, we need to be more specific - check parent directory - if filename == 'README.md': - # Try to match by parent directory - parts = normalized_path.replace('../', '').replace('./', '').split('/') - if len(parts) >= 2: - parent_dir = parts[-2] - path_parts = path.split('/') - if len(path_parts) >= 2 and path_parts[-2] == parent_dir: - return f'[{link_text}](#{anchor})' - continue - return f'[{link_text}](#{anchor})' - - # If no match found, generate an anchor from the link text - # This handles cases where the file might not be in the map - fallback_anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', link_text.lower()).strip('-') - return f'[{link_text}](#{fallback_anchor})' - - # Match markdown links: [text](target) but not image links ![text](target) - # Use negative lookbehind for ! - pattern = r'(? str: - """ - Fix relative image paths for single-page output. - - When combining files from different directories, relative image paths like - `_images/xxx.png` need to be prefixed with the source file's directory. - - Examples: - If file is from 07_coding/7.4_ide.md: - ![alt](_images/cursor.png) -> ![alt](07_coding/_images/cursor.png) - - Args: - content: The markdown content to process - file_path: The relative path of the source file (e.g., "07_coding/7.4_ide.md") - - Returns: - Content with fixed image paths - """ - from pathlib import PurePosixPath - - # Get the directory of the source file - source_dir = str(PurePosixPath(file_path).parent) - - # If the file is in the root directory, no path fixing needed - if source_dir == '.': - return content - - def replace_image(match): - alt_text = match.group(1) - image_path = match.group(2) - - # Skip external URLs - if image_path.startswith('http://') or image_path.startswith('https://'): - return match.group(0) - - # Skip absolute paths - if image_path.startswith('/'): - return match.group(0) - - # Skip paths that already have a directory prefix (not starting with _images/) - if not image_path.startswith('_images/') and not image_path.startswith('./_images/'): - # Check if it's already a full path like 07_coding/_images/ - if '/_images/' in image_path or image_path.startswith('../'): - return match.group(0) - - # Remove leading ./ if present - clean_path = image_path.lstrip('./') - - # Prepend the source directory - new_path = f"{source_dir}/{clean_path}" - - return f'![{alt_text}]({new_path})' - - # Match markdown image syntax: ![alt](path) - pattern = r'!\[([^\]]*)\]\(([^)]+)\)' - return re.sub(pattern, replace_image, content) - - -def clean_navigation_links(content: str) -> str: - """ - Remove navigation links (Next/Previous, arrows) from the end of the content. - """ - lines = content.rstrip().split('\n') - - # Navigation line patterns - nav_patterns = [ - r'^\s*[-=]{3,}\s*$', # Separator lines - r'^\s*(\*\*|__)?(Next|Previous|下一[节章页]|上一[节章页])(\*\*|__)?.*$', # Text based - r'^\s*(➡️|→|=>|==>|Example|Download)\s*.*$', # Arrow/Indicator based - r'^\s*\[(Next|Previous|下一[节章]|上一[节章]).*?\]\(.*?\)\s*$', # Link with nav text - ] - - # Also catch "Arrow [Link](Url)" specifically if not caught above - # And purely link lines that look like nav " [Title](Url) " relative short - - while lines: - last_line = lines[-1].strip() - if not last_line: - lines.pop() - continue - - is_nav = False - - # Check explicit patterns - for pattern in nav_patterns: - if re.match(pattern, last_line, re.IGNORECASE): - is_nav = True - break - - # Check "Arrow + Link" specifically (common in this book) - if not is_nav: - # Pattern: Arrow (optional) + Link - # e.g. "➡️ [Title](Link)" - if re.match(r'^\s*(➡️|→|=>|==>)\s*\[.+?\]\(.+?\)\s*$', last_line): - is_nav = True - - if is_nav: - # print(f"DEBUG: Removing nav line: {last_line}") - lines.pop() - else: - # Found a non-nav line, stop checking - break - - return '\n'.join(lines) - - -def clean_redundant_header(content: str, title: str, subtitle: str) -> str: - """ - Remove the title and subtitle from the beginning of the content if they match the book info. - """ - lines = content.split('\n') - - # Remove leading blank lines - while lines and not lines[0].strip(): - lines.pop(0) - - if not lines: - return content - - # Check for Title (H1) - # Case 1: Exact match "# Title" - # Case 2: Match with some whitespace flexibility - if re.match(r'^#\s+' + re.escape(title) + r'\s*$', lines[0].strip(), re.IGNORECASE): - lines.pop(0) - # Remove blank lines after title - while lines and not lines[0].strip(): - lines.pop(0) - - # Check for Subtitle (Blockquote) - if subtitle and lines and lines[0].strip().startswith(">"): - # Clean punctuation for comparison just in case - line_text = lines[0].strip().lstrip('>').strip() - if subtitle in line_text or line_text in subtitle: - lines.pop(0) - # Remove blank lines after subtitle - while lines and not lines[0].strip(): - lines.pop(0) - - # Also remove common separator lines like "---" that often follow the header - if lines and lines[0].strip().replace(' ', '') == '---': - lines.pop(0) - while lines and not lines[0].strip(): - lines.pop(0) - - return '\n'.join(lines) - - -def markdown_to_html(md_content: str) -> str: - """ - 将 Markdown 转换为 HTML。 - 简单实现,不依赖外部库。 - """ - lines = md_content.split('\n') - html_lines = [] - in_code_block = False - code_lang = '' - code_content = [] - in_list = False - in_table = False - table_rows = [] - - def process_inline(text: str) -> str: - """处理行内格式""" - # 代码 - text = re.sub(r'`([^`]+)`', r'\1', text) - # 粗体 - text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text) - # 斜体 - text = re.sub(r'\*([^*]+)\*', r'\1', text) - # 链接 - text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text) - return text - - i = 0 - while i < len(lines): - line = lines[i] - - # 代码块 - if line.startswith('```'): - if in_code_block: - html_lines.append(f'
{html.escape(chr(10).join(code_content))}
') - code_content = [] - in_code_block = False - else: - code_lang = line[3:].strip() or 'text' - in_code_block = True - i += 1 - continue - - if in_code_block: - code_content.append(line) - i += 1 - continue - - # 表格 - if '|' in line and not line.strip().startswith('```'): - cells = [c.strip() for c in line.split('|')] - cells = [c for c in cells if c] # 移除空单元格 - - if cells and not all(re.match(r'^[-:]+$', c) for c in cells): - if not in_table: - in_table = True - table_rows = [] - table_rows.append(cells) - elif in_table and all(re.match(r'^[-:]+$', c) for c in cells): - pass # 跳过分隔行 - - # 检查下一行是否还是表格 - if i + 1 >= len(lines) or '|' not in lines[i + 1]: - if in_table and table_rows: - html_lines.append('') - for j, row in enumerate(table_rows): - tag = 'th' if j == 0 else 'td' - html_lines.append('') - for cell in row: - html_lines.append(f'<{tag}>{process_inline(cell)}') - html_lines.append('') - html_lines.append('
') - table_rows = [] - in_table = False - i += 1 - continue - - # 标题 - heading_match = re.match(r'^(#{1,6})\s+(.+)$', line) - if heading_match: - level = len(heading_match.group(1)) - text = process_inline(heading_match.group(2)) - # 生成简单的 ID - anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', heading_match.group(2).lower()).strip('-') - html_lines.append(f'{text}') - i += 1 - continue - - # 引用 - if line.startswith('>'): - quote_text = process_inline(line[1:].strip()) - html_lines.append(f'

{quote_text}

') - i += 1 - continue - - # 水平线 - if re.match(r'^-{3,}$|^\*{3,}$|^_{3,}$', line.strip()): - html_lines.append('
') - i += 1 - continue - - # 无序列表 - list_match = re.match(r'^(\s*)[\*\-]\s+(.+)$', line) - if list_match: - if not in_list: - html_lines.append('') - in_list = False - i += 1 - continue - - # 有序列表 - ol_match = re.match(r'^(\s*)\d+\.\s+(.+)$', line) - if ol_match: - if not in_list: - html_lines.append('
    ') - in_list = True - html_lines.append(f'
  1. {process_inline(ol_match.group(2))}
  2. ') - # 检查下一行 - if i + 1 >= len(lines) or not re.match(r'^\s*\d+\.\s+', lines[i + 1]): - html_lines.append('
') - in_list = False - i += 1 - continue - - # 空行 - if not line.strip(): - i += 1 - continue - - # 注释行(跳过) - if line.strip().startswith('\n") - md_body.append(f'
\n') # 添加锚点 - md_body.append(content) - md_body.append("\n") - - processed_count += 1 - print(f" ✅ {title}") - - except Exception as e: - print(f" ❌ 读取失败 {file_path}: {e}") - skipped_count += 1 - - # 生成 Markdown 文件 (包含标题头) - final_md = '\n'.join(md_header + md_body) - # 规范化空行 - final_md = re.sub(r'\n{4,}', '\n\n\n', final_md) - - md_path = project_dir / output_md - md_path.write_text(final_md, encoding='utf-8') - - # 生成 HTML 文件 (仅使用正文,因为 HTML 模板已有 Header) - print("\n🔄 正在生成 HTML...") - final_html_md = '\n'.join(md_body) - html_content = markdown_to_html(final_html_md) - - # 填充 HTML 模板 - current_date = datetime.now().strftime("%Y-%m-%d") - final_html = HTML_TEMPLATE.format( - title=book_title, - subtitle=book_subtitle, - content=html_content, - date=current_date - ) - - html_path = project_dir / output_html - html_path.write_text(final_html, encoding='utf-8') - - # 统计信息 - md_size = md_path.stat().st_size - html_size = html_path.stat().st_size - - print(f"\n{'=' * 50}") - print(f"📖 合并完成!") - print(f" 输出目录: {project_dir}") - print(f" Markdown: {md_path.name} ({md_size / 1024:.1f} KB)") - print(f" HTML: {html_path.name} ({html_size / 1024:.1f} KB)") - print(f" 处理章节: {processed_count}") - print(f" 跳过文件: {skipped_count}") - print(f"{'=' * 50}") - - -def main(): - parser = argparse.ArgumentParser(description="GitBook 书籍自动合并工具") - parser.add_argument("path", nargs="?", default=".", help="书籍根目录路径 (包含 SUMMARY.md 的目录)") - parser.add_argument("--output-md", default="single-page.md", help="输出 Markdown 文件名") - parser.add_argument("--output-html", default="single-page.html", help="输出 HTML 文件名") - - args = parser.parse_args() - - target_dir = Path(args.path).resolve() - - if not target_dir.is_dir(): - print(f"❌ 错误: 目录不存在: {target_dir}") - sys.exit(1) - - print("=" * 50) - print("通用书籍合并工具 v2.0") - print("=" * 50) - print(f"工作目录: {target_dir}\n") - - combine_book(target_dir, args.output_md, args.output_html) - - -if __name__ == '__main__': - main()