+
+{content}
+
+
+
+
+
+
+
+"""
+
+
def extract_book_info(project_dir: Path) -> tuple[str, str]:
    """Extract the book title and subtitle from README.md or SUMMARY.md.

    The first ATX H1 heading (``# Title``) near the top of README.md becomes
    the title, and the first blockquote line (``> text``) becomes the
    subtitle.  If README.md yields no title, SUMMARY.md is consulted as a
    fallback.  Read failures are deliberately swallowed (best effort).

    Args:
        project_dir: Root directory of the book project.

    Returns:
        (title, subtitle); defaults are ("Untitled Book", "Generated Book")
        when nothing usable is found.
    """
    title = "Untitled Book"
    subtitle = "Generated Book"
    title_found = False

    # Prefer README.md.
    readme_path = project_dir / 'README.md'
    if readme_path.exists():
        try:
            lines = readme_path.read_text(encoding='utf-8').split('\n')
            for line in lines[:10]:  # Only inspect the first 10 lines.
                match = re.match(r'^#\s+(.+)$', line)
                if match:
                    title = match.group(1).strip()
                    title_found = True
                    break

            # Look for a blockquote near the top to use as the subtitle.
            for line in lines[:20]:
                match = re.match(r'^>\s+(.+)$', line)
                if match:
                    subtitle = match.group(1).strip()
                    break

            # BUGFIX: previously this returned unconditionally, so a README
            # without an H1 heading prevented the SUMMARY.md fallback below.
            if title_found:
                return title, subtitle
        except (OSError, UnicodeError):
            pass  # Best effort: fall through to SUMMARY.md.

    # Fall back to SUMMARY.md for the title.
    summary_path = project_dir / 'SUMMARY.md'
    if summary_path.exists():
        try:
            lines = summary_path.read_text(encoding='utf-8').split('\n')
            for line in lines[:5]:
                match = re.match(r'^#\s+(.+)$', line)
                if match:
                    title = match.group(1).strip()
                    return title, subtitle
        except (OSError, UnicodeError):
            pass

    return title, subtitle
+
+
def parse_summary(summary_path: Path) -> list[tuple[str, str, int]]:
    """Parse SUMMARY.md and collect every chapter link.

    Matches Markdown list links of the form ``* [Title](path)`` or
    ``- [Title](path)`` at any indentation level; external (http) links
    are excluded.

    Returns:
        A list of (title, file_path, indent_level) tuples in document order.
    """
    results: list[tuple[str, str, int]] = []
    if not summary_path.exists():
        return results

    # Link pattern: optional indent, a */- bullet, then [title](target).
    link_re = re.compile(r'^(\s*)[\*\-]\s*\[([^\]]+)\]\(([^)]+)\)')

    for raw_line in summary_path.read_text(encoding='utf-8').split('\n'):
        found = link_re.match(raw_line)
        if not found:
            continue
        leading, caption, target = found.groups()
        # External links are not part of the book body.
        if target.startswith('http'):
            continue
        results.append((caption, target, len(leading)))

    return results
+
+
def convert_internal_links_to_anchors(content: str, file_to_anchor_map: dict[str, str]) -> str:
    """
    Convert internal markdown file links to anchor links for single-page output.

    Examples:
        [Title](1.2_xxx.md) -> [Title](#anchor-id)
        [Title](../04_mcp/README.md) -> [Title](#anchor-id)
        [Title](file.md#section) -> [Title](#section)

    Args:
        content: The markdown content to process
        file_to_anchor_map: Mapping from file paths to their anchor IDs

    Returns:
        Content with internal links converted to anchors
    """
    def replace_link(match):
        link_text = match.group(1)
        link_target = match.group(2)

        # Skip external URLs and mailto links.
        if link_target.startswith(('http://', 'https://', 'mailto:')):
            return match.group(0)

        # Anchor-only links already point inside the single page.
        if link_target.startswith('#'):
            return match.group(0)

        # Split the target into file path and anchor fragment.
        if '#' in link_target:
            file_path, anchor = link_target.split('#', 1)
            # An explicit anchor can be used directly.
            return f'[{link_text}](#{anchor})'
        else:
            file_path = link_target

        # Normalize separators for matching against the map keys.
        normalized_path = file_path.replace('\\', '/').strip()

        # First try an exact match against the map.
        if normalized_path in file_to_anchor_map:
            return f'[{link_text}](#{file_to_anchor_map[normalized_path]})'

        # Fall back to matching by filename only (for links like
        # ../04_mcp/README.md whose relative prefix differs from map keys).
        from pathlib import PurePosixPath
        filename = PurePosixPath(normalized_path).name

        for path, anchor in file_to_anchor_map.items():
            if PurePosixPath(path).name == filename:
                # README.md is ambiguous across chapters; disambiguate by
                # requiring the parent directory to match as well.
                if filename == 'README.md':
                    parts = normalized_path.replace('../', '').replace('./', '').split('/')
                    if len(parts) >= 2:
                        parent_dir = parts[-2]
                        path_parts = path.split('/')
                        if len(path_parts) >= 2 and path_parts[-2] == parent_dir:
                            return f'[{link_text}](#{anchor})'
                    continue
                return f'[{link_text}](#{anchor})'

        # No match found: derive an anchor from the link text (keeps word
        # characters and CJK, collapses the rest to hyphens).  Handles files
        # that are not present in the map.
        fallback_anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', link_text.lower()).strip('-')
        return f'[{link_text}](#{fallback_anchor})'

    # Match markdown links [text](target) but not image links ![alt](target):
    # the negative lookbehind rejects a preceding '!'.
    # (Reconstructed: this line and the next were garbled in the source.)
    pattern = r'(?<!!)\[([^\]]+)\]\(([^)]+)\)'
    return re.sub(pattern, replace_link, content)


def fix_image_paths(content: str, file_path: str) -> str:
    """
    Fix relative image paths for single-page output.

    When combining files from different directories, relative image paths like
    `_images/xxx.png` need to be prefixed with the source file's directory.

    Examples:
        If file is from 07_coding/7.4_ide.md:
        ![img](_images/xxx.png) -> ![img](07_coding/_images/xxx.png)

    Args:
        content: The markdown content to process
        file_path: The relative path of the source file (e.g., "07_coding/7.4_ide.md")

    Returns:
        Content with fixed image paths
    """
    from pathlib import PurePosixPath

    # Directory of the source file, POSIX-style.
    source_dir = str(PurePosixPath(file_path).parent)

    # Files at the project root need no adjustment.
    if source_dir == '.':
        return content

    def replace_image(match):
        alt_text = match.group(1)
        image_path = match.group(2)

        # Leave external URLs untouched.
        if image_path.startswith(('http://', 'https://')):
            return match.group(0)

        # Leave absolute paths untouched.
        if image_path.startswith('/'):
            return match.group(0)

        # Paths that already carry a directory prefix (e.g. 07_coding/_images/)
        # or climb out of the chapter (../) are assumed correct already.
        if not image_path.startswith('_images/') and not image_path.startswith('./_images/'):
            if '/_images/' in image_path or image_path.startswith('../'):
                return match.group(0)

        # BUGFIX: use removeprefix, not lstrip('./') — lstrip strips a
        # *character set* and would mangle names that begin with dots.
        clean_path = image_path.removeprefix('./')

        # Prepend the source directory.
        new_path = f"{source_dir}/{clean_path}"
        return f'![{alt_text}]({new_path})'

    # Markdown image syntax: ![alt](path)
    pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    return re.sub(pattern, replace_image, content)
+
+
def clean_navigation_links(content: str) -> str:
    """
    Remove navigation links (Next/Previous, arrows) from the end of the content.

    Trailing blank lines and lines matching any navigation pattern are
    trimmed from the bottom until the first real content line is reached.
    """
    # Patterns that mark a trailing line as pure navigation.
    nav_patterns = [
        r'^\s*[-=]{3,}\s*$',  # Separator lines
        r'^\s*(\*\*|__)?(Next|Previous|下一[节章页]|上一[节章页])(\*\*|__)?.*$',  # Text based
        r'^\s*(➡️|→|=>|==>|Example|Download)\s*.*$',  # Arrow/Indicator based
        r'^\s*\[(Next|Previous|下一[节章]|上一[节章]).*?\]\(.*?\)\s*$',  # Link with nav text
    ]
    # "Arrow + Link" combination, e.g. "➡️ [Title](Link)" (common in this book).
    arrow_link_re = r'^\s*(➡️|→|=>|==>)\s*\[.+?\]\(.+?\)\s*$'

    def looks_like_nav(text: str) -> bool:
        """True when the stripped line is a navigation artifact."""
        if any(re.match(p, text, re.IGNORECASE) for p in nav_patterns):
            return True
        return re.match(arrow_link_re, text) is not None

    remaining = content.rstrip().split('\n')

    # Trim blank and navigation lines from the bottom until content appears.
    while remaining:
        tail = remaining[-1].strip()
        if not tail or looks_like_nav(tail):
            remaining.pop()
        else:
            break

    return '\n'.join(remaining)
+
+
def clean_redundant_header(content: str, title: str, subtitle: str) -> str:
    """
    Remove the title and subtitle from the beginning of the content if they match the book info.

    Strips, in order: a leading H1 equal to *title* (case-insensitive), a
    blockquote that fuzzily matches *subtitle*, and a trailing "---"
    separator, along with surrounding blank lines.
    """
    rows = content.split('\n')
    total = len(rows)

    def skip_blanks(idx: int) -> int:
        """Advance past empty/whitespace-only lines starting at idx."""
        while idx < total and not rows[idx].strip():
            idx += 1
        return idx

    pos = skip_blanks(0)
    if pos >= total:
        # Nothing but blank lines: leave the content untouched.
        return content

    # Drop a leading H1 that repeats the book title (whitespace-tolerant).
    title_re = r'^#\s+' + re.escape(title) + r'\s*$'
    if re.match(title_re, rows[pos].strip(), re.IGNORECASE):
        pos = skip_blanks(pos + 1)

    # Drop a blockquote repeating the subtitle (containment either way,
    # to tolerate minor punctuation differences).
    if subtitle and pos < total and rows[pos].strip().startswith(">"):
        quote_text = rows[pos].strip().lstrip('>').strip()
        if subtitle in quote_text or quote_text in subtitle:
            pos = skip_blanks(pos + 1)

    # Drop a separator line ("---", possibly spaced like "- - -").
    if pos < total and rows[pos].strip().replace(' ', '') == '---':
        pos = skip_blanks(pos + 1)

    return '\n'.join(rows[pos:])
+
+
+def markdown_to_html(md_content: str) -> str:
+ """
+ 将 Markdown 转换为 HTML。
+ 简单实现,不依赖外部库。
+ """
+ lines = md_content.split('\n')
+ html_lines = []
+ in_code_block = False
+ code_lang = ''
+ code_content = []
+ in_list = False
+ in_table = False
+ table_rows = []
+
+ def process_inline(text: str) -> str:
+ """处理行内格式"""
+ # 代码
+ text = re.sub(r'`([^`]+)`', r'\1', text)
+ # 粗体
+ text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text)
+ # 斜体
+ text = re.sub(r'\*([^*]+)\*', r'\1', text)
+ # 链接
+ text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'\1', text)
+ return text
+
+ i = 0
+ while i < len(lines):
+ line = lines[i]
+
+ # 代码块
+ if line.startswith('```'):
+ if in_code_block:
+ html_lines.append(f'
{html.escape(chr(10).join(code_content))}
')
+ code_content = []
+ in_code_block = False
+ else:
+ code_lang = line[3:].strip() or 'text'
+ in_code_block = True
+ i += 1
+ continue
+
+ if in_code_block:
+ code_content.append(line)
+ i += 1
+ continue
+
+ # 表格
+ if '|' in line and not line.strip().startswith('```'):
+ cells = [c.strip() for c in line.split('|')]
+ cells = [c for c in cells if c] # 移除空单元格
+
+ if cells and not all(re.match(r'^[-:]+$', c) for c in cells):
+ if not in_table:
+ in_table = True
+ table_rows = []
+ table_rows.append(cells)
+ elif in_table and all(re.match(r'^[-:]+$', c) for c in cells):
+ pass # 跳过分隔行
+
+ # 检查下一行是否还是表格
+ if i + 1 >= len(lines) or '|' not in lines[i + 1]:
+ if in_table and table_rows:
+ html_lines.append('
')
+ for j, row in enumerate(table_rows):
+ tag = 'th' if j == 0 else 'td'
+ html_lines.append('
')
+ for cell in row:
+ html_lines.append(f'<{tag}>{process_inline(cell)}{tag}>')
+ html_lines.append('