mirror of
https://github.com/yeasy/docker_practice.git
synced 2026-03-10 11:54:37 +00:00
1002 lines
32 KiB
Go
1002 lines
32 KiB
Go
#!/usr/bin/env python3
|
||
"""
|
||
通用书籍合并工具 (Generic Book Combiner)
|
||
|
||
功能:
|
||
1. 自动扫描当前或指定目录。
|
||
2. 解析 SUMMARY.md 获取章节结构。
|
||
3. 解析 README.md 获取书籍标题和简介信息。
|
||
4. 生成 single-page.md 和 single-page.html。
|
||
"""
|
||
|
||
import re
|
||
import html
|
||
import argparse
|
||
import sys
|
||
from pathlib import Path
|
||
from datetime import datetime
|
||
|
||
# HTML page template for the single-page edition.
#
# The template is filled in with str.format(), so the only substitution fields
# are title, subtitle, content and date; every literal CSS/JS brace is doubled.
# The values substituted for title and subtitle should be HTML-escaped by the
# caller because they land in element content.
#
# Client-side hardening compared with a naive version:
#   * the stored theme is validated, and localStorage access is wrapped in
#     try/catch, because storage can be blocked or hold a stale value;
#   * updateButtons() tolerates an unknown theme instead of throwing;
#   * Mermaid setup is skipped when the CDN script did not load, so diagrams
#     simply stay as code blocks when reading offline.
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="zh-CN" data-theme="dark">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{title}</title>
    <style>
        :root {{
            /* Common variables */
            --transition-speed: 0.3s;
        }}

        /* Dark Theme (Default/Cyberpunk) */
        :root[data-theme="dark"] {{
            --bg-color: #1a1a2e;
            --text-color: #e4e4e4;
            --heading-color: #00d4ff;
            --link-color: #00d4ff;
            --code-bg: #16213e;
            --border-color: #0f3460;
            --accent: #e94560;
            --quote-bg: rgba(233, 69, 96, 0.1);
            --toc-bg: #16213e;
            --table-even-bg: rgba(15, 52, 96, 0.3);
            --th-bg: #16213e;
        }}

        /* Light Theme */
        :root[data-theme="light"] {{
            --bg-color: #ffffff;
            --text-color: #333333;
            --heading-color: #2c3e50;
            --link-color: #0366d6;
            --code-bg: #f6f8fa;
            --border-color: #eaecef;
            --accent: #0366d6;
            --quote-bg: #f0f7ff;
            --toc-bg: #f6f8fa;
            --table-even-bg: #f6f8fa;
            --th-bg: #f6f8fa;
        }}

        /* Sepia Theme */
        :root[data-theme="sepia"] {{
            --bg-color: #f4ecd8;
            --text-color: #5b4636;
            --heading-color: #433422;
            --link-color: #a44806;
            --code-bg: #eaddcf;
            --border-color: #d3cabd;
            --accent: #a44806;
            --quote-bg: #eaddcf;
            --toc-bg: #eaddcf;
            --table-even-bg: #eaddcf;
            --th-bg: #eaddcf;
        }}

        * {{
            box-sizing: border-box;
            transition: background-color var(--transition-speed), color var(--transition-speed), border-color var(--transition-speed);
        }}

        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            line-height: 1.8;
            color: var(--text-color);
            background: var(--bg-color);
            max-width: 900px;
            margin: 0 auto;
            padding: 2rem;
        }}

        h1, h2, h3, h4, h5, h6 {{
            color: var(--heading-color);
            margin-top: 2rem;
            margin-bottom: 1rem;
            border-bottom: 1px solid var(--border-color);
            padding-bottom: 0.5rem;
        }}

        h1 {{ font-size: 2.5rem; border-bottom: 3px solid var(--accent); }}
        h2 {{ font-size: 2rem; }}
        h3 {{ font-size: 1.5rem; border-bottom: none; }}
        h4, h5, h6 {{ border-bottom: none; }}

        a {{
            color: var(--link-color);
            text-decoration: none;
        }}

        a:hover {{
            text-decoration: underline;
        }}

        code {{
            background: var(--code-bg);
            padding: 0.2rem 0.4rem;
            border-radius: 4px;
            font-family: 'SF Mono', 'Fira Code', Consolas, monospace;
            font-size: 0.9em;
        }}

        pre {{
            background: var(--code-bg);
            padding: 1rem;
            border-radius: 8px;
            overflow-x: auto;
            border: 1px solid var(--border-color);
        }}

        pre code {{
            padding: 0;
            background: none;
        }}

        blockquote {{
            border-left: 4px solid var(--accent);
            margin: 1rem 0;
            padding: 0.5rem 1rem;
            background: var(--quote-bg);
            border-radius: 0 8px 8px 0;
        }}

        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 1rem 0;
        }}

        th, td {{
            border: 1px solid var(--border-color);
            padding: 0.75rem;
            text-align: left;
        }}

        th {{
            background: var(--th-bg);
            color: var(--heading-color);
        }}

        tr:nth-child(even) {{
            background: var(--table-even-bg);
        }}

        hr {{
            border: none;
            border-top: 2px solid var(--border-color);
            margin: 3rem 0;
        }}

        img {{
            max-width: 100%;
            height: auto;
        }}

        ul, ol {{
            padding-left: 1.5rem;
        }}

        li {{
            margin: 0.5rem 0;
        }}

        .toc {{
            background: var(--toc-bg);
            padding: 1.5rem;
            border-radius: 8px;
            margin: 2rem 0;
            border: 1px solid var(--border-color);
        }}

        .toc h2 {{
            margin-top: 0;
            border-bottom: none;
        }}

        .toc ul {{
            list-style: none;
            padding-left: 0;
        }}

        .toc li {{
            margin: 0.3rem 0;
        }}

        .toc a {{
            color: var(--text-color);
        }}

        .toc a:hover {{
            color: var(--link-color);
        }}

        /* Theme Switcher Styles */
        .theme-switch {{
            position: fixed;
            top: 20px;
            right: 20px;
            background: var(--code-bg);
            border: 1px solid var(--border-color);
            border-radius: 8px;
            padding: 5px;
            display: flex;
            gap: 5px;
            z-index: 1000;
            opacity: 0.8;
            transition: opacity 0.3s;
        }}
        .theme-switch:hover {{
            opacity: 1;
        }}
        .theme-btn {{
            background: none;
            border: none;
            cursor: pointer;
            padding: 5px 10px;
            border-radius: 4px;
            font-size: 14px;
            color: var(--text-color);
            transition: 0.2s;
        }}
        .theme-btn:hover {{
            background: var(--border-color);
        }}
        .theme-btn.active {{
            background: var(--accent);
            color: white;
        }}

        .chapter-marker {{
            display: none;
        }}

        .header {{
            text-align: center;
            padding: 2rem 0;
            border-bottom: 3px solid var(--accent);
            margin-bottom: 2rem;
        }}

        .header h1 {{
            border: none;
            margin: 0;
        }}

        .header p {{
            color: #888;
            margin: 0.5rem 0 0 0;
        }}

        @media (max-width: 768px) {{
            body {{
                padding: 1rem;
            }}

            h1 {{ font-size: 1.8rem; }}
            h2 {{ font-size: 1.5rem; }}
            h3 {{ font-size: 1.2rem; }}

            .theme-switch {{
                top: 10px;
                right: 10px;
            }}
        }}

        @media print {{
            body {{
                background: white;
                color: black;
                max-width: none;
            }}

            h1, h2, h3, h4, h5, h6 {{
                color: black;
            }}

            pre, code, .toc {{
                background: #f5f5f5;
            }}

            .theme-switch {{
                display: none;
            }}
        }}
    </style>
    <script>
        // Init theme immediately to prevent flash
        (function() {{
            var savedTheme = null;
            try {{
                savedTheme = localStorage.getItem('theme');
            }} catch (e) {{
                // localStorage can be blocked; fall back to the default theme.
            }}
            // Ignore stale or unknown values so the page never ends up unstyled.
            if (['dark', 'light', 'sepia'].indexOf(savedTheme) === -1) {{
                savedTheme = 'dark';
            }}
            document.documentElement.setAttribute('data-theme', savedTheme);
        }})();
    </script>
</head>
<body>
    <div class="theme-switch">
        <button class="theme-btn" onclick="setTheme('dark')" id="btn-dark">🌙</button>
        <button class="theme-btn" onclick="setTheme('light')" id="btn-light">☀️</button>
        <button class="theme-btn" onclick="setTheme('sepia')" id="btn-sepia">☕</button>
    </div>

    <div class="header">
        <h1>{title}</h1>
        <p>{subtitle}</p>
    </div>

    {content}

    <hr>
    <footer style="text-align: center; color: #666; padding: 2rem 0;">
        <p>{date}</p>
    </footer>
    <script src="https://cdn.jsdelivr.net/npm/mermaid/dist/mermaid.min.js"></script>
    <script>
        // Theme Logic
        function setTheme(theme) {{
            document.documentElement.setAttribute('data-theme', theme);
            try {{
                localStorage.setItem('theme', theme);
            }} catch (e) {{
                // Storage unavailable; the theme still applies to this page view.
            }}
            updateButtons(theme);

            // Re-render mermaid if needed (theme change might require config update,
            // but for now simple CSS swap is usually enough for diagrams if using transparent backgrounds.
            // However, Mermaid default 'dark' theme might look bad on light.
            // Ideally we reload or re-init, but that's complex.
            // For now, let's keep Mermaid dark-ish or neutral.)
        }}

        function updateButtons(theme) {{
            document.querySelectorAll('.theme-btn').forEach(btn => btn.classList.remove('active'));
            // An unknown theme has no matching button; do not throw in that case.
            const activeBtn = document.getElementById('btn-' + theme);
            if (activeBtn) {{
                activeBtn.classList.add('active');
            }}
        }}

        // Init Buttons
        document.addEventListener("DOMContentLoaded", function() {{
            const currentTheme = document.documentElement.getAttribute('data-theme');
            updateButtons(currentTheme);

            // The CDN script may have failed to load (for example when offline);
            // leave diagrams as plain code blocks in that case.
            if (typeof mermaid === 'undefined') {{
                return;
            }}

            // Find all code blocks with language-mermaid
            var mermaidBlocks = document.querySelectorAll('pre code.language-mermaid');

            mermaidBlocks.forEach(function(block) {{
                var pre = block.parentElement;
                var div = document.createElement('div');
                div.className = 'mermaid';
                div.textContent = block.textContent;

                // Replace pre with div
                pre.parentNode.replaceChild(div, pre);
            }});

            // Initialize mermaid
            const isDark = currentTheme === 'dark';
            mermaid.initialize({{
                startOnLoad: true,
                theme: 'base',
                themeVariables: {{
                    darkMode: isDark,
                    background: 'transparent',
                    lineColor: isDark ? '#e4e4e4' : '#333333',
                    stroke: isDark ? '#e4e4e4' : '#333333',
                    primaryTextColor: isDark ? '#e4e4e4' : '#333333',
                    secondaryColor: isDark ? '#16213e' : '#f6f8fa',
                    tertiaryColor: isDark ? '#16213e' : '#f6f8fa'
                }},
                // NOTE(review): 'loose' permits HTML and click handlers inside
                // diagrams; confirm the book sources are trusted.
                securityLevel: 'loose'
            }});
        }});
    </script>
</body>
</html>
"""
|
||
|
||
|
||
def extract_book_info(project_dir: Path) -> tuple[str, str]:
    """
    Extract the book title and subtitle from README.md or SUMMARY.md.

    README.md is consulted first: the first ``# heading`` within its first
    10 lines becomes the title and the first ``> quote`` within its first
    20 lines becomes the subtitle. If README.md yields no title (missing,
    unreadable, or no H1), the first ``# heading`` within the first 5 lines
    of SUMMARY.md is used instead; a subtitle found in README.md is kept.

    Args:
        project_dir: Directory expected to contain README.md / SUMMARY.md.

    Returns:
        (title, subtitle); defaults are "Untitled Book" / "Generated Book".
    """
    title = "Untitled Book"
    subtitle = "Generated Book"

    def read_lines(path: Path) -> list[str]:
        # Best effort: a missing or undecodable file just contributes nothing.
        try:
            return path.read_text(encoding='utf-8').split('\n')
        except (OSError, UnicodeDecodeError):
            return []

    def first_capture(pattern: str, candidates: list[str]):
        # Return the stripped first capture group of the first matching line.
        for candidate in candidates:
            found = re.match(pattern, candidate)
            if found:
                return found.group(1).strip()
        return None

    readme_lines = read_lines(project_dir / 'README.md')

    readme_subtitle = first_capture(r'^>\s+(.+)$', readme_lines[:20])
    if readme_subtitle is not None:
        subtitle = readme_subtitle

    readme_title = first_capture(r'^#\s+(.+)$', readme_lines[:10])
    if readme_title is not None:
        return readme_title, subtitle

    # README gave no title: fall back to SUMMARY.md instead of giving up.
    summary_lines = read_lines(project_dir / 'SUMMARY.md')
    summary_title = first_capture(r'^#\s+(.+)$', summary_lines[:5])
    if summary_title is not None:
        title = summary_title

    return title, subtitle
|
||
|
||
|
||
def parse_summary(summary_path: Path) -> list[tuple[str, str, int]]:
    """
    Parse SUMMARY.md and collect every chapter link.

    A chapter entry is a Markdown list item (``*``, ``-`` or ``+`` bullet,
    at any indentation) whose text starts with ``[title](target)``.
    Entries pointing at external URLs are skipped.

    Args:
        summary_path: Path of the SUMMARY.md file.

    Returns:
        List of (title, file_path, indent_level) in document order, where
        indent_level is the number of leading whitespace characters.
        Empty when the file does not exist.
    """
    entries = []
    if not summary_path.exists():
        return entries

    content = summary_path.read_text(encoding='utf-8')

    # "* [Title](path)", "- [Title](path)" or "+ [Title](path)", any indent.
    entry_re = re.compile(r'^(\s*)[\*\-\+]\s*\[([^\]]+)\]\(([^)]+)\)')
    # Match real URL prefixes only: a bare startswith('http') would also
    # drop local chapters such as "http_api/README.md".
    external_prefixes = ('http://', 'https://', 'ftp://', 'mailto:', '//')

    for line in content.split('\n'):
        match = entry_re.match(line)
        if not match:
            continue
        indent_text, title, file_path = match.groups()
        if file_path.startswith(external_prefixes):
            continue
        entries.append((title, file_path, len(indent_text)))

    return entries
|
||
|
||
|
||
def convert_internal_links_to_anchors(content: str, file_to_anchor_map: dict[str, str]) -> str:
    """
    Convert internal markdown file links to anchor links for single-page output.

    Examples:
        [Title](1.2_xxx.md) -> [Title](#anchor-id)
        [Title](../04_mcp/README.md) -> [Title](#anchor-id)
        [Title](file.md#section) -> [Title](#section)

    Resolution order for a link target without a fragment:
        1. exact key in ``file_to_anchor_map``;
        2. first map entry with the same file name (for README.md the
           parent directory must match as well);
        3. an anchor slugified from the link text.

    Links that carry a URL scheme (http:, https:, mailto:, ftp:, tel:, ...),
    protocol-relative links (``//host/...``), anchor-only links and image
    links are returned untouched.

    Args:
        content: The markdown content to process
        file_to_anchor_map: Mapping from file paths to their anchor IDs

    Returns:
        Content with internal links converted to anchors
    """
    # Local import: the module's top-level imports only bring in Path.
    from pathlib import PurePosixPath

    # A scheme needs at least two characters so a Windows drive letter
    # ("C:...") is not mistaken for one.
    external_re = re.compile(r'^(?:[a-zA-Z][a-zA-Z0-9+\-]+:|//)')

    def replace_link(match):
        link_text = match.group(1)
        link_target = match.group(2)

        # External URLs and in-page anchors are already correct.
        if link_target.startswith('#') or external_re.match(link_target):
            return match.group(0)

        # An explicit fragment wins over the file part.
        if '#' in link_target:
            anchor = link_target.split('#', 1)[1]
            return f'[{link_text}](#{anchor})'

        normalized_path = link_target.replace('\\', '/').strip()

        # 1. Exact path match.
        if normalized_path in file_to_anchor_map:
            return f'[{link_text}](#{file_to_anchor_map[normalized_path]})'

        # 2. Match by file name (for links like ../04_mcp/README.md).
        filename = PurePosixPath(normalized_path).name
        is_readme = filename == 'README.md'

        # README.md is ambiguous across chapters, so it additionally needs
        # the directory it sits in.
        wanted_parent = None
        if is_readme:
            parts = normalized_path.replace('../', '').replace('./', '').split('/')
            if len(parts) >= 2:
                wanted_parent = parts[-2]

        for path, anchor in file_to_anchor_map.items():
            if PurePosixPath(path).name != filename:
                continue
            if is_readme:
                path_parts = path.split('/')
                if (
                    wanted_parent is not None
                    and len(path_parts) >= 2
                    and path_parts[-2] == wanted_parent
                ):
                    return f'[{link_text}](#{anchor})'
                continue
            return f'[{link_text}](#{anchor})'

        # 3. Unknown file: derive an anchor from the link text.
        fallback_anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', link_text.lower()).strip('-')
        return f'[{link_text}](#{fallback_anchor})'

    # Match [text](target) but not image links ![alt](src): the negative
    # lookbehind rejects a preceding "!".
    pattern = r'(?<!!)\[([^\]]+)\]\(([^)]+)\)'
    return re.sub(pattern, replace_link, content)
|
||
|
||
|
||
def fix_image_paths(content: str, file_path: str) -> str:
    """
    Fix relative image paths for single-page output.

    When combining files from different directories, relative image paths like
    `_images/xxx.png` need to be prefixed with the source file's directory.

    Examples:
        If file is from 07_coding/7.4_ide.md:
        ![alt](_images/xxx.png) -> ![alt](07_coding/_images/xxx.png)

    Left unchanged: external URLs, ``data:`` URIs, absolute paths, paths that
    start with ``../``, and paths that already contain ``/_images/`` after a
    directory prefix.

    Args:
        content: The markdown content to process
        file_path: The relative path of the source file (e.g., "07_coding/7.4_ide.md")

    Returns:
        Content with fixed image paths
    """
    # Local import: the module's top-level imports only bring in Path.
    from pathlib import PurePosixPath

    # Directory of the source file, relative to the book root.
    source_dir = str(PurePosixPath(file_path).parent)

    # Files in the root directory already have root-relative image paths.
    if source_dir == '.':
        return content

    def replace_image(match):
        alt_text = match.group(1)
        image_path = match.group(2)

        # External URLs, inline data and absolute paths need no prefix.
        if image_path.startswith(('http://', 'https://', 'data:', '/')):
            return match.group(0)

        # Paths that already look root-relative (e.g. 07_coding/_images/...)
        # or that climb out of the directory are left alone.
        if not image_path.startswith(('_images/', './_images/')):
            if '/_images/' in image_path or image_path.startswith('../'):
                return match.group(0)

        # Drop leading "./" segments only. str.lstrip('./') would strip a
        # character set and corrupt names such as ".assets/x.png".
        clean_path = image_path
        while clean_path.startswith('./'):
            clean_path = clean_path[2:].lstrip('/')

        new_path = f"{source_dir}/{clean_path}"
        return f'![{alt_text}]({new_path})'

    # Match markdown image syntax: ![alt](path)
    pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    return re.sub(pattern, replace_image, content)
|
||
|
||
|
||
# Lines that count as trailing navigation. They are matched against the
# stripped line, case-insensitively. The arrow pattern accepts anything
# after the arrow, so it also covers the "arrow + [Title](Link)" form and no
# separate check is needed for it.
_NAV_LINE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'^\s*[-=]{3,}\s*$',  # Separator lines
        r'^\s*(\*\*|__)?(Next|Previous|下一[节章页]|上一[节章页])(\*\*|__)?.*$',  # Text based
        r'^\s*(➡️|→|=>|==>|Example|Download)\s*.*$',  # Arrow/Indicator based
        r'^\s*\[(Next|Previous|下一[节章]|上一[节章]).*?\]\(.*?\)\s*$',  # Link with nav text
    )
)


def clean_navigation_links(content: str) -> str:
    """
    Remove navigation links (Next/Previous, arrows) from the end of the content.

    Working backwards from the last line, blank lines and lines matching one
    of the navigation patterns are dropped until the first ordinary line is
    reached. Nothing before that line is touched.

    Args:
        content: Markdown text of one chapter.

    Returns:
        The content without trailing navigation/blank lines (no trailing
        newline). An input made only of such lines yields "".
    """
    lines = content.rstrip().split('\n')

    while lines:
        last_line = lines[-1].strip()
        # Stop at the first line that is neither blank nor navigation.
        if last_line and not any(p.match(last_line) for p in _NAV_LINE_PATTERNS):
            break
        lines.pop()

    return '\n'.join(lines)
|
||
|
||
|
||
def clean_redundant_header(content: str, title: str, subtitle: str) -> str:
    """
    Remove the title and subtitle from the beginning of the content if they match the book info.

    Steps, each applied to the first remaining non-blank line:
        1. drop an H1 equal to ``title`` (case-insensitive);
        2. drop a blockquote line whose text contains, or is contained in,
           ``subtitle`` (an empty ``>`` line never matches);
        3. drop a ``---`` separator line.
    Leading blank lines, and blank lines after each dropped line, are removed.

    Args:
        content: Markdown text of one chapter.
        title: Book title to strip.
        subtitle: Book subtitle to strip; falsy disables step 2.

    Returns:
        The remaining content. Content consisting only of blank lines is
        returned unchanged.
    """
    lines = content.split('\n')
    total = len(lines)

    def skip_blank(index: int) -> int:
        # Advance past blank lines; an index avoids repeated O(n) pop(0).
        while index < total and not lines[index].strip():
            index += 1
        return index

    pos = skip_blank(0)
    if pos == total:
        return content

    # Title (H1): "# Title", tolerant of surrounding whitespace and case.
    title_re = r'^#\s+' + re.escape(title) + r'\s*$'
    if re.match(title_re, lines[pos].strip(), re.IGNORECASE):
        pos = skip_blank(pos + 1)

    # Subtitle (blockquote).
    if subtitle and pos < total and lines[pos].strip().startswith(">"):
        line_text = lines[pos].strip().lstrip('>').strip()
        # Require real text: '' is a substring of every subtitle, which would
        # otherwise delete a bare ">" that opens an unrelated quote.
        if line_text and (subtitle in line_text or line_text in subtitle):
            pos = skip_blank(pos + 1)

    # Separator lines like "---" that often follow the header.
    if pos < total and lines[pos].strip().replace(' ', '') == '---':
        pos = skip_blank(pos + 1)

    return '\n'.join(lines[pos:])
|
||
|
||
|
||
def markdown_to_html(md_content: str) -> str:
    """
    Convert Markdown to HTML.

    A deliberately small, dependency-free converter. It handles fenced code
    blocks, pipe tables, ATX headings (with generated ``id`` attributes),
    single-line blockquotes, horizontal rules, flat unordered/ordered lists
    and paragraphs, plus inline code, bold, italic, images and links.
    Lines starting with ``<!--`` are dropped; other raw HTML is passed
    through unescaped inside a paragraph.

    Args:
        md_content: Markdown source text.

    Returns:
        HTML fragment with one element per line, joined by newlines.
    """
    lines = md_content.split('\n')
    html_lines = []
    in_code_block = False
    code_lang = ''
    code_content = []
    in_list = False
    in_table = False
    table_rows = []

    def process_inline(text: str) -> str:
        """Render inline markup (code, images, bold, italic, links)."""
        # Finished fragments are parked behind NUL-delimited placeholders so
        # later passes cannot re-interpret their contents (e.g. "*" inside
        # inline code, or an image being turned into "!<a ...>").
        protected = []

        def park(fragment: str) -> str:
            protected.append(fragment)
            return f'\x00{len(protected) - 1}\x00'

        # Inline code: escaped, like fenced code blocks.
        text = re.sub(
            r'`([^`]+)`',
            lambda m: park(f'<code>{html.escape(m.group(1))}</code>'),
            text,
        )
        # Images, handled before links so the link pattern cannot claim them.
        text = re.sub(
            r'!\[([^\]]*)\]\(([^)]+)\)',
            lambda m: park(
                f'<img src="{html.escape(m.group(2))}" alt="{html.escape(m.group(1))}">'
            ),
            text,
        )
        # Bold
        text = re.sub(r'\*\*([^*]+)\*\*', r'<strong>\1</strong>', text)
        # Italic
        text = re.sub(r'\*([^*]+)\*', r'<em>\1</em>', text)
        # Links
        text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<a href="\2">\1</a>', text)

        # Restore newest-first so a fragment that itself contains an older
        # placeholder is expanded completely.
        for idx in range(len(protected) - 1, -1, -1):
            text = text.replace(f'\x00{idx}\x00', protected[idx])
        return text

    def emit_code_block() -> None:
        escaped = html.escape('\n'.join(code_content))
        html_lines.append(f'<pre><code class="language-{code_lang}">{escaped}</code></pre>')

    separator_cell = re.compile(r'^[-:]+$')

    i = 0
    while i < len(lines):
        line = lines[i]

        # Fenced code block: opening or closing fence.
        if line.startswith('```'):
            if in_code_block:
                emit_code_block()
                code_content = []
                in_code_block = False
            else:
                # Only the first word of the info string names the language;
                # escape it because it lands inside an attribute.
                info = line[3:].strip()
                code_lang = html.escape(info.split()[0]) if info else 'text'
                in_code_block = True
            i += 1
            continue

        if in_code_block:
            code_content.append(line)
            i += 1
            continue

        # Table: any line containing a pipe is treated as a table row.
        if '|' in line and not line.strip().startswith('```'):
            cells = [c.strip() for c in line.split('|')]
            cells = [c for c in cells if c]  # drop empty cells

            # The |---|---| separator row carries no content and is skipped.
            is_separator = all(separator_cell.match(c) for c in cells)
            if cells and not is_separator:
                if not in_table:
                    in_table = True
                    table_rows = []
                table_rows.append(cells)

            # Flush once the next line is no longer part of the table.
            if i + 1 >= len(lines) or '|' not in lines[i + 1]:
                if in_table and table_rows:
                    html_lines.append('<table>')
                    for j, row in enumerate(table_rows):
                        tag = 'th' if j == 0 else 'td'
                        html_lines.append('<tr>')
                        for cell in row:
                            html_lines.append(f'<{tag}>{process_inline(cell)}</{tag}>')
                        html_lines.append('</tr>')
                    html_lines.append('</table>')
                table_rows = []
                in_table = False
            i += 1
            continue

        # Heading
        heading_match = re.match(r'^(#{1,6})\s+(.+)$', line)
        if heading_match:
            level = len(heading_match.group(1))
            text = process_inline(heading_match.group(2))
            # Simple ID derived from the raw heading text.
            anchor = re.sub(r'[^\w\u4e00-\u9fff]+', '-', heading_match.group(2).lower()).strip('-')
            html_lines.append(f'<h{level} id="{anchor}">{text}</h{level}>')
            i += 1
            continue

        # Blockquote
        if line.startswith('>'):
            quote_text = process_inline(line[1:].strip())
            html_lines.append(f'<blockquote><p>{quote_text}</p></blockquote>')
            i += 1
            continue

        # Horizontal rule
        if re.match(r'^-{3,}$|^\*{3,}$|^_{3,}$', line.strip()):
            html_lines.append('<hr>')
            i += 1
            continue

        # Unordered list
        list_match = re.match(r'^(\s*)[\*\-]\s+(.+)$', line)
        if list_match:
            if not in_list:
                html_lines.append('<ul>')
                in_list = True
            html_lines.append(f'<li>{process_inline(list_match.group(2))}</li>')
            # Close the list when the next line is not another item.
            if i + 1 >= len(lines) or not re.match(r'^\s*[\*\-]\s+', lines[i + 1]):
                html_lines.append('</ul>')
                in_list = False
            i += 1
            continue

        # Ordered list
        ol_match = re.match(r'^(\s*)\d+\.\s+(.+)$', line)
        if ol_match:
            if not in_list:
                html_lines.append('<ol>')
                in_list = True
            html_lines.append(f'<li>{process_inline(ol_match.group(2))}</li>')
            # Close the list when the next line is not another item.
            if i + 1 >= len(lines) or not re.match(r'^\s*\d+\.\s+', lines[i + 1]):
                html_lines.append('</ol>')
                in_list = False
            i += 1
            continue

        # Blank line
        if not line.strip():
            i += 1
            continue

        # Comment line (skipped)
        if line.strip().startswith('<!--'):
            i += 1
            continue

        # Plain paragraph
        html_lines.append(f'<p>{process_inline(line)}</p>')
        i += 1

    # An unterminated fence would otherwise silently drop its content.
    if in_code_block:
        emit_code_block()

    return '\n'.join(html_lines)
|
||
|
||
|
||
def combine_book(project_dir: Path, output_md: str = 'single-page.md', output_html: str = 'single-page.html'):
    """
    Merge the whole book into single Markdown and HTML files.

    Reads SUMMARY.md in ``project_dir`` for the chapter order, then for each
    chapter file that exists: strips trailing navigation lines and a repeated
    book header, rewrites internal links to in-page anchors, fixes image
    paths and prepends a ``<div id=...>`` anchor derived from the chapter
    title. The result is written to ``output_md`` (with a title header) and,
    rendered through HTML_TEMPLATE, to ``output_html``. Progress and a final
    summary are printed to stdout.

    Args:
        project_dir: Book root directory containing SUMMARY.md.
        output_md: File name of the Markdown output, relative to project_dir.
        output_html: File name of the HTML output, relative to project_dir.

    Returns:
        None. Returns early, writing nothing, when SUMMARY.md is missing or
        lists no chapter links.
    """
    # Local import: the module's top-level imports only bring in Path.
    from pathlib import PurePosixPath

    def make_anchor(chapter_title: str) -> str:
        # Single definition of the slug, so the link map and the emitted
        # <div id> anchors cannot drift apart.
        return re.sub(r'[^\w\u4e00-\u9fff]+', '-', chapter_title.lower()).strip('-')

    summary_path = project_dir / 'SUMMARY.md'

    if not summary_path.exists():
        print(f"❌ 错误: 在 {project_dir} 中找不到 SUMMARY.md")
        return

    # Book metadata
    book_title, book_subtitle = extract_book_info(project_dir)
    print(f"📘 书籍: {book_title}")
    if book_subtitle:
        print(f" 副标题: {book_subtitle}")

    # Chapter structure
    entries = parse_summary(summary_path)
    print(f"📚 找到 {len(entries)} 个章节条目")

    if not entries:
        print("⚠️ SUMMARY.md 中没有找到有效的章节链接")
        return

    # Pass 1: map file paths to anchors (existing files only).
    file_to_anchor_map = {}
    for title, file_path, indent in entries:
        if not (project_dir / file_path).exists():
            continue
        anchor = make_anchor(title)
        file_to_anchor_map[file_path] = anchor
        # Also register the bare file name, except for README.md, which
        # occurs in many directories and would collide.
        filename = PurePosixPath(file_path).name
        if filename != 'README.md':
            file_to_anchor_map[filename] = anchor

    print(f"🔗 构建了 {len(file_to_anchor_map)} 个链接映射")

    # Header for the Markdown output; the HTML template has its own header.
    md_header = [f"# {book_title}\n"]
    if book_subtitle:
        md_header.append(f"> {book_subtitle}\n")
    md_header.append("---\n")

    md_body = []
    processed_count = 0
    skipped_count = 0

    # Pass 2: clean each chapter and rewrite its links.
    for title, file_path, indent in entries:
        full_path = project_dir / file_path

        if not full_path.exists():
            print(f" ⚠️ 找不到文件: {file_path}")
            skipped_count += 1
            continue

        # Best effort per chapter: one unreadable file must not abort the
        # whole book, so any failure is reported and counted as skipped.
        try:
            content = full_path.read_text(encoding='utf-8')

            # Strip trailing navigation links.
            content = clean_navigation_links(content)

            # Strip a repeated book title header (README.md and the like).
            content = clean_redundant_header(content, book_title, book_subtitle)

            # Turn internal file links into in-page anchors.
            content = convert_internal_links_to_anchors(content, file_to_anchor_map)

            # Make image paths relative to the book root.
            content = fix_image_paths(content, file_path)

            md_body.append(f"\n\n<!-- FILE: {file_path} -->\n")
            md_body.append(f'<div id="{make_anchor(title)}"></div>\n')  # jump target
            md_body.append(content)
            md_body.append("\n")

            processed_count += 1
            print(f" ✅ {title}")

        except Exception as e:
            print(f" ❌ 读取失败 {file_path}: {e}")
            skipped_count += 1

    # Markdown output (with the title header); collapse runs of blank lines.
    final_md = '\n'.join(md_header + md_body)
    final_md = re.sub(r'\n{4,}', '\n\n\n', final_md)

    md_path = project_dir / output_md
    md_path.write_text(final_md, encoding='utf-8')

    # HTML output (body only, because the template already has a header).
    print("\n🔄 正在生成 HTML...")
    final_html_md = '\n'.join(md_body)
    html_content = markdown_to_html(final_html_md)

    current_date = datetime.now().strftime("%Y-%m-%d")
    final_html = HTML_TEMPLATE.format(
        # Title and subtitle come from arbitrary README text; escape them so
        # characters such as "&" or "<" cannot break the page markup.
        title=html.escape(book_title),
        subtitle=html.escape(book_subtitle),
        content=html_content,
        date=current_date,
    )

    html_path = project_dir / output_html
    html_path.write_text(final_html, encoding='utf-8')

    # Statistics
    md_size = md_path.stat().st_size
    html_size = html_path.stat().st_size

    print(f"\n{'=' * 50}")
    print("📖 合并完成!")
    print(f" 输出目录: {project_dir}")
    print(f" Markdown: {md_path.name} ({md_size / 1024:.1f} KB)")
    print(f" HTML: {html_path.name} ({html_size / 1024:.1f} KB)")
    print(f" 处理章节: {processed_count}")
    print(f" 跳过文件: {skipped_count}")
    print(f"{'=' * 50}")
|
||
|
||
|
||
def main():
    """
    Command-line entry point.

    Parses the optional book directory (default: current directory) and the
    two output file names, exits with status 1 when the directory does not
    exist, prints a banner and then runs combine_book().
    """
    cli = argparse.ArgumentParser(description="GitBook 书籍自动合并工具")
    cli.add_argument("path", nargs="?", default=".", help="书籍根目录路径 (包含 SUMMARY.md 的目录)")
    cli.add_argument("--output-md", default="single-page.md", help="输出 Markdown 文件名")
    cli.add_argument("--output-html", default="single-page.html", help="输出 HTML 文件名")
    options = cli.parse_args()

    book_root = Path(options.path).resolve()
    if not book_root.is_dir():
        print(f"❌ 错误: 目录不存在: {book_root}")
        sys.exit(1)

    banner = "=" * 50
    print(banner)
    print("通用书籍合并工具 v2.0")
    print(banner)
    print(f"工作目录: {book_root}\n")

    combine_book(book_root, options.output_md, options.output_html)
|
||
|
||
|
||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|