style(docs): standardize heading formats and numbering

- Enforce Level 1-3 structural numbering based on the SUMMARY.md hierarchy
- Remove structural numbering from Level 4+ headings
- Eliminate single-child headings by converting them to bold text
- Auto-inject summary text for headings that have multiple children but no intro text
- Exclude Appendix chapters from structural numbering
- Avoid modifying code block contents
- Add a script to detect non-standard English usage in headings
2026-03-11 04:14:38 +00:00 · 2026-02-21 22:40:33 -08:00
parent 33af380be1
commit 175aaae48a
136 changed files with 1015 additions and 766 deletions
--- a/format_headings.py
+++ b/format_headings.py
@@ -0,0 +1,250 @@
+import os
+import re
+
+# Upper-cased English words that may appear in a heading title without
+# check_english() printing a notice (product names, CLI verbs, Dockerfile
+# instructions, common acronyms).
+ENG_ALLOWLIST = {
+    'DOCKER', 'KUBERNETES', 'XML', 'LLM', 'RAG',
+    'LINUX', 'UBUNTU', 'MAC', 'MACOS', 'WINDOWS',
+    'API', 'JSON', 'YAML', 'REGISTRY', 'HUB',
+    'REPOSITORY', 'TAG', 'IMAGE', 'CONTAINER', 'DEBIAN',
+    'FEDORA', 'CENTOS', 'RASPBERRY', 'PI', 'PULL',
+    'LIST', 'RM', 'COMMIT', 'BUILD', 'RUN',
+    'DAEMON', 'STOP', 'NEXUS', 'VOLUMES', 'TMPFS',
+    'DNS', 'PORT', 'BUILDX', 'BUILDKIT', 'COMPOSE',
+    'DJANGO', 'RAILS', 'WORDPRESS', 'LNMP', 'NAMESPACE',
+    'CGROUPS', 'UFS', 'PODMAN', 'PROMETHEUS', 'ELK',
+    'BUSYBOX', 'ALPINE', 'DEVOPS', 'ACTIONS', 'DRONE',
+    'IDE', 'VS', 'CODE', 'NGINX', 'PHP',
+    'NODE.JS', 'MYSQL', 'MONGODB', 'REDIS', 'MINIO',
+    'DOCKERD', 'TENCENTCLOUD', 'ALICLOUD', 'AWS', 'COREOS',
+    'KUBEADM', 'CONTAINERD', 'DESKTOP', 'KIND', 'K3S',
+    'SYSTEMD', 'DASHBOARD', 'KUBECTL', 'ETCD', 'ETCDCTL',
+    'VM', 'VAGRANT', 'LXC', 'GITHUB', 'GOOGLE',
+    'CLOUD', 'NPM', 'MAVEN', 'ACR', 'TCR',
+    'ECR', 'HARBOR', 'CNCF', 'SIGSTORE', 'NOTATION',
+    'SCOUT', 'TRIVY', 'CMD', 'ENTRYPOINT', 'ENV',
+    'ARG', 'VOLUME', 'EXPOSE', 'WORKDIR', 'USER',
+    'HEALTHCHECK', 'ONBUILD', 'LABEL', 'SHELL', 'COPY',
+    'ADD', 'DOCKERFILE', 'CI', 'CD', 'OS',
+}
+
+def parse_summary(summary_path='SUMMARY.md'):
+    """Build a per-file numbering context from the book's table of contents.
+
+    Each Markdown link bullet in the summary file is classified as one of:
+
+    * a chapter   -- top-level bullet whose title starts with ``第…章``;
+    * an appendix -- top-level bullet whose title starts with ``附录``;
+    * a section   -- any indented bullet.
+
+    Args:
+        summary_path: Path of the summary file. Defaults to ``SUMMARY.md``
+            in the current working directory.
+
+    Returns:
+        A dict mapping each linked file path (exactly as written in the
+        summary) to a context dict with the keys ``level`` (1 or 2),
+        ``title`` and ``is_app``; numbered entries additionally carry
+        ``chap_num`` and, for sections, ``sec_num``. An empty dict is
+        returned when the summary file does not exist.
+    """
+    try:
+        with open(summary_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+    except FileNotFoundError:
+        return {}
+
+    file_to_context = {}
+    chapter_idx = 0
+    section_idx = 0
+    # Set once an appendix *heading* ("## 附录…") has been seen; stays set.
+    is_appendix = False
+    # True while the most recent top-level bullet is an appendix entry, so
+    # its nested bullets are not numbered as sections of the last chapter.
+    under_appendix_entry = False
+
+    for line in content.split('\n'):
+        if '## 附录' in line or (line.startswith('## ') and '附录' in line):
+            is_appendix = True
+
+        m_chap = re.match(r'^\* \[(第[一二三四五六七八九十百]+章[^\]]*)\]\((.*?)\)', line)
+        if m_chap:
+            # Normalise "第一章 标题" / "第一章标题" to "第一章：标题".
+            title = m_chap.group(1).replace(' ', '：', 1)
+            if '：' not in title:
+                title = title.replace('章', '章：')
+            chapter_idx += 1
+            section_idx = 0
+            under_appendix_entry = False
+            file_to_context[m_chap.group(2)] = {
+                'level': 1,
+                'title': title,
+                'chap_num': chapter_idx,
+                'is_app': False
+            }
+            continue
+
+        m_app = re.match(r'^\* \[(附录[^\]]*)\]\((.*?)\)', line)
+        if m_app:
+            under_appendix_entry = True
+            file_to_context[m_app.group(2)] = {
+                'level': 1,
+                'title': m_app.group(1),
+                'is_app': True
+            }
+            continue
+
+        m_sec = re.match(r'^\s+\* \[(.*?)\]\((.*?)\)', line)
+        if m_sec:
+            title = m_sec.group(1)
+            filepath = m_sec.group(2)
+            section_idx += 1
+
+            if is_appendix or under_appendix_entry or 'appendix' in filepath:
+                file_to_context[filepath] = {
+                    'level': 2,
+                    'title': title,
+                    'is_app': True
+                }
+            else:
+                file_to_context[filepath] = {
+                    'level': 2,
+                    'title': title,
+                    'chap_num': chapter_idx,
+                    'sec_num': section_idx,
+                    'is_app': False
+                }
+
+    return file_to_context
+
+# An English "word": starts with a letter, may contain digits, and may have
+# dot-joined parts (e.g. "Node.js", "k3s"). Bare or trailing dots never match,
+# so numbering such as "1.2" and sentence punctuation are ignored.
+_ENGLISH_WORD_RE = re.compile(r'[A-Za-z][A-Za-z0-9]*(?:\.[A-Za-z0-9]+)*')
+
+
+def check_english(title, allowlist=None):
+    """Print a notice for every English word in *title* that is not allowed.
+
+    Words are compared upper-cased against the allowlist.
+
+    Args:
+        title: Heading text to inspect.
+        allowlist: Collection of upper-cased words to accept. Defaults to
+            the module-level ``ENG_ALLOWLIST``.
+
+    Returns:
+        The list of flagged words, in order of appearance.
+    """
+    if allowlist is None:
+        allowlist = ENG_ALLOWLIST
+    flagged = []
+    for w in _ENGLISH_WORD_RE.findall(title):
+        if w.upper() not in allowlist:
+            print(f"    [!] Notice: English word '{w}' in title: {title}")
+            flagged.append(w)
+    return flagged
+
+_HEADING_RE = re.compile(r'^(#{1,6})\s+(.*)')
+_LEADING_NUMBER_RE = re.compile(r'^[\d\.]+\s*')
+_DOTTED_NUMBER_RE = re.compile(r'^([\d\.]+)\s+(.*)')
+_SUMMARY_STUB = "\n\n涵盖了如下重点内容：\n\n"
+
+
+def _collect_headings(lines):
+    """Return the ATX headings found outside fenced code blocks, with children linked."""
+    headings = []
+    open_fence = None  # '```' or '~~~' while inside a fenced block
+    for idx, line in enumerate(lines):
+        stripped = line.strip()
+        if open_fence is None:
+            if stripped.startswith(('```', '~~~')):
+                open_fence = stripped[:3]
+                continue
+        else:
+            # Only the same fence character closes the block.
+            if stripped.startswith(open_fence):
+                open_fence = None
+            continue
+
+        match = _HEADING_RE.match(line)
+        if match:
+            headings.append({
+                'level': len(match.group(1)),
+                'title': match.group(2).strip(),
+                'line_idx': idx,
+                'children': [],
+            })
+
+    # A child is a heading exactly one level deeper that appears before the
+    # next heading of the same or a shallower level.
+    for i, h in enumerate(headings):
+        for j in range(i + 1, len(headings)):
+            child_level = headings[j]['level']
+            if child_level <= h['level']:
+                break
+            if child_level == h['level'] + 1:
+                h['children'].append(j)
+    return headings
+
+
+def _has_text_between(lines, start_idx, end_idx):
+    """Return True if any non-blank, non-'#' line lies strictly between the two indices."""
+    for text_ln in range(start_idx + 1, end_idx):
+        content = lines[text_ln].strip()
+        if content and not content.startswith('#'):
+            return True
+    return False
+
+
+def _strip_dotted_number(title):
+    """Drop a leading dotted number such as '1.2.3 ' but keep list-like '1. ' prefixes."""
+    m = _DOTTED_NUMBER_RE.match(title)
+    if m and '.' in m.group(1).strip('.'):
+        return m.group(2)
+    return title
+
+
+def _renumber_title(level, title, is_app, chap_num, h2_counter, h3_counter):
+    """Return *title* with the structural numbering that applies at *level*."""
+    if level == 1:
+        if not is_app:
+            return title
+        title = _LEADING_NUMBER_RE.sub('', title)
+        m = re.match(r'^(附录[一二三四五六七八九十]*)\s*(.*)', title)
+        if m:
+            label = m.group(1).strip()
+            rest = m.group(2).strip()
+            if rest.startswith((':', '：')):
+                rest = rest[1:].strip()
+            title = f"{label}：{rest}" if rest else label
+        return title
+
+    if level == 2:
+        clean_title = _LEADING_NUMBER_RE.sub('', title)
+        if is_app or h2_counter <= 0:
+            return clean_title
+        return f"{chap_num}.{h2_counter} {clean_title}"
+
+    if level == 3:
+        clean_title = _LEADING_NUMBER_RE.sub('', title)
+        if is_app:
+            return clean_title
+        if h2_counter > 0:
+            return f"{chap_num}.{h2_counter}.{h3_counter} {clean_title}"
+        # No section number available: the title is left as it was.
+        return title
+
+    # Level 4 and deeper never carry structural numbering.
+    return _strip_dotted_number(title)
+
+
+def process_file(filepath, context):
+    """Normalise the headings of one Markdown file in place.
+
+    The following rules are applied to headings outside fenced code blocks:
+
+    * Level 1-3 headings are renumbered from ``context`` (appendix files
+      have their numbering removed instead).
+    * Level 4+ headings lose any dotted structural number.
+    * A heading that is the only child of its parent is replaced by a bold
+      line (``**title**``) without its dotted number.
+    * A level 1-3 heading with two or more children and no text before the
+      first child gets a short introductory sentence appended.
+
+    Args:
+        filepath: Path of the Markdown file to rewrite.
+        context: Dict that may contain ``is_app``, ``chap_num`` and
+            ``sec_num``; missing keys default to ``False`` / ``0``.
+
+    Returns:
+        True if the file was rewritten, False if it could not be read or
+        needed no change.
+    """
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+    except (OSError, UnicodeError) as e:
+        print(f"Error reading {filepath}: {e}")
+        return False
+
+    headings = _collect_headings(lines)
+
+    is_app = context.get('is_app', False)
+    chap_num = context.get('chap_num', 0)
+    sec_num = context.get('sec_num', 0)
+
+    # h2_counter is fixed to the file's section number and never incremented.
+    h2_counter = sec_num if sec_num > 0 else 0
+    h3_counter = 0
+
+    actions = {}     # line index -> replacement text for that line
+    demoted = set()  # indices of headings that must become bold text
+
+    for i, h in enumerate(headings):
+        level = h['level']
+        title = h['title']
+        ln = h['line_idx']
+
+        check_english(title)
+
+        if i in demoted:
+            # Decided when the parent was visited. Emitting the bold line
+            # here, instead of at the parent, guarantees that renumbering
+            # can no longer overwrite it with a heading line.
+            h['title'] = _strip_dotted_number(title)
+            actions[ln] = f"**{h['title']}**\n\n"
+            if level == 2:
+                h3_counter = 0
+        else:
+            if level == 2:
+                h3_counter = 0
+            elif level == 3:
+                h3_counter += 1
+            new_title = _renumber_title(
+                level, title, is_app, chap_num, h2_counter, h3_counter
+            )
+            if new_title != title:
+                actions[ln] = f"{'#' * level} {new_title}\n"
+                h['title'] = new_title
+
+        children_indices = h['children']
+        if len(children_indices) == 1:
+            # Children always come later in the list, so the demotion is
+            # applied when the loop reaches that heading.
+            demoted.add(children_indices[0])
+        elif len(children_indices) >= 2 and level < 4:
+            first_child_ln = headings[children_indices[0]]['line_idx']
+            if not _has_text_between(lines, ln, first_child_ln):
+                base = actions.get(ln, lines[ln])
+                actions[ln] = base.rstrip() + _SUMMARY_STUB
+
+    if not actions:
+        return False
+
+    new_lines = [actions.get(i, line) for i, line in enumerate(lines)]
+    with open(filepath, 'w', encoding='utf-8') as f:
+        f.writelines(new_lines)
+    return True
+
+def main():
+    """Reformat every Markdown file of the book and report how many changed.
+
+    Files listed in SUMMARY.md are processed with their chapter/section
+    context. Every other ``.md`` file found under the current directory
+    (apart from a few well-known top-level documents) is processed with
+    ``{'is_app': True}``, which removes numbering rather than adding it.
+    """
+    file_contexts = parse_summary()
+    modified = 0
+    for filepath, context in file_contexts.items():
+        if os.path.exists(filepath):
+            if process_file(filepath, context):
+                modified += 1
+                print(f"  -> MODIFIED: {filepath}")
+
+    # Compare normalised paths so "./a/b.md" and "a/b.md" are the same file.
+    known_paths = {os.path.normpath(p) for p in file_contexts}
+    skip_dir_markers = ('.git', 'node_modules', '.gemini')
+    skip_files = {'SUMMARY.md', 'README.md', 'CONTRIBUTING.md', 'CHANGELOG.md'}
+
+    for root, dirs, files in os.walk('.'):
+        # Prune in place so os.walk never descends into skipped trees.
+        dirs[:] = [
+            d for d in dirs
+            if not any(marker in d for marker in skip_dir_markers)
+        ]
+        for file in files:
+            if not file.endswith('.md') or file in skip_files:
+                continue
+            clean_path = os.path.normpath(os.path.join(root, file))
+            if clean_path in known_paths:
+                continue
+            if process_file(clean_path, {'is_app': True}):
+                modified += 1
+                print(f"  -> MODIFIED: {clean_path}")
+
+    print(f"\nTotal Modified {modified} files")
+
+
+if __name__ == "__main__":
+    main()