style(docs): standardize heading formats and numbering

- Enforce Level 1-3 structural numbering based on SUMMARY.md hierarchy
- Remove structural numbering from Level 4+ headings
- Eliminate single child headings by converting to bold text
- Auto-inject summary text for headings with multiple children missing intro text
- Exclude Appendix chapters from structural numbering
- Avoid modifying code block contents
- Add script to detect non-standard English usage in headings
This commit is contained in:
Baohua Yang
2026-02-21 22:40:33 -08:00
parent 33af380be1
commit 175aaae48a
136 changed files with 1015 additions and 766 deletions

250
format_headings.py Normal file
View File

@@ -0,0 +1,250 @@
import os
import re
# Upper-cased English words and abbreviations that are accepted in heading
# titles; check_english() upper-cases each word before looking it up here.
ENG_ALLOWLIST = set(
    """
    DOCKER KUBERNETES XML LLM RAG LINUX UBUNTU MAC MACOS
    WINDOWS API JSON YAML REGISTRY HUB REPOSITORY TAG IMAGE
    CONTAINER DEBIAN FEDORA CENTOS RASPBERRY PI PULL LIST
    RM COMMIT BUILD RUN DAEMON STOP NEXUS VOLUMES TMPFS
    DNS PORT BUILDX BUILDKIT COMPOSE DJANGO RAILS WORDPRESS
    LNMP NAMESPACE CGROUPS UFS PODMAN PROMETHEUS ELK BUSYBOX
    ALPINE DEVOPS ACTIONS DRONE IDE VS CODE NGINX PHP
    NODE.JS MYSQL MONGODB REDIS MINIO DOCKERD TENCENTCLOUD
    ALICLOUD AWS COREOS KUBEADM CONTAINERD DESKTOP KIND K3S
    SYSTEMD DASHBOARD KUBECTL ETCD ETCDCTL VM VAGRANT LXC
    GITHUB GOOGLE CLOUD NPM MAVEN ACR TCR ECR HARBOR
    CNCF SIGSTORE NOTATION SCOUT TRIVY CMD ENTRYPOINT ENV ARG
    VOLUME EXPOSE WORKDIR USER HEALTHCHECK ONBUILD LABEL SHELL
    COPY ADD DOCKERFILE CI CD OS
    """.split()
)
def parse_summary(summary_path='SUMMARY.md'):
    """Map every file linked from the summary to its numbering context.

    The summary is scanned line by line for three kinds of list entries:

    * top-level entries whose link text starts with a Chinese numeral
      (numbered chapters),
    * top-level entries whose link text starts with ``附录`` (appendix
      chapters),
    * indented entries (sections of the most recent chapter).

    Args:
        summary_path: Path of the summary file. Defaults to ``SUMMARY.md``
            in the current working directory.

    Returns:
        A dict keyed by the link target exactly as written in the summary.
        Each value is a dict with ``level`` (1 for chapters, 2 for sections),
        ``title`` and ``is_app``; numbered chapters also carry ``chap_num``
        and numbered sections carry ``chap_num`` and ``sec_num``. An empty
        dict is returned when the summary file does not exist.
    """
    try:
        with open(summary_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        return {}

    chapter_re = re.compile(r'^\* \[([一二三四五六七八九十百]+[^\]]*)\]\((.*?)\)')
    section_re = re.compile(r'^\s+\* \[(.*?)\]\((.*?)\)')
    appendix_re = re.compile(r'^\* \[(附录[^\]]*)\]\((.*?)\)')

    file_to_context = {}
    chapter_idx = 0
    section_idx = 0
    # Sticky once an appendix part heading ("## ...附录...") has been seen.
    is_appendix = False
    # True while the most recent top-level entry is an appendix chapter, so
    # its nested sections do not inherit the previous chapter's numbers.
    under_appendix_chapter = False

    for line in content.split('\n'):
        if '## 附录' in line or ('附录' in line and line.startswith('## ')):
            is_appendix = True

        m_chap = chapter_re.match(line)
        if m_chap:
            # Only the first space of the link text is removed.
            # NOTE(review): a follow-up `if '' not in title:` branch that
            # replaced '章' was dropped because an empty string is contained
            # in every string, so it could never run. If a specific
            # character was meant to be tested there, restore it.
            title = m_chap.group(1).replace(' ', '', 1)
            chapter_idx += 1
            section_idx = 0
            under_appendix_chapter = False
            file_to_context[m_chap.group(2)] = {
                'level': 1,
                'title': title,
                'chap_num': chapter_idx,
                'is_app': False,
            }
            continue

        m_sec = section_re.match(line)
        if m_sec:
            title = m_sec.group(1)
            filepath = m_sec.group(2)
            section_idx += 1
            if is_appendix or under_appendix_chapter or 'appendix' in filepath:
                file_to_context[filepath] = {
                    'level': 2,
                    'title': title,
                    'is_app': True,
                }
            else:
                file_to_context[filepath] = {
                    'level': 2,
                    'title': title,
                    'chap_num': chapter_idx,
                    'sec_num': section_idx,
                    'is_app': False,
                }
            continue

        m_app = appendix_re.match(line)
        if m_app:
            under_appendix_chapter = True
            file_to_context[m_app.group(2)] = {
                'level': 1,
                'title': m_app.group(1),
                'is_app': True,
            }

    return file_to_context
def check_english(title, allowlist=None):
    """Print a notice for each English word in *title* that is not allowlisted.

    A word starts with an ASCII letter and may continue with letters, digits
    and dots, so names such as ``Node.js`` and ``k3s`` are looked up as a
    whole. Numbering prefixes such as ``1.1`` are not words, and a trailing
    sentence period is ignored. Lookup is case-insensitive.

    Args:
        title: Heading text to inspect.
        allowlist: Optional collection of upper-cased accepted words.
            Defaults to the module-level ``ENG_ALLOWLIST``.

    Returns:
        The list of reported words, in order of appearance.
    """
    if allowlist is None:
        allowlist = ENG_ALLOWLIST

    flagged = []
    for token in re.findall(r'[A-Za-z][A-Za-z0-9.]*', title):
        # "Docker." at the end of a sentence should still match "DOCKER".
        word = token.rstrip('.')
        key = word.upper()
        if key == 'DOCKER' or key in allowlist:
            continue
        print(f" [!] Notice: English word '{word}' in title: {title}")
        flagged.append(word)
    return flagged
def _scan_headings(lines):
    """Collect a record for every ATX heading that is outside a ``` fence."""
    headings = []
    in_code_block = False
    for idx, line in enumerate(lines):
        if line.strip().startswith('```'):
            in_code_block = not in_code_block
        if in_code_block:
            continue
        match = re.match(r'^(#{1,6})\s+(.*)', line)
        if match:
            headings.append({
                'level': len(match.group(1)),
                'title': match.group(2).strip(),
                'line_idx': idx,
                'children': [],
            })
    return headings


def _link_children(headings):
    """Fill each heading's ``children`` with indices of its direct sub-headings."""
    for i, parent in enumerate(headings):
        for j in range(i + 1, len(headings)):
            child_level = headings[j]['level']
            if child_level <= parent['level']:
                break
            if child_level == parent['level'] + 1:
                parent['children'].append(j)


def _strip_number_prefix(title):
    """Remove a leading run of digits and dots (plus following spaces)."""
    return re.sub(r'^[\d\.]+\s*', '', title)


def _normalize_appendix_title(title):
    """Drop numbering and the colon after an ``附录N`` label in an H1 title."""
    title = _strip_number_prefix(title)
    m = re.match(r'^(附录[一二三四五六七八九十]*)\s*(.*)', title)
    if not m:
        return title
    label = m.group(1)
    rest = m.group(2).strip()
    # Only strip a real separator. Testing startswith('') is always true and
    # would delete the first character of every title.
    if rest.startswith((':', ':')):
        rest = rest[1:].strip()
    # NOTE(review): label and rest are joined with no separator, so a colon
    # that was present between them is dropped -- confirm this is intended.
    return f"{label}{rest}" if rest else label


def _has_text_between(lines, start_idx, end_idx):
    """Return True if a non-blank, non-'#' line lies strictly between the indices."""
    for idx in range(start_idx + 1, end_idx):
        content = lines[idx].strip()
        if content and not content.startswith('#'):
            return True
    return False


def process_file(filepath, context):
    """Normalize the headings of one Markdown file in place.

    Headings inside ``` fenced code blocks are ignored. For the remaining
    headings:

    * Level 1: in appendix files, leading numbering and the colon after the
      ``附录N`` label are removed; otherwise the title is left alone.
    * Level 2: leading numbering is removed; non-appendix files with a
      section number get ``<chap>.<sec> `` prepended.
    * Level 3: appendix files lose leading numbering; non-appendix files
      with a section number get ``<chap>.<sec>.<n> `` where ``n`` restarts
      after every level-2 heading. Non-appendix files without a section
      number are left untouched.
    * Level 4+: a leading multi-part number such as ``1.2.3`` is removed; a
      single number such as ``1.`` is kept.
    * A heading that is the only direct child of its parent is replaced by
      bold text carrying its (possibly renumbered) title.
    * A level 1-3 heading with two or more direct children and no text
      before the first child gets a fixed introductory sentence appended.

    Args:
        filepath: Path of the Markdown file to rewrite.
        context: Dict with optional keys ``is_app`` (bool), ``chap_num`` and
            ``sec_num`` (ints); missing keys default to False / 0.

    Returns:
        True if the file was rewritten, False if nothing needed changing or
        the file could not be read (a message is printed in that case).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError for files that are not UTF-8.
        print(f"Error reading {filepath}: {e}")
        return False

    headings = _scan_headings(lines)
    _link_children(headings)

    is_app = context.get('is_app', False)
    chap_num = context.get('chap_num', 0)
    sec_num = context.get('sec_num', 0)
    h2_counter = sec_num if sec_num > 0 else 0
    h3_counter = 0

    actions = {}     # line index -> replacement text for that line
    demoted = set()  # heading indices that must become bold text

    for i, h in enumerate(headings):
        level = h['level']
        ln = h['line_idx']
        original_title = h['title']
        title = original_title
        check_english(title)

        if level == 1:
            if is_app:
                title = _normalize_appendix_title(title)
        elif level == 2:
            title = _strip_number_prefix(title)
            if not is_app and h2_counter > 0:
                title = f"{chap_num}.{h2_counter} {title}"
            h3_counter = 0
        elif level == 3:
            h3_counter += 1
            if is_app:
                title = _strip_number_prefix(title)
            elif h2_counter > 0:
                title = (
                    f"{chap_num}.{h2_counter}.{h3_counter} "
                    f"{_strip_number_prefix(title)}"
                )
        else:
            m = re.match(r'^([\d\.]+)\s+(.*)', title)
            # Require an inner dot so ordered steps like "1. Foo" survive.
            if m and '.' in m.group(1).strip('.'):
                title = m.group(2)

        h['title'] = title
        if i in demoted:
            # Parents always precede their children, so the demotion is
            # applied here, after renumbering, instead of being overwritten
            # by a later heading rewrite of the same line.
            actions[ln] = f"**{title}**\n\n"
        elif title != original_title:
            actions[ln] = f"{'#' * level} {title}\n"

        children = h['children']
        if len(children) == 1:
            demoted.add(children[0])
        elif len(children) >= 2 and level < 4:
            first_child_ln = headings[children[0]]['line_idx']
            if not _has_text_between(lines, ln, first_child_ln):
                current = actions.get(ln, lines[ln])
                actions[ln] = current.rstrip() + "\n\n涵盖了如下重点内容\n\n"

    if not actions:
        return False

    new_lines = [actions.get(idx, line) for idx, line in enumerate(lines)]
    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(new_lines)
    return True
def main():
    """Reformat every Markdown file reachable from the current directory.

    Files listed in SUMMARY.md are processed first with their numbering
    context. Every other ``.md`` file found under the current directory is
    then processed as un-numbered content (``is_app`` set), except for a few
    well-known top-level document names. A summary count is printed at the
    end.
    """
    file_contexts = parse_summary()
    modified = 0

    for filepath, context in file_contexts.items():
        if os.path.exists(filepath) and process_file(filepath, context):
            modified += 1
            print(f" -> MODIFIED: {filepath}")

    # Substring match on each directory name, which also skips e.g. ".github".
    skip_dir_markers = ('.git', 'node_modules', '.gemini')
    skip_files = {'SUMMARY.md', 'README.md', 'CONTRIBUTING.md', 'CHANGELOG.md'}

    for root, dirs, files in os.walk('.'):
        # Prune in place so os.walk never descends into skipped trees.
        dirs[:] = [
            d for d in dirs
            if not any(marker in d for marker in skip_dir_markers)
        ]
        for name in files:
            if not name.endswith('.md') or name in skip_files:
                continue
            # normpath removes only the leading "./"; a blanket
            # replace('./', '') would also corrupt paths that go through a
            # directory whose name ends with a dot. Forward slashes keep the
            # path comparable with the link targets used in SUMMARY.md.
            clean_path = os.path.normpath(os.path.join(root, name)).replace(os.sep, '/')
            if clean_path in file_contexts:
                continue
            if process_file(clean_path, {'is_app': True}):
                modified += 1
                print(f" -> MODIFIED: {clean_path}")

    print(f"\nTotal Modified {modified} files")


if __name__ == "__main__":
    main()