Markdown文件拆分
将一个 Markdown 文件根据标题结构拆分成多个文件,可以使用以下 Python 脚本来实现。这个脚本支持两种模式:一种是将每个标题对应的内容拆分成单独的文件,另一种是根据标题层级创建目录结构。
import os
import re
import argparse
from collections import defaultdict
def clean_title(title: str) -> str:
    """Turn a Markdown heading into text that is safe to use as a file name.

    Inline emphasis/code markers are dropped, images and links are reduced
    to their visible text, and characters that are not allowed in file
    names (slashes, colon, double quote, ``*``, ``?``, ``<``, ``>``, ``|``)
    are removed.

    Args:
        title: Heading text without the leading ``#`` markers.

    Returns:
        The cleaned title with surrounding whitespace stripped, or
        ``"untitled"`` when nothing is left after cleaning.
    """
    # Drop emphasis, inline-code and strikethrough markers.
    title = re.sub(r"[*_`~]", "", title)
    # Images must be reduced before links: the link pattern also matches the
    # "[alt](url)" tail of "![alt](url)" and would leave a stray "!" behind.
    title = re.sub(r"!\[(.*?)\]\(.*?\)", r"\1", title)
    # Replace links by their link text.
    title = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", title)
    # Remove characters that are illegal in file names.
    title = re.sub(r'[\\/:"*?<>|]+', "", title)
    # An empty result would otherwise become a bare ".md" file name.
    return title.strip() or "untitled"
def parse_markdown(file_path):
    """Read a Markdown file and locate its ATX (``#``-style) headings.

    Lines inside fenced code blocks (delimited by three or more backticks
    or tildes) are skipped, so ``#`` comments in code samples are not
    mistaken for headings. Only one to six ``#`` characters followed by
    whitespace count as a heading marker.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file.

    Returns:
        A tuple ``(lines, headers)``. ``lines`` is the file content as a
        list of lines with line endings kept. ``headers`` is a list of
        dicts with the keys ``"level"`` (number of ``#``), ``"title"``
        (stripped heading text) and ``"line"`` (0-based index into
        ``lines``).
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    header_re = re.compile(r"^(#{1,6})\s+(.*)")
    fence_re = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)")

    headers = []
    open_fence = None  # marker string of the fence we are inside, if any
    for i, line in enumerate(lines):
        fence = fence_re.match(line)
        if fence:
            marker, rest = fence.groups()
            if open_fence is None:
                # A backtick fence may not carry backticks in its info
                # string; such a line is inline code, not a fence opener.
                if not (marker[0] == "`" and "`" in rest):
                    open_fence = marker
                    continue
            elif (
                marker[0] == open_fence[0]
                and len(marker) >= len(open_fence)
                and not rest.strip()
            ):
                # Closing fence: same character, at least as long, and
                # nothing but whitespace after it.
                open_fence = None
                continue
        if open_fence is not None:
            continue

        m = header_re.match(line)
        if m:
            headers.append({
                "level": len(m.group(1)),
                "title": m.group(2).strip(),
                "line": i,
            })
    return lines, headers
def find_base_level(headers):
    """Return the smallest ``"level"`` value among *headers*.

    This is the level of the document's top-most heading. Raises
    ``ValueError`` when *headers* is empty.
    """
    levels = [entry["level"] for entry in headers]
    return min(levels)
def ensure_unique(name, counter):
    """Return a variant of *name* not yet handed out from *counter*.

    The first request for a name returns it unchanged; later requests
    return ``name(1)``, ``name(2)``, ... Generated names are recorded as
    used too, so they cannot collide with a sibling whose literal title
    already is e.g. ``name(1)``.

    Args:
        name: Desired name.
        counter: ``defaultdict(int)`` tracking the names used in one
            directory; it is updated in place.

    Returns:
        A name that has not been returned before for this *counter*.
    """
    if counter[name] == 0:
        counter[name] += 1
        return name
    while True:
        candidate = f"{name}({counter[name]})"
        counter[name] += 1
        # Skip suffixes already taken by another title.
        if counter[candidate] == 0:
            counter[candidate] += 1
            return candidate
def split_markdown(lines, headers, base_level, max_level, min_level):
    """Cut the document into one section per heading in the selected level range.

    A heading's relative level is ``level - base_level + 1``. Headings whose
    relative level lies outside ``max_level..min_level`` (inclusive) are
    skipped. Each selected section runs from its heading line up to the
    next heading of the same or a shallower level, or to the end of the
    document.

    Returns:
        A list of dicts with the keys ``"title"``, ``"level"`` (relative
        level) and ``"content"`` (the section's lines joined together).
    """
    total = len(lines)
    result = []
    for idx, header in enumerate(headers):
        depth = header["level"] - base_level + 1
        if not (max_level <= depth <= min_level):
            continue
        # First later heading that closes this section, if there is one.
        stop = next(
            (
                later["line"]
                for later in headers[idx + 1:]
                if later["level"] <= header["level"]
            ),
            total,
        )
        body = "".join(lines[header["line"]:stop])
        result.append({"title": header["title"], "level": depth, "content": body})
    return result
def write_file_mode(sections, output_dir):
    """Write each section to its own ``<title>.md`` file directly in *output_dir*.

    The directory is created if needed. Titles are cleaned for use as file
    names, and repeated titles are disambiguated by ``ensure_unique``.
    """
    os.makedirs(output_dir, exist_ok=True)
    seen = defaultdict(int)
    for section in sections:
        unique = ensure_unique(clean_title(section["title"]), seen)
        target = os.path.join(output_dir, f"{unique}.md")
        with open(target, "w", encoding="utf-8") as out:
            out.write(section["content"])
def write_tree_mode(sections, output_dir, max_level):
    """Write sections into a directory tree that mirrors the heading hierarchy.

    A section at ``max_level`` gets a directory named after it and is
    written to ``<dir>/<title>.md``. A deeper section is written as
    ``<title>.md`` inside the directory path formed by the names of its
    ancestor sections.

    Args:
        sections: Dicts with ``"title"``, ``"level"`` (relative level) and
            ``"content"`` keys, in document order.
        output_dir: Root directory of the generated tree.
        max_level: Relative level of the top-most sections being split.
    """
    # Ancestors of the current section as (level, name) pairs.
    stack = []
    # One name counter per directory, keyed by the "/"-joined directory path.
    counters = defaultdict(lambda: defaultdict(int))
    for s in sections:
        level = s["level"]
        # Unwind by the recorded level, not by stack depth: depth and level
        # differ when max_level > 1 or when a heading level is skipped, and
        # comparing them would nest siblings inside each other.
        while stack and stack[-1][0] >= level:
            stack.pop()
        parents = [name for _, name in stack]
        title = ensure_unique(clean_title(s["title"]), counters["/".join(parents)])
        stack.append((level, title))
        if level == max_level:
            dir_parts = parents + [title]
            # The file lives inside its own directory; reserve its name
            # there so a child with the same title cannot overwrite it.
            counters["/".join(dir_parts)][title] += 1
        else:
            dir_parts = parents
        dir_path = os.path.join(output_dir, *dir_parts)
        os.makedirs(dir_path, exist_ok=True)
        path = os.path.join(dir_path, f"{title}.md")
        with open(path, "w", encoding="utf-8") as f:
            f.write(s["content"])
def main():
    """Command-line entry point: parse arguments, split the input, write the result.

    Arguments:
        input_file: Markdown file to split.
        --output: Destination directory (default ``output``).
        --mode: ``file`` writes one flat file per section at ``--max_level``;
            ``tree`` writes a directory hierarchy covering ``--max_level``
            through ``--min_level``.
        --max_level: Shallowest heading level to split at, counted relative
            to the shallowest heading in the document (default 1).
        --min_level: Deepest relative level to split at (tree mode only);
            when omitted or 0, the deepest level present is used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file")
    parser.add_argument("--output", default="output")
    parser.add_argument("--mode", choices=["file", "tree"], default="file")
    parser.add_argument("--max_level", type=int, default=1)
    parser.add_argument("--min_level", type=int)
    args = parser.parse_args()

    # Relative levels start at 1, so smaller values can never match a heading.
    if args.max_level < 1:
        parser.error("--max_level must be >= 1")
    if args.mode == "tree" and args.min_level and args.min_level < args.max_level:
        parser.error("--min_level must be >= --max_level")

    lines, headers = parse_markdown(args.input_file)
    if not headers:
        print("No headers found.")
        return

    base_level = find_base_level(headers)
    if args.mode == "file":
        # File mode splits at exactly one level.
        min_level = args.max_level
    elif args.min_level:
        min_level = args.min_level
    else:
        # Default to the deepest relative level present in the document.
        min_level = max(h["level"] - base_level + 1 for h in headers)

    sections = split_markdown(
        lines,
        headers,
        base_level,
        args.max_level,
        min_level,
    )
    if not sections:
        # Report the empty result instead of finishing silently.
        print("No sections matched the requested heading levels.")
        return

    if args.mode == "file":
        write_file_mode(sections, args.output)
    else:
        write_tree_mode(sections, args.output, args.max_level)


if __name__ == "__main__":
    main()