import requests
import html2text
import os
def extract_entries(entries, result_list):
    """
    Collect every childless (leaf) entry of a nested menu tree, in document order.

    An entry counts as a leaf when it has no "children" key or when that key
    holds an empty/falsy value. For each leaf a dict with "title", "scm" and
    an absolute "url" is appended to ``result_list``; missing fields default
    to the empty string.

    :param entries: list of entry dicts at the current level
    :param result_list: list that receives the extracted leaf records (mutated in place)
    """
    # One iterator per tree level currently being walked; the innermost level
    # sits at the end of the list.
    levels = [iter(entries)]
    while levels:
        for node in levels[-1]:
            children = node["children"] if "children" in node else None
            if children:
                # Descend into the subtree; the parent iterator is resumed
                # once this level is exhausted.
                levels.append(iter(children))
                break
            # Leaf: keep the target fields and prefix the site host to the path.
            record = {
                "title": node.get("title", ""),
                "scm": node.get("scm", ""),
                "url": "https://help.aliyun.com" + node.get("url", ""),
            }
            result_list.append(record)
        else:
            # Current level fully consumed; return to its parent.
            levels.pop()
def main():
    """
    Crawl the Hologres product-overview menu and save every leaf page as Markdown.

    Downloads the menu tree JSON from help.aliyun.com, flattens it to its leaf
    entries with ``extract_entries``, then hands each leaf URL to ``md``
    together with a 1-based sequence number (``md`` uses that number as the
    output file name).

    :raises requests.HTTPError: if the menu request returns a 4xx/5xx status
    :raises requests.Timeout: if the menu request exceeds the timeout
    """
    url = "https://help.aliyun.com/help/json/menupath.json?alias=%2Fhologres%2Fproduct-overview%2F&website=cn&language=zh"
    # requests waits forever by default; bound the wait so a stalled
    # connection cannot hang the whole crawl.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error status instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    result = []
    extract_entries(data['data']['children'], result)  # walk the tree from the top-level children
    # Print progress and convert each page; the counter doubles as the file name.
    for i, item in enumerate(result, start=1):
        print(item['url'])
        print(i)
        md(item['url'], i)
def md(url, title, timeout=30):
    """
    Download a page, convert its HTML to Markdown, print it, and save it under ./md.

    :param url: address of the HTML page to fetch
    :param title: base name of the output file; the result is written to ./md/<title>.md
    :param timeout: seconds to wait for the server before giving up
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.Timeout: if the request exceeds ``timeout``
    :raises UnicodeDecodeError: if the response body is not valid UTF-8
    """
    # requests has no default timeout; without one a stalled server blocks forever.
    response = requests.get(url, timeout=timeout)
    # Abort on an error status so an error page is never saved as documentation.
    response.raise_for_status()
    html_content = response.content.decode('utf-8')  # decode the raw bytes to str
    # Set up the html2text converter and keep hyperlinks in the output.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    # Intended to request ATX-style headings (i.e. "# Heading").
    # NOTE(review): confirm html2text actually reads this attribute.
    converter.heading_style = "ATX"
    markdown_text = converter.handle(html_content)
    print(markdown_text)
    # Make sure the output directory exists.
    os.makedirs("./md", exist_ok=True)
    # Save the Markdown file.
    with open(f"./md/{title}.md", "w", encoding="utf-8") as f:
        f.write(markdown_text)
if __name__ == "__main__":
    # Run the full crawl when executed as a script.
    # To convert a single page while debugging, call md() directly instead, e.g.:
    # md('https://help.aliyun.com/zh/hologres/product-overview/what-is-hologres', 1)
    main()