# 2025-07-25 16:59:34 +08:00
#
# 66 lines
# 1.9 KiB
# Python
# Raw Permalink Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import html2text
import os
def extract_entries(entries, result_list):
    """Recursively collect the leaf entries of a nested menu tree.

    An entry counts as a leaf when it has no ``children`` key or when its
    ``children`` value is empty. For every leaf, a dict with the keys
    ``title``, ``scm`` and ``url`` is appended to ``result_list``; ``url``
    is the entry's own ``url`` value prefixed with
    ``https://help.aliyun.com``. Leaves are appended in depth-first order,
    following the order of the input lists.

    :param entries: entries (dicts) of the current tree level; ``None`` or
        an empty list means there is nothing to collect
    :param result_list: list that receives the leaf records (mutated in
        place; nothing is returned)
    """
    for entry in entries or ():
        children = entry.get("children")
        if children:
            # Branch node: only its descendants can be leaves.
            extract_entries(children, result_list)
            continue
        result_list.append({
            "title": entry.get("title", ""),
            "scm": entry.get("scm", ""),
            # ``or ""`` also covers an explicit null url, which would
            # otherwise make the concatenation raise TypeError.
            "url": "https://help.aliyun.com" + (entry.get("url") or ""),
        })
def main():
    """Download every leaf page of the Hologres help menu as Markdown.

    Fetches the menu tree JSON, flattens it to its leaf entries with
    ``extract_entries`` and passes each leaf URL to ``md`` together with a
    1-based running number. A page whose download fails with a
    ``requests`` error is reported on stdout and skipped, so the remaining
    pages are still processed.

    :raises requests.RequestException: if the menu request itself fails,
        times out or returns an HTTP error status
    :raises KeyError: if the menu JSON lacks ``data`` / ``children``
    """
    url = "https://help.aliyun.com/help/json/menupath.json?alias=%2Fhologres%2Fproduct-overview%2F&website=cn&language=zh"
    # requests applies no timeout by default; without one a stalled server
    # would hang the script forever.
    response = requests.get(url, timeout=30)
    # Stop early on an HTTP error instead of trying to parse an error body.
    response.raise_for_status()
    data = response.json()

    result = []
    extract_entries(data['data']['children'], result)  # recurse from the top-level children

    # Print progress and convert each page; the counter doubles as the
    # title argument handed to md().
    for i, item in enumerate(result, start=1):
        print(item['url'])
        print(i)
        try:
            md(item['url'], i)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole crawl.
            print(f"skipped {item['url']}: {exc}")
def md(url, title):
    """Fetch an HTML page, convert it to Markdown and save it under ``./md``.

    The converted text is printed to stdout and written, UTF-8 encoded, to
    ``./md/<title>.md``; the ``./md`` directory is created if it does not
    exist yet.

    :param url: address of the HTML page to download
    :param title: base name (without extension) of the output file
    :raises requests.RequestException: on network failure, timeout or an
        HTTP error status
    :raises UnicodeDecodeError: if the response body is not valid UTF-8
    """
    # requests applies no timeout by default; bound the wait explicitly.
    response = requests.get(url, timeout=30)
    # Fail loudly rather than saving an HTTP error page as documentation.
    response.raise_for_status()
    # Decode the raw bytes explicitly as UTF-8.
    html_content = response.content.decode('utf-8')

    # Set up the html2text converter; links are kept in the output.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    # Intended to request ATX-style headings ("# Heading").
    # NOTE(review): html2text does not appear to document a heading_style
    # option, so this assignment is presumably a no-op — confirm.
    converter.heading_style = "ATX"
    markdown_text = converter.handle(html_content)
    print(markdown_text)

    # Make sure the output directory exists.
    os.makedirs("./md", exist_ok=True)
    # Save the Markdown file.
    with open(f"./md/{title}.md", "w", encoding="utf-8") as f:
        f.write(markdown_text)
if __name__ == "__main__":
    # Script entry point: crawl the whole help menu.
    # To debug a single page, call md() directly instead of main(), e.g.
    #   md('https://help.aliyun.com/zh/hologres/product-overview/what-is-hologres', 1)
    main()