66 lines
1.9 KiB
Python
66 lines
1.9 KiB
Python
import requests
|
||
import html2text
|
||
import os
|
||
|
||
|
||
def extract_entries(entries, result_list):
    """
    Recursively collect the entries that have no child nodes.

    Every leaf entry is appended to ``result_list`` as a dict holding its
    title, its scm value and its URL prefixed with the help-site origin.
    Fields missing from an entry fall back to an empty string.

    :param entries: list of entries at the current nesting level
    :param result_list: list that receives the extracted leaf records
    """
    site_root = "https://help.aliyun.com"
    for node in entries:
        sub_nodes = node.get("children")
        if sub_nodes:
            # Not a leaf: descend into the children and move on.
            extract_entries(sub_nodes, result_list)
            continue

        # Leaf (no "children" key, or an empty/None value): record it.
        leaf = {
            "title": node.get("title", ""),
            "scm": node.get("scm", ""),
            "url": site_root + node.get("url", ""),
        }
        result_list.append(leaf)
|
||
|
||
|
||
def main():
    """
    Download every leaf page of the Hologres product-overview help menu.

    Fetches the menu JSON, flattens it with ``extract_entries`` and, for each
    leaf, prints its URL and 1-based position and passes both to ``md``.
    A page whose download fails is reported and skipped so that the remaining
    pages are still processed.

    :raises requests.RequestException: if the menu itself cannot be fetched
    """
    url = "https://help.aliyun.com/help/json/menupath.json?alias=%2Fhologres%2Fproduct-overview%2F&website=cn&language=zh"
    # Without a timeout a stalled connection would block the script forever;
    # raise_for_status() surfaces an HTTP error before JSON parsing is tried.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()

    result = []
    extract_entries(data['data']['children'], result)  # recurse from the top-level children

    # Print the results and convert each page; the position doubles as file name.
    for i, item in enumerate(result, start=1):
        print(item['url'])
        print(i)
        try:
            md(item['url'], i)
        except requests.RequestException as exc:
            # One unreachable page should not abort the rest of the crawl.
            print(f"failed to fetch {item['url']}: {exc}")
|
||
|
||
|
||
def md(url, title):
    """
    Download a page, convert its HTML to Markdown and save it under ./md.

    The converted text is printed to stdout and written to
    ``./md/<title>.md`` (UTF-8); the ``./md`` directory is created on demand.

    :param url: address of the page to download
    :param title: value used as the output file name (without extension)
    :raises requests.RequestException: on connection failure, timeout or an
        HTTP error status
    """
    # A timeout prevents an unresponsive server from hanging the run, and
    # raise_for_status() stops an HTTP error page from being saved as content.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html_content = response.content.decode('utf-8')  # decode the body to str

    # Set up the html2text converter; links are kept in the output.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    # NOTE(review): intended to request ATX-style ("# Heading") titles, but
    # heading_style does not appear to be a documented html2text option —
    # confirm whether this assignment has any effect.
    converter.heading_style = "ATX"

    markdown_text = converter.handle(html_content)
    print(markdown_text)

    # Make sure the output directory exists.
    os.makedirs("./md", exist_ok=True)

    # Save the Markdown file.
    with open(f"./md/{title}.md", "w", encoding="utf-8") as f:
        f.write(markdown_text)
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: crawl the whole menu. To convert a single page
    # while debugging, call md(<page url>, <file name>) here instead.
    main()
|