66 lines
1.9 KiB
Python
66 lines
1.9 KiB
Python
|
import requests
|
|||
|
import html2text
|
|||
|
import os
|
|||
|
|
|||
|
|
|||
|
def extract_entries(entries, result_list=None, base_url="https://help.aliyun.com"):
    """
    Recursively collect the leaf entries (those without children) of a nested menu.

    Each leaf is appended to ``result_list`` as a dict with the keys
    ``title``, ``scm`` and ``url``; ``url`` is the entry's ``url`` value
    prefixed with ``base_url``.

    :param entries: entries of the current level; each entry is a dict that
        may carry a ``children`` list. ``None`` is treated as an empty level.
    :param result_list: list that receives the leaf records (mutated in
        place). A new list is created when omitted.
    :param base_url: prefix prepended to every entry's ``url`` value.
    :return: ``result_list`` (the same object that was passed in, if any).
    """
    if result_list is None:
        result_list = []

    for entry in entries or ():
        children = entry.get("children")
        if children:
            # Not a leaf: descend into the child nodes.
            extract_entries(children, result_list, base_url)
            continue

        # Leaf node (no "children" key, or an empty/None value).
        result_list.append({
            "title": entry.get("title", ""),
            "scm": entry.get("scm", ""),
            # `or ""` also covers an explicit null url, which would
            # otherwise make the string concatenation raise TypeError.
            "url": base_url + (entry.get("url") or ""),
        })

    return result_list
|
|||
|
|
|||
|
|
|||
|
def main():
    """
    Fetch the menu JSON for the ``/hologres/product-overview/`` alias from
    help.aliyun.com, collect every leaf entry, and save each page as Markdown.

    Pages are numbered from 1 in menu order; the number is passed to ``md``
    and used as the output file name.

    :raises requests.RequestException: if the menu request fails, times out,
        or returns an HTTP error status.
    """
    url = "https://help.aliyun.com/help/json/menupath.json?alias=%2Fhologres%2Fproduct-overview%2F&website=cn&language=zh"
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail early on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()

    result = []
    # Start the recursion at the top-level "children" list.
    extract_entries(data['data']['children'], result)

    # Print progress and convert every collected page.
    for i, item in enumerate(result, start=1):
        print(item['url'])
        print(i)
        md(item['url'], i)
|
|||
|
|
|||
|
|
|||
|
def md(url, title, timeout=30):
    """
    Download a page, convert its HTML to Markdown, print it and save it.

    The Markdown is written to ``./md/<title>.md`` (UTF-8); the ``./md``
    directory is created if it does not exist.

    :param url: address of the HTML page to download.
    :param title: value used as the output file name (without extension).
    :param timeout: seconds to wait for the server before giving up.
    :raises requests.RequestException: if the request fails, times out, or
        returns an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Do not convert and save a 404/500 error page as if it were the document.
    response.raise_for_status()
    html_content = response.content.decode('utf-8')  # decode bytes to str

    # Set up the html2text converter; links are kept in the output.
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    # Intended to select ATX-style headings ("# Heading").
    # NOTE(review): heading_style does not appear to be an html2text option,
    # so this assignment may have no effect - confirm against the library.
    converter.heading_style = "ATX"

    markdown_text = converter.handle(html_content)
    print(markdown_text)

    # Make sure the output directory exists.
    os.makedirs("./md", exist_ok=True)

    # Save the Markdown file.
    with open(f"./md/{title}.md", "w", encoding="utf-8") as f:
        f.write(markdown_text)
|
|||
|
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Script entry point: crawl the whole menu.
    # To convert a single page while debugging, call md() directly instead, e.g.
    #   md('https://help.aliyun.com/zh/hologres/product-overview/what-is-hologres', 1)
    main()
|