-------- <--
requests==2.26.0 ------- <--
urllib3==1.26.7
"""
def crawl(url):
    """Download an article page and save it as HTML, plain text and Markdown.

    Fetches ``url`` with a browser-like ``User-Agent``, extracts the title
    (text of the element with id ``articleContentId``) and the body (element
    with id ``content_views``), then hands them to :func:`save`.

    Progress is reported with ``print``. When the response status is not 200,
    or the page does not contain the expected elements, a failure message is
    printed and nothing is saved.

    Args:
        url: Address of the article page to download.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    print("crawl...")
    # Send a browser-like header to get past the site's anti-crawler check.
    # The timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=30)
    # Only continue on HTTP 200.
    if response.status_code != 200:
        print("failed!")
        return

    page = response.content.decode("utf8")
    tree = etree.HTML(page)
    print("look for text...")
    # etree.HTML returns None when there is nothing to parse.
    if tree is None:
        print("failed: empty or unparsable page")
        return

    # Locate the HTML fragments we need.
    titles = tree.xpath('//*[@id="articleContentId"]/text()')
    blocks = tree.xpath('//*[@id="content_views"]')
    if not titles or not blocks:
        # Indexing [0] on an empty result would raise IndexError.
        print("failed: title or content block not found")
        return

    title = titles[0].strip()
    body = blocks[0]
    # Article body as HTML, with entities unescaped.
    ohtml = unescape(etree.tostring(body).decode("utf8"))
    # Article body as plain text.
    text = body.xpath('string(.)').strip()
    print("title:", title)
    # The title must be passed explicitly: save() cannot see crawl()'s locals.
    save(ohtml, text, title)
    print("finish!")


def _safe_filename(title):
    """Return ``title`` with characters that are invalid in file names replaced."""
    forbidden = '\\/:*?"<>|'
    cleaned = "".join(
        "_" if ch in forbidden or ord(ch) < 32 else ch for ch in str(title)
    ).strip(" .")
    # Never return an empty name, which would produce files like ".html".
    return cleaned or "untitled"


def save(html, text, title="untitled"):
    """Write the article to ``output/`` as HTML, plain text and Markdown.

    Creates ``output/html``, ``output/text`` and ``output/markdown`` when
    missing, then writes ``<title>.html``, ``<title>.txt`` and ``<title>.md``
    into them, all encoded as UTF-8.

    Args:
        html: Article body as an HTML string.
        text: Article body as plain text.
        title: Base name for the output files. Characters that are not
            allowed in file names are replaced with ``_``.
    """
    # exist_ok=True also repairs a partially created output tree.
    for sub in ("html", "text", "markdown"):
        os.makedirs(os.path.join("output", sub), exist_ok=True)

    name = _safe_filename(title)

    with open(f"output/html/{name}.html", 'w', encoding='utf8') as html_file:
        print("write html...")
        html_file.write(html)

    with open(f"output/text/{name}.txt", 'w', encoding='utf8') as txt_file:
        print("write text...")
        txt_file.write(text)

    # Convert before opening the file so a conversion error does not leave
    # an empty .md file behind.
    md_text = HTML2Text().handle(html)
    with open(f"output/markdown/{name}.md", 'w', encoding='utf8') as md_file:
        print("write markdown...")
        md_file.write(md_text)
if __name__ == "__main__":
    # URL of the article you want to crawl; replace the placeholder below.
    url = "url"
    crawl(url)
参考资料
[Xpath如何提取一个标签里的所有文本?_对明天的期待丶的博客-CSDN博客](https://blog.csdn.net/qq_39429962/article/details/84196938?ops_request_misc=&request_id=&biz_id=102&utm_term=xpath%20div%E6%A0%87%E7%AD%BE%E4%B8%8B%E7%BA%AF%E6%96%87%E6%9C%AC&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduweb~default-0-84196938.pc_search_result_control_group&spm=1018.2226.3001.4187 "Xpath如何提取一个标签里的所有文本?_对明天的期待丶的博客-CSDN博客")
[python中HTML文档转义与反转义方法介绍_codingforhaifeng的博客-CSDN博客_python 反转义](https://blog.csdn.net/codingforhaifeng/article/details/80615008?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522163560612316780271561525%2522%252C%2522scm%2522%253A%252220140713.130102334..%2522%257D&request_id=163560612316780271561525&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~baidu_landing_v2~default-1-80615008.pc_search_result_control_group&utm_term=python+html%E5%8F%8D%E8%BD%AC%E4%B9%89&spm=1018.2226.3001.4187 "python中HTML文档转义与反转义方法介绍_codingforhaifeng的博客-CSDN博客_python 反转义")
[html文件转md文件_OzupeSir-CSDN博客_html转md](https://blog.csdn.net/weixin_45611266/article/details/102563758?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522163560314016780262529248%2522%252C%2522scm%2522%253A%252220140713.130102334..%2522%257D&request_id=163560314016780262529248&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~sobaiduend~default-2-102563758.pc_search_result_control_group&utm_term=html%E8%BD%ACmd&spm=1018.2226.3001.4187 "html文件转md文件_OzupeSir-CSDN博客_html转md")
[两万字博文教你python爬虫requests库【详解篇】_孤寒者的博客-CSDN博客_python requests库](https://blog.csdn.net/qq_44907926/article/details/118667559?ops_request_misc=%257B%2522request%255Fid%2522%253A%2522163560953216780366582463%2522%252C%2522scm%2522%253A%252220140713.130102334..%2522%257D&request_id=163560953216780366582463&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~top_positive~default-1-118667559.pc_search_result_control_group&utm_term=requests&spm=1018.2226.3001.4187 "两万字博文教你python爬虫requests库【详解篇】_孤寒者的博客-CSDN博客_python requests库")
感谢大佬帮助
搞定
快乐
\( ̄︶ ̄*\))
睡觉。
( ̄o ̄) . z Z
评论区