分享一个 Python 脚本，可用于爬取 LearnKu 社区的优质文档并存储到本地，供自己学习。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
import os
import re
import time

import requests
from lxml import etree
# Local root directory; the script creates one sub-folder per section title inside it.
BASEDIR = r"C:\Users\pzj\Desktop\go入门指南"
# Prefix for every Markdown download: base_url + "/" + <last segment of the article href> + ".md".
base_url = "https://learnku.com/docs/the-way-to-go"
# Page fetched once at startup; its HTML supplies the section titles and article links.
url = "https://learnku.com/docs/the-way-to-go/book-intro/3560"
# Number of consecutive /html/body/div[2]/div[i] sections visited, starting at i = 3.
content_length = 22
# Seconds to wait for a response; requests blocks indefinitely without a timeout.
REQUEST_TIMEOUT = 10
# XPath of the i-th section container on the index page (sections are 1-indexed divs).
_SECTION_XPATH = "/html/body/div[2]/div[%s]"
# Characters that Windows does not allow in file or directory names.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')


def _fetch_text(target_url):
    """Return the decoded body of ``target_url``.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status (so error pages are never saved as content).
    """
    response = requests.get(target_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Without a declared charset requests falls back to ISO-8859-1 for text/*
    # responses, which garbles Chinese text; assume UTF-8 in that case.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = "utf-8"
    return response.text


def _safe_name(name):
    """Return ``name`` with characters forbidden in Windows file names replaced by ``_``."""
    return _UNSAFE_CHARS.sub("_", name)


def _section_articles(tree, index):
    """Extract one section of the index page.

    Args:
        tree: parsed HTML tree of the index page.
        index: 1-based position of the section ``div`` under ``/html/body/div[2]``.

    Returns:
        ``(section_title, [(article_title, markdown_url), ...])``, or ``None``
        when the page has no title text at that position.
    """
    section = _SECTION_XPATH % index
    heading = tree.xpath(section + "/div[1]/text()")
    if not heading:
        return None
    hrefs = tree.xpath(section + "/div[2]/a/@href")
    # Keep only the text after the last "." — presumably to drop a numbering
    # prefix such as "1.2."; NOTE(review): this also truncates titles that
    # themselves contain a dot.
    names = [
        text.strip().split(".")[-1].strip()
        for text in tree.xpath(section + "/div[2]/a/div/text()")
    ]
    md_urls = [base_url + "/" + href.split("/")[-1] + ".md" for href in hrefs]
    # zip() pairs titles with links and stops at the shorter list, so a
    # mismatch between the two XPath results cannot raise IndexError.
    return heading[0].strip(), list(zip(names, md_urls))


def main():
    """Download every article linked from the index page as a Markdown file.

    For each section found on the page at ``url`` a folder named after the
    section title is created under ``BASEDIR``; each article is saved there as
    ``<n>.<article title>.md``. A failed article download is reported and
    skipped; a failure to fetch the index page itself propagates.
    """
    tree = etree.HTML(_fetch_text(url))
    for index in range(3, 3 + content_length):
        section = _section_articles(tree, index)
        if section is None:
            # Fewer sections on the page than content_length expects.
            continue
        title, articles = section
        md_dir = os.path.join(BASEDIR, _safe_name(title))
        os.makedirs(md_dir, exist_ok=True)
        for number, (name, md_url) in enumerate(articles, start=1):
            md_file = os.path.join(md_dir, f"{number}.{_safe_name(name)}.md")
            try:
                md_text = _fetch_text(md_url)
            except requests.RequestException as err:
                print(md_file + " 爬取失败: " + str(err))
            else:
                with open(md_file, "w", encoding="utf-8") as f:
                    f.write(md_text)
                print(md_file + " 爬取成功")
            # Pause between requests to avoid hammering the server.
            time.sleep(0.5)


if __name__ == "__main__":
    main()
|