import requests
import json
from lxml import etree

class Yz():
    # Target URL: https://www.lmonkey.com/essence
    url = 'https://www.lmonkey.com/essence'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    # Scraped data
    data = ''
    # Path of the output file
    filepath = './yz.json'
    # Initialization
    def __init__(self):
        # Send the request
        result = requests.get(url=self.url, headers=self.headers)
        print(result.status_code)
        if result.status_code == 200:
            # Write the response body to a local file
            # ./ means the current directory
            with open('./yz.html', 'w', encoding='utf-8') as fp:
                fp.write(result.text)
            if self.parsedata():
                self.writedata()
    # Parse and analyze the data
    def parsedata(self):
        # Parse the saved HTML
        html = etree.parse('./yz.html', etree.HTMLParser())
        # Extract the fields we need: article title, article URL, author
        authors = html.xpath('//div[contains(@class,"old_content")]//div[contains(@class,"flex-fill")]/p/strong/a/text()')
        titles = html.xpath('//div[contains(@class,"old_content")]//div[contains(@class,"flex-fill")]//div[contains(@class,"topic_title")]//text()')
        title_urls = html.xpath('//div[contains(@class,"old_content")]//div[contains(@class,"flex-fill")]/a[2]/@href')
        # res = zip(authors, titles, title_urls)
        # print(list(res))
        # print(*zip(authors, titles, title_urls))  # same result as the two lines above
        data = []
        for i in range(0, len(authors)):
            res = {'author': authors[i], 'title': titles[i], 'title_url': title_urls[i]}
            data.append(res)
        self.data = data
        return True
    # Write the data to a file / database
    def writedata(self):
        # Write the extracted data to a file; writing to a database would be better
        with open(self.filepath, 'w', encoding='utf-8') as fp:
            json.dump(self.data, fp, ensure_ascii=False)  # ensure_ascii=False keeps Chinese characters readable in the JSON output
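
    # A minimal sketch of the "write to a database" option mentioned above, using the
    # standard-library sqlite3 module. The database path and the table/column names
    # here are illustrative assumptions, not part of the original script.
    def writedb(self, dbpath='./yz.db'):
        import sqlite3
        conn = sqlite3.connect(dbpath)
        try:
            # Create the table on first use, then insert one row per scraped article
            conn.execute(
                'CREATE TABLE IF NOT EXISTS articles '
                '(author TEXT, title TEXT, title_url TEXT)'
            )
            conn.executemany(
                'INSERT INTO articles (author, title, title_url) VALUES (?, ?, ?)',
                [(d['author'], d['title'], d['title_url']) for d in self.data]
            )
            conn.commit()
        finally:
            conn.close()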

# Instantiate the object
if __name__ == '__main__':
    Yz()