import requests
from bs4 import BeautifulSoup
# Define the request URL
url = 'https://so.gushiwen.cn/shiwen/'
# Define the request headers; a browser User-Agent makes the request look like normal traffic
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
res = requests.get(url=url, headers=headers)
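# Optional hardening (not part of the original script): raise an exception on
# HTTP error codes instead of checking res.status_code by hand below.
# res.raise_for_status()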
# Check whether the request succeeded, then grab and parse the page source
if res.status_code == 200:
    print('Request succeeded')
    # Parse the HTML; 'lxml' requires the third-party lxml package
    # (BeautifulSoup's built-in 'html.parser' is a slower drop-in fallback)
    soup = BeautifulSoup(res.text, 'lxml')
    # Extract the data
    # Each poem lives in a large <div class="sons"> container
    divs = soup.find_all('div', class_="sons")
    #print(divs)
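    # Equivalent CSS-selector form of the same query, if preferred:
    # divs = soup.select('div.sons')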
    data = []
    for i in divs:
        cont = i.find('div', class_="cont")
        # Skip entries without a title paragraph: both cont and cont.p must be non-None
        # (some "sons" divs on the page are navigation blocks with no "cont" child)
        if cont and cont.p:
            varlist = {
                'title': cont.p.text,
                'author': cont.find('p', class_="source").text,
                'url': cont.p.find('a')['href'],
                'content': cont.find('div', class_="contson").text
            }
            data.append(varlist)
            print('Title: ' + varlist['title'] + '\t' + varlist['author'] + '\t'
                  + 'Link: https://so.gushiwen.cn/' + varlist['url']
                  + '\n' + varlist['content'])
    # Write the collected records to a local file
    with open('./gushi.txt', 'w', encoding='utf-8') as fp:
        for i in data:
            print(i)
            fp.write('Title: ' + i['title'] + '\tAuthor: ' + i['author']
                     + '\tLink: https://so.gushiwen.cn/' + i['url']
                     + '\nContent: ' + i['content'] + '\n')
    print('File written!')
Summary: when writing the results to a local file, each parsed record must first be collected with data.append(varlist) inside the parsing loop; the file is then written in a single pass over data afterwards, rather than opening the file once per record.
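The same collect-then-write pattern also maps onto the standard library's csv module, which keeps the fields structured instead of joining them into one tab-separated string. The sketch below is an optional variant, not part of the original script; it assumes data is the list of dicts built above with the keys 'title', 'author', 'url', and 'content', and the output path gushi.csv is a placeholder.

import csv

def write_csv(data, path='./gushi.csv'):
    # newline='' prevents the csv module from inserting blank rows on Windows
    with open(path, 'w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=['title', 'author', 'url', 'content'])
        writer.writeheader()    # header row with the four field names
        writer.writerows(data)  # one row per poem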