import requests
from lxml import etree
import re
import pymysql
from time import sleep
from concurrent.futures import ThreadPoolExecutor
def get_conn():
    # create the MySQL connection
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="root",
                           db="novels",
                           charset="utf8")
    # create a cursor
    cursor = conn.cursor()
    return conn, cursor
def close_conn(conn, cursor):
    cursor.close()
    conn.close()
def get_xpath_resp(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)  # parse the HTML with lxml's etree
    return tree, resp
def get_chapters(url):
    tree, _ = get_xpath_resp(url)
    # get the novel title, checking that the XPath result is not empty
    novel_name_elements = tree.xpath('//*[@id="info"]/h1/text()')
    if novel_name_elements:
        novel_name = novel_name_elements[0]
    else:
        print("Novel title not found; check the XPath expression or the page structure.")
        novel_name = "unknown novel"
    # chapter list nodes
    dds = tree.xpath('/html/body/div[4]/dl/dd')
    title_list = []
    link_list = []
    for d in dds[:15]:  # only the first 15 chapters
        title = d.xpath('./a/text()')[0]  # chapter title
        title_list.append(title)
        link = d.xpath('./a/@href')[0]  # relative chapter link
        chapter_url = url + link  # build the full chapter URL
        link_list.append(chapter_url)
    return title_list, link_list, novel_name
def get_content(novel_name, title, url):
    conn = None
    cursor = None
    try:
        conn, cursor = get_conn()
        # SQL for inserting one chapter
        sql = 'INSERT INTO novel(novel_name,chapter_name,content) VALUES(%s,%s,%s)'
        tree, resp = get_xpath_resp(url)
        # extract the raw chapter HTML (re.S so the match can span multiple lines)
        content = re.findall('<div id="content">(.*?)</div>', resp.text, re.S)[0]
        # clean up the content: line breaks, the &nbsp; indentation entities and the site's ad line
        content = content.replace('<br />', '\n').replace('&nbsp;', ' ').replace(
            '全本小说网 www.qb5.tw,最快更新<a href="https://www.qb5.tw/book_116659/">宇宙职业选手</a>最新章节!<br><br>', '')
        print(title, content)
        cursor.execute(sql, [novel_name, title, content])  # insert the row
        conn.commit()  # commit the transaction
    except Exception as e:
        print(f'failed to save {title}: {e}')
    finally:
        sleep(2)
        if conn and cursor:
            close_conn(conn, cursor)  # close the cursor and connection
if __name__ == '__main__':
    # fetch the novel title, chapter titles and chapter links
    title_list, link_list, novel_name = get_chapters('https://www.qb5.tw/book_116659/')
    with ThreadPoolExecutor(5) as t:  # thread pool with 5 workers
        for title, link in zip(title_list, link_list):
            t.submit(get_content, novel_name, title, link)  # submit one task per chapter
Novel chapters
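The INSERT in get_content() assumes a `novels` database that already contains a `novel` table. A minimal setup sketch is shown below; only the table and column names come from the SQL above, the column types and sizes are assumptions.

import pymysql

# One-off setup: create the table that get_content() writes into.
# Table/column names are taken from the INSERT statement; types and sizes are assumed.
conn = pymysql.connect(host="127.0.0.1", user="root", password="root",
                       db="novels", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS novel (
            id INT AUTO_INCREMENT PRIMARY KEY,
            novel_name VARCHAR(255),
            chapter_name VARCHAR(255),
            content TEXT
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()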
import re
import os
import requests
filename = 'music\\'  # output folder (Windows-style path, as in the original)
# create the folder if it does not exist
if not os.path.exists(filename):
    os.makedirs(filename)
# playlist page to request (to scrape another playlist, only this url needs to change)
url = 'https://music.163.com/playlist?id=3778678'
# spoofed request headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'
}
# send the request
response = requests.get(url, headers=headers)
# re.findall returns a list of every match of the pattern in the string
# the r prefix marks a raw string, so backslashes are not treated as escape characters
# (\d+): capture group matching one or more digits (the song id)
# (.*?): capture group, non-greedy match of anything up to </a> (the song title)
html_data = re.findall(r'<li><a href="/song\?id=(\d+)">(.*?)</a>', response.text)
# with two capture groups, re.findall returns a list of (id, title) tuples
for num_id, title in html_data:
    # external media url for this song id
    music_url = f'https://music.163.com/song/media/outer/url?id={num_id}.mp3'
    # request the binary audio data
    music_content = requests.get(music_url, headers=headers)
    # strip characters that are not allowed in Windows file names
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    # save the file
    with open(filename + safe_title + '.mp3', 'wb') as f:
        f.write(music_content.content)
    print(num_id, title)
Songs
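The outer/url endpoint can redirect to an HTML page when a track is unavailable, in which case the loop above saves a broken .mp3. A small guard like the sketch below can skip such responses; the helper name and the Content-Type check are assumptions, not part of the original script.

# Hypothetical helper: only write the file when the response actually looks like audio.
def save_if_audio(resp, path):
    content_type = resp.headers.get('Content-Type', '')
    if resp.status_code == 200 and 'text/html' not in content_type:
        with open(path, 'wb') as f:
            f.write(resp.content)
        return True
    return False

# usage inside the loop: save_if_audio(music_content, filename + safe_title + '.mp3')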
import csv
import requests
import re
import time
def main(page):
    url = f'https://tieba.baidu.com/p/7882177660?pn={page}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    resp = requests.get(url, headers=headers)
    html = resp.text
    # comment text
    comments = re.findall('style="display:;"> (.*?)</div>', html)
    # comment authors
    users = re.findall('class="p_author_name j_user_card" href=".*?" target="_blank">(.*?)</a>', html)
    # comment timestamps
    comment_times = re.findall('楼</span><span class="tail-info">(.*?)</span><div', html)
    for u, c, t in zip(users, comments, comment_times):
        # skip rows that still contain markup or have an abnormally long user name
        if 'img' in c or 'div' in c or len(u) > 50:
            continue
        csvwriter.writerow((u, t, c))
        print(u, t, c)
    print(f'page {page} done')
if __name__ == '__main__':
    # newline='' prevents blank lines in the CSV on Windows
    with open('01.csv', 'a', encoding='utf-8', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(('user', 'time', 'comment'))
        for page in range(1, 8):  # scrape the first 7 pages
            main(page)
            time.sleep(2)
Comments from a Tieba thread
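A quick way to confirm what was written is to read 01.csv back with the same csv module. A minimal check (the file name comes from the script above, everything else is an assumption):

import csv

# Hypothetical sanity check: count the data rows written by the scraper above.
with open('01.csv', encoding='utf-8', newline='') as f:
    rows = list(csv.reader(f))
print(f'{len(rows) - 1} comments saved')  # minus the header row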
import re
import urllib.error
import urllib.request
import xlwt
from bs4 import BeautifulSoup
def main():
    # base URL of the Douban Movie Top 250 list
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)
    savepath = ".\\douban_top250.xls"
    saveData(datalist, savepath)
    print("done")
# compiled regular expressions for the fields of each movie item
findlink = re.compile(r'<a href="(.*?)">')  # detail page link
findimg = re.compile(r'<img.*?src="(.*?)"', re.S)  # poster image URL
findtitle = re.compile(r'<span class="title">(.*)</span>')  # movie title
findrating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # rating
findjudge = re.compile(r'<span>(\d+)人评价</span>')  # number of ratings
findinq = re.compile(r'<span class="inq">(.*)</span>')  # one-line summary
def getData(baseurl):
    datalist = []
    for i in range(0, 10):  # 10 pages of 25 movies each, 250 in total
        url = baseurl + str(i * 25)
        print(f"fetching page {i + 1}: {url}")
        html = askURL(url)
        if not html:  # skip this page if the request failed
            continue
        soup = BeautifulSoup(html, "html.parser")
        items = soup.find_all('div', class_="item")
        print(f"found {len(items)} movie items")
        for item in items:
            try:
                data = []
                item_str = str(item)
                # detail page link
                links = re.findall(findlink, item_str)
                data.append(links[0] if links else "")
                # poster image
                imgs = re.findall(findimg, item_str)
                data.append(imgs[0] if imgs else "")
                # title
                titles = re.findall(findtitle, item_str)
                data.append(titles[0] if titles else "")
                # rating
                ratings = re.findall(findrating, item_str)
                data.append(ratings[0] if ratings else "")
                # number of ratings
                judges = re.findall(findjudge, item_str)
                data.append(judges[0] if judges else "")
                # one-line summary
                inqs = re.findall(findinq, item_str)
                data.append(inqs[0].replace("。", "") if inqs else "")
                datalist.append(data)
                print(f"got: {data[2]}")  # print the movie title
            except Exception as e:
                print(f"error while parsing an item: {e}")
                continue
    print(f"collected {len(datalist)} records in total")
    return datalist
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
    }
    try:
        request = urllib.request.Request(url, headers=head)
        response = urllib.request.urlopen(request, timeout=10)  # 10-second timeout
        html = response.read().decode("utf-8")
        return html
    except urllib.error.URLError as e:
        print(f"URL error: {url}, message: {e}")
        return ""
    except Exception as e:
        print(f"request failed: {url}, message: {e}")
        return ""
def saveData(datalist, savepath):
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('movies', cell_overwrite_ok=True)
    # column headers
    col = ("link", "image", "title", "rating", "votes", "summary")
    for i in range(len(col)):
        worksheet.write(0, i, col[i])
    # write the data rows
    for i, data in enumerate(datalist):
        print(f"writing record {i + 1}")
        for j in range(len(data)):
            worksheet.write(i + 1, j, data[j])
    workbook.save(savepath)
    print(f"data saved to: {savepath}")
if __name__ == "__main__":
    main()
douban
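xlwt can only write the legacy .xls format, so if a plain-text export is preferred, the same rows returned by getData() could be written to CSV instead. A minimal sketch (the save_csv name and the CSV file name are assumptions):

import csv

# Hypothetical alternative to saveData(): write the scraped rows to a CSV file.
def save_csv(datalist, savepath='douban_top250.csv'):
    with open(savepath, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(("link", "image", "title", "rating", "votes", "summary"))
        writer.writerows(datalist)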