python将电视剧按收视率进行排序_python--爬取豆瓣热门国产电视剧保存为文件

本文介绍了使用 Python 爬取豆瓣数据的两种方法:一是利用 Requests 请求接口并用 json 解析返回数据,抓取热门国产电视剧并保存为文件;二是运用 Requests 与 lxml 库的 etree 模块(XPath)抓取正在热映电影的名称、评分、海报图片及详情链接,并保存为 json 文件。此外,还提供了一个基于 threading 的简单多线程图片下载示例。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

# -*- coding: utf-8 -*-

__author__ = 'Frank Li'

import requests

import json

class HotSpider(object):
    """Crawl Douban's "hot domestic TV" mobile collection and dump it to a file.

    The collection endpoint is paged through its ``start`` query parameter;
    ``run`` walks the offsets and appends every returned item to ``hot.json``
    as one JSON document per line.
    """

    # Step between successive ``start`` offsets. Must stay equal to the
    # ``count=18`` value hard-coded in ``self.url``.
    PAGE_SIZE = 18
    # Upper bound (exclusive) on the ``start`` offset requested by ``run``.
    TOTAL = 500

    def __init__(self):
        # ``{}`` is replaced with the paging offset by ``run``.
        self.url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=android&for_mobile=1&start={}&count=18&loc_id=108288"
        # One session for all pages so the connection (and cookies) are reused.
        self.session = requests.session()
        self.headers = {
            "Referer": "https://m.douban.com/tv/chinese",
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Mobile Safari/537.36",
        }

    def parse_2_list_from_str(self, url):
        """Fetch ``url`` and return the ``subject_collection_items`` list.

        Raises:
            KeyError: if the decoded JSON has no ``subject_collection_items`` key.
        """
        response = self.session.get(url, headers=self.headers)
        payload = json.loads(response.content.decode())
        return payload['subject_collection_items']

    def save_as_file(self, content_list, file):
        """Append every element of ``content_list`` to ``file``, one JSON per line.

        Args:
            content_list: iterable of JSON-serialisable objects.
            file: path of the output file; opened in append mode as UTF-8.
        """
        with open(file, 'a', encoding='utf-8') as f:
            for content in content_list:
                # ensure_ascii=False keeps Chinese titles human-readable.
                f.write(json.dumps(content, ensure_ascii=False))
                f.write('\n')

    def run(self):
        """Request every page with offset below ``TOTAL`` and save it to ``hot.json``."""
        num = 0
        while num < self.TOTAL:
            url = self.url.format(num)
            print(url)
            self.save_as_file(self.parse_2_list_from_str(url), 'hot.json')
            num += self.PAGE_SIZE

if __name__ == '__main__':
    # Script entry point: build the spider and crawl all pages.
    hot_spider = HotSpider()
    hot_spider.run()

使用 xpath 爬取正在热映的电影并保存为 json 文件

# -*- coding: utf-8 -*-

__author__ = 'Frank Li'

import requests

from lxml import etree

import json

# Scrape Douban's "now playing" page for Changsha and append one pretty-printed
# JSON object per movie (name, poster image, detail link, score) to movie.json.
url = "https://movie.douban.com/cinema/nowplaying/changsha/"

headers = {
    "Referer": "https://movie.douban.com/cinema/nowplaying/changsha/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}

sess = requests.session()
response = sess.get(url, headers=headers)
html_str = response.content.decode()
element = etree.HTML(html_str)

# Each XPath collects one attribute for every movie on the page, in document order.
movie_img_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='poster']//img/@src")
movie_name_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='stitle']/a/@title")
movie_addr_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='stitle']/a/@href")
movie_score_list = element.xpath("//div[@class='mod-bd']/ul[@class='lists']//li[@class='srating']/span[@class='subject-rate']/text()")

# NOTE(review): zip() pairs the four lists purely by position and stops at the
# shortest one. If some movie on the page has no 'subject-rate' span, scores
# would presumably shift onto the wrong titles — verify against the live markup.
# The output file is opened once instead of once per movie.
with open('movie.json', 'a', encoding='utf-8') as f:
    for name, img, addr, score in zip(movie_name_list, movie_img_list, movie_addr_list, movie_score_list):
        item = {'name': name, 'img': img, 'addr': addr, 'score': score}
        item_json = json.dumps(item, ensure_ascii=False, indent=2)
        print(item_json)
        f.write(item_json)
        f.write('\n')
        # Flush per item so the file on disk keeps pace with the printed output.
        f.flush()

保存下来的 movie.json 文件

{

"name": "碟中谍6:全面瓦解",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529365085.jpg",

"addr": "https://movie.douban.com/subject/26336252/?from=playing_poster",

"score": "8.3"

}

{

"name": "阿尔法:狼伴归途",

"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530871439.jpg",

"addr": "https://movie.douban.com/subject/26810318/?from=playing_poster",

"score": "6.5"

}

{

"name": "蚁人2:黄蜂女现身",

"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529389608.jpg",

"addr": "https://movie.douban.com/subject/26636712/?from=playing_poster",

"score": "7.5"

}

{

"name": "传奇的诞生",

"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531286907.jpg",

"addr": "https://movie.douban.com/subject/3073268/?from=playing_poster",

"score": "7.6"

}

{

"name": "快把我哥带走",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531080870.jpg",

"addr": "https://movie.douban.com/subject/30122633/?from=playing_poster",

"score": "7.0"

}

{

"name": "道高一丈",

"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2530863118.jpg",

"addr": "https://movie.douban.com/subject/26954268/?from=playing_poster",

"score": "5.7"

}

{

"name": "李宗伟:败者为王",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530870325.jpg",

"addr": "https://movie.douban.com/subject/27195119/?from=playing_poster",

"score": "7.1"

}

{

"name": "西虹市首富",

"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2529206747.jpg",

"addr": "https://movie.douban.com/subject/27605698/?from=playing_poster",

"score": "6.7"

}

{

"name": "一出好戏",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529571873.jpg",

"addr": "https://movie.douban.com/subject/26985127/?from=playing_poster",

"score": "7.3"

}

{

"name": "精灵旅社3:疯狂假期",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530591543.jpg",

"addr": "https://movie.douban.com/subject/26630714/?from=playing_poster",

"score": "6.9"

}

{

"name": "苏丹",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2529570494.jpg",

"addr": "https://movie.douban.com/subject/26728641/?from=playing_poster",

"score": "7.0"

}

{

"name": "巨齿鲨",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530572643.jpg",

"addr": "https://movie.douban.com/subject/26426194/?from=playing_poster",

"score": "6.0"

}

{

"name": "藏北秘岭-重返无人区",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532522676.jpg",

"addr": "https://movie.douban.com/subject/30208007/?from=playing_poster",

"score": "6.2"

}

{

"name": "那些女人",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2530146643.jpg",

"addr": "https://movie.douban.com/subject/26574965/?from=playing_poster",

"score": "5.3"

}

{

"name": "草戒指",

"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2531782507.jpg",

"addr": "https://movie.douban.com/subject/27204180/?from=playing_poster",

"score": "5.6"

}

{

"name": "吻隐者",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2531980221.jpg",

"addr": "https://movie.douban.com/subject/26928809/?from=playing_poster",

"score": "7.6"

}

{

"name": "禹神传之寻找神力",

"img": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2532781444.jpg",

"addr": "https://movie.douban.com/subject/30227727/?from=playing_poster",

"score": "6.6"

}

{

"name": "大师兄",

"img": "https://img1.doubanio.com/view/photo/s_ratio_poster/public/p2528842218.jpg",

"addr": "https://movie.douban.com/subject/27201353/?from=playing_poster",

"score": "6.2"

}

简单的多线程图片下载

import requests

from bs4 import BeautifulSoup

import os

import threading

def download_img(src, target=None, timeout=10):
    """Download the image at ``src`` into the local ``./img`` directory.

    Args:
        src: URL of the image to fetch.
        target: file name to save under inside ``./img``. When omitted, the
            last ``/``-separated segment of ``src`` is used.
        timeout: seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot hang the calling thread forever.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never written to disk as if it were the image.
    """
    parent_dir = './img'
    os.makedirs(parent_dir, exist_ok=True)

    # Honour an explicit file name; previously ``target`` was always overwritten.
    if target is None:
        target = src.split('/')[-1]
    target = os.path.join(parent_dir, target)

    print(threading.current_thread(), ' start to download img: ', target)
    # The context manager releases the streamed connection even on errors.
    with requests.get(src, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(target, 'wb') as tar_file:
            for chunk in r.iter_content(chunk_size=8192):
                tar_file.write(chunk)
    print('saved {}'.format(target))

if __name__ == '__main__':
    # Fetch one Tieba thread page and download every post image it contains,
    # using one thread per image.
    URL = 'https://tieba.baidu.com/p/6034793219'
    html = requests.get(URL).text
    soup = BeautifulSoup(html, 'lxml')

    # Collect the src of every <img class="BDE_Image"> tag on the page.
    imgs = [tag['src'] for tag in soup.find_all('img', {'class': 'BDE_Image'})]

    threads = []
    for i, img in enumerate(imgs):
        t = threading.Thread(target=download_img, args=(img,), name='Thread-{}'.format(i))
        t.start()
        threads.append(t)

    # Wait for all downloads to finish before the script exits.
    for t in threads:
        t.join()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值