通过抓包爬取某视频网站的电影
抓包工具:Burp Suite
1. 先使用抓包工具进行抓包分析,得到对应的接口
2. 直接用requests库请求该接口,获取响应数据
3. 存入数据库
代码
from functools import reduce
import pymysql
import requests
import json
def connectdatabase(host='127.0.0.1', user='root', password='root',
                    database='site', port=3308, charset='utf8'):
    """Open and return a new PyMySQL connection.

    Called without arguments it targets a local server on port 3308,
    schema ``site``, with user and password ``root``. Each setting can be
    overridden by keyword so credentials need not be edited in the source.

    Args:
        host: Server host name or IP address.
        user: Account name used to log in.
        password: Password for *user*.
        database: Schema selected after connecting.
        port: TCP port of the server.
        charset: Connection character set.

    Returns:
        The connection object produced by ``pymysql.connect``. The caller
        is responsible for closing it.
    """
    # NOTE(review): MySQL's 'utf8' is the 3-byte utf8mb3 encoding; consider
    # 'utf8mb4' if stored text may contain 4-byte characters -- confirm
    # against the table definition before changing the default.
    return pymysql.connect(
        host=host,
        user=user,
        password=password,
        database=database,
        port=port,
        charset=charset,
    )
def handle_request(pageno: int, timeout: float = 10) -> str:
    """Fetch one page of the movie filter list and return the raw body text.

    Args:
        pageno: Page number substituted into the ``pageno`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text. The request passes ``callback=__jp2``,
        so the body is presumably JSONP-wrapped rather than bare JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within *timeout* seconds.
    """
    # NOTE(review): the host part looks like a link-rewriting proxy prefix in
    # front of api.web.360kan.com -- confirm whether the direct API URL was
    # intended. The literal is left untouched here.
    url = 'https://api.web.360kan.com/v1/filter/list?catid=1&rank=rankhot&cat=%E5%96%9C%E5%89%A7&year=&area=&act=&size=35&pageno={}&callback=__jp2'.format(pageno)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0'
    }
    resp = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of handing its text to the parser.
    resp.raise_for_status()
    return resp.text
def merge(x: str, y: str) -> str:
    """Join two strings with a ``/`` separator.

    Suitable as the binary function for ``functools.reduce``: folding
    ``['a', 'b', 'c']`` with it yields ``'a/b/c'``.

    Args:
        x: Left-hand string (the accumulated value when used in a fold).
        y: Right-hand string appended after the separator.

    Returns:
        ``x``, a slash, then ``y``.
    """
    return x + '/' + y
def parse_data(movielist, sql, cusor):
    """Insert one row per (area, category) pair for every movie.

    Args:
        movielist: Iterable of movie dicts. Each must provide the keys
            ``actor``, ``area``, ``description``, ``director``,
            ``doubanscore``, ``moviecategory``, ``title``, ``pubdate``,
            ``playlink_sites``, ``playlinks``, ``cover`` and ``comment``.
        sql: Parameterised statement with eleven placeholders, filled in the
            order: actors, area, description, directors, douban score,
            category, title, publication date, play addresses, comment,
            cover URL.
        cusor: Object exposing ``execute(sql, params)``, e.g. a DB-API cursor.

    A movie whose ``area`` or ``moviecategory`` list is empty produces no
    rows. Each inserted row is also printed to stdout.

    Raises:
        KeyError: If a movie lacks one of the keys above, or a site listed
            in ``playlink_sites`` has no entry in ``playlinks``.
    """
    for movie in movielist:
        # str.join accepts an empty list (giving ''), whereas reduce()
        # without an initial value raises TypeError on one.
        actors = '/'.join(movie['actor'])
        directors = '/'.join(movie['director'])
        description = str(movie['description'])
        doubanscore = movie['doubanscore']
        title = movie['title']
        pubdate = movie['pubdate']
        # "site:link;" for every site, concatenated in listing order.
        play_address = ''.join(
            site + ':' + movie['playlinks'][site] + ';'
            for site in movie['playlink_sites']
        )
        # The cover value is scheme-relative, so the scheme is prepended.
        cover = 'https:' + movie['cover']
        comment = movie['comment']

        for area in movie['area']:
            for category in movie['moviecategory']:
                row = (
                    actors,
                    area,
                    description,
                    directors,
                    doubanscore,
                    category,
                    title,
                    pubdate,
                    play_address,
                    comment,
                    cover,
                )
                cusor.execute(sql, row)
                print(*row)
# Driver: download pages 1..19 of the listing and store every movie.
# The statement is constant, so it is built once instead of once per page.
# The leading NULL fills the table's first column (presumably an
# auto-increment id -- confirm against the schema).
sql = "insert into movie values(NULL,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
conn = connectdatabase()
try:
    cusor = conn.cursor()
    try:
        for i in range(1, 20):
            body = handle_request(i)
            # Unwrap the callback call around the JSON: keep what lies
            # between the first '(' and the last ')'. This tolerates a
            # missing ';' or trailing whitespace, unlike a fixed slice.
            payload = body[body.index('(') + 1:body.rindex(')')]
            movielist = json.loads(payload)['data']['movies']
            parse_data(movielist, sql, cusor)
            # Commit after each page so earlier pages are kept if a later
            # one fails.
            conn.commit()
    finally:
        cusor.close()
finally:
    # Always release the connection, even when a request or insert raised.
    conn.close()
总结:遇到数据无法回显时,先用抓包工具抓包,看能否找到对应的数据接口;找不到可用接口时再使用selenium;最后才考虑js逆向。