Xiaohongshu itself is hard to scrape directly, so this script gets at its content indirectly, through Baidu's cached snapshots (百度快照) of xiaohongshu.com pages.
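The trick is Baidu's site-restricted search: the si= parameter pins results to one domain and ct=2097152 enforces the restriction, so every hit is a xiaohongshu.com note with a snapshot link next to it. Below is a minimal sketch of assembling such a search URL; the helper name and defaults are my own and not part of the original script:

from urllib.parse import quote

# Hypothetical helper: assemble a site-restricted Baidu search URL.
def build_baidu_site_search(keyword, page=0, site="xiaohongshu.com"):
    # pn is the result offset (10 organic results per page); si/ct pin the site.
    return ("https://www.baidu.com/s?ie=utf-8&f=8"
            "&wd=" + quote(keyword) +
            "&pn={}".format(page * 10) +
            "&si=" + site + "&ct=2097152")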
import re

import requests
from lxml import etree
# Fetch a Baidu search results page and parse it into an lxml tree.
def down(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    html = requests.get(url, headers=headers).text
    return etree.HTML(html)
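# A more defensive fetcher (my own sketch, not in the original post): requests.get()
# has no default timeout, so one stalled connection can hang the whole crawl.
# down_safe() is a hypothetical variant with a timeout and simple retries.
def down_safe(url, retries=3, timeout=10):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
            return etree.HTML(resp.text)
        except requests.RequestException:
            if attempt == retries - 1:
                raise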
# Resolve a Baidu result link. Baidu wraps organic results in redirect URLs that
# answer 302; in that case return the real target URL from the Location header,
# otherwise fetch and parse the page itself.
def down1(url):
    resp = requests.get(url=url, allow_redirects=False)
    print(resp.status_code)
    if resp.status_code == 302:
        # Baidu's /link?url=... shim: the real note URL is in the Location header.
        new_id_url = resp.headers["location"]
        print(new_id_url)
        return new_id_url
    else:
        print("++++++++++++++++")
        print(url)
        return etree.HTML(requests.get(url=url).text)
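# Usage sketch (the token below is a made-up placeholder, not a real link):
# Baidu wraps each organic result in a redirect of the form
# https://www.baidu.com/link?url=<token>, which answers 302 with the real note
# URL in the Location header; down1() returns that URL unchanged.
# real_url = down1("https://www.baidu.com/link?url=EXAMPLE_TOKEN")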
# Fetch a Baidu snapshot (百度快照) page; these cached pages are served in a
# GB-family encoding, so set it explicitly before parsing.
def down2(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    html = requests.get(url, headers=headers)
    html.encoding = "gb2312"
    return etree.HTML(html.text)
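# Alternative sketch (an assumption, not verified against every snapshot page):
# instead of hardcoding gb2312, let requests sniff the charset from the body.
# Baidu cache pages are usually GBK, a superset of gb2312, so sniffing avoids
# mojibake on notes that contain rarer characters.
def down2_sniffed(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding  # charset detection built into requests
    return etree.HTML(resp.text)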
# Crawl the first two result pages for the keyword 资生堂 (Shiseido), restricted to
# xiaohongshu.com via the si/ct parameters; gpc narrows results to a date range.
for j in range(2):
    print("Page %d" % (j + 1))
    url1 = "https://www.baidu.com/s?ie=utf-8&f=8&wd=资生堂&pn={}&si=xiaohongshu.com&ct=2097152&gpc=stf%3D1514736000%2C1522598399%7Cstftype%3D2".format(j * 10)
    html1 = down(url1)
    # Only proceed if the pagination bar exists, i.e. the page returned real results.
    if html1.xpath('//div[@id="page"]/strong'):
        for i in range(len(html1.xpath('//div[contains(@class,"result")]/h3/a'))):
            # The abstract starts with "date - text"; keep the text after " - ".
            title1 = html1.xpath('//div[contains(@class,"result")][{}]/div[@class="c-abstract"]'.format(i + 1))[0]
            title2 = title1.xpath('string(.)')
            title = re.findall('- (.*)', title2)[0]
            print(title)
            # The result link is a Baidu redirect shim; down1() resolves the real note URL.
            link1 = html1.xpath('//div[contains(@class,"result")][{}]/h3/a/@href'.format(i + 1))[0]
            detail_url = down1(link1)
            # The second link in the result footer is the 百度快照 (cached snapshot) URL.
            snapshot_url = html1.xpath('//div[@class="f13"]/a[2]/@href')[i]
            print(snapshot_url)
            try:
                snapshot_html = down2(snapshot_url)
                like = snapshot_html.xpath('//div[@class="operation-block"]/span[1]/span/text()')[0]
                comment = snapshot_html.xpath('//div[@class="operation-block"]/span[2]/span/text()')[0]
                star = snapshot_html.xpath('//div[@class="operation-block"]/span[3]/span/text()')[0]
                publish_date = snapshot_html.xpath('//div[@class="publish-date"]/span[2]/text()')[0]
                user = snapshot_html.xpath('//span[@class="name-detail"]/text()')[0]
                user_img = snapshot_html.xpath('//div[@class="right-card"]//div[@class="left-img"]/img/@src')[0]
                # The first slide image, if any, is carried in an inline style: url(//...).
                if snapshot_html.xpath('//ul[@class="slide"]/li[1]/span/@style'):
                    detail_img_url1 = snapshot_html.xpath('//ul[@class="slide"]/li[1]/span/@style')[0]
                    print(detail_img_url1)
                    detail_img_url = re.findall(r'url\(//(.*?)\)', detail_img_url1, re.S)[0]
                else:
                    detail_img_url = ""
                content1 = snapshot_html.xpath("//div[@class='left-card']//div[@class='content']")[0]
                content = content1.xpath('string(.)')
                print(title, detail_url, content, user, user_img, publish_date, like, comment, star, "https://" + detail_img_url)
            except Exception:
                # Snapshot layouts vary; skip notes whose page is missing an expected node.
                pass
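The script only prints each record. To keep the scraped fields, a small CSV writer is enough; the sketch below is my own addition, with column names chosen to mirror the print() call above:

import csv

# Hypothetical helper: persist scraped rows (tuples in the print() order above).
def save_rows(rows, path="xiaohongshu_notes.csv"):
    fields = ["title", "detail_url", "content", "user", "user_img",
              "publish_date", "likes", "comments", "stars", "image_url"]
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(fields)
        writer.writerows(rows)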