Xiaohongshu itself is hard to scrape directly, so this script gets at its content indirectly, through Baidu's cached snapshots (百度快照) of xiaohongshu.com pages.
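The trick is Baidu's site-restricted search: the si= parameter pins results to one domain and ct=2097152 enforces the restriction, so every hit is a xiaohongshu.com note with a snapshot link next to it. Below is a minimal sketch of assembling such a search URL; the helper name and defaults are my own and not part of the original script:

from urllib.parse import quote

# Hypothetical helper: assemble a site-restricted Baidu search URL.
def build_baidu_site_search(keyword, page=0, site="xiaohongshu.com"):
    # pn is the result offset (10 organic results per page); si/ct pin the site.
    return ("https://www.baidu.com/s?ie=utf-8&f=8"
            "&wd=" + quote(keyword) +
            "&pn={}".format(page * 10) +
            "&si=" + site + "&ct=2097152")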
import re

import requests
from lxml import etree
# Fetch a Baidu search results page and parse it into an lxml tree.
def down(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    html = requests.get(url, headers=headers).text
    return etree.HTML(html)
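# A more defensive fetcher (my own sketch, not in the original post): requests.get()
# has no default timeout, so one stalled connection can hang the whole crawl.
# down_safe() is a hypothetical variant with a timeout and simple retries.
def down_safe(url, retries=3, timeout=10):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
            return etree.HTML(resp.text)
        except requests.RequestException:
            if attempt == retries - 1:
                raise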
# Resolve a Baidu result link. Baidu wraps organic results in redirect URLs that
# answer 302; in that case return the real target URL from the Location header,
# otherwise fetch and parse the page itself.
def down1(url):
    resp = requests.get(url=url, allow_redirects=False)
    print(resp.status_code)
    if resp.status_code == 302:
        # Baidu's /link?url=... shim: the real note URL is in the Location header.
        new_id_url = resp.headers["location"]
        print(new_id_url)
        return new_id_url
    else:
        print("++++++++++++++++")
        print(url)
        return etree.HTML(requests.get(url=url).text)
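# Usage sketch (the token below is a made-up placeholder, not a real link):
# Baidu wraps each organic result in a redirect of the form
# https://www.baidu.com/link?url=<token>, which answers 302 with the real note
# URL in the Location header; down1() returns that URL unchanged.
# real_url = down1("https://www.baidu.com/link?url=EXAMPLE_TOKEN")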
# Fetch a Baidu snapshot (百度快照) page; these cached pages are served in a
# GB-family encoding, so set it explicitly before parsing.
def down2(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    html = requests.get(url, headers=headers)
    html.encoding = "gb2312"
    return etree.HTML(html.text)
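# Alternative sketch (an assumption, not verified against every snapshot page):
# instead of hardcoding gb2312, let requests sniff the charset from the body.
# Baidu cache pages are usually GBK, a superset of gb2312, so sniffing avoids
# mojibake on notes that contain rarer characters.
def down2_sniffed(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    resp.encoding = resp.apparent_encoding  # charset detection built into requests
    return etree.HTML(resp.text)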
# Crawl the first two result pages for the keyword 资生堂 (Shiseido), restricted to
# xiaohongshu.com via the si/ct parameters; gpc narrows results to a date range.
for j in range(2):
    print("Page %d" % (j + 1))
    url1 = "https://www.baidu.com/s?ie=utf-8&f=8&wd=资生堂&pn={}&si=xiaohongshu.com&ct=2097152&gpc=stf%3D1514736000%2C1522598399%7Cstftype%3D2".format(j * 10)
    html1 = down(url1)
    # Only proceed if the pagination bar exists, i.e. the page returned real results.
    if html1.xpath('//div[@id="page"]/strong'):
        for i in range(len(html1.xpath('//div[contains(@class,"result")]/h3/a'))):
            # The abstract starts with "date - text"; keep the text after " - ".
            title1 = html1.xpath('//div[contains(@class,"result")][{}]/div[@class="c-abstract"]'.format(i + 1))[0]
            title2 = title1.xpath('string(.)')
            title = re.findall('- (.*)', title2)[0]
            print(title)
            # The result link is a Baidu redirect shim; down1() resolves the real note URL.
            link1 = html1.xpath('//div[contains(@class,"result")][{}]/h3/a/@href'.format(i + 1))[0]
            detail_url = down1(link1)
            # The second link in the result footer is the 百度快照 (cached snapshot) URL.
            snapshot_url = html1.xpath('//div[@class="f13"]/a[2]/@href')[i]
            print(snapshot_url)
            try:
                snapshot_html = down2(snapshot_url)
                like = snapshot_html.xpath('//div[@class="operation-block"]/span[1]/span/text()')[0]
                comment = snapshot_html.xpath('//div[@class="operation-block"]/span[2]/span/text()')[0]
                star = snapshot_html.xpath('//div[@class="operation-block"]/span[3]/span/text()')[0]
                publish_date = snapshot_html.xpath('//div[@class="publish-date"]/span[2]/text()')[0]
                user = snapshot_html.xpath('//span[@class="name-detail"]/text()')[0]
                user_img = snapshot_html.xpath('//div[@class="right-card"]//div[@class="left-img"]/img/@src')[0]
                # The first slide image, if any, is carried in an inline style: url(//...).
                if snapshot_html.xpath('//ul[@class="slide"]/li[1]/span/@style'):
                    detail_img_url1 = snapshot_html.xpath('//ul[@class="slide"]/li[1]/span/@style')[0]
                    print(detail_img_url1)
                    detail_img_url = re.findall(r'url\(//(.*?)\)', detail_img_url1, re.S)[0]
                else:
                    detail_img_url = ""
                content1 = snapshot_html.xpath("//div[@class='left-card']//div[@class='content']")[0]
                content = content1.xpath('string(.)')
                print(title, detail_url, content, user, user_img, publish_date, like, comment, star, "https://" + detail_img_url)
            except Exception:
                # Snapshot layouts vary; skip notes whose page is missing an expected node.
                pass
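The script only prints each record. To keep the scraped fields, a small CSV writer is enough; the sketch below is my own addition, with column names chosen to mirror the print() call above:

import csv

# Hypothetical helper: persist scraped rows (tuples in the print() order above).
def save_rows(rows, path="xiaohongshu_notes.csv"):
    fields = ["title", "detail_url", "content", "user", "user_img",
              "publish_date", "likes", "comments", "stars", "image_url"]
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.writer(f)
        writer.writerow(fields)
        writer.writerows(rows)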