Scraping Weibo personal profile information
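
The script below walks a list of Weibo accounts and scrapes each one's public profile. For every (display name, uid) row in a headerless input CSV it fetches the account's home page with a logged-in cookie, pulls the page_id and domain values that Weibo embeds in the page's inline $CONFIG JavaScript, requests the "more info" page at https://weibo.com/p/{page_id}/info?mod=pedit_more, parses the profile fields out of the embedded HTML fragment with XPath, and writes the results to CSV. The per-account downloads run concurrently on gevent greenlets.

A minimal sketch of the key extraction step, assuming the home page's inline script carries $CONFIG assignments (the snippet and the values in it are made up for illustration):

import re

sample = "$CONFIG['page_id']='1005051549364094';$CONFIG['domain']='100505';"
page_id = re.findall("page_id']='(.*?)';", sample)[0]   # same pattern the script uses below
domain = re.findall("domain']='(.*?)';", sample)[0]
print(page_id, domain)   # -> 1005051549364094 100505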

import gevent
import gevent.monkey

gevent.monkey.patch_all()   # patch the stdlib so blocking I/O (requests) cooperates with gevent
import re
from lxml import etree
import requests
import json
import pandas as pd
import time
import csv

import threading



s = requests.Session()
cookie = "。。。。。"   # paste a logged-in weibo.com cookie string here
# headers for the ajax-style requests (request1)
headers2 = {
    # "Cookie":"SINAGLOBAL=7238757845138.87.1528291392417; 1",
        "Accept":"*/*",
        "Accept-Encoding":"gzip, deflate, br",
        "Accept-Languag":"zh-CN,zh;q=0.9",
    "Connection":"keep-alive",
    "Content-Type":"application/x-www-form-urlencoded",
    "Cookie":cookie,
    "Host":"weibo.com",
    "Referer":"https://weibo.com/u/1549364094?profile_ftype=1&is_all=1",
    "User-Agent":"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "X-Requested-With":"XMLHttpRequest",
}
# headers for ordinary page loads (request2)
headers1 = {
    # "Cookie":"SINAGLOBA",
    #     "Accept": "*/*",
    #     "Accept-Encoding": "gzip, deflate, br",
    #     "Accept-Languag": "zh-CN,zh;q=0.9",
    "Connection":"keep-alive",
    "Cookie":cookie,
    "Host":"weibo.com",
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    "Upgrade-Insecure-Requests":"1",
}

def request1(url):
    # for ajax endpoints that return {"data": "<html ...>"}: parse the embedded HTML
    # (not called in the flow below, kept for reference)
    html = s.get(url, headers=headers2)
    # print(html.text)
    json1 = json.loads(html.text)['data']

    return etree.HTML(json1)

def request2(url):
    # fetch a page with the logged-in session and return the raw HTML text
    html = s.get(url, headers=headers1)
    # print(html.text)
    return html.text


lists_all=[]
# url_id="210926262"
# input: headerless CSV, one row per account: display name, weibo uid
# engine='python' works around "OSError: Initializing from file failed" on this path
content_all=pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\lists2.csv",engine='python',header=None).values.tolist()
print(content_all)

def download(content):
    # content is one [name, uid] row from the input CSV
    try:

        url_id=content[1]
        name=content[0]
        home_url="https://weibo.com/{}?profile_ftype=1&is_all=1#_0".format(url_id)
        print(home_url)
        time.sleep(10)
        a=request2(home_url)
        # print(a)
        content_id = re.findall("page_id']='(.*?)';",a)[0]
        domain_id= re.findall("domain']='(.*?)';",a)[0]
        MyProfileFeed_id=re.findall("Pl_Official_MyProfileFeed__(\d+)",a)[0]
        print(content_id)
        print(domain_id)
        print(MyProfileFeed_id)


        # basic profile info: nickname from the <title> tag, bio from the meta description
        username = re.findall('<title>(.*?)的微博', a)[0]  # re.findall('<h1.*?>(.*?)<',a)
        # username=aa.xpath('//h1/h')
        print(username)
        info = re.findall(',(.*?)的微博主页.*?description', a)[0]
        print(info)
        person_url="https://weibo.com/p/{}/info?mod=pedit_more".format(content_id)
        print(person_url)
        time.sleep(10)
        try:
            if request2(person_url):
                b=request2(person_url)
                info_html=re.findall('domid":"Pl_Official_PersonalInfo__.*?"html":"(.*?)"}',b)[0].strip().replace("\\r","").replace("\\n","").replace("\\","")
                print(info_html)
                info_html=etree.HTML(info_html)

                information = {}
                # each profile field is a <li class="li_1"> whose label sits in a pt_title span
                for i in range(len(info_html.xpath('//span[contains(@class,"pt_title")]'))):
                    bb = info_html.xpath('//span[contains(@class,"pt_title")]/text()')[i].strip()
                    cc = ""   # default, so a failed lookup does not reuse the previous field's value
                    try:
                        if bb == "博客:":          # "Blog:"
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/a/text()'.format(i + 1))[0].strip()

                        elif bb == "个性域名:":    # "Personalized domain:"
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/a/text()'.format(i + 1))[0].strip()

                        elif bb == "标签:":        # "Tags:" can hold several values, keep the whole list
                            cc = info_html.xpath('//a[@node-type="tag"]/text()')
                            print(cc)
                        else:
                            cc = info_html.xpath('//li[contains(@class,"li_1")][{}]/span[2]/text()'.format(i + 1))[0].strip()
                    except:
                        pass
                    information[bb] = cc
                print(information)
                lists_all.append([name, username, info, information])


            with open("lists24.csv", "w", encoding="utf-8", newline="") as f:
                k = csv.writer(f, dialect="excel")
                k.writerow(["名字", "昵称", "info", "简介"])
                for list1 in lists_all:
                    k.writerow(list1)
        except:
            pass

    except:
        # on any failure, dump what has been collected so far;
        # rows contain dicts (unhashable), so dedupe manually instead of using set()
        lists_all_set = []
        for item in lists_all:
            if item not in lists_all_set:
                lists_all_set.append(item)
        with open("lists25.csv", "w", encoding="utf-8", newline="") as f:
            k = csv.writer(f, dialect="excel")
            k.writerow(["名字", "昵称", "info", "简介"])
            for list1 in lists_all_set:
                k.writerow(list1)


# Threaded version (kept for reference)
# if __name__ == "__main__":
#     length=len(content_all)
#     xclist=[[],[],[],[],[],[],[],[],[],[]]
#     N=len(xclist)
#     for i in range(length):
#         xclist[i%N].append(content_all[i])
#
#     for i in range(10):
#
#             for m in range(len(xclist[i])):
#                 t=threading.Thread(target=download,args=(xclist[i][m],))
#                 t.start()


# Coroutine (gevent) version
if __name__ == "__main__":
    length = len(content_all)
    xclist = []
    for i in range(length):
        xclist.append(gevent.spawn(download, content_all[i]))
    print(xclist)

    gevent.joinall(xclist)
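
The coroutine version above spawns one greenlet per account up front, so every download starts at once and pacing relies only on the time.sleep calls inside download(). If you want to cap how many accounts are in flight at the same time, gevent's Pool can replace the plain spawn loop; a minimal sketch, with the pool size of 5 being an arbitrary assumption:

from gevent.pool import Pool

pool = Pool(5)                      # at most 5 download() greenlets running at once
for content in content_all:
    pool.spawn(download, content)
pool.join()                         # block until every greenlet has finished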


