Python crawler: scraping user detail pages with Selenium + XPath, storing results in Excel by category

Link to the crawled site

You can see that page=1 controls the page number, so each listing page can be addressed directly;
but the data is loaded dynamically, so the raw pages can't be scraped as-is.
The workaround: drive the site with Selenium and enumerate user ids to visit every detail page (a last resort).
Finally, each record is written to the matching sheet depending on whether the account has been sold.
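
A quick check confirms that plain HTTP is hopeless here: everything after the # in these URLs is a client-side route, so the server never even sees the page parameter, and the response body is just the single-page-app shell. A minimal probe (a sketch; it assumes the listing markup only exists after JavaScript runs, which is what the Selenium code below relies on):

import requests

html = requests.get('https://fukuaxiaotuandui.com/').text
# The 'list-cname' class the scraper targets is rendered client-side,
# so it should not appear anywhere in the raw HTML.
print('list-cname' in html)  # expected: False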

from selenium import webdriver
from selenium.webdriver.common.by import By
import xlsxwriter

lists_name = []  # nicknames of all sold accounts
dict_name = {}   # lookup dict of sold-account nicknames

# Scrape the nicknames of every sold character
def init1():
    urls = [u'https://fukuaxiaotuandui.com/#/account/selled?clasId=&page={}&server_name=&profession=&yusuanId=&sexId=&levelId=&pingfenId=&sort=pingfen%20DESC&keywords=&min_price=&max_price='.format(str(i)) for i in range(1, 44)]
    # urls = ['https://fukuaxiaotuandui.com/#/account/selled?clasId=&page=1&server_name=&profession=&yusuanId=&sexId=&levelId=&pingfenId=&sort=pingfen%20DESC&keywords=&min_price=&max_price=', 'https://fukuaxiaotuandui.com/#/account/selled?clasId=&page=2&server_name=&profession=&yusuanId=&sexId=&levelId=&pingfenId=&sort=pingfen%20DESC&keywords=&min_price=&max_price=']
    global lists_name
    global dict_name
    for url in urls:

        browser = webdriver.Chrome()
        browser.implicitly_wait(1)
        try:
            browser.get(url)
            # Each sold listing exposes the nickname in a 'list-cname' node;
            # the first four characters are the field label, so drop them.
            for user in browser.find_elements(By.CLASS_NAME, 'list-cname'):
                lists_name.append(user.text[4:])
        except Exception:
            continue
        finally:
            browser.quit()  # close the browser either way
    # Write the sold nicknames to name.xlsx and build the lookup dict
    work1 = xlsxwriter.Workbook('name.xlsx')
    solved_name = work1.add_worksheet()
    for row1, name in enumerate(lists_name):
        print(name)
        dict_name[name] = 1
        solved_name.write(row1, 0, name)
    work1.close()

def solve():

#   While enumerating we found this id range to be extremely sparse, which
#   wasted a lot of time, so these ids were special-cased:
#   unspider = ['2257', '2260', '2261', '2324', '2325', '2327', '2328', '2329', '2331', '2453', '2464', '2467', '2468', '2475', '3078',
# '3181', '3199', '3570', '3951', '3956', '3963', '4149', '4175', '4203', '4292', '4295', '4297', '4333', '4337', '4348',
# '4360', '4400', '4554', '4656', '4789']

    # We can't get at the responses behind this site's network requests, so we enumerate ids and open each detail page directly; most ids hit invalid pages, hence all the exception handling below
    urls = [u'https://fukuaxiaotuandui.com/#/account/detail?id={}'.format(str(i)) for i in range(4839, 8250)]
    # urls = ['https://fukuaxiaotuandui.com/#/account/detail?id=1', 'https://fukuaxiaotuandui.com/#/account/detail?id=6619', 'https://fukuaxiaotuandui.com/#/account/detail?id=4839', 'https://fukuaxiaotuandui.com/#/account/detail?id=7283']
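    # With the sparse-id list above uncommented, those ids can be skipped
    # up front (a sketch; assumes id is the last query parameter in the URL):
    # urls = [u for u in urls if u.rsplit('id=', 1)[1] not in unspider]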
    # Create the Excel workbook
    workbook = xlsxwriter.Workbook('result1.xlsx')
    # Sheet for sold accounts
    worksheet = workbook.add_worksheet()
    worksheet.write(0, 0, "昵称")
    worksheet.write(0, 1, "大区")
    worksheet.write(0, 2, "职业")
    worksheet.write(0, 3, "性别")
    worksheet.write(0, 4, "总评")
    worksheet.write(0, 5, "衣品")
    worksheet.write(0, 6, "价格")
    worksheet.write(0, 7, "交易时间")
    row = 1

    # Sheet for unsold accounts
    worksheet_unsolved = workbook.add_worksheet()
    worksheet_unsolved.write(0, 0, "昵称")
    worksheet_unsolved.write(0, 1, "大区")
    worksheet_unsolved.write(0, 2, "职业")
    worksheet_unsolved.write(0, 3, "性别")
    worksheet_unsolved.write(0, 4, "总评")
    worksheet_unsolved.write(0, 5, "衣品")
    worksheet_unsolved.write(0, 6, "价格")
    row_unsolved = 1

    # Scrape each user's detail page
    for url in urls:

        browser = webdriver.Chrome()
        browser.implicitly_wait(1)  # give elements up to 1s to load
        browser.get(url)
        lists = [' '] * 8
        # Every field sits at a fixed XPath on the detail page. A missing
        # field means this id is not a real account, so close the browser
        # and move on to the next id.
        fields = [
            ('//*[@id="app"]/div[1]/div[2]/div[2]/div/div[1]', lambda t: t.split('：')[1]),         # nickname
            ('//*[@id="app"]/div[1]/div[2]/div[2]/div/div[2]', lambda t: t.split('：')[1]),         # server
            ('//*[@id="app"]/div[1]/div[2]/div[2]/div/div[3]/div[1]', lambda t: t.split('：')[1]),  # class
            ('//*[@id="app"]/div[1]/div[2]/div[2]/div/div[3]/div[2]', lambda t: t.split('：')[1]),  # gender
            ('//*[@id="app"]/div[1]/div[2]/div[2]/div/div[4]/div[1]', lambda t: t.split('：')[1]),  # overall rating
            ('//*[@id="app"]/div[1]/div[2]/div[2]/div/div[4]/div[2]', lambda t: t.split('衣')[0]),  # outfit rating
            ('//*[@id="app"]/div[1]/div[2]/div[2]/div/div[5]', lambda t: t.split('：')[1]),         # price
            ('//*[@id="app"]/div[1]/div[2]/div[2]/div/div[6]', lambda t: t[:-2]),                   # sale time
        ]
        failed = False
        for i, (xpath, extract) in enumerate(fields):
            try:
                lists[i] = extract(browser.find_element(By.XPATH, xpath).text)
            except Exception:
                failed = True
                break
        if failed:
            browser.quit()
            continue


        
        # print(lists)  # debug: nickname / server / class / gender / overall / outfit / price / sale time
        if lists[0] in dict_name:   # nickname is in the sold set
            for i in range(8):
                worksheet.write(row, i, lists[i])
            row += 1
        else:
            for i in range(7):
                worksheet_unsolved.write(row_unsolved, i, lists[i])
            row_unsolved += 1
        browser.quit()  # close the browser


    workbook.close()

if __name__ == '__main__' :
    init1()
    solve()
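
Two easy wins for the Selenium half, sketched below: reuse a single Chrome instance for the whole crawl instead of launching a fresh browser per URL, and swap the 1-second implicit wait for an explicit wait, so slow pages don't come back empty and valid ids aren't skipped. This is a sketch of the pattern against the same urls list, not a drop-in rewrite:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()        # one instance for the whole crawl
wait = WebDriverWait(browser, 10)   # up to 10s, returns as soon as it's there
try:
    for url in urls:
        browser.get(url)
        try:
            # Block until the first detail field actually renders
            wait.until(EC.presence_of_element_located(
                (By.XPATH, '//*[@id="app"]/div[1]/div[2]/div[2]/div/div[1]')))
        except Exception:
            continue  # invalid id: the detail block never appears
        # ... extract and store the fields exactly as in solve() ...
finally:
    browser.quit()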

Link to the second crawled site

This one is easy: there's no JS-rendered obstacle, and the API endpoint returns every character field we want, so we can skip the detail pages entirely and parse the JSON directly.
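For reference, the parsing code below implies each element of data["list"] is shaped roughly like this. The field names come straight from the code; the surrounding structure, comments, and placeholder values are assumptions:

# Hypothetical sketch of one element of data["list"] (values are placeholders)
{
    "name": "...",        # nickname
    "genreName": "...",   # class
    "sex": 0,             # 0 = male, anything else female (per the code below)
    "price": 0,
    "tag": "...",         # description
    "title": "...",       # server/region
    "score": 0,
    "soldTime": "...",    # present only for sold accounts
}

With that shape in mind, the full script: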

import requests
import xlsxwriter

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

def solve():

    urls = [u'https://app.cuizhidao.com/api/accounts/search?tag=&pageSize=20&pageNum={}&beginPrice=&endPrice='.format(str(i)) for i in range(1, 80)]
    workbook = xlsxwriter.Workbook('result_czd.xlsx')
    worksheet = workbook.add_worksheet()
    worksheet.write(0, 0, "昵称")
    worksheet.write(0, 1, "职业")
    worksheet.write(0, 2, "性别")
    worksheet.write(0, 3, "价格")
    worksheet.write(0, 4, "描述")
    worksheet.write(0, 5, "大区")
    worksheet.write(0, 6, "得分")
    worksheet.write(0, 7, "交易时间")
    row = 1

    worksheet_unsolved = workbook.add_worksheet()
    worksheet_unsolved.write(0, 0, "昵称")
    worksheet_unsolved.write(0, 1, "职业")
    worksheet_unsolved.write(0, 2, "性别")
    worksheet_unsolved.write(0, 3, "价格")
    worksheet_unsolved.write(0, 4, "描述")
    worksheet_unsolved.write(0, 5, "大区")
    worksheet_unsolved.write(0, 6, "得分")

    row_unsolved = 1

    for url in urls:

        json1 = requests.get(url, headers=header).json()
        # print(json1)
        lists_user = json1["data"]["list"]

        for user in lists_user:
            lists = [' '] * 8
            lists[0] = user["name"]
            lists[1] = user["genreName"]
            lists[2] = ("男" if user["sex"] == 0 else "女")
            lists[3] = user["price"]
            lists[4] = user["tag"]
            lists[5] = user["title"]
            lists[6] = user["score"]
            if "soldTime" in user:
                lists[7] = user["soldTime"]
        
            if lists[7] == ' ':   # no soldTime recorded: still unsold
                for i in range(7):
                    worksheet_unsolved.write(row_unsolved, i, lists[i])
                row_unsolved += 1
            else:
                for i in range(8):
                    worksheet.write(row, i, lists[i])
                row += 1

    workbook.close()

if __name__ == '__main__' :
    solve()
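
One robustness note: the hard-coded range(1, 80) silently misses anything past page 79. A hedged alternative that pages until the API runs dry, assuming the endpoint keeps answering with an empty "list" past the last page (an assumption, not verified against the API):

import requests

header = {'User-Agent': 'Mozilla/5.0'}
page = 1
while True:
    url = ('https://app.cuizhidao.com/api/accounts/search'
           '?tag=&pageSize=20&pageNum={}&beginPrice=&endPrice='.format(page))
    resp = requests.get(url, headers=header, timeout=10)
    resp.raise_for_status()                 # fail loudly on HTTP errors
    users = resp.json().get("data", {}).get("list", [])
    if not users:
        break                               # past the last page
    for user in users:
        pass                                # same per-user handling as above
    page += 1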
