明显看到page=1
是控制具体页数的,得到每页的信息;
但是动态加载数据的,没法直接抓网页
采用selenium + 枚举id的方式,枚举所有用户详情页(不得已而为之)
最后根据是否售出存到对应sheet中
import requests
from selenium import webdriver
import time
import xlsxwriter
from selenium.webdriver.common.by import By
lists_name = []  # nicknames of every sold account, in scrape order
dict_name = {}   # membership dict: nickname -> 1 for every sold account


def init1():
    """Collect the nicknames of all sold accounts from the paginated
    "selled" listing (43 pages).

    Side effects:
        * appends every nickname to the module-level ``lists_name``
        * marks every nickname in the module-level ``dict_name``
        * writes the nicknames to ``name.xlsx``, one per row in column A
    """
    # ``page=`` selects the listing page.  The list is rendered by JS, so
    # plain HTTP scraping of the HTML does not work; selenium is used.
    urls = [u'https://fukuaxiaotuandui.com/#/account/selled?clasId=&page={}&server_name=&profession=&yusuanId=&sexId=&levelId=&pingfenId=&sort=pingfen%20DESC&keywords=&min_price=&max_price='.format(str(i)) for i in range(1, 44)]
    global lists_name
    global dict_name
    # Reuse one browser for all pages instead of launching a fresh Chrome
    # process per URL; try/finally guarantees it is closed even on error.
    browser = webdriver.Chrome()
    browser.implicitly_wait(1)  # allow 1s for the JS-rendered list
    try:
        for url in urls:
            try:
                browser.get(url)
                for user in browser.find_elements(By.CLASS_NAME, 'list-cname'):
                    # Entry text carries a fixed 4-character prefix
                    # (presumably a label) — keep only the nickname part.
                    lists_name.append(user.text[4:])
            except Exception:
                continue  # skip pages that fail to load or render
    finally:
        browser.quit()
    # Persist the nicknames to name.xlsx (one per row, column A) and
    # record them in dict_name for the later sold/unsold split.
    work1 = xlsxwriter.Workbook('name.xlsx')
    solved_name = work1.add_worksheet()
    for row1, name in enumerate(lists_name):
        print(name)
        dict_name[name] = 1
        solved_name.write(row1, 0, name)
    work1.close()
# Root XPath of the detail-page info panel.
_DETAIL_ROOT = '//*[@id="app"]/div[1]/div[2]/div[2]/div'

# (XPath, text-reduction rule) for each of the 8 columns, in sheet order:
# 昵称, 大区, 职业, 性别, 总评, 衣品, 价格, 交易时间.
_FIELD_SPECS = [
    (_DETAIL_ROOT + '/div[1]', lambda t: t.split(':')[1]),
    (_DETAIL_ROOT + '/div[2]', lambda t: t.split(':')[1]),
    (_DETAIL_ROOT + '/div[3]/div[1]', lambda t: t.split(':')[1]),
    (_DETAIL_ROOT + '/div[3]/div[2]', lambda t: t.split(':')[1]),
    (_DETAIL_ROOT + '/div[4]/div[1]', lambda t: t.split(':')[1]),
    (_DETAIL_ROOT + '/div[4]/div[2]', lambda t: t.split('衣')[0]),
    (_DETAIL_ROOT + '/div[5]', lambda t: t.split(':')[1]),
    (_DETAIL_ROOT + '/div[6]', lambda t: t[0:-2]),
]


def _scrape_detail(browser):
    """Extract the 8 detail-page field values from the currently-loaded
    page, or return None if any field is missing (invalid id / empty page)."""
    fields = []
    for xpath, extract in _FIELD_SPECS:
        try:
            fields.append(extract(browser.find_element(By.XPATH, xpath).text))
        except Exception:
            return None
    return fields


def solve():
    """Enumerate account detail pages by id and split the records into two
    sheets of ``result1.xlsx``.

    Sheet 1 receives accounts whose nickname appears in the module-level
    ``dict_name`` (filled by ``init1``, i.e. already sold); sheet 2 the
    rest.  Side effect: writes ``result1.xlsx``.
    """
    # The site's API responses could not be captured, so detail pages are
    # enumerated by id directly; many ids are invalid and are skipped.
    # (Ids in roughly 2257..4789 were found to be extremely sparse, hence
    # the enumeration starts at 4839.)
    urls = [u'https://fukuaxiaotuandui.com/#/account/detail?id={}'.format(str(i)) for i in range(4839, 8250)]
    workbook = xlsxwriter.Workbook('result1.xlsx')
    # Sheet for sold accounts (8 columns, last one is the sale time).
    worksheet = workbook.add_worksheet()
    sold_titles = ["昵称", "大区", "职业", "性别", "总评", "衣品", "价格", "交易时间"]
    for col, title in enumerate(sold_titles):
        worksheet.write(0, col, title)
    # Sheet for unsold accounts (same columns minus the sale time).
    worksheet_unsolved = workbook.add_worksheet()
    for col, title in enumerate(sold_titles[:7]):
        worksheet_unsolved.write(0, col, title)
    row = 1
    row_unsolved = 1
    # Reuse one browser for the whole enumeration instead of launching a
    # fresh Chrome per id; try/finally guarantees cleanup.
    browser = webdriver.Chrome()
    browser.implicitly_wait(1)  # give the JS-rendered page 1s to load
    try:
        for url in urls:
            try:
                browser.get(url)
            except Exception:
                continue  # unreachable page; skip this id
            fields = _scrape_detail(browser)
            if fields is None:
                continue  # invalid id: at least one field is missing
            if fields[0] in dict_name:  # nickname seen in the sold listing
                for col, value in enumerate(fields):
                    worksheet.write(row, col, value)
                row += 1
            else:
                for col, value in enumerate(fields[:7]):
                    worksheet_unsolved.write(row_unsolved, col, value)
                row_unsolved += 1
    finally:
        browser.quit()
    workbook.close()
# NOTE(review): this file contains two pasted scripts; a second
# `if __name__ == '__main__'` guard appears further down, so when run as a
# script this guard executes first (with the first `solve` definition) and
# the later guard executes afterwards with the redefined `solve`.
if __name__ == '__main__' :
    init1()
    solve()
这个很简单,没有js动态加载,完全能通过接口收到所有想要的角色信息,直接访问用户详情页都省了。
直接解析返回数据json:
import requests
import xlsxwriter
# Browser-like User-Agent so the API accepts the request.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}


def _user_record(user):
    """Build the spreadsheet row for one API record.

    Returns the 7 common columns (name, genre, sex, price, tag, title,
    score) plus ``soldTime`` as an 8th element when the record carries one
    (i.e. the account has been sold).
    """
    record = [
        user["name"],
        user["genreName"],
        "男" if user["sex"] == 0 else "女",  # sex is encoded as 0 / non-0
        user["price"],
        user["tag"],
        user["title"],
        user["score"],
    ]
    if "soldTime" in user:
        record.append(user["soldTime"])
    return record


def solve():
    """Fetch all account records from the search API page by page and
    write them to ``result_czd.xlsx``.

    Sheet 1 holds sold accounts (records carrying ``soldTime``); sheet 2
    holds unsold ones.  Side effect: writes ``result_czd.xlsx``.
    """
    # The API returns plain JSON, so no browser automation is needed;
    # ``pageNum`` selects the page, 20 records per page.
    urls = [u'https://app.cuizhidao.com/api/accounts/search?tag=&pageSize=20&pageNum={}&beginPrice=&endPrice='.format(str(i)) for i in range(1, 80)]
    workbook = xlsxwriter.Workbook('result_czd.xlsx')
    # Sheet for sold accounts (8 columns, last one is the sale time).
    worksheet = workbook.add_worksheet()
    titles = ["昵称", "职业", "性别", "价格", "描述", "大区", "得分"]
    for col, title in enumerate(titles + ["交易时间"]):
        worksheet.write(0, col, title)
    # Sheet for unsold accounts (same columns minus the sale time).
    worksheet_unsolved = workbook.add_worksheet()
    for col, title in enumerate(titles):
        worksheet_unsolved.write(0, col, title)
    row = 1
    row_unsolved = 1
    for url in urls:
        try:
            # Timeout so one stalled request cannot hang the whole run;
            # a failed download/parse skips just this page.
            json1 = requests.get(url, headers=header, timeout=10).json()
        except Exception:
            continue
        lists_user = json1.get("data", {}).get("list") or []
        for user in lists_user:
            record = _user_record(user)
            if len(record) == 8:  # has soldTime -> sold sheet
                for col, value in enumerate(record):
                    worksheet.write(row, col, value)
                row += 1
            else:
                for col, value in enumerate(record):
                    worksheet_unsolved.write(row_unsolved, col, value)
                row_unsolved += 1
    workbook.close()
# Entry point for the second (API-based) scraper; note an earlier
# `if __name__ == '__main__'` guard in this file also runs when the file
# is executed as one script.
if __name__ == '__main__' :
    solve()