get_all_url

# -*- coding: utf-8 -*-
"""
 @Time    : 2020/10/29 13:47
 @Author  : LinXiao
 @Purpose : Get the total page count for each city and build the url of every listing page
"""
# ------------------------------
# Get the total number of result pages for each city
import io
import sys
import time
from pprint import pprint

import requests
from bs4 import BeautifulSoup
from loguru import logger
from lxml import etree
import random

from redis import Redis
from requests.exceptions import ProxyError


from spider.alifapai_pc import broswer_head, pagination, broswer_head_city_frist
from tools.city_name import city_to_gb2312, hanzi_to_pinyin

# sys.stdout=io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

redis_url=Redis(db=10)

CITY_NAMES=['成都', '西安', '重庆', '武汉', '青岛', '广州', '长沙', '兰州']
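
# Helper contracts assumed from their usage below (they are defined in spider.alifapai_pc and
# tools.city_name, which are not shown here): broswer_head_city_frist() is expected to yield an
# (ip, headers, payload, files, proxies) tuple for the request, pagination() to return the
# "spm=..." query parameter, city_to_gb2312() to percent-encode the city name in GB2312 for the
# `city=` parameter, and hanzi_to_pinyin() to convert the city name to the pinyin string used
# as the Redis list key.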


# Get the total number of pages for one city
def get_page_total(first_page):
    time.sleep(random.random())
    ip, headers, payload, files, proxies=broswer_head_city_frist()
    time.sleep(random.random())
    sleeptime=random.randint(15, 35)
    print(f'Sleeping for a random {sleeptime}s')
    time.sleep(sleeptime)
    print('Requesting the page...')
    try:
        start=time.time()
        response=requests.request("GET", first_page, headers=headers, data=payload, files=files, proxies=proxies,
                                  timeout=40)
        pprint(response.text)
        end=time.time()
        print(f'Page request succeeded in {end - start:.1f}s')
    except Exception as e:
        logger.error(f'Request failed: {e}. Removing the unusable proxy IP.')
        redis_ip=Redis(db=8)
        redis_ip.lrem("proxy_ip", 0, ip)  # remove every entry in the list equal to this IP
        return None

    try:
        # Re-encode the response body as GBK (the site serves GB2312/GBK pages)
        html_content=response.text.encode(response.encoding).decode("gbk")
        # Parse with XPath and read the total number of auction items
        tree_html=etree.HTML(html_content)
        page_total_str=(tree_html.xpath('//*[@id="J_LimitFixed"]/ul/li[1]/em/text()'))[0]
        items_count=int(page_total_str)
        # The site lists 40 items per page
        if items_count <= 40:
            page_total=1
        else:
            page_total=items_count // 40 + 1
        print(f'page_total is: {page_total}')
        return page_total
    except Exception as e:
        print(f'Failed to parse the page count: {e}')
        return None
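
# A minimal retry sketch (illustration only, not in the original module): get_page_total()
# returns None when the request or the parse fails, so a caller may want a few retries, each
# of which pulls a fresh proxy inside get_page_total(). The helper name and the retry count
# below are assumptions.
def get_page_total_with_retry(first_page, max_retries=3):
    for attempt in range(1, max_retries + 1):
        page_total=get_page_total(first_page)
        if page_total is not None:
            return page_total
        print(f'Attempt {attempt} failed, retrying...')
    return None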





# Build the url of every listing page for a city
def get_all_page(page_total, cityname):
    for page_num in range(1, int(page_total) + 1):  # page 32 corresponds to 2020-08-29 (only data from before September is needed)
        parm=pagination()  # e.g. spm=a213w.7398504.pagination.8.6NzcEktGwdiVP0
        # pre_url = "https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.7.14f14cc6QQLvCs&category=50025969&auction_source=0&city=%B3%C9%B6%BC&st_param=-1&auction_start_seg=-1&page=9"
        pre_url="https://sf.taobao.com/item_list.htm?"
        city_pinyin=hanzi_to_pinyin(cityname)
        city_code=city_to_gb2312(cityname)
        suffix=f"&category=50025969&auction_source=0&city={city_code}&st_param=-1&sorder=0&auction_start_seg=-1&page={page_num}"

        url=pre_url + parm + suffix
        print(url)
        redis_url.lpush(str(city_pinyin), url)
    logger.info(f"Fetched and saved all urls for {cityname}")
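
# A minimal consumer sketch (an assumption, not part of the original project): the URLs pushed
# above land in a Redis list in db 10 keyed by the city's pinyin, so a downstream spider could
# drain that list with rpop. The generator below is illustrative only.
def pop_city_urls(city_pinyin):
    while True:
        url=redis_url.rpop(city_pinyin)
        if url is None:
            break  # list is empty
        yield url.decode() if isinstance(url, bytes) else url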


if __name__ == '__main__':
    # first_page="https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.1.6e464cc6ZhiDi4&category=50025969&auction_source=0&city=%B3%C9%B6%BC&sorder=4&st_param=-1&auction_start_seg=-1&page=2"
    # first_page="https://sf.taobao.com/item_list.htm?spm=a213w.7398504.filter.105.501c4cc6MHjcIg&category=50025969&auction_source=0&item_biz_type=6&city=%B3%C9%B6%BC&sorder=1&st_param=-1&auction_start_seg=-1"
    # first_page = "https://sf.taobao.com/item_list.htm?spm=a213w.7398504.filter.104.e3954cc6Ph5abU&category=50025969&auction_source=0&city=%B3%C9%B6%BC&sorder=0&st_param=-1&auction_start_seg=-1"
    # get_page_total(first_page)


    pre_url="https://sf.taobao.com/item_list.htm?"
    for cityname in CITY_NAMES:
        print(cityname)

        city_code=city_to_gb2312(cityname)
        parm=pagination()  # spm=a213w.7398504.pagination.8.6NzcEktGwdiVP0

        suffix=f'&category=50025969&auction_source=0&city={city_code}&sorder=0&st_param=-1&auction_start_seg=-1&page=1'
        url=pre_url + parm + suffix
        print(url)
        page_total=get_page_total(url)   # number of pages of ongoing auctions for this city
        print(f'{cityname} has {page_total} pages of ongoing auction listings')

        # Build the urls of every ongoing listing page for this city
        # get_all_page(page_total, cityname)