get_all_url

# -*- coding: utf-8 -*-
"""
 @Time    : 2020/10/29 13:47
 @Author  : LinXiao
 @Purpose : Get the total page count for each city and build the url of every listing page
"""
# ------------------------------
# Get the total number of result pages for each city
import io
import sys
import time
from pprint import pprint

import requests
from bs4 import BeautifulSoup
from loguru import logger
from lxml import etree
import random

from redis import Redis
from requests.exceptions import ProxyError


from spider.alifapai_pc import broswer_head, pagination, broswer_head_city_frist
from tools.city_name import city_to_gb2312, hanzi_to_pinyin

# sys.stdout=io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

redis_url=Redis(db=10)

CITY_NAMES=['成都', '西安', '重庆', '武汉', '青岛', '广州', '长沙', '兰州']
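
# Helper contracts assumed from their usage below (they are defined in spider.alifapai_pc and
# tools.city_name, which are not shown here): broswer_head_city_frist() is expected to yield an
# (ip, headers, payload, files, proxies) tuple for the request, pagination() to return the
# "spm=..." query parameter, city_to_gb2312() to percent-encode the city name in GB2312 for the
# `city=` parameter, and hanzi_to_pinyin() to convert the city name to the pinyin string used
# as the Redis list key.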


# Get the total number of pages for one city
def get_page_total(first_page):
    time.sleep(random.random())
    ip, headers, payload, files, proxies=broswer_head_city_frist()
    time.sleep(random.random())
    sleeptime=random.randint(15, 35)
    print(f'Sleeping for a random {sleeptime}s')
    time.sleep(sleeptime)
    print('Requesting the page...')
    try:
        start=time.time()
        response=requests.request("GET", first_page, headers=headers, data=payload, files=files, proxies=proxies,
                                  timeout=40)
        pprint(response.text)
        end=time.time()
        print(f'Page request succeeded in {end - start:.1f}s')
    except Exception as e:
        logger.error(f'Request failed: {e}. Removing the unusable proxy IP.')
        redis_ip=Redis(db=8)
        redis_ip.lrem("proxy_ip", 0, ip)  # remove every entry in the list equal to this IP
        return None

    try:
        # Re-encode the response body as GBK (the site serves GB2312/GBK pages)
        html_content=response.text.encode(response.encoding).decode("gbk")
        # Parse with XPath and read the total number of auction items
        tree_html=etree.HTML(html_content)
        page_total_str=(tree_html.xpath('//*[@id="J_LimitFixed"]/ul/li[1]/em/text()'))[0]
        items_count=int(page_total_str)
        # The site lists 40 items per page
        if items_count <= 40:
            page_total=1
        else:
            page_total=items_count // 40 + 1
        print(f'page_total is: {page_total}')
        return page_total
    except Exception as e:
        print(f'Failed to parse the page count: {e}')
        return None
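
# A minimal retry sketch (illustration only, not in the original module): get_page_total()
# returns None when the request or the parse fails, so a caller may want a few retries, each
# of which pulls a fresh proxy inside get_page_total(). The helper name and the retry count
# below are assumptions.
def get_page_total_with_retry(first_page, max_retries=3):
    for attempt in range(1, max_retries + 1):
        page_total=get_page_total(first_page)
        if page_total is not None:
            return page_total
        print(f'Attempt {attempt} failed, retrying...')
    return None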





# Build the url of every listing page for a city
def get_all_page(page_total, cityname):
    for page_num in range(1, int(page_total) + 1):  # page 32 corresponds to 2020-08-29 (only data from before September is needed)
        parm=pagination()  # e.g. spm=a213w.7398504.pagination.8.6NzcEktGwdiVP0
        # pre_url = "https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.7.14f14cc6QQLvCs&category=50025969&auction_source=0&city=%B3%C9%B6%BC&st_param=-1&auction_start_seg=-1&page=9"
        pre_url="https://sf.taobao.com/item_list.htm?"
        city_pinyin=hanzi_to_pinyin(cityname)
        city_code=city_to_gb2312(cityname)
        suffix=f"&category=50025969&auction_source=0&city={city_code}&st_param=-1&sorder=0&auction_start_seg=-1&page={page_num}"

        url=pre_url + parm + suffix
        print(url)
        redis_url.lpush(str(city_pinyin), url)
    logger.info(f"Fetched and saved all urls for {cityname}")
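
# A minimal consumer sketch (an assumption, not part of the original project): the URLs pushed
# above land in a Redis list in db 10 keyed by the city's pinyin, so a downstream spider could
# drain that list with rpop. The generator below is illustrative only.
def pop_city_urls(city_pinyin):
    while True:
        url=redis_url.rpop(city_pinyin)
        if url is None:
            break  # list is empty
        yield url.decode() if isinstance(url, bytes) else url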


if __name__ == '__main__':
    # first_page="https://sf.taobao.com/item_list.htm?spm=a213w.7398504.pagination.1.6e464cc6ZhiDi4&category=50025969&auction_source=0&city=%B3%C9%B6%BC&sorder=4&st_param=-1&auction_start_seg=-1&page=2"
    # first_page="https://sf.taobao.com/item_list.htm?spm=a213w.7398504.filter.105.501c4cc6MHjcIg&category=50025969&auction_source=0&item_biz_type=6&city=%B3%C9%B6%BC&sorder=1&st_param=-1&auction_start_seg=-1"
    # first_page = "https://sf.taobao.com/item_list.htm?spm=a213w.7398504.filter.104.e3954cc6Ph5abU&category=50025969&auction_source=0&city=%B3%C9%B6%BC&sorder=0&st_param=-1&auction_start_seg=-1"
    # get_page_total(first_page)


    pre_url="https://sf.taobao.com/item_list.htm?"
    for cityname in CITY_NAMES:
        print(cityname)

        city_code=city_to_gb2312(cityname)
        parm=pagination()  # spm=a213w.7398504.pagination.8.6NzcEktGwdiVP0

        suffix=f'&category=50025969&auction_source=0&city={city_code}&sorder=0&st_param=-1&auction_start_seg=-1&page=1'
        url=pre_url + parm + suffix
        print(url)
        page_total=get_page_total(url)   # number of pages of ongoing auctions for this city
        print(f'{cityname} has {page_total} pages of ongoing auction listings')

        # Build the urls of every ongoing listing page for this city
        # get_all_page(page_total, cityname)