【2018-8-20更新(第3版)】筛选7变胆小傲云苍龙宝宝

本文介绍了一个用于爬取天龙八部游戏中特定商品信息的爬虫程序,通过迭代改进,从原始的正则表达式和urllib库使用,到引入BeautifulSoup进行网页解析,最终实现了更高效稳定的数据抓取。
#!/usr/bin/python
#coding=UTF-8
import urllib
import re
import sys
import time
import urllib2
import os
import chardet
import time
reload(sys)
sys.setdefaultencoding("utf-8")
def getHtml(url):
    # Fetch `url` and return the raw response body (a bytes/str object in
    # Python 2).  A browser-like User-Agent is set, presumably so the site
    # does not reject the default urllib2 agent -- confirm if needed.
    # NOTE(review): no timeout is given, so a hung server blocks the crawl;
    # the response object is never closed explicitly.
    req = urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0')
    html = urllib2.urlopen(req).read()
    return html

def getGoodsURL(url):  # collect the goods detail-page URLs on one listing page
    # Scrape one search-result page and return a de-duplicated list of the
    # goods detail URLs it links to.
    pattern = re.compile(r'<dt class="title"><a href="(.+?)" target')
    matches = pattern.findall(getHtml(url))
    return list(set(matches))

def GetMiddleStr(content, startStr, endStr):  # slice the text between two markers
    # Return the substring of `content` strictly between the first occurrence
    # of `startStr` and the first occurrence of `endStr` that FOLLOWS it.
    # Raises ValueError when either marker is missing.
    #
    # Fixes over the original:
    #  * endStr is now searched from `startIndex` onward; the old
    #    `content.index(endStr)` scanned from position 0, so an endStr
    #    appearing before startStr produced a wrong (often empty) slice.
    #  * the `if startIndex >= 0` guard was dead code -- str.index raises
    #    ValueError instead of returning -1, and it could leave `endIndex`
    #    unbound; the guard is removed.
    startIndex = content.index(startStr) + len(startStr)
    endIndex = content.index(endStr, startIndex)
    return content[startIndex:endIndex]

def getDict(goodurl):# given a goods detail url, return the seller role's data as a dict
    # The detail page embeds a JS object literal of the form
    # 'charObj = {...},"items"...'; the text between the two markers is cut
    # out and the closing brace re-appended to form a dict literal.
    start_str= "charObj = "
    end_str = ',"items"'
    html = getHtml(goodurl)
    myjson = GetMiddleStr(html,start_str,end_str)+"}"
    # SECURITY NOTE(review): eval() on remote page content executes arbitrary
    # code if the site is compromised or changes; json.loads would be safer
    # if the fragment is valid JSON.
    mydict = eval(myjson)
    return mydict

def file_edit(wr_str):  # append a record to the result file
    # Append `wr_str` to D:\pet.txt.
    # Fix: use a `with` block so the file handle is closed even when
    # write() raises (the original leaked the handle on error).
    with open(r'D:\pet.txt', 'a') as f1:
        f1.write(wr_str)

def getPet(goodurl,pet,level):  # does the role at this url own the wanted pet?
    # Return "yes" when the role sold at `goodurl` owns a pet whose name
    # contains `pet` at mutation level `level`; otherwise return None.
    roledata = getDict(goodurl)
    petlist = roledata["petList"]
    if not petlist:
        return None
    for entry in petlist:
        name = entry["petVarLevelExplain"]   # pet name
        var_level = entry["petVarLevel"]     # mutation level
        # temperament: 0 timid, 1 cautious, 2 loyal, 3 shrewd, 4 brave
        temperament = entry["petXingGe"]
        if pet in name and var_level == level:
            return "yes"
def goods(url,m,p):  # gather the goods URLs from listing pages m..p (inclusive)
    collected = []
    for page in range(m, p + 1):
        collected.extend(getGoodsURL(url + str(page)))
    return collected

if __name__ == "__main__":
    #等级110以上,逍遥,价格111--2222元,公示商品
    url = "http://tl.cyg.changyou.com/goods/public?world_id=0&profession=8&price=111-2222&level=110-119&have_chosen=profession*8%20price*111-2222%20level*110-119&page_num="
    g_List = goods(url,1,15)
    for i in range(0,len(g_List)):
        sys.stdout.write(str(i)+"/"+str(len(g_List))+"\r")
        sys.stdout.flush()
        if getPet(g_List[i],"傲云苍龙",7)=="yes":
            print i,"--",g_List[i]
    print "---end----"

更新一版,相比上一版,执行效率有所提升。在于使用了BeautifulSoup(html,'lxml')
这一版,把python环境升级到了python 3.6.4

#coding=utf-8
import urllib
from bs4 import BeautifulSoup
import re
import sys
def get_html(url):  # fetch url and return the decoded HTML
    # Download `url` with a browser-like User-Agent and return the body
    # decoded as UTF-8.
    # Fixes:
    #  * the file only does `import urllib` (top of this script), which does
    #    not guarantee the `urllib.request` submodule is loaded -- import it
    #    explicitly here;
    #  * the response is now closed via a context manager instead of leaking.
    import urllib.request
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0')
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')

def get_goods(url):  # goods detail URLs on one search-result page (up to 20)
    # Parse one listing page and return the href of every goods title link.
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    # Fix: removed the no-op `soup.prettify()` call -- prettify() only
    # returns a formatted string (which was discarded) and mutates nothing.
    goods = soup.find_all('dt', attrs={'class': 'title'})
    return [g.a.get('href') for g in goods]

def get_all_pages(url):  # URL of every search-result page
    # Fetch page 1, read the pager links (class "num") to find the highest
    # page number, and return the full list of page URLs.
    html = get_html(url + str(1))
    soup = BeautifulSoup(html, 'lxml')
    # Fix: removed the no-op `soup.prettify()` call (return value discarded).
    page_numbers = [int(a.string) for a in soup.find_all('a', attrs={'class': 'num'})]
    # Robustness fix: with a single result page the pager may be absent,
    # which made max() raise ValueError on an empty list -- default to 1.
    max_page = max(page_numbers) if page_numbers else 1
    return [url + str(i) for i in range(1, max_page + 1)]

def pet_info(good_url):  # classify the role sold at good_url by its pets
    # Returns "no_pet" when the role owns no pets, "matching" when it owns
    # the wanted pet at mutation level >= 7, None otherwise.
    start_str = "charObj = "
    end_str = ',"items"'
    html = get_html(good_url)
    start = html.index(start_str) + len(start_str)
    # Fix: search for the end marker only AFTER the start marker; the
    # original `html.index(end_str)` scanned from position 0 and an earlier
    # occurrence would have produced a truncated (or empty) fragment.
    end = html.index(end_str, start)
    role_json = html[start:end] + "}"
    # SECURITY NOTE(review): eval() on remote page content executes
    # arbitrary code; json.loads would be safer if the fragment is valid JSON.
    role_dict = eval(role_json)
    pet_list = role_dict["petList"]
    if not pet_list:
        return "no_pet"
    for pet in pet_list:
        petname = pet["petVarLevelExplain"]  # pet name
        petvarlevel = pet["petVarLevel"]     # mutation level
        if petname == "傲云苍龙" and petvarlevel >= 7:
            return "matching"

if __name__ == "__main__":
    url = "http://tl.cyg.changyou.com/goods/public?world_id=0&profession=8&price=11-2345&level=110-119&have_chosen=profession*8%20price*11-2345%20level*110-119&page_num="
    pages_list = get_all_pages(url)
    print("共:",len(pages_list),"页")
    for p in range(0,len(pages_list)):
        p_goods = get_goods(pages_list[p])
        print("开始分析第",p+1,"页,共有",len(p_goods),"个商品")
        for g in range(0,len(p_goods)):
            if pet_info(p_goods[g]) == "matching":
                print(g+1,"--match--",p_goods[g])
            else:
                sys.stdout.write(str(g+1)+"/"+str(len(p_goods))+",NO match"+"\r")
                sys.stdout.flush()

今天又更新一版。

#coding=utf-8
from urllib import request,parse
from bs4 import BeautifulSoup
import re
import sys
import time

def get_html(url):  # fetch url and return the decoded HTML
    # Download `url` with a browser-like User-Agent, return the body decoded
    # as UTF-8, then sleep 5s as crude rate limiting between requests.
    # Fix: the response object is now closed via a context manager instead
    # of being leaked.
    req = request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0')
    with request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    time.sleep(5)  # be polite to the server
    return html

def get_goods(url):  # goods detail URLs on one search-result page (up to 20)
    # Parse one listing page and return the href of every goods title link.
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    # Fix: removed the no-op `soup.prettify()` call -- prettify() only
    # returns a formatted string (which was discarded) and mutates nothing.
    goods = soup.find_all('dt', attrs={'class': 'title'})
    return [g.a.get('href') for g in goods]

def get_all_pages(url):  # URL of every search-result page
    # Fetch page 1, read the pager links (class "num") to find the highest
    # page number, and return the full list of page URLs.
    html = get_html(url + str(1))
    soup = BeautifulSoup(html, 'lxml')
    # Fix: removed the no-op `soup.prettify()` call (return value discarded).
    page_numbers = [int(a.string) for a in soup.find_all('a', attrs={'class': 'num'})]
    # Robustness fix: with a single result page the pager may be absent,
    # which made max() raise ValueError on an empty list -- default to 1.
    max_page = max(page_numbers) if page_numbers else 1
    return [url + str(i) for i in range(1, max_page + 1)]

def pet_info(good_url):  # classify the role sold at good_url by its pets
    # Returns "no_pet" when the role owns no pets, "matching" when it owns
    # the wanted pet at mutation level exactly 7, None otherwise.
    start_str = "charObj = "
    end_str = ',"items"'
    html = get_html(good_url)
    start = html.index(start_str) + len(start_str)
    # Fix: search for the end marker only AFTER the start marker; the
    # original `html.index(end_str)` scanned from position 0 and an earlier
    # occurrence would have produced a truncated (or empty) fragment.
    end = html.index(end_str, start)
    role_json = html[start:end] + "}"
    # SECURITY NOTE(review): eval() on remote page content executes
    # arbitrary code; json.loads would be safer if the fragment is valid JSON.
    role_dict = eval(role_json)
    pet_list = role_dict["petList"]
    if not pet_list:
        return "no_pet"
    # each entry also carries petXingGe (temperament: 0 timid, 1 cautious,
    # 2 loyal, 3 shrewd, 4 brave) -- read but unused in the original, so the
    # dead local was dropped here
    for pet in pet_list:
        petname = pet["petVarLevelExplain"]  # pet name
        petvarlevel = pet["petVarLevel"]     # mutation level (0 baby .. 9 adult)
        if petname == "傲云苍龙" and petvarlevel == 7:
            return "matching"

if __name__ == "__main__":
    host_url = "http://tl.cyg.changyou.com/goods/public?world_id=0&"
    info_dict = {
        'profession':8,
        'price':'222-2333',
        'level':'110-119',
        'xinfa':'4001-0',
        'xiulian':'10001-0',
        'jinjiexiulian':'3001-0',
        'equipscore':'100001-400000'
    }
    tt = parse.urlencode(info_dict)
    have_chosen = parse.quote(tt.encode('GBK'))
    info_dict['have_chosen'] = have_chosen
    url = host_url + parse.urlencode(info_dict) +"&page_num="
    pages_list = get_all_pages(url)
    print("共:",len(pages_list),"页")
    for p in range(0,len(pages_list)):
        p_goods = get_goods(pages_list[p])
        print("开始分析第",p+1,"页,共有",len(p_goods),"个商品")
        for g in range(0,len(p_goods)):
            if pet_info(p_goods[g]) == "matching":
                print(g+1,"--match--",p_goods[g])
            else:
                sys.stdout.write(str(g+1)+"/"+str(len(p_goods))+",NO match"+"\r")
                sys.stdout.flush()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

zhizunyu2009

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值