#!/usr/bin/python
#coding=UTF-8
import urllib
import re
import sys
import time
import urllib2
import os
import chardet
import time
# Python-2-only hack: re-expose sys.setdefaultencoding (hidden at startup)
# and force the default string encoding to UTF-8 so the mixed
# Chinese/ASCII strings below don't raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")
def getHtml(url):
    """GET `url` and return the raw response body (bytes, Python 2 str).

    Sends a 'Mozilla/5.0' User-Agent — presumably to avoid the site
    blocking urllib2's default agent string (TODO confirm).
    """
    req = urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0')
    html = urllib2.urlopen(req).read()
    return html
def getGoodsURL(url):
    """Return the de-duplicated goods-detail URLs found on one listing page."""
    page = getHtml(url)
    pattern = re.compile(r'<dt class="title"><a href="(.+?)" target')
    # set() removes duplicate hrefs; note this loses document order.
    return list(set(pattern.findall(page)))
def GetMiddleStr(content, startStr, endStr):
    """Return the substring of `content` strictly between `startStr` and `endStr`.

    Raises ValueError (via str.index) when either marker is missing —
    same as the original behavior.

    Bug fixes:
    - `endStr` is now searched *after* the end of `startStr`; the original
      searched from the beginning of `content`, so an earlier occurrence of
      `endStr` produced an empty/garbage slice.
    - Dropped the dead `if startIndex >= 0` guard: str.index never returns
      a negative value (it raises instead), so the check could not fail.
    """
    startIndex = content.index(startStr) + len(startStr)
    endIndex = content.index(endStr, startIndex)
    return content[startIndex:endIndex]
def getDict(goodurl):
    """Fetch a goods-detail page and return the embedded 'charObj' data as a dict.

    The page embeds a JS object literal ("charObj = { ... },\"items\"...");
    we slice from after "charObj = " up to ',"items"' and re-close the brace.

    NOTE(security): eval() on remote page content can execute arbitrary code
    if the site is ever compromised or changes format — json.loads or
    ast.literal_eval would be safer if the payload is valid JSON (verify).
    """
    start_str= "charObj = "
    end_str = ',"items"'
    html = getHtml(goodurl)
    myjson = GetMiddleStr(html,start_str,end_str)+"}"
    mydict = eval(myjson)
    return mydict
def file_edit(wr_str, path=r'D:\pet.txt'):
    """Append `wr_str` to a text file.

    `path` defaults to the original hard-coded Windows target so existing
    callers are unaffected; it is now a parameter for testability/reuse.
    Uses `with` so the handle is closed even if write() raises.
    """
    with open(path, 'a') as f:
        f.write(wr_str)
def getPet(goodurl, pet, level):
    """Return "yes" if the role sold at `goodurl` owns a pet whose name
    contains `pet` at mutation level exactly `level`; otherwise returns
    None implicitly (callers compare against "yes").

    Changes: iterate petList directly instead of by index, and drop the
    unused `xingge` (personality: 0 timid, 1 cautious, 2 loyal, 3 shrewd,
    4 brave) local that was read but never used.
    """
    roledata = getDict(goodurl)
    petList = roledata["petList"]
    if petList:
        for entry in petList:
            petname = entry["petVarLevelExplain"]  # pet name
            petvarlevel = entry["petVarLevel"]     # mutation level
            if (pet in petname) and (petvarlevel == level):
                return "yes"
def goods(url, m, p):
    """Collect all goods URLs from listing pages m through p (inclusive)."""
    collected = []
    for page_no in range(m, p + 1):
        collected.extend(getGoodsURL(url + str(page_no)))
    return collected
if __name__ == "__main__":
    # Search filter baked into the query string: level 110+, Xiaoyao
    # profession (profession=8), price 111-2222 yuan, publicly listed goods.
    url = "http://tl.cyg.changyou.com/goods/public?world_id=0&profession=8&price=111-2222&level=111-2222&level=110-119&have_chosen=profession*8%20price*111-2222%20level*110-119&page_num="
    g_List = goods(url,1,15)
    for i in range(0,len(g_List)):
        # Progress indicator; "\r" keeps it on a single console line.
        sys.stdout.write(str(i)+"/"+str(len(g_List))+"\r")
        sys.stdout.flush()
        # Looking for pet "傲云苍龙" at mutation level exactly 7.
        if getPet(g_List[i],"傲云苍龙",7)=="yes":
            print i,"--",g_List[i]
    print "---end----"
更新一版,相比上一版,执行效率有所提升。在于使用了BeautifulSoup(html, 'lxml')。
这一版,把Python环境升级到了Python 3.6.4。
#coding=utf-8
import urllib
from bs4 import BeautifulSoup
import re
import sys
def get_html(url):
    """GET `url` (with a spoofed 'Mozilla/5.0' User-Agent) and return the
    response body decoded as UTF-8 text."""
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0')
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    return html
def get_goods(url):
    """Return the goods-detail URLs (normally 20) found on one search-result page.

    Fix: removed the dead `soup.prettify()` call — prettify() only *returns*
    a formatted string; calling it without using the result does nothing.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    return [g.a.get('href') for g in soup.find_all('dt', attrs={'class': 'title'})]
def get_all_pages(url):
    """Probe page 1 for the pager and return the URL of every result page.

    Fixes:
    - removed the dead `soup.prettify()` call (its return value was unused);
    - when there are no 'a.num' pager links (single-page result set),
      default to 1 page instead of letting max([]) raise ValueError.
    """
    html = get_html(url + str(1))
    soup = BeautifulSoup(html, 'lxml')
    nums = [int(a.string) for a in soup.find_all('a', attrs={'class': 'num'})]
    max_page = max(nums) if nums else 1  # total number of pages
    return [url + str(i) for i in range(1, max_page + 1)]
def pet_info(good_url):
    """Classify one goods-detail page by its pet inventory.

    Returns "no_pet" when the role owns no pets, "matching" when it owns a
    "傲云苍龙" at mutation level >= 7, and None (implicitly) otherwise —
    callers compare against "matching".

    Bug fix: `end_str` is now searched *after* the start marker; searching
    from the beginning of the page could hit an earlier occurrence and
    slice garbage.
    """
    start_str = "charObj = "
    end_str = ',"items"'
    html = get_html(good_url)
    start = html.index(start_str) + len(start_str)
    end = html.index(end_str, start)
    role_json = html[start:end] + "}"
    # NOTE(security): eval() on remote page content executes arbitrary code
    # if the site is compromised; json.loads/ast.literal_eval is safer if
    # the payload is valid JSON (verify before switching).
    role_dict = eval(role_json)
    petList = role_dict["petList"]
    if petList == []:
        return "no_pet"
    for pet in petList:
        # petVarLevelExplain = pet name, petVarLevel = mutation level.
        if pet["petVarLevelExplain"] == "傲云苍龙" and pet["petVarLevel"] >= 7:
            return "matching"
if __name__ == "__main__":
    # Fixed search filter: profession 8, price 11-2345, level 110-119.
    url = "http://tl.cyg.changyou.com/goods/public?world_id=0&profession=8&price=11-2345&level=110-119&have_chosen=profession*8%20price*11-2345%20level*110-119&page_num="
    pages_list = get_all_pages(url)
    print("共:",len(pages_list),"页")
    for p in range(0,len(pages_list)):
        p_goods = get_goods(pages_list[p])
        print("开始分析第",p+1,"页,共有",len(p_goods),"个商品")
        for g in range(0,len(p_goods)):
            if pet_info(p_goods[g]) == "matching":
                print(g+1,"--match--",p_goods[g])
            else:
                # Progress line, overwritten in place via "\r".
                sys.stdout.write(str(g+1)+"/"+str(len(p_goods))+",NO match"+"\r")
                sys.stdout.flush()
今天又更新一版。
#coding=utf-8
from urllib import request,parse
from bs4 import BeautifulSoup
import re
import sys
import time
def get_html(url):
    """GET `url` (spoofed 'Mozilla/5.0' UA), return the body decoded as UTF-8.

    Sleeps 5 seconds after every request — a crude rate limit, presumably
    so the site does not block the crawler (TODO confirm the site's policy).
    """
    req = request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0')
    response = request.urlopen(req)
    html = response.read().decode('utf-8')
    time.sleep(5)
    return html
def get_goods(url):
    """Return the role-goods detail URLs (normally 20) on one search-result page.

    Fix: removed the dead `soup.prettify()` call — prettify() only *returns*
    a formatted string; calling it for side effects does nothing.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    return [g.a.get('href') for g in soup.find_all('dt', attrs={'class': 'title'})]
def get_all_pages(url):
    """Probe page 1 for the pager and return the URL of every result page.

    Fixes:
    - removed the dead `soup.prettify()` call (its return value was unused);
    - when there are no 'a.num' pager links (single-page result set),
      default to 1 page instead of letting max([]) raise ValueError.
    """
    html = get_html(url + str(1))
    soup = BeautifulSoup(html, 'lxml')
    nums = [int(a.string) for a in soup.find_all('a', attrs={'class': 'num'})]
    max_page = max(nums) if nums else 1  # total number of pages
    return [url + str(i) for i in range(1, max_page + 1)]
def pet_info(good_url):
    """Classify one goods-detail page by its pet inventory.

    Returns "no_pet" when the role owns no pets, "matching" when it owns a
    "傲云苍龙" at mutation level exactly 7, and None (implicitly) otherwise.

    Changes:
    - Bug fix: `end_str` is searched *after* the start marker; searching
      from the beginning of the page could hit an earlier occurrence and
      slice garbage.
    - Dropped the unused `xingge` (personality: 0 timid, 1 cautious,
      2 loyal, 3 shrewd, 4 brave) local that was read but never used.
    """
    start_str = "charObj = "
    end_str = ',"items"'
    html = get_html(good_url)
    start = html.index(start_str) + len(start_str)
    end = html.index(end_str, start)
    role_json = html[start:end] + "}"
    # NOTE(security): eval() on remote page content executes arbitrary code
    # if the site is compromised; json.loads/ast.literal_eval is safer if
    # the payload is valid JSON (verify before switching).
    role_dict = eval(role_json)
    petList = role_dict["petList"]
    if petList == []:
        return "no_pet"
    for pet in petList:
        # petVarLevelExplain = name; petVarLevel = mutation level
        # (0 = baby, 9 = adult per the original comment).
        if pet["petVarLevelExplain"] == "傲云苍龙" and pet["petVarLevel"] == 7:
            return "matching"
if __name__ == "__main__":
    host_url = "http://tl.cyg.changyou.com/goods/public?world_id=0&"
    # Search filters: profession 8, price/level ranges, mind-skill (xinfa),
    # cultivation (xiulian), advanced cultivation, and equipment-score
    # ranges — all encoded as "min-max" strings.
    info_dict = {
        'profession':8,
        'price':'222-2333',
        'level':'110-119',
        'xinfa':'4001-0',
        'xiulian':'10001-0',
        'jinjiexiulian':'3001-0',
        'equipscore':'100001-400000'
    }
    # The site also expects a 'have_chosen' parameter repeating the filters
    # as one percent-encoded string, built from a GBK-encoded copy of the
    # querystring — presumably matching the site's legacy encoding
    # (NOTE(review): confirm GBK is what the server actually expects).
    tt = parse.urlencode(info_dict)
    have_chosen = parse.quote(tt.encode('GBK'))
    info_dict['have_chosen'] = have_chosen
    url = host_url + parse.urlencode(info_dict) +"&page_num="
    pages_list = get_all_pages(url)
    print("共:",len(pages_list),"页")
    for p in range(0,len(pages_list)):
        p_goods = get_goods(pages_list[p])
        print("开始分析第",p+1,"页,共有",len(p_goods),"个商品")
        for g in range(0,len(p_goods)):
            if pet_info(p_goods[g]) == "matching":
                print(g+1,"--match--",p_goods[g])
            else:
                # Progress line, overwritten in place via "\r".
                sys.stdout.write(str(g+1)+"/"+str(len(p_goods))+",NO match"+"\r")
                sys.stdout.flush()