拍拍产品序列
代码如下:
# coding=gbk
# Fetch the product number (and title) of every product listed in a Paipai shop.
import urllib.request
import re
# Accumulates one "<number>,<title>" record per product across all pages.
all_data = []
# Directory the result file is written to.
RootPath = 'd:/pydir/'
# Base URL of the shop.
# NOTE(review): the '*' looks like a placeholder for the shop's subdomain --
# confirm and replace it before running.
ourl = 'http://*.paipai.com/'

# The first six links on each page are recommended products, not shop items.
RECOMMENDED_COUNT = 6

# Matches one auction link (case-insensitively):
#   group 1 - the path after the host (the product number),
#   group 2 - the rest of the opening tag,
#   group 3 - the link text (the product title).
# Group 2 uses [^>]* rather than .* so a match cannot run past the end of its
# own tag and pair one link's number with the text of a later link.
PRODUCT_LINK = re.compile(
    r'<a href="http://auction1\.paipai\.com/([^"]+)([^>]*)>([^<>]+?)</a>',
    re.I,
)


def page_url(page):
    """Return the listing URL of shop page number *page*."""
    # Do not append suffixes such as "?keyword=" or "#item": with them the
    # data read back from the site is wrong.
    return ourl + '0000000000-' + str(page) + '-1/shop.html'


def fetch_page(url):
    """Download *url* and return its body decoded from GBK."""
    with urllib.request.urlopen(url) as response:
        raw = response.read()
    # Decode to str: a str regex cannot be applied to a bytes object, and the
    # text is garbled unless it is decoded as GBK.
    return raw.decode('GBK')


def find_links(html):
    """Return the (number, tag_rest, title) tuples of all auction links."""
    return PRODUCT_LINK.findall(html)


def to_records(links):
    """Return "<number>,<title>" strings, skipping the recommended links."""
    return [
        number + ',' + title
        for number, _tag_rest, title in links[RECOMMENDED_COUNT:]
    ]


def save_products(records, path):
    """Write *records* to *path*, one per line, replacing any existing file."""
    # The titles were decoded from GBK, so GBK can always encode them; naming
    # the encoding keeps the output independent of the machine's locale.
    with open(path, 'w', encoding='gbk') as out:
        out.write('\n'.join(records))


def main():
    """Scrape shop pages 1-10 and write every product record to paipai.txt."""
    # Start from an empty list so a repeated call does not duplicate records.
    all_data.clear()
    for page in range(1, 11):
        print(page)
        url = page_url(page)
        print(url)
        links = find_links(fetch_page(url))
        print(links)
        records = to_records(links)
        all_data.extend(records)
        print(records)
    save_products(all_data, RootPath + 'paipai.txt')


if __name__ == '__main__':
    main()