Python 抓取淘宝关键字信息

最新推荐文章于 2025-07-03 10:48:27 发布

原创最新推荐文章于 2025-07-03 10:48:27 发布 · 2.1k 阅读

3 ·

CC 4.0 BY-SA版权

文章标签：

#python #batch

Python 专栏收录该内容

15 篇文章

订阅专栏

这里写图片描述

懒得多写，直接上代码：用 urllib 抓取淘宝搜索结果页，再用正则表达式提取商品信息。

# coding=utf-8
import urllib.request
import re
from urllib.request import urlopen, urlretrieve

# Open a web page and return its content.
def url_open(url):
    """Fetch ``url`` and return the response body decoded as text.

    The request is sent with a browser-like ``user-agent`` header. The body
    is decoded as UTF-8; bytes that are not valid UTF-8 are dropped rather
    than raising.

    Args:
        url: Address to fetch; anything ``urllib.request`` can open.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (propagated unchanged
            from ``urllib.request``).
    """
    headers = ("user-agent", "(Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4091.2 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Installing the opener as the global default is kept on purpose: later
    # module-level urllib.request calls (urlopen, urlretrieve) then send the
    # same header as this function does.
    urllib.request.install_opener(opener)
    # Read inside a ``with`` block so the response (and its underlying
    # connection) is closed as soon as the body has been read, instead of
    # lingering until garbage collection.
    with opener.open(url) as response:
        return response.read().decode("utf-8", "ignore")

# Regex for each search-result field, keyed by the name used in the parsed
# records. Compiled once here rather than once per fetched page.
FIELD_PATTERNS = {
    "img": re.compile('"pic_url":"(//.*?)"'),           # product image link
    "name": re.compile('"raw_title":"(.*?)"'),          # product title
    "nick": re.compile('"nick":"(.*?)"'),               # shop name
    "price": re.compile('"view_price":"(.*?)"'),        # product price
    "fee": re.compile('"view_fee":"(.*?)"'),            # shipping fee
    "sales": re.compile('"view_sales":"(.*?)"'),        # number of buyers
    "comment": re.compile('"comment_count":"(.*?)"'),   # review count, may be empty
    "city": re.compile('"item_loc":"(.*?)"'),           # shop location
}

# Paging step of the ``s`` query parameter: page i is requested with s = i * 44.
ITEMS_PER_PAGE = 44


def build_search_url(keyword, page):
    """Return the search URL for ``keyword`` on zero-based result ``page``.

    ``keyword`` is the raw (unescaped) search text; it is percent-encoded
    here before being placed in the query string.
    """
    from urllib.parse import quote

    # The host is written as plain https://s.taobao.com; an extra proxy
    # prefix in front of it would send the request to a different site.
    return (
        "https://s.taobao.com/search?q=" + quote(keyword)
        + "&imgfile=&commend=all&ssid=s5-e&search_type=item"
        "&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1"
        "&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2C48&s="
        + str(page * ITEMS_PER_PAGE)
    )


def parse_items(data):
    """Extract product records from the raw text of one search page.

    Every pattern in ``FIELD_PATTERNS`` is matched against ``data``
    independently and the n-th match of each field is combined into the n-th
    record. If the fields do not all match the same number of times, a
    warning is printed and only as many records as the shortest field allows
    are returned, instead of failing with ``IndexError``.

    Args:
        data: Page text as returned by the fetch step.

    Returns:
        A list of dicts with the keys of ``FIELD_PATTERNS``. ``img`` is
        prefixed with ``http:`` and an empty ``comment`` is replaced by
        ``"0"``. The list is empty when nothing matches.
    """
    columns = {field: pattern.findall(data) for field, pattern in FIELD_PATTERNS.items()}
    if len({len(values) for values in columns.values()}) > 1:
        # Positional pairing cannot be trusted when the counts differ, so
        # say so rather than silently emitting possibly shifted records.
        print("警告：各字段匹配数量不一致，部分商品信息可能错位或被忽略")

    items = []
    # zip stops at the shortest column, so a missing field can no longer
    # push an index past the end of a list.
    for row in zip(*columns.values()):
        item = dict(zip(columns, row))
        item["img"] = "http:" + item["img"]
        item["comment"] = item["comment"] or "0"
        items.append(item)
    return items


if __name__=='__main__':
    try:
        # Product keyword to search for.
        keywd = "黑牛旗舰店"
        print("关键字->"+keywd)
        # Number of result pages to crawl.
        num = 5
        for i in range(num):
            try:
                data = url_open(build_search_url(keywd, i))
            except OSError as e:
                # URLError/HTTPError and socket errors are OSError subclasses;
                # report the failed page and carry on with the next one.
                print(str(e))
                continue
            for item in parse_items(data):
                print("【{nick}】{name}，￥ {price}，{sales}".format(**item))
                # To also save the product image, pass item["img"] and a
                # target file path to urllib.request.urlretrieve here.
    except Exception as e:
        # Top-level boundary of the script: report anything unexpected
        # instead of ending with a traceback.
        print(str(e))
    print("任务完成")