# Python crawler that automatically scrapes images

import os
import time
from lxml import etree
import requests
# Build the XPath expressions that locate each gallery's link <a> element.
def get_href_xp():
    """Return the 30 XPath expressions for the gallery <a> links on the listing page."""
    return [
        f'//*[@id="infinite_scroll"]/div[{n}]/div[1]/div/div[1]/a'
        for n in range(1, 31)
    ]
# Build the XPath expressions that locate each gallery's <img> element.
def get_img_name_xp():
    """Return the 30 XPath expressions for the gallery <img> nodes on the listing page."""
    return [
        f'//*[@id="infinite_scroll"]/div[{n}]/div[1]/div/div[1]/a/img'
        for n in range(1, 31)
    ]
# Get the image names (alt text) matched by an XPath expression.
def get_xp_image_name(xp, page_tree=None):
    """Return the ``alt`` attribute of every element matched by *xp*.

    Args:
        xp: XPath expression selecting <img> elements that carry an ``alt``
            attribute.
        page_tree: parsed document to query.  Defaults to the module-level
            ``tree`` built in the ``__main__`` section (the original code read
            that global implicitly; the parameter keeps existing callers
            working while making the dependency explicit and testable).

    Returns:
        list: one alt-text string per matched element.
    """
    if page_tree is None:
        page_tree = tree  # backward-compatible fallback to the global page
    return [node.attrib['alt'] for node in page_tree.xpath(xp)]
# Get the gallery page URLs matched by an XPath expression.
def get_xp_html_url(xp, page_tree=None):
    """Return absolute gallery URLs for the <a> elements matched by *xp*.

    Args:
        xp: XPath expression selecting <a> elements with an ``href``
            attribute (relative to the site root).
        page_tree: parsed document to query.  Defaults to the module-level
            ``tree`` built in the ``__main__`` section (the original code read
            that global implicitly; the parameter keeps existing callers
            working while making the dependency explicit and testable).

    Returns:
        list: one absolute URL string per matched element.
    """
    if page_tree is None:
        page_tree = tree  # backward-compatible fallback to the global page
    return ['http://www.mm288.com' + node.attrib['href']
            for node in page_tree.xpath(xp)]
# Number of images in one gallery (one model's photo count, which is also
# the number of per-image page links).
def image_number(url):
    """Fetch *url* and return the photo-count text from ``#picnum``.

    Returns the text of the first matching <span>, or None when the element
    is absent.  The caller is expected to convert the result with int().
    """
    page = requests.get(url).content.decode('utf-8')
    doc = etree.HTML(page)
    hits = doc.xpath('//*[@id="picnum"]/span[2]')
    if hits:
        return hits[0].text
    return None
# Expand one gallery URL into its per-image page URLs.
def enlarge(url, number):
    """Expand *url* into *number* numbered page URLs.

    'xxx.html' becomes 'xxx_1.html', 'xxx_2.html', ..., 'xxx_<number>.html'
    (the last five characters are assumed to be the '.html' suffix).
    """
    stem, suffix = url[:-5], url[-5:]
    return [f'{stem}_{i}{suffix}' for i in range(1, number + 1)]


# Resolve a per-image page to the direct image download URL.
def image_download_url(url):
    """Fetch *url* and return the ``href`` of the download link.

    The XPath targets the second <li> of the page's first <ul>, which holds
    the full-size image link on this site.  Returns None when no match.
    """
    page = requests.get(url).content.decode('utf-8')
    doc = etree.HTML(page)
    hits = doc.xpath('/html/body/div[2]/div[2]/div[2]/ul[1]/li[2]/a')
    if hits:
        return hits[0].attrib['href']
    return None
def download(url_list, file_name):
    """Download every image URL in *url_list* into ``.\\beautiful_gril\\<file_name>``.

    Files are saved as ``1.jpg``, ``2.jpg``, ... in download order.

    Args:
        url_list: iterable of direct image URLs.
        file_name: sub-folder name (the model's gallery title).

    Fixes over the original:
    - ``requests.close()`` raised AttributeError (the ``requests`` module has
      no ``close``), crashing after the first image — removed.
    - ``os.makedirs`` now uses ``exist_ok=True`` so re-runs don't crash.
    - Redundant ``f.close()`` inside the ``with`` block removed; the sleep is
      moved out of the ``with`` so the file isn't held open while throttling.
    """
    headers = {
        'Host': 'img1.085p.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    os.makedirs(f'.\\beautiful_gril\\{file_name}', exist_ok=True)
    for k, link in enumerate(url_list, start=1):
        r = requests.get(link, headers=headers)
        time.sleep(1)  # throttle requests to be polite to the image host
        with open(f'.\\beautiful_gril\\{file_name}\\{k}.jpg', 'wb') as f:
            f.write(r.content)
if __name__ == "__main__":
    # Fetch and parse the listing page; ``tree`` is the module-level global
    # that the get_xp_* helpers read.
    response = requests.get('http://www.mm288.com/meinv/').content.decode('utf-8')
    tree = etree.HTML(response)
    # XPath templates for the 30 gallery cells on the listing page
    href_xpath_list = get_href_xp()
    image_name_xpath_list = get_img_name_xp()
    # Each helper returns a list per XPath; keeping element [0] of each
    # reproduces the original 2D -> 1D flatten step.
    href_list = [get_xp_html_url(xp)[0] for xp in href_xpath_list]
    image_name_list = [get_xp_image_name(xp)[0] for xp in image_name_xpath_list]

    # Build the per-image page URLs for every gallery (one list per model).
    image = []
    for link in href_list:
        number = image_number(link)
        image.append(enlarge(link, int(number)))
    # Resolve each per-image page to its direct download URL.
    image_load_list = [
        [image_download_url(link) for link in item]
        for item in image
    ]

    # BUG FIX: in the original, download() and ``key += 1`` were dedented out
    # of the loop, so only the final gallery was downloaded (once, under the
    # last name).  Starting at index 13 is preserved — presumably a manual
    # resume point from an interrupted run; TODO confirm and consider 0.
    for key, url_list in enumerate(image_load_list[13:], start=13):
        download(url_list, image_name_list[key])
    

    
# NOTE: removed CSDN page boilerplate (comment/paywall/tipping widget text)
# that was accidentally pasted at the end of this file; it was not code and
# made the file unparseable.