import os
import time
from lxml import etree
import requests
def get_href_xp(count=30):
    """Build the XPath addresses of the image links on the listing page.

    Args:
        count: Number of consecutive ``div`` slots under the element with
            id ``infinite_scroll`` to address, numbered from 1. Defaults to
            30, the value this script has always used.

    Returns:
        A list of ``count`` XPath strings; entry ``i`` selects the ``<a>``
        element inside slot ``i + 1``.
    """
    return [
        f'//*[@id="infinite_scroll"]/div[{slot}]/div[1]/div/div[1]/a'
        for slot in range(1, count + 1)
    ]
def get_img_name_xp(count=30):
    """Build the XPath addresses of the image-name elements on the listing page.

    Args:
        count: Number of consecutive ``div`` slots under the element with
            id ``infinite_scroll`` to address, numbered from 1. Defaults to
            30, the value this script has always used.

    Returns:
        A list of ``count`` XPath strings; entry ``i`` selects the ``<img>``
        element nested in the link of slot ``i + 1``.
    """
    return [
        f'//*[@id="infinite_scroll"]/div[{slot}]/div[1]/div/div[1]/a/img'
        for slot in range(1, count + 1)
    ]
def get_xp_image_name(xp, doc=None):
    """Return the image names found at an XPath address.

    The name of an image is the ``alt`` attribute of the matched element.

    Args:
        xp: XPath expression to evaluate.
        doc: Object exposing ``xpath(expression)`` (for example a parsed
            lxml document) to search. When omitted, the module-level
            ``tree`` set by the script entry point is used, which is how
            this function has always been called.

    Returns:
        A list with the ``alt`` value of every matched element, in match
        order; empty when nothing matches.

    Raises:
        KeyError: If a matched element has no ``alt`` attribute.
        NameError: If ``doc`` is omitted and no module-level ``tree`` exists.
    """
    # Falling back to the global keeps existing one-argument callers working.
    source = tree if doc is None else doc
    return [node.attrib['alt'] for node in source.xpath(xp)]
def get_xp_html_url(xp, doc=None):
    """Return the absolute image-page links found at an XPath address.

    Each link is the site prefix followed by the ``href`` attribute of a
    matched element.

    Args:
        xp: XPath expression to evaluate.
        doc: Object exposing ``xpath(expression)`` (for example a parsed
            lxml document) to search. When omitted, the module-level
            ``tree`` set by the script entry point is used, which is how
            this function has always been called.

    Returns:
        A list of URL strings, one per matched element, in match order;
        empty when nothing matches.

    Raises:
        KeyError: If a matched element has no ``href`` attribute.
        NameError: If ``doc`` is omitted and no module-level ``tree`` exists.
    """
    # Falling back to the global keeps existing one-argument callers working.
    source = tree if doc is None else doc
    return [
        'http://www.mm288.com' + node.attrib['href']
        for node in source.xpath(xp)
    ]
def image_number(url, timeout=10):
    """Fetch an image page and read how many pictures its set contains.

    The count of pictures for one model is also the number of per-picture
    page links that exist for her.

    Args:
        url: Address of the image page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The text of the first element matching ``//*[@id="picnum"]/span[2]``
        (a string, not an int), or ``None`` when the page has no such
        element.
    """
    response = requests.get(url, timeout=timeout)
    page = etree.HTML(response.content.decode('utf-8'))
    matches = page.xpath('//*[@id="picnum"]/span[2]')
    # Only the first match is used; an empty result means "not found".
    return matches[0].text if matches else None
def enlarge(url, number):
    """Expand one image-page URL into the URLs of its numbered pages.

    The last five characters of ``url`` are treated as the suffix
    (presumably ``.html`` -- the code only relies on the length), and
    ``_<page>`` is inserted just before it.

    Args:
        url: Address of the image page.
        number: How many numbered pages to generate.

    Returns:
        A list of ``number`` URLs with page indices 1..``number``; empty
        when ``number`` is zero or negative.
    """
    suffix_len = 5
    stem, suffix = url[:-suffix_len], url[-suffix_len:]
    return [f'{stem}_{page}{suffix}' for page in range(1, number + 1)]
def image_download_url(url, timeout=10):
    """Fetch a single-picture page and extract the picture's download address.

    Args:
        url: Address of the picture page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The ``href`` attribute of the first element matching
        ``/html/body/div[2]/div[2]/div[2]/ul[1]/li[2]/a``, or ``None`` when
        the page has no such element.

    Raises:
        KeyError: If the matched element has no ``href`` attribute.
    """
    response = requests.get(url, timeout=timeout)
    page = etree.HTML(response.content.decode('utf-8'))
    matches = page.xpath('/html/body/div[2]/div[2]/div[2]/ul[1]/li[2]/a')
    # Only the first match is used; an empty result means "not found".
    return matches[0].attrib['href'] if matches else None
def download(url_list, file_name, timeout=10):
    """Download every picture in ``url_list`` into its own folder.

    Pictures are saved as ``1.jpg``, ``2.jpg``, ... (in list order) under
    ``./beautiful_gril/<file_name>``. The folder is created if needed and
    reused if it already exists, so the script can be re-run.

    Args:
        url_list: Picture addresses to fetch.
        file_name: Name of the sub-folder that receives the pictures.
        timeout: Seconds to wait for the server on each request before
            giving up.
    """
    headers = {
        'Host': 'img1.085p.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    # os.path.join yields the same '.\\beautiful_gril\\<name>' path on
    # Windows while also working on systems that use '/' as separator.
    target_dir = os.path.join('.', 'beautiful_gril', file_name)
    os.makedirs(target_dir, exist_ok=True)
    for index, link in enumerate(url_list, start=1):
        response = requests.get(link, headers=headers, timeout=timeout)
        with open(os.path.join(target_dir, f'{index}.jpg'), 'wb') as f:
            f.write(response.content)
        # Close the response itself; the ``requests`` module has no close().
        response.close()
        # Pause between pictures to avoid hammering the image server.
        time.sleep(1)
if __name__ == "__main__":
    # 0-based position of the first listing entry to download. The script
    # has always skipped the first 13 entries.
    # NOTE(review): looks like a leftover resume offset -- confirm before
    # changing it to 0.
    start_index = 13

    response = requests.get('http://www.mm288.com/meinv/', timeout=10).content.decode('utf-8')
    # ``tree`` must stay a module-level name: get_xp_html_url() and
    # get_xp_image_name() read it as a global.
    tree = etree.HTML(response)

    # Resolve the link and the name of each listing slot together, so the
    # two can never get out of step. Each lookup returns a list; its first
    # item is the value for that slot. Slots missing from the page are
    # skipped instead of raising IndexError.
    albums = []
    for href_xp, name_xp in zip(get_href_xp(), get_img_name_xp()):
        links = get_xp_html_url(href_xp)
        names = get_xp_image_name(name_xp)
        if links and names:
            albums.append((names[0], links[0]))

    # Only entries that will actually be downloaded are resolved, which
    # avoids the network round-trips for the skipped ones.
    for album_name, album_url in albums[start_index:]:
        number = image_number(album_url)
        if number is None:
            print(f'skipping {album_name}: picture count not found at {album_url}')
            continue
        page_urls = enlarge(album_url, int(number))
        # image_download_url() returns None for a page without a download
        # link; such pages are left out rather than passed to download().
        picture_urls = [
            picture_url
            for picture_url in map(image_download_url, page_urls)
            if picture_url is not None
        ]
        download(picture_urls, album_name)
# python 爬虫自动爬取图片
# 最新推荐文章于 2023-10-19 14:00:00 发布