import shutil,os
from bs4 import BeautifulSoup
from urllib.request import Request,urlopen,urlretrieve
import xlwt
import requests
class Picture_downLoad(object):
    """Crawler that downloads recipe images from meishij.net, page by page.

    Images are saved beneath a fresh ``cate`` directory in the current
    working directory, one sub-directory per result page.
    """

    def __init__(self):
        # Site root kept for reference; crawling actually starts from the
        # DIY listing URL passed in start_Load().
        self.base_url = 'http://www.meishij.net/'
        # 1-based counter of the page currently being downloaded.
        self.current_page = 1

    def start_Load(self):
        """Reset the ``cate`` output directory and begin crawling at page 1.

        NOTE: chdir()s into ``cate``; the process working directory is
        changed as a side effect.
        """
        # Delete any previous run's output so we start from a clean tree.
        if os.path.exists('cate'):
            shutil.rmtree('cate', True)  # ignore_errors=True: best-effort cleanup
        os.mkdir('cate')
        os.chdir('cate')
        self.get_page_with_url('https://www.meishij.net/chufang/diy/?&page=1')

    def get_page_with_url(self, url):
        """Fetch *url* and hand the decoded HTML to the page parser.

        A browser User-Agent header is sent (the site presumably rejects
        urllib's default). On any request/decoding failure the error is
        printed and crawling stops.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'
        }
        request = Request(url, headers=headers)
        try:
            response = urlopen(request)
            code = response.read().decode()
        except Exception as e:
            # BUG FIX: the original did print('请求失败', + e) — unary plus
            # applied to an exception raises TypeError and masked the real
            # error. Pass the exception as a second print argument instead.
            print('请求失败', e)
        else:
            self.get_data_with_page(code)

    def get_data_with_page(self, code):
        """Parse one page of HTML, download every recipe image, then recurse
        to the next page via get_next_page()."""
        print('正在下载第{}页...'.format(self.current_page))
        soup = BeautifulSoup(code, 'lxml')
        # One sub-directory per page, e.g. '3Page1'.
        page_list = '{}Page1'.format(self.current_page)
        os.mkdir(page_list)
        os.chdir(page_list)
        for image in soup.select('img.img'):
            image_src = image.get('src')
            image_alt = image.get('alt')
            # Build a file name from the alt text: keep only the part before
            # any '(' and strip '|' (invalid in Windows file names).
            image_alt = image_alt.split('(')[0] + '.jpg'
            image_alt = image_alt.replace('|', '')
            print(image_alt)
            print(image_src)
            urlretrieve(image_src, image_alt)
        os.chdir(os.path.pardir)  # back to the 'cate' root for the next page
        self.current_page += 1
        self.get_next_page(code)

    def get_next_page(self, code):
        """Follow the 'next page' link of *code*; stop cleanly on the last page."""
        soup = BeautifulSoup(code, 'lxml')
        next_page = soup.select('a.next')
        # BUG FIX: the original indexed next_page[0] unconditionally, which
        # raises IndexError on the final page (no 'a.next' anchor). Treat an
        # absent link as the normal end of the crawl.
        if not next_page:
            return
        url = next_page[0].get('href')
        self.get_page_with_url(url)
if __name__ == '__main__':
    # Run the crawl only when executed as a script, not when this module
    # is imported (the original started a network download on import).
    downLoad = Picture_downLoad()
    downLoad.start_Load()