1. Simple version
For a site with no anti-scraping measures:
import urllib.request
import re
url="http://ohhappyday.com/" # 1.我们要爬取图片的地址
page = urllib.request.urlopen(url) # 2. 打开网址 print(page)
html = page.read().decode("utf-8") # 3. 获取html源码
imglist = re.findall('img src="(http.*?)"',html) # 4. 在html中匹配出符合条件的字符串
x=0
for imgurl in imglist: # 遍历图片地址列表
urllib.request.urlretrieve(imgurl,'./picture/pic%s.jpg' %x) # 第四行 获取图片并保存
x=x+1
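Note that urlretrieve will fail if the ./picture folder does not exist yet; a minimal sketch to create it before the loop runs (assuming the relative path used above):

import os
os.makedirs('./picture', exist_ok=True)   # create the output folder if missing; no error if it already exists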
2nd version
For a site with anti-scraping measures (a User-Agent header has to be sent):
import urllib.request
import re
Head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}
url="https://www.microsoft.com/en-us/research/people/shihan/" # 我们要爬取图片的地址
page1=urllib.request.Request(url,headers = Head)
page=urllib.request.urlopen(page1)
html = page.read().decode("utf-8")
imglist = re.findall('content="(.*?\.png)"',html) # 第三行 在html中匹配出符合条件的字符串
opener=urllib.request.build_opener()
opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)
x=0
for imgurl in imglist: # 遍历图片地址列表
urllib.request.urlretrieve(imgurl,'./picture/picture%s.jpg' %x) # 第四行 获取图片并保存
    x = x + 1
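install_opener changes global state for every later urlopen/urlretrieve call. A minimal alternative sketch that sends the same headers per request and writes the bytes itself (fetch_image is a hypothetical helper, not part of the original script):

import urllib.request

def fetch_image(imgurl, filename, headers):
    # Hypothetical helper: attach the headers to the request itself instead of installing a global opener.
    req = urllib.request.Request(imgurl, headers=headers)
    with urllib.request.urlopen(req) as resp, open(filename, 'wb') as f:
        f.write(resp.read())

# usage inside the loop above:
# fetch_image(imgurl, './picture/picture%s.jpg' % x, Head)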
3rd version
The scraped URLs are incomplete: they are scheme-relative and do not start with http, so the scheme has to be added before downloading.
import urllib.request
import re
Head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}
url="https://www.microsoft.com/en-us/research/group/software-analytics/" # 我们要爬取图片的地址
page1=urllib.request.Request(url,headers = Head)
page=urllib.request.urlopen(page1)
html = page.read().decode("utf-8")
imglist3 = re.findall('src=\'(//[^\s]*?.jpg)\'',html) # 第三行 在html中匹配出符合条件的字符串
opener=urllib.request.build_opener()
opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)
x=0
for imgurl in imglist3: # 遍历图片地址列表
urllib.request.urlretrieve("https:"+imgurl,'./picture/3picture%s.jpg' %x) # 第四行 获取图片并保存
    x = x + 1
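Instead of hard-coding the "https:" prefix, urllib.parse.urljoin can resolve scheme-relative URLs against the page URL; a minimal sketch (the image path below is hypothetical):

from urllib.parse import urljoin

page_url = "https://www.microsoft.com/en-us/research/group/software-analytics/"
img_src = "//www.microsoft.com/en-us/research/uploads/photo.jpg"   # hypothetical scheme-relative src value
print(urljoin(page_url, img_src))   # -> https://www.microsoft.com/en-us/research/uploads/photo.jpg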
4th version
The regular expression above was not well designed; capture the alt text together with the URL so the file can be saved under the person's name.
imglist41 = re.findall(r"img alt='([^\d]*?)' src='(//[^\s]*?\.jpg)' srcset=", html)   # list of (name, URL) pairs
opener=urllib.request.build_opener()
opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)
x=0
for imgurl in imglist41:                            # iterate over the list of (name, URL) pairs
    urllib.request.urlretrieve("https:" + imgurl[1], './picture/%s.jpg' % imgurl[0])   # save the image under the name from the alt text
    x = x + 1
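Hand-written regexes like the one above break as soon as the attribute order or quoting changes. A minimal sketch of the same alt/src extraction with the standard-library html.parser (ImgCollector is a hypothetical class; html is the decoded page source read earlier):

from html.parser import HTMLParser

class ImgCollector(HTMLParser):
    # Collect (alt, src) pairs from <img> tags.
    def __init__(self):
        super().__init__()
        self.images = []

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            attrs = dict(attrs)
            if 'src' in attrs:
                self.images.append((attrs.get('alt', ''), attrs['src']))

parser = ImgCollector()
parser.feed(html)       # html is the page source from the versions above
print(parser.images)    # list of (alt text, image URL) tuples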
5th version
Scrape a specific person's picture.
import urllib.request
import re
Head = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}
url="https://www.microsoft.com/en-us/research/group/software-analytics/" # 我们要爬取图片的地址
page1=urllib.request.Request(url,headers = Head)
page=urllib.request.urlopen(page1)
html = page.read().decode("utf-8")
imglist51 = re.findall(r"img alt='([^\d]* Shi Han)' src='(//[^\s]*?\.png)' srcset=", html)   # .* matches any character except a newline; ? makes it non-greedy
imglist61 = imglist51 + re.findall(r"img alt='([^\d]*?)' src='(//[^\s]*?\.jpg)' srcset=", html)
opener=urllib.request.build_opener()
opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
urllib.request.install_opener(opener)
x=0
for imgurl in imglist61:                            # iterate over the list of (name, URL) pairs
    print(x)
    urllib.request.urlretrieve("https:" + imgurl[1], './picture/%s.jpg' % imgurl[0])   # save the image under the name from the alt text
    x = x + 1
    if x == 3:
        break
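If only one person's photo is wanted, it may be simpler to keep a single general regex and filter the matches in Python rather than encoding the name into the pattern; a minimal sketch assuming the imglist61 (name, URL) pairs from above:

target = 'Shi Han'   # assumed name to keep
wanted = [(name, src) for name, src in imglist61 if target in name]
for name, src in wanted:
    urllib.request.urlretrieve("https:" + src, './picture/%s.jpg' % name)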