Focused crawler: scrapes specified content from within a page
- Coding workflow (a minimal sketch follows this list):
Specify the URL
Send the request
Get the response data
Parse the data
Persist the results
- Data parsing methods:
Regular expressions
bs4
xpath (preferred)
- Overview of how data parsing works:
The text to be extracted is stored either between tags or in a tag's attributes, so parsing is:
1. Locate the target tags
2. Extract (parse) the data stored in the tags or in their attributes
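As a reference, a minimal sketch of the workflow as a whole; the URL, the parsing step and the output file name below are placeholders, and the concrete parsing techniques are covered in the rest of these notes:
import requests

def focused_crawl():
    # 1. Specify the URL (placeholder address, not a real target)
    url = 'https://www.example.com/'
    headers = {'User-Agent': 'Mozilla/5.0'}
    # 2. Send the request and 3. get the response data
    page_text = requests.get(url=url, headers=headers).text
    # 4. Parse the data: locate the target tags, then extract the values
    #    stored between them or in their attributes (regex / bs4 / xpath)
    parsed = page_text[:100]  # placeholder for a real parsing step
    # 5. Persist the result
    with open('./result.txt', 'w', encoding='utf-8') as f:
        f.write(parsed)

if __name__ == '__main__':
    focused_crawl()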
Regex parsing
<div class="thumb">
<a href="/article/124119959" target="_blank">
<img src="//pic.qiushibaike.com/system/pictures/12411/124119959/medium/VOAJGW4N8N8MR5AN.jpg" alt="糗事#124119959" class="illustration" width="100%" height="auto">
</a>
</div>
Regex extraction:
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
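A quick sketch of this pattern applied to the snippet above; re.S is what lets .*? match across line breaks:
import re

# The sample markup from above, as one multi-line string
html = '''<div class="thumb">
<a href="/article/124119959" target="_blank">
<img src="//pic.qiushibaike.com/system/pictures/12411/124119959/medium/VOAJGW4N8N8MR5AN.jpg" alt="糗事#124119959" class="illustration" width="100%" height="auto">
</a>
</div>'''
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
# With re.S the dot also matches newlines, so the capture group picks out the src value
print(re.findall(ex, html, re.S))
# ['//pic.qiushibaike.com/system/pictures/12411/124119959/medium/VOAJGW4N8N8MR5AN.jpg']
The full scraper below applies the same pattern page by page.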
import os
import re
import requests

# Requirement: scrape every image from the image section of qiushibaike.com
def get_image():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    # Create a folder to save the images
    if not os.path.exists('./image'):
        os.mkdir('./image')
    # Generic URL template for the paginated image pages
    url = 'https://www.qiushibaike.com/imgrank/page/{}/'
    for i in range(1, 13):
        new_url = url.format(i)
        # General-purpose crawling: fetch the whole page for this URL
        response = requests.get(url=new_url, headers=headers).text
        # Focused crawling: parse/extract every image on the page
        # The images live in markup like:
        # <div class="thumb">
        #     <a href="/article/124119959" target="_blank">
        #         <img src="//pic.qiushibaike.com/system/pictures/12411/124119959/medium/VOAJGW4N8N8MR5AN.jpg" alt="糗事#124119959" class="illustration" width="100%" height="auto">
        #     </a>
        # </div>
        # Extract the src attributes with a regex
        ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        img_src_list = re.findall(ex, response, re.S)
        # print(img_src_list)
        for src in img_src_list:
            # Build the full URL
            src = 'https:' + src
            # Request the binary data of the image
            img_data = requests.get(url=src, headers=headers).content
            # Build the image file name from the URL
            img_name = src.split('/')[-1]
            # Path where the image is stored
            img_path = './image/' + img_name
            with open(img_path, 'wb') as f:
                f.write(img_data)

if __name__ == '__main__':
    get_image()
Data parsing with bs4
- How data parsing works:
1. Locate the tags
2. Extract the data stored in the tags or in the tags' attributes
- How bs4 data parsing works:
1. Instantiate a BeautifulSoup object and load the page source into it
2. Call the BeautifulSoup object's attributes and methods to locate tags and extract the data
- Environment setup:
pip install bs4
pip install lxml
- How to instantiate a BeautifulSoup object:
from bs4 import BeautifulSoup
Instantiation:
- Load the data from a local HTML file into the object
f = open('./test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(f, 'lxml')
- Load page source fetched from the internet into the object
page_text = response.text
soup = BeautifulSoup(page_text, 'lxml')
- Attributes and methods provided for data parsing:
soup.tagName: returns the first tag in the document that matches tagName
soup.find():
- find('tagName'): equivalent to soup.tagName, e.g. soup.div
- Locating by attribute:
- soup.find('div', class_/id/attr='song')
soup.find_all('tagName'): returns every tag that matches
select:
- select('a selector (id, class or tag selector)'): returns a list
- Hierarchical selectors:
- soup.select('.tang > ul > li > a'): > stands for one level
- soup.select('.tang > ul a'): a space stands for multiple levels
Getting the text between tags:
- soup.a.text/string/get_text()
- text/get_text(): gets all of the text content inside a tag, nested tags included
- string: only gets the text that is a direct child of the tag
Getting attribute values from a tag:
- soup.a['href']
from bs4 import BeautifulSoup

# Load the data from the local HTML file into the object
f = open('./test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(f, 'lxml')
print(soup)
# print(soup.a)  # soup.tagName returns the first tagName tag in the HTML
print(soup.div)
# find('tagName') is equivalent to soup.div
print(soup.find('div'))
# print(soup.div)
print(soup.find('div', class_='song').string)
print(soup.find_all('a'))
print(soup.select('.tang'))
print(soup.select('.tang > ul a')[0]['href'])
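Since test.html itself is not included in these notes, here is a self-contained sketch that runs against a small inline snippet (made-up markup) and illustrates the same calls, including the difference between string and text:
from bs4 import BeautifulSoup

# Made-up markup, only for illustrating the API
html = '''<div class="tang">
<ul>
<li><a href="https://www.example.com/a1">first link</a></li>
<li><a href="https://www.example.com/a2">second link</a></li>
</ul>
</div>
<div class="song"><p>first paragraph</p><p>second paragraph</p></div>'''
soup = BeautifulSoup(html, 'lxml')
print(soup.div)                                # first <div> in the document
print(soup.find('div', class_='song').text)    # all text inside the tag, nested tags included
print(soup.find('div', class_='song').string)  # None: the div holds child tags, not direct text
print(soup.find_all('a'))                      # every <a> tag
print(soup.select('.tang > ul a')[0]['href'])  # CSS selector plus attribute access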
Example:
import requests
from bs4 import BeautifulSoup

def get_sanguo():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'utf-8'
    page_text = page_text.text
    # print(page_text.apparent_encoding)  # check the detected encoding if needed
    # Parse the chapter titles and detail-page URLs out of the home page
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.book-mulu > ul > li')
    f = open('./sanguo.txt', 'w', encoding='utf-8')
    for li in li_list:
        title = li.a.string
        detail_url = 'https://www.shicimingju.com' + li.a['href']
        # Request the detail page and parse the chapter content from it
        detail_page_text = requests.get(url=detail_url, headers=headers)
        detail_page_text.encoding = 'utf-8'
        detail_page_text = detail_page_text.text
        # Parse the chapter content out of the detail page
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', class_='chapter_content')
        # The chapter text
        content = div_tag.text
        f.write(title + ':' + content + '\n')
        print(title, 'scraped successfully')
    f.close()

if __name__ == '__main__':
    get_sanguo()
xpath parsing
- How xpath parsing works:
1. Instantiate an etree object and load the page source to be parsed into it
2. Call the etree object's xpath method with an xpath expression to locate tags and capture their content
- Environment setup:
pip install lxml
- How to instantiate an etree object:
from lxml import etree
- Load the source data from a local HTML file into the etree object:
etree.parse(filepath)
- Load source data fetched from the internet into the object:
etree.HTML(page_text)
- xpath expressions:
/ : locates starting from the root node; stands for one level
// : stands for multiple levels; it can also mean "start locating from any position"
Attribute locating: //div[@class="song"], i.e. tag[@attrname="attrvalue"]
Index locating: //div[@class="song"]/p[3]; indexes start at 1
- Getting text:
/text(): gets the text that is a direct child of the tag
//text(): gets non-direct text as well (all of the text content)
- Getting attributes:
@attrname, e.g. img/@src
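Before the exercises, a self-contained sketch against a small inline snippet (made-up markup) that illustrates these expressions:
from lxml import etree

# Made-up markup, only for illustrating the expressions
html = '''<html><body>
<div class="song">
<p>p1</p><p>p2</p><p>p3</p>
<img src="https://www.example.com/pic.jpg" alt="demo">
</div>
</body></html>'''
tree = etree.HTML(html)
print(tree.xpath('/html/body/div'))                    # level-by-level locating from the root
print(tree.xpath('//div[@class="song"]/p[3]/text()'))  # attribute + index locating (index starts at 1): ['p3']
print(tree.xpath('//div[@class="song"]//text()'))      # all text under the div, direct or not
print(tree.xpath('//div[@class="song"]/img/@src'))     # take an attribute: ['https://www.example.com/pic.jpg']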
Exercises
1. Scrape second-hand housing listings from 58.com
import time
import requests
from lxml import etree

def get_58():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    url = 'https://sz.58.com/ershoufang/p{}/?utm_source=market&PGTID=0d30000c-0000-4bd7-7215-448488f17d0e&ClickID=1'
    # Open the output file once, before the page loop, so later pages do not overwrite earlier ones
    f = open('./58.txt', 'w', encoding='utf-8')
    for i in range(1, 10):
        time.sleep(5)  # pause between pages to avoid hammering the site
        new_url = url.format(i)
        response = requests.get(url=new_url, headers=headers).text
        tree = etree.HTML(response)
        # Each listing is a div under the first section with class "list"
        text_list = tree.xpath('//section[@class="list"][1]/div')
        for li in text_list:
            page_title = li.xpath('./a/div[2]/div[1]/div[1]/h3/text()')[0]
            page_price = li.xpath('./a/div[2]/div[2]/p/span[@class="property-price-total-num"]/text()')[0]
            content = 'Listing: ' + page_title + '; ' + 'Price: ' + page_price + ' (unit: 10,000 CNY)'
            f.write(content + '\n')
    f.close()

if __name__ == '__main__':
    get_58()
2. Scrape images
import os
import requests
from lxml import etree

def get_image():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    url = 'https://pic.netbian.com/4kmeinv/'
    response = requests.get(url=url, headers=headers)
    # print(response.apparent_encoding)  # check the detected encoding
    # The site is encoded as gbk, not utf-8
    response.encoding = 'gbk'
    response = response.text
    tree = etree.HTML(response)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    if not os.path.exists('./ccimage'):
        os.mkdir('./ccimage')
    for i in li_list:
        image_url = 'https://pic.netbian.com' + i.xpath('./a/img/@src')[0]
        # Use the alt text as the file name
        image_name = i.xpath('./a/img/@alt')[0] + '.jpg'
        image_path = './ccimage/' + image_name
        content = requests.get(url=image_url, headers=headers).content
        with open(image_path, 'wb') as f:
            f.write(content)

if __name__ == '__main__':
    get_image()
3. Get the names of all cities nationwide
import requests
from lxml import etree

def get_city():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    url = 'https://www.aqistudy.cn/historydata/'
    response = requests.get(url=url, headers=headers).text
    tree = etree.HTML(response)
    # Locate the <a> tags for both the hot cities and the full city list
    # //div[@class="bottom"]/ul/li/a         hierarchy of the hot-city <a> tags
    # //div[@class="bottom"]/ul/div[2]/li/a  hierarchy of the all-city <a> tags
    a_list = tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')
    all_city_name = []
    for a in a_list:
        city_name = a.xpath('./text()')[0]
        all_city_name.append(city_name)
    print(all_city_name)

if __name__ == '__main__':
    get_city()