Goals:
- Pick a starting user: choose an influential user ("big V") with a large follower or followee count as the crawl's starting point
- Fetch the follower and followee lists: use Zhihu's API to get that user's followers and the people they follow
- Fetch details for each listed user: use the Zhihu API to get detailed profile info for every user in those lists
- Fetch each listed user's followers and followees: repeat the previous steps for every user in the lists, which makes the crawl recursive
Difficulties:
- Recursive crawling
- Storing to MongoDB (deduplication required; see the sketch right after this list)
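On the deduplication point, the idea used later in the pipeline can be shown in isolation: every user is written with an upsert keyed on url_token, so seeing the same user twice overwrites one document instead of adding a second. A minimal pymongo sketch, assuming a local MongoDB on the default port and using made-up user dicts:

import pymongo

# Minimal sketch of upsert-based deduplication; the two sample dicts are illustrative only.
client = pymongo.MongoClient('localhost', 27017)
collection = client['zhihu']['info']

for user in [{'url_token': 'song-de-90', 'name': '宋德'},
             {'url_token': 'song-de-90', 'name': '宋德'}]:   # same user seen twice
    # upsert=True: insert if url_token is new, otherwise update the existing document
    collection.update_one({'url_token': user['url_token']}, {'$set': user}, upsert=True)

print(collection.count_documents({'url_token': 'song-de-90'}))  # 1, not 2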
1. Commands in cmd
C:\Users\Administrator\Desktop>scrapy startproject zhihuuser
C:\Users\Administrator\Desktop>cd zhihuuser
C:\Users\Administrator\Desktop\zhihuuser>scrapy genspider demo zhihu.com
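These commands should leave a project layout roughly like this (the exact file list may differ slightly between Scrapy versions):

zhihuuser/
    scrapy.cfg
    zhihuuser/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            demo.py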
2. Edit items.py to define the fields to crawl
from scrapy import Item, Field


class ZhihuuserItem(Item):
    id = Field()
    name = Field()
    avatar_url = Field()
    headline = Field()
    url = Field()
    url_token = Field()
    gender = Field()
    cover_url = Field()
    answer_count = Field()
    articles_count = Field()
    location = Field()
    education = Field()
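As a standalone preview of how these fields get filled in step 3: the spider walks item.fields and copies whichever keys also appear in the API response, so adding a new field only requires declaring it here. The result dict below is made up to show the shape, not a real Zhihu payload:

from zhihuuser.items import ZhihuuserItem

# Illustrative response fragment; real Zhihu responses contain many more keys.
result = {'id': 'abc123', 'name': '宋德', 'url_token': 'song-de-90',
          'answer_count': 42, 'some_key_we_did_not_declare': 'ignored'}

item = ZhihuuserItem()
for field in item.fields:      # all Field() names declared above
    if field in result:        # copy only the keys we declared
        item[field] = result[field]

print(dict(item))              # {'id': 'abc123', 'name': '宋德', 'url_token': 'song-de-90', 'answer_count': 42}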
3. Edit the spider file (spiders/demo.py, generated by genspider above) to crawl Zhihu user info in three directions: the user themselves, the people they follow (followees), and their followers
import json

import scrapy

from zhihuuser.items import ZhihuuserItem


class DemoSpider(scrapy.Spider):
    name = 'demo'
    allowed_domains = ['zhihu.com']
    start_urls = ['http://zhihu.com/']

    # User 宋德 ("song-de-90") is the starting point of the crawl
    start_user = 'song-de-90'

    # The user's own profile
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics'

    # The people this user follows (followees)
    followees_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    followees_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    # The user's followers (fans)
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        # One request per direction, each with its own callback
        yield scrapy.Request(url=self.user_url.format(user=self.start_user, include=self.user_query),
                             callback=self.parse_user)
        yield scrapy.Request(url=self.followees_url.format(user=self.start_user, include=self.followees_query,
                                                           offset=0, limit=20),
                             callback=self.parse_followees)
        yield scrapy.Request(url=self.followers_url.format(user=self.start_user, include=self.followers_query,
                                                           offset=0, limit=20),
                             callback=self.parse_followers)

    def parse_user(self, response):
        # Parse the user's own info; the key field is url_token,
        # which is fed back into followees_url and followers_url
        result = json.loads(response.text)
        item = ZhihuuserItem()
        for field in item.fields:
            # Copy every declared field that appears in the response,
            # so we don't have to assign each one by hand
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield scrapy.Request(url=self.followees_url.format(user=result.get('url_token'),
                                                           include=self.followees_query, offset=0, limit=20),
                             callback=self.parse_followees)
        yield scrapy.Request(url=self.followers_url.format(user=result.get('url_token'),
                                                           include=self.followers_query, offset=0, limit=20),
                             callback=self.parse_followers)

    def parse_followees(self, response):
        # Parse one page of the "following" list
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield scrapy.Request(url=self.user_url.format(user=result.get('url_token'), include=self.user_query),
                                     callback=self.parse_user)
        if 'paging' in results.keys() and not results.get('paging').get('is_end'):
            # Handle pagination: next is just the current followees_url with offset increased by 20
            next_page = results.get('paging').get('next')
            yield scrapy.Request(url=next_page, callback=self.parse_followees)

    def parse_followers(self, response):
        # Parse one page of the "followers" list (i.e. fans)
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield scrapy.Request(url=self.user_url.format(user=result.get('url_token'), include=self.user_query),
                                     callback=self.parse_user)
        if 'paging' in results.keys() and not results.get('paging').get('is_end'):
            # Handle pagination: next is just the current followers_url with offset increased by 20
            next_page = results.get('paging').get('next')
            yield scrapy.Request(url=next_page, callback=self.parse_followers)
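To make the pagination branch concrete: every list response carries a paging block, and while is_end is false the spider simply follows next, which is the same list URL with offset advanced by the limit of 20. The dict below only illustrates the shape; the exact next URL is an assumption, not a captured response:

# Illustrative 'paging' block (not a real capture); the next URL just bumps offset by 20.
paging = {
    'is_end': False,
    'next': 'https://www.zhihu.com/api/v4/members/song-de-90/followees'
            '?include=...&offset=20&limit=20',
}

if not paging['is_end']:
    next_page = paging['next']   # this is what gets fed back into scrapy.Request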
4. Edit settings.py
BOT_NAME = 'zhihuuser'

SPIDER_MODULES = ['zhihuuser.spiders']
NEWSPIDER_MODULE = 'zhihuuser.spiders'

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
}

ITEM_PIPELINES = {
    'zhihuuser.pipelines.ZhihuuserPipeline': 300,
}

MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
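If you want to double-check that the two custom keys (MONGO_URI, MONGO_DATABASE) are picked up along with the rest, a small sketch using Scrapy's get_project_settings helper, run from the project directory, should print them back:

# Optional check; run from the directory containing scrapy.cfg.
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get('MONGO_URI'))       # expected: localhost
print(settings.get('MONGO_DATABASE'))  # expected: zhihu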
5. Edit pipelines.py to save the crawled items to MongoDB
First, start MongoDB from cmd: C:\Users\Administrator>mongod --dbpath D:\data\db --port 27017
This launches a MongoDB instance serving the database files in D:\data\db on port 27017.
This step is required; without a running mongod, the crawled data cannot be saved to MongoDB and the MongoDB Compass desktop client cannot connect either.
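A quick way to confirm that mongod is actually reachable before running the crawl is a minimal pymongo check, assuming the default localhost:27017 started above:

import pymongo
from pymongo.errors import ServerSelectionTimeoutError

# Minimal connectivity check against the mongod started above.
client = pymongo.MongoClient('localhost', 27017, serverSelectionTimeoutMS=2000)
try:
    client.admin.command('ping')   # raises if the server cannot be reached
    print('MongoDB is up on port 27017')
except ServerSelectionTimeoutError as exc:
    print('MongoDB is not reachable:', exc)

With the connection confirmed, pipelines.py itself: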
import pymongo


class ZhihuuserPipeline(object):

    def __init__(self, mongo_uri, mongo_database):
        self.mongo_uri = mongo_uri
        self.mongo_database = mongo_database

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings defined in settings.py
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'),
                   mongo_database=crawler.settings.get('MONGO_DATABASE', 'items'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_database]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Deduplicated storage: upsert keyed on url_token, so a user crawled
        # twice updates the existing document instead of creating a duplicate
        self.db['info'].update_one({'url_token': item['url_token']},
                                   {'$set': dict(item)}, upsert=True)
        print('Saved to MongoDB successfully!')
        return item
6. Run the crawl
C:\Users\Administrator\Desktop\zhihuuser>scrapy crawl demo
The 'info' collection now appears in the 'zhihu' database in MongoDB and holds the crawled user records.
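To confirm what was stored, a small pymongo query works as well as MongoDB Compass; the field names are the ones declared in the Item in step 2:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection = client['zhihu']['info']

print(collection.count_documents({}))   # number of distinct users stored so far
# Show one stored record, limited to a few readable fields
print(collection.find_one({}, {'_id': 0, 'url_token': 1, 'name': 1, 'headline': 1}))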