30-多线程爬虫-爬取糗事百科

最新推荐文章于 2021-04-05 10:37:10 发布

转载最新推荐文章于 2021-04-05 10:37:10 发布 · 300 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：https://my.oschina.net/pansy0425/blog/3089765

文章标签：

#爬虫 #python #json

本文详细介绍了一种爬取糗事百科网站数据的爬虫实现方法，包括单进程和多线程版本，涵盖爬虫设计、多线程原理及Queue应用，适合爬虫初学者实践。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

说明：爬取 糗事百科 'https://www.qiushibaike.com/8hr/page/{}/'.format(page_num)

爬取 信息：题目、好笑、评论、转发、发表人

1、单进程版本的糗事百科爬虫（常规的爬虫）

import requests
from bs4 import BeautifulSoup
import re

class QSBK(object):
    """Single-threaded crawler for the Qiushibaike "8hr" listing pages.

    ``start_work`` reads the page count from the first listing page, then
    visits every page and prints ``[title, laugh, comment]`` for each entry.
    """

    # Seconds to wait for the server before a request is abandoned, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.base_url = 'https://www.qiushibaike.com/8hr/page/{}/'

    def get_page(self, url):
        """Return the response body of *url* as text, or ``""`` on failure.

        Only network/HTTP errors are turned into ``""``; programming errors
        and ``KeyboardInterrupt`` are allowed to propagate.
        """
        try:
            r = requests.get(url, headers=self.headers,
                             timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
        except requests.RequestException as exc:
            print('请求失败：{} ({})'.format(url, exc))
            return ""
        return r.text

    def get_num(self, url):
        """Return the total number of listing pages advertised at *url*.

        The count is read from the second-to-last ``<li>`` of the
        ``ul.pagination`` element. Returns 0 when the page cannot be fetched
        or the pagination markup is not in the expected shape.
        """
        html = self.get_page(url)
        if not html:
            return 0
        soup = BeautifulSoup(html, 'html.parser')
        ul = soup.find('ul', {'class': 'pagination'})
        items = ul('li') if ul is not None else []
        if len(items) < 2:
            return 0
        span = items[-2].find('span', {'class': 'page-numbers'})
        if span is None:
            return 0
        try:
            return int(span.text.strip())
        except ValueError:
            return 0

    def get_info(self, url):
        """Fetch *url* and print ``[title, laugh, comment]`` for each entry.

        Entries that lack the expected markup are skipped instead of
        aborting the whole page.
        """
        html = self.get_page(url)
        if not html:
            return
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find('div', {'class': 'recommend-article'})
        ul = div.find('ul') if div is not None else None
        if ul is None:
            print('未找到内容列表：{}'.format(url))
            return
        for li in ul('li'):
            link = li.find('a', {'class': 'recmd-content'})
            counts = li.find('div', {'class': 'recmd-num'})
            spans = counts('span') if counts is not None else []
            # Span 0 holds the "laugh" count and span 3 the comment count.
            if link is None or len(spans) < 4:
                continue
            info = [link.text.strip(),
                    spans[0].text.strip(),
                    spans[3].text.strip()]
            print(info)

    def start_work(self):
        """Crawl every listing page, from page 1 to the detected last page."""
        num = self.get_num(self.base_url.format(1))
        for i in range(1, num + 1):
            print('当前爬取第{}页'.format(i))
            try:
                self.get_info(self.base_url.format(i))
            except Exception as exc:
                # Best effort: one broken page must not stop the crawl,
                # but the failure is reported rather than silently dropped.
                print('第{}页爬取失败：{}'.format(i, exc))

if __name__ == '__main__':
    # Start crawling only when run as a script, so importing this module
    # does not trigger network requests.
    qsbk = QSBK()
    qsbk.start_work()

2、多线程介绍（这边只是简单的介绍几点）

进程里包含的执行单元叫线程，一个进程可以包含多个线程。
一个进程的内存空间是共享的，每个进程里的线程都可以使用这个共享空间。
一个线程在使用这个共享空间的时候，其他线程必须等它结束。通过“锁”实现，作用就是防止多个线程同时使用这块内存空间。先使用的线程会将该空间上锁，其他线程就在门口等待，打开“锁”之后再去进行。
进程：表示程序的一次执行；线程：CPU运行的基本调度单位
GIL锁：Python中的执行通行证，而且只有一个，拿到通行证的线程就可以进行CPU执行任务。没有拿到的线程就需要等待。
Python的多线程适用于：大量密集的I/O处理【爬虫就是由大量的I/O组成】
Python的多进程适用于：大量的密集并行计算

3、Queue（队列对象）：使用 import queue（Python 3 中模块名为小写的 queue）

队列是线程间最常用的交换数据的形式

Queue是先进先出，是线程安全的，所以可以不使用“锁”

import queue

# Build a FIFO queue that holds at most 10 items.
myqueue = queue.Queue(10)

# Enqueue a single value ...
myqueue.put(10)

# ... and take it back out again.
res = myqueue.get()

print(res)  # 10

4、多线程版本的糗事百科爬虫（使用Queue）

分析流程图：

import json
import threading
import time
from queue import Empty, Queue

import requests
from bs4 import BeautifulSoup
# Shutdown flags polled by the worker threads' run() loops.
CRAWL_EXIT = False  # becomes True when the crawl (download) threads should stop
PARSE_EXIT = False  # becomes True when the parse threads should stop

class ThreadCrawl(threading.Thread):
    """Worker thread that downloads listing pages.

    Takes page numbers from ``pageQueue``, fetches the matching page and
    puts its HTML text on ``dataQueue``. The loop keeps running until the
    module-level ``CRAWL_EXIT`` flag becomes true.
    """

    # Seconds to block on an empty page queue before re-checking CRAWL_EXIT.
    # Blocking with a timeout avoids spinning a CPU core while idle.
    POLL_INTERVAL = 0.5
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self, threadName, pageQueue, dataQueue):
        """Store the thread's display name and the two queues it works with.

        Args:
            threadName: Label printed when the thread starts and stops.
            pageQueue: Queue of page numbers still to be fetched.
            dataQueue: Queue that receives the HTML text of fetched pages.
        """
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def run(self):
        """Fetch queued pages until ``CRAWL_EXIT`` is set."""
        print('启动：{}'.format(self.threadName))
        while not CRAWL_EXIT:
            try:
                # FIFO: the oldest page number comes out first. Empty is
                # raised if nothing arrives within POLL_INTERVAL seconds.
                page = self.pageQueue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            url = 'https://www.qiushibaike.com/8hr/page/{}/'.format(page)
            try:
                r = requests.get(url, headers=self.headers,
                                 timeout=self.REQUEST_TIMEOUT)
                r.raise_for_status()
            except requests.RequestException as exc:
                # Report the failed page instead of silently losing it.
                print('采集失败：{} ({})'.format(url, exc))
                continue
            self.dataQueue.put(r.text)
        print('结束：{}'.format(self.threadName))


class ThreadParse(threading.Thread):
    """Worker thread that parses downloaded pages.

    Takes HTML text from ``dataQueue``, extracts the title, laugh count and
    comment count of each entry and appends them to ``qsbk.json`` as one JSON
    object per line. The loop keeps running until the module-level
    ``PARSE_EXIT`` flag becomes true.
    """

    # Seconds to block on an empty data queue before re-checking PARSE_EXIT.
    # Blocking with a timeout avoids spinning a CPU core while idle.
    POLL_INTERVAL = 0.5
    # Shared by every parser thread so that lines appended to qsbk.json by
    # different threads cannot interleave.
    _write_lock = threading.Lock()

    def __init__(self, threadName, dataQueue):
        """Store the thread's display name and the queue it reads from.

        Args:
            threadName: Label printed when the thread starts and stops.
            dataQueue: Queue of HTML pages waiting to be parsed.
        """
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def run(self):
        """Parse queued pages until ``PARSE_EXIT`` is set."""
        print('启动：{}'.format(self.threadName))
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            try:
                self.parse(html)
            except Exception as exc:
                # Keep the worker alive on a bad page, but report the
                # failure rather than silently dropping it.
                print('解析失败：{}'.format(exc))
        print('结束：{}'.format(self.threadName))

    def parse(self, html):
        """Extract every entry from *html* and append it to ``qsbk.json``.

        Entries that lack the expected markup are skipped. Nothing is
        written when the page contains no recognisable entries.
        """
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find('div', {'class': 'recommend-article'})
        ul = div.find('ul') if div is not None else None
        if ul is None:
            return
        lines = []
        for li in ul('li'):
            link = li.find('a', {'class': 'recmd-content'})
            counts = li.find('div', {'class': 'recmd-num'})
            spans = counts('span') if counts is not None else []
            # Span 0 holds the "laugh" count and span 3 the comment count.
            if link is None or len(spans) < 4:
                continue
            item = {'title': link.text.strip(),
                    'laugh': spans[0].text.strip(),
                    'comment': spans[3].text.strip()}
            lines.append(json.dumps(item, ensure_ascii=False) + '\n')
        if not lines:
            return
        # Open the file once per page and write all of its lines under the lock.
        with self._write_lock:
            with open('qsbk.json', 'a', encoding='utf-8') as f:
                f.writelines(lines)


if __name__ == '__main__':
    # Page-number queue: pages 1-10, consumed in FIFO order.
    pageQueue = Queue(10)
    for i in range(1, 11):
        pageQueue.put(i)
    # Unbounded queue carrying the HTML text of each downloaded page.
    dataQueue = Queue()

    # Start three crawl (download) threads.
    crawllist = ['采集线程1', '采集线程2', '采集线程3']
    threadcrawl = []
    for threadName in crawllist:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Start three parse threads.
    parselist = ['解析线程1', '解析线程2', '解析线程3']
    threadparse = []
    for threadName in parselist:
        thread = ThreadParse(threadName, dataQueue)
        thread.start()
        threadparse.append(thread)

    # Wait until every page number has been taken by a crawl thread.
    # Sleeping between checks avoids burning a CPU core while waiting.
    while not pageQueue.empty():
        time.sleep(0.1)
    # No `global` statement: this code already runs at module scope, and
    # declaring a name global after it has been assigned is a SyntaxError.
    CRAWL_EXIT = True
    print('pageQueue为空，采集线程结束')

    # Crawl threads finish their in-flight request before checking the flag,
    # so after join() every fetched page is on dataQueue.
    for thread in threadcrawl:
        thread.join()

    # Let the parsers drain the remaining pages, then tell them to stop;
    # without this flag they would loop forever and join() would never return.
    while not dataQueue.empty():
        time.sleep(0.1)
    PARSE_EXIT = True
    print('dataQueue为空，解析线程结束')

    for thread in threadparse:
        thread.join()

转载于:https://my.oschina.net/pansy0425/blog/3089765