Below is code that uses XPath to scrape the reply author, reply content, and reply time of every post in a Baidu Tieba thread.
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json
import threading

# Lock used to serialize writes from worker threads to the shared output file.
write_lock = threading.Lock()

def towrite(contentdict):
    # The thread pool calls this concurrently; without the lock, the three
    # lines of one record could interleave with another thread's record.
    with write_lock:
        f.write('Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
        f.write('Reply content: ' + str(contentdict['topic_reply_content']) + '\n')
        f.write('Reply author: ' + str(contentdict['user_name']) + '\n\n')
def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)  # parse the page into an object that XPath can query
    # Grab the container div of each reply on the page. @class matches the
    # attribute string exactly, so the trailing space in the class value is
    # deliberate -- it mirrors the page's markup.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for each in content_field:
        # Each reply div carries a data-field attribute holding JSON metadata;
        # json.loads parses that JSON into a Python dict.
        reply_info = json.loads(each.xpath('@data-field')[0])
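        # For illustration only: based on the keys accessed below, the
        # data-field JSON is assumed to look roughly like
        #   {"author": {"user_name": "...", ...},
        #    "content": {"date": "...", ...}, ...}
        # Other fields may be present but are not relied on here.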
        # "Grab the big block first, then the small pieces inside it": since
        # `each` is already one reply's container, the XPath below is relative
        # and needs no leading // (see the standalone sketch after this script).
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/'
                             'div[@class="d_post_content j_d_post_content clearfix"]/text()')[0]
        content = content.strip()
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        item = {
            'user_name': author,
            'topic_reply_content': content,
            'topic_reply_time': reply_time,
        }
        towrite(item)
if __name__ == '__main__':
    pool = ThreadPool(8)
    # Module-level global shared by all worker threads via towrite().
    f = open('content.txt', 'a', encoding='utf-8')
    pages = []
    for i in range(1, 31):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        pages.append(newpage)
    result = pool.map(spider, pages)
    pool.close()
    pool.join()
    f.close()
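
Here is a minimal standalone sketch of the "grab the big block first, then the small pieces" idea used in spider(). The HTML snippet and class names below are invented for illustration, not Tieba's real markup. The point is that an absolute path starting with // searches the whole document, while a relative path evaluated on an already-selected element searches only inside that element, which both avoids cross-matching between replies and keeps the expressions short.

from lxml import etree

html = '''
<div class="reply"><span class="author">alice</span><p>first</p></div>
<div class="reply"><span class="author">bob</span><p>second</p></div>
'''
selector = etree.HTML(html)
# Step 1: grab the big blocks -- one element per reply.
replies = selector.xpath('//div[@class="reply"]')
for reply in replies:
    # Step 2: grab the small pieces with a relative path (no leading //),
    # so the search is scoped to this reply only.
    author = reply.xpath('span[@class="author"]/text()')[0]
    text = reply.xpath('p/text()')[0]
    print(author, text)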