Below is code that uses XPath to scrape the reply author, reply content, and reply time of every post in a Baidu Tieba thread.
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json
import threading

# Lock used to serialize writes from worker threads to the shared output file.
write_lock = threading.Lock()

def towrite(contentdict):
    # The thread pool calls this concurrently; without the lock, the three
    # lines of one record could interleave with another thread's record.
    with write_lock:
        f.write('Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
        f.write('Reply content: ' + str(contentdict['topic_reply_content']) + '\n')
        f.write('Reply author: ' + str(contentdict['user_name']) + '\n\n')
def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)  # parse the page into an object that XPath can query
    # Grab the container div of each reply on the page. @class matches the
    # attribute string exactly, so the trailing space in the class value is
    # deliberate -- it mirrors the page's markup.
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    for each in content_field:
        # Each reply div carries a data-field attribute holding JSON metadata;
        # json.loads parses that JSON into a Python dict.
        reply_info = json.loads(each.xpath('@data-field')[0])
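        # For illustration only: based on the keys accessed below, the
        # data-field JSON is assumed to look roughly like
        #   {"author": {"user_name": "...", ...},
        #    "content": {"date": "...", ...}, ...}
        # Other fields may be present but are not relied on here.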
        # "Grab the big block first, then the small pieces inside it": since
        # `each` is already one reply's container, the XPath below is relative
        # and needs no leading // (see the standalone sketch after this script).
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/'
                             'div[@class="d_post_content j_d_post_content clearfix"]/text()')[0]
        content = content.strip()
        reply_time = reply_info['content']['date']
        print(content)
        print(reply_time)
        print(author)
        item = {
            'user_name': author,
            'topic_reply_content': content,
            'topic_reply_time': reply_time,
        }
        towrite(item)
if __name__ == '__main__':
    pool = ThreadPool(8)
    # Module-level global shared by all worker threads via towrite().
    f = open('content.txt', 'a', encoding='utf-8')
    pages = []
    for i in range(1, 31):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        pages.append(newpage)
    result = pool.map(spider, pages)
    pool.close()
    pool.join()
    f.close()
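
Here is a minimal standalone sketch of the "grab the big block first, then the small pieces" idea used in spider(). The HTML snippet and class names below are invented for illustration, not Tieba's real markup. The point is that an absolute path starting with // searches the whole document, while a relative path evaluated on an already-selected element searches only inside that element, which both avoids cross-matching between replies and keeps the expressions short.

from lxml import etree

html = '''
<div class="reply"><span class="author">alice</span><p>first</p></div>
<div class="reply"><span class="author">bob</span><p>second</p></div>
'''
selector = etree.HTML(html)
# Step 1: grab the big blocks -- one element per reply.
replies = selector.xpath('//div[@class="reply"]')
for reply in replies:
    # Step 2: grab the small pieces with a relative path (no leading //),
    # so the search is scoped to this reply only.
    author = reply.xpath('span[@class="author"]/text()')[0]
    text = reply.xpath('p/text()')[0]
    print(author, text)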