Scraping Baidu Tieba with XPath

The script below fetches one page of a Tieba thread, extracts each poster's name and reply text with XPath, and writes the results to an Excel file with xlwt.

# coding: utf-8
# __author__ = 'wang'
import requests
import xlwt
from lxml import etree


class TB(object):
    def get_page_code(self, page_number):
        """
        Fetch the source of the requested page of the thread.
        :param page_number: which page to fetch
        :return: an lxml document tree
        """
        # The pn query parameter selects the page of the thread.
        url = 'http://tieba.baidu.com/p/5068350333?pn=' + str(page_number)
        content = requests.get(url).content
        # The parser argument lets us override the default parser; HTMLParser
        # lets us specify the encoding used to decode the fetched data.
        html_obj = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        return html_obj

    def get_content(self, html_obj):
        data = []
        content_list = html_obj.xpath('//div[contains(@class,"l_post")]')
        for div_obj in content_list:
            # There are other tags between the post <div> and the author <a>,
            # so we locate the element with './/'.
            # './/' searches from the current node (div_obj) for the given tag,
            # ignoring whatever nodes sit between the current node and the match.
            username = div_obj.xpath('.//a[contains(@class,"p_author_name")]/text()')[0]
            text_list = div_obj.xpath('.//div[contains(@id,"post_content")]/text()')
            text_result = ''
            for text in text_list:
                text_result += text
            data.append((username, text_result))
        return data


if __name__ == '__main__':
    tb = TB()
    html_obj = tb.get_page_code(1)
    data = tb.get_content(html_obj)
    # Write the scraped data to an Excel workbook.
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('贴吧数据')
    sheet.write(0, 0, u'贴吧昵称')   # poster nickname
    sheet.write(0, 1, u'跟帖内容')   # reply content
    row = 1
    for data_tuple in data:
        username = data_tuple[0]
        content = data_tuple[1]
        sheet.write(row, 0, username)
        sheet.write(row, 1, content)
        row += 1
    workbook.save(u'贴吧信息.xls')
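The comments in get_content hinge on why './/' is needed instead of a plain child path. The standalone snippet below is my own illustration (not part of the original post, using a made-up HTML fragment shaped like a Tieba post block) of how './/' matches descendants regardless of the wrapper tags in between:

# A minimal sketch, assuming an HTML structure similar to a Tieba post block.
from lxml import etree

snippet = '''
<div class="l_post">
  <div class="d_author">
    <a class="p_author_name">some_user</a>
  </div>
  <div id="post_content_1">hello tieba</div>
</div>
'''

div_obj = etree.HTML(snippet).xpath('//div[contains(@class,"l_post")]')[0]

# A direct child lookup fails because <a> is nested inside another <div>.
print(div_obj.xpath('a[contains(@class,"p_author_name")]/text()'))     # []
# A relative descendant lookup with './/' succeeds despite the wrapper.
print(div_obj.xpath('.//a[contains(@class,"p_author_name")]/text()'))  # ['some_user']
print(div_obj.xpath('.//div[contains(@id,"post_content")]/text()'))    # ['hello tieba']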