Scraping OSChina news with Python (using BeautifulSoup)
Result
Code
#coding=utf8
import requests
import xlwt
import time
from bs4 import BeautifulSoup

# Workbook that will hold the scraped news summaries
myfile = xlwt.Workbook()
table1 = myfile.add_sheet(u"9.9PDD", cell_overwrite_ok=True)
table1.write(0, 0, u"News")

# Send a browser-like User-Agent so the site does not reject the request
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0'
headers = {'User-Agent': user_agent}

class sousuo():
    def __init__(self, url, table):
        self.url = url
        self.table = table

    def chaxun(self):
        url = self.url
        r = requests.get(url, headers=headers)
        # Encoding: the OSChina pages are served as UTF-8
        r.encoding = 'UTF-8'
        html = r.text
        soup = BeautifulSoup(html, "html.parser")
        print(soup.title)
        # News summaries: write each one into the next spreadsheet row
        i = 1
        for tag in soup.find_all(class_="sc sc-text text-gradient wrap summary"):
            if tag.string is not None:
                print(tag.string)
                self.table.write(i, 0, tag.string)
                i += 1

s1 = sousuo('https://www.oschina.net/news/project', table1)
s1.chaxun()
# xlwt writes the legacy .xls (BIFF) format, so name the file accordingly
filename = time.strftime('%Y%m%d%H%M%S', time.localtime()) + "oschina.xls"
myfile.save(filename)
print(u"Scraping OSChina news with Python (using BeautifulSoup) finished: %s" % time.strftime('%Y%m%d%H%M%S', time.localtime()))
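
To confirm that the spreadsheet was written correctly, the saved file can be read back with xlrd, which handles the .xls format that xlwt produces. The sketch below is a minimal check, assuming the script above has already run; the check_output helper name and the example path are illustrative, not part of the original script.

import xlrd

def check_output(path):
    # Open the .xls workbook written by xlwt and print the first column
    book = xlrd.open_workbook(path)
    sheet = book.sheet_by_index(0)
    print("rows written: %d" % sheet.nrows)
    for row in range(sheet.nrows):
        print(sheet.cell_value(row, 0))

# Example (the timestamped filename is whatever the script generated):
# check_output('20240101093000oschina.xls')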