Scraping OSChina news with Python (using BeautifulSoup)
Result
Code
#coding=utf8
import requests
import xlwt
import time
from bs4 import BeautifulSoup

# Workbook that will hold the scraped news summaries
myfile = xlwt.Workbook()
table1 = myfile.add_sheet(u"9.9PDD", cell_overwrite_ok=True)
table1.write(0, 0, u"News")

# Send a browser-like User-Agent so the site does not reject the request
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0'
headers = {'User-Agent': user_agent}

class sousuo():
    def __init__(self, url, table):
        self.url = url
        self.table = table

    def chaxun(self):
        url = self.url
        r = requests.get(url, headers=headers)
        # Encoding: the OSChina pages are served as UTF-8
        r.encoding = 'UTF-8'
        html = r.text
        soup = BeautifulSoup(html, "html.parser")
        print(soup.title)
        # News summaries: write each one into the next spreadsheet row
        i = 1
        for tag in soup.find_all(class_="sc sc-text text-gradient wrap summary"):
            if tag.string is not None:
                print(tag.string)
                self.table.write(i, 0, tag.string)
                i += 1

s1 = sousuo('https://www.oschina.net/news/project', table1)
s1.chaxun()
# xlwt writes the legacy .xls (BIFF) format, so name the file accordingly
filename = time.strftime('%Y%m%d%H%M%S', time.localtime()) + "oschina.xls"
myfile.save(filename)
print(u"Scraping OSChina news with Python (using BeautifulSoup) finished: %s" % time.strftime('%Y%m%d%H%M%S', time.localtime()))
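
To confirm that the spreadsheet was written correctly, the saved file can be read back with xlrd, which handles the .xls format that xlwt produces. The sketch below is a minimal check, assuming the script above has already run; the check_output helper name and the example path are illustrative, not part of the original script.

import xlrd

def check_output(path):
    # Open the .xls workbook written by xlwt and print the first column
    book = xlrd.open_workbook(path)
    sheet = book.sheet_by_index(0)
    print("rows written: %d" % sheet.nrows)
    for row in range(sheet.nrows):
        print(sheet.cell_value(row, 0))

# Example (the timestamped filename is whatever the script generated):
# check_output('20240101093000oschina.xls')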