#!/usr/bin/env python
#-*- coding:utf-8 -*-
#-------------------------------------------------------------------------------
# Name: HightWord.py
# Purpose: 文章采集工具
#
# Author: wolfxin2010
# QQ:361248642
#
# Created: 5/7/2014
#-------------------------------------------------------------------------------
import MysqlHelper,random,time,HtmlHelper,re,datetime
from goose import Goose
from goose.text import StopWordsChinese
class AnalyseTags():
def __init__(self):
    """Wire up the shared helpers used by every spider method.

    Creates the HTTP fetcher, the Goose article extractor configured for
    Chinese stop words, and the MySQL connection used for de-duplication
    and persistence.
    """
    # NOTE(review): database credentials are hard-coded here; they should
    # be loaded from configuration or the environment instead.
    self.htmlhelper = HtmlHelper.RequestHtml()
    self.goose = Goose({'stopwords_class': StopWordsChinese})
    self.mysql = MysqlHelper.MySQL('localhost', 'root', '.Cc.bang.518.DAvi..8!')
def spiderIfengNews(self):
url="http://news.ifeng.com/listpage/11502/%s/%s/rtlist.shtml" % (datetime.datetime.now().strftime('%Y%m%d'),random.randint(1,5));
tmphtmlcode=self.htmlhelper.GetHtmlCode(url)
htmlcode=tmphtmlcode
if len(htmlcode)>0:
pattern_url=re.compile(r'<a href="(http://news\.ifeng\.com/a/[0-9\/]+?_0\.shtml)" target="_blank">') #"(http://2014\.qq\.com/a/[0-9/]+?\.htm)"
hoturls=pattern_url.findall(htmlcode)
for hoturl in hoturls:
if len(hoturl)<=0:continue
isExist=self.mysql.ExecQuery("select count(*) from newwordpress.pre_discuzcontent where url='%s'" % hoturl)
if isExist[0][0]>0:continue
self.randomSleep()
try:
if len(hoturl)>5:
wen_htmlcode=self.htmlhelper.GetHtmlCode(hoturl)
if len(wen_htmlcode)>0:
#标题
tmp_title=self.htmlhelper.GetHtmlResults(wen_htmlcode,'h1' , 'id','artical_topic')
if len(tmp_title)<=0:continue
wen_title=HtmlHelper.RequestHtml.StripHtmlTags(r''.join(str(tmp_title[0])))
wen_title=wen_title.replace('(图)','')
#文章
tmp_content=self.htmlhelper.GetHtmlResults(wen_htmlcode,'div' , 'id','main_content')
if len(tmp_content)<=0:continue
wen_content=HtmlHelper.RequestHtml.StripHtmlTags(r''.join(str(tmp_content[0])))
wen_content=wen_content.replace('\n \n','\n')
comments=['', '', '', ''];
hui_htmlcode=self.htmlhelper.GetOpenUrl('http://comment.ifeng.com/get.php?callback=newCommentListCallBack&orderby=&docUrl=%s&format=js&job=1&p=1&pageSize=2&callback=newCommentListCallBack&skey=bd025a' % hoturl)
if len(hui_htmlcode)>0:
pattern_url=re.compile(r'"comment_contents":"([\w\W]+?)"')
tmpcomments=pattern_url.findall(hui_htmlcode)
if len(tmpcomments)>0:
for i in range(len(tmpcomments)):
if i>4:break
comments[i]=eval('u"'+tmpcomments[i]+'"')
comments[i]=comments[i].decode('utf-8')
self.saveTomysql(wen_title,wen_content[:4500],hoturl,comments)
print u'%s==>%s'%(hoturl,wen_title)
else:
print ("【Goose异常】:%s" % (hoturl))
except Exception,ex:
print 'spiderIfengNews:%s-->%s-->%s\n' % (hoturl,Exception,ex)
def spiderSohu(self,url=''):
if len(url)<2:return
tmphtmlcode=self.htmlhelper.GetHtmlCode(url)
htmlcode=tmphtmlcode.decode('cp936').encode('utf-8')
if len(htmlcode)>0:
pattern_url=re.compile(r'</em><a href="(http://[\w\W]+?\.shtml)" target="_blank">')
hoturls=pattern_url.findall(htmlcode)
for hoturl in hoturls:
if len(hoturl)<=0:continue
isExist=self.mysql.ExecQuery("select count(*) from newwordpress.pre_newcontent where url='%s'" % hoturl)
if isExist[0][0]>0:continue
if hoturl.find('sina.com')>-1 or hoturl.find('baidu.com')>-1 or hoturl.find('/shipin/')>-1:continue
self.randomSleep()
try:
if len(hoturl)>5:
contents=self.goose.extract(url=hoturl)
if contents.title !=None and len(contents.cleaned_text)>0:
print u'%s---%s' % (hoturl,contents.title)
tmptitle=contents.title
isdelimiter=tmptitle.find('_')
if isdelimiter!=-1:
tmptitle=tmptitle[:isdelimiter]
else:
isdelimiter=tmptitle.find('-')
if isdelimiter!=-1:
tmptitle=tmptitle[:isdelimiter]
else:
tmptitle=tmptitle
self.save2mysql(tmptitle,contents.cleaned_text[:4500],hoturl)
else:
print ("【Goose异常】:%s" % (hoturl))
except Exception,ex:
print 'spiderSohu:%s-->%s-->%s\n' % (hoturl,Exception,ex)
def spiderChinanews(self,url=''):
if len(url)<2:return
tmphtmlcode=self.htmlhelper.GetHtmlCode(url)
htmlcode=tmphtmlcode.decode('cp936').encode('utf-8')
if len(htmlcode)>0:
pattern_url=re.compile(r'<div class="dd_bt"><a href="([\W\w]+?\.shtml)">')
hoturls=pattern_url.findall(htmlcode)
for hoturl in hoturls:
if len(hoturl)<=0:continue
hoturl='http://www.chinanews.com%s' % hoturl
isExist=self.mysql.ExecQuery("select count(*) from newwordpress.pre_newcontent where url='%s'" % hoturl)
if isExist[0][0]>0:continue
if hoturl.find('sina.com')>-1 or hoturl.find('baidu.com')>-1 or hoturl.find('/shipin/')>-1:continue
self.randomSleep()
try:
if len(hoturl)>5:
contents=self.goose.extract(url=hoturl)
if contents.title !=None and len(contents.cleaned_text)>0:
print u'%s---%s' % (hoturl,contents.title)
tmptitle=contents.title
isdelimiter=tmptitle.find('_')
if isdelimiter!=-1:
tmptitle=tmptitle[:isdelimiter]
else:
isdelimiter=tmptitle.find('-')
if isdelimiter!=-1:
tmptitle=tmptitle[:isdelimiter]
else:
tmptitle=tmptitle
self.save2mysql(tmptitle,contents.cleaned_text[:4500],hoturl)
else:
print ("【Goose异常】:%s" % (hoturl))
except Exception,ex:
print 'spiderChinanews:%s-->%s-->%s\n' % (hoturl,Exception,ex)
def spider163News(self):
url="http://news.163.com/special/0001220O/news_json.js?0.%s0661733004760" % random.randint(100,999)
tmphtmlcode=self.htmlhelper.GetHtmlCode(url)
htmlcode=tm