#!/usr/bin/env python
#-*- coding:utf-8 -*-
#-------------------------------------------------------------------------------
# Name: HightWord.py
# Purpose: 文章采集工具
#
# Author: wolfxin2010
# QQ:361248642
#
# Created: 5/7/2014
#-------------------------------------------------------------------------------
import MysqlHelper,random,time,HtmlHelper,re,datetime
from goose import Goose
from goose.text import StopWordsChinese
class AnalyseTags():
def __init__(self):
    """Wire up the shared helpers used by every spider method.

    Creates the HTTP fetcher, the Goose article extractor configured for
    Chinese stop words, and the MySQL connection used for de-duplication
    and persistence.
    """
    # NOTE(review): database credentials are hard-coded here; they should
    # be loaded from configuration or the environment instead.
    self.htmlhelper = HtmlHelper.RequestHtml()
    self.goose = Goose({'stopwords_class': StopWordsChinese})
    self.mysql = MysqlHelper.MySQL('localhost', 'root', '.Cc.bang.518.DAvi..8!')
def spiderIfengNews(self):
url="http://news.ifeng.com/listpage/11502/%s/%s/rtlist.shtml" % (datetime.datetime.now().strftime('%Y%m%d'),random.randint(1,5));
tmphtmlcode=self.htmlhelper.GetHtmlCode(url)
htmlcode=tmphtmlcode
if len(htmlcode)>0:
pattern_url=re.compile(r'<a href="(http://news\.ifeng\.com/a/[0-9\/]+?_0\.shtml)" target="_blank">') #"(http://2014\.qq\.com/a/[0-9/]+?\.htm)"
hoturls=pattern_url.findall(htmlcode)
for hoturl in hoturls:
if len(hoturl)<=0:continue
isExist=self.mysql.ExecQuery("select count(*) from newwordpress.pre_discuzcontent where url='%s'" % hoturl)
if isExist[0][0]>0:continue
self.randomSleep()
try:
if len(hoturl)>5:
wen_htmlcode=self.htmlhelper.GetHtmlCode(hoturl)
if len(wen_htmlcode)>0:
#标题
tmp_title=self.htmlhelper.GetHtmlResults(wen_htmlcode,'h1' , 'id','artical_topic')
if len(tmp_title)<=0:continue
wen_title=HtmlHelper.RequestHtml.StripHtmlTags(r''.join(str(tmp_title[0])))
wen_title=wen_title.replace('(图)','')
#文章
tmp_content=self.htmlhelper.GetHtmlResults(wen_htmlcode,'div' , 'id','main_content')
if len(tmp_content)<=0:continue
wen_content=HtmlHelper.RequestHtml.StripHtmlTags(r''.join(str(tmp_content[0])))
wen_content=wen_content.replace('\n \n','\n')
comments=['', '', '', ''];
hui_htmlcode=self.htmlhelper.GetOpenUrl('http://comment.ifeng.com/get.php?callback=newCommentListCallBack&orderby=&docUrl=%s&format=js&job=1&p=1&pageSize=2&callback=newCommentListCallBack&skey=bd025a' % hoturl)
if len(hui_htmlcode)>0:
pattern_url=re.compile(r'"comment_contents":"([\w\W]+?)"')
tmpcomments=pattern_url.findall(hui_htmlcode)
if len(tmpcomments)>0:
for i in range(len(tmpcomments)):
if i>4:break
comments[i]=eval('u"'+tmpcomments[i]+'"')
comments[i]=comments[i].decode('utf-8')
self.saveTomysql(wen_title,wen_content[:4500],hoturl,comments)
print u'%s==>%s'%(hoturl,wen_title)
else:
print ("【Goose异常】:%s" % (hoturl))
except Exception,ex:
print 'spiderIfengNews:%s-->%s-->%s\n' % (hoturl,Exception,ex)
def spiderSohu(self,url=''):
if len(url)<2:return
tmphtmlcode=self.htmlhelper.GetHtmlCode(url)
htmlcode=tmphtmlcode.decode('cp936').encode('utf-8')
if len(htmlcode)>0:
pattern_url=re.compile(r'</em><a href="(http://[\w\W]+?\.shtml)" target="_blank">')
hoturls=pattern_url.findall(htmlcode)
for hoturl in hoturls:
if len(hoturl)<=0:continue
isExist=self.mysql.ExecQuery("select count(*) from newwordpress.pre_newcontent where url='%s'" % hoturl)
if isExist[0][0]>0:continue
if hoturl.find('sina.com')>-1 or hoturl.find('baidu.com')>-1 or hoturl.find('/shipin/')>-1:continue
self.randomSleep()
try:
if len(hoturl)>5:
contents=self.goose.extract(url=hoturl)
if contents.title !=None and len(contents.cleaned_text)>0:
print u'%s---%s' % (hoturl,contents.title)
tmptitle=contents.title
isdelimiter=tmptitle.find('_')
if isdelimiter!=-1:
tmptitle=tmptitle[:isdelimiter]
else:
isdelimiter=tmptitle.find('-')
if isdelimiter!=-1:
tmptitle=tmptitle[:isdelimiter]
else:
tmptitle=tmptitle
self.save2mysql(tmptitle,contents.cleaned_text[:4500],hoturl)
else:
print ("【Goose异常】:%s" % (hoturl))
except Exception,ex:
print 'spiderSohu:%s-->%s-->%s\n' % (hoturl,Exception,ex)
def spiderChinanews(self,url=''):
if len(url)<2:return
tmphtmlcode=self.htmlhelper.GetHtmlCode(url)
htmlcode=tmphtmlcode.decode('cp936').encode('utf-8')
if len(htmlcode)>0:
pattern_url=re.compile(r'<div class="dd_bt"><a href="([\W\w]+?\.shtml)">')
hoturls=pattern_url.findall(htmlcode)
for hoturl in hoturls:
if len(hoturl)<=0:continue
hoturl='http://www.chinanews.com%s' % hoturl
isExist=self.mysql.ExecQuery("select count(*) from newwordpress.pre_newcontent where url='%s'" % hoturl)
if isExist[0][0]>0:continue
if hoturl.find('sina.com')>-1 or hoturl.find('baidu.com')>-1 or hoturl.find('/shipin/')>-1:continue
self.randomSleep()
try:
if len(hoturl)>5:
contents=self.goose.extract(url=hoturl)
if contents.title !=None and len(contents.cleaned_text)>0:
print u'%s---%s' % (hoturl,contents.title)
tmptitle=contents.title
isdelimiter=tmptitle.find('_')
if isdelimiter!=-1:
tmptitle=tmptitle[:isdelimiter]
else:
isdelimiter=tmptitle.find('-')
if isdelimiter!=-1:
tmptitle=tmptitle[:isdelimiter]
else:
tmptitle=tmptitle
self.save2mysql(tmptitle,contents.cleaned_text[:4500],hoturl)
else:
print ("【Goose异常】:%s" % (hoturl))
except Exception,ex:
print 'spiderChinanews:%s-->%s-->%s\n' % (hoturl,Exception,ex)
def spider163News(self):
url="http://news.163.com/special/0001220O/news_json.js?0.%s0661733004760" % random.randint(100,999)
tmphtmlcode=self.htmlhelper.GetHtmlCode(url)
htmlcode=tm