python豆瓣mysql_Python3.5爬取豆瓣电视剧数据并且同步到mysql中

本文介绍了一款Python程序,该程序能够从豆瓣网站抓取电视剧名称,并将其存储到数据库中。通过调用豆瓣API,可以获取不同类型的电视剧列表,并进行数据清洗,去除特殊符号。之后将清洗后的数据插入到指定的数据库表中,同时同步到另一个表,并触发发布流程。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

#!/usr/local/bin/python

# -*- coding: utf-8 -*-

# Python: 3.5

# Author: zhenghai.zhang@xxx.com

# Program: 爬取豆瓣网站上所有电视剧的名称并写入数据库。

# Version: 0.1

# History: 2017.11.01

import requests,time, pymysql, re, datetime, json

from exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody

host = 'xxx'

user = 'xxx'

passwd = 'xxx'

dbme = 'crawl'

dbtarget = 'back_brace'

table = 'tv_hotwords'

tabledelta = 'tv_hotwords_delta'

tablesync = 'slot_value'

port = 3306

tolist = ['zhenghai.zhang@xxx.com']

def get_tvs(urlbase, page):

try:

url = urlbase + str(page)

print(url)

try:

result = requests.get(url).text

jresult = json.loads(result)

tvs = jresult.get('subjects')

except:

print('爬取' + urlbase + str(page) + '失败!')

time.sleep(2)

return tvs

except:

print('获取第%s页电影列表失败' % page)

def tv_insert(host, user, passwd, dbme, port, table, tvs_list):

conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")

cur = conn.cursor()

new_tvs = []

punc = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.()::。· "

punctuation = punc

for tv in tvs_list:

try:

tv['title'] = re.sub(r"[%s]+" % punctuation, "", tv.get('title'))

cmd = 'insert into %s(tv_id, tv_name) values("%s", "%s")' % (

table, tv.get('id'), tv.get('title'))

cur.execute(cmd)

new_tvs.append(tv)

except:

print(" " * 20, tv.get('title'), "already exists, skip……")

cur.close()

conn.commit()

conn.close()

return new_tvs

def tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, tvs_list, tablesync):

conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme, port=port, charset="utf8")

cur = conn.cursor()

cur.execute("delete from %s " % dbme+"."+tabledelta)

for tv in tvs_list:

try:

cmd = 'insert into %s(tv_id, tv_name) values("%s", "%s")' % (tabledelta, tv['id'], tv['title'])

cmdsync = 'insert into %s(slot_type_id, slot_value, create_by, modify_by, gmt_create, gmt_modify, out_value) values("%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (dbtarget+"."+tablesync, "xxxxxx", tv['title'], "system", "system", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "")

cur.execute(cmd)

cur.execute(cmdsync)

except pymysql.Error:

print(" " * 20, tv['title'], "already exists, skip……")

cur.close()

conn.commit()

conn.close()

def tv_new_to_release(host, user, passwd, dbtarget, port):

conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbtarget, port=port, charset="utf8")

cur = conn.cursor()

try:

cmdbacktoskill = 'insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("BACKBRACE","testpass","SLOT","xxxxxx","init","SLOT_BACKBRACE_TESTPASS" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")' % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

cmdskilltoskillpro = 'insert into back_brace.release_task(app_type,app_status,type,ref_id,status,register_id,create_by,modify_by,gmt_create,gmt_modify) values("SKILL","deploy","SLOT","xxxxxx","init","SLOT_SKILL_DEPLOY" ,"zhenghai.zhang","zhenghai.zhang","%s","%s")' % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

print(cmdbacktoskill)

cur.execute(cmdbacktoskill)

print(cmdskilltoskillpro)

cur.execute(cmdskilltoskillpro)

except pymysql.Error:

print("write into back_brace.release_task error!!!")

cur.close()

conn.commit()

conn.close()

def Email(to, subject, body):

creds = Credentials(

username='xxxxxx',

password='xxxxxx')

account = Account(

primary_smtp_address='xxx@xxx.com',

credentials=creds,

autodiscover=True,

access_type=DELEGATE)

m = Message(

account=account,

subject=subject,

body=HTMLBody(body),

to_recipients=[Mailbox(email_address=to)])

m.send_and_save()

if __name__ == '__main__':

update_tvs = []

pages = 25 # 豆瓣每项电视剧只有前500部

# 美剧 英剧 韩剧 日剧 国产剧 港剧 日本动画 综艺

urlbaselist = ['https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BE%8E%E5%89%A7&sort=recommend&page_limit=20&page_start=',

'https://movie.douban.com/j/search_subjects?type=tv&tag=%E8%8B%B1%E5%89%A7&sort=recommend&page_limit=20&page_start=',

'https://movie.douban.com/j/search_subjects?type=tv&tag=%E9%9F%A9%E5%89%A7&sort=recommend&page_limit=20&page_start=',

'https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E5%89%A7&sort=recommend&page_limit=20&page_start=',

'https://movie.douban.com/j/search_subjects?type=tv&tag=%E5%9B%BD%E4%BA%A7%E5%89%A7&sort=recommend&page_limit=20&page_start=',

'https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%B8%AF%E5%89%A7&sort=recommend&page_limit=20&page_start=',

'https://movie.douban.com/j/search_subjects?type=tv&tag=%E6%97%A5%E6%9C%AC%E5%8A%A8%E7%94%BB&sort=recommend&page_limit=20&page_start=',

'https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BB%BC%E8%89%BA&sort=recommend&page_limit=20&page_start=']

for urlbase in urlbaselist:

for i in range(pages):

print("*"*30, i, "*"*30)

tvs_list = get_tvs(urlbase, i * 20)

new_tvs = tv_insert(host, user, passwd, dbme, port, table, tvs_list)

for tv in new_tvs:

print(tv['title'],"Added")

onetv = {}

onetv["id"] = tv["id"]

onetv["title"] = tv["title"]

update_tvs.append(onetv)

time.sleep(1)

print(update_tvs)

try:

tv_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_tvs, tablesync) # 将增加的电影写入movie_hotwords_delta表中

except:

print("tv update and sync Error!")

try:

tv_new_to_release(host, user, passwd, dbtarget, port)

except:

print("tv_new_to_release error!!!")

subject = '本次新增电视剧名称'

body = "本次新增的电影名称为:


"

for movie in update_tvs:

body += movie['title'] + "
"

for to in tolist:

Email(to, subject, body)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值