Inserting data into mysql concurrently with python
Preface
This post implements concurrent inserts of 10w (100,000), 100w (1,000,000), and 500w (5,000,000) rows into a MySQL database.
Environment
Item | Description |
---|---|
OS | ubuntu20.04 |
Database | mysql8.0.25 |
python | 3.9.5 |
Tables
create table t_normal (col1 int primary key auto_increment, col2 bigint, col3 timestamp) engine=innodb;
create table t_partition (col1 int primary key auto_increment, col2 bigint, col3 timestamp) engine=innodb partition by hash(col1) partitions 4;
create table t_global (col1 int primary key auto_increment, col2 bigint, col3 timestamp) engine=innodb;
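After an import you can check how evenly hash partitioning spread the rows over the four partitions of t_partition by querying information_schema. This is a quick sanity check, not part of the original script; the credentials are placeholders:
import pymysql

# Hypothetical check; replace host/user/password with real values.
con = pymysql.connect(host='172.21.0.3', port=3306, user='greatdb',
                      password='xxxx', database='information_schema')
cur = con.cursor()
cur.execute(
    "SELECT PARTITION_NAME, TABLE_ROWS FROM PARTITIONS "
    "WHERE TABLE_SCHEMA = 'test' AND TABLE_NAME = 't_partition'")
for name, rows in cur.fetchall():
    print(name, rows)
cur.close()
con.close()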
Building the data
10w.txt/100w.txt/500w.txt
------------------------------------------------------
10138000,2021-06-24 06:10:41
3602000,2021-06-24 06:10:41
25705000,2021-06-24 06:10:41
11145000,2021-06-24 06:10:41
17882000,2021-06-24 06:10:41
10666000,2021-06-24 06:10:41
31532000,2021-06-24 06:10:41
7480000,2021-06-24 06:10:41
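The post does not show how these files were produced. A minimal sketch that emits lines in the same `<bigint>,<timestamp>` shape could look like the following; the value distribution (multiples of 1000) is an assumption based on the sample rows above:
import random
import time

def build_file(path, rows):
    # Fixed timestamp string, matching the sample files above.
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    with open(path, 'w') as f:
        for _ in range(rows):
            # Random bigint; multiples of 1000 mimic the sample values.
            f.write('{},{}\n'.format(random.randint(1, 40000) * 1000, ts))

build_file('10w.txt', 100000)    # 10w  = 100,000 rows
build_file('100w.txt', 1000000)  # 100w = 1,000,000 rows
build_file('500w.txt', 5000000)  # 500w = 5,000,000 rows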
Implementation steps
step1: Connect to the database
def db_connection(self):
    conn = config(sys.argv[1])
    host = conn.configp.get('sqlnode1', 'host')
    port = int(conn.configp.get('sqlnode1', 'port'))
    user = conn.configp.get('sqlnode1', 'user')
    password = conn.configp.get('sqlnode1', 'password')
    database = conn.configp.get('sqlnode1', 'database')
    maxconnections = 15
    # Pass maxconnections by keyword: the second positional parameter of
    # PooledDB is mincached, not maxconnections.
    pool = PooledDB(
        pymysql,
        maxconnections=maxconnections,
        host=host,
        user=user,
        port=port,
        password=password,
        database=database
    )
    return pool
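Each pool.connection() call borrows a connection, and closing it returns it to the pool rather than tearing it down; this is what makes the pool safe to share across the worker threads below. A minimal usage sketch, with pool being the object returned by db_connection above:
# pool is assumed to come from db_connection above.
con = pool.connection()   # borrow a connection from the pool
cur = con.cursor()
cur.execute("SELECT 1")
print(cur.fetchone())     # (1,)
cur.close()
con.close()               # returns the connection to the pool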
The step above relies on Python's configparser module to parse the ini file. The configparser get method is wrapped as follows:
import configparser

class config(object):
    def __init__(self, path):
        self.fpath = path
        self.configp = configparser.ConfigParser()
        self.configp.read(self.fpath, encoding="UTF-8")

    # fetch a value by section name and option key
    def get(self, section, option):
        return self.configp.get(section, option)
The config.ini file:
; define host
[hosts]
sqlnode1=172.21.0.3
sqlnode2=172.21.0.4
sqlnode3=172.21.0.5
; define sqlnode
[sqlnode1]
host=172.21.0.3
port=3306
user=greatdb
; db_connection also reads these two keys; the password is a placeholder,
; and the database name matches the test.* tables used in the SQL below
password=xxxx
database=test
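A quick usage example of the wrapper against this file:
# Reading config.ini with the wrapper class above.
from override_parser import config

conn = config('config.ini')
print(conn.get('sqlnode1', 'host'))   # 172.21.0.3
print(conn.get('hosts', 'sqlnode2'))  # 172.21.0.4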
step2: Read the data and convert each line into a tuple
def getData(self):
    data = []
    with open(sys.argv[2], 'r') as f:
        for line in f:
            line = line.replace('\n', '')
            data.append(tuple(line.split(',')))
    return data
# data format: [(xx, xx), (xx, xx), ...]
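A stand-alone illustration of what the parsing produces. Note that the values stay strings; MySQL converts the quoted numerics to bigint on insert:
# Parsing step applied to two sample lines from 10w.txt.
raw = ["10138000,2021-06-24 06:10:41\n", "3602000,2021-06-24 06:10:41\n"]
data = [tuple(line.replace('\n', '').split(',')) for line in raw]
print(data)
# [('10138000', '2021-06-24 06:10:41'), ('3602000', '2021-06-24 06:10:41')]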
step3: Insert the data
def mysql_insert(self, *args):
    con = self.pool.connection()
    cur = con.cursor()
    for table in table_list:
        sql = "INSERT INTO test.{}(col2,col3) VALUES(%s, %s)".format(table)
        try:
            cur.executemany(sql, args)
            con.commit()
        except Exception as e:
            con.rollback()
            logger.error("SQL FAIL: ==>> {},result: ==>> {}".format(sql, e))
    # Close the cursor before returning the connection to the pool.
    cur.close()
    con.close()
executemany performs batched execution (for an INSERT ... VALUES statement, pymysql rewrites the batch into a single multi-row insert), which is much faster than calling execute once per row.
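A rough way to see the difference, with con/cur obtained from the pool as in step3; the actual timings depend entirely on the environment:
import time

rows = [(i, '2021-06-24 06:10:41') for i in range(10000)]

# Row-by-row: one statement per row.
st = time.time()
for row in rows:
    cur.execute("INSERT INTO test.t_normal(col2,col3) VALUES(%s, %s)", row)
con.commit()
print('execute loop:', round(time.time() - st, 3), 's')

# Batched: pymysql rewrites this into one multi-row INSERT.
st = time.time()
cur.executemany("INSERT INTO test.t_normal(col2,col3) VALUES(%s, %s)", rows)
con.commit()
print('executemany :', round(time.time() - st, 3), 's')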
step4: Clean up the data
When running this program more than once, the previously inserted data must be cleared before importing again!
def delete_data(self):
    st = time.time()
    con = self.pool.connection()
    cur = con.cursor()
    for table in table_list:
        sql = "TRUNCATE TABLE test.{}".format(table)
        cur.execute(sql)
    # TRUNCATE is DDL in MySQL and commits implicitly, so no explicit
    # commit is needed here.
    cur.close()
    con.close()
    logger.info("Clean data. time cost: {} seconds".format(round(time.time() - st, 3)))
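TRUNCATE also resets the AUTO_INCREMENT counter, so col1 starts from 1 on every run. An optional sanity check after cleanup, with con/cur obtained from the pool exactly as in delete_data above:
# Verify the tables are empty before the next import.
for table in table_list:
    cur.execute("SELECT COUNT(*) FROM test.{}".format(table))
    print(table, cur.fetchone()[0])  # expect 0 after TRUNCATE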
step5: Set the concurrency level
In this environment the maximum number of pool connections is 15 (maxconnections) and the maximum concurrency is 10 (Queue(maxsize=10)). Remember that the maximum concurrency must never exceed the maximum number of connections!
def task(self):
    q = Queue(maxsize=10)  # maximum queue size = number of worker threads
    while self.data:
        content = self.data.pop()
        t = threading.Thread(target=self.mysql_insert, args=(content,))
        q.put(t)
        # Start a batch once the queue is full or the data is exhausted.
        if q.full() or len(self.data) == 0:
            thread_list = []
            while not q.empty():
                t = q.get()
                thread_list.append(t)
                t.start()
            for t in thread_list:
                t.join()
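The batching logic above creates one thread per row and starts them in groups of ten. As a variant (not the original design), the same ten-way cap can be expressed with concurrent.futures, which reuses a fixed pool of worker threads instead of creating one per row:
from concurrent.futures import ThreadPoolExecutor

def task(self):
    # Ten workers mirror Queue(maxsize=10); the with-block waits for all
    # submitted inserts to finish before returning.
    with ThreadPoolExecutor(max_workers=10) as executor:
        for content in self.data:
            executor.submit(self.mysql_insert, content)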
The complete code:
import pymysql
import threading
import sys
import time
from override_parser import config
from queue import Queue
# For DBUtils >= 2.0 the import path is: from dbutils.pooled_db import PooledDB
from DBUtils.PooledDB import PooledDB
from general import logger


class ThreadInsert(object):
    def __init__(self):
        start_time = time.time()
        self.pool = self.db_connection()
        self.data = self.getData()
        self.delete_data()
        self.task()
        logger.info("Concurrent insert data success, time cost: {} seconds".format(round(time.time() - start_time, 5)))

    def db_connection(self):
        conn = config(sys.argv[1])
        host = conn.configp.get('sqlnode1', 'host')
        port = int(conn.configp.get('sqlnode1', 'port'))
        user = conn.configp.get('sqlnode1', 'user')
        password = conn.configp.get('sqlnode1', 'password')
        database = conn.configp.get('sqlnode1', 'database')
        maxconnections = 15
        # Keyword argument: the second positional parameter of PooledDB is
        # mincached, not maxconnections.
        pool = PooledDB(
            pymysql,
            maxconnections=maxconnections,
            host=host,
            user=user,
            port=port,
            password=password,
            database=database
        )
        return pool

    def getData(self):
        data = []  # collect the rows as tuples
        with open(sys.argv[2], 'r') as f:
            for line in f:
                line = line.replace('\n', '')
                data.append(tuple(line.split(',')))
        return data

    def delete_data(self):
        st = time.time()
        con = self.pool.connection()
        cur = con.cursor()
        for table in table_list:
            sql = "TRUNCATE TABLE test.{}".format(table)
            cur.execute(sql)
        # TRUNCATE is DDL and commits implicitly; no explicit commit needed.
        cur.close()
        con.close()
        logger.info("Clean data. time cost: {} seconds".format(round(time.time() - st, 3)))

    def mysql_insert(self, *args):
        con = self.pool.connection()
        cur = con.cursor()
        for table in table_list:
            sql = "INSERT INTO test.{}(col2,col3) VALUES(%s, %s)".format(table)
            try:
                cur.executemany(sql, args)
                con.commit()
            except Exception as e:
                con.rollback()
                logger.error("SQL FAIL: ==>> {},result: ==>> {}".format(sql, e))
        # Close the cursor before returning the connection to the pool.
        cur.close()
        con.close()

    def task(self):
        q = Queue(maxsize=10)  # maximum queue size = number of worker threads
        while self.data:
            content = self.data.pop()
            t = threading.Thread(target=self.mysql_insert, args=(content,))
            q.put(t)
            # Start a batch once the queue is full or the data is exhausted.
            if q.full() or len(self.data) == 0:
                thread_list = []
                while not q.empty():
                    t = q.get()
                    thread_list.append(t)
                    t.start()
                for t in thread_list:
                    t.join()


if __name__ == '__main__':
    table_list = ["t_global", "t_normal", "t_partition"]  # tables to insert into concurrently
    ThreadInsert()
Run
python3 ThreadInsert.py config.ini 10w.txt