15 million rows written in 13 minutes, an n-fold efficiency gain over the first version. The speedup comes from three levers visible in the code below: repartitioning the source for write concurrency, batching inserts with rewriteBatchedStatements, and setting isolationLevel to NONE.
package com.xxx.warehouse.service

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions

object Test_tidb2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("Test_tidb2")
      //.master("local[*]")
      .config("spark.driver.allowMultipleContexts", true)
      .config("hive.exec.dynamic.partition.mode", "nonstrict")
      .enableHiveSupport()
      .getOrCreate()
    //spark.sparkContext.setLogLevel("ERROR")

    val customer = spark.sql("select user_id,story_id,voice_type from ads.ads_ks_tips_rurn_pay_user_listen_story_finish_a_d where user_part = '1'")

    // Repartition the source so the load is balanced across nodes and writes
    // run concurrently: each partition opens its own JDBC connection.
    val df = customer.repartition(32)

    df.write
      .mode("append")
      .format("jdbc")
      .option("driver", "com.mysql.jdbc.Driver")
      // Replace host and port with your own, and be sure to enable
      // rewriteBatchedStatements so batched inserts become multi-row INSERTs.
      .option("url", "jdbc:mysql://tidb.dev.xxx.com:4000/ks_content_assets?rewriteBatchedStatements=true")
      .option("useSSL", "false")
      // As tested, a batch size of 150 rows works well
      .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, 150)
      .option("dbtable", "wxt_test") // target table name; the database is set in the URL
      .option("isolationLevel", "NONE") // recommended for a large DataFrame: skips per-partition transactions
      .option("user", "root") // TiDB user here
      .option("password", "")
      .save()

    spark.stop()
  }
}
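For a quick sanity check after the load finishes, you can read the table back over the same JDBC endpoint and count the rows. A minimal sketch, assuming the same TiDB host, database, credentials, and table name as above (Verify_tidb_write is a hypothetical name, not part of the original job):

package com.xxx.warehouse.service

import org.apache.spark.sql.SparkSession

object Verify_tidb_write {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("Verify_tidb_write")
      .getOrCreate()

    // Read wxt_test back through the same JDBC endpoint used by the writer.
    val written = spark.read
      .format("jdbc")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("url", "jdbc:mysql://tidb.dev.xxx.com:4000/ks_content_assets?useSSL=false")
      .option("dbtable", "wxt_test")
      .option("user", "root")
      .option("password", "")
      .load()

    // count() scans the table through the connector, so it is not instant,
    // but it is fine as a one-off check that all the rows arrived.
    println(s"rows in wxt_test: ${written.count()}")

    spark.stop()
  }
}

If the count matches the row count of the source query, the append completed without silently dropped batches.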