15 million rows written in 13 minutes, an n-fold efficiency gain over the first version. The speedup comes from three levers visible in the code below: repartitioning the source for write concurrency, batching inserts with rewriteBatchedStatements, and setting isolationLevel to NONE.
package com.xxx.warehouse.service

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions

object Test_tidb2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("Test_tidb2")
      //.master("local[*]")
      .config("spark.driver.allowMultipleContexts", true)
      .config("hive.exec.dynamic.partition.mode", "nonstrict")
      .enableHiveSupport()
      .getOrCreate()
    //spark.sparkContext.setLogLevel("ERROR")

    val customer = spark.sql("select user_id,story_id,voice_type from ads.ads_ks_tips_rurn_pay_user_listen_story_finish_a_d where user_part = '1'")

    // Repartition the source so the load is balanced across nodes and writes
    // run concurrently: each partition opens its own JDBC connection.
    val df = customer.repartition(32)

    df.write
      .mode("append")
      .format("jdbc")
      .option("driver", "com.mysql.jdbc.Driver")
      // Replace host and port with your own, and be sure to enable
      // rewriteBatchedStatements so batched inserts become multi-row INSERTs.
      .option("url", "jdbc:mysql://tidb.dev.xxx.com:4000/ks_content_assets?rewriteBatchedStatements=true")
      .option("useSSL", "false")
      // As tested, a batch size of 150 rows works well
      .option(JDBCOptions.JDBC_BATCH_INSERT_SIZE, 150)
      .option("dbtable", "wxt_test") // target table name; the database is set in the URL
      .option("isolationLevel", "NONE") // recommended for a large DataFrame: skips per-partition transactions
      .option("user", "root") // TiDB user here
      .option("password", "")
      .save()

    spark.stop()
  }
}
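For a quick sanity check after the load finishes, you can read the table back over the same JDBC endpoint and count the rows. A minimal sketch, assuming the same TiDB host, database, credentials, and table name as above (Verify_tidb_write is a hypothetical name, not part of the original job):

package com.xxx.warehouse.service

import org.apache.spark.sql.SparkSession

object Verify_tidb_write {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("Verify_tidb_write")
      .getOrCreate()

    // Read wxt_test back through the same JDBC endpoint used by the writer.
    val written = spark.read
      .format("jdbc")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("url", "jdbc:mysql://tidb.dev.xxx.com:4000/ks_content_assets?useSSL=false")
      .option("dbtable", "wxt_test")
      .option("user", "root")
      .option("password", "")
      .load()

    // count() scans the table through the connector, so it is not instant,
    // but it is fine as a one-off check that all the rows arrived.
    println(s"rows in wxt_test: ${written.count()}")

    spark.stop()
  }
}

If the count matches the row count of the source query, the append completed without silently dropped batches.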