Spark Reading and Writing HBase: Spark Streaming and Spark HBase Operations

This article describes in detail how Spark reads and writes HBase, including writing to HBase in real time from Spark Streaming, integrating with Kafka to achieve exactly-once semantics, consuming several topics at once, and reading and writing HBase through both the RDD and DataFrame APIs. It covers the full path from real-time processing to data persistence.


(1-4) Original source: JasonLee's blog
(5-6) Original source: Lu_Xiao_Yue
(7) Original source: 修行修心

1. Spark Streaming writing to HBase in real time (the saveAsNewAPIHadoopDataset method)

import kafka.PropertiesScalaUtils
import net.sf.json.JSONObject
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import spark.wordcount.kafkaStreams
 
/**
  * Spark Streaming writing to HBase with the new API
  */
object sparkToHbase {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Hbase Test")
    val scc = new StreamingContext(conf, Seconds(1))
    val sc = scc.sparkContext
    val tablename = "test"
    val mode = args(0).toString
    val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase",mode)
    val zk_port = PropertiesScalaUtils.loadProperties("zk_port",mode)
    val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master",mode)
    val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir",mode)
    val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent",mode)
    val topic = PropertiesScalaUtils.loadProperties("topic_combine",mode)
    val broker = PropertiesScalaUtils.loadProperties("broker",mode)
    sc.hadoopConfiguration.set("hbase.zookeeper.quorum",zk_hbase)
    sc.hadoopConfiguration.set("hbase.zookeeper.property.clientPort", zk_port)
    sc.hadoopConfiguration.set("hbase.master", hbase_master)
    sc.hadoopConfiguration.set("hbase.defaults.for.version.skip", "true")
    sc.hadoopConfiguration.set("hhbase.rootdir", hbase_rootdir)
    sc.hadoopConfiguration.set("zookeeper.znode.parent", zookeeper_znode_parent)
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)
    val job = Job.getInstance(sc.hadoopConfiguration)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])  // the values written are Put objects
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    val topicSet = Set(topic)
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest",   //latest;earliest
      "value.deserializer" -> classOf[StringDeserializer] //key,value的反序列化;
      , "key.deserializer" -> classOf[StringDeserializer]
      , "bootstrap.servers" -> broker
      , "group.id" -> "jason_test"
      , "enable.auto.commit" -> (true: java.lang.Boolean)
    )
    kafkaStreams = KafkaUtils.createDirectStream[String, String](
      scc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
    try {
      kafkaStreams.foreachRDD(rdd => {
        if(!rdd.isEmpty()){
          val save_rdd = rdd.map(x => {
            val json = JSONObject.fromObject(x.value())
            val put = new Put(Bytes.toBytes(json.get("rowkey").toString))
            insert_hb(json,put)
            (new ImmutableBytesWritable, put)
          })
          save_rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())
        }
      })
    }catch {
      case e: Exception => println("Error: " + e)
    }
    scc.start()
    scc.awaitTermination()
  }
  def insert_hb(json: JSONObject, onePut: Put): Unit = {
    val keys = json.keySet
    val iterator_redis = keys.iterator
    while (iterator_redis.hasNext) {
      val hb_col = iterator_redis.next().toString
      val col_value = json.get(hb_col).toString
      onePut.addColumn(Bytes.toBytes("f1"), Bytes.toBytes(hb_col), Bytes.toBytes(col_value))
    }
  }
}
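
The examples in this post rely on a small PropertiesScalaUtils helper that the original code does not include. A minimal sketch, assuming the configuration lives in a properties file on the classpath (the file names used here are hypothetical), might look like this:

import java.util.Properties

// Hypothetical sketch of the PropertiesScalaUtils helper used throughout these examples.
// It assumes a "<mode>.properties" file (e.g. local.properties) on the classpath for the
// two-argument form, and a "config.properties" file for the one-argument form.
object PropertiesScalaUtils {
  private def load(fileName: String): Properties = {
    val props = new Properties()
    val in = getClass.getClassLoader.getResourceAsStream(fileName)
    if (in != null) {
      try props.load(in) finally in.close()
    }
    props
  }

  // two-argument form used in sections 1 and 4: a key plus a mode such as "local" or "prod"
  def loadProperties(key: String, mode: String): String =
    load(s"$mode.properties").getProperty(key)

  // one-argument form used in sections 2 and 3
  def loadProperties(key: String): String =
    load("config.properties").getProperty(key)
}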

2. Spark Streaming integrated with Kafka to achieve exactly-once semantics

Manually maintaining Kafka offsets
To achieve exactly-once semantics, I save the offsets myself. They can be kept in ZooKeeper, Kafka, MySQL, HBase, or Redis, whichever fits your situation; here I store them in Redis. Before creating the DStream, check whether this consumer group has consumed anything before: if not, start from the beginning; if it has, resume from the last saved offset.

Versions: Spark 2.2.0, Scala 2.11.8, Kafka 0.10.1, HBase 1.1.2.
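
The offset-restore helper RedisKeysListUtils.getKeysList used below is not shown in the original post. A minimal sketch, assuming offsets are stored under "<topic>_<partition>" keys with the untilOffset as the value (which is how the foreachRDD below saves them), could look like this:

import org.apache.kafka.common.TopicPartition
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

// Hypothetical sketch of RedisKeysListUtils.getKeysList; not part of the original post.
// Assumes keys of the form "<topic>_<partition>" whose value is the last saved untilOffset.
object RedisKeysListUtils {
  def getKeysList(redisHost: String, redisPort: Int, topic: String): Map[TopicPartition, Long] = {
    val jedis = new Jedis(redisHost, redisPort)
    try {
      jedis.keys(topic + "_*").asScala.map { key =>
        val partition = key.substring(key.lastIndexOf("_") + 1).toInt
        new TopicPartition(topic, partition) -> jedis.get(key).toLong
      }.toMap
    } finally {
      jedis.close()
    }
  }
}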

package test
 
import java.util
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.SparkStreamingKafka.{dbIndex, kafkaStreams}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.HTable
import redis.RedisPool
 
object sparkstreaming {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
    val conf = new SparkConf().setAppName("sparkstreaming")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "2000")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.streaming.concurrentJobs", "10")
    conf.set("spark.streaming.kafka.maxRetries", "50")
    val scc = new StreamingContext(conf, Seconds(5))
    val topic = PropertiesScalaUtils.loadProperties("topic")
    val topicSet: Set[String] = Set(topic)
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest",
      "value.deserializer" -> classOf[StringDeserializer]
      , "key.deserializer" -> classOf[StringDeserializer]
      , "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker")
      , "group.id" -> PropertiesScalaUtils.loadProperties("groupId")
      , "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val maxTotal = 200
    val maxIdle = 100
    val minIdle = 10
    val testOnBorrow = false
    val testOnReturn = false
    val maxWaitMillis = 500
    RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)
    val jedis = RedisPool.getPool.getResource
    jedis.select(dbIndex)
    val keys: util.Set[String] = jedis.keys(topic + "*")
    if (keys.size() == 0) {
      kafkaStreams = KafkaUtils.createDirectStream[String, String](
        scc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
    } else {
      val fromOffsets: Map[TopicPartition, Long] = RedisKeysListUtils.getKeysList(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, topic)
      kafkaStreams = KafkaUtils.createDirectStream[String, String](
        scc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams, fromOffsets))
    }
    RedisPool.getPool.returnResource(jedis)
    kafkaStreams.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd.foreachPartition(partition => {
          val conf = HBaseConfiguration.create()
          conf.set("hbase.zookeeper.quorum", PropertiesScalaUtils.loadProperties("zk_hbase")) // ZooKeeper quorum
          conf.set("hbase.zookeeper.property.clientPort", PropertiesScalaUtils.loadProperties("zk_port"))
          conf.set("hbase.master", PropertiesScalaUtils.loadProperties("hbase_master"))
          conf.set("hbase.defaults.for.version.skip", "true")
          conf.set("hbase.rootdir", PropertiesScalaUtils.loadProperties("hbase_rootdir"))
          conf.set("zookeeper.znode.parent", PropertiesScalaUtils.loadProperties("zookeeper_znode_parent"))
          val myTable = new HTable(conf, TableName.valueOf(PropertiesScalaUtils.loadProperties("hbase_table")))
          myTable.setAutoFlush(false, false) // disable auto-flush so puts are buffered
          myTable.setWriteBufferSize(3 * 1024 * 1024)
          partition.foreach(pair => {
            // your own processing logic here
          })
          myTable.flushCommits()
          myTable.close()
          val jedis_jason = RedisPool.getPool.getResource
          jedis_jason.select(dbIndex)
          offsetRanges.foreach { offsetRange =>
            println("partition: " + offsetRange.partition + " fromOffset: " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
            val topic_partition_key_new = offsetRange.topic + "_" + offsetRange.partition
            jedis_jason.set(topic_partition_key_new, offsetRange.untilOffset + "")
          }
          jedis_jason.close()
        })
      }
    })
    scc.start()
    scc.awaitTermination()
  }
}
 

3. Spark Streaming consuming multiple topics at once with exactly-once semantics

The offsets are stored in Redis here; they could just as well be kept in ZooKeeper, Kafka, MySQL, or HBase.

Three topics are used, each with five partitions.
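
As in the previous section, the RedisKeysListUtils.getRedisOffest helper used below is not included in the original post. A minimal sketch, assuming the same "<topic>_<partition>" key convention, might be:

import org.apache.kafka.common.TopicPartition
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

// Hypothetical sketch of RedisKeysListUtils.getRedisOffest for several topics;
// it reuses the "<topic>_<partition>" key convention from the previous section.
object RedisKeysListUtils {
  def getRedisOffest(topics: Array[String], jedis: Jedis): Map[TopicPartition, Long] = {
    topics.flatMap { topic =>
      jedis.keys(topic + "_*").asScala.map { key =>
        val partition = key.substring(key.lastIndexOf("_") + 1).toInt
        new TopicPartition(topic, partition) -> jedis.get(key).toLong
      }
    }.toMap
  }
}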

package spark
 
import java.io.File
import kafka.{PropertiesScalaUtils, RedisKeysListUtils}
import kafka.streamingRedisHive.{dbIndex}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
import redis.RedisPool
 
object moreTopic {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.INFO)
    Logger.getLogger("org.apache.kafka.clients.consumer").setLevel(Level.INFO)
    val warehouseLocation = new File("hdfs://cluster/hive/warehouse").getAbsolutePath
    val spark = SparkSession.builder().appName("Spark Jason").config("spark.sql.warehouse.dir",    warehouseLocation).enableHiveSupport().getOrCreate()
    spark.conf.set("spark.streaming.concurrentJobs", 10)
    spark.conf.set("spark.streaming.kafka.maxRetries", 50)
    spark.conf.set("spark.streaming.stopGracefullyOnShutdown",true)
    spark.conf.set("spark.streaming.backpressure.enabled",true)
    spark.conf.set("spark.streaming.backpressure.initialRate",5000)
    spark.conf.set("spark.streaming.kafka.maxRatePerPartition", 3000)
    @transient
    val sc = spark.sparkContext
    val scc = new StreamingContext(sc, Seconds(2))
    val kafkaParams = Map[String, Object](
      "auto.offset.reset" -> "latest",
      "value.deserializer" -> classOf[StringDeserializer]
      , "key.deserializer" -> classOf[StringDeserializer]
      , "bootstrap.servers" -> PropertiesScalaUtils.loadProperties("broker")
      , "group.id" -> PropertiesScalaUtils.loadProperties("groupId")
      , "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    var stream: InputDStream[ConsumerRecord[String, String]] = null
    val topics = Array("jason_20180519", "jason_0606","jason_test")
    val maxTotal = 200
    val maxIdle = 100
    val minIdle = 10
    val testOnBorrow = false
    val testOnReturn = false
    val maxWaitMillis = 5000
    RedisPool.makePool(PropertiesScalaUtils.loadProperties("redisHost"), PropertiesScalaUtils.loadProperties("redisPort").toInt, maxTotal, maxIdle, minIdle, testOnBorrow, testOnReturn, maxWaitMillis)
    val jedis = RedisPool.getPool.getResource
    jedis.select(dbIndex)
    val keys = jedis.keys(topics(0) + "*")
    val keys_2 = jedis.keys(topics(1) +"*")
    val keys_3 = jedis.keys(topics(2) +"*")
    if(keys.size() == 0 && keys_2.size() == 0 && keys_3.size() == 0){
      println("第一次启动,从头开始消费数据-----------------------------------------------------------")
      stream = KafkaUtils.createDirectStream[String, String](
        scc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    }else{
      println("不是第一次启动,从上次的offest开始消费数据-----------------------------------------------")
      stream = KafkaUtils.createDirectStream[String, String](
        scc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, RedisKeysListUtils.getRedisOffest(topics,jedis)))
    }
    jedis.close()
    stream.foreachRDD(rdd=>{
      if (!rdd.isEmpty()) {
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition(partition=>{
        val o = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
        val jedis_jason = RedisPool.getPool.getResource
        jedis_jason.select(dbIndex)
        partition.foreach(pair=>{
          // your own processing logic here
        })
        offsetRanges.foreach { offsetRange =>
          println("partition : " + offsetRange.partition + " fromOffset:  " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
          val topic_partition_key_new = offsetRange.topic + "_" + offsetRange.partition
          jedis_jason.set(topic_partition_key_new, offsetRange.untilOffset + "")
        }
        jedis_jason.close()
      })
     }
    })
    scc.start()
    scc.awaitTermination()
  }
}

4. Spark reading HBase data (the newAPIHadoopRDD approach)

package hbase
 
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.log4j.{Level, Logger}
import util.PropertiesScalaUtils
import org.apache.spark.sql.SparkSession
 
/**
  * Spark reading data from HBase
  */
object ReadHbase {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    val spark = SparkSession
      .builder
      .appName("read hbase")
      .master("local[4]")
      .config("spark.some.config.option", "config-value")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate
    val sc = spark.sparkContext
    val mode = "local"
    val zk_hbase = PropertiesScalaUtils.loadProperties("zk_hbase",mode)
    val zk_port = PropertiesScalaUtils.loadProperties("zk_port",mode)
    val hbase_master = PropertiesScalaUtils.loadProperties("hbase_master",mode)
    val hbase_rootdir = PropertiesScalaUtils.loadProperties("hbase_rootdir",mode)
    val zookeeper_znode_parent = PropertiesScalaUtils.loadProperties("zookeeper_znode_parent",mode)
    val hbase_table = PropertiesScalaUtils.loadProperties("hbase_table",mode)
 
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", zk_hbase)
    conf.set("hbase.zookeeper.property.clientPort", zk_port)
    conf.set("hbase.master", hbase_master)
    conf.set("hbase.defaults.for.version.skip", "true")
    conf.set("hhbase.rootdir", hbase_rootdir)
    conf.set("zookeeper.znode.parent", zookeeper_znode_parent)
    conf.set(TableInputFormat.INPUT_TABLE, hbase_table)
    val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    hbaseRDD.sample(false, 0.1).foreachPartition(fp => {
      fp.foreach(f=>{
        val rowkey = Bytes.toString(f._2.getRow)
        val InsertTime = Bytes.toString(f._2.getValue("cf1".getBytes,"InsertTime".getBytes))
        val VipPrice = Bytes.toString(f._2.getValue("cf1".getBytes,"VipPrice".getBytes))
        println(s"Row key:$rowkey InsertTime:$InsertTime VipPrice:$VipPrice")
      })
    })
    println("元素的个数:"+hbaseRDD.count())
    sc.stop()
  }
}

5. Spark reading data from HBase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object HbaseRdd1 {

  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    val sc = new SparkContext(new SparkConf())
    // set the table to scan
    conf.set(TableInputFormat.INPUT_TABLE, "student")
    // read the HBase table as an RDD of (ImmutableBytesWritable, Result)
    val stuRDD: RDD[(ImmutableBytesWritable, Result)] = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    val count = stuRDD.count()
    println("Students RDD Count:" + count)
    stuRDD.cache()

    // iterate over the results and print them
    stuRDD.foreach({ case (_,result) =>
      val key = Bytes.toString(result.getRow)
      val name = Bytes.toString(result.getValue("info".getBytes,"name".getBytes))
      val gender = Bytes.toString(result.getValue("info".getBytes,"gender".getBytes))
      val age = Bytes.toString(result.getValue("info".getBytes,"age".getBytes))
      println("Row key:"+key+" Name:"+name+" Gender:"+gender+" Age:"+age)
    })
  }
}

6. Spark writing data to HBase

import org.apache.hadoop.hbase.HBaseConfiguration  
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat  
import org.apache.spark._  
import org.apache.hadoop.mapreduce.Job  
import org.apache.hadoop.hbase.io.ImmutableBytesWritable  
import org.apache.hadoop.hbase.client.Result  
import org.apache.hadoop.hbase.client.Put  
import org.apache.hadoop.hbase.util.Bytes  

object SparkWriteHBase {  

  def main(args: Array[String]): Unit = {  
    val sparkConf = new SparkConf().setAppName("SparkWriteHBase").setMaster("local")  
    val sc = new SparkContext(sparkConf)        
    val tablename = "student"        
    sc.hadoopConfiguration.set(TableOutputFormat.OUTPUT_TABLE, tablename)  

    val job = Job.getInstance(sc.hadoopConfiguration)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])  
    job.setOutputValueClass(classOf[Put])  // the values written are Put objects
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])    

    val indataRDD = sc.makeRDD(Array("3,Rongcheng,M,26","4,Guanhua,M,27")) // build two rows of records
    val rdd = indataRDD.map(_.split(',')).map{ arr => {
      val put = new Put(Bytes.toBytes(arr(0))) // row key
      put.addColumn(Bytes.toBytes("info"),Bytes.toBytes("name"),Bytes.toBytes(arr(1)))   // info:name column
      put.addColumn(Bytes.toBytes("info"),Bytes.toBytes("gender"),Bytes.toBytes(arr(2))) // info:gender column
      put.addColumn(Bytes.toBytes("info"),Bytes.toBytes("age"),Bytes.toBytes(arr(3).toInt)) // info:age column
      (new ImmutableBytesWritable, put)
    }}
    rdd.saveAsNewAPIHadoopDataset(job.getConfiguration())  
  }    
}  

7. Two ways for Spark to read and write HBase (RDD and DataFrame)

7.1 Writing data with saveAsHadoopDataset

import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
//import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
//import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

/**
  * Created by blockchain on 18-9-9 at 3:45 PM in Beijing.
  */

object SparkHBaseRDD {
  def main(args: Array[String]) {
    // suppress unnecessary log output in the terminal
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

    val spark = SparkSession.builder().appName("SparkHBaseRDD").getOrCreate()
    val sc = spark.sparkContext

    val tablename = "SparkHBase"

    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum","localhost")  //设置zooKeeper集群地址,也可以通过将hbase-site.xml导入classpath,但是建议在程序里这样设置
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")       //设置zookeeper连接端口,默认2181
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tablename)

    // initialize the JobConf; this TableOutputFormat is the one in the org.apache.hadoop.hbase.mapred package
    val jobConf = new JobConf(hbaseConf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])

    val indataRDD = sc.makeRDD(Array("2,jack,16", "1,Lucy,15", "5,mike,17", "3,Lily,14"))

    val rdd = indataRDD.map(_.split(',')).map{ arr=>
      /* One Put object is one row; the row key is passed to the constructor.
       * All inserted values must be converted with org.apache.hadoop.hbase.util.Bytes.toBytes.
       * Put.addColumn takes three arguments: column family, column qualifier, value */
      val put = new Put(Bytes.toBytes(arr(0)))
      put.addColumn(Bytes.toBytes("cf1"),Bytes.toBytes("name"),Bytes.toBytes(arr(1)))
      put.addColumn(Bytes.toBytes("cf1"),Bytes.toBytes("age"),Bytes.toBytes(arr(2)))
      (new ImmutableBytesWritable, put)
    }
    rdd.saveAsHadoopDataset(jobConf)

    spark.stop()
  }
}

7.2 Reading data with newAPIHadoopRDD

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
//import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
//import org.apache.hadoop.mapreduce.Job
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession

/**
  * Created by blockchain on 18-9-9 at 3:45 PM in Beijing.
  */

object SparkHBaseRDD {
  def main(args: Array[String]) {
    // suppress unnecessary log output in the terminal
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

    val spark = SparkSession.builder().appName("SparkHBaseRDD").getOrCreate()
    val sc = spark.sparkContext

    val tablename = "SparkHBase"

    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum","localhost")  //设置zooKeeper集群地址,也可以通过将hbase-site.xml导入classpath,但是建议在程序里这样设置
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")       //设置zookeeper连接端口,默认2181
    hbaseConf.set(TableInputFormat.INPUT_TABLE, tablename)
    
    // create the table if it does not exist
    val admin = new HBaseAdmin(hbaseConf)
    if (!admin.isTableAvailable(tablename)) {
      val tableDesc = new HTableDescriptor(TableName.valueOf(tablename))
      tableDesc.addFamily(new HColumnDescriptor("cf1")) // a table needs at least one column family
      admin.createTable(tableDesc)
    }

    // read the data and turn it into an RDD; this TableInputFormat is the one in the org.apache.hadoop.hbase.mapreduce package
    val hBaseRDD = sc.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    hBaseRDD.foreach{ case (_ ,result) =>
      // get the row key
      val key = Bytes.toString(result.getRow)
      // get columns by column family and qualifier
      val name = Bytes.toString(result.getValue("cf1".getBytes,"name".getBytes))
      val age = Bytes.toString(result.getValue("cf1".getBytes,"age".getBytes))
      println("Row key:"+key+"\tcf1.Name:"+name+"\tcf1.Age:"+age)
    }
    admin.close()

    spark.stop()
  }
}

7.3 Spark DataFrame reading and writing HBase via Phoenix

Add the dependencies:

<dependency>
   <groupId>org.apache.phoenix</groupId>
   <artifactId>phoenix-core</artifactId>
   <version>${phoenix.version}</version>
</dependency>

<dependency>
  <groupId>org.apache.phoenix</groupId>
  <artifactId>phoenix-spark</artifactId>
  <version>${phoenix.version}</version>
</dependency>

Code:

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{SaveMode, SparkSession}

/**
  * Created by blockchain on 18-9-9 at 8:33 PM in Beijing.
  */

object SparkHBaseDataFrame {
  def main(args: Array[String]) {
    // suppress unnecessary log output in the terminal
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)

    val spark = SparkSession.builder().appName("SparkHBaseDataFrame").getOrCreate()

    val url = s"jdbc:phoenix:localhost:2181"
    val dbtable = "PHOENIXTEST"

    // first way for Spark to read Phoenix and return a DataFrame
    val rdf = spark.read
      .format("jdbc")
      .option("driver", "org.apache.phoenix.jdbc.PhoenixDriver")
      .option("url", url)
      .option("dbtable", dbtable)
      .load()
    rdf.printSchema()

    // second way for Spark to read Phoenix and return a DataFrame
    val df = spark.read
      .format("org.apache.phoenix.spark")
      .options(Map("table" -> dbtable, "zkUrl" -> url))
      .load()
    df.printSchema()

    // write the Spark DataFrame to Phoenix; the target table must already exist
    df.write
      .format("org.apache.phoenix.spark")
      .mode(SaveMode.Overwrite)
      .options(Map("table" -> "PHOENIXTESTCOPY", "zkUrl" -> url))
      .save()

    spark.stop()
  }
}
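
As the comment in the code notes, the target table must already exist before df.write runs. A minimal sketch of creating it over the Phoenix JDBC driver, assuming a simple two-column layout (the PK and COL1 names are hypothetical and should match your PHOENIXTEST schema), could be:

import java.sql.DriverManager

// Hypothetical sketch: create the copy table before writing to it.
// The column layout (PK, COL1) is assumed; adjust it to your PHOENIXTEST schema.
object CreatePhoenixTable {
  def main(args: Array[String]): Unit = {
    val conn = DriverManager.getConnection("jdbc:phoenix:localhost:2181")
    try {
      conn.createStatement().execute(
        "CREATE TABLE IF NOT EXISTS PHOENIXTESTCOPY (PK BIGINT NOT NULL PRIMARY KEY, COL1 VARCHAR)")
    } finally {
      conn.close()
    }
  }
}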
