I. Interoperating between DataFrames and RDDs via reflection (the schema can be inferred automatically, the code is more concise, and it performs well; the prerequisite is that the schema is known precisely in advance)
The Scala interface of Spark SQL can automatically convert an RDD containing case classes (a case class is similar to a Java bean) into a DataFrame. The case class defines the table's schema.
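A minimal, self-contained sketch of the idea (the object name and sample values are illustrative; the full project follows below):

import org.apache.spark.sql.SparkSession

object ReflectionSketch {
  // The case class's fields become the DataFrame's column names and types.
  case class People(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ReflectionSketch").master("local[2]").getOrCreate()
    import spark.implicits._  // enables rdd.toDF()
    val rdd = spark.sparkContext.parallelize(Seq(People(1, "sid", 30), People(2, "zhangsan", 31)))
    val df = rdd.toDF()       // schema inferred from People via reflection
    df.printSchema()          // id: integer, name: string, age: integer
    spark.stop()
  }
}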
II. Interoperating between DataFrames and RDDs programmatically
Here the Dataset/DataFrame is created through a programming interface that lets you construct a schema and then apply it to an existing RDD. The code for this approach is more verbose than the first. Choose it when the columns and types of the Dataset/DataFrame are not known ahead of time and only become known at runtime, i.e. when a case class cannot be defined in advance.
This approach follows these steps:
1. Create an RDD of Rows.
2. Define the schema, specified with a StructType.
3. Apply the schema to the RDD of Rows via the sparkSession.createDataFrame method, as in the sketch below.
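A compact, self-contained sketch of these three steps, using inline sample data instead of a file (names and values are illustrative; the full version appears in the project below):

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object ProgramSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ProgramSketch").master("local[2]").getOrCreate()
    // 1. An RDD of Rows
    val rowRDD = spark.sparkContext.parallelize(Seq(Row(1, "sid", 30), Row(2, "zhangsan", 31)))
    // 2. The schema, constructed at runtime with StructType
    val schema = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))
    // 3. Apply the schema to the Rows
    val df = spark.createDataFrame(rowRDD, schema)
    df.show()
    spark.stop()
  }
}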
Project layout
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.sid.com</groupId>
<artifactId>sparksqltrain</artifactId>
<version>1.0-SNAPSHOT</version>
<inceptionYear>2008</inceptionYear>
<properties>
<scala.version>2.11.8</scala.version>
<spark.version>2.2.0</spark.version>
</properties>
<repositories>
<repository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<!-- Scala dependency -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Spark SQL dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- This dependency is needed for HiveContext -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.spark-project.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>1.2.1.spark2</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
</args>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-eclipse-plugin</artifactId>
<configuration>
<downloadSources>true</downloadSources>
<buildcommands>
<buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
</buildcommands>
<additionalProjectnatures>
<projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
</additionalProjectnatures>
<classpathContainers>
<classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
<classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
</classpathContainers>
</configuration>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</reporting>
</project>
Code: DataFrameRDD.scala
package com.sid.com
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
/**
* Interoperation between DataFrames and RDDs
* */
object DataFrameRDD {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("DataFrameRDD").master("local[2]").getOrCreate()
/********************************** DataFrame/RDD interoperation: the reflection approach *******************************/
reflection(spark)
/********************************** DataFrame/RDD interoperation: the programmatic approach *******************************/
program(spark)
spark.stop()
}
/********************************** DataFrame/RDD interoperation: the reflection approach *******************************/
def reflection(spark: SparkSession): Unit = {
/**
* Read a text file from HDFS, a local file system (available on all nodes), or any
* Hadoop-supported file system URI, and return it as an RDD of Strings.
* @param path path to the text file on a supported file system
* @param minPartitions suggested minimum number of partitions for the resulting RDD
* @return RDD of lines of the text file
*/
// Read into an RDD: one record per line, fields separated by commas
val rdd =spark.sparkContext.textFile("file:///G:\\desktop\\people.txt")
// Map each line of the RDD to a case class instance (People)
val peopleRDD = rdd.map(_.split(",")).map(line => People(line(0).toInt, line(1), line(2).toInt))
// Convert the RDD to a DataFrame via reflection; this requires importing the implicit conversions
import spark.implicits._
val peopleDF = peopleRDD.toDF()
peopleDF.show()
// Programming against the DataFrame API
peopleDF.filter(peopleDF.col("age") > 30).show()
// Programming against the SQL API: register the DataFrame as a temporary view named people
peopleDF.createOrReplaceTempView("people")
spark.sql("select * from people where age > 30").show()
}
/********************************** DataFrame/RDD interoperation: the programmatic approach *******************************/
def program(spark: SparkSession): Unit = {
val rdd =spark.sparkContext.textFile("file:///G:\\desktop\\people.txt")
// 1. Create an RDD of Rows.
val peopleRdd = rdd.map(_.split(",")).map(line => Row(line(0).toInt,line(1),line(2).toInt))
// 2. Define the schema, specified with a StructType.
val structType = StructType(Array(StructField("id",IntegerType,true),
StructField("name",StringType,true),
StructField("age",IntegerType,true)))
// 3. Apply the schema to the RDD of Rows via sparkSession.createDataFrame.
val peopleDF = spark.createDataFrame(peopleRdd, structType)
peopleDF.printSchema()
peopleDF.show()
}
// Converting an RDD to a DataFrame via reflection requires a case class. Note that it is defined
// here at the object level, outside the method that calls toDF(): a case class defined inside a
// method cannot be used, because Spark cannot resolve the implicit encoder (TypeTag) for it.
case class People(id: Int, name: String, age: Int)
}
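As a side note, once the case class exists, the same reflected schema also gives you a strongly typed Dataset. A small sketch, assuming the spark session, peopleDF, and People from the listing above are in scope:

import spark.implicits._
val peopleDS = peopleDF.as[People]  // Dataset[People], a typed view of the same data
peopleDS.filter(_.age > 30).show()  // same filter as above, but with compile-time field access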
people.txt
1,sid,30
2,zhangsan,31
3,lisi,32
4,wangwu,32
Run results
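With the sample people.txt above, peopleDF.show() prints all four rows, and both the DataFrame filter and the SQL query keep the three rows whose age is greater than 30 (zhangsan, lisi, wangwu). The programmatic version's printSchema() and show() output should look roughly like the following:

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|     sid| 30|
|  2|zhangsan| 31|
|  3|    lisi| 32|
|  4|  wangwu| 32|
+---+--------+---+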