def show1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.show()
}
Result
[30,Andy]
[null,Michael]
[19,Justin]
2.count
Example
def count1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  println(df.count())
}
Result
3
3.first
Example
def first1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  println(df.first)
}
Result
[30,Andy]
4.head
Example
def head1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  println(df.head())
}
Result
[30,Andy]
5.take
Example
def take1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.take(2).foreach(println)
}
Result
[30,Andy]
[null,Michael]
II. Basic Functions
Function | Description | Return type
toDF | Returns a new DataFrame with the specified column names | DataFrame
cache | Caches the DataFrame | DataFrame
persist | Persists the DataFrame with the given StorageLevel | DataFrame
unpersist | Removes the persisted data (sketch below) | DataFrame
schema | Returns the schema of the DataFrame | StructType
printSchema | Prints the schema in a tree format (sketch below) | Unit
columns | Returns all column names as an array | Array[String]
1.toDF
Example 1
// RDD to DataFrame
def toDF1(ss: SparkSession): Unit = {
  val rdd = ss.sparkContext.textFile("E:\\data\\spark\\rdd\\test\\read\\app1.log")
  val mapRdd = rdd.map(line => line.split(",")).map { arr => (arr(0), arr(1), arr(2), arr(3), arr(4), arr(5)) }
  import ss.implicits._
  val df = mapRdd.toDF("Date", "Name", "APP", "DownLoad", "Area", "Version")
  df.show()
}
Example 2
// Dataset to DataFrame
def toDF2(ss: SparkSession): Unit = {
  import ss.implicits._
  val ds = ss.createDataset(Seq(("张三", 21, 11111.11), ("李四", 22, 22222.22), ("王五", 23, 33333.33), ("赵六", 24, 44444.44)))
  val df = ds.toDF("Name", "Age", "Salary")
  df.show()
}
2.columns
Example
def columns1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.columns.foreach(println)
}
Result
age
name
3.persist
Example
def persist1(ss: SparkSession): Unit = {
  import org.apache.spark.storage.StorageLevel
  val df = ss.read.option("header", "true").csv("E:\\data\\spark\\rdd\\test\\read\\ml-25m\\genome-scores.csv")
  df.persist(StorageLevel.MEMORY_AND_DISK)
  val start1 = System.currentTimeMillis()
  val rows1 = df.count()    // first count: reads the file and fills the cache
  val end1 = System.currentTimeMillis()
  println("rows: " + rows1 + ", " + (end1 - start1) + " ms")
  val start2 = System.currentTimeMillis()
  val rows2 = df.count()    // second count: served from the persisted data
  val end2 = System.currentTimeMillis()
  println("rows: " + rows2 + ", " + (end2 - start2) + " ms")
}
Result
rows: 15584448, 31027 ms
rows: 15584448, 186 ms
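The basic-functions table above also lists unpersist, which has no numbered example in this section. A minimal sketch, reusing the same genome-scores.csv file and a hypothetical unpersist1 helper:
def unpersist1(ss: SparkSession): Unit = {
  import org.apache.spark.storage.StorageLevel
  val df = ss.read.option("header", "true").csv("E:\\data\\spark\\rdd\\test\\read\\ml-25m\\genome-scores.csv")
  df.persist(StorageLevel.MEMORY_AND_DISK)
  println(df.count())    // materializes the cache
  df.unpersist(true)     // removes the cached blocks; blocking = true waits until they are gone
  println(df.count())    // recomputed from the source file, not from the cache
}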
4.cache
Example
def cache1(ss: SparkSession): Unit = {
  val df = ss.read.option("header", "true").csv("E:\\data\\spark\\rdd\\test\\read\\ml-25m\\genome-scores.csv")
  df.cache()    // same as persist() with the default storage level
  val start1 = System.currentTimeMillis()
  val rows1 = df.count()
  val end1 = System.currentTimeMillis()
  println("rows: " + rows1 + ", " + (end1 - start1) + " ms")
  val start2 = System.currentTimeMillis()
  val rows2 = df.count()
  val end2 = System.currentTimeMillis()
  println("rows: " + rows2 + ", " + (end2 - start2) + " ms")
}
Result
rows: 15584448, 33180 ms
rows: 15584448, 192 ms
5.schema
Example
def schema1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  println(df.schema)
}
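printSchema, also listed in the basic-functions table, prints the same schema information in a tree format instead of returning a StructType. A minimal sketch, assuming the same people.json and a hypothetical printSchema1 helper:
def printSchema1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.printSchema()    // prints the schema as a tree to standard output
}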
6.createOrReplaceTempView
Example
def createOrReplaceTempView1(ss: SparkSession): Unit = {
  val df = ss.read.option("header", "true").csv("E:\\data\\spark\\rdd\\test\\read\\ml-25m\\movies.csv")
  df.createOrReplaceTempView("t_movie")
  val selectDF = ss.sql("select movieId,title from t_movie where movieId>=5 and movieId<=10")
  selectDF.show()
}
Result
+-------+-------------------+
|movieId|              title|
+-------+-------------------+
|      6|        Heat (1995)|
|      7|     Sabrina (1995)|
|      8|Tom and Huck (1995)|
|      9|Sudden Death (1995)|
+-------+-------------------+
III. Language-Integrated Queries
Function | Description | Return type
select | Selects a set of columns | DataFrame
filter | Filters rows using the given SQL expression | DataFrame
where | Filters rows using the given expression (sketch below) | DataFrame
groupBy | Groups the DataFrame by the given columns so that aggregations can be run on them | RelationalGroupedDataset
limit | Returns a new DataFrame with only the first n rows (sketch below) | DataFrame
sort | Returns a new DataFrame sorted by the given expressions | DataFrame
distinct | Returns a new DataFrame containing only the unique rows (sketch below) | DataFrame
col | Selects a column by name and returns it as a Column (sketch below) | Column
agg | Aggregates over the entire DataFrame without groups (sketch below) | DataFrame
drop | Drops a column and returns a new DataFrame (sketch below) | DataFrame
1.select
Example 1
def select1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.select("name", "age").show()
}
Example 2
def select2(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.select(df("name"), df("age") + 1).show()
}
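col, listed in the table above, has no numbered example here. A minimal sketch, assuming the same people.json and a hypothetical col1 helper, of selecting a column by name as a Column object:
def col1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  val nameCol = df.col("name")    // a Column that can be reused in expressions
  df.select(nameCol, df.col("age") + 1).show()
}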
def filter1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  // df.filter(df("age") < 25).show()
  df.filter("age<25").show()
}
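where, described in the table above, filters rows just like filter. A minimal sketch, assuming the same people.json and a hypothetical where1 helper:
def where1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.where("age<25").show()          // SQL-expression form
  df.where(df("age") < 25).show()    // Column-expression form
}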
def groupBy2(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people1.json")
  df.groupBy("name", "age").count().show()
}
def groupBy3(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people1.json")
  df.createOrReplaceTempView("people")    // register a temporary view
  val groupByDf = ss.sql("select name,count(name) as num from people group by name")
  groupByDf.show()
}
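agg, described in the table above, aggregates either over the whole DataFrame or after a groupBy. A minimal sketch, assuming the same people1.json and a hypothetical agg1 helper:
def agg1(ss: SparkSession): Unit = {
  import org.apache.spark.sql.functions.{avg, max}
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people1.json")
  df.agg(max("age"), avg("age")).show()        // aggregates over the entire DataFrame, no grouping
  df.groupBy("name").agg(max("age")).show()    // aggregates within each group
}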
def sort2(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  // df.sort("name","age").show()
  df.sort(df("name"), df("age")).show()
}
def sort3(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.sort(df("name").desc, df("age").asc).show()
}
def orderBy1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people.json")
  df.orderBy(df("name"), df("age").asc).show()
}
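limit, distinct and drop from the table above have no numbered examples. A minimal combined sketch, assuming the same people1.json and a hypothetical limitDistinctDrop1 helper:
def limitDistinctDrop1(ss: SparkSession): Unit = {
  val df = ss.read.json("E:\\data\\spark\\dataframe\\test\\read\\people1.json")
  df.limit(2).show()      // new DataFrame with only the first 2 rows
  df.distinct().show()    // new DataFrame with duplicate rows removed
  df.drop("age").show()   // new DataFrame without the age column
}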