Preface
The dataset and code for this project can be downloaded from https://download.csdn.net/download/atuo200/12716083. The analysis code here is written in Scala; for a Python version, see the companion article 基于Spark的音乐专辑数据分析展示 (Music Album Data Analysis and Visualization Based on Spark).
Data Source
The dataset albums.csv contains 100,000 music album records. The main fields are listed below; a Scala sketch of one record's shape follows the list.
- album_title: album title
- genre: album genre
- year_of_pub: year the album was published
- num_of_tracks: number of tracks on the album
- num_of_sales: number of copies sold
- rolling_stone_critic: Rolling Stone critic score
- mtv_critic: critic score from MTV, the world's largest music television network
- music_maniac_critic: score from music enthusiast critics
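For reference, one record can be modeled by the Scala case class sketched below (field names taken from the list above). Note that the analysis code later in this article reads the CSV with inferSchema disabled, so every column initially arrives as a string and is cast where needed.

// A sketch of one album record; the numeric types reflect the intended
// semantics of each field, not how the CSV is actually parsed below.
case class Album(
  album_title: String,
  genre: String,
  year_of_pub: Int,
  num_of_tracks: Int,
  num_of_sales: Int,
  rolling_stone_critic: Double,
  mtv_critic: Double,
  music_maniac_critic: Double
)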
Data Analysis
The following analyses are performed on the albums.csv dataset:
- count the number of albums per genre;
- compute the total sales per genre;
- count the albums released and tracks published in each of the last 20 years;
- compute the yearly sales of the five genres with the highest total sales.
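Before writing the full program, the dataset can be explored interactively in spark-shell; the snippet below is a sketch assuming the same local path used later in this article.

// In spark-shell the SparkSession is predefined as `spark`.
val df = spark.read.option("header", "true").csv("file:///usr/local/spark/data/mllib/albums.csv")
df.printSchema()   // verify the eight fields listed above
df.show(5)         // peek at the first few rows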
Writing the Scala Code
First create the project directory structure and the source file:
mkdir musicCount
cd musicCount
mkdir -p src/main/scala
cd src/main/scala
vi musicCount.scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import java.io._

object App {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("genreSales")
    val sc = new SparkContext(conf)
    // The builder reuses the SparkContext created above
    val spark = SparkSession.builder().getOrCreate()
    // Load from the local file system, so albums.csv must exist at the same path
    // on every Spark node; alternatively, upload albums.csv to HDFS and load it from there
    val df = spark.read.format("csv")
      .option("header", "true")
      .option("inferSchema", "false")
      .option("delimiter", ",")
      .load("file:///usr/local/spark/data/mllib/albums.csv")
    // Needed for rdd.toDF() below
    import spark.implicits._
    // Analysis 1: number of albums per genre, keeping only genres with more than 2,000 albums
    val genre_count = df.groupBy("genre").count()
    val genre_array = genre_count.filter(genre_count("count") > 2000)
    val result1 = genre_array.toJSON.collectAsList.toString
    val writer1 = new PrintWriter(new File("/usr/local/spark/data/music_diy/result1.json"))
    writer1.write(result1)
    writer1.close()
    // Analysis 2: total sales per genre
    val genre_sales = df.select(df("genre"), df("num_of_sales")).rdd
      .map(v => (v(0).toString, v(1).toString.toInt))
      .reduceByKey(_ + _)
      .collect()
    val result2 = sc.parallelize(genre_sales).toDF().toJSON.collectAsList.toString
    val writer2 = new PrintWriter(new File("/usr/local/spark/data/music_diy/result2.json"))
    writer2.write(result2)
    writer2.close()
    // Analysis 3: tracks published and albums released per year
    // (accumulates List(track total, album count) per year_of_pub; the dataset
    // covers roughly the last 20 years, so no extra year filter is applied here)
    val tracksAndSales = df.select(df("year_of_pub"), df("num_of_tracks")).rdd
      .map(v => (v(0).toString.toInt, List(v(1).toString.toInt, 1)))
      .reduceByKey((x, y) => List(x(0) + y(0), x(1) + y(1)))
      .sortByKey()
      .collect()
    val result3 = sc.parallelize(tracksAndSales).toDF().toJSON.collectAsList.toString
    val writer3 = new PrintWriter(new File("/usr/local/spark/data/music_diy/result3.json"))
    writer3.write(result3)
    writer3.close()
    // Analysis 4: yearly sales for the five genres with the highest total sales
    // (ranks genres by the per-genre sales totals already computed in analysis 2)
    val genre_list = genre_sales.sortBy(-_._2).take(5).map(_._1)
    val genreYearHotArray = df.select(df("genre"), df("year_of_pub"), df("num_of_sales")).rdd
      .filter(v => genre_list.contains(v(0).toString))
      .map(v => ((v(0).toString, v(1).toString.toInt), v(2).toString.toInt))
      .reduceByKey(_ + _)
      .collect()
    val result4 = sc.parallelize(genreYearHotArray).toDF().toJSON.collectAsList.toString
    val writer4 = new PrintWriter(new File("/usr/local/spark/data/music_diy/result4.json"))
    writer4.write(result4)
    writer4.close()
    spark.stop()
  }
}
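Each of the four analyses repeats the same toJSON-then-PrintWriter pattern. As a design note, that boilerplate could be folded into a small helper like the hypothetical writeJson below (a sketch; in either version the output directory /usr/local/spark/data/music_diy must already exist):

import java.io.{File, PrintWriter}
import org.apache.spark.sql.DataFrame

// Hypothetical helper: render a DataFrame as a JSON array string and write it
// to a local file, closing the writer even if the write throws.
def writeJson(df: DataFrame, path: String): Unit = {
  val writer = new PrintWriter(new File(path))
  try writer.write(df.toJSON.collect().mkString("[", ", ", "]"))
  finally writer.close()
}

With it, each analysis reduces to a single call, e.g. writeJson(genre_array, "/usr/local/spark/data/music_diy/result1.json").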
Writing the sbt Build File
Back in the project root (musicCount), create a build file, e.g. build.sbt:
name := "musicCount project"
version := "1.0"
scalaVersion := "2.11.12"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.5"
libraryDependencies += "org.apache.spark" % "spark-sql_2.11"