fileStream 生成 UnionRDD of NewHadoopRDD s . 关于由 sc.newAPIHadoopFile 创建的 NewHadoopRDD 的好处是它们的 name 被设置为它们的路径 .
以下是您可以使用该知识做的示例:
def namedTextFileStream(ssc: StreamingContext, directory: String): DStream[String] =
ssc.fileStream[LongWritable, Text, TextInputFormat](directory)
.transform( rdd =>
new UnionRDD(rdd.context,
rdd.dependencies.map( dep =>
dep.rdd.asInstanceOf[RDD[(LongWritable, Text)]].map(_._2.toString).setName(dep.rdd.name)
)
)
)
def transformByFile[U: ClassTag](unionrdd: RDD[String],
transformFunc: String => RDD[String] => RDD[U]): RDD[U] = {
new UnionRDD(unionrdd.context,
unionrdd.dependencies.map{ dep =>
if (dep.rdd.isEmpty) None
else {
val filename = dep.rdd.name
Some(
transformFunc(filename)(dep.rdd.asInstanceOf[RDD[String]])
.setName(filename)
)
}
}.flatten
)
}
def main(args: Array[String]) = {
val conf = new SparkConf()
.setAppName("Process by file")
.setMaster("local[2]")
val ssc = new StreamingContext(conf, Seconds(30))
val dstream = namesTextFileStream(ssc, "/some/directory")
def byFileTransformer(filename: String)(rdd: RDD[String]): RDD[(String, String)] =
rdd.map(line => (filename, line))
val transformed = dstream.
transform(rdd => transformByFile(rdd, byFileTransformer))
// Do some stuff with transformed
ssc.start()
ssc.awaitTermination()
}