package mllib.cluster
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.io.{BufferedSource, Source}
/**
* Created by LMR on 2019/6/10
*/
object kmeans {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("kmeans")
val sc = new SparkContext(conf)
//Read the data from the local Windows file system and convert it to an RDD[Vector]
val source: BufferedSource = Source.fromFile("E:\\IDEAWorkPlace\\SparkTest\\src\\main\\scala\\mllib\\data\\kmeans_data.txt")
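//Each line of the input file is assumed to hold one point as space-separated doubles,
//e.g. "0.0 0.0 0.0", matching the layout of Spark's bundled kmeans_data.txt sample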
val lines: Array[String] = source.getLines().toArray
val vectors: Array[linalg.Vector] = lines.map { line =>
val splits: Array[Double] = line.split(" ").map(_.toDouble)
Vectors.dense(splits)
}
val data: RDD[linalg.Vector] = sc.parallelize(vectors)
//Create the model and train it
val initMode = "k-means||"
val numClusters = 2
val numIterations = 20
val model: KMeansModel = new KMeans()
.setInitializationMode(initMode)
.setK(numClusters)
.setMaxIterations(numIterations)
.run(data)
//Compute the clustering error (Within Set Sum of Squared Errors)
val WSSSE: Double = model.computeCost(data)
println(s"Within Set Sum of Squared Errors = $WSSSE")
source.close()
sc.stop()
}
}
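After training, the model can also be inspected and reused. The sketch below is only an illustration: it assumes it is appended inside main right after the WSSSE computation, reusing the model, sc, and Vectors values from above; the example point and the save path are placeholders, not part of the original code.

//Print the learned cluster centers
model.clusterCenters.zipWithIndex.foreach { case (center, idx) =>
println(s"Cluster $idx center: $center")
}
//Assign a new point to its nearest cluster (the point values are only an example)
val clusterIndex: Int = model.predict(Vectors.dense(0.1, 0.1, 0.1))
println(s"Point (0.1, 0.1, 0.1) belongs to cluster $clusterIndex")
//Optionally persist the model and load it back later (the path is a placeholder)
model.save(sc, "E:\\IDEAWorkPlace\\SparkTest\\model\\kmeansModel")
val sameModel: KMeansModel = KMeansModel.load(sc, "E:\\IDEAWorkPlace\\SparkTest\\model\\kmeansModel")
println(sameModel.k)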
Full code and data: git repository link