Spark Machine Learning: Basic Statistics + Correlation Analysis (mean deviation, variance, mode, ...)

scala> import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vectors

scala> import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}

scala> val observations = sc.parallelize(
| Seq(
| Vectors.dense(1.0, 10.0, 100.0),
| Vectors.dense(2.0, 20.0, 200.0),
| Vectors.dense(3.0, 30.0, 300.0)
| )
| )
observations: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = ParallelCollectionRDD[0] at parallelize at <console>:28

scala> observations
res8: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = ParallelCollectionRDD[0] at parallelize at <console>:28

scala> val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
summary: org.apache.spark.mllib.stat.MultivariateStatisticalSummary = org.apache.spark.mllib.stat.MultivariateOnlineSummarizer@197409dd

scala> println(summary.mean) // a dense vector containing the mean value for each column
[2.0,20.0,200.0]

scala> println(summary.variance) // column-wise variance
[1.0,100.0,10000.0]

scala> println(summary.numNonzeros) // number of nonzeros in each column
[3.0,3.0,3.0]
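
Beyond mean, variance, and numNonzeros, MultivariateStatisticalSummary also exposes count, max, min, normL1, and normL2. Note that variance is the unbiased sample variance (it divides by n - 1, which is why the first column of {1.0, 2.0, 3.0} reports 1.0). A minimal sketch continuing the session above; the expected values in the comments are worked out by hand, not copied from a transcript:

println(summary.count)   // 3                   number of rows
println(summary.max)     // [3.0,30.0,300.0]    column-wise maximum
println(summary.min)     // [1.0,10.0,100.0]    column-wise minimum
println(summary.normL1)  // [6.0,60.0,600.0]    column-wise L1 norm (sum of absolute values)
println(summary.normL2)  // ~[3.74,37.42,374.2] column-wise L2 (Euclidean) norm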

Convert the data to an RDD[org.apache.spark.mllib.linalg.Vector], then pass it to Statistics.colStats(data) to compute the basic column statistics.

(Here beijing is an RDD[String], presumably read with sc.textFile from a local file of comma-separated annual precipitation values, loaded the same way beijing2.txt is in the correlation section below.)

scala> beijing.flatMap(_.split(",")).map(values=>Vectors.dense(values.toDouble))
res19: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[9] at map at <console>:30

scala> val data = beijing.flatMap(_.split(",")).map(values=>Vectors.dense(values.toDouble))
data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[11] at map at <console>:29

scala> data.take(10)
res22: Array[org.apache.spark.mllib.linalg.Vector] = Array([718.4], [380.7], [393.2], [544.4], [489.9], [488.8], [721.0], [665.3], [683.9], [673.3])

scala> Statistics.colStats(data)
res23: org.apache.spark.mllib.stat.MultivariateStatisticalSummary = org.apache.spark.mllib.stat.MultivariateOnlineSummarizer@35867f6f

scala> res23.max
res24: org.apache.spark.mllib.linalg.Vector = [813.2]

scala> res23.count
res25: Long = 30
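
The same session as a single block (a minimal sketch; the beijing.txt path is hypothetical, assumed by analogy with the beijing2.txt path used below):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}

// One comma-separated line of annual precipitation values -> RDD[Vector]
// (hypothetical path; adjust to your environment)
val beijing = sc.textFile("file:/C:/Users/Administrator/Desktop/beijing.txt")
val data = beijing.flatMap(_.split(",")).map(v => Vectors.dense(v.toDouble))

val summary: MultivariateStatisticalSummary = Statistics.colStats(data)
println(summary.count)  // 30 observations in the sample output above
println(summary.max)    // [813.2] in the sample output above
println(summary.mean)   // column-wise mean of the 30 values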

Correlation Analysis

beijing2.txt is a single comma-separated line holding 30 years (1979-2008) followed by the 30 corresponding annual precipitation values (see beijing2.first below). Every year is greater than 1000 and every precipitation value is below 1000, so a simple threshold filter is enough to split the flattened doubles back into the two series.


scala> val beijing2 =sc.textFile("file:/C:/Users/Administrator/Desktop/beijing2.txt")
beijing2: org.apache.spark.rdd.RDD[String] = file:/C:/Users/Administrator/Desktop/beijing2.txt MapPartitionsRDD[14] at textFile at <console>:28

scala> val data=beijing2.flatMap(_.split(",")).map(_.toDouble)
data: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[16] at map at <console>:29

scala> val year =data.filter(_>1000)
year: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[17] at filter at <console>:29

scala> val values=data.filter(_<=1000)
values: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[18] at filter at <console>:29

scala> beijing2.first
res30: String = 1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,718.4,380.7,393.2,544.4,489.9,488.8,721.0,665.3,683.9,673.3,442.2,697.3,747.9,541.5,506.7,813.2,572.5,700.9,430.9,731.7,266.9,371.1,338.9,370.4,444.9,483.5,410.7,318.0,483.9,626.3

scala> import org.apache.spark.mllib.stat._
import org.apache.spark.mllib.stat._

scala> Statistics.corr(year,values)
2018-10-24 17:16:55 WARN BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2018-10-24 17:16:55 WARN BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
res37: Double = -0.31536672957986056
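
Statistics.corr with two RDD[Double] arguments computes the Pearson correlation by default; the weak negative value above suggests at most a slight downward linear trend in precipitation over the years. A Spearman rank correlation can be requested by name. A minimal sketch continuing the session; its value will generally differ from the Pearson one:

// "pearson" is the default method; "spearman" ranks the data first
val pearson  = Statistics.corr(year, values, "pearson")
val spearman = Statistics.corr(year, values, "spearman")
println(s"pearson = $pearson, spearman = $spearman")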