Spark word count
[cloudera@quickstart ~]$ cat as.txt
i love hadoop
i love spark
o love hadoop and spark
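Note: sc.textFile below reads from HDFS, so the local as.txt must first be copied there. A typical sequence on the Quickstart VM (assuming /user1 does not exist yet) would be:
[cloudera@quickstart ~]$ hadoop fs -mkdir /user1
[cloudera@quickstart ~]$ hadoop fs -put as.txt /user1/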
scala> val rdd = sc.textFile("/user1/as.txt")
rdd: org.apache.spark.rdd.RDD[String] = /user1/as.txt MapPartitionsRDD[42] at textFile at <console>:31
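Side note: to read the local copy directly instead of the HDFS one, the file:// scheme works as well (illustrative, assuming the file sits in the cloudera user's home directory):
scala> val localRdd = sc.textFile("file:///home/cloudera/as.txt")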
scala> val rdd1 = rdd.flatMap(x => x.split(" "))
rdd1: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[43] at flatMap at <console>:33
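flatMap splits each line on spaces and flattens the per-line word arrays into a single RDD of words. A plain map would keep the arrays nested, which is the wrong shape for counting (comparison sketch, not from the original session):
scala> rdd.map(x => x.split(" "))      // RDD[Array[String]] - one array per line
scala> rdd.flatMap(x => x.split(" "))  // RDD[String] - one element per word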
scala> val rdd2 = rdd1.map(x => (x, 1))
rdd2: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[44] at map at <console>:35
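Aside: for a small file like this, the counts can also be pulled straight to the driver with countByValue on the word RDD, skipping the pair-RDD steps (equivalent result as a local Map; not part of the original session):
scala> val quickCounts = rdd1.countByValue()   // Map[String, Long]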
scala> val rdd3 = rdd2.reduceByKey(_+_)
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[45] at reduceByKey at <console>:37
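reduceByKey merges all values that share the same key; the underscore form _+_ is shorthand for an explicit two-argument sum, spelled out here for clarity:
scala> val rdd3 = rdd2.reduceByKey((a, b) => a + b)   // same as reduceByKey(_ + _)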
scala> val rdd4 = rdd3.map(x => x._1 + " " + x._2).collect()
rdd4: Array[String] = Array(love 3, spark 2, hadoop 2, i 2, o 1, and 1)
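The whole pipeline can also be written as one chained expression, and for larger inputs the counts are better written back to HDFS with saveAsTextFile than collected to the driver (a sketch; the output directory name is illustrative and must not already exist):
scala> val counts = sc.textFile("/user1/as.txt").flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
scala> counts.sortBy(_._2, ascending = false).collect()   // counts sorted by frequency, highest first
scala> counts.saveAsTextFile("/user1/wordcount_out")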