Tuesday, September 27, 2016

Spark Union, Intersection and sortByKey


Union -- Merging two RDDs
scala> val a = sc.parallelize(1 to 5)
scala> val b = sc.parallelize(6 to 10)
scala> val c = a++b
c: org.apache.spark.rdd.RDD[Int] = UnionRDD[2] at $plus$plus at <console>:31

scala> c.collect()
res1: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
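
The ++ operator is just shorthand for the union method on an RDD; the same merge can be written explicitly. A small sketch reusing the a and b RDDs above:

scala> val d = a.union(b)          // equivalent to a ++ b
scala> d.collect()                 // Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), same result as above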
-------------------
Intersection -- Returns the elements common to both RDDs. For comparison, first a union of two overlapping ranges:
scala> val a = sc.parallelize(1 to 8)
a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[3] at parallelize at <console>:27

scala> val b = sc.parallelize(6 to 10)

scala> val c = a++b
c: org.apache.spark.rdd.RDD[Int] = UnionRDD[4] at $plus$plus at <console>:31

scala> c.collect()
res2: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 6, 7, 8, 9, 10)
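
Notice that union keeps duplicates (6, 7 and 8 appear twice above). If only distinct values are wanted, distinct() can be chained after the union; a minimal sketch with the same RDDs:

scala> val d = (a ++ b).distinct()
scala> d.collect()                 // 1 to 10 with duplicates removed; order is not guaranteed after the shuffle
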
[cloudera@quickstart ~]$ cat abc.txt
Emp1,F,D1
Emp2,M,D2
Emp3,F,D3
Cat
Bat
Mat
[cloudera@quickstart ~]$ cat xyz.txt
Cat
Book
Bat


scala> val a = sc.textFile("/user1/abc.txt")
a: org.apache.spark.rdd.RDD[String] = /user1/abc.txt MapPartitionsRDD[33] at textFile at <console>:27

scala> val b = sc.textFile("/user1/xyz.txt")
b: org.apache.spark.rdd.RDD[String] = /user1/xyz.txt MapPartitionsRDD[35] at textFile at <console>:27

scala> val c = a++b
c: org.apache.spark.rdd.RDD[String] = UnionRDD[36] at $plus$plus at <console>:31

scala> c.collect()
res12: Array[String] = Array(Emp1,F,D1, Emp2,M,D2, Emp3,F,D3, Cat, Bat, Mat, Cat, Book, Bat)

scala> val c = a.intersection(b)
c: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[42] at intersection at <console>:31

scala> c.collect()
res13: Array[String] = Array(Bat, Cat)
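
intersection returns only the elements present in both RDDs, removes duplicates, and shuffles the data, so the result order is not guaranteed. It also accepts an optional number of partitions; a sketch using the same file RDDs:

scala> val d = a.intersection(b, 2)   // same common lines (Bat, Cat), computed across 2 partitions
scala> d.collect()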
----------------------------
Sorting by key (sortByKey) -- sorts a pair RDD according to its keys

scala> val a  = List((1,"a"),(7,"b"),(5,"F"))
a: List[(Int, String)] = List((1,a), (7,b), (5,F))

scala> val b = sc.parallelize(a)
b: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[43] at parallelize at <console>:29

scala> val c=b.sortByKey()
c: org.apache.spark.rdd.RDD[(Int, String)] = ShuffledRDD[46] at sortByKey at <console>:31

scala> c.collect()
res14: Array[(Int, String)] = Array((1,a), (5,F), (7,b))
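
sortByKey sorts in ascending key order by default; it also takes an ascending flag (and an optional number of partitions). A sketch of a descending sort on the same pair RDD:

scala> val d = b.sortByKey(ascending = false)
scala> d.collect()                 // Array((7,b), (5,F), (1,a))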

------------
scala> val a = sc.parallelize(2 to 7)
a: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[10] at parallelize at <console>:27

scala> val b = sc.parallelize(5 to 9)
b: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[11] at parallelize at <console>:27

scala> val c = a.intersection(b)
c: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[17] at intersection at <console>:31

scala> c.collect()
res6: Array[Int] = Array(6, 7, 5)
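
Because intersection involves a shuffle, the elements 5, 6 and 7 come back in no particular order. If sorted output is needed, sortBy can be applied before collecting; a minimal sketch:

scala> c.sortBy(x => x).collect()  // Array(5, 6, 7)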
--------------------
