Thursday, March 17, 2022

spark notes

List all files in an HDFS directory in Spark

import  org.apache.hadoop.fs.{FileSystem,Path}

// Print the path of every entry returned by listStatus for /shravan/json.
for (status <- FileSystem.get(sc.hadoopConfiguration).listStatus(new Path("/shravan/json"))) {
  println(status.getPath)
}

#######

Find duplicates and read HDFS files

// Read a pipe-delimited CSV file with a header row, letting Spark infer the column types.
val df = spark.read.option("header", "true").option("delimiter", "|").option("inferSchema", "true").csv("a.csv")

#find duplicate

 df.groupBy("id","name","age").count().filter("count >1").coalesce(1).write.format("csv").mode("overwrite").save("/tmp/spark_output/datacsv1")

 

#########################3

Write to HDFS

******use************

 **df2.coalesce(1).write.mode("overwrite").insertInto("db.tb")

// Save df1 as table db.tb in ORC format from a single partition, using overwrite mode.
df1.coalesce(1).write.format("orc").mode("overwrite").saveAsTable("db.tb");

// Write df as CSV under /shravan/ from a single partition, using overwrite mode.
df.coalesce(1).write.mode("overwrite").csv("/shravan/")

// Write df as CSV to /tmp/spark_output/datacsv with no explicit save mode and no coalesce.
df.write.format("csv").save("/tmp/spark_output/datacsv")

// Write df as CSV with a header row under /shravan/ from a single partition, using overwrite mode.
df.coalesce(1).write.mode("overwrite").option("header", "true").csv("/shravan/")


######################

# Start spark-shell on YARN in queue xyz with cross joins enabled.
# NOTE(review): spark-submit lists --total-executor-cores as a standalone/Mesos option;
# on YARN --executor-cores is probably what was intended - confirm before relying on it.
spark-shell --master yarn --conf spark.ui.port=0 --num-executors 50 --driver-memory 40g --executor-memory 20g --total-executor-cores 10 --queue xyz --conf spark.sql.crossJoin.enabled=true

// A name declared with `val` cannot be reassigned in Scala, so bind the repartitioned
// DataFrame (1000 partitions) to a new name instead of assigning back to df.
val repartitionedDf = df.repartition(1000)

spark.dynamicAllocation.enabled true
 spark.executor.memory 11168M
 spark.executor.cores 4

####################################################
Read properties

import java.util.Properties
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
  def readPropertiesFile(propFilePath: String): Properties = {
    val props = new Properties()
    try {
      val hdfsConf: Configuration = new Configuration();
      val fs = FileSystem.get(hdfsConf);
      val is = fs.open(new Path(propFilePath));

      props.load(is)
      if (props == null) throw new Exception("Properties object not created")

    } catch {
      case allException: Exception => {
        error(s"Error: Generic exception ${allException.printStackTrace}")
      }
    }
    props
  }


// Example usage: load abc.properties into a java.util.Properties object.
val propObject = readPropertiesFile("abc.properties")