Pig Lab8
package pig.udfs;
import java.io.IOException;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
public class MathRank extends EvalFunc<Integer>
int prev = 0;
int cnt = 0;
public Integer exec(Tuple v)
throws IOException
int cv = (Integer) v.get(0);
if (cv!=prev) cnt++;
prev = cv;
return new Integer(cnt);
export into --> /home/training/Desktop/pjars.jar
grunt> emp = load 'pdemo/profiles'
>> using PigStorage(',')
>> as (name:chararray, sal:int);
grunt> e = order emp by sal desc;
grunt> register Desktop/pjars.jar;
grunt> define rank pig.udfs.MathRank();
grunt> ee = foreach e generate *, rank(sal) as rank;
grunt> dump ee
[training@localhost ~]$ cat > test1
100 345 890
123 346 340
340 240 140
[training@localhost ~]$ hadoop fs -copyFromLocal test1 pdemo
copyFromLocal: Target pdemo/test1 already exists
[training@localhost ~]$ hadoop fs -copyFromLocal test1 pdemo/t1
[training@localhost ~]$
UDF to find the row-level maximum.
package pig.udfs;
import java.io.IOException;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
public class RowMax extends EvalFunc<Integer>
public Integer exec(Tuple v) throws IOException
int a = (Integer)v.get(0);
int b = (Integer)v.get(1);
int c = (Integer)v.get(2);
int big=0; // 10 34 5
if(a>big) big=a;
if(b>big) big=b;
if(c>big) big=c;
return new Integer(big);
export into --> Desktop/pjars.jar
grunt> t1 = load 'pdemo/t1'
>> as (a:int, b:int, c:int);
grunt> register Desktop/pjars.jar;
grunt> define rmax pig.udfs.RowMax();
grunt> r1 = foreach t1 generate *, rmax(*) as rm;
grunt> dump r1
The above UDF works only for rows with exactly 3 numeric fields.
example of v.getAll()
[training@localhost ~]$ cat > t2
100 200 400 600 120
300 450 123 567 678
[training@localhost ~]$ hadoop fs -copyFromLocal t2 pdemo
[training@localhost ~]$
package pig.udfs;
import java.io.IOException;
import java.util.List;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
public class RMax extends EvalFunc<Integer>
public Integer exec(Tuple v) throws IOException
List<Object> lobs = v.getAll();
int max = 0;
for(Object o : lobs) // 120,456,345,654,600
int val = (Integer)o;
max = Math.max(val, max);
return new Integer(max);
grunt> t2 = load 'pdemo/t2'
>> as (a:int, b:int, c:int, d:int, e:int);
grunt> register Desktop/pjars.jar;
grunt> define rmax2 pig.udfs.RMax();
grunt> r2 = foreach t2 generate * , rmax2(*) as max;
grunt> dump r2
Handling Unstructured Text.
Filtering Dell news.
[training@localhost ~]$ cat > news
Modi met obama
Sonia went to Usa
Dell implement big data
Modi has order 1 lakh Dell laptops for opts
[training@localhost ~]$ hadoop fs -copyFromLocal news pdemo
[training@localhost ~]$
grunt> news = load 'pdemo/news'
>> as (line:chararray);
grunt> dump news
grunt> news2 = foreach news generate *,
>> INDEXOF(line,'Dell') as idx;
grunt> dump news2
(Modi met obama,-1)
(Sonia went to Usa,-1)
(Dell implement big data,0)
(Modi has order 1 lakh Dell laptops for opts,22)
Now we can apply a filter to fetch the Dell news.
If the line contains the word Dell, idx will be >= 0 (the position of its first occurrence).
If not, idx = -1.
For example: grunt> dell = filter news2 by idx >= 0;
No comments:
Post a Comment