Pig : UDFs
Pig UDFS ---------- UDF ---> user defined functions. adv: i) custom functionalities. ii) reusability. Pig UDFs can be developed by java python ruby c++ javascript perl step1: Develop udf code. step2: export into jar file ex: /home/cloudera/Desktop/pigs.jar step3: register jar file into pig. grunt> register Desktop/pigs.jar step4: create temporary function for udf class. grunt> define ucase pig.analytics.ConvertUpper(); step5: calling the function: grunt> e = foreach emp generate id, ucase(name) as name, sal, ucase(sex) as sex, dno; package pig.analytics; import ..... --> ucase(name) ---> upper conversion public class ConvertUpper extends EvalFunc { public String exec(Tuple v) throws IOException { String str = (String)v.get(0); String res = str.toUpperCase(); return res; } } -------------------------- $ cat > samp 100,230,400 123,100,90 140,560,430 $ hadoop fs -copyFromLocal samp piglab grunt> s = load 'piglab/samp' using PigStorage(',') as (a:int, b:int, c:int); package pig.analytics; .... public class RMax extends EvalFunc { public Integer exec(Tuple v) throws IOException { int a =(Integer) v.get(0); int b =(Integer) v.get(1); int c =(Integer) v.get(2); int big = a; // 10,20,3 if (a>big) big = a; if (b>big) big = b; if (c>big) big = c; return big; } } export into jar : Desktop/pigs.jar grunt> register Desktop/pigs.jar; grunt> define rmax pig.analytics.RMax(); grunt> res = foreach s generate *, rmax(*) as max; -------------------------------- package pig.analytics; ....... 
public class RowMax extends EvalFunc { public Integer exec(Tuple v) throws IOException { List lobs = v.getAll() ; int max = 0; int cnt =0; // -20,-3,-40 for(Object o : lobs) { cnt++; int val = (Integer)o; if(cnt==1) max = val; max = Math.max(max, val); } return max; } } export into jar : Desktop/pigs.jar grunt> register Desktop/pigs.jar grunt> define dynmax pig.analytics.RowMax(); grunt> r = foreach s generate *, dynmax(*) as m; ----------------------------------------- emp = load 'piglab/emp' using PigStorage(',') as (id:int, name:chararray, sal:int, sex:chararray, dno:int); grade() dname() gender() package pig.analytics; public class Gender extends EvalFunc { public String exec(Tuple v) throws IOException { String s =(String) v.get(0); s = s.toUpperCase(); if (s.matches("F")) s = "Female"; else s = "Male"; return s; } } ----------------- package pig.analytics; public class Grade extends EvalFunc { public String exec(Tuple v) throws IOException { int sal =(Integer) v.get(0); String grade; if (sal>=70000) grade="A"; else if (sal>=50000) grade="B"; else if (sal>=30000) grade="C"; else grade="D"; return grade; } } ------ package pig.analytics; public class DeptName extends EvalFunc { public String exec(Tuple v) throws IOException { int dno = (Integer)v.get(0); String dname; switch (dno){ case 11 : dname = "Marketing"; break; case 12 : dname = "HR"; break; case 13 : dname = "Finance"; break; default: dname = "Others"; } return dname; } } --------------------------------- --------------------------- export into jar : Desktop/pigs.jar; grunt> register Desktop/pigs.jar; grunt> define gender pig.analytics.Gender(); grunt> define grade pig.analytics.Grade(); grunt> define dept pig.analytics.DeptName(); grunt> res = foreach emp generate id, ucase(name) as name, sal, grade(sal) as grade, gender(sex) as sex, dept(dno) as dname ; ---------------------------------
Pig UDFS
----------
UDF ---> user defined functions.
adv:
i) custom functionalities.
ii) reusability.
Pig UDFs can be developed by
java
python
ruby
c++
javascript
perl
step1:
Develop udf code.
step2:
export into jar file
ex: /home/cloudera/Desktop/pigs.jar
step3:
register jar file into pig.
grunt> register Desktop/pigs.jar;
step4:
create a temporary function (alias) for the udf class.
grunt> define ucase pig.analytics.ConvertUpper();
step5:
calling the function:
grunt> e = foreach emp generate
id, ucase(name) as name, sal,
ucase(sex) as sex, dno;
package pig.analytics;
import .....
--> ucase(name) ---> upper conversion
public class ConvertUpper extends EvalFunc
{
public String exec(Tuple v)
throws IOException
{
String str = (String)v.get(0);
String res = str.toUpperCase();
retrun res;
}
}
--------------------------
$ cat > samp
100,230,400
123,100,90
140,560,430
$ hadoop fs -copyFromLocal samp piglab
grunt> s = load 'piglab/samp'
using PigStorage(',')
as (a:int, b:int, c:int);
package pig.analytics;
....
public class RMax extends EvalFunc
{
public Integer exec(Tuple v)
throws IOException
{
int a =(Integer) v.get(0);
int b =(Integer) v.get(1);
int c =(Integer) v.get(2);
int big = a; // 10,20,3
if (a>big) big = a;
if (b>big) big = b;
if (c>big) big = c;
return big;
}
}
export into jar : Desktop/pigs.jar
grunt> register Desktop/pigs.jar;
grunt> define rmax pig.analytics.RMax();
grunt> res = foreach s generate *,
rmax(*) as max;
--------------------------------
package pig.analytics;
.......
public class RowMax
extends EvalFunc
{
public Integer exec(Tuple v) throws IOException
{
List