-- co groupinggrunt> cat piglab/emp101,aaaa,40000,m,11102,bbbbbb,50000,f,12103,cccc,50000,m,12104,dd,90000,f,13105,ee,10000,m,12106,dkd,40000,m,12107,sdkfj,80000,f,13108,iiii,50000,m,11grunt>[cloudera@quickstart ~]$ cat > emp2201,Ravi,80000,m,12203,Varuna,100000,f,13204,Vanila,50000,f,12205,Mani,30000,m,14206,Manisha,30000,f,14[cloudera@quickstart ~]$ hadoop fs -copyFromLocal emp2 piglab[cloudera@quickstart ~]$ sql: select dno, sum(sal) from ( select dno, sal from emp1 union all select dno, sal from emp2 ) e group by dno;runt> emp1 = load 'piglab/emp' using PigStorage(',') as (id:int, name:chararray, sal:int, sex:chararray, dno:int);grunt> emp2 = load 'piglab/emp2' using PigStorage(',') as (id:int, name:chararray, sal:int, sex:chararray, dno:int);grunt> describe emp1emp1: {id: int,name: chararray,sal: int,sex:chararray,dno: int}grunt> describe emp2emp2: {id: int,name: chararray,sal: int,sex:chararray,dno: int}grunt> e1 = foreach emp1 generate dno, sal;grunt> e2 = foreach emp2 generate dno, sal;grunt> e = union e1, e2;grunt> grp = group e by dno;grunt> res = foreach grp generate group as dno, SUM(e.sal) as tot;grunt> dump res(11,180000)(12,280000)(13,270000)(14,60000) -- in above output, we missed clarity, total salary for branch1 and brach2 above is the aggregation of all branches. -- we want, seperate total salary for branch 1 and seperate for branch2. -- for this cogroup is used. -- cogroup will construct seperate innerbags for each relation(dataset). so that seperate aggregations, we can apply.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2e2: {dno: int,sal: int}grunt> cg = cogroup e1 by dno, e2 by dno;grunt> describe cgcg: {group: int,e1: {(dno: int,sal: int)},e2:{(dno: int,sal: int)}}grunt> dump cg -- when cogroup is applied it returns n+1 fields . where n is number of input relations (datasets). first one is group 2nd onwards bags.(11,{(11,50000),(11,40000)},{(11,90000)})(12,{(12,40000),(12,10000),(12,50000),(12,50000)},{(12,50000),(12,80000)})(13,{(13,80000),(13,90000)},{(13,100000)})(14,{},{(14,30000),(14,30000)})grunt> res = foreach cg generate group as dno , SUM(e1.sal) as tot1, SUM(e2.sal) as tot2;grunt> describe resres: {dno: int,tot1: long,tot2: long}(11,90000,90000)(12,150000,130000)(13,170000,100000)(14,,60000)--------------------------------- how to perform, seperate aggregations on each dataset with out cogrouping.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2e2: {dno: int,sal: int}grunt> ee1 = foreach e1 generate *, 'branch1' asbranch;grunt> ee2 = foreach e2 generate *, 'branch2' asbranch;grunt> ee = union ee1 , ee2;grunt> grp = group ee by (dno, branch);grunt> res = foreach grp generate>> group.dno as dno, group.branch asbranch,>> SUM(ee.sal) as tot;grunt> describe resres: {dno: int,branch: chararray,tot: long}grunt> dump res(11,branch1,90000)(11,branch2,90000)(12,branch1,150000)(12,branch2,130000)(13,branch1,170000)(13,branch2,100000)(14,branch2,60000)--------------------------------- using Cogroup , multiple aggregations seperately for each dataset.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2;e2: {dno: int,sal: int}grunt> cg = cogroup e1 by dno, e2 by dno;grunt> res = foreach cg generate>> group as dno,>> SUM(e1.sal) as tot1,>> SUM(e2.sal) as tot2,>> COUNT(e1) as cnt1,>> COUNT(e2) as cnt2,>> MAX(e1.sal) as max1,>> MAX(e2.sal) as max2;grunt> describe res;res: {dno: int,tot1: long,tot2: long,cnt1:long,cnt2: long,max1: int,max2: int}grunt> dump res(11,90000,90000,2,1,50000,90000)(12,150000,130000,4,2,50000,80000)(13,170000,100000,2,1,90000,100000)(14,,60000,0,2,,30000)------------------------------ Entire Column aggregations using CoGroup. s1 = foreach emp1 generate sal; s2 = foreach emp2 generate sal; g = cogroup s1 all, s2 all ; r = foreach g generate SUM(s1.sal) as tot1, SUM(s2.sal) as tot2; r = foreach r generate * , tot1+tot2 as tot;dump r(410000,380000,790000)
-- co grouping grunt> cat piglab/emp 101,aaaa,40000,m,11 102,bbbbbb,50000,f,12 103,cccc,50000,m,12 104,dd,90000,f,13 105,ee,10000,m,12 106,dkd,40000,m,12 107,sdkfj,80000,f,13 108,iiii,50000,m,11 grunt> [cloudera@quickstart ~]$ cat > emp2 201,Ravi,80000,m,12 203,Varuna,100000,f,13 204,Vanila,50000,f,12 205,Mani,30000,m,14 206,Manisha,30000,f,14 [cloudera@quickstart ~]$ hadoop fs - copyFromLocal emp2 piglab [cloudera@quickstart ~]$ sql: select dno, sum(sal) from ( select dno, sal from emp1 union all select dno, sal from emp2 ) e group by dno; runt> emp1 = load 'piglab/emp' using PigStorage(',') as (id:int, name:chararray, sal:int, sex:chararray, dno:int); grunt> emp2 = load 'piglab/emp2' using PigStorage(',') as (id:int, name:chararray, sal:int, sex:chararray, dno:int); grunt> describe emp1 emp1: {id: int,name: chararray,sal: int,sex: chararray,dno: int} grunt> describe emp2 emp2: {id: int,name: chararray,sal: int,sex: chararray,dno: int} grunt> e1 = foreach emp1 generate dno, sal; grunt> e2 = foreach emp2 generate dno, sal; grunt> e = union e1, e2; grunt> grp = group e by dno; grunt> res = foreach grp generate group as dno, SUM(e.sal) as tot; grunt> dump res (11,180000) (12,280000) (13,270000) (14,60000) -- in above output, we missed clarity, total salary for branch1 and brach2 above is the aggregation of all branches. -- we want, seperate total salary for branch 1 and seperate for branch2. -- for this cogroup is used. -- cogroup will construct seperate innerbags for each relation(dataset). so that seperate aggregations, we can apply. grunt> describe e1 e1: {dno: int,sal: int} grunt> describe e2 e2: {dno: int,sal: int} grunt> cg = cogroup e1 by dno, e2 by dno; grunt> describe cg cg: {group: int,e1: {(dno: int,sal: int)},e2: {(dno: int,sal: int)}} grunt> dump cg -- when cogroup is applied it returns n+1 fields . where n is number of input relations (datasets). first one is group 2nd onwards bags. (11,{(11,50000),(11,40000)},{(11,90000)}) (12,{(12,40000),(12,10000),(12,50000), (12,50000)},{(12,50000),(12,80000)}) (13,{(13,80000),(13,90000)},{(13,100000)}) (14,{},{(14,30000),(14,30000)}) grunt> res = foreach cg generate group as dno , SUM(e1.sal) as tot1, SUM(e2.sal) as tot2; grunt> describe res res: {dno: int,tot1: long,tot2: long} (11,90000,90000) (12,150000,130000) (13,170000,100000) (14,,60000) --------------------------------- how to perform, seperate aggregations on each dataset with out cogrouping. grunt> describe e1 e1: {dno: int,sal: int} grunt> describe e2 e2: {dno: int,sal: int} grunt> ee1 = foreach e1 generate *, 'branch1' as branch; grunt> ee2 = foreach e2 generate *, 'branch2' as branch; grunt> ee = union ee1 , ee2; grunt> grp = group ee by (dno, branch); grunt> res = foreach grp generate >> group.dno as dno, group.branch as branch, >> SUM(ee.sal) as tot; grunt> describe res res: {dno: int,branch: chararray,tot: long} grunt> dump res (11,branch1,90000) (11,branch2,90000) (12,branch1,150000) (12,branch2,130000) (13,branch1,170000) (13,branch2,100000) (14,branch2,60000) --------------------------------- using Cogroup , multiple aggregations seperately for each dataset. grunt> describe e1 e1: {dno: int,sal: int} grunt> describe e2; e2: {dno: int,sal: int} grunt> cg = cogroup e1 by dno, e2 by dno; grunt> res = foreach cg generate >> group as dno, >> SUM(e1.sal) as tot1, >> SUM(e2.sal) as tot2, >> COUNT(e1) as cnt1, >> COUNT(e2) as cnt2, >> MAX(e1.sal) as max1, >> MAX(e2.sal) as max2; grunt> describe res; res: {dno: int,tot1: long,tot2: long,cnt1: long,cnt2: long,max1: int,max2: int} grunt> dump res (11,90000,90000,2,1,50000,90000) (12,150000,130000,4,2,50000,80000) (13,170000,100000,2,1,90000,100000) (14,,60000,0,2,,30000)
------------------------------ Entire Column aggregations using CoGroup.
s1 = foreach emp1 generate sal; s2 = foreach emp2 generate sal; g = cogroup s1 all, s2 all ; r = foreach g generate SUM(s1.sal) as tot1, SUM(s2.sal) as tot2; r = foreach r generate * , tot1+tot2 as tot; dump r (410000,380000,790000)