Pig : CoGroup examples Vs Union Examples

-- co groupinggrunt> cat piglab/emp101,aaaa,40000,m,11102,bbbbbb,50000,f,12103,cccc,50000,m,12104,dd,90000,f,13105,ee,10000,m,12106,dkd,40000,m,12107,sdkfj,80000,f,13108,iiii,50000,m,11grunt>[cloudera@quickstart ~]$ cat > emp2201,Ravi,80000,m,12203,Varuna,100000,f,13204,Vanila,50000,f,12205,Mani,30000,m,14206,Manisha,30000,f,14[cloudera@quickstart ~]$ hadoop fs -copyFromLocal emp2 piglab[cloudera@quickstart ~]$ sql:   select dno, sum(sal) from (      select dno, sal from emp1              union all      select dno, sal from emp2       ) e group by dno;runt> emp1 = load 'piglab/emp' using      PigStorage(',')    as (id:int, name:chararray, sal:int,    sex:chararray,      dno:int);grunt> emp2 = load 'piglab/emp2' using PigStorage(',')    as (id:int, name:chararray, sal:int,    sex:chararray,     dno:int);grunt> describe emp1emp1: {id: int,name: chararray,sal: int,sex:chararray,dno: int}grunt> describe emp2emp2: {id: int,name: chararray,sal: int,sex:chararray,dno: int}grunt> e1 = foreach emp1 generate dno, sal;grunt> e2 = foreach emp2 generate dno, sal;grunt> e = union e1, e2;grunt> grp = group e by dno;grunt> res = foreach grp generate group as dno,               SUM(e.sal) as tot;grunt> dump res(11,180000)(12,280000)(13,270000)(14,60000) -- in above output,    we missed clarity,    total salary for branch1 and brach2    above is the aggregation of all branches. -- we want,   seperate total salary for branch 1 and   seperate for branch2. -- for this cogroup is used.  -- cogroup  will construct seperate     innerbags for each relation(dataset).   so that seperate aggregations, we can apply.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2e2: {dno: int,sal: int}grunt> cg = cogroup e1 by dno, e2 by dno;grunt> describe cgcg: {group: int,e1: {(dno: int,sal: int)},e2:{(dno: int,sal: int)}}grunt> dump cg -- when cogroup is applied it returns  n+1 fields .  where n is number of input relations   (datasets).   first one is group   2nd onwards bags.(11,{(11,50000),(11,40000)},{(11,90000)})(12,{(12,40000),(12,10000),(12,50000),(12,50000)},{(12,50000),(12,80000)})(13,{(13,80000),(13,90000)},{(13,100000)})(14,{},{(14,30000),(14,30000)})grunt> res = foreach cg generate        group as dno ,         SUM(e1.sal) as tot1,         SUM(e2.sal) as tot2;grunt> describe resres: {dno: int,tot1: long,tot2: long}(11,90000,90000)(12,150000,130000)(13,170000,100000)(14,,60000)---------------------------------  how to perform,    seperate aggregations on each dataset  with out cogrouping.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2e2: {dno: int,sal: int}grunt> ee1 = foreach e1 generate *, 'branch1' asbranch;grunt> ee2 = foreach e2 generate *, 'branch2' asbranch;grunt> ee = union ee1 , ee2;grunt> grp = group ee by (dno, branch);grunt> res = foreach grp generate>>         group.dno as dno, group.branch asbranch,>>              SUM(ee.sal) as tot;grunt> describe resres: {dno: int,branch: chararray,tot: long}grunt> dump res(11,branch1,90000)(11,branch2,90000)(12,branch1,150000)(12,branch2,130000)(13,branch1,170000)(13,branch2,100000)(14,branch2,60000)--------------------------------- using Cogroup , multiple aggregations  seperately for each dataset.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2;e2: {dno: int,sal: int}grunt> cg = cogroup e1 by dno, e2 by dno;grunt> res = foreach cg generate>>     group as dno,>>     SUM(e1.sal) as tot1,>>     SUM(e2.sal) as tot2,>>     COUNT(e1) as cnt1,>>     COUNT(e2) as cnt2,>>     MAX(e1.sal) as max1,>>     MAX(e2.sal) as max2;grunt> describe res;res: {dno: int,tot1: long,tot2: long,cnt1:long,cnt2: long,max1: int,max2: int}grunt> dump res(11,90000,90000,2,1,50000,90000)(12,150000,130000,4,2,50000,80000)(13,170000,100000,2,1,90000,100000)(14,,60000,0,2,,30000)------------------------------   Entire Column aggregations using CoGroup. s1 = foreach emp1 generate sal; s2 = foreach emp2 generate sal; g = cogroup s1  all, s2 all ; r = foreach g generate      SUM(s1.sal) as tot1,       SUM(s2.sal) as tot2; r = foreach r generate * , tot1+tot2 as tot;dump r(410000,380000,790000)        


-- co grouping
grunt> cat piglab/emp
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
grunt>
[cloudera@quickstart ~]$ cat > emp2
201,Ravi,80000,m,12
203,Varuna,100000,f,13
204,Vanila,50000,f,12
205,Mani,30000,m,14
206,Manisha,30000,f,14
[cloudera@quickstart ~]$ hadoop fs -
copyFromLocal emp2 piglab
[cloudera@quickstart ~]$
 sql:
   select dno, sum(sal) from (
      select dno, sal from emp1
              union all
      select dno, sal from emp2
       ) e group by dno;
runt> emp1 = load 'piglab/emp' using      
PigStorage(',')
    as (id:int, name:chararray, sal:int,    
sex:chararray,
      dno:int);
grunt> emp2 = load 'piglab/emp2' using
PigStorage(',')
    as (id:int, name:chararray, sal:int,    
sex:chararray,
     dno:int);
grunt> describe emp1
emp1: {id: int,name: chararray,sal: int,sex:
chararray,dno: int}
grunt> describe emp2
emp2: {id: int,name: chararray,sal: int,sex:
chararray,dno: int}
grunt> e1 = foreach emp1 generate dno, sal;
grunt> e2 = foreach emp2 generate dno, sal;
grunt> e = union e1, e2;
grunt> grp = group e by dno;
grunt> res = foreach grp generate group as dno,
               SUM(e.sal) as tot;
grunt> dump res
(11,180000)
(12,280000)
(13,270000)
(14,60000)
 -- in above output,
    we missed clarity,
    total salary for branch1 and brach2
    above is the aggregation of all branches.
 -- we want,
   seperate total salary for branch 1 and
   seperate for branch2.
 -- for this cogroup is used.
  -- cogroup  will construct seperate
     innerbags for each relation(dataset).
   so that seperate aggregations, we can apply.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2
e2: {dno: int,sal: int}
grunt> cg = cogroup e1 by dno, e2 by dno;
grunt> describe cg
cg: {group: int,e1: {(dno: int,sal: int)},e2:
{(dno: int,sal: int)}}
grunt> dump cg
 -- when cogroup is applied it returns
  n+1 fields .
  where n is number of input relations
   (datasets).
   first one is group
   2nd onwards bags.
(11,{(11,50000),(11,40000)},{(11,90000)})
(12,{(12,40000),(12,10000),(12,50000),
(12,50000)},{(12,50000),(12,80000)})
(13,{(13,80000),(13,90000)},{(13,100000)})
(14,{},{(14,30000),(14,30000)})
grunt> res = foreach cg generate
        group as dno ,
         SUM(e1.sal) as tot1,
         SUM(e2.sal) as tot2;
grunt> describe res
res: {dno: int,tot1: long,tot2: long}
(11,90000,90000)
(12,150000,130000)
(13,170000,100000)
(14,,60000)
---------------------------------
  how to perform,
    seperate aggregations on each dataset
  with out cogrouping.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2
e2: {dno: int,sal: int}
grunt> ee1 = foreach e1 generate *, 'branch1' as
branch;
grunt> ee2 = foreach e2 generate *, 'branch2' as
branch;
grunt> ee = union ee1 , ee2;
grunt> grp = group ee by (dno, branch);
grunt> res = foreach grp generate
>>         group.dno as dno, group.branch as
branch,
>>              SUM(ee.sal) as tot;
grunt> describe res
res: {dno: int,branch: chararray,tot: long}
grunt> dump res
(11,branch1,90000)
(11,branch2,90000)
(12,branch1,150000)
(12,branch2,130000)
(13,branch1,170000)
(13,branch2,100000)
(14,branch2,60000)
---------------------------------
 using Cogroup , multiple aggregations
  seperately for each dataset.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2;
e2: {dno: int,sal: int}
grunt> cg = cogroup e1 by dno, e2 by dno;
grunt> res = foreach cg generate
>>     group as dno,
>>     SUM(e1.sal) as tot1,
>>     SUM(e2.sal) as tot2,
>>     COUNT(e1) as cnt1,
>>     COUNT(e2) as cnt2,
>>     MAX(e1.sal) as max1,
>>     MAX(e2.sal) as max2;
grunt> describe res;
res: {dno: int,tot1: long,tot2: long,cnt1:
long,cnt2: long,max1: int,max2: int}
grunt> dump res
(11,90000,90000,2,1,50000,90000)
(12,150000,130000,4,2,50000,80000)
(13,170000,100000,2,1,90000,100000)
(14,,60000,0,2,,30000)


------------------------------
   Entire Column aggregations using CoGroup.

 s1 = foreach emp1 generate sal;
 s2 = foreach emp2 generate sal;
 g = cogroup s1  all, s2 all ;
 r = foreach g generate
      SUM(s1.sal) as tot1,
       SUM(s2.sal) as tot2;
 r = foreach r generate * , tot1+tot2 as tot;
dump r
(410000,380000,790000)