Pig : CoGroup examples Vs Union Examples

-- co groupinggrunt> cat piglab/emp101,aaaa,40000,m,11102,bbbbbb,50000,f,12103,cccc,50000,m,12104,dd,90000,f,13105,ee,10000,m,12106,dkd,40000,m,12107,sdkfj,80000,f,13108,iiii,50000,m,11grunt>[cloudera@quickstart ~]$ cat > emp2201,Ravi,80000,m,12203,Varuna,100000,f,13204,Vanila,50000,f,12205,Mani,30000,m,14206,Manisha,30000,f,14[cloudera@quickstart ~]$ hadoop fs -copyFromLocal emp2 piglab[cloudera@quickstart ~]$ sql: select dno, sum(sal) from ( select dno, sal from emp1 union all select dno, sal from emp2 ) e group by dno;runt> emp1 = load 'piglab/emp' using PigStorage(',') as (id:int, name:chararray, sal:int, sex:chararray, dno:int);grunt> emp2 = load 'piglab/emp2' using PigStorage(',') as (id:int, name:chararray, sal:int, sex:chararray, dno:int);grunt> describe emp1emp1: {id: int,name: chararray,sal: int,sex:chararray,dno: int}grunt> describe emp2emp2: {id: int,name: chararray,sal: int,sex:chararray,dno: int}grunt> e1 = foreach emp1 generate dno, sal;grunt> e2 = foreach emp2 generate dno, sal;grunt> e = union e1, e2;grunt> grp = group e by dno;grunt> res = foreach grp generate group as dno, SUM(e.sal) as tot;grunt> dump res(11,180000)(12,280000)(13,270000)(14,60000) -- in above output, we missed clarity, total salary for branch1 and brach2 above is the aggregation of all branches. -- we want, seperate total salary for branch 1 and seperate for branch2. -- for this cogroup is used. -- cogroup will construct seperate innerbags for each relation(dataset). so that seperate aggregations, we can apply.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2e2: {dno: int,sal: int}grunt> cg = cogroup e1 by dno, e2 by dno;grunt> describe cgcg: {group: int,e1: {(dno: int,sal: int)},e2:{(dno: int,sal: int)}}grunt> dump cg -- when cogroup is applied it returns n+1 fields . where n is number of input relations (datasets). 
first one is group 2nd onwards bags.(11,{(11,50000),(11,40000)},{(11,90000)})(12,{(12,40000),(12,10000),(12,50000),(12,50000)},{(12,50000),(12,80000)})(13,{(13,80000),(13,90000)},{(13,100000)})(14,{},{(14,30000),(14,30000)})grunt> res = foreach cg generate group as dno , SUM(e1.sal) as tot1, SUM(e2.sal) as tot2;grunt> describe resres: {dno: int,tot1: long,tot2: long}(11,90000,90000)(12,150000,130000)(13,170000,100000)(14,,60000)--------------------------------- how to perform, seperate aggregations on each dataset with out cogrouping.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2e2: {dno: int,sal: int}grunt> ee1 = foreach e1 generate , 'branch1' asbranch;grunt> ee2 = foreach e2 generate , 'branch2' asbranch;grunt> ee = union ee1 , ee2;grunt> grp = group ee by (dno, branch);grunt> res = foreach grp generate>> group.dno as dno, group.branch asbranch,>> SUM(ee.sal) as tot;grunt> describe resres: {dno: int,branch: chararray,tot: long}grunt> dump res(11,branch1,90000)(11,branch2,90000)(12,branch1,150000)(12,branch2,130000)(13,branch1,170000)(13,branch2,100000)(14,branch2,60000)--------------------------------- using Cogroup , multiple aggregations seperately for each dataset.grunt> describe e1e1: {dno: int,sal: int}grunt> describe e2;e2: {dno: int,sal: int}grunt> cg = cogroup e1 by dno, e2 by dno;grunt> res = foreach cg generate>> group as dno,>> SUM(e1.sal) as tot1,>> SUM(e2.sal) as tot2,>> COUNT(e1) as cnt1,>> COUNT(e2) as cnt2,>> MAX(e1.sal) as max1,>> MAX(e2.sal) as max2;grunt> describe res;res: {dno: int,tot1: long,tot2: long,cnt1:long,cnt2: long,max1: int,max2: int}grunt> dump res(11,90000,90000,2,1,50000,90000)(12,150000,130000,4,2,50000,80000)(13,170000,100000,2,1,90000,100000)(14,,60000,0,2,,30000)------------------------------ Entire Column aggregations using CoGroup. 
s1 = foreach emp1 generate sal; s2 = foreach emp2 generate sal; g = cogroup s1 all, s2 all ; r = foreach g generate SUM(s1.sal) as tot1, SUM(s2.sal) as tot2; r = foreach r generate * , tot1+tot2 as tot;dump r(410000,380000,790000)

nikhil

nikhil

Dec 3, 2024 - 16:46

0 1

-- co grouping
grunt> cat piglab/emp
101,aaaa,40000,m,11
102,bbbbbb,50000,f,12
103,cccc,50000,m,12
104,dd,90000,f,13
105,ee,10000,m,12
106,dkd,40000,m,12
107,sdkfj,80000,f,13
108,iiii,50000,m,11
grunt>
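-- NOTE(review): the listing below has no row for id 202, but the outputs further down
-- include an emp2 record with sal 90000 in dno 11 (and an emp2 total of 380000).
-- A row of the form 202,<name>,90000,<sex>,11 appears to have been dropped here.
[cloudera@quickstart ~]$ cat > emp2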
201,Ravi,80000,m,12
203,Varuna,100000,f,13
204,Vanila,50000,f,12
205,Mani,30000,m,14
206,Manisha,30000,f,14
[cloudera@quickstart ~]$ hadoop fs -copyFromLocal emp2 piglab
[cloudera@quickstart ~]$
sql:
select dno, sum(sal) from (
select dno, sal from emp1
union all
select dno, sal from emp2
) e group by dno;
grunt> emp1 = load 'piglab/emp' using
PigStorage(',')
as (id:int, name:chararray, sal:int,
sex:chararray,
dno:int);
grunt> emp2 = load 'piglab/emp2' using
PigStorage(',')
as (id:int, name:chararray, sal:int,
sex:chararray,
dno:int);
grunt> describe emp1
emp1: {id: int,name: chararray,sal: int,sex:
chararray,dno: int}
grunt> describe emp2
emp2: {id: int,name: chararray,sal: int,sex:
chararray,dno: int}
grunt> e1 = foreach emp1 generate dno, sal;
grunt> e2 = foreach emp2 generate dno, sal;
grunt> e = union e1, e2;
grunt> grp = group e by dno;
grunt> res = foreach grp generate group as dno,
SUM(e.sal) as tot;
grunt> dump res
(11,180000)
(12,280000)
(13,270000)
(14,60000)
-- The output above lacks clarity:
-- each total combines the salaries of branch1 and branch2
-- into a single aggregate per department.
-- We want a separate total salary for branch1 and
-- a separate total salary for branch2.
-- For this, cogroup is used.
-- cogroup constructs a separate
-- inner bag for each relation (dataset),
-- so that we can apply separate aggregations to each.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2
e2: {dno: int,sal: int}
grunt> cg = cogroup e1 by dno, e2 by dno;
grunt> describe cg
cg: {group: int,e1: {(dno: int,sal: int)},e2:
{(dno: int,sal: int)}}
grunt> dump cg
-- When cogroup is applied, it returns
-- n+1 fields,
-- where n is the number of input relations
-- (datasets).
-- The first field is the group key;
-- the second field onwards are bags, one per relation.
(11,{(11,50000),(11,40000)},{(11,90000)})
(12,{(12,40000),(12,10000),(12,50000),(12,50000)},{(12,50000),(12,80000)})
(13,{(13,80000),(13,90000)},{(13,100000)})
(14,{},{(14,30000),(14,30000)})
grunt> res = foreach cg generate
group as dno ,
SUM(e1.sal) as tot1,
SUM(e2.sal) as tot2;
grunt> describe res
res: {dno: int,tot1: long,tot2: long}
grunt> dump res
(11,90000,90000)
(12,150000,130000)
(13,170000,100000)
(14,,60000)
---------------------------------
-- How to perform
-- separate aggregations on each dataset
-- without cogrouping.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2
e2: {dno: int,sal: int}
grunt> ee1 = foreach e1 generate *, 'branch1' as branch;
grunt> ee2 = foreach e2 generate *, 'branch2' as branch;
grunt> ee = union ee1 , ee2;
grunt> grp = group ee by (dno, branch);
grunt> res = foreach grp generate
>> group.dno as dno, group.branch as branch,
>> SUM(ee.sal) as tot;
grunt> describe res
res: {dno: int,branch: chararray,tot: long}
grunt> dump res
(11,branch1,90000)
(11,branch2,90000)
(12,branch1,150000)
(12,branch2,130000)
(13,branch1,170000)
(13,branch2,100000)
(14,branch2,60000)
---------------------------------
-- Using cogroup: multiple aggregations,
-- computed separately for each dataset.
grunt> describe e1
e1: {dno: int,sal: int}
grunt> describe e2;
e2: {dno: int,sal: int}
grunt> cg = cogroup e1 by dno, e2 by dno;
grunt> res = foreach cg generate
>> group as dno,
>> SUM(e1.sal) as tot1,
>> SUM(e2.sal) as tot2,
>> COUNT(e1) as cnt1,
>> COUNT(e2) as cnt2,
>> MAX(e1.sal) as max1,
>> MAX(e2.sal) as max2;
grunt> describe res;
res: {dno: int,tot1: long,tot2: long,cnt1:
long,cnt2: long,max1: int,max2: int}
grunt> dump res
(11,90000,90000,2,1,50000,90000)
(12,150000,130000,4,2,50000,80000)
(13,170000,100000,2,1,90000,100000)
(14,,60000,0,2,,30000)

------------------------------
Entire Column aggregations using CoGroup.

s1 = foreach emp1 generate sal;
s2 = foreach emp2 generate sal;
g = cogroup s1 all, s2 all ;
r = foreach g generate
SUM(s1.sal) as tot1,
SUM(s2.sal) as tot2;
r = foreach r generate * , tot1+tot2 as tot;
dump r
(410000,380000,790000)

Previous Article

Don't Blame Being Fat on Aging

Have Funds But Less Travel for Baby Boomers

What's Your Reaction?

0

Like

0

Dislike

0

Love

0

Funny

0

Angry

0

Sad

0

Wow

Related Posts

PCEP™ Certified Entry-Level Python Programmer (PCEP-30-...

nikhil Dec 3, 2024 0 0

Using Schema Registry for Data Quality in Apache Kafka

nikhil Dec 3, 2024 0 1

Increasing Hadoop Storage Scale by 4x!

nikhil Dec 3, 2024 0 0

HBase and data locality

nikhil Dec 3, 2024 0 0

What is Namenode in Hadoop? Key Functions, Handling Dat...

nikhil Dec 3, 2024 0 1

Hadoop Data Warehouse

nikhil Dec 3, 2024 0 0

Comments

This site uses cookies. By continuing to browse the site you are agreeing to our use of cookies.

ca-pub-4239506253673884