Pig : Cross Operator (Cartesian product)


 Cross:
 -----
   uses the Cartesian product.

   each element of left set, joins with each element of right set.


  ds1 --> (a)
          (b)
          (c)

  ds2 --> (1)
          (2)

  x = cross ds1, ds2

   (a,1)
   (a,2)
   (b,1)
   (b,2)
   (c,1)
   (c,2)


-- Load employee records (comma-delimited) from HDFS.
emp = load 'piglab/emp' using PigStorage(',')
    as (id:int, name:chararray, sal:int,
  sex:chararray, dno:int);

-- Project just the salary column.
sals = foreach emp generate sal;

-- Group all rows into a single bag so AVG runs over the whole relation.
grp = group sals all;

-- Compute the average salary once. The explicit 'as avg' guarantees the
-- field keeps its name after the int cast (a bare cast can drop the
-- alias, which would break the avgsal::avg reference below); it also
-- avoids reassigning the avgsal alias to itself.
avgsal = foreach grp generate (int)AVG(sals.sal) as avg;

-- Cross joins the single-row avgsal with every salary row, making the
-- average available next to each individual salary for comparison.
e = cross sals, avgsal;
e = foreach e generate sals::sal as sal, avgsal::avg as avg;

-- Label each employee relative to the average...
stats = foreach e generate
       (sal>=avg ? 'Above':'Below') as stat;
-- ...then count employees per label.
grp2 = group stats by stat;
res = foreach grp2 generate group as stat,
     COUNT(stats) as cnt;
dump res
(Above,2)
(Below,6)


--in the above task, cross is used to make avgsal available to each row of the employees,
  so that we can compare each sal with the avg.

--------------------------------------

2nd example:


[cloudera@quickstart ~]$ cat sales
01/01/2016,40000
01/03/2016,50000
01/25/2016,50000
02/01/2016,40000
02/03/2016,90000
02/25/2016,50000
03/01/2016,40000
03/03/2016,50000
04/25/2016,50000
05/01/2016,40000
05/03/2016,50000
06/25/2016,50000
06/01/2016,40000
06/03/2016,90000
06/25/2016,50000
07/01/2016,40000
07/03/2016,50000
07/25/2016,50000
08/01/2016,40000
09/03/2016,50000
09/25/2016,50000
10/01/2016,40000
10/03/2016,90000
10/25/2016,50000
10/01/2016,40000
11/03/2016,50000
12/25/2016,50000
12/01/2016,40000
12/03/2016,50000
12/25/2016,50000
12/01/2016,40000
12/03/2016,90000
12/25/2016,50000
12/01/2016,40000
12/03/2016,50000
12/25/2016,50000
[cloudera@quickstart ~]$

[cloudera@quickstart ~]$ hadoop fs -copyFromLocal sales piglab

Task: build a monthly sales report.

-- Load sales data: date (MM/DD/YYYY) and amount.
sales = load 'piglab/sales'
   using PigStorage(',')
    as (dt:chararray, amt:int);

-- Extract the first two characters of the date as the month.
sales2 = foreach sales generate
     SUBSTRING(dt,0,2) as mon, amt;
-- Cast month to int; the explicit 'as mon' keeps the field name so the
-- 'group s by mon' below still resolves even if a bare cast drops it.
s = foreach sales2 generate (int)mon as mon, amt;

-- Monthly totals.
grp = group s by mon;
mrep = foreach grp generate group as mon,
              SUM(s.amt) as tot;

--quarterly sales report:

-- Map month (1-12) to quarter (1-4) with nested bincond operators.
q = foreach mrep generate
       (mon <4 ? 1:
         (mon <7 ? 2:
           (mon <10 ? 3:4))) as qtr, tot;
qgrp = group q by qtr;
qrep = foreach qgrp generate
         group as qtr, SUM(q.tot) as tot;
dump qrep
(1,410000)
(2,370000)
(3,280000)
(4,780000)


-- Pig cannot cross a relation with itself, so materialize a copy first.
qcopy = foreach qrep generate *;

-- Pair every quarter with every other quarter, then flatten the
-- disambiguated fields into a fresh relation (no alias shadowing).
pairs = cross qrep, qcopy;
flatpairs = foreach pairs generate
      qrep::qtr as q1, qcopy::qtr as q2,
      qrep::tot as tot1, qcopy::tot as tot2;

-- Keep only consecutive quarters: q1 immediately follows q2.
consec = filter flatpairs by (q1-q2)==1;

-- Percentage growth of each quarter over the previous one.
-- (Integer arithmetic, so the percentage is truncated.)
rep = foreach consec generate *,
       ((tot1-tot2)*100)/tot2 as pgrowth;

dump rep;
--------------------------------

[cloudera@quickstart ~]$ cat matri
101,Amar,25,40000,hyd,m
102,Amala,23,50000,Del,f
103,Kiran,29,50000,hyd,m
104,Samantha,26,30000,hyd,f
105,Mani,30,70000,Del,m
106,Rakhul,24,40000,Del,f
107,Venu,34,100000,Pune,m
108,Ileana,29,200000,hyd,f

[cloudera@quickstart ~]$

hadoop fs -copyFromLocal matri piglab

-- Load matrimonial applicant records (comma-delimited).
applicants = load 'piglab/matri'
    using PigStorage(',')
    as (id:int, name:chararray,
       age:int, income:int, city:chararray,
        sex:chararray);

-- Split applicants by sex.
males = filter applicants by sex=='m';
fems = filter applicants by sex=='f';

-- Cross pairs every male with every female candidate.
mf = cross males, fems;

-- Flatten the disambiguated names/ages into plain fields.
mf = foreach mf generate males::name as mname,
           fems::name as fname,
         males::age as mage,
         fems::age as fage;

-- Match rule: male is older than the female, but by less than 4 years.
res1 = filter mf by (mage>fage and mage-fage<4);

-- Build the match list from each side's point of view...
mlist = foreach res1 generate
           mname as src , fname as trg,
           mage as srcage, fage as trgage;
flist = foreach res1 generate
          fname as src, mname as trg,
           fage as srcage, mage as trgage;

-- ...and combine both directions into one symmetric list.
list = union mlist, flist;




-- NOTE(review): 'list' is built above but never dumped; this dumps the
-- raw matches (res1). Presumably 'dump list;' was intended -- confirm.
dump res1;