wiki:jazz/13-08-14

Version 9 (modified by jazz, 11 years ago) (diff)

--

2013-08-14


-- Register the PiggyBank jar and bind short names to the two PiggyBank
-- classes used by this script.
-- NOTE(review): the jar path points into one developer's home directory;
-- adjust it for the local checkout.
register /home/dvryaboy/src/pig/trunk/piggybank.jar;
DEFINE LogLoader
org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader();
DEFINE DayExtractor
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');

-- Default values for the script parameters; override with -param on the
-- pig command line.
%default LOGS 'access_log.small'
%default GEO 'GeoLiteCity.dat'
-- Example invocation (Pig comments start with "--", not "#"):
-- pig -x mapreduce -f scripts/blogparse.pig -param LOGS='/mirror.cloudera.com/logs/access_log.*'

#!/usr/bin/env perl
# Streaming GeoIP annotator.
#
# Reads tab-separated lines, treats the text before the first tab as an IP
# address, looks it up in a GeoIP city database and prints the line again
# with four extra leading columns:
#
#   country_code, country_name, region, city, ip, <rest of the input line>
#
# Usage: <this script> GEO_DATABASE [FILE ...]
#   GEO_DATABASE  path handed to Geo::IP::PurePerl->new
#   FILE ...      optional inputs; standard input is read when none are given
use warnings;
use strict;
use Geo::IP::PurePerl;

# The first argument is the database path. Fail early with a usage message
# instead of handing an undefined path to the GeoIP constructor.
my $path = shift @ARGV;
die "usage: $0 GEO_DATABASE [FILE ...]\n"
    unless defined $path && length $path;

my $gi = Geo::IP::PurePerl->new($path);

while (my $line = <>) {
    chomp $line;

    # Split on the first tab only, so the remainder of the record is passed
    # through untouched. Lines that contain no tab at all are skipped.
    my ($ip, $rest) = split /\t/, $line, 2;
    next unless defined $rest;

    # Only elements 0, 2, 3 and 4 of the returned record are used; the
    # second element is discarded.
    my ($country_code, undef, $country_name, $region, $city)
        = $gi->get_city_record($ip);

    # Missing lookup values become empty columns so the column count of
    # every output line stays the same.
    print join("\t",
        $country_code || '', $country_name || '',
        $region       || '', $city         || '',
        $ip, $rest), "\n";
}


#!/usr/bin/env bash
# Wrapper around geostream.pl.
#
# Unpacks geo-pack.tgz into the current directory, then runs ./geostream.pl
# with the current directory appended to PERL5LIB.
#
# Usage: <this script> GEO_DATABASE
#   GEO_DATABASE  passed through unchanged as the first argument of
#                 geostream.pl

if [ "$#" -lt 1 ]; then
    echo "usage: $0 GEO_DATABASE" >&2
    exit 2
fi

# Stop here if the archive cannot be unpacked; running geostream.pl
# afterwards would only fail with a less helpful error.
tar -xzf geo-pack.tgz || exit 1

# Quote every expansion so paths containing spaces survive, and only emit
# the ':' separator when PERL5LIB already has a value.
PERL5LIB="${PERL5LIB:+$PERL5LIB:}$(pwd)" ./geostream.pl "$1"

  -- Declare the streaming command "iplookup": it runs ipwrapper.sh with the
  -- GEO parameter as its argument. The ship clause lists ipwrapper.sh and the
  -- cache clause lists /home/dvryaboy/tmp/$GEO under the name $GEO.
  -- NOTE(review): ipwrapper.sh also expects geo-pack.tgz and geostream.pl in
  -- its working directory; neither is listed here -- confirm how they reach
  -- the task nodes.
  • DEFINE iplookup `ipwrapper.sh $GEO`
    ship ('ipwrapper.sh')
    cache('/home/dvryaboy/tmp/$GEO#$GEO');

  -- Load the access logs named by the LOGS parameter through LogLoader and
  -- name the eleven fields of each record.
  • logs = LOAD '$LOGS' USING LogLoader as
    (remoteAddr, remoteLogname, user, time, method,
    uri, proto, status, bytes, referer, userAgent);

*


-- Drop records whose bytes field is '-' and keep only URIs that match
-- the pattern '/apache.*'.
logs = FILTER logs
       BY (bytes != '-') AND (uri matches '/apache.*');

-- Narrow every record to the five fields used from here on; the timestamp
-- is passed through DayExtractor and exposed as "day".
logs = FOREACH logs GENERATE
           remoteAddr,
           DayExtractor(time) AS day,
           uri,
           bytes,
           userAgent;

-- IsBotUA is not part of the PiggyBank at the time of writing; the authors
-- plan to contribute it soon.
notbots = FILTER logs
          BY (NOT org.apache.pig.piggybank.filtering.IsBotUA(userAgent));

*


-- Pipe every non-bot record through the "iplookup" streaming command.
-- Referring to the DEFINEd alias (instead of repeating the backtick command
-- inline) is what makes its ship/cache clauses take effect for this STREAM.
-- The command prepends four geo columns to each record; the field named
-- "time" here carries the DayExtractor output of the projected records.
with_country = STREAM notbots THROUGH iplookup
AS (country_code, country, state, city, ip, time, uri, bytes, userAgent);

-- One group per country code.
geo_uri_groups = GROUP with_country BY country_code;

-- For each country: number of requests and total bytes served.
geo_uri_group_counts = FOREACH geo_uri_groups GENERATE
group,
COUNT(with_country) AS cnt,
SUM(with_country.bytes) AS total_bytes;

-- Busiest countries first.
geo_uri_group_counts = ORDER geo_uri_group_counts BY cnt DESC;

STORE geo_uri_group_counts INTO 'by_country.tsv';