= 2013-08-14 = * http://blog.cloudera.com/blog/2009/06/analyzing-apache-logs-with-pig/ * {{{ #!html
-- Make the piggybank jar available, then bind short aliases to two of its classes.
REGISTER /home/dvryaboy/src/pig/trunk/piggybank.jar;

-- LogLoader: piggybank's CombinedLogLoader, constructed with no arguments.
DEFINE LogLoader org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader();

-- DayExtractor: piggybank's DateExtractor, constructed with the pattern 'yyyy-MM-dd'.
DEFINE DayExtractor org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');
}}} * {{{ #!html
-- Fallback values for the script parameters; a -param given on the pig
-- command line takes precedence over these.
-- LOGS: presumably the access-log input path or glob -- confirm against the LOAD statement.
-- GEO:  file name handed to ipwrapper.sh by the STREAM statement.
%default LOGS 'access_log.small'
%default GEO 'GeoLiteCity.dat'
}}} * {{{ # pig -x mapreduce -f scripts/blogparse.pig -param LOGS='/mirror.cloudera.com/logs/access_log.*' }}} * {{{ #!html
#!/usr/bin/env perl
# geostream.pl -- prefix tab-separated records with GeoIP city data.
#
# Usage: geostream.pl GEO_DATABASE < records
#
# The first tab-delimited field of every input line is looked up as an IP
# address in the database named by the first command-line argument.  Each
# line that contains a tab is echoed as
#     country_code, country_name, region, city, ip, <rest of the line>
# joined by tabs.  Lines without a tab produce no output.
use warnings;
use strict;
use Geo::IP::PurePerl;

# Take the database path off @ARGV so that <> below reads the remaining
# input (normally STDIN) instead of treating the database as an input file.
my $db_path = shift;
my $geoip   = Geo::IP::PurePerl->new($db_path);

# Positions of the get_city_record() result that are emitted:
# 0 country code, 2 country name, 3 region, 4 city.  Position 1 is unused.
my @wanted = (0, 2, 3, 4);

while (my $line = <>) {
    chomp $line;

    # A limit of 2 splits on the first tab only, keeping the rest intact.
    my ($ip, $rest) = split /\t/, $line, 2;
    next unless defined $rest;

    # Indexing (rather than slicing) keeps four output columns even when
    # the lookup returns nothing: missing values become empty strings.
    my @record = $geoip->get_city_record($ip);
    my @geo    = map { $record[$_] || '' } @wanted;

    print join("\t", @geo, $ip, $rest), "\n";
}
}}} * {{{ #!html
#!/usr/bin/env bash
# ipwrapper.sh -- unpack the GeoIP bundle and run geostream.pl.
#
# Usage: ipwrapper.sh GEO_DATABASE
#   $1  path of the GeoIP database, handed on to geostream.pl
#
# stdin and stdout are inherited by geostream.pl.

# Unpack the bundle into the current directory.
# NOTE(review): presumably this supplies the Geo::IP::PurePerl module (and
# possibly geostream.pl itself) -- confirm against the archive contents.
tar -xzf geo-pack.tgz

# Add the current directory to Perl's module search path for this one
# command.  "$1" is quoted so that a database path containing spaces or
# glob characters reaches geostream.pl as a single, unmodified argument.
PERL5LIB=$PERL5LIB:$(pwd) ./geostream.pl "$1"
}}} * {{{ #!html
-- Pipe every notbots record through ipwrapper.sh (given the GEO parameter)
-- and read the script's output back under the schema listed below.
with_country =
    STREAM notbots
    THROUGH `ipwrapper.sh $GEO`
    AS (country_code, country, state, city, ip, time, uri, bytes, userAgent);

-- Bucket the annotated records by country code.
geo_uri_groups = GROUP with_country BY country_code;

-- Per country code: the number of records and the sum of the bytes field.
geo_uri_group_counts =
    FOREACH geo_uri_groups
    GENERATE group, COUNT(with_country) AS cnt, SUM(with_country.bytes) AS total_bytes;

-- Highest record counts first.
geo_uri_group_counts = ORDER geo_uri_group_counts BY cnt DESC;

-- Write the ordered result out.
STORE geo_uri_group_counts INTO 'by_country.tsv';
}}}