= 2013-08-14 =
*
{{{
#!html
-- Make the piggybank jar available so the org.apache.pig.piggybank.* classes
-- named below can be resolved.
-- NOTE(review): this is an absolute path into one user's home directory;
-- adjust it for the machine the script runs on.
register /home/dvryaboy/src/pig/trunk/piggybank.jar;
-- Short alias for piggybank's CombinedLogLoader.
-- Presumably it parses Apache "combined" format access logs; confirm against
-- the piggybank documentation.
DEFINE LogLoader
org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader();
-- Alias for piggybank's DateExtractor constructed with the pattern
-- 'yyyy-MM-dd'.
-- Presumably the pattern is the output format (day granularity) -- TODO confirm.
DEFINE DayExtractor
org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');
}}}
*
{{{
#!html
-- Default values for the script parameters; each can be overridden when the
-- script is launched (e.g. with -param NAME=value).
-- LOGS: presumably the access log to load -- it is not referenced in this excerpt.
%default LOGS 'access_log.small'
-- GEO: substituted as $GEO into the streaming command line (the argument
-- handed to ipwrapper.sh).
%default GEO 'GeoLiteCity.dat'
}}}
*
{{{
#!html
#!/usr/bin/env perl
#
# Streaming geo-lookup filter.
#
# Usage: <this script> GEOIP_DATABASE [FILE ...]
#
# Reads tab-separated records from the remaining arguments (or STDIN when
# there are none).  The first field of each record is looked up as an IP
# address in the database given as the first argument, and the record is
# written to STDOUT prefixed with four extra tab-separated fields:
#
#     country_code  country_name  region  city  <ip>  <rest of the record>
#
# A lookup field that comes back empty/undefined is emitted as an empty
# string.  Lines that contain no tab at all are skipped.
use warnings;
use strict;
use Geo::IP::PurePerl;

# The database path is mandatory; fail early with a usage message instead of
# an "uninitialized value" warning followed by an opaque open failure.
my $path = shift;
die "usage: $0 GEOIP_DATABASE [FILE ...]\n"
    unless defined $path && length $path;

my $gi = Geo::IP::PurePerl->new($path);

# $path has been shifted off @ARGV, so <> reads the remaining file arguments
# or, when there are none, STDIN.
while (my $line = <>) {
    chomp $line;

    # Split on the first tab only: the IP, then everything after it verbatim.
    # With a limit of 2 an empty remainder is still returned (as ''), so
    # $rest is undefined only when the line has no tab at all.
    my ($ip, $rest) = split /\t/, $line, 2;
    next unless defined $rest;

    # Only the 1st, 3rd, 4th and 5th elements of the returned list are used;
    # the 2nd is discarded.
    my ($country_code, undef, $country_name, $region, $city)
        = $gi->get_city_record($ip);

    print join("\t",
        $country_code || '',
        $country_name || '',
        $region       || '',
        $city         || '',
        $ip, $rest), "\n";
}
}}}
*
{{{
#!html
-- Pipe every record of `notbots` (defined outside this excerpt) through the
-- external command `ipwrapper.sh`, passing the GEO parameter as its argument.
-- The AS clause names the columns read back from the command: four geo
-- fields followed by ip, time, uri, bytes and userAgent.
-- NOTE(review): no column types are declared, so `bytes` is untyped here;
-- confirm that SUM over it yields the numeric type you expect.
with_country = STREAM notbots THROUGH `ipwrapper.sh $GEO`
AS (country_code, country, state, city, ip, time, uri, bytes, userAgent);
-- One group per distinct country_code.
geo_uri_groups = GROUP with_country BY country_code;
-- Per group: the country code (`group`), the number of records and the sum
-- of the bytes column.
geo_uri_group_counts = FOREACH geo_uri_groups GENERATE
group,
COUNT(with_country) AS cnt,
SUM(with_country.bytes) AS total_bytes;
-- Largest record count first.
geo_uri_group_counts = ORDER geo_uri_group_counts BY cnt DESC;
-- Write the ordered result to 'by_country.tsv'.
STORE geo_uri_group_counts INTO 'by_country.tsv';
}}}