2013-08-14
-- PiggyBank supplies the log loader and the date extractor aliased here.
-- NOTE(review): this jar path is specific to one developer's machine; point it
-- at your own PiggyBank build.
register /home/dvryaboy/src/pig/trunk/piggybank.jar;

-- Short alias for the PiggyBank CombinedLogLoader.
DEFINE LogLoader
    org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader();

-- DateExtractor constructed with 'yyyy-MM-dd'
-- (presumably the format of the value it returns -- confirm against PiggyBank).
DEFINE DayExtractor
    org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');

-- Default parameter values; override them with -param on the command line.
%default LOGS 'access_log.small'
%default GEO 'GeoLiteCity.dat'

-- Example invocation (Pig comments start with "--"; a leading "#" is not a comment):
--   pig -x mapreduce -f scripts/blogparse.pig -param LOGS='/mirror.cloudera.com/logs/access_log.*'
#!/usr/bin/env perl
# Stream filter: prepend geo-lookup columns to tab-separated records.
#
# Usage: geostream.pl [DB_PATH] [FILE ...]
#   DB_PATH is passed to Geo::IP::PurePerl->new; records are then read from
#   the remaining arguments, or from STDIN when there are none.
#
# Each input line is split at its first tab into an IP address and "the
# rest". The output line is: four columns taken from the city record
# (named country code, country name, region and city in the original
# code), then the IP, then the rest, all tab-separated.
use strict;
use warnings;
use Geo::IP::PurePerl;

my $db_path = shift;
my $geo_db  = Geo::IP::PurePerl->new($db_path);

LINE:
while ( my $line = <> ) {
    chomp $line;

    # Lines that contain no tab are dropped without any output.
    next LINE unless $line =~ /([^\t]*)\t(.*)/;
    my $ip        = $1;
    my $remainder = $2;

    # Keep elements 0, 2, 3 and 4 of the returned record; element 1 is unused.
    # An array slice (not a list slice) is used so that an empty record still
    # yields four columns.
    my @record  = $geo_db->get_city_record($ip);
    my @columns = map { $_ || '' } @record[ 0, 2, 3, 4 ];

    print join( "\t", @columns, $ip, $remainder ), "\n";
}
#!/usr/bin/env bash
# Wrapper around geostream.pl (presumably the ipwrapper.sh that the Pig job
# streams through -- confirm).
#
# Usage: <this script> [GEO_DB_PATH]
#   The optional first argument is forwarded to geostream.pl; stdin and
#   stdout are inherited by it.

# Unpack the bundled archive into the current directory.
# NOTE(review): presumably this provides the Perl modules geostream.pl
# needs -- confirm the archive contents.
tar -xzf geo-pack.tgz

# Put the current directory on Perl's module search path and run the filter.
#  - ${PERL5LIB:+...} avoids an empty leading entry when PERL5LIB is unset.
#  - The quotes keep a directory or database path containing spaces or glob
#    characters intact.
#  - ${1:+"$1"} still passes no argument at all when none was given.
PERL5LIB="${PERL5LIB:+$PERL5LIB:}$(pwd)" ./geostream.pl ${1:+"$1"}
-
-- Streaming command alias for the geo lookup: runs ipwrapper.sh with the GEO
-- parameter as its only argument.
--   SHIP  sends the local wrapper script along with the job.
--   CACHE names a distributed-cache file; the part after '#' is the name the
--         command sees it under.
-- NOTE(review): the wrapper unpacks geo-pack.tgz and runs ./geostream.pl, and
-- neither file is listed here -- confirm how they reach the task nodes.
DEFINE iplookup `ipwrapper.sh $GEO`
    SHIP('ipwrapper.sh')
    CACHE('/home/dvryaboy/tmp/$GEO#$GEO');
-
-- Load the access logs named by the LOGS parameter through the LogLoader
-- alias, naming the eleven fields it yields.
logs = LOAD '$LOGS' USING LogLoader AS (
    remoteAddr,
    remoteLogname,
    user,
    time,
    method,
    uri,
    proto,
    status,
    bytes,
    referer,
    userAgent
);
*
-- Keep only records whose byte count is not '-' and whose URI starts with /apache.
logs = FILTER logs BY bytes != '-' AND uri matches '/apache.*';

-- Project just the columns we will need.
logs = FOREACH logs GENERATE
    remoteAddr,
    DayExtractor(time) AS day,
    uri,
    bytes,
    userAgent;

-- The filtering function is not actually in the PiggyBank.
-- We plan on contributing it soon.
notbots = FILTER logs BY (
    NOT org.apache.pig.piggybank.filtering.IsBotUA(userAgent)
);
*
-- Geo-tag every non-bot record by streaming it through the iplookup command.
-- Using the DEFINEd alias (rather than repeating the raw backtick command)
-- is what makes that definition's SHIP and CACHE clauses apply to this job.
-- The command emits four geo columns, then the IP, then the remaining input
-- columns; the field named "time" here lines up with the "day" column of the
-- input relation.
with_country = STREAM notbots THROUGH iplookup
    AS (country_code, country, state, city, ip, time, uri, bytes, userAgent);

-- One group per country code.
geo_uri_groups = GROUP with_country BY country_code;

-- Per country: number of records and total bytes.
geo_uri_group_counts = FOREACH geo_uri_groups GENERATE
    group,
    COUNT(with_country) AS cnt,
    SUM(with_country.bytes) AS total_bytes;

-- Countries with the most records first.
geo_uri_group_counts = ORDER geo_uri_group_counts BY cnt DESC;

STORE geo_uri_group_counts INTO 'by_country.tsv';