wiki:jazz/13-08-14

Version 3 (modified by jazz, 11 years ago) (diff)

--

2013-08-14


  • register /home/dvryaboy/src/pig/trunk/piggybank.jar;
    -- The statement above registers the piggybank jar from a developer-specific
    -- absolute path; point it at the local piggybank.jar before running.

    -- LogLoader: short alias for piggybank's CombinedLogLoader (no constructor
    -- arguments).
    DEFINE LogLoader
    org.apache.pig.piggybank.storage.apachelog.CombinedLogLoader();
    -- DayExtractor: alias for piggybank's DateExtractor, constructed with the
    -- pattern 'yyyy-MM-dd'.
    DEFINE DayExtractor
    org.apache.pig.piggybank.evaluation.util.apachelogparser.DateExtractor('yyyy-MM-dd');
    *
    -- Fallback values for the LOGS and GEO parameters, used when they are not
    -- supplied on the command line or in a parameter file.
    %default LOGS 'access_log.small'
    %default GEO 'GeoLiteCity.dat'

  • #!/usr/bin/env perl
    use warnings;
    use strict;
    use Geo::IP::PurePerl;

    # Streaming filter: reads tab-separated records whose first field is an IP
    # address and writes each record back out prefixed with four geo columns
    # (country code, country name, region, city) looked up in a GeoIP city
    # database.
    #
    # Usage: <this script> /path/to/GeoLiteCity.dat [input-file ...]
    # Records are read from the remaining arguments, or from STDIN when there
    # are none.

    # Require the database path explicitly.
    # NOTE(review): with an undefined path Geo::IP::PurePerl appears to fall
    # back to a built-in default database location, which would silently
    # produce empty geo columns -- confirm against the module documentation.
    my $path = shift;
    die "Usage: $0 <geoip-city-database> [input-file ...]\n"
        unless defined $path && length $path;

    my $gi = Geo::IP::PurePerl->new($path)
        or die "Cannot open GeoIP database '$path'\n";

    while (my $line = <>) {
        chomp $line;

        # Split off the first tab-delimited field (the IP). Lines that contain
        # no tab are skipped and produce no output.
        next unless $line =~ /^([^\t]*)\t(.*)/;
        my ($ip, $rest) = ($1, $2);

        # The second element of the city record is discarded.
        my ($country_code, undef, $country_name, $region, $city)
            = $gi->get_city_record($ip);

        # Failed lookups become empty columns so every output row has the same
        # width; test defined-ness so a literal "0" value is not blanked.
        my @geo = map { defined $_ ? $_ : '' }
            ($country_code, $country_name, $region, $city);

        print join("\t", @geo, $ip, $rest), "\n";
    }


    *
    -- Pipe the notbots relation (defined outside this excerpt) through the
    -- external command `ipwrapper.sh`, passing the GEO parameter as its
    -- argument, and name the nine fields of each returned record. No field
    -- types are declared in the AS clause.
    -- NOTE(review): presumably ipwrapper.sh wraps the Perl geo-lookup script,
    -- whose output column order matches this schema -- confirm.
    with_country = STREAM notbots THROUGH `ipwrapper.sh $GEO`
    AS (country_code, country, state, city, ip, time, uri, bytes, userAgent);

    -- One group per distinct country_code.
    geo_uri_groups = GROUP with_country BY country_code;

    -- Per country code: the group key, the number of records (cnt) and the
    -- sum of the bytes field (total_bytes).
    geo_uri_group_counts = FOREACH geo_uri_groups GENERATE
    group,
    COUNT(with_country) AS cnt,
    SUM(with_country.bytes) AS total_bytes;

    -- Re-bind the same alias to its rows sorted by cnt, largest first.
    geo_uri_group_counts = ORDER geo_uri_group_counts BY cnt DESC;

    -- Write the sorted per-country totals to 'by_country.tsv'.
    STORE geo_uri_group_counts INTO 'by_country.tsv';