| 30 | |
| 31 | |
| 32 | {{{ |
| 33 | cd /opt/crawlzilla/nutch |
| 34 | |
| 35 | }}} |
| 36 | |
| 37 | === linkdb === |
| 38 | * linkdb tw_yahoo_com_6/linkdb |
| 39 | |
| 40 | {{{ |
| 41 | #!java |
| 42 | Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) |
| 43 | }}} |
| 44 | |
| 45 | {{{ |
| 46 | $ /opt/crawlzilla/nutch/bin/nutch invertlinks /user/crawler/cw_yahoo_5/linkdb -dir /user/crawler/cw_yahoo_5/segments/ |
| 47 | }}} |
| 48 | |
| 49 | === index-lucene === |
| 50 | |
| 51 | * index-lucene tw_yahoo_com_6/indexes |
| 52 | |
| 53 | {{{ |
| 54 | #!java |
| 55 | Usage: Indexer <index> <crawldb> <linkdb> <segment> ... |
| 56 | }}} |
| 57 | |
| 58 | {{{ |
| 59 | $ /opt/crawlzilla/nutch/bin/nutch index /user/crawler/cw_yahoo_5/index /user/crawler/cw_yahoo_5/crawldb /user/crawler/cw_yahoo_5/linkdb /user/crawler/cw_yahoo_5/segments/20101027234843 /user/crawler/cw_yahoo_5/segments/20101027234956 /user/crawler/cw_yahoo_5/segments/20101027235315 /user/crawler/cw_yahoo_5/segments/20101028000804 /user/crawler/cw_yahoo_5/segments/20101028002826 |
| 60 | }}} |
| 61 | |
| 62 | |
| 63 | === dedup === |
| 64 | |
| 65 | * dedup 1: urls by time 100.00% |
| 66 | * dedup 2: content by hash 100.00% |
| 67 | * dedup 3: delete from index(es) |
| 68 | {{{ |
| 69 | $ /opt/crawlzilla/nutch/bin/nutch dedup /user/crawler/cw_yahoo_5/index |
| 70 | }}} |