{{{
#!text
Merging 18 segments to /user/crawler/newcrawl_3/segments/20110725175250
SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/content
SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/crawl_fetch
SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/crawl_generate
SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/crawl_parse
SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/parse_data
SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/parse_text
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/content
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/crawl_fetch
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/crawl_generate
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/crawl_parse
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/parse_data
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/parse_text
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/content
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/crawl_fetch
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/crawl_generate
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/crawl_parse
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/parse_data
SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/parse_text
SegmentMerger: using segment data from:
Exception in thread "main" java.io.IOException: No input paths specified in job
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:152)
    at org.apache.hadoop.mapred.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:44)
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:201)
    at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810)
    at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781)
    at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730)
    at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249)
    at org.apache.nutch.segment.SegmentMerger.merge(SegmentMerger.java:638)
    at org.apache.nutch.segment.SegmentMerger.main(SegmentMerger.java:683)
Update segments
LinkDb: starting at 2011-07-25 17:52:55
LinkDb: linkdb: /user/crawler/newcrawl_3/linkdb
LinkDb: URL normalize: true
LinkDb: URL filter: true
LinkDb: java.io.IOException: No input paths specified in job
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:152)
    at org.apache.hadoop.mapred.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:44)
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:201)
    at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810)
    at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781)
    at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730)
    at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249)
    at org.apache.nutch.crawl.LinkDb.invert(LinkDb.java:175)
    at org.apache.nutch.crawl.LinkDb.run(LinkDb.java:292)
    at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
    at org.apache.nutch.crawl.LinkDb.main(LinkDb.java:255)

Index segments
ls: Cannot access /user/crawler/newcrawl_3/segments/*: No such file or directory.
[check] /opt/crawlzilla/nutch/bin/nutch index /user/crawler/newcrawl_3/newindexes /user/crawler/newcrawl_3/crawldb /user/crawler/newcrawl_3/linkdb
Usage: Indexer <index> <crawldb> <linkdb> <segment> ...
De-duplicate indexes
Dedup: starting at 2011-07-25 17:53:02
Dedup: adding indexes in: /user/crawler/newcrawl_3/newindexes
DeleteDuplicates: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://crawlweb1:9000/user/crawler/newcrawl_3/newindexes
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:190)
    at org.apache.nutch.indexer.DeleteDuplicates$InputFormat.getSplits(DeleteDuplicates.java:149)
    at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810)
    at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781)
    at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730)
    at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249)
    at org.apache.nutch.indexer.DeleteDuplicates.dedup(DeleteDuplicates.java:451)
    at org.apache.nutch.indexer.DeleteDuplicates.run(DeleteDuplicates.java:519)
    at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
    at org.apache.nutch.indexer.DeleteDuplicates.main(DeleteDuplicates.java:503)

Merge indexes
IndexMerger: starting at 2011-07-25 17:53:07
IndexMerger: merging indexes to: /user/crawler/newcrawl_3/index
IndexMerger: finished at 2011-07-25 17:53:07, elapsed: 00:00:00
Some stats
CrawlDb statistics start: /user/crawler/newcrawl_3/crawldb
Statistics for CrawlDb: /user/crawler/newcrawl_3/crawldb
TOTAL urls: 514
retry 0: 514
min score: 0.0
avg score: 0.010715953
max score: 1.076
status 1 (db_unfetched): 454
status 2 (db_fetched): 52
status 3 (db_gone): 2
status 5 (db_redir_perm): 6
CrawlDb statistics: done
finish on : /home/crawler/newcrawl_3

}}}
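
The later failures all follow from the first one: SegmentMerger's `using segment data from:` line is empty and the merge dies with `No input paths specified in job`, so nothing is written under /user/crawler/newcrawl_3/segments, and LinkDb, the indexer, and DeleteDuplicates each fail on the same missing input. Below is a minimal sketch for checking the inputs and re-running the stages by hand, using the paths from the log; the command names are the stock Nutch 1.x ones, not necessarily the exact invocations inside the Crawlzilla script:

{{{
#!sh
# Sketch only: paths are taken from the log above; flags assume stock Nutch 1.x.
NUTCH=/opt/crawlzilla/nutch/bin/nutch
BASE=/user/crawler/newcrawl_3

# 1. Confirm the source segments actually exist in HDFS before merging.
hadoop fs -ls /user/crawler/crawl1/segments /user/crawler/crawl2/segments

# 2. Re-run the merge with the segments named explicitly; on success a new
#    timestamped directory appears under $BASE/segments.
$NUTCH mergesegs $BASE/segments \
    /user/crawler/crawl1/segments/20110722163308 \
    /user/crawler/crawl2/segments/20110531151117 \
    /user/crawler/crawl2/segments/20110531151312
hadoop fs -ls $BASE/segments

# 3. Only now do the downstream stages have input.
$NUTCH invertlinks $BASE/linkdb -dir $BASE/segments
$NUTCH index $BASE/newindexes $BASE/crawldb $BASE/linkdb \
    $BASE/segments/20110725175250   # substitute the merged segment's name
$NUTCH dedup $BASE/newindexes
$NUTCH merge $BASE/index $BASE/newindexes
}}}

If the listing in step 1 comes back empty, the merge had nothing to read in the first place, which matches the empty `using segment data from:` line in the log.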