| | 1 | {{{ |
| | 2 | #!text |
| | 3 | Merging 18 segments to /user/crawler/newcrawl_3/segments/20110725175250 |
| | 4 | SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/content |
| | 5 | SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/crawl_fetch |
| | 6 | SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/crawl_generate |
| | 7 | SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/crawl_parse |
| | 8 | SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/parse_data |
| | 9 | SegmentMerger: adding /user/crawler/crawl1/segments/20110722163308/parse_text |
| | 10 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/content |
| | 11 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/crawl_fetch |
| | 12 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/crawl_generate |
| | 13 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/crawl_parse |
| | 14 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/parse_data |
| | 15 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151117/parse_text |
| | 16 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/content |
| | 17 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/crawl_fetch |
| | 18 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/crawl_generate |
| | 19 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/crawl_parse |
| | 20 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/parse_data |
| | 21 | SegmentMerger: adding /user/crawler/crawl2/segments/20110531151312/parse_text |
| | 22 | SegmentMerger: using segment data from: |
| | 23 | Exception in thread "main" java.io.IOException: No input paths specified in job |
| | 24 | at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:152) |
| | 25 | at org.apache.hadoop.mapred.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:44) |
| | 26 | at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:201) |
| | 27 | at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810) |
| | 28 | at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781) |
| | 29 | at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730) |
| | 30 | at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249) |
| | 31 | at org.apache.nutch.segment.SegmentMerger.merge(SegmentMerger.java:638) |
| | 32 | at org.apache.nutch.segment.SegmentMerger.main(SegmentMerger.java:683) |
| | 33 | Update segments |
| | 34 | LinkDb: starting at 2011-07-25 17:52:55 |
| | 35 | LinkDb: linkdb: /user/crawler/newcrawl_3/linkdb |
| | 36 | LinkDb: URL normalize: true |
| | 37 | LinkDb: URL filter: true |
| | 38 | LinkDb: java.io.IOException: No input paths specified in job |
| | 39 | at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:152) |
| | 40 | at org.apache.hadoop.mapred.SequenceFileInputFormat.listStatus(SequenceFileInputFormat.java:44) |
| | 41 | at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:201) |
| | 42 | at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810) |
| | 43 | at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781) |
| | 44 | at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730) |
| | 45 | at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249) |
| | 46 | at org.apache.nutch.crawl.LinkDb.invert(LinkDb.java:175) |
| | 47 | at org.apache.nutch.crawl.LinkDb.run(LinkDb.java:292) |
| | 48 | at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65) |
| | 49 | at org.apache.nutch.crawl.LinkDb.main(LinkDb.java:255) |
| | 50 | |
| | 51 | Index segments |
| | 52 | ls: Cannot access /user/crawler/newcrawl_3/segments/*: No such file or directory. |
| | 53 | [check] /opt/crawlzilla/nutch/bin/nutch index /user/crawler/newcrawl_3/newindexes /user/crawler/newcrawl_3/crawldb /user/crawler/newcrawl_3/linkdb |
| | 54 | Usage: Indexer <index> <crawldb> <linkdb> <segment> ... |
| | 55 | De-duplicate indexes |
| | 56 | Dedup: starting at 2011-07-25 17:53:02 |
| | 57 | Dedup: adding indexes in: /user/crawler/newcrawl_3/newindexes |
| | 58 | DeleteDuplicates: org.apache.hadoop.mapred.InvalidInputException: Input path does not exist: hdfs://crawlweb1:9000/user/crawler/newcrawl_3/newindexes |
| | 59 | at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:190) |
| | 60 | at org.apache.nutch.indexer.DeleteDuplicates$InputFormat.getSplits(DeleteDuplicates.java:149) |
| | 61 | at org.apache.hadoop.mapred.JobClient.writeOldSplits(JobClient.java:810) |
| | 62 | at org.apache.hadoop.mapred.JobClient.submitJobInternal(JobClient.java:781) |
| | 63 | at org.apache.hadoop.mapred.JobClient.submitJob(JobClient.java:730) |
| | 64 | at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1249) |
| | 65 | at org.apache.nutch.indexer.DeleteDuplicates.dedup(DeleteDuplicates.java:451) |
| | 66 | at org.apache.nutch.indexer.DeleteDuplicates.run(DeleteDuplicates.java:519) |
| | 67 | at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65) |
| | 68 | at org.apache.nutch.indexer.DeleteDuplicates.main(DeleteDuplicates.java:503) |
| | 69 | |
| | 70 | Merge indexes |
| | 71 | IndexMerger: starting at 2011-07-25 17:53:07 |
| | 72 | IndexMerger: merging indexes to: /user/crawler/newcrawl_3/index |
| | 73 | IndexMerger: finished at 2011-07-25 17:53:07, elapsed: 00:00:00 |
| | 74 | Some stats |
| | 75 | CrawlDb statistics start: /user/crawler/newcrawl_3/crawldb |
| | 76 | Statistics for CrawlDb: /user/crawler/newcrawl_3/crawldb |
| | 77 | TOTAL urls: 514 |
| | 78 | retry 0: 514 |
| | 79 | min score: 0.0 |
| | 80 | avg score: 0.010715953 |
| | 81 | max score: 1.076 |
| | 82 | status 1 (db_unfetched): 454 |
| | 83 | status 2 (db_fetched): 52 |
| | 84 | status 3 (db_gone): 2 |
| | 85 | status 5 (db_redir_perm): 6 |
| | 86 | CrawlDb statistics: done |
| | 87 | finish on : /home/crawler/newcrawl_3 |
| | 88 | |
| | 89 | }}} |