| 1 | |
| 2 | = 編譯 = |
| 3 | |
| 4 | 下載解壓縮 nutch-1.2 (目前用 nutch-1.2-bin.tar.gz) |
| 5 | {{{ |
| 6 | cd $nutch-1.2/ |
| 7 | vim src/java/org/apache/nutch/analysis/NutchAnalysis.jj |
| 8 | }}} |
| 9 | |
| 10 | {{{ |
| 11 | #!text |
| 12 | | <SIGRAM: (<CJK>)+ > |
| 13 | }}} |
| 14 | |
| 15 | * 用編譯器 javacc 編譯出七個java檔 |
| 16 | |
| 17 | {{{ |
| 18 | CharStream.java NutchAnalysisTokenManager.java |
| 19 | TokenMgrError.java |
| 20 | NutchAnalysisConstants.java ParseException.java |
| 21 | NutchAnalysis.java Token.java |
| 22 | }}} |
| 23 | |
| 24 | {{{ |
| 25 | cd $nutch-1.2/src/java/org/apache/nutch/analysis |
| 26 | javacc -OUTPUT_DIRECTORY=./ika/ NutchAnalysis.jj |
| 27 | mv ./ika/* ./ ; rmdir ika; |
| 28 | |
| 29 | }}} |
| 30 | |
| 31 | * 編輯剛編出來的 !NutchAnalysis.java |
| 32 | |
| 33 | {{{ |
| 34 | vim $nutch-1.2/src/java/org/apache/nutch/analysis/NutchAnalysis.java |
| 35 | }}} |
| 36 | |
| 37 | |
| 38 | * 加入ParseException (共兩處): |
| 39 | |
| 40 | {{{ |
| 41 | #!text |
| 42 | public static Query parseQuery(....) throws IOException,ParseException |
| 43 | }}} |
| 44 | |
| 45 | /opt/nutch-1.2/src/java/org/apache/nutch/searcher/Query.java |
| 46 | |
| 47 | {{{ |
| 48 | #!java |
| 49 | (:456) |
| 50 | public static Query parse(String queryString, String queryLang, Configuration conf) |
| 51 | throws IOException { |
| 52 | Query que; |
| 53 | try { |
| 54 | que = fixup(NutchAnalysis.parseQuery( |
| 55 | queryString, AnalyzerFactory.get(conf).get(queryLang), conf), conf); |
| 56 | }catch (org.apache.nutch.analysis.ParseException e){ |
| 57 | que = new Query(); |
| 58 | } |
| 59 | return que; |
| 60 | } |
| 61 | }}} |
| 62 | |
| 63 | |
| 64 | * 下載 IKAnalyzer3.2.8.jar (2011/07/29) 解壓縮 |
| 65 | [http://code.google.com/p/ik-analyzer/downloads/list] |
| 66 | |
| 67 | nutch-1.2 用的是 lucene-core-3.0.1.jar , 因此對應 ikanalyzer 為 3.2.8 版本 |
| 68 | |
| 69 | |
| 70 | || 3.1.6GA || 兼容 2.9.1 及先前版本 ||对 solr1.3、solr1.4 提供接口实现 || |
| 71 | || 3.2.0G 及后续版本 || 兼容 Lucene2.9 及 3.0 版本 || 仅对 solr1.4 提供接口实现 |
| 72 | 不支持 Lucene2.4 及先前版本 || |
| 73 | |
| 74 | |
| 75 | IKAnalyzer3.2.8 bin.zip 內的 IKAnalyzer3.2.8.jar 解壓縮出來,分別放到以下資料夾 |
| 76 | |
| 77 | {{{ |
| 78 | cp IKAnalyzer3.2.8.jar $nutch-1.2/lib/ |
| 79 | cp IKAnalyzer3.2.8.jar $my_nutch_dir/lib/ |
| 80 | cp IKAnalyzer3.2.8.jar $my_tomcat_dir/webapps/ROOT/WEB-INF/lib |
| 81 | }}} |
| 82 | |
| 83 | * 修改 NutchDocumentAnalyzer.java 程式碼 |
| 84 | |
| 85 | {{{ |
| 86 | vim src/java/org/apache/nutch/analysis/NutchDocumentAnalyzer.java |
| 87 | }}} |
| 88 | |
| 89 | 將 |
| 90 | |
| 91 | {{{ |
| 92 | #!text |
| 93 | public TokenStream tokenStream(String fieldName, Reader reader) { |
| 94 | Analyzer analyzer; |
| 95 | if ("anchor".equals(fieldName)) |
| 96 | analyzer = ANCHOR_ANALYZER; |
| 97 | else |
| 98 | analyzer = CONTENT_ANALYZER; |
| 99 | |
| 100 | return analyzer.tokenStream(fieldName, reader); |
| 101 | } |
| 102 | }}} |
| 103 | |
| 104 | 改成 |
| 105 | |
| 106 | {{{ |
| 107 | #!text |
| 108 | public TokenStream tokenStream(String fieldName, Reader reader) { |
| 109 | Analyzer analyzer; |
| 110 | if ("anchor".equals(fieldName)) |
| 111 | analyzer = ANCHOR_ANALYZER; |
| 112 | else |
| 113 | //analyzer = CONTENT_ANALYZER; |
| 114 | analyzer = new org.wltea.analyzer.lucene.IKAnalyzer(); |
| 115 | return analyzer.tokenStream(fieldName, reader); |
| 116 | } |
| 117 | }}} |
| 118 | |
| 119 | |
| 120 | |
| 121 | * 修改 build.xml |
| 122 | |
| 123 | {{{ |
| 124 | #!text |
| 125 | <include name="IKAnalyzer*.jar"/> |
| 126 | }}} |
| 127 | |
| 128 | |
| 129 | |
| 130 | = 佈署 = |
| 131 | |
| 132 | * 重新編譯 nutch 並產生 nutch-job-1.2.job |
| 133 | |
| 134 | {{{ |
| 135 | ant |
| 136 | }}} |
| 137 | |
| 138 | * build/ 目錄裡面的 nutch-job-1.2.job 就是重編後的核心 |
| 139 | {{{ |
| 140 | ant jar; ant war; |
| 141 | }}} |
| 142 | |
| 143 | |
| 144 | * 將nutch-job-1.2.job複製到我的nutchez資料夾內取代使用 |
| 145 | |
| 146 | |
| 147 | * 最後用nutch 的 crawl 抓取網頁,搜索的結果就是按ik分過的中文詞 |
| 148 | |
| 149 | |
| 150 | == 補充選項:加入字典檔 == |
| 151 | |
| 152 | |
| 153 | |
| 154 | 1. 編輯 IKAnalyzer.cfg.xml |
| 155 | |
| 156 | <properties> |
| 157 | <comment>IK Analyzer</comment> |
| 158 | <entry key="ext_dict">/cyc.dic</entry> |
| 159 | </properties> |
| 160 | |
| 161 | |
| 162 | 2. 編輯你的字典檔 cyc.dic ,一行一個關鍵字,如: |
| 163 | |
| 164 | 數學 |
| 165 | 嘉義縣網 |
| 166 | |
| 167 | 3. 用解壓縮工具打開 /opt/crawlzilla/nutch/nutch-1.2.job,塞入 cyc.dic 與 IKAnalyzer.cfg.xml |
| 168 | |
| 169 | 4. 重新啟動crawlzilla 的所有服務 |
| 170 | |
| 171 | |
| 172 | 之後抓的索引庫就有該中文分詞了 |
| 173 | |
| 174 | 補充: |
| 175 | 如果有兩個字典檔以上的話,可以一起放到 nutch-1.2.job 的壓縮檔內, 修改 IKAnalyzer.cfg.xml |
| 176 | ,加入字典檔, 每個字典檔各用分號區隔。 |
| 177 | 如 |
| 178 | <entry key="ext_dict">/cyc.dic;/cyc2.dic</entry> |