/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
---|

package tw.org.nchc.demo;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import tw.org.nchc.tuple.Schema;
import tw.org.nchc.tuple.Tuple;
import tw.org.nchc.util.LocalTupleRecordWriter;

---|
| 28 | /** |
---|
| 29 | * <p> |
---|
| 30 | * Demo that packs the sample collection into records using the tuple library, |
---|
| 31 | * illustrating the use of the {@link tw.org.nchc.tuple.Tuple} class. The |
---|
| 32 | * records are stored in a local SequenceFile; this file can then be transfered |
---|
| 33 | * over to HDFS to serve as the starting point for a MapReduce operation. |
---|
| 34 | * </p> |
---|
| 35 | * |
---|
| 36 | * <p> |
---|
| 37 | * Each record is a tuple; the first field of the tuple is a String with the |
---|
| 38 | * field name "text", which consists of the raw text of the record. |
---|
| 39 | * </p> |
---|
| 40 | * |
---|
| 41 | * @see DemoPackRecords2 |
---|
| 42 | * @see DemoReadPackedRecords |
---|
| 43 | */ |
---|
| 44 | public class DemoPackRecords { |
---|
| 45 | private DemoPackRecords() { |
---|
| 46 | } |
---|
| 47 | |
---|
| 48 | // define the tuple schema for the input record |
---|
| 49 | private static final Schema RECORD_SCHEMA = new Schema(); |
---|
| 50 | static { |
---|
| 51 | RECORD_SCHEMA.addField("text", String.class, ""); |
---|
| 52 | } |
---|
| 53 | |
---|
| 54 | // instantiate a single tuple |
---|
| 55 | private static Tuple tuple = RECORD_SCHEMA.instantiate(); |
---|
| 56 | |
---|
| 57 | /** |
---|
| 58 | * Runs the demo. |
---|
| 59 | */ |
---|
| 60 | public static void main(String[] args) throws IOException { |
---|
| 61 | String infile = "../umd-hadoop-dist/sample-input/bible+shakes.nopunc"; |
---|
| 62 | String outfile = "../umd-hadoop-dist/sample-input/bible+shakes.nopunc.packed"; |
---|
| 63 | |
---|
| 64 | // create LocalTupleRecordWriter to write tuples to a local SequenceFile |
---|
| 65 | LocalTupleRecordWriter writer = new LocalTupleRecordWriter(outfile); |
---|
| 66 | |
---|
| 67 | // read in raw text records, line separated |
---|
| 68 | BufferedReader data = new BufferedReader(new InputStreamReader( |
---|
| 69 | new FileInputStream(infile))); |
---|
| 70 | |
---|
| 71 | String line; |
---|
| 72 | while ((line = data.readLine()) != null) { |
---|
| 73 | // write the record |
---|
| 74 | tuple.set(0, line); |
---|
| 75 | writer.add(tuple); |
---|
| 76 | } |
---|
| 77 | |
---|
| 78 | data.close(); |
---|
| 79 | writer.close(); |
---|
| 80 | |
---|
| 81 | System.out.println("Wrote " + writer.getRecordCount() + " records."); |
---|
| 82 | } |
---|
| 83 | } |
---|