/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package tw.org.nchc.demo;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import tw.org.nchc.tuple.Schema;
import tw.org.nchc.tuple.Tuple;
import tw.org.nchc.util.LocalTupleRecordWriter;

/**
 * <p>
 * Demo that packs the sample collection into records using the tuple library,
 * illustrating the use of the {@link tw.org.nchc.tuple.Tuple} class. The
 * records are stored in a local SequenceFile; this file can then be transferred
 * over to HDFS to serve as the starting point for a MapReduce operation.
 * </p>
 *
 * <p>
 * Each record is a tuple; the first field of the tuple is a String with the
 * field name "text", which consists of the raw text of the record.
 * </p>
 *
 * @see DemoPackRecords2
 * @see DemoReadPackedRecords
 */
44 | public class DemoPackRecords { |
---|
45 | private DemoPackRecords() { |
---|
46 | } |
---|
47 | |
---|
48 | // define the tuple schema for the input record |
---|
49 | private static final Schema RECORD_SCHEMA = new Schema(); |
---|
50 | static { |
---|
51 | RECORD_SCHEMA.addField("text", String.class, ""); |
---|
52 | } |
---|
53 | |
---|
54 | // instantiate a single tuple |
---|
55 | private static Tuple tuple = RECORD_SCHEMA.instantiate(); |
---|
56 | |
---|
57 | /** |
---|
58 | * Runs the demo. |
---|
59 | */ |
---|
60 | public static void main(String[] args) throws IOException { |
---|
61 | String infile = "../umd-hadoop-dist/sample-input/bible+shakes.nopunc"; |
---|
62 | String outfile = "../umd-hadoop-dist/sample-input/bible+shakes.nopunc.packed"; |
---|
63 | |
---|
64 | // create LocalTupleRecordWriter to write tuples to a local SequenceFile |
---|
65 | LocalTupleRecordWriter writer = new LocalTupleRecordWriter(outfile); |
---|
66 | |
---|
67 | // read in raw text records, line separated |
---|
68 | BufferedReader data = new BufferedReader(new InputStreamReader( |
---|
69 | new FileInputStream(infile))); |
---|
70 | |
---|
71 | String line; |
---|
72 | while ((line = data.readLine()) != null) { |
---|
73 | // write the record |
---|
74 | tuple.set(0, line); |
---|
75 | writer.add(tuple); |
---|
76 | } |
---|
77 | |
---|
78 | data.close(); |
---|
79 | writer.close(); |
---|
80 | |
---|
81 | System.out.println("Wrote " + writer.getRecordCount() + " records."); |
---|
82 | } |
---|
83 | } |
---|