source: sample/hadoop-0.17/tw/org/nchc/demo/DemoPackRecords2.java @ 20

Last change on this file was in revision 20, checked in by waue, 16 years ago:

Backing up the modified Hadoop 0.17 package here.
Development currently continues on hadoop 0.16 + hbase 1.3.

File size: 3.2 KB
/*
 * Cloud9: A MapReduce Library for Hadoop
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package tw.org.nchc.demo;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;

import org.apache.hadoop.io.Text;

import tw.org.nchc.tuple.ListWritable;
import tw.org.nchc.tuple.Schema;
import tw.org.nchc.tuple.Tuple;
import tw.org.nchc.util.LocalTupleRecordWriter;

/**
 * <p>
 * Demo that packs the sample collection into records using the tuple library,
 * illustrating the use of the {@link tw.org.nchc.tuple.Tuple} and
 * {@link tw.org.nchc.tuple.ListWritable} classes. The records are stored in
 * a local SequenceFile; this file can then be transferred over to HDFS to
 * serve as the starting point for a MapReduce operation.
 * </p>
 *
 * <p>
 * Each record is a tuple with two fields:
 * </p>
 *
 * <ul>
 *
 * <li>the first field of the tuple is an Integer with the field name "length";
 * its value is the length of the record in number of characters.</li>
 *
 * <li>the second field of the tuple is a ListWritable&lt;Text&gt; with the
 * field name "tokens"; its value is a list of tokens that comprise the text of
 * the record.</li>
 *
 * </ul>
 *
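 * <p>
 * For example, the input line "hello world" is packed as the tuple
 * <code>(length=11, tokens=[hello, world])</code>.
 * </p>
 *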
 * @see DemoPackRecords
 * @see DemoReadPackedRecords2
 */
public class DemoPackRecords2 {
  private DemoPackRecords2() {
  }

  // define the tuple schema for the input record
  private static final Schema RECORD_SCHEMA = new Schema();
  static {
    RECORD_SCHEMA.addField("length", Integer.class);
    RECORD_SCHEMA.addField("tokens", ListWritable.class, "");
  }

  // instantiate a single tuple
  private static Tuple tuple = RECORD_SCHEMA.instantiate();

  /**
   * Runs the demo.
   */
  public static void main(String[] args) throws IOException {
    String infile = "../umd-hadoop-dist/sample-input/bible+shakes.nopunc";
    String outfile = "../umd-hadoop-dist/sample-input/bible+shakes.nopunc.packed2";

    // create LocalTupleRecordWriter to write tuples to a local SequenceFile
    LocalTupleRecordWriter writer = new LocalTupleRecordWriter(outfile);

    // read in raw text records, line separated
    BufferedReader data = new BufferedReader(new InputStreamReader(
        new FileInputStream(infile)));

    String line;
    while ((line = data.readLine()) != null) {
      ListWritable<Text> tokens = new ListWritable<Text>();
      StringTokenizer itr = new StringTokenizer(line);
      while (itr.hasMoreTokens()) {
        tokens.add(new Text(itr.nextToken()));
      }

      // write the record
      tuple.set("length", line.length());
      tuple.set("tokens", tokens);
      writer.add(tuple);
    }

    data.close();
    writer.close();

    System.out.println("Wrote " + writer.getRecordCount() + " records.");
  }
}
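
For completeness, below is a minimal sketch of how the packed records could be read back from the local SequenceFile, in the spirit of the companion DemoReadPackedRecords2 (not shown on this page). The class name DemoUnpackSketch is hypothetical, and two details are assumptions rather than facts taken from this file: that LocalTupleRecordWriter pairs each Tuple with a LongWritable key, and that Tuple exposes a get(String) accessor symmetric to the set(String, Object) calls above.

package tw.org.nchc.demo;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;

import tw.org.nchc.tuple.ListWritable;
import tw.org.nchc.tuple.Schema;
import tw.org.nchc.tuple.Tuple;

public class DemoUnpackSketch {
  public static void main(String[] args) throws IOException {
    String infile = "../umd-hadoop-dist/sample-input/bible+shakes.nopunc.packed2";

    // rebuild the same schema the packer used
    Schema schema = new Schema();
    schema.addField("length", Integer.class);
    schema.addField("tokens", ListWritable.class, "");

    // open the SequenceFile on the local file system
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(infile), conf);

    LongWritable key = new LongWritable(); // assumed key type
    Tuple tuple = schema.instantiate();
    long cnt = 0;
    while (reader.next(key, tuple)) {
      if (cnt < 5) {
        // print a few records to verify the round trip;
        // get(String) is assumed, mirroring set(String, Object) in the packer
        System.out.println(tuple.get("length") + "\t" + tuple.get("tokens"));
      }
      cnt++;
    }
    reader.close();

    System.out.println("Read " + cnt + " records.");
  }
}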