source: sample/hadoop-0.16/tw/org/nchc/tuple/Tuple.java @ 218

Last change on this file since 218 was 21, checked in by waue, 16 years ago

hadoop 0.16

File size: 16.0 KB
RevLine 
[21]1/*
2 * Cloud9: A MapReduce Library for Hadoop
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you
5 * may not use this file except in compliance with the License. You may
6 * obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 * implied. See the License for the specific language governing
14 * permissions and limitations under the License.
15 */
16
17package tw.org.nchc.tuple;
18
19import java.io.ByteArrayInputStream;
20import java.io.ByteArrayOutputStream;
21import java.io.DataInput;
22import java.io.DataInputStream;
23import java.io.DataOutput;
24import java.io.DataOutputStream;
25import java.io.IOException;
26import java.util.HashMap;
27import java.util.Map;
28
29import org.apache.hadoop.io.Writable;
30import org.apache.hadoop.io.WritableComparable;
31
32/**
33 * <p>
34 * Class that represents a tuple in Hadoop's data type system. Tuples are
35 * instantiated from a {@link Schema}. The Tuple class implements
36 * WritableComparable, so it can be directly used as MapReduce keys and values.
37 * The natural sort order of tuples is defined by {@link #compareTo(Object)} and
38 * is based on field values, compared field by field. This class, combined with
39 * {@link ListWritable}, allows the user to define arbitrarily complex data
40 * structures.
41 * </p>
42 *
43 * <p>
44 * All fields can either be indexed via its integer position or its field name.
45 * Each field is typed, which can be determined via {@link #getFieldType(int)}.
46 * Fields can either contain an object of the specified type or a special symbol
47 * String. The method {@link #containsSymbol(int)} can be used to check if a
48 * field contains a special symbol. If the field contains a special symbol,
49 * {@link #get(int)} will return <code>null</code>. If the field does not
50 * contain a special symbol, {@link #getSymbol(int)} will return
51 * <code>null</code>.
52 * </p>
53 *
54 * <p>
55 * Here is a typical usage scenario for special symbols: say you had tuples that
56 * represented <code>count(a, b)</code>, where <code>a</code> and
57 * <code>b</code> are tokens you observe. There is often a need to compute
58 * <code>count(a, *)</code>, for example, to derive conditional
59 * probabilities. In this case, you can use a special symbol to represent the
60 * <code>*</code>, and distinguish it from the lexical token '<code>*</code>'.
61 * </p>
62 *
63 * <p>
64 * The natural sort order of the Tuple is defined by {@link #compareTo(Object)}.
65 * Tuples are sorted by field, with special symbols always appearing first
66 * within each field.
67 * </p>
68 *
69 * @see ListWritable
70 * @see Schema
71 *
72 */
73public class Tuple implements WritableComparable {
74
75  protected static final byte SYMBOL = 0;
76  protected static final byte INT = 1;
77  protected static final byte BOOLEAN = 2;
78  protected static final byte LONG = 3;
79  protected static final byte FLOAT = 4;
80  protected static final byte DOUBLE = 5;
81  protected static final byte STRING = 6;
82  protected static final byte WRITABLE = 7;
83
84  private Object[] mObjects;
85  private String[] mSymbols;
86  private String[] mFields;
87  private Class<?>[] mTypes;
88
89  private Map<String, Integer> mFieldLookup = null;
90
91  protected Tuple(Object[] objects, String[] symbols, String[] fields,
92      Class<?>[] types) {
93    mObjects = objects;
94    mSymbols = symbols;
95    mFields = fields;
96    mTypes = types;
97  }
98
99  /**
100   * Creates an empty Tuple. This constructor is needed by Hadoop's framework
101   * for deserializing Writable objects. The preferred way to instantiate
102   * tuples is through {@link Schema#instantiate(Object...)}.
103   */
104  public Tuple() {
105  }
106
107  /**
108   * Factory method for deserializing a Tuple object.
109   *
110   * @param in
111   *            raw byte source of the Tuple
112   * @return a new Tuple
113   * @throws IOException
114   */
115  public static Tuple createFrom(DataInput in) throws IOException {
116    Tuple tuple = new Tuple();
117    tuple.readFields(in);
118
119    return tuple;
120  }
121
122  /**
123   * Sets the object at a particular field (by position) in this Tuple.
124   *
125   * @param i
126   *            field position
127   * @param o
128   *            object to set at the specified field
129   */
130  public void set(int i, Object o) {
131    if (o == null) {
132      throw new TupleException(
133          "Null values are not allowed for tuple fields!");
134    }
135
136    if (!o.getClass().equals(mTypes[i])) {
137      throw new TupleException("Field value of wrong type, expected "
138          + mTypes[i] + "!");
139    }
140
141    mObjects[i] = o;
142  }
143
144  /**
145   * Sets the object at a particular field (by name) in this Tuple.
146   *
147   * @param field
148   *            field name
149   * @param o
150   *            object to set at the specified field
151   */
152  public void set(String field, Object o) {
153    if (mFieldLookup == null)
154      initLookup();
155
156    if (!mFieldLookup.containsKey(field)) {
157      throw new TupleException("Field '" + field + "' does not exist!");
158    }
159
160    set(mFieldLookup.get(field), o);
161  }
162
163  /**
164   * Sets a special symbol at a particular field (by position) in this Tuple.
165   *
166   * @param i
167   *            field position
168   * @param s
169   *            special symbol to set at specified field
170   */
171  public void setSymbol(int i, String s) {
172    if (s == null) {
173      throw new TupleException("Null is not a valid symbol!");
174    }
175
176    mObjects[i] = null;
177    mSymbols[i] = s;
178  }
179
180  /**
181   * Sets a special symbol at a particular field (by name) in this Tuple.
182   *
183   * @param field
184   *            field name
185   * @param s
186   *            special symbol to set at specified field
187   */
188  public void setSymbol(String field, String s) {
189    if (mFieldLookup == null)
190      initLookup();
191
192    if (!mFieldLookup.containsKey(field)) {
193      throw new TupleException("Field '" + field + "' does not exist!");
194    }
195
196    setSymbol(mFieldLookup.get(field), s);
197  }
198
199  /**
200   * Returns object at a particular field (by position) in this Tuple. Returns
201   * <code>null</code> if the field contains a special symbol.
202   *
203   * @param i
204   *            field position
205   * @return object at field, or <code>null</code> if the field contains a
206   *         special symbol
207   */
208  public Object get(int i) {
209    return mObjects[i];
210  }
211
212  /**
213   * Returns object at a particular field (by name) in this Tuple. Returns
214   * <code>null</code> if the field contains a special symbol.
215   *
216   * @param field
217   *            field name
218   * @return object at field, or <code>null</code> if the field contains a
219   *         special symbol
220   */
221  public Object get(String field) {
222    if (mFieldLookup == null)
223      initLookup();
224
225    if (!mFieldLookup.containsKey(field)) {
226      throw new TupleException("Field '" + field + "' does not exist!");
227    }
228
229    return get(mFieldLookup.get(field));
230  }
231
232  /**
233   * Returns special symbol at a particular field (by position). Returns
234   * <code>null</code> if the field does not contain a special symbol.
235   *
236   * @param i
237   *            field position
238   * @return special symbol at field, or <code>null</code> if the field does
239   *         not contain a special symbol.
240   */
241  public String getSymbol(int i) {
242    if (mObjects[i] != null)
243      return null;
244
245    return mSymbols[i];
246  }
247
248  /**
249   * Returns special symbol at a particular field (by name). Returns
250   * <code>null</code> if the field does not contain a special symbol.
251   *
252   * @param field
253   *            field name
254   * @return special symbol at field, or <code>null</code> if the field does
255   *         not contain a special symbol.
256   */
257  public String getSymbol(String field) {
258    if (mFieldLookup == null)
259      initLookup();
260
261    if (!mFieldLookup.containsKey(field)) {
262      throw new TupleException("Field '" + field + "' does not exist!");
263    }
264
265    return getSymbol(mFieldLookup.get(field));
266  }
267
268  /**
269   * Determines if a particular field (by position) contains a special symbol.
270   *
271   * @param i
272   *            field position
273   * @return <code>true</code> if the field contains a special symbol, or
274   *         <code>false</code> otherwise
275   */
276  public boolean containsSymbol(int i) {
277    return mObjects[i] == null;
278  }
279
280  /**
281   * Determines if a particular field (by name) contains a special symbol.
282   *
283   * @param field
284   *            field name
285   * @return <code>true</code> if the field contains a special symbol, or
286   *         <code>false</code> otherwise
287   */
288  public boolean containsSymbol(String field) {
289    if (mFieldLookup == null)
290      initLookup();
291
292    if (!mFieldLookup.containsKey(field)) {
293      throw new TupleException("Field '" + field + "' does not exist!");
294    }
295
296    return containsSymbol(mFieldLookup.get(field));
297  }
298
299  /**
300   * Returns the type of a particular field (by position).
301   *
302   * @param i
303   *            field position
304   * @return type of the field
305   */
306  public Class<?> getFieldType(int i) {
307    return mTypes[i];
308  }
309
310  /**
311   * Returns the type of a particular field (by name).
312   *
313   * @param field
314   *            field name
315   * @return type of the field
316   */
317  public Class<?> getFieldType(String field) {
318    if (mFieldLookup == null)
319      initLookup();
320
321    if (!mFieldLookup.containsKey(field)) {
322      throw new TupleException("Field '" + field + "' does not exist!");
323    }
324
325    return getFieldType(mFieldLookup.get(field));
326  }
327
328  public int getFieldCount() {
329    return mFields.length;
330  }
331
332  /**
333   * Lazily construct the lookup table for this schema. Used to accelerate
334   * name-based lookups of schema information.
335   */
336  private void initLookup() {
337    mFieldLookup = new HashMap<String, Integer>();
338    for (int i = 0; i < mFields.length; ++i) {
339      mFieldLookup.put(mFields[i], new Integer(i));
340    }
341  }
342
343  /**
344   * Deserializes the Tuple.
345   *
346   * @param in
347   *            source for raw byte representation
348   */
349  public void readFields(DataInput in) throws IOException {
350    int numFields = in.readInt();
351
352    mObjects = new Object[numFields];
353    mSymbols = new String[numFields];
354    mFields = new String[numFields];
355    mTypes = new Class[numFields];
356
357    for (int i = 0; i < numFields; i++) {
358      mFields[i] = in.readUTF();
359    }
360
361    for (int i = 0; i < numFields; i++) {
362      byte type = in.readByte();
363
364      if (type == SYMBOL) {
365        String className = in.readUTF();
366        try {
367          mTypes[i] = Class.forName(className);
368        } catch (Exception e) {
369          e.printStackTrace();
370        }
371        mObjects[i] = null;
372        mSymbols[i] = in.readUTF();
373      } else if (type == INT) {
374        mTypes[i] = Integer.class;
375        mObjects[i] = in.readInt();
376      } else if (type == BOOLEAN) {
377        mTypes[i] = Boolean.class;
378        mObjects[i] = in.readBoolean();
379      } else if (type == LONG) {
380        mTypes[i] = Long.class;
381        mObjects[i] = in.readLong();
382      } else if (type == FLOAT) {
383        mTypes[i] = Float.class;
384        mObjects[i] = in.readFloat();
385      } else if (type == DOUBLE) {
386        mTypes[i] = Double.class;
387        mObjects[i] = in.readDouble();
388      } else if (type == STRING) {
389        mTypes[i] = String.class;
390        mObjects[i] = in.readUTF();
391      } else {
392        try {
393          String className = in.readUTF();
394          mTypes[i] = Class.forName(className);
395
396          int sz = in.readInt();
397          byte[] bytes = new byte[sz];
398          in.readFully(bytes);
399
400          Writable obj = (Writable) mTypes[i].newInstance();
401          obj.readFields(new DataInputStream(
402              new ByteArrayInputStream(bytes)));
403          mObjects[i] = obj;
404        } catch (Exception e) {
405          e.printStackTrace();
406        }
407      }
408    }
409  }
410
411  /**
412   * Serializes this Tuple.
413   *
414   * @param out
415   *            where to write the raw byte representation
416   */
417  public void write(DataOutput out) throws IOException {
418    out.writeInt(mFields.length);
419    for (int i = 0; i < mFields.length; i++) {
420      out.writeUTF(mFields[i]);
421    }
422
423    for (int i = 0; i < mFields.length; i++) {
424      if (mObjects[i] == null && mSymbols[i] == null) {
425        throw new TupleException("Cannot serialize null fields!");
426      }
427
428      if (containsSymbol(i)) {
429        out.writeByte(SYMBOL);
430        out.writeUTF(mTypes[i].getCanonicalName());
431        out.writeUTF(mSymbols[i]);
432      } else if (mTypes[i] == Integer.class) {
433        out.writeByte(INT);
434        out.writeInt((Integer) mObjects[i]);
435      } else if (mTypes[i] == Boolean.class) {
436        out.writeByte(BOOLEAN);
437        out.writeBoolean((Boolean) mObjects[i]);
438      } else if (mTypes[i] == Long.class) {
439        out.writeByte(LONG);
440        out.writeLong((Long) mObjects[i]);
441      } else if (mTypes[i] == Float.class) {
442        out.writeByte(FLOAT);
443        out.writeFloat((Float) mObjects[i]);
444      } else if (mTypes[i] == Double.class) {
445        out.writeByte(DOUBLE);
446        out.writeDouble((Double) mObjects[i]);
447      } else if (mTypes[i] == String.class) {
448        out.writeByte(STRING);
449        out.writeUTF(mObjects[i].toString());
450      } else {
451        out.writeByte(WRITABLE);
452
453        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
454        DataOutputStream dataOut = new DataOutputStream(bytesOut);
455
456        out.writeUTF(mTypes[i].getCanonicalName());
457        ((Writable) mObjects[i]).write(dataOut);
458        out.writeInt(bytesOut.size());
459        out.write(bytesOut.toByteArray());
460      }
461    }
462  }
463
464  /**
465   * Generates human-readable String representation of this Tuple.
466   *
467   * @return human-readable String representation of this Tuple
468   */
469  public String toString() {
470    StringBuffer sb = new StringBuffer();
471
472    for (int i = 0; i < mFields.length; i++) {
473      if (i != 0)
474        sb.append(", ");
475      if (mSymbols[i] != null) {
476        sb.append(mSymbols[i]);
477      } else {
478        sb.append(mObjects[i]);
479      }
480    }
481
482    return "(" + sb.toString() + ")";
483  }
484
485  /**
486   * <p>
487   * Defines a natural sort order for the Tuple class. Following standard
488   * convention, this method returns a value less than zero, a value greater
489   * than zero, or zero if this Tuple should be sorted before, sorted after,
490   * or is equal to <code>obj</code>. The sort order is defined as follows:
491   * </p>
492   *
493   * <ul>
494   * <li>Each field in the Tuple is compared sequentially from first to last.</li>
495   * <li>Within each field, all special symbols are sorted before actual
496   * field tokens (i.e., the actual String, Integer, or whatever the field may
497   * contain).</li>
498   * <li>The special symbols are sorted lexicographically (being Strings).</li>
499   * <li>The field tokens are sorted by their natural order.</li>
500   * <li>If the field contents are identical (both contain same special
501   * symbol or field token), the next field in the tuple is considered.</li>
502   * <li>Two tuples are considered equal if all their fields are identical.</li>
503   * </ul>
504   *
505   * @return a value less than zero, a value greater than zero, or zero if
506   *         this Tuple should be sorted before, sorted after, or is equal to
507   *         <code>obj</code>.
508   */
509  public int compareTo(Object obj) {
510    Tuple that = (Tuple) obj;
511
512    // iterate through the fields
513    for (int i = 0; i < this.getFieldCount(); i++) {
514      // if both contain special symbol, then sort special symbols
515      if (this.containsSymbol(i) && that.containsSymbol(i)) {
516        String thisSymbol = this.getSymbol(i);
517        String thatSymbol = that.getSymbol(i);
518
519        // special symbols identical; move to next field
520        if (!thisSymbol.equals(thatSymbol)) {
521          return thisSymbol.compareTo(thatSymbol);
522        }
523      } else {
524        // special symbols always come first
525        if (this.containsSymbol(i))
526          return -1;
527
528        if (that.containsSymbol(i))
529          return 1;
530
531        @SuppressWarnings("unchecked")
532        Comparable<Object> thisField = (Comparable<Object>) this.get(i);
533
534        @SuppressWarnings("unchecked")
535        Comparable<Object> thatField = (Comparable<Object>) that.get(i);
536
537        // if the field tokens are identical, move to next field
538        if (!thisField.equals(thatField)) {
539          return thisField.compareTo(thatField);
540        }
541      }
542    }
543
544    return 0;
545  }
546
547  /**
548   * Returns a hash code for this Tuple.
549   *
550   * @return hash code for this Tuple
551   */
552  public int hashCode() {
553    int hash = 0;
554
555    for (int i = 0; i < mObjects.length; i++) {
556      if (mObjects[i] != null) {
557        hash += mObjects[i].hashCode();
558      } else {
559        hash += mSymbols[i].hashCode();
560      }
561    }
562
563    return hash;
564  }
565
566}
Note: See TracBrowser for help on using the repository browser.