Hadoop Reduces Side Connection Using Datajoin

Question

Hadoop Reduces Side Connection Using Datajoin

I am using the following code to do a side reduction join

/*
 * HadoopMapper.java
 *
 * Created on Apr 8, 2012, 5:39:51 PM
 */


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
// import org.apache.commons.logging.Log;
// import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.contrib.utils.join.*; 

/**
 *
 * @author 
 */
public class DataJoin extends Configured implements Tool 
    {
        public static class MapClass extends DataJoinMapperBase 
            {
                protected Text generateInputTag(String inputFile) 
                    {
                        String datasource = inputFile.split("-")[0];
                        return new Text(datasource);
                    }
            protected Text generateGroupKey(TaggedMapOutput aRecord) 
                {
                    String line = ((Text) aRecord.getData()).toString();
                    String[] tokens = line.split(",");
                    String groupKey = tokens[0];
                    return new Text(groupKey);
                }
            protected TaggedMapOutput generateTaggedMapOutput(Object value) 
                {
                    TaggedWritable retv = new TaggedWritable((Text) value);
                    retv.setTag(this.inputTag);
                    return retv;
                }
            }
        public static class Reduce extends DataJoinReducerBase 
            {
                protected TaggedMapOutput combine(Object[] tags, Object[] values) 
                    {
                        if (tags.length < 2) return null;
                        String joinedStr = "";
                        for (int i=0; i<values.length; i++) 
                        {
                            if (i > 0) joinedStr += ",";
                            TaggedWritable tw = (TaggedWritable) values[i];
                            String line = ((Text) tw.getData()).toString();
                            String[] tokens = line.split(",", 2);
                            joinedStr += tokens[1];
                        }
                        TaggedWritable retv = new TaggedWritable(new Text(joinedStr));
                        retv.setTag((Text) tags[0]);
                        return retv;
                    }
            }
        public static class TaggedWritable extends TaggedMapOutput 
            {
                private Writable data;
                public TaggedWritable(Writable data) 
                    {
                        this.tag = new Text("");
                        this.data = data;
                    }

                public Writable getData() 
                    {
                        return data;
                    }
                public void write(DataOutput out) throws IOException
                    {
                        this.tag.write(out);
                        this.data.write(out);
                    }
                public void readFields(DataInput in) throws IOException 
                    {
                        this.tag.readFields(in);
                        this.data.readFields(in);
                    }
            }
        public int run(String[] args) throws Exception 
            {


                                Configuration conf = getConf();
                JobConf job = new JobConf(conf, DataJoin.class);
                                String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
                                if (otherArgs.length != 2) 
                                {
                                  System.err.println("Usage: wordcount <in> <out>");
                                  System.exit(2);
                                }

                Path in = new Path(args[0]);
                Path out = new Path(args[1]);
                FileInputFormat.setInputPaths(job, in);
                FileOutputFormat.setOutputPath(job, out);
                job.setJobName("DataJoin");
                job.setMapperClass(MapClass.class);
                job.setReducerClass(Reduce.class);
                job.setInputFormat(TextInputFormat.class);
                job.setOutputFormat(TextOutputFormat.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(TaggedWritable.class);
                job.set("mapred.textoutputformat.separator", ",");
                JobClient.runJob(job);
                return 0;
            }
        public static void main(String[] args) throws Exception 
            {
                int res = ToolRunner.run(new Configuration(),
                new DataJoin(),
                args);
                System.exit(res);
            }
    }

I can compile my code. When I run in hadoop I get the following error with a combiner

12/04/17 19:59:29 INFO mapred.JobClient:  map 100% reduce 27%
12/04/17 19:59:38 INFO mapred.JobClient:  map 100% reduce 30%
12/04/17 19:59:47 INFO mapred.JobClient:  map 100% reduce 33%
12/04/17 20:00:23 INFO mapred.JobClient: Task Id : attempt_201204061316_0018_r_000000_2, Status : FAILED
java.lang.RuntimeException: java.lang.NoSuchMethodException: DataJoin$TaggedWritable.<init>()
        at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:115)
        at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:62)
        at org.apache.hadoop.io.serializer.WritableSerialization$WritableDeserializer.deserialize(WritableSerialization.java:40)
        at org.apache.hadoop.mapred.Task$ValuesIterator.readNextValue(Task.java:1136)
        at org.apache.hadoop.mapred.Task$ValuesIterator.next(Task.java:1076)
        at org.apache.hadoop.mapred.ReduceTask$ReduceValuesIterator.moveToNext(ReduceTask.java:246)
        at org.apache.hadoop.mapred.ReduceTask$ReduceValuesIterator.next(ReduceTask.java:242)
        at org.apache.hadoop.contrib.utils.join.DataJoinReducerBase.regroup(DataJoinReducerBase.java:106)

The command I use to run hasoop is / hadoop / core / bin / hasoop jar / export / scratch / lopez / Join / DataJoin.jar DataJoin / export / scratch / user / lopez / Register / export / scratch / user / lopez / Join_Output

and the DataJoin.jar file contains the DataJoin $ TaggedWritable packed into it

I checked some forums and found out that the error may occur due to a non-static class. My program does not have a non-static class!

Can someone please help me

, , , . , .

INFO mapred.FileInputFormat: : 2

     Status : FAILED
    java.lang.ArrayIndexOutOfBoundsException: 1
    at DataJoin$Reduce.combine(DataJoin.java:69)
    at org.apache.hadoop.contrib.utils.join.DataJoinReducerBase.joinAndCollect(DataJoinReducerBase.java:205)
    at org.apache.hadoop.contrib.utils.join.DataJoinReducerBase.joinAndCollect(DataJoinReducerBase.java:214)
    at org.apache.hadoop.contrib.utils.join.DataJoinReducerBase.joinAndCollect(DataJoinReducerBase.java:214)
    at org.apache.hadoop.contrib.utils.join.DataJoinReducerBase.joinAndCollect(DataJoinReducerBase.java:181)
    at org.apache.hadoop.contrib.utils.join.DataJoinReducerBase.reduce(DataJoinReducerBase.java:135)
    at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:468)



{


    Configuration conf = getConf();
    JobConf job = new JobConf(conf, DataJoin.class);
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) 
    {
      System.err.println("Usage: wordcount <in> <in1> <out>");
      System.exit(2);
    }

    Path in = new Path(args[0]);
    Path in1 = new Path(args[1]);
    Path out = new Path(args[2]);
    FileInputFormat.setInputPaths(job,in,in1);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("DataJoin");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TaggedWritable.class);
    job.set("mapred.textoutputformat.separator", ",");
    JobClient.runJob(job);
    return 0;

}

+3

hadoop

LGG 18 . '12 1:17

2

, TaggedWritable :

public TaggedWritable() {
    this(new Text(""));
}

0

Aloke 26 . '13 13:52

Chris White · Accepted Answer · 2012-04-18T03:03:26+0000

TaggedWritable (Hadoop ( ).

, readFields data.readFields(in) , data.

, GenericWritable ( , , ).

, :

public static class TaggedWritable extends TaggedMapOutput {
    private Writable data;

    public TaggedWritable() {
        this.tag = new Text();
    }

    public TaggedWritable(Writable data) {
        this.tag = new Text("");
        this.data = data;
    }

    public Writable getData() {
        return data;
    }

    public void setData(Writable data) {
        this.data = data;
    }

    public void write(DataOutput out) throws IOException {
        this.tag.write(out);
        out.writeUTF(this.data.getClass().getName());
        this.data.write(out);
    }

    public void readFields(DataInput in) throws IOException {
        this.tag.readFields(in);
        String dataClz = in.readUTF();
        if (this.data == null
                || !this.data.getClass().getName().equals(dataClz)) {
            this.data = (Writable) ReflectionUtils.newInstance(
                    Class.forName(dataClz), null);
        }
        this.data.readFields(in);
    }
}

Hadoop Reduces Side Connection Using Datajoin

More articles: