package ex2; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class NGrams { /* * Create N-Gram as keys */ public static class Map extends Mapper { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); private static final int n = 5; public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String[] words = line.trim().split("\\s+"); for (int i = 0; i < words.length - n + 1; i++) { word.set(ngram(words, i, i + n)); context.write(word, one); } } /** * Generate ngram * * @param words * @param start * @param end * @return */ public static String ngram(String[] words, int start, int end) { StringBuilder sb = new StringBuilder(); for (int i = start; i < end; i++) sb.append((i > start ? " " : "") + words[i]); return sb.toString(); } } /* * Sum the values of each n-gram */ public static class Reduce extends Reducer { public void reduce(Text key, Iterable value, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable intWritable : value) { sum += intWritable.get(); } context.write(key, new IntWritable(sum)); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "ngrams"); job.setJarByClass(NGrams.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); long begin = System.currentTimeMillis(); job.waitForCompletion(true); long end = System.currentTimeMillis(); long second = (end - begin) / 1000; System.err.println(job.getJobName() + " takes " + second + " seconds"); } }