案例-倒排索引 · 大數據

[TOC]

# 分析

![](https://box.kancloud.cn/ef92484a827af35468ead934d9977907_301x396.png)

# 準備數據

~~~
hello--a.txt 1
hello--b.txt 2
hello--c.txt 1
allen--b.txt 2
jerry--a.txt 2
allen--a.txt 1
jerry--c.txt 2
~~~

# 代碼

~~~
package com.index;

// NOTE(review): FlowSumSort does not appear to be used by this class - confirm before removing.
import com.folwsum.FlowSumSort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * Second step of the inverted-index job: turns "word--file count" records
 * into one line per word that lists every "file--count" entry for it.
 */
public class IndexStepTwo {

    /**
     * Re-keys each "word--file count" record by word and emits "file--count" as the value.
     */
    public static class IndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {

        private final Text k = new Text();
        private final Text v = new Text();

        /**
         * @param key     byte offset of the line in the input file (unused)
         * @param value   one input line, e.g. "hello--a.txt 1"
         * @param context used to emit (word, "file--count")
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString().trim();

            // Split on any run of whitespace: the step-one job writes its output through
            // TextOutputFormat, which separates key and value with a TAB by default, while
            // hand-prepared input may use a space. Splitting on " " alone fails on the TAB form.
            String[] fields = line.split("\\s+");
            if (fields.length < 2) {
                // Blank or malformed line: nothing to index.
                return;
            }

            String wordFile = fields[0];
            String count = fields[1];

            String[] split = wordFile.split("--");
            if (split.length < 2) {
                // Key is not in "word--file" form; skip it rather than fail the task.
                return;
            }

            String word = split[0];
            String file = split[1];

            k.set(word);
            v.set(file + "--" + count);
            // e.g. key "hello", value "a.txt--1"
            context.write(k, v);
        }
    }

    /**
     * Joins all "file--count" entries of one word into a single space-separated value.
     * It is also registered as the combiner, so it must accept its own output as input.
     */
    public static class IndexStepTwoReduce extends Reducer<Text, Text, Text, Text> {

        private final Text v = new Text();

        /**
         * @param key     the word
         * @param values  "file--count" entries (or already-joined groups coming from the combiner)
         * @param context used to emit (word, joined entries)
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            for (Text value : values) {
                // Trim so that combiner output fed back in does not produce doubled spaces.
                String entry = value.toString().trim();
                if (entry.isEmpty()) {
                    continue;
                }
                if (joined.length() > 0) {
                    joined.append(' ');
                }
                joined.append(entry);
            }
            v.set(joined.toString());
            context.write(key, v);
        }
    }

    /**
     * Configures and runs the job; exits with 0 on success and 1 on failure.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Pass conf so the job and the FileSystem below share the same configuration.
        Job job = Job.getInstance(conf);

        job.setJarByClass(IndexStepTwo.class);

        // Mapper and reducer classes used by this job
        job.setMapperClass(IndexStepTwoMapper.class);
        job.setReducerClass(IndexStepTwoReduce.class);

        // Key/value types emitted by the mapper and by the job
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Optional combiner
        job.setCombinerClass(IndexStepTwoReduce.class);

        // Input/output format components; TextInputFormat is the built-in reader for text files
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Directory that holds the data to process
        FileInputFormat.setInputPaths(job, new Path("/Users/jdxia/Desktop/website/hdfs/index/input/"));

        // Delete the output directory if it already exists
        Path out = new Path("/Users/jdxia/Desktop/website/hdfs/index/output/");
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(out)) {
            fileSystem.delete(out, true);
        }

        // Directory the results are written to
        FileOutputFormat.setOutputPath(job, out);

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
~~~

# 結果展示

里面的crc是個校驗文件

~~~
allen a.txt--1 b.txt--2
hello c.txt--1 b.txt--2 a.txt--1
jerry c.txt--2 a.txt--2
~~~

# 代碼前提

如果準備的數據是這樣

![](https://box.kancloud.cn/6f75bbd78ca78bd88029d4e9b9ef0eda_169x174.png)

那就要把他先變成這樣

~~~
hello--a.txt 1
hello--b.txt 2
hello--c.txt 1
allen--b.txt 2
jerry--a.txt 2
allen--a.txt 1
jerry--c.txt 2
~~~

代碼

~~~
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
// FileSplit must come from the new (mapreduce) API to match Mapper.Context
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * First step of the inverted-index job: counts how often each word occurs in each file
 * and writes "word--file" as the key with the count as the value.
 */
public class IndexStepOne {

    /**
     * Emits ("word--filename", 1) for every word of every input line.
     */
    public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final Text k = new Text();
        private final IntWritable v = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // Split on whitespace runs so repeated spaces do not yield empty "words"
            String[] words = line.split("\\s+");

            // Name of the file this split belongs to
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            String filename = fileSplit.getPath().getName();

            // Output key: word--filename, value: 1
            for (String word : words) {
                if (word.isEmpty()) {
                    // A leading separator produces one empty token; skip it.
                    continue;
                }
                k.set(word + "--" + filename);
                context.write(k, v);
            }
        }
    }

    /**
     * Sums the counts of each "word--filename" key.
     */
    public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private final IntWritable v = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }
            v.set(count);
            context.write(key, v);
        }
    }

    /**
     * Configures and runs the job; exits with 0 on success and 1 on failure.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(IndexStepOne.class);

        // Mapper and reducer classes used by this job
        job.setMapperClass(IndexStepOneMapper.class);
        job.setReducerClass(IndexStepOneReducer.class);

        // Key/value types emitted by the mapper and by the job
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Optional combiner (summing is associative, so the reducer can be reused)
        job.setCombinerClass(IndexStepOneReducer.class);

        // Input/output format components; TextInputFormat is the built-in reader for text files
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Directory that holds the data to process
        FileInputFormat.setInputPaths(job, new Path("D:/index/input"));

        // Directory the results are written to
        FileOutputFormat.setOutputPath(job, new Path("D:/index/output-1"));

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
~~~