局部聚合Combiner · 大數據

接上面的單詞統計

![](https://box.kancloud.cn/251d5fe7702b4416bb2997aaf189eed2_1109x646.png)

combiner要做的就是在每個map傳輸的時候,先把數據聚合一下,然後再傳輸;不然數據會一條一條傳輸,會很占用IO性能

**但是要注意,用combiner,不能影響你的業務邏輯**

比如對數據求平均值,用combiner就對業務結果有影響了,因為各個map局部平均值的平均值不一定等於全體數據的平均值

![](https://box.kancloud.cn/60b4d6f34539bde363c8eb13335543e5_918x551.png)

**WorldCountCombiner類**

~~~
package com.hadooprpc;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WorldCountCombiner extends Reducer<Text, IntWritable,Text,IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable v : values) {
            count += v.get();
        }
        context.write(key,new IntWritable(count));
    }
}
~~~

**WorldCountDriver類**

~~~
package com.hadooprpc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * 本類是客戶端用來指定WorldCount job程序運行時所需要的很多參數
 * 比如:指定那個類作為map階段的業務邏輯,那個類作為reduce階段的業務邏輯類
 * 指定那個組件作為數據的讀取組件,數據結果輸出組件
 * ....
* 以及其他各種所需要的參數 */

/**
 * Client-side driver that configures and runs the WorldCount MapReduce job.
 *
 * <p>It points the job at the cluster on host {@code master}, registers the
 * mapper, combiner and reducer classes, declares the key/value types, sets
 * the input format and the input/output paths, then waits for the job to
 * finish and turns the result into the process exit code.
 */
public class WorldCountDriver {

    /**
     * Builds the job, runs it, and exits the JVM.
     *
     * @param args command-line arguments; not used
     * @throws IOException            declared by the Hadoop job API calls below
     * @throws ClassNotFoundException declared by {@code Job.waitForCompletion}
     * @throws InterruptedException   declared by {@code Job.waitForCompletion}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();

        // Identify ourselves to Hadoop as "root". According to the original
        // note, the same value can be supplied through JVM options instead.
        System.setProperty("HADOOP_USER_NAME", "root");

        conf.set("fs.defaultFS", "hdfs://master:9000");
        conf.set("mapreduce.framework.name", "yarn");
        conf.set("yarn.resourcemanager.hostname", "master");

        Job job = Job.getInstance(conf);

        // Tell the framework where the program lives. Resolving the jar from
        // this class avoids hard-coding a path such as "/root/wordCount.jar",
        // which would have to be edited every time the jar moves.
        job.setJarByClass(WorldCountDriver.class);

        // Mapper and reducer classes used by this job (registered once; the
        // original repeated this pair of calls).
        job.setMapperClass(WorldCountMapper.class);
        job.setReducerClass(WorldCountReducer.class);

        // Key/value types emitted by the map phase and by the job as a whole.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Register the combiner.
        job.setCombinerClass(WorldCountCombiner.class);

        // Input component: TextInputFormat is a built-in MapReduce component
        // for reading text input.
        job.setInputFormatClass(TextInputFormat.class);

        // Directory holding the data to process.
        FileInputFormat.setInputPaths(job, new Path("/worldCount/input/"));

        // Directory the results are written to.
        FileOutputFormat.setOutputPath(job, new Path("/worldCount/output/"));

        // Deliberately not job.submit(): after a plain submit this client is
        // detached from the job and can no longer see what happens to it.
        // Instead, wait for the job and check whether it reports success.
        boolean res = job.waitForCompletion(true);

        // Exit code 0 on success, 1 otherwise.
        System.exit(res ? 0 : 1);
    }
}
~~~

其他類和上面案列一樣