1. Analysis

This applies when one of the tables in the join is small. The small table can be distributed to every map node, so each map node joins it locally against the portion of the large table it reads and emits the final result directly. This greatly increases the parallelism of the join and speeds up processing.

2. Implementation

(1) First, add the cache file in the driver:

```java
package com.kgc.mapreduce.driver;

import com.kgc.mapreduce.entry.CustomerOrders;
import com.kgc.mapreduce.mapper.MapJoinMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.net.URI;

public class MapJoinDriver {
    public static void main(String[] args) throws Exception {
        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar to load
        job.setJarByClass(MapJoinDriver.class);

        // 3 Attach the mapper
        job.setMapperClass(MapJoinMapper.class);

        // 4 Set the final output key/value types
        job.setOutputKeyClass(CustomerOrders.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("file:///d:\\input"));
        FileOutputFormat.setOutputPath(job, new Path("file:///d:\\output"));

        // 6 Add the cached (small-table) file
        job.addCacheFile(new URI("file:///d:/customerinput/customers.csv"));

        // 7 A map-side join needs no reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```

(2) Then read the cached file's data in the mapper:

```java
package com.kgc.mapreduce.mapper;

import com.kgc.mapreduce.entry.CustomerOrders;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.*;
import java.net.URI;
import java.util.HashMap;

public class MapJoinMapper extends Mapper<LongWritable, Text, CustomerOrders, NullWritable> {
    HashMap<String, String> customerMap = new HashMap<>();
    CustomerOrders customerOrders = new CustomerOrders();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Load the cached customers file into memory once per map task
        URI[] cacheFiles = context.getCacheFiles();
        if (null != cacheFiles && cacheFiles.length > 0) {
            String filename = cacheFiles[0].getPath();
            BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(filename), "UTF-8"));
            String line;
            while (StringUtils.isNotEmpty(line = bufferedReader.readLine())) {
                String[] split = line.split(",");
                customerMap.put(split[0], split[1]); // customerId -> customerName
            }
            bufferedReader.close();
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1 Read one line
        String line = value.toString();

        // 2 Split it into fields
        String[] fields = line.split(",");

        // 3 Set the order fields
        customerOrders.setOrderId(fields[0]);
        customerOrders.setCustomerId(fields[2]);
        customerOrders.setOrderStatus(fields[3]);

        // 4 Look up the customer name in the cached small table
        customerOrders.setCustomerName(customerMap.get(fields[2]));

        // 5 Write out the joined record
        context.write(customerOrders, NullWritable.get());
    }
}
```
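The core of the map-side join is just the in-memory lookup the mapper performs. The following is a minimal, Hadoop-free sketch of that lookup; the column layout (orderId, orderDate, customerId, orderStatus) and the sample customer names are illustrative assumptions, not part of the original code:

```java
import java.util.HashMap;
import java.util.Map;

// Self-contained sketch of the map-side join lookup (no Hadoop required).
public class MapJoinSketch {

    // Enrich one order line with the customer name from the cached small table.
    static String join(Map<String, String> customerMap, String orderLine) {
        String[] fields = orderLine.split(",");
        // fields[0] = orderId, fields[2] = customerId, fields[3] = orderStatus
        return fields[0] + "," + customerMap.get(fields[2]) + "," + fields[3];
    }

    public static void main(String[] args) {
        // The small (customers) table, fully loaded into memory as in setup()
        Map<String, String> customerMap = new HashMap<>();
        customerMap.put("1", "Alice"); // hypothetical sample rows
        customerMap.put("2", "Bob");

        System.out.println(join(customerMap, "100,2023-01-01,2,COMPLETE"));
        // prints: 100,Bob,COMPLETE
    }
}
```

Because the whole small table sits in a `HashMap`, each joined record costs one O(1) lookup and no shuffle, which is why the job above can run with zero reduce tasks.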