[TOC]

# Requirement

Merge multiple small files into a single SequenceFile. The SequenceFile stores the small files, using the file path + name as the key and the file contents as the value.

Input data

~~~
one.txt
two.txt
three.txt
~~~

Output data

~~~
part-r-00000
~~~

# Analysis

Small-file optimization generally comes down to one of the following approaches:

1. At data-collection time, combine small files or small batches of data into large files before uploading to HDFS
2. Before business processing, run a MapReduce program on HDFS to merge the small files
3. During MapReduce processing, use CombineTextInputFormat to improve efficiency (see the sketch after this list)
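As a point of comparison for the third approach, here is a minimal driver-side sketch, not part of the original program, of switching a job over to CombineTextInputFormat, which packs many small files into each input split. Note that CombineTextInputFormat feeds the mapper `LongWritable`/`Text` records line by line, so the mapper defined below would not apply as-is; the helper class name and the split-size value are illustrative assumptions:

~~~
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;

// Hypothetical helper: configure an existing job to combine small files into larger splits
public class CombineConfig {
    public static void useCombineInput(Job job) {
        job.setInputFormatClass(CombineTextInputFormat.class);
        // Pack small files into splits of at most 4 MB each (illustrative value)
        CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);
    }
}
~~~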
# Implementation

This section implements the second approach above.

The core mechanism of the program:

- Define a custom InputFormat
- Override its RecordReader so that each call reads one complete file and wraps it as a key-value pair
- Use SequenceFileOutputFormat on the output side to write the merged file

**The code is as follows**

The custom InputFormat:

~~~
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    // Mark every small file as non-splittable so that each file yields exactly one key-value pair
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return false;
    }

    // Create the reader that streams in one whole file at a time
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(split, context);
        return reader;
    }
}
~~~

The custom RecordReader:

~~~
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;
    private Configuration conf;
    private BytesWritable value = new BytesWritable();
    // Whether the file has been read yet
    private boolean processed = false;

    // Initialization
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // Since files are non-splittable, one split is one whole file. Cast to FileSplit,
        // because the generic InputSplit does not expose the path and length we need later
        this.fileSplit = (FileSplit) split;
        // Get the configuration from the context
        this.conf = context.getConfiguration();
    }

    // Read one complete file per call; with N input files the framework loops N times
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            // Buffer sized to hold the whole file
            byte[] contents = new byte[(int) fileSplit.getLength()];
            // Get the file path from the split
            Path file = fileSplit.getPath();
            // Get the file system from the path
            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream in = null;
            try {
                // Open the file and copy its entire contents into the buffer
                in = fs.open(file);
                IOUtils.readFully(in, contents, 0, contents.length);
                // Hand the bytes over as the value
                value.set(contents, 0, contents.length);
            } finally {
                // Close the input stream only. Do not close the FileSystem here:
                // FileSystem.get() returns a cached instance shared across the task
                IOUtils.closeStream(in);
            }
            // Mark the file as read so the next call returns false
            processed = true;
            return true;
        }
        return false;
    }

    // The current key
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    // The current value
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    // Progress is all-or-nothing: the file is either read or not
    @Override
    public float getProgress() throws IOException {
        return processed ? 1.0f : 0.0f;
    }

    // Nothing to clean up
    @Override
    public void close() throws IOException {
        // do nothing
    }
}
~~~

**Define the MapReduce flow**

The map stage:

~~~
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// The mapper's input types are whatever the custom RecordReader's generic types are
class SequenceFileMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

    private Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get this task's split from the context
        FileSplit split = (FileSplit) context.getInputSplit();
        // Use the file path as the output key
        Path path = split.getPath();
        k.set(path.toString());
    }

    // map() runs once per file, since each file yields exactly one record
    @Override
    protected void map(NullWritable key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        context.write(k, value);
    }
}
~~~

The reduce stage:

~~~
import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {

    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context)
            throws IOException, InterruptedException {
        for (BytesWritable bytesWritable : values) {
            context.write(key, bytesWritable);
        }
    }
}
~~~

The driver:

~~~
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SequenceFileDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(SequenceFileDriver.class);
        job.setMapperClass(SequenceFileMapper.class);
        job.setReducerClass(SequenceFileReducer.class);

        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Tell the framework where the input files live
        FileInputFormat.setInputPaths(job, new Path("/Users/jdxia/Desktop/website/data/input/"));

        // Delete the output directory if it already exists
        Path out = new Path("/Users/jdxia/Desktop/website/data/output/");
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(out)) {
            fileSystem.delete(out, true);
        }

        // Tell the framework where to write the result
        FileOutputFormat.setOutputPath(job, out);

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
~~~
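To check that the merge behaved as described (file path as key, file bytes as value), here is a minimal read-back sketch, not part of the original program; the class name `SequenceFileChecker` is hypothetical and the path is taken from the driver above:

~~~
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileChecker {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Merged output written by the driver above (assumed path)
        Path merged = new Path("/Users/jdxia/Desktop/website/data/output/part-r-00000");
        try (SequenceFile.Reader reader =
                     new SequenceFile.Reader(conf, SequenceFile.Reader.file(merged))) {
            Text key = new Text();                     // original file path + name
            BytesWritable value = new BytesWritable(); // original file contents
            while (reader.next(key, value)) {
                System.out.println(key + " -> " + value.getLength() + " bytes");
            }
        }
    }
}
~~~

The same check can also be done from the shell with `hadoop fs -text`, which knows how to render SequenceFiles as text.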