HDFS的I/O 流操作 · Hadoop2.x

使用流的方式實現文件上傳到HDFS系統或下載HDFS系統的文件到本地。
<br/>

*`com/exa/hdfs001/HdfsClientIo.java`*
```java
package com.exa.hdfs001;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.junit.Test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * Stream-based HDFS client examples: upload a local file, download a file,
 * and download a large file in two parts using {@code seek}.
 */
public class HdfsClientIo {

    /** URI of the HDFS cluster every example connects to. */
    private static final String HDFS_URI = "hdfs://hadoop101:9000";

    /** User name passed to {@link FileSystem#get(URI, Configuration, String)}. */
    private static final String HDFS_USER = "root";

    /** HDFS path of the archive that is downloaded in two parts. */
    private static final String ARCHIVE_PATH = "/user/hadoop/input/hadoop-2.6.0-cdh5.14.2.tar.gz";

    /**
     * Size in bytes of the first downloaded part (128 MB). Declared as a
     * {@code long} so the arithmetic cannot overflow {@code int}, and shared by
     * both part downloads so that part 2 starts exactly where part 1 ends.
     */
    private static final long FIRST_PART_BYTES = 128L * 1024 * 1024;

    /** Opens the HDFS file system at {@link #HDFS_URI} as {@link #HDFS_USER}. */
    private static FileSystem openFileSystem(Configuration configuration)
            throws URISyntaxException, IOException, InterruptedException {
        return FileSystem.get(new URI(HDFS_URI), configuration, HDFS_USER);
    }

    /**
     * Uploads the local file {@code d:/hello.txt} to HDFS; an existing target
     * file is overwritten.
     */
    @Test
    public void putFileToHDFS() throws URISyntaxException, IOException, InterruptedException {
        Configuration configuration = new Configuration();
        // try-with-resources closes the streams and the file system even when
        // the copy fails, and reports a failed close instead of hiding it.
        try (FileSystem fs = openFileSystem(configuration);
             FileInputStream fis = new FileInputStream(new File("d:/hello.txt"));
             FSDataOutputStream fos = fs.create(new Path("/user/hadoop/input/hello.txt"))) {
            IOUtils.copyBytes(fis, fos, configuration);
        }
    }

    /**
     * Downloads {@code /user/hadoop/input/hello.txt} from HDFS to
     * {@code d:/hello.txt}; an existing local file is overwritten.
     */
    @Test
    public void getFileFromHDFS() throws URISyntaxException, IOException, InterruptedException {
        Configuration configuration = new Configuration();
        try (FileSystem fs = openFileSystem(configuration);
             FSDataInputStream fis = fs.open(new Path("/user/hadoop/input/hello.txt"));
             FileOutputStream fos = new FileOutputStream(new File("d:/hello.txt"))) {
            IOUtils.copyBytes(fis, fos, configuration);
        }
    }

    /**
     * Partial read of an HDFS file: downloads the first 128 MB of the archive
     * (total size about 413 MB) into {@code ...tar.gz.part1}.
     */
    @Test
    public void readFileSeek1() throws URISyntaxException, IOException, InterruptedException {
        Configuration configuration = new Configuration();
        try (FileSystem fs = openFileSystem(configuration);
             FSDataInputStream fis = fs.open(new Path(ARCHIVE_PATH));
             FileOutputStream fos = new FileOutputStream(
                     new File("d:/hadoop-2.6.0-cdh5.14.2.tar.gz.part1"))) {
            // 1024-byte transfer buffer.
            byte[] buf = new byte[1024];
            long remaining = FIRST_PART_BYTES;
            while (remaining > 0) {
                // read() may return fewer bytes than requested, so only the
                // bytes actually read are written and counted.
                int wanted = (int) Math.min(buf.length, remaining);
                int read = fis.read(buf, 0, wanted);
                if (read < 0) {
                    break; // end of file reached before 128 MB
                }
                fos.write(buf, 0, read);
                remaining -= read;
            }
        }
    }

    /**
     * Partial read of an HDFS file: downloads everything after the first
     * 128 MB of the archive into {@code ...tar.gz.part2}.
     */
    @Test
    public void readFileSeek2() throws URISyntaxException, IOException, InterruptedException {
        Configuration configuration = new Configuration();
        try (FileSystem fs = openFileSystem(configuration);
             FSDataInputStream fis = fs.open(new Path(ARCHIVE_PATH))) {
            // Skip the 128 MB already downloaded as part 1. Offsets are in
            // bytes: 1024 B = 1 KB, 1024 KB = 1 MB.
            fis.seek(FIRST_PART_BYTES);
            try (FileOutputStream fos = new FileOutputStream(
                    new File("d:/hadoop-2.6.0-cdh5.14.2.tar.gz.part2"))) {
                IOUtils.copyBytes(fis, fos, configuration);
            }
        }
    }
}
```

程序中分塊讀取HDFS文件后，將兩個文件合并后就得到了原文件，如下將hadoop-2.6.0-cdh5.14.2.tar.gz.part2合并到hadoop-2.6.0-cdh5.14.2.tar.gz.part1中

![](https://img.kancloud.cn/79/9d/799da7fa082ff6f10c1c1659a03734ca_877x88.png)

從上面的案例中可以看出，HDFS 提供的流讀取數據的方式，可以從任意位置開始讀取數據。這與后面 MapReduce 獲取數據分片相關。