生成HFile以及入库到HBase

问题导读：
1.如何通过mr生成HFile
2.改进后的HFileOutputFormat有什么新增特性？
3.HFile如何入库到HBase？

一、MR生成HFile文件

package insert.tools.hfile;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TestHFileToHBase {

        public static class TestHFileToHBaseMapper extends Mapper {

                @Override
                protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                        String[] values = value.toString().split("/t", 2);
                        byte[] row = Bytes.toBytes(values[0]);
                        ImmutableBytesWritable k = new ImmutableBytesWritable(row);
                        KeyValue kvProtocol = new KeyValue(row, "PROTOCOLID".getBytes(), "PROTOCOLID".getBytes(), values[1]
                                        .getBytes());
                        context.write(k, kvProtocol);

                        // KeyValue kvSrcip = new KeyValue(row, "SRCIP".getBytes(),
                        // "SRCIP".getBytes(), values[1].getBytes());
                        // context.write(k, kvSrcip);
//                         HFileOutputFormat.getRecordWriter 
                }

        }

        public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
                Configuration conf = HBaseConfiguration.create();
                Job job = new Job(conf, "TestHFileToHBase");
                job.setJarByClass(TestHFileToHBase.class);

                job.setOutputKeyClass(ImmutableBytesWritable.class);
                job.setOutputValueClass(KeyValue.class);

                job.setMapperClass(TestHFileToHBaseMapper.class);
                job.setReducerClass(KeyValueSortReducer.class);
//                job.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.HFileOutputFormat.class);
                job.setOutputFormatClass(HFileOutputFormat.class);
                // job.setNumReduceTasks(4);
                // job.setPartitionerClass(org.apache.hadoop.hbase.mapreduce.SimpleTotalOrderPartitioner.class);

                // HBaseAdmin admin = new HBaseAdmin(conf);
//                HTable table = new HTable(conf, "hua");

                 HFileOutputFormat.configureIncrementalLoad(job, table);

                FileInputFormat.addInputPath(job, new Path(args[0]));
                FileOutputFormat.setOutputPath(job, new Path(args[1]));

                System.exit(job.waitForCompletion(true) ? 0 : 1);
        }

}
复制代码

二、改进后的HFileOutputFormat

源码中的HFileOutputFormat只适合一次生成一个列族的HFile，改进后的HFileOutputFormat适合同时为多个列族生成HFile文件。有add标签的是在源码上新增的代码，有revise标签的是在源码上修改的代码（被替换的原代码以注释形式保留）。参考：https://review.cloudera.org/r/12 ... 977#file17977line93

/**
 * Copyright 2009 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package insert.tools.hfile;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer;
import org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import com.google.common.base.Preconditions;

/**
 * Writes HFiles. Passed KeyValues must arrive in order. Currently, can only
 * write files to a single column family at a time. Multiple column families
 * requires coordinating keys cross family. Writes current time as the sequence
 * id for the file. Sets the major compacted attribute on created hfiles.
 * 
 * @see KeyValueSortReducer
 */
public class HFileOutputFormat extends
                FileOutputFormat {
        static Log LOG = LogFactory.getLog(HFileOutputFormat.class);

        public RecordWriter getRecordWriter(
                        final TaskAttemptContext context) throws IOException,
                        InterruptedException {
                // Get the path of the temporary output file
                final Path outputPath = FileOutputFormat.getOutputPath(context);
                final Path outputdir = new FileOutputCommitter(outputPath, context)
                                .getWorkPath();
                Configuration conf = context.getConfiguration();
                final FileSystem fs = outputdir.getFileSystem(conf);
                // These configs. are from hbase-*.xml
                // revise
                // final long maxsize = conf.getLong("hbase.hregion.max.filesize",
                // 268435456);
                // final int blocksize = conf.getInt("hfile.min.blocksize.size", 65536);
                final long maxsize = conf.getLong("hbase.hregion.max.filesize",
                                HConstants.DEFAULT_MAX_FILE_SIZE);
                final int blocksize = conf.getInt("hfile.min.blocksize.size",
                                HFile.DEFAULT_BLOCKSIZE);
                // -revise

                // Invented config. Add to hbase-*.xml if other than default
                // compression.
                final String compression = conf.get("hfile.compression",
                                Compression.Algorithm.NONE.getName());

                return new RecordWriter() {
                        // Map of families to writers and how much has been output on the
                        // writer.
                        private final Map<byte[], WriterLength> writers = new TreeMap<byte[], WriterLength>(
                                        Bytes.BYTES_COMPARATOR);
                        private byte[] previousRow = HConstants.EMPTY_BYTE_ARRAY;
                        private final byte[] now = Bytes
                                        .toBytes(System.currentTimeMillis());
                        // add
                        private boolean rollRequested = false;

                        // -add

                        public void write(ImmutableBytesWritable row, KeyValue kv)
                                        throws IOException {
                                // add
                                // null input == user explicitly wants to flush
                                if (row == null && kv == null) {
                                        rollWriters();
                                        return;
                                }
                                byte[] rowKey = kv.getRow();
                                // -add

                                long length = kv.getLength();
                                byte[] family = kv.getFamily();
                                WriterLength wl = this.writers.get(family);
                                // revise
                                // if (wl == null
                                // || ((length + wl.written) >= maxsize)
                                // && Bytes.compareTo(this.previousRow, 0,
                                // this.previousRow.length, kv.getBuffer(), kv
                                // .getRowOffset(), kv.getRowLength()) != 0) {
                                // // Get a new writer.
                                // Path basedir = new Path(outputdir, Bytes.toString(family));
                                // if (wl == null) {
                                // wl = new WriterLength();
                                // this.writers.put(family, wl);
                                // if (this.writers.size() > 1)
                                // throw new IOException("One family only");
                                // // If wl == null, first file in family. Ensure family
                                // // dir exits.
                                // if (!fs.exists(basedir))
                                // fs.mkdirs(basedir);
                                // }
                                // wl.writer = getNewWriter(wl.writer, basedir);
                                // LOG
                                // .info("Writer="
                                // + wl.writer.getPath()
                                // + ((wl.written == 0) ? "" : ", wrote="
                                // + wl.written));
                                // wl.written = 0;
                                // }

                                // If this is a new column family, verify that the directory
                                // exists
                                if (wl == null) {
                                        fs.mkdirs(new Path(outputdir, Bytes.toString(family)));
                                }
                                // If any of the HFiles for the column families has reached
                                // maxsize, we need to roll all the writers
                                if (wl != null && wl.written + length >= maxsize) {
                                        this.rollRequested = true;
                                }
                                // This can only happen once a row is finished though
                                if (rollRequested
                                                && Bytes.compareTo(this.previousRow, rowKey) != 0) {
                                        rollWriters();
                                }
                                // create a new HLog writer, if necessary
                                if (wl == null || wl.writer == null) {
                                        wl = getNewWriter(family);
                                }
                                // we now have the proper HLog writer. full steam ahead

                                // -revise

                                kv.updateLatestStamp(this.now);
                                wl.writer.append(kv);
                                wl.written += length;
                                // Copy the row so we know when a row transition.
                                // revise
                                // this.previousRow = kv.getRow();
                                this.previousRow = rowKey;
                                // -revise
                        }

                        // revise
                        // /*
                        // * Create a new HFile.Writer. Close current if there is one.
                        // *
                        // * @param writer
                        // *
                        // * @param familydir
                        // *
                        // * @return A new HFile.Writer.
                        // *
                        // * @throws IOException
                        // */
                        // private HFile.Writer getNewWriter(final HFile.Writer writer,
                        // final Path familydir) throws IOException {
                        // close(writer);
                        // return new HFile.Writer(fs, StoreFile.getUniqueFile(fs,
                        // familydir), blocksize, compression,
                        // KeyValue.KEY_COMPARATOR);
                        // }
                        private void rollWriters() throws IOException {
                                for (WriterLength wl : this.writers.values()) {
                                        if (wl.writer != null) {
                                                LOG.info("Writer="
                                                                + wl.writer.getPath()
                                                                + ((wl.written == 0) ? "" : ", wrote="
                                                                                + wl.written));
                                                close(wl.writer);
                                        }
                                        wl.writer = null;
                                        wl.written = 0;
                                }
                                this.rollRequested = false;
                        }

                        /*
                         * Create a new HFile.Writer.
                         * 
                         * @param family
                         * 
                         * @return A WriterLength, containing a new HFile.Writer.
                         * 
                         * @throws IOException
                         */
                        private WriterLength getNewWriter(byte[] family) throws IOException {
                                WriterLength wl = new WriterLength();
                                Path familydir = new Path(outputdir, Bytes.toString(family));
                                wl.writer = new HFile.Writer(fs, StoreFile.getUniqueFile(fs,
                                                familydir), blocksize, compression,
                                                KeyValue.KEY_COMPARATOR);
                                this.writers.put(family, wl);
                                return wl;
                        }

                        // -revise

                        private void close(final HFile.Writer w) throws IOException {
                                if (w != null) {
                                        w.appendFileInfo(StoreFile.BULKLOAD_TIME_KEY, Bytes
                                                        .toBytes(System.currentTimeMillis()));
                                        w.appendFileInfo(StoreFile.BULKLOAD_TASK_KEY, Bytes
                                                        .toBytes(context.getTaskAttemptID().toString()));
                                        w.appendFileInfo(StoreFile.MAJOR_COMPACTION_KEY, Bytes
                                                        .toBytes(true));
                                        w.close();
                                }
                        }

                        // revise
                        // public void close(TaskAttemptContext c) throws IOException,
                        // InterruptedException {
                        // for (Map.Entry e : this.writers
                        // .entrySet()) {
                        // close(e.getValue().writer);
                        // }
                        // }
                        public void close(TaskAttemptContext c) throws IOException,
                                        InterruptedException {
                                for (WriterLength wl : this.writers.values()) {
                                        close(wl.writer);
                                }
                        }
                        // -revise
                };
        }

        /*
         * Data structure to hold a Writer and amount of data written on it.
         */
        static class WriterLength {
                long written = 0;
                HFile.Writer writer = null;
        }

        /**
         * Return the start keys of all of the regions in this table, as a list of
         * ImmutableBytesWritable.
         */
        private static List getRegionStartKeys(HTable table)
                        throws IOException {
                byte[][] byteKeys = table.getStartKeys();
                ArrayList ret = new ArrayList(
                                byteKeys.length);
                for (byte[] byteKey : byteKeys) {
                        ret.add(new ImmutableBytesWritable(byteKey));
                }
                return ret;
        }

        /**
         * Write out a SequenceFile that can be read by TotalOrderPartitioner that
         * contains the split points in startKeys.
         * 
         * @param partitionsPath
         *            output path for SequenceFile
         * @param startKeys
         *            the region start keys
         */
        private static void writePartitions(Configuration conf,
                        Path partitionsPath, List startKeys)
                        throws IOException {
                Preconditions.checkArgument(!startKeys.isEmpty(), "No regions passed");

                // We're generating a list of split points, and we don't ever
                // have keys < the first region (which has an empty start key)
                // so we need to remove it. Otherwise we would end up with an
                // empty reducer with index 0
                TreeSet sorted = new TreeSet(
                                startKeys);

                ImmutableBytesWritable first = sorted.first();
                Preconditions
                                .checkArgument(
                                                first.equals(HConstants.EMPTY_BYTE_ARRAY),
                                                "First region of table should have empty start key. Instead has: %s",
                                                Bytes.toStringBinary(first.get()));
                sorted.remove(first);

                // Write the actual file
                FileSystem fs = partitionsPath.getFileSystem(conf);
                SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
                                partitionsPath, ImmutableBytesWritable.class,
                                NullWritable.class);

                try {
                        for (ImmutableBytesWritable startKey : sorted) {
                                writer.append(startKey, NullWritable.get());
                        }
                } finally {
                        writer.close();
                }
        }

        /**
         * Configure a MapReduce Job to perform an incremental load into the given
         * table. This
         * 

         * 
Inspects the table to configure a total order partitioner
         * 
Uploads the partitions file to the cluster and adds it to the
         * DistributedCache
         * 
Sets the number of reduce tasks to match the current number of
         * regions
         * 
Sets the output key/value class to match HFileOutputFormat's
         * requirements
         * 
Sets the reducer up to perform the appropriate sorting (either
         * KeyValueSortReducer or PutSortReducer)
         * 

         * The user should be sure to set the map output value class to either
         * KeyValue or Put before running this function.
         */
        public static void configureIncrementalLoad(Job job, HTable table)
                        throws IOException {
                Configuration conf = job.getConfiguration();
                job.setPartitionerClass(TotalOrderPartitioner.class);
                job.setOutputKeyClass(ImmutableBytesWritable.class);
                job.setOutputValueClass(KeyValue.class);
                job.setOutputFormatClass(HFileOutputFormat.class);

                // Based on the configured map output class, set the correct reducer to
                // properly
                // sort the incoming values.
                // TODO it would be nice to pick one or the other of these formats.
                if (KeyValue.class.equals(job.getMapOutputValueClass())) {
                        job.setReducerClass(KeyValueSortReducer.class);
                } else if (Put.class.equals(job.getMapOutputValueClass())) {
                        job.setReducerClass(PutSortReducer.class);
                } else {
                        LOG.warn("Unknown map output value type:"
                                        + job.getMapOutputValueClass());
                }

                LOG.info("Looking up current regions for table " + table);
                List startKeys = getRegionStartKeys(table);
                LOG.info("Configuring " + startKeys.size() + " reduce partitions "
                                + "to match current region count");
                job.setNumReduceTasks(startKeys.size());

                Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_"
                                + System.currentTimeMillis());
                LOG.info("Writing partition information to " + partitionsPath);

                FileSystem fs = partitionsPath.getFileSystem(conf);
                writePartitions(conf, partitionsPath, startKeys);
                partitionsPath.makeQualified(fs);
                URI cacheUri;
                try {
                        cacheUri = new URI(partitionsPath.toString() + "#"
                                        + TotalOrderPartitioner.DEFAULT_PATH);
                } catch (URISyntaxException e) {
                        throw new IOException(e);
                }
                DistributedCache.addCacheFile(cacheUri, conf);
                DistributedCache.createSymlink(conf);

                LOG.info("Incremental table output configured.");
        }

}
复制代码

三、MR生成HFile的注意事项

1. 无论是map还是reduce作为最终的输出结果，输出的key和value的类型应该是：<ImmutableBytesWritable, KeyValue> 或者 <ImmutableBytesWritable, Put>。

2. Map或者reduce的输出类型是KeyValue 或Put对应KeyValueSortReducer或PutSortReducer。

3. MR例子中job.setOutputFormatClass(HFileOutputFormat.class); HFileOutputFormat是改进后的mr，可适用于多列族同时生成HFile文件，源码中只适合一次对单列族组织成HFile文件。

4. MR例子中HFileOutputFormat.configureIncrementalLoad(job, table);会自动对job进行配置，其中设置的TotalOrderPartitioner需要先对key进行整体排序，然后划分到每个reduce中，保证每一个reducer中key的最小值到最大值的区间范围不会有交集。

因为入库到HBase的时候，作为一个整体的Region，key是绝对有序的。

5. MR例子中最后生成HFile存储在HDFS上，输出路径下的子目录是各个列族。如果对HFile进行入库HBase，相当于move HFile到HBase的Region中，HFile子目录的列族内容没有了。

四、HFile入库到HBase

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;

public class TestLoadIncrementalHFileToHBase {

        // private static final byte[] TABLE = Bytes.toBytes("hua");
        // private static final byte[] QUALIFIER = Bytes.toBytes("PROTOCOLID");
        // private static final byte[] FAMILY = Bytes.toBytes("PROTOCOLID");

        public static void main(String[] args) throws IOException {
                Configuration conf = HBaseConfiguration.create();
//                byte[] TABLE = Bytes.toBytes("hua");
                byte[] TABLE = Bytes.toBytes(args[0]);
                HTable table = new HTable(TABLE);
                LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
                loader.doBulkLoad(new Path(args[1]), table);
//                loader.doBulkLoad(new Path("/hua/testHFileResult/"), table);
        }

}
复制代码

五、HFile入库到HBase注意事项

1. 通过HBase中LoadIncrementalHFiles的doBulkLoad方法，对生成的HFile文件入库。上面示例程序的第一个命令行参数是表名，第二个命令行参数是HFile的路径（即以上MR生成HFile的输出路径）；也可以一个个列族地录入到HBase中对应的表列族。

2. 如何入库的相关链接：

http://hbase.apache.org/docs/r0.89.20100726/bulk-loads.html

http://hbase.apache.org/docs/r0. ... e-summary.html#bulk

http://genius-bai.javaeye.com/blog/641927

3. 入库分为代码入库以及脚本入库。代码入库有两种，一种是

hadoop jar hbase-VERSION.jar completebulkload /myoutput mytable；

另外一种是通过以上的TestLoadIncrementalHFileToHBase类。

脚本入库为：jruby $HBASE_HOME/bin/loadtable.rb hbase-mytable hadoop-hbase-hfile-outputdir。

sunshine_junge · 发表于 2014-8-2 12:31:53

good,,,,,,,,

zqh_2014 · 发表于 2014-11-11 13:52:57

第一种MR生成HFile，有详细的资料吗？目前在HDFS上有一个txt文件，然后在eclipse运行代码，会有异常：
Exception in thread "main" java.lang.IllegalArgumentException: No regions passed
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2.writePartitions(HFileOutputFormat2.java:307)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2.configurePartitioner(HFileOutputFormat2.java:527)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2.configureIncrementalLoad(HFileOutputFormat2.java:391)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat.configureIncrementalLoad(HFileOutputFormat.java:88)
at TestHFileToHBase.main(TestHFileToHBase.java:73)