HBase based map reduce job unit testing

4
HBase based MapReduce Job Unit Testing made easy. Thursday, 07 AUG 2014. POSTED BY ASHOK AGARWAL IN BIG DATA. In one of our projects we used HBase as the data source for our MapReduce jobs. The HBase Book provides many examples of writing MapReduce jobs that use HBase tables as the input source. Refer to the HBase MapReduce Examples. The MapReduce code below uses the TableMapper: package com.jbksoft.mapper; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.mapreduce.TableMapper; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import java.io.IOException; /** * Created with IntelliJ IDEA. * User: ashok.agarwal * Date: 8/6/14 * Time: 5:46 PM * * The mapper below is used for finding frequency of first name. */ public class MyTableMapper extends TableMapper<Text, IntWritable> { public static final byte[] COL_FAMILY = "CF".getBytes(); public static final byte[] FIRST_NAME_COL_QUALIFIER = "fn".getBytes(); public static final byte[] MIDDLE_NAME_COL_QUALIFIER = "mi".getBytes(); public static final byte[] LAST_NAME_COL_QUALIFIER = "ln".getBytes(); public void map(ImmutableBytesWritable row, Result value, Context context) throws IOException, InterruptedException { String rowKey = new String(row.get()); String[] keyParts = rowKey.split("/"); String firstName = Bytes.toString(value.getValue(COL_FAMILY, FIRST_NAME_COL_QUALIFIER)); String middleName = Bytes.toString(value.getValue(COL_FAMILY, MIDDLE_NAME_COL_QUALIFIER)); String lastName = Bytes.toString(value.getValue(COL_FAMILY, LAST_NAME_COL_QUALIFIER)); context.write(new Text(firstName), new IntWritable(1));

description

Unit tests for HBase MapReduce jobs can be implemented easily with MRUnit.

Transcript of HBase based map reduce job unit testing

Page 1: HBase based map reduce job unit testing

HBase based MapReduce Job Unit Testing made easy

Thursday, 07 AUG 2014

POSTED BY ASHOK AGARWAL IN BIG DATA

In one of our projects we used HBase as the data source for our MapReduce

jobs. The HBase Book provides many examples of writing MapReduce jobs that use HBase

tables as the input source. Refer to the HBase MapReduce Examples.

The MapReduce code below uses the TableMapper:

package com.jbksoft.mapper;

import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.io.IOException;

/**
 * Created with IntelliJ IDEA.
 * User: ashok.agarwal
 * Date: 8/6/14
 * Time: 5:46 PM
 *
 * The mapper below is used for finding frequency of first name.
 *
 * <p>For every HBase row it reads the first-name cell ({@code CF:fn}) and emits
 * {@code (firstName, 1)}, so that a summing reducer can produce the number of
 * occurrences of each first name.
 */
public class MyTableMapper extends TableMapper<Text, IntWritable> {

    // Bytes.toBytes always encodes as UTF-8, unlike String.getBytes(), which
    // depends on the platform default charset.

    /** Column family that holds the name columns. */
    public static final byte[] COL_FAMILY = Bytes.toBytes("CF");

    /** Qualifier of the first-name column. */
    public static final byte[] FIRST_NAME_COL_QUALIFIER = Bytes.toBytes("fn");

    /** Qualifier of the middle-name column (not read by this mapper; kept for callers). */
    public static final byte[] MIDDLE_NAME_COL_QUALIFIER = Bytes.toBytes("mi");

    /** Qualifier of the last-name column (not read by this mapper; kept for callers). */
    public static final byte[] LAST_NAME_COL_QUALIFIER = Bytes.toBytes("ln");

    /**
     * Emits {@code (firstName, 1)} for the given row.
     *
     * @param row     row key of the current HBase row; not needed for the count
     * @param value   cells of the current row
     * @param context MapReduce context the output pair is written to
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    public void map(ImmutableBytesWritable row, Result value, Context context)
            throws IOException, InterruptedException {
        // Result.getValue returns null when the cell is absent, and
        // Bytes.toString passes that null through.
        String firstName = Bytes.toString(value.getValue(COL_FAMILY, FIRST_NAME_COL_QUALIFIER));
        if (firstName == null) {
            // A row without a first name has nothing to count; new Text(null)
            // would otherwise fail the whole task with a NullPointerException.
            return;
        }
        context.write(new Text(firstName), new IntWritable(1));
    }
}

For the above mapper, the input key is of type ImmutableBytesWritable. It can be created

by constructing an ImmutableBytesWritable from the byte array of the row key: String key = csvCells[1] + "/" + csvCells[2] + "/" + csvCells[3]; ImmutableBytesWritable rowKey = new ImmutableBytesWritable(key.getBytes());

The Result object can be created by adding KeyValue objects, one per column, to a

collection. For example: new KeyValue(key.get(), COL_FAMILY, FIRST_NAME_COL_QUALIFIER, Bytes.toBytes(csvCells[1]))

Below is the complete JUnit test case, written using MRUnit:

package com.jbksoft.test;

import au.com.bytecode.opencsv.CSVReader;
import com.jbksoft.mapper.MyTableMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Created with IntelliJ IDEA.
 * User: ashok.agarwal
 * Date: 8/6/14
 * Time: 6:06 PM
 * Test Case for MyTableMapper
 *
 * <p>Each CSV fixture record has the layout
 * {@code date, firstName, middleName, lastName, country}. A record is turned
 * into the (row key, Result) pair that HBase would hand to the mapper, and
 * MRUnit's MapDriver runs the mapper against it.
 */
public class MyTableMapperTest {

    MyTableMapper mapper;
    MapDriver<ImmutableBytesWritable, Result, Text, IntWritable> mapDriver;
    Configuration config;
    String path;

    static String[] CSV = {
            "\"2014-03-31\",\"GEORGE\",\"W\",\"BUSH\",\"USA\"",
            "\"2014-03-31\",\"SUSAN\",\"B\",\"ANTHONY\",\"USA\""
    };

    /** Creates a fresh mapper and MapDriver, sharing one job configuration, before each test. */
    @Before
    public void setup() throws Exception {
        path = getClass().getProtectionDomain().getCodeSource().getLocation().getPath();
        config = HBaseConfiguration.create();
        setConfig(config);

        mapper = new MyTableMapper();
        mapDriver = MapDriver.newMapDriver(mapper);
        mapDriver.setConfiguration(config);
    }

    /**
     * Populates the job configuration used by the test.
     *
     * @param config configuration to fill in
     */
    public void setConfig(Configuration config) {
        config.set("startDate", "2014-03-03T00:00:00Z");
        config.set("period_in_days", "7");
        config.set("outputPath", path + "data");
    }

    /** One input row must produce exactly one (firstName, 1) output pair. */
    @Test
    public void testMap1Input1Output() throws Exception {
        mapDriver.withInput(getKey(CSV[0]), getResult(CSV[0]));
        mapDriver.withOutput(new Text("GEORGE"), new IntWritable(1));
        mapDriver.runTest();
    }

    /**
     * Builds the HBase row key for a CSV record.
     *
     * @param csvRecord one quoted, comma-separated fixture record
     * @return row key of the form {@code firstName/middleName/lastName}
     * @throws Exception if the record cannot be parsed
     */
    public ImmutableBytesWritable getKey(String csvRecord) throws Exception {
        String[] csvCells = parseCsv(csvRecord);
        // Key of record from Hbase
        String key = csvCells[1] + "/" + csvCells[2] + "/" + csvCells[3];
        return new ImmutableBytesWritable(Bytes.toBytes(key));
    }

    /**
     * Builds the Result (row contents) for a CSV record, with one cell each
     * for the first, middle and last name.
     *
     * @param csvRecord one quoted, comma-separated fixture record
     * @return Result holding the CF:fn, CF:mi and CF:ln cells of the row
     * @throws Exception if the record cannot be parsed
     */
    public Result getResult(String csvRecord) throws Exception {
        String[] csvCells = parseCsv(csvRecord);
        byte[] rowKey = getKey(csvRecord).get();

        // Reuse the mapper's public column constants so the fixture cannot
        // drift from what the mapper actually reads. Each name part goes
        // under its own qualifier.
        List<KeyValue> kvs = new ArrayList<KeyValue>();
        kvs.add(new KeyValue(rowKey, MyTableMapper.COL_FAMILY,
                MyTableMapper.FIRST_NAME_COL_QUALIFIER, Bytes.toBytes(csvCells[1])));
        kvs.add(new KeyValue(rowKey, MyTableMapper.COL_FAMILY,
                MyTableMapper.MIDDLE_NAME_COL_QUALIFIER, Bytes.toBytes(csvCells[2])));
        kvs.add(new KeyValue(rowKey, MyTableMapper.COL_FAMILY,
                MyTableMapper.LAST_NAME_COL_QUALIFIER, Bytes.toBytes(csvCells[3])));
        return keyValueToResult(kvs);
    }

    /**
     * Wraps the given cells in a Result.
     *
     * @param kvs cells of a single row, in any order
     * @return Result over the cells, sorted with {@code KeyValue.COMPARATOR}
     */
    protected Result keyValueToResult(List<KeyValue> kvs) {
        KeyValue[] kvsArray = kvs.toArray(new KeyValue[0]);
        // Result expects its cells in KeyValue.COMPARATOR order for lookups
        // such as getValue, so sort before wrapping.
        Arrays.sort(kvsArray, KeyValue.COMPARATOR);
        List<KeyValue> kvsSorted = Arrays.asList(kvsArray);
        return new Result(kvsSorted);
    }

    /** Parses a single CSV record into its cells and closes the reader. */
    private static String[] parseCsv(String csvRecord) throws IOException {
        CSVReader csvReader = new CSVReader(new StringReader(csvRecord), ',');
        try {
            return csvReader.readNext();
        } finally {
            csvReader.close();
        }
    }
}