From c73a87bc09b76cc2724aa7db9f53724ee2036dd0 Mon Sep 17 00:00:00 2001
From: Jesse Yates
Date: Thu, 4 Apr 2013 17:36:09 -0700
Subject: [PATCH] Adding hbase-stats project to contrib/

This is the basis for supporting equal-depth histograms per region (guideposts, #49). It's basically a 0.94 backport of the code attached to HBASE-7958. Logically, there is no real difference, though the implementation has some slight changes, as on trunk there is no need for a coprocessor - the stats gathering is all built in.

Currently, HBASE-7958 is still open for review and depends on the system tables (HBASE-7999) or namespaces (HBASE-8105) patches. We resolve this by just giving the stats table a special name that should be fairly close to the pending name for the stats table in HBASE-7958. In the event that we cannot maintain the same table name for the stats table, we do have an opportunity to copy the data over to the new table, as there is currently a required downtime to upgrade from 0.94 to 0.96.

This is actually a bit more advanced than the posted patch - a lot more work has gone into usability and verifying correctness. Further, there are some obvious changes as we need to support coprocessors rather than built-in options (but that is actually a relatively minor change).
---
 contrib/hbase-stat/README.md | 143 +++++
 contrib/hbase-stat/pom.xml | 215 +++++++
 .../protobuf/generated/StatisticProtos.java | 543 ++++++++++++++++++
 .../salesforce/hbase/stats/BaseStatistic.java | 80 +++
 .../hbase/stats/ColumnFamilyStatistic.java | 60 ++
 .../hbase/stats/HistogramStatisticValue.java | 100 ++++
 .../salesforce/hbase/stats/StatScanner.java | 91 +++
 .../hbase/stats/StatisticReader.java | 187 ++++++
 .../hbase/stats/StatisticTracker.java | 27 +
 .../hbase/stats/StatisticValue.java | 43 ++
 .../hbase/stats/StatisticsTable.java | 160 ++++++
 .../stats/cleanup/CleanupStatistics.java | 74 +++
 .../stats/cleanup/RemoveRegionOnSplit.java | 68 +++
 .../stats/cleanup/RemoveTableOnDelete.java | 33 ++
 ...ualByteDepthHistogramStatisticTracker.java | 96 ++++
 .../hbase/stats/impl/MinMaxKey.java | 121 ++++
 .../hbase/stats/impl/TrackerUtil.java | 23 +
 .../HistogramStatisticReader.java | 37 ++
 .../IndividualStatisticReader.java | 17 +
 .../IndividualStatisticWriter.java | 31 +
 .../serialization/PointStatisticReader.java | 52 ++
 .../stats/serialization/StatisticSerDe.java | 70 +++
 .../stats/serialization/package-info.java | 51 ++
 .../hbase/stats/util/Constants.java | 40 ++
 .../hbase/stats/util/SetupTableUtil.java | 118 ++++
 .../hbase-stat/src/main/protobuf/README.txt | 26 +
 .../hbase-stat/src/main/protobuf/stats.proto | 14 +
 .../stats/TestEqualWidthHistogramOnTable.java | 91 +++
 .../stats/TestEqualWidthHistogramStat.java | 105 ++++
 .../hbase/stats/TestMinMaxKeyStats.java | 53 ++
 .../hbase/stats/TestTrackerImpl.java | 145 +++++
 .../hbase/stats/util/StatsTestUtil.java | 66 +++
 .../hbase/stats/util/TestSetupTable.java | 59 ++
 33 files changed, 3039 insertions(+)
 create mode 100644 contrib/hbase-stat/README.md
 create mode 100644 contrib/hbase-stat/pom.xml
 create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/protobuf/generated/StatisticProtos.java
 create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/BaseStatistic.java
 create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/ColumnFamilyStatistic.java
 create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/HistogramStatisticValue.java
 create mode 100644
contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatScanner.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticReader.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticTracker.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticValue.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticsTable.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/CleanupStatistics.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/RemoveRegionOnSplit.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/RemoveTableOnDelete.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/EqualByteDepthHistogramStatisticTracker.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/MinMaxKey.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/TrackerUtil.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/HistogramStatisticReader.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/IndividualStatisticReader.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/IndividualStatisticWriter.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/PointStatisticReader.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/StatisticSerDe.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/package-info.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/util/Constants.java create mode 100644 contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/util/SetupTableUtil.java create mode 100644 contrib/hbase-stat/src/main/protobuf/README.txt create mode 100644 contrib/hbase-stat/src/main/protobuf/stats.proto create mode 100644 contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestEqualWidthHistogramOnTable.java create mode 100644 contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestEqualWidthHistogramStat.java create mode 100644 contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestMinMaxKeyStats.java create mode 100644 contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestTrackerImpl.java create mode 100644 contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/util/StatsTestUtil.java create mode 100644 contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/util/TestSetupTable.java diff --git a/contrib/hbase-stat/README.md b/contrib/hbase-stat/README.md new file mode 100644 index 00000000..d86eac56 --- /dev/null +++ b/contrib/hbase-stat/README.md @@ -0,0 +1,143 @@ +# hbase-stat +============= + +A simple statistics package for HBase. + +## Goal +======== + +Provide reasonable approximations (exact, if we can manage it) of statistical information about a table in HBase with minimal muss and fuss. + +We want to make it easy to gather statistics about your HBase tables - there should be little to no work on the users part beyond ensuring that the right things are setup. 
+
+## Usage
+=========
+
+### Cluster setup
+
+The only change that needs to be made to a generic cluster configuration is to add the RemoveTableOnDelete coprocessor to the list of Master Observers. This coprocessor cleans up the statistics for a table on delete, if that table has statistics 'enabled' (see below). You should only need to add the following to your hbase-site.xml:
+
+```
+  <property>
+    <name>hbase.coprocessor.master.classes</name>
+    <value>com.salesforce.hbase.stats.cleanup.RemoveTableOnDelete</value>
+  </property>
+```
+
+### Table creation
+
+All the work for gathering and cleaning statistics is handled via coprocessors. Generally, each statistic will have its own static methods for adding the coprocessor to the table (if not provided, the HTableDescriptor#addCoprocessor() method should suffice). For instance, to add the MinMaxKey statistic to a table, all you would do is:
+
+```java
+  HTableDescriptor primary = …
+  MinMaxKey.addToTable(primary);
+```
+
+At the very least, you should ensure that the table is created with the com.salesforce.hbase.stats.cleanup.RemoveRegionOnSplit coprocessor, so that when a region is removed (via splits or merges) the stats for that region are also removed. This can be added manually (not recommended) or via the general setup table utility:
+
+```java
+  HTableDescriptor primary = new HTableDescriptor("primary");
+  primary.addFamily(new HColumnDescriptor(FAM));
+
+  // ...
+  // Add your own stats here
+  // ...
+
+  // setup the stats table
+  HBaseAdmin admin = UTIL.getHBaseAdmin();
+  // ensure statistics are enabled and the cleanup coprocessors setup
+  SetupTableUtil.setupTable(admin, primary, true, false);
+```
+
+#### SetupTableUtil
+
+In addition to setting up the cleanup coprocessors, the SetupTableUtil sets the 'stats enabled' flag in the primary table's descriptor. If this flag is enabled, the cleanup coprocessors (RemoveTableOnDelete and RemoveRegionOnSplit) will be enabled for the table.
+
+ * NOTE: if the cleanup coprocessors are not added to the table, setting the 'stats enabled' flag manually won't do anything. However, if you manually add the cleanup coprocessors but don't enable stats on the descriptor, again, no cleanup will take place. It's highly recommended to use the SetupTableUtil to ensure you don't forget either side.
+
+You will also note that the SetupTableUtil has an option to ensure that the statistics table is set up; *it's highly recommended that you use this option* to avoid accidentally forgetting and not having a statistics table when you go to write out statistics. With the wrong write configurations in hbase-site.xml, this could cause the statistic coprocessors to each block until they realize the table doesn't exist.
+
+To use it, you simply do the same as above, but ensure that the "ensureStatsTable" boolean flag is set:
+
+```java
+  SetupTableUtil.setupTable(admin, primary, true /* this flag! */, false);
+```
+
+### Reading Statistics
+
+Since statistics are kept in a single HTable, you could go and read them manually. However, each statistic could potentially have its own serialization and layout. Therefore, it's recommended to use the StatisticReader to read from a StatisticsTable. Generally, all you will need to provide the StatisticReader is the type of statistic (Point or Histogram), the name of the statistic, and the underlying table.
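+
+Each of the examples below starts from a statistics table handle (shown as `StatisticsTable stats = …`). As a minimal, hypothetical sketch - assuming the primary table is named "primary" and the statistics table itself already exists (e.g. it was created via SetupTableUtil at table-creation time) - obtaining and releasing that handle might look like:
+
+```java
+  Configuration conf = HBaseConfiguration.create();
+  // descriptor of the primary table whose statistics we want to read;
+  // "primary" is a hypothetical table name - only the name is used by StatisticsTable
+  HTableDescriptor primary = new HTableDescriptor("primary");
+
+  // bind a StatisticsTable to that primary table - it only reads/writes stats for that source table
+  StatisticsTable stats = new StatisticsTable(conf, primary);
+  try {
+    // ... build a StatisticReader against 'stats', as in the examples below ...
+  } finally {
+    // releases the underlying connection to the statistics table
+    stats.close();
+  }
+```
+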
For instance, to read a histogram statistic "histo" for all the regions (and all the column families) of a table, you would do:
+
+```java
+  StatisticsTable stats = …
+  StatisticReader reader = new StatisticReader(stats, new HistogramStatisticDeserializer(), "histo");
+  reader.read();
+```
+
+However, this is a bit of a pain as each statistic will have its own name and type. Therefore, the standard convention is for each StatisticTracker to provide a getStatisticReader(StatisticsTable) method to read that statistic from the table. For instance, to read the EqualWidthHistogramStatistic, all you need to do is:
+
+```java
+  StatisticsTable stats = …
+  StatisticReader reader = EqualWidthHistogramStatistic.getStatisticsReader(stats);
+  reader.read();
+```
+
+Some statistics are a little more complicated in the way they store their information, for instance using different column qualifiers at the same time to store different parts of the key. Generally, these should provide their own mechanisms to rebuild a stat from the serialized information. For instance, MinMaxKey provides an interpret method:
+
+```java
+  StatisticsTable stats = …
+  StatisticReader reader = MinMaxKey.getStatisticReader(stats);
+  List results = MinMaxKey.interpret(reader.read());
+```
+
+
+### Statistics Table Schema
+===========================
+
+The schema was inspired by OpenTSDB (opentsdb.net), where each statistic is first grouped by table, then region. After that, each statistic (StatisticValue) is grouped by:
+ * type
+ * info
+   * this is like the sub-type of the metric, to help describe the actual type. For instance, on a min/max for the column, this could be 'min'
+ * value
+
+Suppose that we have a table called 'primary' with column 'col' and we are using the MinMaxKey statistic. Assuming the table has a single region, entries in the statistics table will look something like:
+
+```
+| Row        | Column Family | Column Qualifier | Value
+| primarycol | STAT          | max_region_key   | 10
+| primarycol | STAT          | min_region_key   | 3
+```
+
+This is because the MinMaxKey statistic uses the column name (in this case 'col') as the type, we use the only column family on the stats table (STAT), and there are two sub-type - info - elements, max_region_key and min_region_key, each with an associated value.
+
+## Requirements
+===============
+
+* Java 1.6.0_34 or higher
+* HBase 0.94.5 or higher
+
+### If building from source
+* Maven 3.X
+
+
+## Building from source
+=======================
+
+From the base (hbase-stat) directory…
+
+To run tests
+
+    $ mvn clean test
+
+To build a jar
+
+    $ mvn clean package
+
+and then look in the target/ directory for the built jar
+
+## Roadmap / TODOs
+==================
+ - Switch statistic cleanup to use a coprocessor-based delete
+ - we want to delete an entire prefix, but that first requires doing a scan and then deleting everything returned by the scan
+
+ - Enable more fine-grained writing of statistics so different serialization mechanisms can be inserted.
+ - most of the plumbing is already there (StatisticReader/Writer), but need to work it into the cleaner mechanisms diff --git a/contrib/hbase-stat/pom.xml b/contrib/hbase-stat/pom.xml new file mode 100644 index 00000000..2ee2c3fd --- /dev/null +++ b/contrib/hbase-stat/pom.xml @@ -0,0 +1,215 @@ + + 4.0.0 + com.salesforce.hbase + hbase-stat + 0.0.1-SNAPSHOT + HBase Stat + A simple statistics package for HBase + + + + apache release + https://repository.apache.org/content/repositories/releases/ + + + apache non-releases + Apache non-releases + http://people.apache.org/~stack/m2/repository + + false + + + true + + + + + codehaus + Codehaus Public + http://repository.codehaus.org/ + + false + + + true + + + + + + 0.94.5 + 1.0.4 + 1.8.8 + 12.0.1 + + 1.8.5 + 4.10 + 900 + true + + 2.14 + + + + + com.google.guava + guava + ${guava.version} + + + org.apache.hadoop + hadoop-core + ${hadoop.version} + true + + + hsqldb + hsqldb + + + net.sf.kosmosfs + kfs + + + org.eclipse.jdt + core + + + net.java.dev.jets3t + jets3t + + + oro + oro + + + + + org.apache.hbase + hbase + ${hbase.version} + + + + org.codehaus.jackson + jackson-core-asl + ${jackson.version} + + + org.codehaus.jackson + jackson-mapper-asl + ${jackson.version} + + + org.codehaus.jackson + jackson-jaxrs + ${jackson.version} + + + org.codehaus.jackson + jackson-xc + ${jackson.version} + + + + + org.apache.hbase + hbase + ${hbase.version} + test-jar + test + + + org.apache.hadoop + hadoop-test + ${hadoop.version} + true + test + + + junit + junit + ${junit.version} + test + + + org.mockito + mockito-all + ${mockito-all.version} + test + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.0 + + 1.6 + 1.6 + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + 2.4 + + + prepare-package + + + test-jar + + + + + + + org/apache/jute/** + org/apache/zookeeper/** + **/*.jsp + log4j.properties + + + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + prepare-package + + jar-no-fork + + + + + + + maven-surefire-plugin + ${surefire.version} + + ${test.timeout} + -enableassertions -Xmx2048m + -Djava.security.egd=file:/dev/./urandom + ${test.output.tofile} + + + + + \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/protobuf/generated/StatisticProtos.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/protobuf/generated/StatisticProtos.java new file mode 100644 index 00000000..ac4f03e1 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/protobuf/generated/StatisticProtos.java @@ -0,0 +1,543 @@ +// Generated by the protocol buffer compiler. DO NOT EDIT! +// source: stats.proto + +package com.salesforce.hbase.protobuf.generated; + +public final class StatisticProtos { + private StatisticProtos() {} + public static void registerAllExtensions( + com.google.protobuf.ExtensionRegistry registry) { + } + public interface HistogramOrBuilder + extends com.google.protobuf.MessageOrBuilder { + + // required int64 depthOrWidth = 1; + boolean hasDepthOrWidth(); + long getDepthOrWidth(); + + // repeated bytes value = 2; + java.util.List getValueList(); + int getValueCount(); + com.google.protobuf.ByteString getValue(int index); + } + public static final class Histogram extends + com.google.protobuf.GeneratedMessage + implements HistogramOrBuilder { + // Use Histogram.newBuilder() to construct. 
+ private Histogram(Builder builder) { + super(builder); + } + private Histogram(boolean noInit) {} + + private static final Histogram defaultInstance; + public static Histogram getDefaultInstance() { + return defaultInstance; + } + + public Histogram getDefaultInstanceForType() { + return defaultInstance; + } + + public static final com.google.protobuf.Descriptors.Descriptor + getDescriptor() { + return com.salesforce.hbase.protobuf.generated.StatisticProtos.internal_static_Histogram_descriptor; + } + + protected com.google.protobuf.GeneratedMessage.FieldAccessorTable + internalGetFieldAccessorTable() { + return com.salesforce.hbase.protobuf.generated.StatisticProtos.internal_static_Histogram_fieldAccessorTable; + } + + private int bitField0_; + // required int64 depthOrWidth = 1; + public static final int DEPTHORWIDTH_FIELD_NUMBER = 1; + private long depthOrWidth_; + public boolean hasDepthOrWidth() { + return ((bitField0_ & 0x00000001) == 0x00000001); + } + public long getDepthOrWidth() { + return depthOrWidth_; + } + + // repeated bytes value = 2; + public static final int VALUE_FIELD_NUMBER = 2; + private java.util.List value_; + public java.util.List + getValueList() { + return value_; + } + public int getValueCount() { + return value_.size(); + } + public com.google.protobuf.ByteString getValue(int index) { + return value_.get(index); + } + + private void initFields() { + depthOrWidth_ = 0L; + value_ = java.util.Collections.emptyList();; + } + private byte memoizedIsInitialized = -1; + public final boolean isInitialized() { + byte isInitialized = memoizedIsInitialized; + if (isInitialized != -1) return isInitialized == 1; + + if (!hasDepthOrWidth()) { + memoizedIsInitialized = 0; + return false; + } + memoizedIsInitialized = 1; + return true; + } + + public void writeTo(com.google.protobuf.CodedOutputStream output) + throws java.io.IOException { + getSerializedSize(); + if (((bitField0_ & 0x00000001) == 0x00000001)) { + output.writeInt64(1, depthOrWidth_); + } + for (int i = 0; i < value_.size(); i++) { + output.writeBytes(2, value_.get(i)); + } + getUnknownFields().writeTo(output); + } + + private int memoizedSerializedSize = -1; + public int getSerializedSize() { + int size = memoizedSerializedSize; + if (size != -1) return size; + + size = 0; + if (((bitField0_ & 0x00000001) == 0x00000001)) { + size += com.google.protobuf.CodedOutputStream + .computeInt64Size(1, depthOrWidth_); + } + { + int dataSize = 0; + for (int i = 0; i < value_.size(); i++) { + dataSize += com.google.protobuf.CodedOutputStream + .computeBytesSizeNoTag(value_.get(i)); + } + size += dataSize; + size += 1 * getValueList().size(); + } + size += getUnknownFields().getSerializedSize(); + memoizedSerializedSize = size; + return size; + } + + private static final long serialVersionUID = 0L; + @java.lang.Override + protected java.lang.Object writeReplace() + throws java.io.ObjectStreamException { + return super.writeReplace(); + } + + @java.lang.Override + public boolean equals(final java.lang.Object obj) { + if (obj == this) { + return true; + } + if (!(obj instanceof com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram)) { + return super.equals(obj); + } + com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram other = (com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram) obj; + + boolean result = true; + result = result && (hasDepthOrWidth() == other.hasDepthOrWidth()); + if (hasDepthOrWidth()) { + result = result && (getDepthOrWidth() + == other.getDepthOrWidth()); + } + 
result = result && getValueList() + .equals(other.getValueList()); + result = result && + getUnknownFields().equals(other.getUnknownFields()); + return result; + } + + @java.lang.Override + public int hashCode() { + int hash = 41; + hash = (19 * hash) + getDescriptorForType().hashCode(); + if (hasDepthOrWidth()) { + hash = (37 * hash) + DEPTHORWIDTH_FIELD_NUMBER; + hash = (53 * hash) + hashLong(getDepthOrWidth()); + } + if (getValueCount() > 0) { + hash = (37 * hash) + VALUE_FIELD_NUMBER; + hash = (53 * hash) + getValueList().hashCode(); + } + hash = (29 * hash) + getUnknownFields().hashCode(); + return hash; + } + + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseFrom( + com.google.protobuf.ByteString data) + throws com.google.protobuf.InvalidProtocolBufferException { + return newBuilder().mergeFrom(data).buildParsed(); + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseFrom( + com.google.protobuf.ByteString data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return newBuilder().mergeFrom(data, extensionRegistry) + .buildParsed(); + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseFrom(byte[] data) + throws com.google.protobuf.InvalidProtocolBufferException { + return newBuilder().mergeFrom(data).buildParsed(); + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseFrom( + byte[] data, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws com.google.protobuf.InvalidProtocolBufferException { + return newBuilder().mergeFrom(data, extensionRegistry) + .buildParsed(); + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseFrom(java.io.InputStream input) + throws java.io.IOException { + return newBuilder().mergeFrom(input).buildParsed(); + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseFrom( + java.io.InputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return newBuilder().mergeFrom(input, extensionRegistry) + .buildParsed(); + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseDelimitedFrom(java.io.InputStream input) + throws java.io.IOException { + Builder builder = newBuilder(); + if (builder.mergeDelimitedFrom(input)) { + return builder.buildParsed(); + } else { + return null; + } + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseDelimitedFrom( + java.io.InputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + Builder builder = newBuilder(); + if (builder.mergeDelimitedFrom(input, extensionRegistry)) { + return builder.buildParsed(); + } else { + return null; + } + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseFrom( + com.google.protobuf.CodedInputStream input) + throws java.io.IOException { + return newBuilder().mergeFrom(input).buildParsed(); + } + public static com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram parseFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + return newBuilder().mergeFrom(input, extensionRegistry) + .buildParsed(); + } + + public static Builder newBuilder() { return Builder.create(); } + 
public Builder newBuilderForType() { return newBuilder(); } + public static Builder newBuilder(com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram prototype) { + return newBuilder().mergeFrom(prototype); + } + public Builder toBuilder() { return newBuilder(this); } + + @java.lang.Override + protected Builder newBuilderForType( + com.google.protobuf.GeneratedMessage.BuilderParent parent) { + Builder builder = new Builder(parent); + return builder; + } + public static final class Builder extends + com.google.protobuf.GeneratedMessage.Builder + implements com.salesforce.hbase.protobuf.generated.StatisticProtos.HistogramOrBuilder { + public static final com.google.protobuf.Descriptors.Descriptor + getDescriptor() { + return com.salesforce.hbase.protobuf.generated.StatisticProtos.internal_static_Histogram_descriptor; + } + + protected com.google.protobuf.GeneratedMessage.FieldAccessorTable + internalGetFieldAccessorTable() { + return com.salesforce.hbase.protobuf.generated.StatisticProtos.internal_static_Histogram_fieldAccessorTable; + } + + // Construct using com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram.newBuilder() + private Builder() { + maybeForceBuilderInitialization(); + } + + private Builder(BuilderParent parent) { + super(parent); + maybeForceBuilderInitialization(); + } + private void maybeForceBuilderInitialization() { + if (com.google.protobuf.GeneratedMessage.alwaysUseFieldBuilders) { + } + } + private static Builder create() { + return new Builder(); + } + + public Builder clear() { + super.clear(); + depthOrWidth_ = 0L; + bitField0_ = (bitField0_ & ~0x00000001); + value_ = java.util.Collections.emptyList();; + bitField0_ = (bitField0_ & ~0x00000002); + return this; + } + + public Builder clone() { + return create().mergeFrom(buildPartial()); + } + + public com.google.protobuf.Descriptors.Descriptor + getDescriptorForType() { + return com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram.getDescriptor(); + } + + public com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram getDefaultInstanceForType() { + return com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram.getDefaultInstance(); + } + + public com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram build() { + com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram result = buildPartial(); + if (!result.isInitialized()) { + throw newUninitializedMessageException(result); + } + return result; + } + + private com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram buildParsed() + throws com.google.protobuf.InvalidProtocolBufferException { + com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram result = buildPartial(); + if (!result.isInitialized()) { + throw newUninitializedMessageException( + result).asInvalidProtocolBufferException(); + } + return result; + } + + public com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram buildPartial() { + com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram result = new com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram(this); + int from_bitField0_ = bitField0_; + int to_bitField0_ = 0; + if (((from_bitField0_ & 0x00000001) == 0x00000001)) { + to_bitField0_ |= 0x00000001; + } + result.depthOrWidth_ = depthOrWidth_; + if (((bitField0_ & 0x00000002) == 0x00000002)) { + value_ = java.util.Collections.unmodifiableList(value_); + bitField0_ = (bitField0_ & ~0x00000002); + } + result.value_ = value_; + result.bitField0_ = to_bitField0_; + 
onBuilt(); + return result; + } + + public Builder mergeFrom(com.google.protobuf.Message other) { + if (other instanceof com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram) { + return mergeFrom((com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram)other); + } else { + super.mergeFrom(other); + return this; + } + } + + public Builder mergeFrom(com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram other) { + if (other == com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram.getDefaultInstance()) return this; + if (other.hasDepthOrWidth()) { + setDepthOrWidth(other.getDepthOrWidth()); + } + if (!other.value_.isEmpty()) { + if (value_.isEmpty()) { + value_ = other.value_; + bitField0_ = (bitField0_ & ~0x00000002); + } else { + ensureValueIsMutable(); + value_.addAll(other.value_); + } + onChanged(); + } + this.mergeUnknownFields(other.getUnknownFields()); + return this; + } + + public final boolean isInitialized() { + if (!hasDepthOrWidth()) { + + return false; + } + return true; + } + + public Builder mergeFrom( + com.google.protobuf.CodedInputStream input, + com.google.protobuf.ExtensionRegistryLite extensionRegistry) + throws java.io.IOException { + com.google.protobuf.UnknownFieldSet.Builder unknownFields = + com.google.protobuf.UnknownFieldSet.newBuilder( + this.getUnknownFields()); + while (true) { + int tag = input.readTag(); + switch (tag) { + case 0: + this.setUnknownFields(unknownFields.build()); + onChanged(); + return this; + default: { + if (!parseUnknownField(input, unknownFields, + extensionRegistry, tag)) { + this.setUnknownFields(unknownFields.build()); + onChanged(); + return this; + } + break; + } + case 8: { + bitField0_ |= 0x00000001; + depthOrWidth_ = input.readInt64(); + break; + } + case 18: { + ensureValueIsMutable(); + value_.add(input.readBytes()); + break; + } + } + } + } + + private int bitField0_; + + // required int64 depthOrWidth = 1; + private long depthOrWidth_ ; + public boolean hasDepthOrWidth() { + return ((bitField0_ & 0x00000001) == 0x00000001); + } + public long getDepthOrWidth() { + return depthOrWidth_; + } + public Builder setDepthOrWidth(long value) { + bitField0_ |= 0x00000001; + depthOrWidth_ = value; + onChanged(); + return this; + } + public Builder clearDepthOrWidth() { + bitField0_ = (bitField0_ & ~0x00000001); + depthOrWidth_ = 0L; + onChanged(); + return this; + } + + // repeated bytes value = 2; + private java.util.List value_ = java.util.Collections.emptyList();; + private void ensureValueIsMutable() { + if (!((bitField0_ & 0x00000002) == 0x00000002)) { + value_ = new java.util.ArrayList(value_); + bitField0_ |= 0x00000002; + } + } + public java.util.List + getValueList() { + return java.util.Collections.unmodifiableList(value_); + } + public int getValueCount() { + return value_.size(); + } + public com.google.protobuf.ByteString getValue(int index) { + return value_.get(index); + } + public Builder setValue( + int index, com.google.protobuf.ByteString value) { + if (value == null) { + throw new NullPointerException(); + } + ensureValueIsMutable(); + value_.set(index, value); + onChanged(); + return this; + } + public Builder addValue(com.google.protobuf.ByteString value) { + if (value == null) { + throw new NullPointerException(); + } + ensureValueIsMutable(); + value_.add(value); + onChanged(); + return this; + } + public Builder addAllValue( + java.lang.Iterable values) { + ensureValueIsMutable(); + super.addAll(values, value_); + onChanged(); + return this; + } + public Builder 
clearValue() { + value_ = java.util.Collections.emptyList();; + bitField0_ = (bitField0_ & ~0x00000002); + onChanged(); + return this; + } + + // @@protoc_insertion_point(builder_scope:Histogram) + } + + static { + defaultInstance = new Histogram(true); + defaultInstance.initFields(); + } + + // @@protoc_insertion_point(class_scope:Histogram) + } + + private static com.google.protobuf.Descriptors.Descriptor + internal_static_Histogram_descriptor; + private static + com.google.protobuf.GeneratedMessage.FieldAccessorTable + internal_static_Histogram_fieldAccessorTable; + + public static com.google.protobuf.Descriptors.FileDescriptor + getDescriptor() { + return descriptor; + } + private static com.google.protobuf.Descriptors.FileDescriptor + descriptor; + static { + java.lang.String[] descriptorData = { + "\n\013stats.proto\"0\n\tHistogram\022\024\n\014depthOrWid" + + "th\030\001 \002(\003\022\r\n\005value\030\002 \003(\014B?\n\'com.salesforc" + + "e.hbase.protobuf.generatedB\017StatisticPro" + + "tosH\001\240\001\001" + }; + com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = + new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() { + public com.google.protobuf.ExtensionRegistry assignDescriptors( + com.google.protobuf.Descriptors.FileDescriptor root) { + descriptor = root; + internal_static_Histogram_descriptor = + getDescriptor().getMessageTypes().get(0); + internal_static_Histogram_fieldAccessorTable = new + com.google.protobuf.GeneratedMessage.FieldAccessorTable( + internal_static_Histogram_descriptor, + new java.lang.String[] { "DepthOrWidth", "Value", }, + com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram.class, + com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram.Builder.class); + return null; + } + }; + com.google.protobuf.Descriptors.FileDescriptor + .internalBuildGeneratedFileFrom(descriptorData, + new com.google.protobuf.Descriptors.FileDescriptor[] { + }, assigner); + } + + // @@protoc_insertion_point(outer_class_scope) +} diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/BaseStatistic.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/BaseStatistic.java new file mode 100644 index 00000000..738765a7 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/BaseStatistic.java @@ -0,0 +1,80 @@ +package com.salesforce.hbase.stats; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.hbase.CoprocessorEnvironment; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; +import org.apache.hadoop.hbase.coprocessor.RegionObserver; +import org.apache.hadoop.hbase.regionserver.InternalScanner; +import org.apache.hadoop.hbase.regionserver.KeyValueScanner; +import org.apache.hadoop.hbase.regionserver.ScanType; +import org.apache.hadoop.hbase.regionserver.Store; +import org.apache.hadoop.hbase.regionserver.StoreScanner; +import org.apache.hadoop.hbase.util.Bytes; + + +/** + * Simple helper base class for all {@link RegionObserver RegionObservers} that need to access a + * {@link StatisticsTable}. 
+ */ +public abstract class BaseStatistic extends BaseRegionObserver implements StatisticTracker { + + protected StatisticsTable stats; + + @Override + public void start(CoprocessorEnvironment e) throws IOException { + HTableDescriptor desc = ((RegionCoprocessorEnvironment) e).getRegion().getTableDesc(); + stats = StatisticsTable.getStatisticsTableForCoprocessor(e, desc.getName()); + } + + @Override + public void stop(CoprocessorEnvironment e) throws IOException { + stats.close(); + } + + @Override + public InternalScanner preCompactScannerOpen(ObserverContext c, + Store store, List scanners, ScanType scanType, long earliestPutTs, + InternalScanner s) throws IOException { + InternalScanner internalScan = s; + if (scanType.equals(ScanType.MAJOR_COMPACT)) { + // this is the first CP accessed, so we need to just create a major compaction scanner, just + // like in the compactor + if (s == null) { + Scan scan = new Scan(); + scan.setMaxVersions(store.getFamily().getMaxVersions()); + long smallestReadPoint = store.getHRegion().getSmallestReadPoint(); + internalScan = + new StoreScanner(store, store.getScanInfo(), scan, scanners, scanType, + smallestReadPoint, earliestPutTs); + } + InternalScanner scanner = getInternalScanner(c, store, internalScan, + store.getColumnFamilyName()); + if (scanner != null) { + internalScan = scanner; + } + } + return internalScan; + } + + /** + * Get an internal scanner that will update statistics. This should be a delegating + * {@link InternalScanner} so the original scan semantics are preserved. You should consider using + * the {@link StatScanner} for the delegating scanner. + * @param c + * @param store + * @param internalScan + * @return null if the existing scanner is sufficient, otherwise the scanner to use going + * forward + */ + protected InternalScanner getInternalScanner(ObserverContext c, + Store store, InternalScanner internalScan, String family) { + return new StatScanner(this, stats, c.getEnvironment().getRegion() + .getRegionInfo(), internalScan, Bytes.toBytes(family)); + } +} diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/ColumnFamilyStatistic.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/ColumnFamilyStatistic.java new file mode 100644 index 00000000..5b9bb87d --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/ColumnFamilyStatistic.java @@ -0,0 +1,60 @@ +package com.salesforce.hbase.stats; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hbase.util.Bytes; + +/** + * Encapsulate all the values of the statistics for a given {@link StatisticValue} over a single + * column family in a single region. + * @param type of statistic that is being retrieved/stored + */ +public class ColumnFamilyStatistic { + + private List values; + private final byte[] region; + private final byte[] columnfamily; + + public ColumnFamilyStatistic(byte[] region, byte[] columnfamily) { + this.region = region; + this.columnfamily = columnfamily; + this.values = new ArrayList(); + } + + public ColumnFamilyStatistic(byte[] region, byte[] columnfamily, S... 
values) { + this(region, columnfamily, Arrays.asList(values)); + } + + public ColumnFamilyStatistic(byte[] region, byte[] columnfamily, List values) { + this(region, columnfamily); + this.values.addAll(values); + } + + public byte[] getRegion() { + return region; + } + + public byte[] getColumnfamily() { + return columnfamily; + } + + public void add(S value) { + this.values.add(value); + } + + public void setValues(List values) { + this.values = values; + } + + public List getValues() { + return this.values; + } + + public String toString() { + return "stat:[region=" + Bytes.toString(region) + ", column=" + + Bytes.toString(columnfamily) + + ", stats:" + values + "]"; + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/HistogramStatisticValue.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/HistogramStatisticValue.java new file mode 100644 index 00000000..fe5390a1 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/HistogramStatisticValue.java @@ -0,0 +1,100 @@ +package com.salesforce.hbase.stats; + +import org.apache.hadoop.hbase.util.Bytes; + +import com.google.protobuf.ByteString; +import com.google.protobuf.InvalidProtocolBufferException; +import com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram; + +/** + * {@link StatisticValue} whose value is actually a histogram of data. + *

+ * Two different types of histograms are supported - fixed width and fixed depth:
+ * <ol>
+ *   <li>Fixed Width - the width of each column is fixed, but there is a variable number of
+ *       elements in each 'bucket' in the histogram. This is the 'usual' histogram.
+ *       <ul>
+ *         <li>You should only use {@link #addColumn(long)}, as the width of each column is
+ *             known</li>
+ *       </ul></li>
+ *   <li>Fixed Depth - the width of each column is variable, but there is a known number of
+ *       elements in each 'bucket' in the histogram. For instance, this can be used to determine
+ *       every n'th key.
+ *       <ul>
+ *         <li>You should only use {@link #addColumn(ByteString)}, as the width of each column
+ *             is variable</li>
+ *       </ul></li>
+ * </ol>
+ */
+public class HistogramStatisticValue extends StatisticValue {
+
+ private Histogram.Builder builder = Histogram.newBuilder();
+
+ /**
+ * Build a statistic value - should only be used by the
+ * {@link com.salesforce.hbase.stats.serialization.HistogramStatisticReader}.
+ * @param value statistic instance to reference - no data is copied
+ * @throws InvalidProtocolBufferException if the data in the {@link StatisticValue} is not a
+ * histogram
+ */
+ public HistogramStatisticValue(StatisticValue value) throws InvalidProtocolBufferException {
+ super(value.name, value.info, value.value);
+ // reset the builder based on the data
+ builder = Histogram.parseFrom(value.value).toBuilder();
+ }
+
+ /**
+ * Build a fixed-width/depth histogram.
+ * @param name name of the statistic
+ * @param info general info about the statistic
+ * @param widthOrDepth width (or depth) of all the columns
+ */
+ public HistogramStatisticValue(byte[] name, byte[] info, long widthOrDepth) {
+ super(name, info, null);
+ this.builder.setDepthOrWidth(widthOrDepth);
+ }
+
+ /**
+ * Add a new fixed-depth column to this histogram - we already know the depth of the column (it
+ * was specified in the constructor), so we just need to get the next key boundary.
+ * @param value value of the next column - added to the end of the histogram
+ */
+ public void addColumn(ByteString value) {
+ builder.addValue(value);
+ }
+
+ /**
+ * Add a new fixed-width column to the histogram. We already know the width of the column, so we
+ * only care about getting the count of the keys in that column.
+ * @param count count of the keys in the column
+ */
+ public void addColumn(long count) {
+ builder.addValue(ByteString.copyFrom(Bytes.toBytes(count)));
+ }
+
+ /**
+ * Get the raw bytes for the histogram. Can be rebuilt using {@link #getHistogram(byte[])}. This
+ * is a single-use method - after calling it, any previous calls to {@link #addColumn} are ignored.
+ */
+ public byte[] getValue() {
+ byte[] data = builder.build().toByteArray();
+ builder = Histogram.newBuilder();
+ return data;
+ }
+
+ /**
+ * Get the underlying columns for the histogram. After calling this method, any further updates
+ * to the histogram are not guaranteed to work correctly.
+ * @return a deserialized version of the histogram + */ + public synchronized Histogram getHistogram() { + return this.builder.build(); + } + + public static Histogram getHistogram(byte[] raw) throws InvalidProtocolBufferException { + return Histogram.parseFrom(raw); + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatScanner.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatScanner.java new file mode 100644 index 00000000..936dc0a1 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatScanner.java @@ -0,0 +1,91 @@ +package com.salesforce.hbase.stats; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.regionserver.InternalScanner; +import org.apache.hadoop.io.MultipleIOException; + +import com.google.common.collect.Lists; +import com.salesforce.hbase.stats.serialization.IndividualStatisticWriter; + +public class StatScanner implements InternalScanner { + private static final Log LOG = LogFactory.getLog(StatScanner.class); + private InternalScanner delegate; + private StatisticsTable stats; + private HRegionInfo region; + private StatisticTracker tracker; + private byte[] family; + + public StatScanner(StatisticTracker tracker, StatisticsTable stats, HRegionInfo region, + InternalScanner delegate, byte[] family) { + this.tracker = tracker; + this.stats = stats; + this.delegate = delegate; + this.region = region; + this.family = family; + } + + public boolean next(List result) throws IOException { + boolean ret = delegate.next(result); + updateStat(result); + return ret; + } + + public boolean next(List result, String metric) throws IOException { + boolean ret = delegate.next(result, metric); + updateStat(result); + return ret; + } + + public boolean next(List result, int limit) throws IOException { + boolean ret = delegate.next(result, limit); + updateStat(result); + return ret; + } + + public boolean next(List result, int limit, String metric) throws IOException { + boolean ret = delegate.next(result, limit, metric); + updateStat(result); + return ret; + } + + + /** + * Update the current statistics based on the lastest batch of key-values from the underlying + * scanner + * @param results next batch of {@link KeyValue}s + */ + protected void updateStat(final List results) { + for (KeyValue kv : results) { + tracker.updateStatistic(kv); + } + } + + public void close() throws IOException { + IOException toThrow = null; + try { + // update the statistics table + List data = this.tracker.getCurrentStats(); + stats.updateStats( + new IndividualStatisticWriter(region.getTableName(), region.getRegionName(), family), + data); + } catch (IOException e) { + LOG.error("Failed to update statistics table!", e); + toThrow = e; + } + // close the delegate scanner + try { + delegate.close(); + } catch (IOException e) { + if (toThrow == null) { + throw e; + } + throw MultipleIOException.createIOException(Lists.newArrayList(toThrow, e)); + } + } +} diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticReader.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticReader.java new file mode 100644 index 00000000..7e2b17be --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticReader.java @@ -0,0 +1,187 @@ 
+package com.salesforce.hbase.stats; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.client.Get; +import org.apache.hadoop.hbase.client.HTableInterface; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.ResultScanner; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.filter.PrefixFilter; +import org.apache.hadoop.hbase.util.Bytes; + +import com.salesforce.hbase.stats.serialization.IndividualStatisticReader; +import com.salesforce.hbase.stats.serialization.StatisticSerDe; +import com.salesforce.hbase.stats.util.Constants; + +/** + * Read a statistic from a {@link StatisticsTable}. This is an abstraction around the underlying + * serialization format of a given statistic, allow us to change the format without exposing any + * under the hood mechanics to the user. + * @param Type of statistic that should be read + */ +public class StatisticReader { + + private static final Log LOG = LogFactory.getLog(StatisticReader.class); + + private IndividualStatisticReader deserializer; + private byte[] name; + + private HTableInterface table; + + private byte[] source; + + // by default, we only return the latest version of a statistc + private static final int DEFAULT_VERSIONS = 1; + + public StatisticReader(StatisticsTable stats, IndividualStatisticReader statReader, + byte[] statisticName) { + this.table = stats.getUnderlyingTable(); + this.source = stats.getSourceTableName(); + this.deserializer = statReader; + this.name = statisticName; + } + + /** + * Read all the statistics with the reader for the current source table. If there have been + * multiple statistics updates for the same column family/region (same name, over a time range) + * then you will get only the latest version; to get older versions, use {@link #read(int)}. If + * there are multiple regions or multiple column families, there will be one ColumnFamilyStatistic + * per region-family pair (so with 2 regions, each with 2 column families who are tracked, there + * will be 4 results). + * @return a list of all the {@link ColumnFamilyStatistic}s gathered from that reader + * @throws IOException if we cannot read the statistics table properly + */ + public List> read() throws IOException { + return read(DEFAULT_VERSIONS); + } + + /** + * Read all the statistics with the reader for the current source table. If there have been + * multiple statistics updates for the same column family/region (same name, over a time range) + * then you will get all the versions, one per {@link ColumnFamilyStatistic}, with the most recent + * being first in the {@link ColumnFamilyStatistic#getValues()} list. If there are multiple + * regions or multiple column families, there will be one ColumnFamilyStatistic per region-family + * pair (so with 2 regions, each with 2 column families who are tracked and two different versions + * of keys for each family, there will be 8 results). 
+ * @param versions max number of versions to read from the table + * @return a list of all the {@link ColumnFamilyStatistic}s gathered from that reader + * @throws IOException if we cannot read the statistics table properly + */ + public List> read(int versions) throws IOException { + byte[] scanPrefix = this.getRowKey(); + LOG.info("Reading for prefix: " + Bytes.toString(scanPrefix)); + return getResults(scanPrefix, versions); + } + + /** + * Read all the statistics with the reader for the current source table for the specified region + * name. If there have been multiple statistics updates for the same column family/region (same + * name, over a time range) then you will get only the latest version; to get older versions, use + * {@link #read(byte[], int)}.If there are multiple column families in the region there will be + * one ColumnFamilyStatistic per region-family pair (so with a single regions, with 2 column + * families that are tracked, there will be 2 results). + * @param region name of the region for which to get the stats + * @return a list of all the {@link ColumnFamilyStatistic}s gathered from that reader + * @throws IOException if we cannot read the statistics table properly + */ + public List> read(byte[] region) throws IOException { + return read(region, DEFAULT_VERSIONS); + } + + /** + * Read all the statistics with the reader for the current source table for the specified region + * name. If there have been multiple statistics updates for the same column family/region (same + * name, over a time range) then you will get all the versions, one per + * {@link ColumnFamilyStatistic}, with the most recent being first in the + * {@link ColumnFamilyStatistic#getValues()} list. If there are multiple column families in the + * region there will be one ColumnFamilyStatistic per region-family pair (so with a single + * regions, with 2 column families that are tracked and two different versions of stats, there + * will be 4 results). + * @param region name of the region for which to get the stats + * @param versions max number of statistic versions to read + * @return a list of all the {@link ColumnFamilyStatistic}s gathered from that reader + * @throws IOException if we cannot read the statistics table properly + */ + public List> read(byte[] region, int versions) throws IOException { + byte[] scanPrefix = this.getRowKey(region); + LOG.info("Reading for prefix: " + Bytes.toString(scanPrefix)); + return getResults(scanPrefix, versions); + } + + /** + * Read all the statistics with the reader for the current source table for the specified region + * name. If there have been multiple statistics updates for the same column family/region (same + * name, over a time range) then you will get only the latest version; to get older versions, use + * {@link #read(byte[], byte[], int)}. + * @param region name of the region for which to get the stats + * @param column name of the column family for which to get the stats + * @return a list of all the {@link ColumnFamilyStatistic}s gathered from that reader + * @throws IOException if we cannot read the statistics table properly + */ + public ColumnFamilyStatistic read(byte[] region, byte[] column) throws IOException { + return read(region, column, DEFAULT_VERSIONS); + } + + /** + * Read all the statistics with the reader for the current source table for the specified region + * name. 
If there have been multiple statistics updates for the same column family/region (same + * name, over a time range) then you will get all the versions, one per + * {@link ColumnFamilyStatistic}, with the most recent being first in the + * {@link ColumnFamilyStatistic#getValues()} list. + * @param region name of the region for which to get the stats + * @param column name of the column family for which to get the stats + * @param versions max number of statistic versions to read + * @return a list of all the {@link ColumnFamilyStatistic}s gathered from that reader + * @throws IOException if we cannot read the statistics table properly + */ + public ColumnFamilyStatistic read(byte[] region, byte[] column, int versions) + throws IOException { + byte[] row = this.getRowKey(region, column); + Get g = new Get(row); + g.setMaxVersions(versions); + Result r = table.get(g); + return deserializer.deserialize(r); + } + + /** + * Read the latest version of the statistic from the primary table + * @param t underlying table to read from + * @param reader reader to use to deserialize the raw results + * @param prefix key prefix to use when scanning + * @param versions number of versions to read from the table + * @return + * @throws IOException + */ + private List> getResults(byte[] prefix, int versions) throws IOException { + Scan scan = new Scan(prefix); + scan.addFamily(Constants.STATS_DATA_COLUMN_FAMILY); + scan.setFilter(new PrefixFilter(prefix)); + // we only return the latest version of the statistic + scan.setMaxVersions(versions); + ResultScanner scanner = table.getScanner(scan); + List> stats = new ArrayList>(); + for (Result r : scanner) { + LOG.info("Got result:" + r); + stats.add(deserializer.deserialize(r)); + } + return stats; + } + + private byte[] getRowKey() { + return StatisticSerDe.getRowPrefix(source, name); + } + + private byte[] getRowKey(byte[] regionname) { + return StatisticSerDe.getRowPrefix(source, regionname, name); + } + + private byte[] getRowKey(byte[] regionname, byte[] columnfamily) { + return StatisticSerDe.getRowKey(source, regionname, columnfamily, name); + } + +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticTracker.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticTracker.java new file mode 100644 index 00000000..5ed64827 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticTracker.java @@ -0,0 +1,27 @@ +package com.salesforce.hbase.stats; + +import java.util.List; + +import org.apache.hadoop.hbase.KeyValue; + +/** + * Track a statistic for the column on a given region + */ +public interface StatisticTracker { + + /** + * Reset the statistic after the completion fo the compaction + */ + public void clear(); + + /** + * @return the current statistics that the tracker has collected + */ + public List getCurrentStats(); + + /** + * Update the current statistics with the next {@link KeyValue} to be written + * @param kv next {@link KeyValue} to be written + */ + public void updateStatistic(KeyValue kv); +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticValue.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticValue.java new file mode 100644 index 00000000..009f9ccb --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticValue.java @@ -0,0 +1,43 @@ +package com.salesforce.hbase.stats; + +import org.apache.hadoop.hbase.util.Bytes; 
+
+/**
+ * Simple holder class for a single statistic on a column in a region.
+ *

+ * If you are building a histogram, you should use the HistogramStatisticValue to store the
+ * information, which internally uses a collection of {@link StatisticValue}s to build a larger
+ * histogram.
+ */
+public class StatisticValue {
+
+ protected byte[] name;
+ protected byte[] info;
+ protected byte[] value;
+
+ public StatisticValue(byte[] name, byte[] info, byte[] value) {
+ this.name = name;
+ this.info = info;
+ this.value = value;
+ }
+
+ public byte[] getType() {
+ return name;
+ }
+
+ public byte[] getInfo() {
+ return info;
+ }
+
+ public byte[] getValue() {
+ return value;
+ }
+
+ protected void setValue(byte[] value) {
+ this.value = value;
+ }
+
+ public String toString() {
+ return "stat " + Bytes.toString(name) + ": [info:" + Bytes.toString(info) + ", value:"
+ + Bytes.toString(value) + "]";
+ }
+}
\ No newline at end of file
diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticsTable.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticsTable.java
new file mode 100644
index 00000000..069fdd52
--- /dev/null
+++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/StatisticsTable.java
@@ -0,0 +1,160 @@
+package com.salesforce.hbase.stats;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.CoprocessorEnvironment;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.client.Delete;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.HTableInterface;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+import org.apache.hadoop.hbase.filter.PrefixFilter;
+import org.apache.hadoop.hbase.util.Bytes;
+
+import com.salesforce.hbase.stats.serialization.IndividualStatisticWriter;
+import com.salesforce.hbase.stats.util.Constants;
+
+
+/**
+ * Wrapper to access the statistics table for an HTable.
+ *

+ * Each {@link StatisticsTable} is bound to access the statistics for a single 'primary' table. This + * helps decrease the chances of reading/writing the wrong statistic for the source table. + *
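+ * <p>
+ * A statistics table can be opened directly from the primary table's descriptor; a minimal
+ * sketch (assuming {@code conf} and {@code primaryDesc} are supplied by the caller):
+ * <pre>
+ *   StatisticsTable stats = new StatisticsTable(conf, primaryDesc);
+ *   // ... read or update statistics for the primary table ...
+ *   stats.close();
+ * </pre>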

+ * Each statistic is prefixed with the tablename and region from whence it came. + */ +public class StatisticsTable implements Closeable { + + private static final Log LOG = LogFactory.getLog(StatisticsTable.class); + /** Map of the currently open statistics tables */ + private static final Map tableMap = new HashMap(); + + /** + * @param env Environment wherein the coprocessor is attempting to update the stats table. + * @param primaryTableName name of the primary table on which we should collect stats + * @return the {@link StatisticsTable} for the given primary table. + * @throws IOException if the table cannot be created due to an underlying HTable creation error + */ + public synchronized static StatisticsTable getStatisticsTableForCoprocessor( + CoprocessorEnvironment env, byte[] primaryTableName) throws IOException { + StatisticsTable table = tableMap.get(primaryTableName); + if (table == null) { + table = new StatisticsTable(env.getTable(Constants.STATS_TABLE_NAME_BYTES), primaryTableName); + tableMap.put(Bytes.toString(primaryTableName), table); + } + return table; + } + + private final HTableInterface target; + private final byte[] sourceTableName; + + private StatisticsTable(HTableInterface target, byte[] sourceTableName) { + this.target = target; + this.sourceTableName = sourceTableName; + } + + public StatisticsTable(Configuration conf, HTableDescriptor source) throws IOException { + this(new HTable(conf, Constants.STATS_TABLE_NAME), source.getName()); + } + + /** + * Close the connection to the table + */ + @Override + public void close() throws IOException { + target.close(); + } + + public void removeStats() throws IOException { + removeRowsForPrefix(sourceTableName); + } + + public void removeStatsForRegion(HRegionInfo region) throws IOException { + removeRowsForPrefix(sourceTableName, region.getRegionName()); + } + + private void removeRowsForPrefix(byte[]... arrays) throws IOException { + byte[] row = null; + for (byte[] array : arrays) { + row = ArrayUtils.addAll(row, array); + } + Scan scan = new Scan(row); + scan.setFilter(new PrefixFilter(row)); + cleanupRows(scan); + } + + /** + * Delete all the rows that we find from the scanner + * @param scan scan used on the statistics table to determine which keys need to be deleted + * @throws IOException if we fail to communicate with the HTable + */ + private void cleanupRows(Scan scan) throws IOException { + // Because each region has, potentially, a bunch of different statistics, we need to go through + // an delete each of them as we find them + + // TODO switch this to a CP that lets us just do a filtered delete + + // first we have to scan the table to find the rows to delete + ResultScanner scanner = target.getScanner(scan); + Delete d = null; + // XXX possible memory issues here - we could be loading a LOT of stuff as we are doing a + // copy for each result + for (Result r : scanner) { + // create a delete for each result + d = new Delete(r.getRow()); + // let the table figure out when it wants to flush that stuff + target.delete(d); + } + } + + /** + * Update a list of statistics for the given region + * @param serializer to convert the actual statistics to puts in the statistics table + * @param data Statistics for the region that we should update. The type of the + * {@link StatisticValue} (T1), is used as a suffix on the row key; this groups different + * types of metrics together on a per-region basis. Then the + * {@link StatisticValue#getInfo()}is used as the column qualifier. 
Finally, + * {@link StatisticValue#getValue()} is used for the the value of the {@link Put}. This + * can be null or empty. + * @throws IOException if we fail to do any of the puts. Any single failure will prevent any + * future attempts for the remaining list of stats to update + */ + public void updateStats(IndividualStatisticWriter serializer, List data) + throws IOException { + // short circuit if we have nothing to write + if (data == null || data.size() == 0) { + return; + } + + // serialize each of the metrics with the associated serializer + for (StatisticValue metric : data) { + LOG.info("Writing statistic: " + metric); + target.put(serializer.serialize(metric)); + } + // make sure it all reaches the target table when we are done + target.flushCommits(); + } + + /** + * @return the underlying {@link HTableInterface} to which this table is writing + */ + HTableInterface getUnderlyingTable() { + return target; + } + + byte[] getSourceTableName() { + return this.sourceTableName; + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/CleanupStatistics.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/CleanupStatistics.java new file mode 100644 index 00000000..a4135a68 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/CleanupStatistics.java @@ -0,0 +1,74 @@ +package com.salesforce.hbase.stats.cleanup; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; + +import com.google.common.collect.Lists; + +/** + * Wrapper class around the necessary cleanup coprocessors. + *

+ * We clean up the stats for a table in a couple of different cases:
+ *
+ * 1. On table delete:
+ *    - This requires adding a coprocessor on the HMaster, and it must occur before HMaster
+ *      startup. Use {@link #setupClusterConfiguration(Configuration)} to ensure this coprocessor
+ *      is enabled.
+ * 2. On region split:
+ *    - The stats for the parent region of the split are removed from the stats table.
+ *    - This is done via a region coprocessor, so it is merely added to the table descriptor via
+ *      {@link #setupTable(HTableDescriptor)}.
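+ * <p>
+ * A minimal setup sketch (assuming {@code conf} is the cluster {@link Configuration} and
+ * {@code desc} is the primary table's {@link HTableDescriptor}):
+ * <pre>
+ *   // add the master observer before the HMaster starts
+ *   CleanupStatistics.setupClusterConfiguration(conf);
+ *   // add the region observer to the primary table before creating it
+ *   CleanupStatistics.setupTable(desc);
+ * </pre>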
  4. + */ +public class CleanupStatistics { + private static final Log LOG = LogFactory.getLog(CleanupStatistics.class); + + public static void verifyConfiguration(Configuration conf) { + String[] classes = conf.getStrings(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY); + List contains = Lists.newArrayList(classes); + String removeTableCleanupClassName = RemoveTableOnDelete.class.getName(); + if (!contains.contains(removeTableCleanupClassName)) { + throw new IllegalArgumentException( + removeTableCleanupClassName + + " must be specified as a master observer to cleanup table statistics, but its missing from the configuration! We only found: " + + classes); + } + } + + public static void setupClusterConfiguration(Configuration conf) { + String[] classes = conf.getStrings(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY); + List toAdd = classes == null ? new ArrayList() : Lists.newArrayList(classes); + String removeTableCleanupClassName = RemoveTableOnDelete.class.getName(); + if (!toAdd.contains(removeTableCleanupClassName)) { + toAdd.add(removeTableCleanupClassName); + conf.setStrings(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY, toAdd.toArray(new String[0])); + } + + // make sure we didn't screw anything up + verifyConfiguration(conf); + } + + /** + * Add all the necessary cleanup coprocessors to the table + * @param desc primary table for which we should cleanup + */ + public static void setupTable(HTableDescriptor desc) { + String clazz = RemoveRegionOnSplit.class.getName(); + try { + desc.addCoprocessor(clazz); + }catch(IOException e) { + LOG.info(clazz +" already added to table, not adding again."); + } + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/RemoveRegionOnSplit.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/RemoveRegionOnSplit.java new file mode 100644 index 00000000..ee43615c --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/RemoveRegionOnSplit.java @@ -0,0 +1,68 @@ +package com.salesforce.hbase.stats.cleanup; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.hbase.CoprocessorEnvironment; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.InternalScanner; +import org.apache.hadoop.hbase.regionserver.KeyValueScanner; +import org.apache.hadoop.hbase.regionserver.ScanType; +import org.apache.hadoop.hbase.regionserver.Store; + +import com.salesforce.hbase.stats.StatisticsTable; +import com.salesforce.hbase.stats.util.SetupTableUtil; + +/** + * Cleanup the stats for the parent region on region split + */ +public class RemoveRegionOnSplit extends BaseRegionObserver { + + protected StatisticsTable stats; + + @Override + public void start(CoprocessorEnvironment e) throws IOException { + HTableDescriptor desc = ((RegionCoprocessorEnvironment) e).getRegion().getTableDesc(); + if (SetupTableUtil.getStatsEnabled(desc)) { + stats = StatisticsTable.getStatisticsTableForCoprocessor(e, desc.getName()); + } + } + + @Override + public void stop(CoprocessorEnvironment e) throws IOException { + if (stats != null) { + stats.close(); + } + } + + @Override + public void postSplit(ObserverContext e, HRegion l, HRegion r) + throws IOException { + // 
stats aren't enabled on the table, so we are done + if (stats == null) { + return; + } + // get the parent + HRegion parent = e.getEnvironment().getRegion(); + // and remove it from the stats + stats.removeStatsForRegion(parent.getRegionInfo()); + } + + /** + * We override this method to ensure that any scanner from a previous coprocessor is returned. The + * default behavior is to return null, which completely hoses any other coprocessors + * setup before, making ordering of coprocessors very important. By returning the passed scanner, + * we can avoid easy to make configuration errors. + */ + @Override + public InternalScanner preCompactScannerOpen(ObserverContext c, + Store store, List scanners, ScanType scanType, long earliestPutTs, + InternalScanner s) throws IOException { + + return s; + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/RemoveTableOnDelete.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/RemoveTableOnDelete.java new file mode 100644 index 00000000..53482444 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/cleanup/RemoveTableOnDelete.java @@ -0,0 +1,33 @@ +package com.salesforce.hbase.stats.cleanup; + +import java.io.IOException; + +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.coprocessor.BaseMasterObserver; +import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment; +import org.apache.hadoop.hbase.coprocessor.ObserverContext; +import org.apache.hadoop.hbase.util.Bytes; + +import com.salesforce.hbase.stats.StatisticsTable; +import com.salesforce.hbase.stats.util.SetupTableUtil; + +public class RemoveTableOnDelete extends BaseMasterObserver { + + @Override + public void preDeleteTable(ObserverContext ctx, byte[] tableName) + throws IOException { + HTableDescriptor desc = ctx.getEnvironment().getMasterServices().getTableDescriptors() + .get(tableName); + if (desc == null) { + throw new IOException("Can't find table descriptor for table '" + Bytes.toString(tableName) + + "' that is about to be deleted!"); + } + // if we have turned on stats for this table + if (SetupTableUtil.getStatsEnabled(desc)) { + StatisticsTable stats = StatisticsTable.getStatisticsTableForCoprocessor( + ctx.getEnvironment(), desc.getName()); + stats.removeStats(); + stats.close(); + } + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/EqualByteDepthHistogramStatisticTracker.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/EqualByteDepthHistogramStatisticTracker.java new file mode 100644 index 00000000..83e77203 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/EqualByteDepthHistogramStatisticTracker.java @@ -0,0 +1,96 @@ +package com.salesforce.hbase.stats.impl; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hbase.Coprocessor; +import org.apache.hadoop.hbase.CoprocessorEnvironment; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.util.Bytes; + +import com.google.protobuf.ByteString; +import com.salesforce.hbase.stats.BaseStatistic; +import com.salesforce.hbase.stats.HistogramStatisticValue; +import com.salesforce.hbase.stats.StatisticReader; +import com.salesforce.hbase.stats.StatisticTracker; +import com.salesforce.hbase.stats.StatisticValue; 
+import com.salesforce.hbase.stats.StatisticsTable; +import com.salesforce.hbase.stats.serialization.HistogramStatisticReader; + +/** + * {@link StatisticTracker} that keeps track of an equal depth histogram. + *

+ * This is different from a traditional histogram in that we just keep track of the key at every 'n' + * bytes; another name for this is region "guide posts". + *
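+ * <p>
+ * The tracker is attached to the primary table as a coprocessor. A minimal sketch (assuming
+ * {@code desc} is the primary table's {@link HTableDescriptor}, with a guide post every 1MB):
+ * <pre>
+ *   EqualByteDepthHistogramStatisticTracker.addToTable(desc, 1024 * 1024);
+ * </pre>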

    + * When using this statistic, be very careful when selecting the byte width of each column - + * it could lead to an incredibly large histogram, which could crash the region server. + */ +public class EqualByteDepthHistogramStatisticTracker extends BaseStatistic { + + public static final String BYTE_DEPTH_CONF_KEY = "com.salesforce.guidepost.width"; + + private final static byte[] NAME = Bytes.toBytes("equal_depth_histogram"); + + private static final long DEFAULT_BYTE_DEPTH = 100; + + private long guidepostDepth; + private long byteCount = 0; + private HistogramStatisticValue histogram; + + public static void addToTable(HTableDescriptor desc, long depth) throws IOException { + Map props = Collections.singletonMap(BYTE_DEPTH_CONF_KEY, Long.toString(depth)); + desc.addCoprocessor(EqualByteDepthHistogramStatisticTracker.class.getName(), null, + Coprocessor.PRIORITY_USER, props); + } + + /** + * Get a reader for the statistic + * @param stats statistics table from which you want to read the stats + * @return a {@link StatisticReader} to get the raw Histogram stats. + */ + public static StatisticReader getStatistcReader(StatisticsTable stats) { + return new StatisticReader(stats, + new HistogramStatisticReader(), NAME); + } + + @Override + public void start(CoprocessorEnvironment e) throws IOException { + super.start(e); + //get the byte depth for this histogram + guidepostDepth = e.getConfiguration().getLong(BYTE_DEPTH_CONF_KEY, DEFAULT_BYTE_DEPTH); + this.histogram = newHistogram(); + } + + private HistogramStatisticValue newHistogram() { + return new HistogramStatisticValue(NAME, Bytes.toBytes("equal_width_histogram_" + + guidepostDepth + "bytes"), guidepostDepth); + } + + @Override + public List getCurrentStats() { + return Collections.singletonList((StatisticValue) histogram); + } + + @Override + public void clear() { + this.histogram = newHistogram(); + this.byteCount = 0; + } + + @Override + public void updateStatistic(KeyValue kv) { + byteCount += kv.getLength(); + // if we are at the next guide-post, add it to the histogram + if (byteCount >= guidepostDepth) { + // update the histogram + this.histogram.addColumn(ByteString.copyFrom(kv.getBuffer(), kv.getOffset(), kv.getLength())); + + //reset the count for the next key + byteCount = 0; + } + } +} diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/MinMaxKey.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/MinMaxKey.java new file mode 100644 index 00000000..76cebe7f --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/MinMaxKey.java @@ -0,0 +1,121 @@ +package com.salesforce.hbase.stats.impl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.util.Bytes; + +import com.salesforce.hbase.stats.BaseStatistic; +import com.salesforce.hbase.stats.ColumnFamilyStatistic; +import com.salesforce.hbase.stats.StatisticReader; +import com.salesforce.hbase.stats.StatisticValue; +import com.salesforce.hbase.stats.StatisticsTable; +import com.salesforce.hbase.stats.serialization.PointStatisticReader; + +/** + * Coprocessor that just keeps track of the min/max key on a per-column family basis. + *

    + * This can then also be used to find the per-table min/max key for the table. + */ +public class MinMaxKey extends BaseStatistic { + + public static void addToTable(HTableDescriptor desc) throws IOException { + desc.addCoprocessor(MinMaxKey.class.getName()); + } + + private static final byte[] MAX_SUFFIX = Bytes.toBytes("max_region_key"); + private static final byte[] MIN_SUFFIX = Bytes.toBytes("min_region_key"); + private final static byte[] NAME = Bytes.toBytes("min_max_stat"); + + private byte[] min; + private byte[] max; + + @Override + public List getCurrentStats() { + List data = new ArrayList(2); + data.add(new StatisticValue(NAME, MIN_SUFFIX, min)); + data.add(new StatisticValue(NAME, MAX_SUFFIX, max)); + return data; + } + + @Override + public void clear() { + this.max = null; + this.min = null; + } + + @Override + public void updateStatistic(KeyValue kv) { + // first time through, so both are null + if (min == null) { + min = TrackerUtil.copyRow(kv); + max = TrackerUtil.copyRow(kv); + return; + } + if (Bytes.compareTo(kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(), min, 0, min.length) < 0) { + min = TrackerUtil.copyRow(kv); + } + if (Bytes.compareTo(kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(), max, 0, max.length) > 0) { + max = TrackerUtil.copyRow(kv); + } + } + + /** + * Find a reader for the the min/max key based on the type of serialization of the key. + * @param stats table from which you want to read the stats + * @return a {@link StatisticReader} to get the raw Min/Max stats. Use {@link #interpret(List)} to + * get a list of the most recent min/max values on a per-column, per-region basis. + */ + public static StatisticReader getStatistcReader(StatisticsTable stats) { + return new StatisticReader(stats, + new PointStatisticReader(), NAME); + } + + /** + * Combine the results from {@link #getStatistcReader(StatisticsTable)} into {@link MinMaxStat} + * results for easy digestion + * @param stat statistics from {@link #getStatistcReader(StatisticsTable)}. + * @return the min/max per column family per region + */ + public static List interpret(List> stat) { + List stats = new ArrayList(); + for (int i = 0; i < stat.size(); i++) { + // every two column family statistic is actually one statistic, so we need to combine them + ColumnFamilyStatistic minmax = stat.get(i++); + StatisticValue max = minmax.getValues().get(0); + StatisticValue min = minmax.getValues().get(1); + // we only return the most recent min/max combination for the column family/region + stats.add(new MinMaxStat(minmax.getRegion(), minmax.getColumnfamily(), max, min)); + } + return stats; + + } + + /** + * Abstraction of a statistic that combines two {@link StatisticValue}s to generate a single + * min/max stat for a single column family of a region. 
+ */ + public static class MinMaxStat { + + public final byte[] region; + public final byte[] family; + public final byte[] max; + public final byte[] min; + + /** + * @param region region where the stat was obtained + * @param columnfamily column family for which the stat was calculated + * @param min the min key as a {@link StatisticValue} + * @param max the max key as a {@link StatisticValue} + */ + public MinMaxStat(byte[] region, byte[] columnfamily, StatisticValue max, StatisticValue min) { + this.region = region; + this.family = columnfamily; + this.max = max.getValue(); + this.min = min.getValue(); + } + } +} diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/TrackerUtil.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/TrackerUtil.java new file mode 100644 index 00000000..b4e8ba8e --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/impl/TrackerUtil.java @@ -0,0 +1,23 @@ +package com.salesforce.hbase.stats.impl; + +import java.util.Arrays; + +import org.apache.hadoop.hbase.KeyValue; + +import com.salesforce.hbase.stats.StatisticTracker; + +/** + * Utilities for {@link StatisticTracker}s. + */ +public class TrackerUtil { + + private TrackerUtil() { + // private ctor for utils + } + + public static byte[] copyRow(KeyValue kv) { + return Arrays.copyOfRange(kv.getBuffer(), kv.getRowOffset(), + kv.getRowOffset() + kv.getRowLength()); + } + +} diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/HistogramStatisticReader.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/HistogramStatisticReader.java new file mode 100644 index 00000000..17c82130 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/HistogramStatisticReader.java @@ -0,0 +1,37 @@ +package com.salesforce.hbase.stats.serialization; + +import java.io.IOException; + +import org.apache.hadoop.hbase.client.Result; + +import com.google.protobuf.InvalidProtocolBufferException; +import com.salesforce.hbase.stats.ColumnFamilyStatistic; +import com.salesforce.hbase.stats.HistogramStatisticValue; +import com.salesforce.hbase.stats.StatisticValue; + +/** + * Get {@link HistogramStatisticValue}s from the underlying bytes. Expects serialization with the + * {@link IndividualStatisticWriter}. 
+ */ +public class HistogramStatisticReader implements IndividualStatisticReader { + private final PointStatisticReader delegate; + + public HistogramStatisticReader() { + delegate = new PointStatisticReader(); + } + + public ColumnFamilyStatistic deserialize(Result r) throws IOException { + ColumnFamilyStatistic raw = delegate.deserialize(r); + // then re-wrap the results so we can read histograms + ColumnFamilyStatistic ret = + new ColumnFamilyStatistic(raw.getRegion(), raw.getColumnfamily()); + for (StatisticValue value : raw.getValues()) { + try { + ret.add(new HistogramStatisticValue(value)); + } catch (InvalidProtocolBufferException e) { + throw new IOException(e); + } + } + return ret; + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/IndividualStatisticReader.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/IndividualStatisticReader.java new file mode 100644 index 00000000..9e56d3bd --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/IndividualStatisticReader.java @@ -0,0 +1,17 @@ +package com.salesforce.hbase.stats.serialization; + +import java.io.IOException; + +import org.apache.hadoop.hbase.client.Result; + +import com.salesforce.hbase.stats.ColumnFamilyStatistic; +import com.salesforce.hbase.stats.StatisticValue; + +/** + * Deserializer for a {@link StatisticValue} from the raw {@link Result}. This is the complement + * to the {@link IndividualStatisticWriter}. + * @param type of statistic value to deserialize + */ +public interface IndividualStatisticReader { + public ColumnFamilyStatistic deserialize(Result r) throws IOException; +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/IndividualStatisticWriter.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/IndividualStatisticWriter.java new file mode 100644 index 00000000..ca02d6e7 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/IndividualStatisticWriter.java @@ -0,0 +1,31 @@ +package com.salesforce.hbase.stats.serialization; + +import org.apache.hadoop.hbase.client.Put; + +import com.salesforce.hbase.stats.StatisticValue; +import com.salesforce.hbase.stats.util.Constants; + +/** + * Simple serializer that always puts generates the same formatted key for an individual + * statistic. This writer is used to write a single {@link StatisticValue} to the statistics + * table. They should be read back via an {@link IndividualStatisticReader}. 
+ */ +public class IndividualStatisticWriter { + private final byte[] source; + private byte[] region; + private byte[] column; + + public IndividualStatisticWriter(byte[] sourcetable, byte[] region, byte[] column) { + this.source = sourcetable; + this.region = region; + this.column = column; + } + + public Put serialize(StatisticValue value) { + byte[] prefix = StatisticSerDe.getRowKey(source, region, column, value.getType()); + Put put = new Put(prefix); + put.add(Constants.STATS_DATA_COLUMN_FAMILY, value.getInfo(), value.getValue()); + return put; + } + +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/PointStatisticReader.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/PointStatisticReader.java new file mode 100644 index 00000000..0b7400bc --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/PointStatisticReader.java @@ -0,0 +1,52 @@ +package com.salesforce.hbase.stats.serialization; + +import java.util.Arrays; + +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.util.Bytes; + +import com.salesforce.hbase.stats.ColumnFamilyStatistic; +import com.salesforce.hbase.stats.StatisticValue; + +/** + * Read simple {@link StatisticValue}s from raw {@link Result}s. Expects serialization with the + * {@link IndividualStatisticWriter}. + */ +public class PointStatisticReader implements IndividualStatisticReader { + + public ColumnFamilyStatistic deserialize(Result r) { + // break out the key based on its parts + // 1. start with getting the lengths of the key parts + byte[] row = r.getRow(); + int sizes[] = new int[StatisticSerDe.NUM_KEY_PARTS]; + int start = row.length - Bytes.SIZEOF_INT; + for (int i = StatisticSerDe.NUM_KEY_PARTS - 1; i >= 0; i--) { + sizes[i] = Bytes.toInt(row, start, Bytes.SIZEOF_INT); + start -= Bytes.SIZEOF_INT; + } + + // 1b. break out each part of the key so we can rebuild the statistic + start = sizes[0]; // this is the end of the table name, so we can just skip it immediately + int end = start + sizes[1]; + // for right now, we just copy the array over - its a bit inefficient, but we can always go to + // ByteBuffers later. 
+ byte[] statname = Arrays.copyOfRange(row, start,end); + start += sizes[1]; + end= start+ sizes[2]; + byte[] region = Arrays.copyOfRange(row, start, end); + start += sizes[2]; + end= start+ sizes[3]; + byte[] family = Arrays.copyOfRange(row, start, end); + ColumnFamilyStatistic stat = + new ColumnFamilyStatistic(region, family); + for (KeyValue kv : r.list()) { + byte[] info = Arrays.copyOfRange(kv.getBuffer(), kv.getQualifierOffset(), + kv.getQualifierOffset() + kv.getQualifierLength()); + byte[] value = Arrays.copyOfRange(kv.getBuffer(), kv.getValueOffset(), kv.getValueOffset() + + kv.getValueLength()); + stat.add(new StatisticValue(statname, info, value)); + } + return stat; + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/StatisticSerDe.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/StatisticSerDe.java new file mode 100644 index 00000000..15ca6e21 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/StatisticSerDe.java @@ -0,0 +1,70 @@ +package com.salesforce.hbase.stats.serialization; + +import org.apache.hadoop.hbase.util.Bytes; + + +/** + * Simple utility class for managing multiple key parts of the statistic + */ +public class StatisticSerDe { + + private StatisticSerDe() { + // private ctor for utility classes + } + + /** Number of parts in our complex key */ + protected static final int NUM_KEY_PARTS = 4; + + /** + * Get the prefix based on the region, column and name of the statistic + * @param table name of the source table + * @param statName name of the statistic + * @return the row key that should be used for this statistic + */ + public static byte[] getRowPrefix(byte[] table, byte[] statName) { + byte[] prefix = table; + prefix = Bytes.add(prefix, statName); + return prefix; + } + + /** + * Get the prefix based on the region, column and name of the statistic + * @param table name of the source table + * @param regionname name of the region where the statistic was gathered + * @param statName name of the statistic + * @return the row key that should be used for this statistic + */ + public static byte[] getRowPrefix(byte[] table, byte[] regionname, byte[] statName) { + byte[] prefix = table; + prefix = Bytes.add(prefix, statName); + prefix = Bytes.add(prefix, regionname); + return prefix; + } + + /** + * Get the prefix based on the region, column and name of the statistic + * @param table name of the source table + * @param region name of the region where the statistic was gathered + * @param column column for which the statistic was gathered + * @param statName name of the statistic + * @return the row key that should be used for this statistic + */ + public static byte[] getRowKey(byte[] table, byte[] region, byte[] column, byte[] statName) { + // always starts with the source table + byte[] prefix = new byte[0]; + // then append each part of the key and + byte[][] parts = new byte[][] { table, statName, region, column }; + int[] sizes = new int[NUM_KEY_PARTS]; + // XXX - this where we would use orderly to get the sorting consistent + for (int i = 0; i < NUM_KEY_PARTS; i++) { + prefix = Bytes.add(prefix, parts[i]); + sizes[i] = parts[i].length; + } + // then we add on the sizes to the end of the key + for (int size : sizes) { + prefix = Bytes.add(prefix, Bytes.toBytes(size)); + } + + return prefix; + } +} \ No newline at end of file diff --git 
a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/package-info.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/package-info.java new file mode 100644 index 00000000..f74245b1 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/serialization/package-info.java @@ -0,0 +1,51 @@ +package com.salesforce.hbase.stats.serialization; + + +/** +Overview of Statistics Serialization +

+<p>
+Contents:
+<ul>
+  <li>Overview</li>
+  <li>Reading the StatisticsTable</li>
+</ul>
+
+<b>Overview</b>
+
+These are the various pieces necessary to serialize a statistic to a list of {@link KeyValue}s.
+Right now, this is implemented with a single serialization order in the row key:
+
+  table | statistic name | region | column family
+
+where table, region and column family are in reference to the source table.
+
+This lets you easily aggregate a single statistic over a given region or quickly access a
+single statistic for a given column in a given region.
+
+For cases where you want to know a statistic for a single family, but across all regions, you
+would need to do the same scan as in the above case, but filter out other columns, which can be
+inefficient, but isn't a killer because we won't have that many stores (perhaps on the
+order of several thousand across all regions).
+
+We could extend this serialization to be more flexible (different key-part ordering for different
+statistics based on desired access patterns), but this is orders of magnitude simpler.
+
+<b>Reading the StatisticsTable</b>
+
+Some statistics can be read directly from the statistics table since they are just simple point
+values. For instance, the {@link com.salesforce.hbase.stats.impl.EqualByteDepthHistogramStatisticTracker}
+can be read using a simple
+{@link com.salesforce.hbase.stats.serialization.HistogramStatisticReader}, like this:
      +
      +
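+(A minimal sketch; statTable is assumed to be the
+{@link com.salesforce.hbase.stats.StatisticsTable} bound to the primary table.)
+
+  StatisticReader<HistogramStatisticValue> reader =
+      EqualByteDepthHistogramStatisticTracker.getStatistcReader(statTable);
+  List<ColumnFamilyStatistic<HistogramStatisticValue>> stats = reader.read();
+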
+Other statistics have a slightly more complicated internal structure - i.e. they use multiple
+column qualifiers - and should provide a special reader. For instance,
+{@link com.salesforce.hbase.stats.impl.MinMaxKey} provides a custom reader that can be used like:
+
+  StatisticsTable statTable = new StatisticsTable(UTIL.getConfiguration(), primary);
+  StatisticReader<StatisticValue> reader = MinMaxKey.getStatistcReader(statTable);
+  List<MinMaxStat> results = MinMaxKey.interpret(reader.read());
+
      + */ \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/util/Constants.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/util/Constants.java new file mode 100644 index 00000000..46765219 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/util/Constants.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.salesforce.hbase.stats.util; + +import org.apache.hadoop.hbase.util.Bytes; + +/** + * General constants for hbase-stat + */ +public class Constants { + + + private Constants() { + // private ctor for utility class + } + + + /** Name of the column family to store all the statistics data */ + public static final byte[] STATS_DATA_COLUMN_FAMILY = Bytes.toBytes("STAT"); + + /** Name of the statistics table */ + public static final String STATS_TABLE_NAME = "_stats_"; + + public static final byte[] STATS_TABLE_NAME_BYTES = Bytes.toBytes(STATS_TABLE_NAME); +} diff --git a/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/util/SetupTableUtil.java b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/util/SetupTableUtil.java new file mode 100644 index 00000000..62aaf958 --- /dev/null +++ b/contrib/hbase-stat/src/main/java/com/salesforce/hbase/stats/util/SetupTableUtil.java @@ -0,0 +1,118 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.salesforce.hbase.stats.util; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.util.Bytes; + +import com.google.common.base.Preconditions; +import com.salesforce.hbase.stats.cleanup.CleanupStatistics; + +import static com.salesforce.hbase.stats.util.Constants.STATS_TABLE_NAME; + +/** + * Utility helper class to ensure that your primary and statistics table is setup correctly + */ +public class SetupTableUtil { + + private static final String TABLE_STATS_ENABLED_DESC_KEY = "com.salesforce.hbase.stats.cleanup"; + + private SetupTableUtil() { + // private ctor for util classes + } + + + /** + * Ensure all the necessary coprocessors are added to a cluster's configuration + * @param conf {@link Configuration} to update + */ + public static void setupCluster(Configuration conf){ + CleanupStatistics.setupClusterConfiguration(conf); + } + + public static void setupTable(HBaseAdmin admin, HTableDescriptor primaryTable, + boolean ensureStatTable, boolean createStatTable) + throws IOException { + // add the right keys to the primary table + primaryTable.setValue(TABLE_STATS_ENABLED_DESC_KEY, "true"); + CleanupStatistics.setupTable(primaryTable); + + if (!ensureStatTable) { + return; + } + + // ensure that the stats table is setup correctly + boolean exists = admin.tableExists(STATS_TABLE_NAME); + HTableDescriptor statDesc = null; + if (exists) { + if (createStatTable) { + throw new IllegalStateException("Statistics table '" + STATS_TABLE_NAME + + " was requested to be created, but already exists!"); + } else { + // get the descriptor so we can verify it has the right properties + statDesc = admin.getTableDescriptor(Bytes.toBytes(STATS_TABLE_NAME)); + } + } else { + if (createStatTable) { + statDesc = createStatsTable(admin); + } + } + verifyStatsTable(statDesc); + } + + public static HTableDescriptor createStatsTable(HBaseAdmin admin) throws IOException { + HTableDescriptor statDesc = new HTableDescriptor(STATS_TABLE_NAME); + HColumnDescriptor col = new HColumnDescriptor(Constants.STATS_DATA_COLUMN_FAMILY); + col.setMaxVersions(1); + statDesc.addFamily(col); + admin.createTable(statDesc); + return statDesc; + } + + /** + * @param desc {@link HTableDescriptor} of the statistics table to verify + */ + public static void verifyStatsTable(HTableDescriptor desc) { + if (!desc.hasFamily(Constants.STATS_DATA_COLUMN_FAMILY)) { + throw new IllegalStateException("Statistics table '" + desc + + "' doesn't have expected column family: " + Bytes.toString(Constants.STATS_DATA_COLUMN_FAMILY)); + } + // only keep around a single version + int versions = desc.getFamily(Constants.STATS_DATA_COLUMN_FAMILY).getMaxVersions(); + Preconditions.checkState(versions == 1, + "Stats rows should only have a single version, but set to: " + versions); + } + + + /** + * @param desc {@link HTableDescriptor} to check + * @return true if statistics have been turned on for the table + */ + public static boolean getStatsEnabled(HTableDescriptor desc) { + String hasStats = desc.getValue(TABLE_STATS_ENABLED_DESC_KEY); + if (hasStats != null && hasStats.equals("true")) { + return true; + } + return false; + } +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/protobuf/README.txt b/contrib/hbase-stat/src/main/protobuf/README.txt new file mode 100644 index 00000000..de45ecc8 --- /dev/null +++ 
b/contrib/hbase-stat/src/main/protobuf/README.txt @@ -0,0 +1,26 @@ +These are the protobuf definition files used by hbase-stat. The produced java +classes are generated into src/main/java/com/salesforce/hbase/protobuf/generated +and then checked in. The reasoning is that they change infrequently. + +To regnerate the classes after making definition file changes, ensure first that +the protobuf protoc tool is in your $PATH (You may need to download it and build +it first; its part of the protobuf package obtainable from here: +http://code.google.com/p/protobuf/downloads/list). Then run the following from +contrib/hbase-stat (You should be able to just copy and paste the below into a +terminal and hit return -- the protoc compiler runs fast): + + UNIX_PROTO_DIR=src/main/protobuf + JAVA_DIR=src/main/java/ + mkdir -p $JAVA_DIR 2> /dev/null + if which cygpath 2> /dev/null; then + PROTO_DIR=`cygpath --windows $UNIX_PROTO_DIR` + JAVA_DIR=`cygpath --windows $JAVA_DIR` + else + PROTO_DIR=$UNIX_PROTO_DIR + fi + for PROTO_FILE in $UNIX_PROTO_DIR/*.proto + do + protoc -I$PROTO_DIR --java_out=$JAVA_DIR $PROTO_FILE + done + +After you've done the above, check it in. \ No newline at end of file diff --git a/contrib/hbase-stat/src/main/protobuf/stats.proto b/contrib/hbase-stat/src/main/protobuf/stats.proto new file mode 100644 index 00000000..7677fb63 --- /dev/null +++ b/contrib/hbase-stat/src/main/protobuf/stats.proto @@ -0,0 +1,14 @@ +// Statistics protobufs - used for compatability with HBase 0.96 + +option java_package = "com.salesforce.hbase.protobuf.generated"; +option java_outer_classname = "StatisticProtos"; +option java_generate_equals_and_hash = true; +option optimize_for = SPEED; + +message Histogram{ + //all histograms have either a fixed depth or width + required int64 depthOrWidth = 1; + // fixed depth will have different values as row keys + //fixed width will just have counts + repeated bytes value = 2; +} \ No newline at end of file diff --git a/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestEqualWidthHistogramOnTable.java b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestEqualWidthHistogramOnTable.java new file mode 100644 index 00000000..7132f151 --- /dev/null +++ b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestEqualWidthHistogramOnTable.java @@ -0,0 +1,91 @@ +package com.salesforce.hbase.stats; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +import java.util.List; + +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.util.Bytes; + +import com.google.protobuf.ByteString; +import com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram; +import com.salesforce.hbase.stats.impl.EqualByteDepthHistogramStatisticTracker; +import com.salesforce.hbase.stats.util.Constants; +import com.salesforce.hbase.stats.util.StatsTestUtil; + +/** + * A full, real table test of the the {@link EqualByteDepthHistogramStatisticTracker}. This is the + * complement to {@link TestEqualWidthHistogramStat}. 
+ */ +public class TestEqualWidthHistogramOnTable extends TestTrackerImpl { + + // number of keys in each column + private final int columnWidth = 676; + // depth is the width (count of keys) times the number of bytes of each key, which in this case is + // fixed to 32 bytes, so we know the depth in all cases + private final int columnDepth = columnWidth * 32; + + @Override + protected void preparePrimaryTableDescriptor(HTableDescriptor primary) throws Exception { + EqualByteDepthHistogramStatisticTracker.addToTable(primary, columnDepth); + } + + @Override + protected void verifyStatistics(HTableDescriptor primary) throws Exception { + // scan the stats table for a raw count + HTable statTable = new HTable(UTIL.getConfiguration(), Constants.STATS_TABLE_NAME); + int count = StatsTestUtil.getKeyValueCount(statTable); + + // we should have just 1 stat - our histogram + assertEquals("Got an unexpected amount of stats!", 1, count); + StatisticsTable table = new StatisticsTable(UTIL.getConfiguration(), primary); + + // now get a custom reader to interpret the results + StatisticReader reader = EqualByteDepthHistogramStatisticTracker + .getStatistcReader(table); + List> stats = reader.read(); + + // should only have a single column family + assertEquals("More than one column family has statistics!", 1, stats.size()); + List values = stats.get(0).getValues(); + assertEquals("Wrong number of histograms for the column family/region", 1, values.size()); + Histogram histogram = values.get(0).getHistogram(); + assertEquals("Got an incorrect number of guideposts! Got: " + toStringFixedDepth(histogram), + 26, histogram.getValueList().size()); + + // make sure we got the correct guideposts + byte counter = 'a'; + for (ByteString column : histogram.getValueList()) { + byte[] guidepost = new byte[] { counter, 'z', 'z' }; + byte[] data = column.toByteArray(); + // row key is actually stored flipped, so we flip it here + byte[] actual = new byte[] { data[data.length - 3], data[data.length - 2], + data[data.length - 1] }; + assertArrayEquals( + "Guidepost should be:" + Bytes.toString(guidepost) + " , but was: " + + Bytes.toString(actual), guidepost, actual); + counter++; + } + + // cleanup + statTable.close(); + table.close(); + } + + /** + * @param histogram to print + * @return a string representation of the fixed depth histogram which stores keyvalues + */ + private String toStringFixedDepth(Histogram histogram) { + StringBuilder sb = new StringBuilder("histogram: " + histogram.getDepthOrWidth() + " depth, "); + for (ByteString bs : histogram.getValueList()) { + sb.append(new KeyValue(bs.toByteArray()).toString()); + sb.append(","); + } + return sb.toString(); + } + +} diff --git a/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestEqualWidthHistogramStat.java b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestEqualWidthHistogramStat.java new file mode 100644 index 00000000..33b40f93 --- /dev/null +++ b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestEqualWidthHistogramStat.java @@ -0,0 +1,105 @@ +package com.salesforce.hbase.stats; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.KeyValue; +import org.apache.hadoop.hbase.client.HTableInterface; +import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; +import 
org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Test; +import org.mockito.Mockito; + +import com.google.protobuf.ByteString; +import com.google.protobuf.InvalidProtocolBufferException; +import com.salesforce.hbase.protobuf.generated.StatisticProtos.Histogram; +import com.salesforce.hbase.stats.impl.EqualByteDepthHistogramStatisticTracker; + +/** + * Simple unit test of equal width histograms. Doesn't test against a full cluster, but rather is + * just simple interface testing. + */ +public class TestEqualWidthHistogramStat { + + // number of keys in each column + private final int columnWidth = 676; + // depth is the width (count of keys) times the number of bytes of each key, which in this case is + // fixed to 3 bytes, so we know the depth in all cases + private final int columnDepth = columnWidth * 3; + + @Test + public void testSimpleStat() throws IOException { + EqualByteDepthHistogramStatisticTracker tracker = new EqualByteDepthHistogramStatisticTracker(); + // unfortunately, need to mock a lot here + RegionCoprocessorEnvironment env = Mockito.mock(RegionCoprocessorEnvironment.class); + HRegion mockRegion = Mockito.mock(HRegion.class); + String tableName = "testSimpleStatPrimary"; + HTableDescriptor primary = new HTableDescriptor(tableName); + Mockito.when(env.getRegion()).thenReturn(mockRegion); + Mockito.when(mockRegion.getTableDesc()).thenReturn(primary); + HTableInterface mockTable = Mockito.mock(HTableInterface.class); + Mockito.when(env.getTable((byte[]) Mockito.any())).thenReturn(mockTable); + + // setup the actual configuration that we care about + Configuration conf = new Configuration(false); + // setup our byte width == to [letter]zz + conf.setLong(EqualByteDepthHistogramStatisticTracker.BYTE_DEPTH_CONF_KEY, columnDepth); + Mockito.when(env.getConfiguration()).thenReturn(conf); + + // setup the tracker + tracker.start(env); + + // put some data in the tracker and check the histograms that come out + loadAndVerifyTracker(tracker); + + // should be able to clear it and get the exact same results + tracker.clear(); + loadAndVerifyTracker(tracker); + } + + /** + * @param tracker tracker to load with data and then validate + * @throws InvalidProtocolBufferException if protobufs are broken - should not be thrown since we + * are not serializing information + */ + private void loadAndVerifyTracker(EqualByteDepthHistogramStatisticTracker tracker) + throws InvalidProtocolBufferException { + // now feed the tracker a bunch of bytes + KeyValue kv = new KeyValue(); + byte[] k = new byte[3]; + for (byte b1 = 'a'; b1 <= 'z'; b1++) { + for (byte b2 = 'a'; b2 <= 'z'; b2++) { + for (byte b3 = 'a'; b3 <= 'z'; b3++) { + k[0] = b1; + k[1] = b2; + k[2] = b3; + kv = new KeyValue(k, 0, 3); + tracker.updateStatistic(kv); + } + } + } + + List stats = tracker.getCurrentStats(); + assertEquals("Got more than one histogram!", 1, stats.size()); + HistogramStatisticValue stat = (HistogramStatisticValue) stats.get(0); + Histogram histogram = stat.getHistogram(); + assertEquals("Got an incorrect number of guideposts!", 26, histogram.getValueList().size()); + + // make sure we got the correct guideposts + byte counter = 'a'; + for (ByteString column : histogram.getValueList()) { + byte[] guidepost = new byte[] { counter, 'z', 'z' }; + byte[] actual = column.toByteArray(); + assertArrayEquals( + "Guidepost should be:" + Bytes.toString(guidepost) + " , but was: " + + Bytes.toString(actual), guidepost, actual); + counter++; + } + } +} \ No newline 
at end of file diff --git a/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestMinMaxKeyStats.java b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestMinMaxKeyStats.java new file mode 100644 index 00000000..e573da22 --- /dev/null +++ b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestMinMaxKeyStats.java @@ -0,0 +1,53 @@ +package com.salesforce.hbase.stats; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.client.HTable; + +import com.salesforce.hbase.stats.impl.MinMaxKey; +import com.salesforce.hbase.stats.impl.MinMaxKey.MinMaxStat; +import com.salesforce.hbase.stats.util.Constants; +import com.salesforce.hbase.stats.util.StatsTestUtil; + +/** + * Test the min/max key on a real table + */ +public class TestMinMaxKeyStats extends TestTrackerImpl { + + @Override + protected void preparePrimaryTableDescriptor(HTableDescriptor primary) throws IOException { + // just track the Min/Max Key + MinMaxKey.addToTable(primary); + } + + @Override + protected void verifyStatistics(HTableDescriptor primary) throws IOException { + // scan the stats table for a raw count + HTable stats = new HTable(UTIL.getConfiguration(), Constants.STATS_TABLE_NAME); + int count = StatsTestUtil.getKeyValueCount(stats); + + // we should have 2 stats - a min and a max for the one column of the one region of the table + assertEquals("Got an unexpected amount of stats!", 2, count); + + // then do a read with the actual statistics + // we know we are going to collect MinMaxKey so reading ensures we are collecting correctly + StatisticsTable statTable = new StatisticsTable(UTIL.getConfiguration(), primary); + StatisticReader reader = MinMaxKey.getStatistcReader(statTable); + List results = MinMaxKey.interpret(reader.read()); + assertEquals("Unexpected number of min/max results!", 1, results.size()); + assertArrayEquals("Unexpected number of min result!", new byte[] { 'a', 'a', 'a' }, + results.get(0).min); + assertArrayEquals("Unexpected number of min result!", new byte[] { 'z', 'z', 'z' }, + results.get(0).max); + + // cleanup after ourselves + stats.close(); + statTable.close(); + } + +} diff --git a/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestTrackerImpl.java b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestTrackerImpl.java new file mode 100644 index 00000000..d292e2ed --- /dev/null +++ b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/TestTrackerImpl.java @@ -0,0 +1,145 @@ +package com.salesforce.hbase.stats; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.List; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HColumnDescriptor; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.HTable; +import org.apache.hadoop.hbase.regionserver.HRegion; +import org.apache.hadoop.hbase.regionserver.HRegionServer; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.After; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import com.salesforce.hbase.stats.util.Constants; +import 
com.salesforce.hbase.stats.util.SetupTableUtil; +import com.salesforce.hbase.stats.util.StatsTestUtil; + +/** + * Base class for testing an implementation of a statistic. + *

      + * Uses the {@link HBaseTestingUtility#loadTable(HTable, byte[])} to load the {@link #FAM} column + * family with data. The table is then flushed and compacted, ensuring statistics are gathered + * through the normal mechanisms. + *

      + * Use {@link #preparePrimaryTableDescriptor(HTableDescriptor)} to add your custom + * {@link StatisticTracker} to the table. + *

      + * Use {@link #verifyStatistics(HTableDescriptor)} to verify that all the correct statistics have + * been collected on the table, after it has been loaded, flushed and compacted. + */ +@SuppressWarnings("javadoc") +public abstract class TestTrackerImpl { + protected static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + public static final byte[] FAM = Bytes.toBytes("FAMILY"); + public static final Log LOG = LogFactory.getLog(TestTrackerImpl.class); + + @BeforeClass + public static void setupCluster() throws Exception { + SetupTableUtil.setupCluster(UTIL.getConfiguration()); + UTIL.startMiniCluster(); + } + + @AfterClass + public static void teardownCluster() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Before + public void setupTables() throws Exception { + HBaseAdmin admin = UTIL.getHBaseAdmin(); + // setup the stats table + SetupTableUtil.createStatsTable(admin); + // make sure the stats table got created + assertTrue("Stats table didn't get created!", admin.tableExists(Constants.STATS_TABLE_NAME)); + } + + @After + public void cleanupTables() throws Exception { + HBaseAdmin admin = UTIL.getHBaseAdmin(); + admin.disableTable(Constants.STATS_TABLE_NAME); + admin.deleteTable(Constants.STATS_TABLE_NAME); + admin.close(); + } + + /** + * Goes through a full end-to-end test of gathering statistics on a table. + *

      + * First, we create and verify the statistics table. Then we write some data to the primary table. + * Finally, we check that the given statistic is enabled and working correctly by reading the + * stats table. + * @throws Exception on failure + */ + @Test + public void testSimplePrimaryAndStatsTables() throws Exception { + HBaseAdmin admin = UTIL.getHBaseAdmin(); + + // setup our primary table + HTableDescriptor primary = new HTableDescriptor("testSimplePrimaryAndStatsTables"); + primary.addFamily(new HColumnDescriptor(FAM)); + + // make sure stats are enabled on the table + SetupTableUtil.setupTable(UTIL.getHBaseAdmin(), primary, false, false); + + // do any further setup on the table + preparePrimaryTableDescriptor(primary); + + // create the primary table + admin.createTable(primary); + + // load some data into our primary table + HTable primaryTable = new HTable(UTIL.getConfiguration(), primary.getName()); + UTIL.loadTable(primaryTable, FAM); + + // now flush and compact our table + HRegionServer server = UTIL.getRSForFirstRegionInTable(primary.getName()); + List regions = server.getOnlineRegions(primary.getName()); + assertTrue("Didn't find any regions for primary table!", regions.size() > 0); + // flush and compact all the regions of the primary table + for (HRegion region : regions) { + region.flushcache(); + region.compactStores(true); + } + + // make sure all the stats that we expect got written + verifyStatistics(primary); + + // then delete the table and make sure we don't have any more stats in our table + admin.disableTable(primary.getName()); + admin.deleteTable(primary.getName()); + + // make sure that we cleanup the stats on table delete + HTable stats = new HTable(UTIL.getConfiguration(), Constants.STATS_TABLE_NAME); + assertEquals("Stats table still has values after primary table delete", 0, + StatsTestUtil.getKeyValueCount(stats)); + + // and cleanup after ourselves + stats.close(); + } + + /** + * Prepare the primary table descriptor for the test. For instance, add the + * {@link StatisticTracker} to the table. This is called before the primary table is created + * @throws Exception on failure + */ + protected abstract void preparePrimaryTableDescriptor(HTableDescriptor primary) throws Exception; + + /** + * Verify the statistics on the given primary table after the table has been loaded, flushed, and + * compacted. + * @param primary {@link HTableDescriptor} for the primary table for which we were collecting + * statistics + * @throws Exception on failure + */ + protected abstract void verifyStatistics(HTableDescriptor primary) throws Exception; +} diff --git a/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/util/StatsTestUtil.java b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/util/StatsTestUtil.java new file mode 100644 index 00000000..cb3c634c --- /dev/null +++ b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/util/StatsTestUtil.java @@ -0,0 +1,66 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.salesforce.hbase.stats.util;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.KeyValue;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.Result;
+import org.apache.hadoop.hbase.client.ResultScanner;
+import org.apache.hadoop.hbase.client.Scan;
+
+import com.salesforce.hbase.stats.TestTrackerImpl;
+
+/**
+ * Helper utility for testing
+ */
+public class StatsTestUtil {
+
+  /**
+   * @return a valid {@link HTableDescriptor} for the primary table on which we want to collect
+   *         statistics
+   */
+  public static HTableDescriptor getValidPrimaryTableDescriptor() {
+    HTableDescriptor table = new HTableDescriptor("primary_table_for_test");
+    return table;
+  }
+
+  /**
+   * Count the total number of {@link KeyValue}s stored in the table
+   * @param table the table to count
+   * @return the number of {@link KeyValue}s in the table
+   * @throws IOException if the table has an error while reading
+   */
+  public static int getKeyValueCount(HTable table) throws IOException {
+    Scan scan = new Scan();
+    scan.setMaxVersions(Integer.MAX_VALUE - 1);
+
+    ResultScanner results = table.getScanner(scan);
+    int count = 0;
+    for (Result res : results) {
+      count += res.list().size();
+      TestTrackerImpl.LOG.info(count + ") " + res);
+    }
+    results.close();
+
+    return count;
+  }
+
+}
diff --git a/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/util/TestSetupTable.java b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/util/TestSetupTable.java
new file mode 100644
index 00000000..b9075dd5
--- /dev/null
+++ b/contrib/hbase-stat/src/test/java/com/salesforce/hbase/stats/util/TestSetupTable.java
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable
+ * law or agreed to in writing, software distributed under the License is distributed on an "AS IS"
+ * BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License
+ * for the specific language governing permissions and limitations under the License.
+ */
+package com.salesforce.hbase.stats.util;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Ensure that the statistics table gets setup correctly
+ */
+public class TestSetupTable {
+
+  private static final HBaseTestingUtility UTIL = new HBaseTestingUtility();
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    UTIL.startMiniCluster();
+  }
+
+  @AfterClass
+  public static void teardownCluster() throws Exception {
+    UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testCreatesStatTable() throws Exception {
+    HTableDescriptor primary = StatsTestUtil.getValidPrimaryTableDescriptor();
+    HBaseAdmin admin = UTIL.getHBaseAdmin();
+    SetupTableUtil.setupTable(admin, primary, true, true);
+
+    assertTrue("Statistics table didn't get created!", admin.tableExists(Constants.STATS_TABLE_NAME_BYTES));
+    // make sure it is a valid table
+    HTableDescriptor statDesc = admin.getTableDescriptor(Constants.STATS_TABLE_NAME_BYTES);
+    try {
+      SetupTableUtil.verifyStatsTable(statDesc);
+    } catch (Exception e) {
+      fail("Created statistics table isn't considered valid! Maybe missing a check in the creation?");
+    }
+
+    // cleanup after ourselves
+    admin.disableTable(Constants.STATS_TABLE_NAME_BYTES);
+    admin.deleteTable(Constants.STATS_TABLE_NAME_BYTES);
+  }
+}
\ No newline at end of file
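
Note for reviewers (not part of the patch): a concrete tracker test only has to fill in the two abstract hooks on TestTrackerImpl. The sketch below is a minimal example of that contract, assuming a hypothetical tracker class com.example.MyTracker (standing in for the real implementations in this patch) that can be attached to the primary table as a coprocessor; the verification step just asserts that something was written to the stats table via StatsTestUtil.getKeyValueCount.

package com.salesforce.hbase.stats;

import static org.junit.Assert.assertTrue;

import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.client.HTable;

import com.salesforce.hbase.stats.util.Constants;
import com.salesforce.hbase.stats.util.StatsTestUtil;

public class TestMyTrackerStats extends TestTrackerImpl {

  @Override
  protected void preparePrimaryTableDescriptor(HTableDescriptor primary) throws Exception {
    // com.example.MyTracker is a hypothetical StatisticTracker; attach it to the primary table
    // as a coprocessor so statistics are gathered when the regions flush/compact. The real
    // trackers in this patch may expose their own setup helpers instead.
    primary.addCoprocessor("com.example.MyTracker");
  }

  @Override
  protected void verifyStatistics(HTableDescriptor primary) throws Exception {
    // simplest possible check: the tracker should have written at least one KeyValue to the
    // stats table for the primary table's regions
    HTable stats = new HTable(UTIL.getConfiguration(), Constants.STATS_TABLE_NAME);
    try {
      assertTrue("No statistics were written for " + primary.getNameAsString(),
        StatsTestUtil.getKeyValueCount(stats) > 0);
    } finally {
      stats.close();
    }
  }
}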