From 09f62ab38b82be7ea5bc01e253d61a185a877fb8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 30 Oct 2017 16:54:23 -0700 Subject: [PATCH] Speeding up the case for sparse float columns that have only 1 value. PiperOrigin-RevId: 173971121 --- .../contrib/boosted_trees/lib/utils/example.h | 120 +++++++++++------- .../boosted_trees/lib/utils/example_test.cc | 53 +++++--- .../lib/utils/examples_iterable.cc | 4 + .../lib/utils/examples_iterable.h | 51 +++++--- .../lib/utils/examples_iterable_test.cc | 1 + 5 files changed, 149 insertions(+), 80 deletions(-) diff --git a/tensorflow/contrib/boosted_trees/lib/utils/example.h b/tensorflow/contrib/boosted_trees/lib/utils/example.h index 9514416660c..e388cf332c3 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/example.h +++ b/tensorflow/contrib/boosted_trees/lib/utils/example.h @@ -17,7 +17,6 @@ #define THIRD_PARTY_TENSORFLOW_CONTRIB_BOOSTED_TREES_LIB_UTILS_EXAMPLE_H_ #include -#include #include #include #include "tensorflow/contrib/boosted_trees/lib/utils/optional_value.h" @@ -25,55 +24,85 @@ namespace tensorflow { namespace boosted_trees { namespace utils { - -// A matrix that given feature column id and feature value id will return -// either a value or an optional. First index indicates feature column, second -// index - the index of the value within this column - for single valued, it -// will be 0. -// Allows double-subscript access [][]. +// Represents a sparse vector that has a value for some feature indices within +// the feature column. +// Allows subscript access []. 
template -class SparseMatrix { - typedef std::vector> SparseMap; - - class Proxy { - public: - Proxy(const int32 feature_column_idx, const SparseMap& values) - : feature_column_idx_(feature_column_idx), values_(values) {} - - OptionalValue operator[](int feature_idx) const { - auto value_iter = std::find_if( - values_.begin(), values_.end(), - [this, &feature_idx](const std::tuple& element) { - return std::get<0>(element) == feature_column_idx_ && - std::get<1>(element) == feature_idx; - }); - - if (value_iter == values_.end()) { - return OptionalValue(); - } - // There is this feature column and feature id. - return OptionalValue(std::get<2>(*value_iter)); - } - - private: - int32 feature_column_idx_; - const SparseMap& values_; - }; - +class SparseMultidimensionalValues { public: - void addElement(const int32 feature_column_idx, const int32 feature_idx, - const T value) { - values_.emplace_back(feature_column_idx, feature_idx, value); + void Add(const int32 feature_idx, const T value) { + values_.emplace_back(feature_idx, value); } - void clear() { values_.clear(); } + void Clear() { values_.clear(); } - Proxy operator[](int feature_column_idx) const { - return Proxy(feature_column_idx, values_); + void Reserve(const int32 size) { values_.reserve(size); } + + OptionalValue operator[](int feature_idx) const { + auto value_iter = + std::find_if(values_.begin(), values_.end(), + [&feature_idx](const std::pair& element) { + return element.first == feature_idx; + }); + + if (value_iter == values_.end()) { + return OptionalValue(); + } + return OptionalValue(value_iter->second); } private: - SparseMap values_; + std::vector> values_; +}; + +// Represents storage for a sparse float feature column. Can store values either +// for one dimensional or a multivalent (multidimensional) sparse column. +// Allows subscript operator access [feature_id]. 
+template +class SparseFloatFeatureColumn { + public: + void Reserve(const int32 size) { + if (!single_dimensional_) { + mutlidimensional_values.Reserve(size); + } + } + + void SetDimension(const int32 dimension) { + single_dimensional_ = dimension <= 1; + } + + void Add(const int32 feature_idx, const float value) { + if (single_dimensional_) { + DCHECK_EQ(0, feature_idx); + single_value_ = value; + } else { + mutlidimensional_values.Add(feature_idx, value); + } + initialized_ = true; + } + + void Clear() { + single_dimensional_ = false; + initialized_ = false; + mutlidimensional_values.Clear(); + } + + OptionalValue operator[](int feature_idx) const { + if (!initialized_) { + return OptionalValue(); + } + if (single_dimensional_) { + return OptionalValue(single_value_); + } else { + return mutlidimensional_values[feature_idx]; + } + } + + private: + bool single_dimensional_; + bool initialized_; + T single_value_; + SparseMultidimensionalValues mutlidimensional_values; }; // Holds data for one example and enables lookup by feature column. @@ -87,9 +116,10 @@ struct Example { // Dense and sparse float features indexed by feature column. // TODO(salehay): figure out a design to support multivalent float features. std::vector dense_float_features; - // Sparse float features are allowed to be multivalent and thus can be - // represented as a sparse matrix. - SparseMatrix sparse_float_features; + + // Sparse float feature columns (each can be either single or multivalent + // (multidimensional)). + std::vector> sparse_float_features; // Sparse integer features indexed by feature column. // Note that all integer features are assumed to be categorical, i.e. 
will diff --git a/tensorflow/contrib/boosted_trees/lib/utils/example_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/example_test.cc index f78fd25022e..be9d63ee8ae 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/example_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/example_test.cc @@ -25,21 +25,33 @@ namespace { class ExampleTest : public ::testing::Test {}; TEST_F(ExampleTest, TestSparseMatrix) { - // Create the following matrix: - // row id | | 0.4 | 0.3 - // 0 | 1 | | 2 - // 1 | 3 | 1 | 5 - // 2 | | | -4 - // 3 | | | - SparseMatrix matrix; - matrix.addElement(0, 1, 0.4f); - matrix.addElement(0, 2, 0.3f); - matrix.addElement(1, 0, 1.f); - matrix.addElement(1, 2, 2.f); - matrix.addElement(2, 0, 3.f); - matrix.addElement(2, 1, 1.f); - matrix.addElement(2, 2, 5.f); - matrix.addElement(3, 2, -4.f); + // Create the following matrix (FC is feature column): + // FC | f0 | f1 | f2 + // multidimensional + // 0 | | 0.4 | 0.3 + // 1 | 1 | | 2 + // 2 | 3 | 1 | 5 + // 3 | | | + // one dimensional columns + // 4 | -4 + // 5 | + std::vector> matrix; + matrix.resize(6); + matrix[0].SetDimension(3); + matrix[1].SetDimension(3); + matrix[2].SetDimension(3); + matrix[3].SetDimension(3); + matrix[4].SetDimension(1); + matrix[5].SetDimension(1); + + matrix[0].Add(1, 0.4f); + matrix[0].Add(2, 0.3f); + matrix[1].Add(0, 1.f); + matrix[1].Add(2, 2.f); + matrix[2].Add(0, 3.f); + matrix[2].Add(1, 1.f); + matrix[2].Add(2, 5.f); + matrix[4].Add(0, -4.f); // Row 0. EXPECT_FALSE(matrix[0][0].has_value()); @@ -66,13 +78,14 @@ TEST_F(ExampleTest, TestSparseMatrix) { // Row 3. EXPECT_FALSE(matrix[3][0].has_value()); EXPECT_FALSE(matrix[3][1].has_value()); - EXPECT_TRUE(matrix[3][2].has_value()); - EXPECT_EQ(-4.f, matrix[3][2].get_value()); + EXPECT_FALSE(matrix[3][2].has_value()); // Row 4. 
- EXPECT_FALSE(matrix[4][0].has_value()); - EXPECT_FALSE(matrix[4][1].has_value()); - EXPECT_FALSE(matrix[4][2].has_value()); + EXPECT_TRUE(matrix[4][0].has_value()); + EXPECT_EQ(-4.f, matrix[4][0].get_value()); + + // Row 5. + EXPECT_FALSE(matrix[5][0].has_value()); } } // namespace diff --git a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.cc b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.cc index 3b287b1dcfe..e7e0b568c6f 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.cc @@ -36,12 +36,14 @@ ExamplesIterable::ExamplesIterable( // Create sparse float column iterables and values. sparse_float_column_iterables_.reserve(sparse_float_feature_columns.size()); sparse_float_column_values_.reserve(sparse_float_feature_columns.size()); + sparse_float_dimensions_.reserve(sparse_float_feature_columns.size()); for (auto& sparse_float_column : sparse_float_feature_columns) { sparse_float_column_iterables_.emplace_back( sparse_float_column.indices().template matrix(), example_start, example_end); sparse_float_column_values_.emplace_back( sparse_float_column.values().template vec()); + sparse_float_dimensions_.push_back(sparse_float_column.shape()[1]); } // Create sparse int column iterables and values. 
@@ -74,6 +76,8 @@ Iterator::Iterator(ExamplesIterable* iter, int64 example_idx) example_.dense_float_features.resize( iter_->dense_float_column_values_.size()); example_.sparse_int_features.resize(iter_->sparse_int_column_values_.size()); + example_.sparse_float_features.resize( + iter_->sparse_float_column_values_.size()); } } // namespace utils diff --git a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h index 72b7486872e..5b33c815887 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h +++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable.h @@ -87,33 +87,51 @@ class ExamplesIterable { // Get sparse float values per column. auto& sparse_float_features = example_.sparse_float_features; - sparse_float_features.clear(); // Iterate through each sparse float feature column. for (size_t sparse_float_idx = 0; sparse_float_idx < iter_->sparse_float_column_iterables_.size(); ++sparse_float_idx) { + // Clear info from a previous instance. + sparse_float_features[sparse_float_idx].Clear(); + // Get range for values tensor. const auto& row_range = (*sparse_float_column_iterators_[sparse_float_idx]); DCHECK_EQ(example_idx_, row_range.example_idx); + // If the example has this feature column. if (row_range.start < row_range.end) { - // Retrieve original indices tensor. - const TTypes::ConstMatrix& indices = - iter_->sparse_float_column_iterables_[sparse_float_idx] - .sparse_indices(); + const int32 dimension = + iter_->sparse_float_dimensions_[sparse_float_idx]; + sparse_float_features[sparse_float_idx].SetDimension(dimension); + if (dimension <= 1) { + // single dimensional sparse feature column. + DCHECK_EQ(1, row_range.end - row_range.start); + sparse_float_features[sparse_float_idx].Add( + 0, iter_->sparse_float_column_values_[sparse_float_idx]( + row_range.start)); + } else { + // Retrieve original indices tensor. 
+ const TTypes::ConstMatrix& indices = + iter_->sparse_float_column_iterables_[sparse_float_idx] + .sparse_indices(); - // For each value. - for (int64 row_idx = row_range.start; row_idx < row_range.end; - ++row_idx) { - // Get the feature id for the feature column and the value. - const int32 feature_id = indices(row_idx, 1); - DCHECK_EQ(example_idx_, indices(row_idx, 0)); + sparse_float_features[sparse_float_idx].Reserve(row_range.end - + row_range.start); - // Save the value to our sparse matrix. - sparse_float_features.addElement( - sparse_float_idx, feature_id, - iter_->sparse_float_column_values_[sparse_float_idx](row_idx)); + // For each value. + for (int64 row_idx = row_range.start; row_idx < row_range.end; + ++row_idx) { + // Get the feature id for the feature column and the value. + const int32 feature_id = indices(row_idx, 1); + DCHECK_EQ(example_idx_, indices(row_idx, 0)); + + // Save the value to our sparse matrix. + sparse_float_features[sparse_float_idx].Add( + feature_id, + iter_->sparse_float_column_values_[sparse_float_idx]( + row_idx)); + } } } } @@ -173,6 +191,9 @@ class ExamplesIterable { // Sparse float column values. std::vector::ConstVec> sparse_float_column_values_; + // Dimensions for sparse float feature columns. + std::vector sparse_float_dimensions_; + // Sparse int column iterables. 
std::vector sparse_int_column_iterables_; diff --git a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc index 05c166edc61..d8a60886483 100644 --- a/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc +++ b/tensorflow/contrib/boosted_trees/lib/utils/examples_iterable_test.cc @@ -194,6 +194,7 @@ TEST_F(ExamplesIterableTest, Iterate) { {dense_float_tensor}, {sparse_float_tensor1, sparse_float_tensor2}, {sparse_int_tensor1, sparse_int_tensor2}, 0, 8); int64 example_idx = 0; + for (const auto& example : full_iterable) { validate_example_features(example_idx, example); ++example_idx;