// tensorflow_metadata/proto/v0/statistics.proto

// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

// Definitions for aggregated feature statistics for datasets.
// TODO(b/80075690): make a Javascript build rule for this.
// TODO(b/80075691): migrate Facets to use this.
syntax = "proto3";

package tensorflow.metadata.v0;

import "google/protobuf/any.proto";
import "tensorflow_metadata/proto/v0/derived_feature.proto";
import "tensorflow_metadata/proto/v0/path.proto";

option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;

// Copied from Facets feature_statistics.proto
// Must be kept binary-compatible with the original, until all usages
// are updated to use this version, or we write a proto-to-proto converter.

// A list of features statistics for different datasets. If you wish to compare
// different datasets using this list, then the DatasetFeatureStatistics
// entries should all contain the same list of features.
// LINT.IfChange
message DatasetFeatureStatisticsList {
  // The feature statistics, one entry per dataset.
  repeated DatasetFeatureStatistics datasets = 1;
}

// The feature statistics for a single dataset.
// Note: field numbers below are not in declaration order (4 is declared
// before 3). Do not renumber them; this file must stay binary-compatible with
// existing serialized data.
message DatasetFeatureStatistics {
  // The name of the dataset.
  string name = 1;
  // The number of examples in the dataset.
  uint64 num_examples = 2;

  // Only valid if the weight feature was specified.
  // Treats a missing weighted feature as zero.
  double weighted_num_examples = 4;
  // The feature statistics for the dataset, one entry per feature.
  repeated FeatureNameStatistics features = 3;

  // Cross feature statistics for the dataset, one entry per feature cross.
  repeated CrossFeatureStatistics cross_features = 5;
}

// Statistics for the cross of two features, identified by path_x and path_y.
// NextID: 8
// NOTE(review): the highest field number declared in this message is 5.
// Confirm whether 6 and 7 belonged to since-deleted fields (in which case
// they should be listed as `reserved`) or whether NextID should read 6.
message CrossFeatureStatistics {
  // The path of feature x.
  Path path_x = 1;
  // The path of feature y.
  Path path_y = 2;

  // Number of occurrences of this feature cross in the data. If any of
  // the features in the cross is missing, the example is ignored.
  uint64 count = 3;

  // The statistics for this cross. At most one member is set.
  oneof cross_stats {
    NumericCrossStatistics num_cross_stats = 4;
    CategoricalCrossStatistics categorical_cross_stats = 5;
  }
}

// Numeric statistics for a feature cross. Stored as the num_cross_stats member
// of CrossFeatureStatistics.cross_stats.
message NumericCrossStatistics {
  // Pearson product-moment correlation coefficient.
  float correlation = 1;
  // Standard covariance. E[(X-E[X])*(Y-E[Y])]
  float covariance = 2;
}

// Categorical statistics for a feature cross. Stored as the
// categorical_cross_stats member of CrossFeatureStatistics.cross_stats.
message CategoricalCrossStatistics {
  // Lift statistics for the cross.
  LiftStatistics lift = 1;
}

// Lift information for a feature cross, in unweighted and weighted form.
message LiftStatistics {
  // Lift information for each value of path_y. Lift is defined for each pair of
  // values (x,y) as P(path_y=y|path_x=x)/P(path_y=y).
  repeated LiftSeries lift_series = 1;
  // Weighted lift information for each value of path_y. Weighted lift is
  // defined for each pair of values (x,y) as P(path_y=y|path_x=x)/P(path_y=y)
  // where probabilities are computed over weighted example space.
  repeated LiftSeries weighted_lift_series = 2;
}

// Container for lift information for a specific y-value.
message LiftSeries {
  // A bucket for referring to binned numeric features.
  message Bucket {
    // The low value of the bucket, inclusive.
    double low_value = 1;
    // The high value of the bucket, exclusive (unless the high_value is
    // positive infinity).
    double high_value = 2;
  }

  // The particular value of path_y corresponding to this LiftSeries. Each
  // element in lift_values corresponds to the lift of a different x_value and
  // this specific y_value. At most one member of this oneof is set.
  oneof y_value {
    int32 y_int = 1;
    string y_string = 2;
    Bucket y_bucket = 3;
  }

  // The number of examples in which y_value appears, stored either as an
  // unweighted count or as a weighted count.
  oneof y_count_value {
    uint64 y_count = 4;
    double weighted_y_count = 5;
  }

  // A container for lift information about a specific value of path_x.
  message LiftValue {
    // The particular value of path_x that this LiftValue describes. At most
    // one member of this oneof is set.
    oneof x_value {
      int32 x_int = 1;
      string x_string = 2;
    }
    // P(path_y=y|path_x=x) / P(path_y=y) for x_value and the enclosing y_value.
    // In terms of concrete fields, this number represents:
    // (x_and_y_count / x_count) / (y_count / num_examples)
    double lift = 3;
    // The number of examples in which x_value appears, stored either as an
    // unweighted count or as a weighted count.
    oneof x_count_value {
      uint64 x_count = 4;
      double weighted_x_count = 5;
    }
    // The number of examples in which both x_value and y_value appear, stored
    // either as an unweighted count or as a weighted count.
    oneof x_and_y_count_value {
      uint64 x_and_y_count = 6;
      double weighted_x_and_y_count = 7;
    }
  }

  // The lifts for each path_x value and this y_value.
  repeated LiftValue lift_values = 6;
}

// The complete set of statistics for a given feature name for a dataset.
// NextID: 11
message FeatureNameStatistics {
  // The types supported by the feature statistics. When aggregating
  // tf.Examples, if the bytelist contains a string, it is recommended to encode
  // it here as STRING instead of BYTES in order to calculate string-specific
  // statistical measures.
  // Note: INT is the zero value, so it is also what an unset `type` field
  // reads as.
  enum Type {
    INT = 0;
    FLOAT = 1;
    STRING = 2;
    BYTES = 3;
    STRUCT = 4;
  }

  // One can identify a field either by the name (for simple fields), or by
  // a path (for structured fields). Note that:
  // name: "foo"
  // is equivalent to:
  // path: {step:"foo"}
  // Note: this oneof must be consistently either name or path across all
  // FeatureNameStatistics in one DatasetFeatureStatistics.
  oneof field_id {
    // The feature name.
    string name = 1;

    // The path of the feature.
    Path path = 8;
  }

  // The data type of the feature.
  Type type = 2;

  // The statistics of the values of the feature. At most one member is set.
  oneof stats {
    NumericStatistics num_stats = 3;
    StringStatistics string_stats = 4;
    BytesStatistics bytes_stats = 5;
    StructStatistics struct_stats = 7;
  }

  // Any custom statistics can be stored in this list.
  repeated CustomStatistic custom_stats = 6;

  // If set, indicates that this feature is derived for validation, and
  // stores metadata about its source.
  // Experimental and subject to change.
  DerivedFeatureSource validation_derived_source = 10;
  // Reserved field number; it must not be reused for a new field.
  reserved 9;
}

// Common weighted statistics for all feature types. Statistics counting number
// of values (i.e., avg_num_values and tot_num_values) include NaNs.
// If the weighted column is missing, then this counts as a weight of 1
// for that example. For nested features with N nested levels (N > 1), the
// statistics counting number of values will rely on the innermost level.
// NOTE(review): DatasetFeatureStatistics.weighted_num_examples documents a
// missing weight as zero, whereas this message documents it as 1. Confirm
// which behavior is intended for each.
message WeightedCommonStatistics {
  // Weighted number of examples not missing.
  double num_non_missing = 1;
  // Weighted number of examples missing.
  // Note that if the weighted column is zero, this does not count
  // as missing.
  double num_missing = 2;
  // Average number of values, weighted by the number of examples.
  // avg_num_values = tot_num_values / num_non_missing.
  double avg_num_values = 3;
  // The total number of values in this feature.
  double tot_num_values = 4;
}

// Stores the name and value of any custom statistic. The value can be a
// double, a string, a histogram, a rank histogram, or an arbitrary message
// packed in a google.protobuf.Any.
message CustomStatistic {
  // The name of the custom statistic.
  string name = 1;
  // The value of the custom statistic. At most one member is set.
  oneof val {
    double num = 2;
    string str = 3;
    Histogram histogram = 4;
    RankHistogram rank_histogram = 5;
    google.protobuf.Any any = 6;
  }
}

// Statistics for a numeric feature in a dataset. Stored as the num_stats
// member of FeatureNameStatistics.stats.
message NumericStatistics {
  // Statistics common to all feature types.
  CommonStatistics common_stats = 1;
  // The mean of the values.
  double mean = 2;
  // The standard deviation of the values.
  double std_dev = 3;
  // The number of values that equal 0.
  uint64 num_zeros = 4;
  // The minimum value.
  double min = 5;
  // The median value.
  double median = 6;
  // The maximum value.
  double max = 7;
  // The histogram(s) of the feature values.
  repeated Histogram histograms = 8;

  // Weighted statistics for the feature, if the values have weights.
  WeightedNumericStatistics weighted_numeric_stats = 9;
}

// Statistics for a string feature in a dataset. Stored as the string_stats
// member of FeatureNameStatistics.stats.
message StringStatistics {
  // Statistics common to all feature types.
  CommonStatistics common_stats = 1;
  // The number of unique values.
  uint64 unique = 2;

  // A value paired with the number of times it occurs. Also used by
  // WeightedStringStatistics.top_values.
  message FreqAndValue {
    // The value.
    string value = 2;

    // The number of times the value occurs. Stored as a double to be able to
    // handle weighted features.
    double frequency = 3;

    // Field numbers of deleted fields; they must not be reused.
    reserved 1;
  }
  // A sorted list of the most-frequent values and their frequencies, with
  // the most-frequent being first.
  repeated FreqAndValue top_values = 3;

  // The average length of the values.
  float avg_length = 4;

  // The rank histogram for the values of the feature.
  // The rank is a measure of how commonly the value is found in the
  // dataset. The most common value would have a rank of 1, with the second-most
  // common value having a rank of 2, and so on.
  RankHistogram rank_histogram = 5;

  // Weighted statistics for the feature, if the values have weights.
  WeightedStringStatistics weighted_string_stats = 6;

  // A vocabulary file, used for vocabularies too large to store in the proto
  // itself.  Note that the file may be relative to some context-dependent
  // directory.  E.g. in TFX the feature statistics will live in a PPP and
  // vocabulary file names will be relative to this PPP.
  string vocabulary_file = 7;

  // Counts the number of invalid utf8 strings present in leaf arrays for this
  // feature. Validation is only performed for byte- or string-like features
  // (those having type BYTES or STRING).
  uint64 invalid_utf8_count = 8;
}

// Statistics for a feature containing a NL domain.
// Note: field numbers below are not in declaration order (10, 11 and 9 are
// declared between 3 and 4). Do not renumber them.
message NaturalLanguageStatistics {
  // Fraction of feature input tokens considered in-vocab.
  double feature_coverage = 1;
  // Average token length of tokens used by the feature.
  double avg_token_length = 2;
  // Histogram containing the distribution of token lengths.
  Histogram token_length_histogram = 3;
  // Min / max sequence lengths.
  int64 min_sequence_length = 10;
  int64 max_sequence_length = 11;
  // Histogram containing the distribution of sequence lengths.
  Histogram sequence_length_histogram = 9;
  // Number of sequences which do not match the location constraint.
  int64 location_misses = 4;

  // Reported sequences that are sampled from the input and have small
  // avg_token_length, low feature coverage, or do not match the location
  // regex.
  repeated string reported_sequences = 5;

  // Statistics reported for a single token.
  message TokenStatistics {
    // Token for which the statistics are reported. At most one member is set.
    oneof token {
      string string_token = 1;
      int64 int_token = 2;
    }

    // The number of times the value occurs. Stored as a double to be able to
    // handle weighted features.
    double frequency = 3;

    // Fraction of sequences containing the token.
    double fraction_of_sequences = 4;
    // Min number of token occurrences within a sequence.
    double per_sequence_min_frequency = 5;
    // Average number of token occurrences within a sequence.
    double per_sequence_avg_frequency = 6;
    // Maximum number of token occurrences within a sequence.
    double per_sequence_max_frequency = 7;
    // Token positions within a sequence. Normalized by sequence length.
    // (e.g. a token that occurs in position 0.5 occurs in the middle of
    // a sequence).
    Histogram positions = 8;
  }

  // Statistics for specified tokens. TokenStatistics are only reported for
  // tokens specified in SequenceValueConstraints in the schema.
  repeated TokenStatistics token_statistics = 6;

  // The rank histogram for the tokens of the feature.
  // The rank is a measure of how commonly the token is found in the
  // dataset. The most common token would have a rank of 1, with the second-most
  // common value having a rank of 2, and so on.
  RankHistogram rank_histogram = 7;
  // Statistics for the weighted version of this feature.
  WeightedNaturalLanguageStatistics weighted_nl_statistics = 8;
}

// Statistics for a weighted numeric feature in a dataset. Referenced from
// NumericStatistics.weighted_numeric_stats.
message WeightedNumericStatistics {
  // The weighted mean of the values.
  double mean = 1;
  // The weighted standard deviation of the values.
  double std_dev = 2;
  // The weighted median of the values.
  double median = 3;

  // The histogram(s) of the weighted feature values.
  repeated Histogram histograms = 4;
}

// Statistics for a weighted string feature in a dataset. Referenced from
// StringStatistics.weighted_string_stats.
message WeightedStringStatistics {
  // A sorted list of the most-frequent values and their weighted frequencies,
  // with the most-frequent being first. Reuses the FreqAndValue message nested
  // in StringStatistics.
  repeated StringStatistics.FreqAndValue top_values = 1;

  // The rank histogram for the weighted values of the feature.
  RankHistogram rank_histogram = 2;
}

// Statistics for a weighted feature with an NL domain. Referenced from
// NaturalLanguageStatistics.weighted_nl_statistics.
// Note: field number 9 is declared between 3 and 4. Do not renumber.
message WeightedNaturalLanguageStatistics {
  // Weighted feature coverage.
  double feature_coverage = 1;
  // Weighted average token length.
  double avg_token_length = 2;
  // Histogram containing the distribution of token lengths.
  Histogram token_length_histogram = 3;
  // Histogram containing the distribution of sequence lengths.
  Histogram sequence_length_histogram = 9;
  // Weighted number of sequences that do not match the location constraint.
  double location_misses = 4;
  // Per-token weighted statistics.
  // NOTE(review): this field is singular, whereas
  // NaturalLanguageStatistics.token_statistics is repeated. Confirm that a
  // single TokenStatistics entry is what is intended here.
  NaturalLanguageStatistics.TokenStatistics token_statistics = 5;
  // The rank histogram with the weighted tokens for the feature.
  RankHistogram rank_histogram = 6;
}

// Statistics for a bytes feature in a dataset. Stored as the bytes_stats
// member of FeatureNameStatistics.stats.
message BytesStatistics {
  // Statistics common to all feature types.
  CommonStatistics common_stats = 1;
  // The number of unique values.
  uint64 unique = 2;

  // The average number of bytes in a value.
  float avg_num_bytes = 3;
  // The minimum number of bytes in a value.
  float min_num_bytes = 4;
  // The maximum number of bytes in a value. See max_num_bytes_int for the
  // same quantity stored without float precision loss.
  float max_num_bytes = 5;
  // The maximum number of bytes in a value, as an int. Float will start having
  // a loss of precision for a large enough integer. This field preserves the
  // precision.
  int64 max_num_bytes_int = 6;
}

// Statistics for a struct feature in a dataset. Stored as the struct_stats
// member of FeatureNameStatistics.stats.
message StructStatistics {
  // Statistics common to all feature types.
  CommonStatistics common_stats = 1;
}

// Statistics about the presence and valency of feature values. Feature values
// could be nested lists. A feature in tf.Examples or other "flat" datasets has
// values of nest level 1 -- they are lists of primitives. A nest level N
// (N > 1) feature value is a list of lists of nest level (N - 1).
// This proto can be used to describe the presence and valency of values at each
// level.
message PresenceAndValencyStatistics {
  // Note: missing and non-missing counts are conditioned on the upper level
  // being non-missing (i.e. if the upper level is missing/null, all the levels
  // nested below are by definition missing, but not counted).
  // Number of non-missing (not-null) values.
  uint64 num_non_missing = 1;
  // Number of missing (null) values.
  uint64 num_missing = 2;
  // Minimum length of the values (note that nulls are not considered).
  uint64 min_num_values = 3;
  // Maximum length of the values.
  uint64 max_num_values = 4;
  // Total number of values.
  uint64 tot_num_values = 5;
}

// Common statistics for all feature types. Statistics counting number of values
// (i.e., min_num_values, max_num_values, avg_num_values, and tot_num_values)
// include NaNs. For nested features with N nested levels (N > 1), the
// statistics counting number of values will rely on the innermost level.
// Note: field numbers below are not in declaration order (8 is declared
// between 5 and 6). Do not renumber them.
message CommonStatistics {
  // The number of examples that include this feature. Note that this includes
  // examples that contain this feature with an explicitly empty list of values,
  // which may be permitted for variable length features.
  uint64 num_non_missing = 1;
  // The number of examples missing this feature.
  uint64 num_missing = 2;
  // The minimum number of values in a single example for this feature.
  uint64 min_num_values = 3;
  // The maximum number of values in a single example for this feature.
  uint64 max_num_values = 4;
  // The average number of values in a single example for this feature.
  // avg_num_values = tot_num_values / num_non_missing.
  float avg_num_values = 5;
  // The total number of values in this feature.
  uint64 tot_num_values = 8;
  // The quantiles histogram for the number of values in this feature.
  Histogram num_values_histogram = 6;
  // The weighted version of the common statistics.
  WeightedCommonStatistics weighted_common_stats = 7;
  // The histogram for the number of features in the feature list (only set if
  // this feature is a non-context feature from a tf.SequenceExample).
  // This is different from num_values_histogram, as num_values_histogram tracks
  // the count of all values for a feature in an example, whereas this tracks
  // the length of the feature list for this feature in an example (where each
  // feature list can contain multiple values).
  Histogram feature_list_length_histogram = 9;

  // Contains presence and valency stats for each nest level of the feature.
  // The first item corresponds to the outermost level, and by definition,
  // the stats it contains are equal to the corresponding stats defined above.
  // May not be populated if the feature is of nest level 1.
  repeated PresenceAndValencyStatistics presence_and_valency_stats = 10;

  // If not empty, it's parallel to presence_and_valency_stats.
  repeated WeightedCommonStatistics weighted_presence_and_valency_stats = 11;
}

// The data used to create a histogram of a numeric feature for a dataset.
message Histogram {
  // Each bucket defines its low and high values along with its count. The
  // low and high values must each be a real number or positive or negative
  // infinity. They cannot be NaN or undefined. Counts of those special values
  // can be found in the num_nan and num_undefined fields of the enclosing
  // Histogram.
  message Bucket {
    // The low value of the bucket, exclusive except for the first bucket.
    double low_value = 1;
    // The high value of the bucket, inclusive.
    double high_value = 2;

    // The number of items in the bucket. Stored as a double to be able to
    // handle weighted histograms.
    double sample_count = 4;

    // Field numbers of deleted fields; they must not be reused.
    reserved 3;
  }

  // The number of NaN values in the dataset.
  uint64 num_nan = 1;
  // The number of undefined values in the dataset.
  uint64 num_undefined = 2;

  // A list of buckets in the histogram, sorted from lowest bucket to highest
  // bucket.
  repeated Bucket buckets = 3;

  // The type of the histogram. A standard histogram has equal-width buckets.
  // The quantiles type is used for when the histogram message is used to store
  // quantile information (by using approximately equal-count buckets with
  // variable widths).
  enum HistogramType {
    // Equal-width buckets. As the zero value, this is also what an unset
    // `type` field reads as.
    STANDARD = 0;
    // Approximately equal-count buckets with variable widths.
    QUANTILES = 1;
  }

  // The type of the histogram.
  HistogramType type = 4;

  // An optional descriptive name of the histogram, to be used for labeling.
  string name = 5;
}

// The data used to create a rank histogram of a non-numeric feature of a
// dataset. The rank of a value in a feature can be used as a measure of how
// commonly the value is found in the entire dataset. With bucket sizes of one,
// this becomes a distribution function of all feature values.
message RankHistogram {
  // Each bucket defines its start and end ranks along with its count.
  message Bucket {
    // The low rank of the bucket, inclusive.
    uint64 low_rank = 1;
    // The high rank of the bucket, exclusive.
    uint64 high_rank = 2;

    // The label for the bucket. Can be used to list or summarize the values in
    // this rank bucket.
    string label = 4;

    // The number of items in the bucket. Stored as a double to be able to
    // handle weighted histograms.
    double sample_count = 5;

    // Field numbers of deleted fields; they must not be reused.
    reserved 3;
  }

  // A list of buckets in the histogram, sorted from lowest-ranked bucket to
  // highest-ranked bucket.
  repeated Bucket buckets = 1;

  // An optional descriptive name of the histogram, to be used for labeling.
  string name = 2;
}
// LINT.ThenChange(//tfx_bsl/cc/statistics/merge_util.cc)