-
Notifications
You must be signed in to change notification settings - Fork 51
/
Copy pathstatistics.proto
559 lines (486 loc) · 19.9 KB
/
statistics.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Definitions for aggregated feature statistics for datasets.
// TODO(b/80075690): make a Javascript build rule for this.
// TODO(b/80075691): migrate Facets to use this.
syntax = "proto3";
package tensorflow.metadata.v0;
import "google/protobuf/any.proto";
import "tensorflow_metadata/proto/v0/derived_feature.proto";
import "tensorflow_metadata/proto/v0/path.proto";
option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;
// Copied from Facets feature_statistics.proto
// Must be kept binary-compatible with the original, until all usages
// are updated to use this version, or we write a proto-to-proto converter.
// A list of feature statistics for different datasets. If you wish to compare
// different datasets using this list, then the DatasetFeatureStatistics
// entries should all contain the same list of features.
// LINT.IfChange
message DatasetFeatureStatisticsList {
  // Feature statistics, one entry per dataset.
  repeated DatasetFeatureStatistics datasets = 1;
}
// Feature statistics computed over one dataset.
message DatasetFeatureStatistics {
  // Name of the dataset.
  string name = 1;

  // Number of examples the dataset contains.
  uint64 num_examples = 2;

  // Only valid when the weight feature was specified. A missing weighted
  // feature is treated as zero.
  double weighted_num_examples = 4;

  // Per-feature statistics for the dataset.
  repeated FeatureNameStatistics features = 3;

  // Cross-feature statistics for the dataset.
  repeated CrossFeatureStatistics cross_features = 5;
}
// Statistics for the cross of two features, identified by path_x and path_y.
// NextID: 8
// NOTE(review): the highest field number declared below is 5; confirm that
// 6 and 7 are skipped on purpose (and reserve them if they were deleted).
message CrossFeatureStatistics {
  // Path of feature x.
  Path path_x = 1;

  // Path of feature y.
  Path path_y = 2;

  // How many times this feature cross occurs in the data. An example is
  // ignored when any of the features in the cross is missing from it.
  uint64 count = 3;

  // Statistics for the cross; the populated member depends on the kind of
  // cross (numeric or categorical).
  oneof cross_stats {
    NumericCrossStatistics num_cross_stats = 4;
    CategoricalCrossStatistics categorical_cross_stats = 5;
  }
}
// Numeric statistics for a feature cross (see
// CrossFeatureStatistics.num_cross_stats).
message NumericCrossStatistics {
  // Pearson product-moment correlation coefficient.
  float correlation = 1;

  // Standard covariance: E[(X - E[X]) * (Y - E[Y])].
  float covariance = 2;
}
// Categorical statistics for a feature cross (see
// CrossFeatureStatistics.categorical_cross_stats).
message CategoricalCrossStatistics {
  // Lift statistics for the cross.
  LiftStatistics lift = 1;
}
// Unweighted and weighted lift information for a feature cross.
message LiftStatistics {
  // Lift information, one series per value of path_y. For each pair of
  // values (x, y), lift is defined as P(path_y=y|path_x=x) / P(path_y=y).
  repeated LiftSeries lift_series = 1;

  // Weighted lift information, one series per value of path_y. For each pair
  // of values (x, y), weighted lift is defined as
  // P(path_y=y|path_x=x) / P(path_y=y), with the probabilities computed over
  // the weighted example space.
  repeated LiftSeries weighted_lift_series = 2;
}
// Holds the lift information associated with one specific y-value.
message LiftSeries {
  // Refers to a bin of a binned numeric feature.
  message Bucket {
    // Lower bound of the bucket (inclusive).
    double low_value = 1;

    // Upper bound of the bucket (exclusive, unless high_value is positive
    // infinity).
    double high_value = 2;
  }

  // Lift information about one specific value of path_x.
  message LiftValue {
    // The value of path_x this entry describes.
    oneof x_value {
      int32 x_int = 1;
      string x_string = 2;
    }

    // P(path_y=y|path_x=x) / P(path_y=y) for x_value and the enclosing
    // y_value. Expressed with concrete fields, this number is:
    //   (x_and_y_count / x_count) / (y_count / num_examples)
    double lift = 3;

    // Number of examples in which x_value appears.
    oneof x_count_value {
      uint64 x_count = 4;
      double weighted_x_count = 5;
    }

    // Number of examples in which both x_value and y_value appear.
    oneof x_and_y_count_value {
      uint64 x_and_y_count = 6;
      double weighted_x_and_y_count = 7;
    }
  }

  // The value of path_y that this LiftSeries corresponds to. Every element
  // of lift_values gives the lift for a different x_value paired with this
  // specific y_value.
  oneof y_value {
    int32 y_int = 1;
    string y_string = 2;
    Bucket y_bucket = 3;
  }

  // Number of examples in which y_value appears.
  oneof y_count_value {
    uint64 y_count = 4;
    double weighted_y_count = 5;
  }

  // The lifts for each path_x value and this y_value.
  repeated LiftValue lift_values = 6;
}
// All statistics computed for one feature name within a dataset.
// NextID: 11
message FeatureNameStatistics {
  // Types supported by the feature statistics. When aggregating tf.Examples,
  // a bytelist that contains a string should preferably be recorded here as
  // STRING rather than BYTES, so that string-specific statistical measures
  // can be calculated.
  enum Type {
    INT = 0;
    FLOAT = 1;
    STRING = 2;
    BYTES = 3;
    STRUCT = 4;
  }

  // A field is identified either by its name (simple fields) or by a path
  // (structured fields). Note that:
  //   name: "foo"
  // is equivalent to:
  //   path: {step:"foo"}
  // Note: within one DatasetFeatureStatistics, all FeatureNameStatistics must
  // consistently use the same member of this oneof (either name or path).
  oneof field_id {
    // Name of the feature.
    string name = 1;
    // Path of the feature.
    Path path = 8;
  }

  // Data type of the feature.
  Type type = 2;

  // Statistics of the feature's values.
  oneof stats {
    NumericStatistics num_stats = 3;
    StringStatistics string_stats = 4;
    BytesStatistics bytes_stats = 5;
    StructStatistics struct_stats = 7;
  }

  // Holds any custom statistics.
  repeated CustomStatistic custom_stats = 6;

  // If set, indicates that this feature is derived for validation, and
  // stores metadata about its source.
  // Experimental and subject to change.
  DerivedFeatureSource validation_derived_source = 10;

  reserved 9;
}
// Weighted statistics shared by all feature types. The statistics that count
// values (i.e., avg_num_values and tot_num_values) include NaNs.
// An example whose weighted column is missing counts with a weight of 1.
// For nested features with N nested levels (N > 1), the statistics that count
// values rely on the innermost level.
message WeightedCommonStatistics {
  // Weighted number of examples that are not missing.
  double num_non_missing = 1;

  // Weighted number of examples that are missing.
  // Note that a weighted column equal to zero does not count as missing.
  double num_missing = 2;

  // Average number of values, weighted by the number of examples.
  // avg_num_values = tot_num_values / num_non_missing.
  double avg_num_values = 3;

  // Total number of values in this feature.
  double tot_num_values = 4;
}
// Name and value of a custom statistic. The value is one of: a double, a
// string, a histogram, a rank histogram, or an arbitrary message packed in a
// google.protobuf.Any.
message CustomStatistic {
  // Name of the statistic.
  string name = 1;

  // Value of the statistic.
  oneof val {
    double num = 2;
    string str = 3;
    Histogram histogram = 4;
    RankHistogram rank_histogram = 5;
    google.protobuf.Any any = 6;
  }
}
// Statistics of a numeric feature within a dataset.
message NumericStatistics {
  // Statistics common to all feature types.
  CommonStatistics common_stats = 1;

  // Mean of the values.
  double mean = 2;

  // Standard deviation of the values.
  double std_dev = 3;

  // Number of values equal to 0.
  uint64 num_zeros = 4;

  // Minimum value.
  double min = 5;

  // Median value.
  double median = 6;

  // Maximum value.
  double max = 7;

  // Histogram(s) of the feature values.
  repeated Histogram histograms = 8;

  // Weighted statistics for the feature, if the values have weights.
  WeightedNumericStatistics weighted_numeric_stats = 9;
}
// Statistics of a string feature within a dataset.
message StringStatistics {
  // A value paired with how often it occurs.
  message FreqAndValue {
    // Deleted fields.
    reserved 1;

    // The value.
    string value = 2;

    // Number of times the value occurs. Stored as a double so that weighted
    // features can be handled.
    double frequency = 3;
  }

  // Statistics common to all feature types.
  CommonStatistics common_stats = 1;

  // Number of unique values.
  uint64 unique = 2;

  // The most-frequent values and their frequencies, sorted with the
  // most-frequent first.
  repeated FreqAndValue top_values = 3;

  // Average length of the values.
  float avg_length = 4;

  // Rank histogram for the values of the feature.
  // The rank is used as a measure of how commonly the value is found in the
  // dataset. The most common value would have a rank of 1, the second-most
  // common value a rank of 2, and so on.
  RankHistogram rank_histogram = 5;

  // Weighted statistics for the feature, if the values have weights.
  WeightedStringStatistics weighted_string_stats = 6;

  // A vocabulary file, used for vocabularies too large to store in the proto
  // itself. Note that the file may be relative to some context-dependent
  // directory. E.g. in TFX the feature statistics will live in a PPP and
  // vocabulary file names will be relative to this PPP.
  string vocabulary_file = 7;

  // Number of invalid utf8 strings present in leaf arrays for this feature.
  // Validation is only performed for byte- or string-like features (those
  // having type BYTES or STRING).
  uint64 invalid_utf8_count = 8;
}
// Statistics of a feature that has an NL (natural language) domain.
message NaturalLanguageStatistics {
  // Statistics reported for one token.
  message TokenStatistics {
    // The token the statistics are reported for.
    oneof token {
      string string_token = 1;
      int64 int_token = 2;
    }

    // Number of times the value occurs. Stored as a double so that weighted
    // features can be handled.
    double frequency = 3;

    // Fraction of sequences that contain the token.
    double fraction_of_sequences = 4;

    // Minimum number of token occurrences within a sequence.
    double per_sequence_min_frequency = 5;

    // Average number of token occurrences within a sequence.
    double per_sequence_avg_frequency = 6;

    // Maximum number of token occurrences within a sequence.
    double per_sequence_max_frequency = 7;

    // Token positions within a sequence, normalized by sequence length
    // (e.g. a token that occurs in position 0.5 occurs in the middle of a
    // sequence).
    Histogram positions = 8;
  }

  // Fraction of feature input tokens considered in-vocab.
  double feature_coverage = 1;

  // Average token length of the tokens used by the feature.
  double avg_token_length = 2;

  // Histogram of the distribution of token lengths.
  Histogram token_length_histogram = 3;

  // Minimum sequence length.
  int64 min_sequence_length = 10;

  // Maximum sequence length.
  int64 max_sequence_length = 11;

  // Histogram of the distribution of sequence lengths.
  Histogram sequence_length_histogram = 9;

  // Number of sequences which do not match the location constraint.
  int64 location_misses = 4;

  // Reported sequences that are sampled from the input and have small
  // avg_token_length, low feature coverage, or do not match the location
  // regex.
  repeated string reported_sequences = 5;

  // Statistics for specified tokens. TokenStatistics are only reported for
  // tokens specified in SequenceValueConstraints in the schema.
  repeated TokenStatistics token_statistics = 6;

  // Rank histogram for the tokens of the feature.
  // The rank is used as a measure of how commonly the token is found in the
  // dataset. The most common token would have a rank of 1, the second-most
  // common value a rank of 2, and so on.
  RankHistogram rank_histogram = 7;

  // Weighted counterpart of these statistics.
  WeightedNaturalLanguageStatistics weighted_nl_statistics = 8;
}
// Statistics of a weighted numeric feature within a dataset.
message WeightedNumericStatistics {
  // Weighted mean of the values.
  double mean = 1;

  // Weighted standard deviation of the values.
  double std_dev = 2;

  // Weighted median of the values.
  double median = 3;

  // Histogram(s) of the weighted feature values.
  repeated Histogram histograms = 4;
}
// Statistics of a weighted string feature within a dataset.
message WeightedStringStatistics {
  // The most-frequent values and their weighted frequencies, sorted with the
  // most-frequent first.
  repeated StringStatistics.FreqAndValue top_values = 1;

  // Rank histogram for the weighted values of the feature.
  RankHistogram rank_histogram = 2;
}
// Statistics of a weighted feature that has an NL domain.
message WeightedNaturalLanguageStatistics {
  // Weighted feature coverage.
  double feature_coverage = 1;

  // Weighted average token length.
  double avg_token_length = 2;

  // Histogram of the distribution of token lengths.
  Histogram token_length_histogram = 3;

  // Histogram of the distribution of sequence lengths.
  Histogram sequence_length_histogram = 9;

  // Weighted number of sequences that do not match the location constraint.
  double location_misses = 4;

  // Per-token weighted statistics.
  // NOTE(review): this field is singular, whereas
  // NaturalLanguageStatistics.token_statistics is repeated -- confirm that a
  // single entry is intended here.
  NaturalLanguageStatistics.TokenStatistics token_statistics = 5;

  // Rank histogram with the weighted tokens for the feature.
  RankHistogram rank_histogram = 6;
}
// Statistics of a bytes feature within a dataset.
message BytesStatistics {
  // Statistics common to all feature types.
  CommonStatistics common_stats = 1;

  // Number of unique values.
  uint64 unique = 2;

  // Average number of bytes in a value.
  float avg_num_bytes = 3;

  // Minimum number of bytes in a value.
  float min_num_bytes = 4;

  // Maximum number of bytes in a value.
  float max_num_bytes = 5;

  // Maximum number of bytes in a value, as an int. A float starts losing
  // precision for a large enough integer; this field preserves the
  // precision.
  int64 max_num_bytes_int = 6;
}
// Statistics of a struct feature (see FeatureNameStatistics.struct_stats);
// only the common statistics are recorded.
message StructStatistics {
  // Statistics common to all feature types.
  CommonStatistics common_stats = 1;
}
// Describes the presence and valency of feature values. Feature values may be
// nested lists. A feature in tf.Examples or other "flat" datasets has values
// of nest level 1 -- they are lists of primitives. A feature value of nest
// level N (N > 1) is a list of lists of nest level (N - 1).
// This proto can describe the presence and valency of values at each level.
message PresenceAndValencyStatistics {
  // Note: missing and non-missing counts are conditioned on the upper level
  // being non-missing (i.e. if the upper level is missing/null, all the
  // levels nested below are by definition missing, but not counted).

  // Number of non-missing (not-null) values.
  uint64 num_non_missing = 1;

  // Number of missing (null) values.
  uint64 num_missing = 2;

  // Minimum length of the values (note that nulls are not considered).
  uint64 min_num_values = 3;

  // Maximum length of the values.
  uint64 max_num_values = 4;

  // Total number of values.
  uint64 tot_num_values = 5;
}
// Statistics shared by all feature types. The statistics that count values
// (i.e., min_num_values, max_num_values, avg_num_values, and tot_num_values)
// include NaNs. For nested features with N nested levels (N > 1), the
// statistics that count values rely on the innermost level.
message CommonStatistics {
  // Number of examples that include this feature. Note that this includes
  // examples that contain this feature with an explicitly empty list of
  // values, which may be permitted for variable length features.
  uint64 num_non_missing = 1;

  // Number of examples missing this feature.
  uint64 num_missing = 2;

  // Minimum number of values in a single example for this feature.
  uint64 min_num_values = 3;

  // Maximum number of values in a single example for this feature.
  uint64 max_num_values = 4;

  // Average number of values in a single example for this feature.
  // avg_num_values = tot_num_values / num_non_missing.
  float avg_num_values = 5;

  // Total number of values in this feature.
  uint64 tot_num_values = 8;

  // Quantiles histogram for the number of values in this feature.
  Histogram num_values_histogram = 6;

  // Weighted counterpart of the common statistics.
  WeightedCommonStatistics weighted_common_stats = 7;

  // Histogram for the number of features in the feature list (only set if
  // this feature is a non-context feature from a tf.SequenceExample).
  // This differs from num_values_histogram: num_values_histogram tracks the
  // count of all values for a feature in an example, whereas this tracks the
  // length of the feature list for this feature in an example (where each
  // feature list can contain multiple values).
  Histogram feature_list_length_histogram = 9;

  // Presence and valency stats for each nest level of the feature.
  // The first item corresponds to the outermost level, and by definition,
  // the stats it contains equal the corresponding stats defined above.
  // May not be populated if the feature is of nest level 1.
  repeated PresenceAndValencyStatistics presence_and_valency_stats = 10;

  // If not empty, it's parallel to presence_and_valency_stats.
  repeated WeightedCommonStatistics weighted_presence_and_valency_stats = 11;
}
// Data from which a histogram of a numeric feature of a dataset is built.
message Histogram {
  // A bucket records its low and high values together with its count. The
  // low and high values must be a real number or positive or negative
  // infinity. They cannot be NaN or undefined. Counts of those special values
  // can be found in the num_nan and num_undefined fields.
  message Bucket {
    // Deleted fields.
    reserved 3;

    // Low value of the bucket, exclusive except for the first bucket.
    double low_value = 1;

    // High value of the bucket, inclusive.
    double high_value = 2;

    // Number of items in the bucket. Stored as a double so that weighted
    // histograms can be handled.
    double sample_count = 4;
  }

  // Kind of histogram. A standard histogram has equal-width buckets.
  // The quantiles type is for when the histogram message is used to store
  // quantile information (by using approximately equal-count buckets with
  // variable widths).
  enum HistogramType {
    STANDARD = 0;
    QUANTILES = 1;
  }

  // Number of NaN values in the dataset.
  uint64 num_nan = 1;

  // Number of undefined values in the dataset.
  uint64 num_undefined = 2;

  // The buckets of the histogram, sorted from lowest bucket to highest
  // bucket.
  repeated Bucket buckets = 3;

  // The type of the histogram.
  HistogramType type = 4;

  // An optional descriptive name of the histogram, to be used for labeling.
  string name = 5;
}
// Data from which a rank histogram of a non-numeric feature of a dataset is
// built. The rank of a value in a feature can serve as a measure of how
// commonly the value is found in the entire dataset. With bucket sizes of
// one, this becomes a distribution function of all feature values.
message RankHistogram {
  // A bucket records its start and end ranks together with its count.
  message Bucket {
    // Deleted fields.
    reserved 3;

    // Low rank of the bucket, inclusive.
    uint64 low_rank = 1;

    // High rank of the bucket, exclusive.
    uint64 high_rank = 2;

    // Label for the bucket. Can be used to list or summarize the values in
    // this rank bucket.
    string label = 4;

    // Number of items in the bucket. Stored as a double so that weighted
    // histograms can be handled.
    double sample_count = 5;
  }

  // The buckets of the histogram, sorted from lowest-ranked bucket to
  // highest-ranked bucket.
  repeated Bucket buckets = 1;

  // An optional descriptive name of the histogram, to be used for labeling.
  string name = 2;
}
// LINT.ThenChange(//tfx_bsl/cc/statistics/merge_util.cc)