// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// This schema is written in proto2 syntax.
syntax = "proto2";
// Generate code that targets the lite protobuf runtime.
option optimize_for = LITE_RUNTIME;
// Every definition in this file lives in this package.
package segmentation_platform.proto;
import "components/segmentation_platform/public/proto/aggregation.proto";
import "components/segmentation_platform/public/proto/output_config.proto";
import "components/segmentation_platform/public/proto/types.proto";
// Metadata format version understood by the current version of the code. It is
// used to verify whether the metadata provided by the server is supported.
// Bump the number for any new feature added to the metadata proto, and record
// the change in the log below.
//
// Version log:
//   0: UMA features and aggregation in the |features| field.
//   1: UMA features, custom inputs and sql features in the |input_features|
//      field.
//   2: Training data output collection in the |training_outputs| field.
//   3: Trigger configurations for training data collection.
enum CurrentVersion {
  METADATA_VERSION = 3;
}
// Metadata version numbers attached to segmentation models.
message VersionInfo {
  // Set on the server and read by the client to verify that the model is
  // valid: the minimum model metadata version that is supported. Some newer
  // features/fields might not be available before this version.
  optional int32 metadata_min_version = 1;

  // Set by the client while sending a model download request to the
  // optimization guide server, so that the server knows the capabilities of
  // the client: the current model metadata version.
  optional int32 metadata_cur_version = 2;
}
// Identifies where a model comes from: the client side or the server side.
enum ModelSource {
  UNKNOWN_MODEL_SOURCE = 0;

  // Represents a server side model.
  SERVER_MODEL_SOURCE = 1;

  // Represents a client side model.
  DEFAULT_MODEL_SOURCE = 2;
}
// Describes one UMA signal (a histogram or a user action) and the aggregation
// applied to it.
message UMAFeature {
  // Kind of signal this feature refers to.
  // Note: SignalType::UKM_EVENT is only used for SignalStorageConfig and
  // should not be used as a UMA feature's signal type.
  optional SignalType type = 1;

  // Human readable name of the histogram or user action.
  optional string name = 2;

  // Hash of the histogram name or user action. Must match the result of
  // base::HashMetricName.
  optional fixed64 name_hash = 3;

  // Number of buckets to include in the result. Setting this to 0 collects no
  // data, which can be used to start storing data before it should be used.
  // See documentation for Aggregation for details.
  optional uint64 bucket_count = 4;

  // Required length of the calculated result. See documentation for
  // Aggregation for details.
  optional uint64 tensor_length = 5;

  // Type of aggregation to use for this particular feature.
  optional Aggregation aggregation = 6;

  // Only set if type == HISTOGRAM_ENUM.
  // A match is only valid when the enum ID matches any of these values. The
  // list works like an OR condition, e.g.: [url, search, ...] or just [url].
  repeated int32 enum_ids = 7;

  // Only set if aggregation == LATEST_OR_DEFAULT.
  // Value used for the model if the latest value requested is not available
  // in the database. The number of entries should be equal to the
  // tensor_length.
  repeated float default_values = 8;
}
// An input whose value is produced by a custom function (see FillPolicy)
// rather than read from a UMA signal.
message CustomInput {
  // This parameter is required. Its meaning depends on how the input is used:
  //   1. Used directly as the input tensor field to the model: the number of
  //      columns to fill in the tensor. In this case the value should be
  //      float.
  //   2. Used as a bind value for sql features: the number of sql bindings to
  //      fill in the sql query.
  optional int32 tensor_length = 1;

  // Used to distinguish between different types of custom inputs. Each policy
  // names a custom function provided by the engine that fills in the input
  // feature to the model.
  enum FillPolicy {
    UNKNOWN_FILL_POLICY = 0;

    // Output is the time at which model prediction is needed. Can be used to
    // bind TIME type param to queries.
    //   Output type: Time
    //   Output length: 1
    FILL_PREDICTION_TIME = 1;

    // Output is two timestamps, the beginning and the end of last x days. Can
    // be used to bind TIME type param to query within a time interval.
    //   Output type: Time
    //   Output length: 2
    //   Additional arg:
    //     `bucket_count`: Required. Number of buckets to include in the
    //     result.
    TIME_RANGE_BEFORE_PREDICTION = 2;

    // Used to determine whether a given page is a product details page and
    // can be price tracked.
    PRICE_TRACKING_HINTS = 3;

    // This type of custom input is used directly to fill the input tensor to
    // the model or to another query.
    //   Output type: ProcessedValue
    //   Output length: 1
    //   Additional arg:
    //     `name`: Optional. The name of the field to be looked up in input
    //     context. If missing then the |name| field is used.
    FILL_FROM_INPUT_CONTEXT = 4;

    // Output is a tensor of length 10 consisting of float values denoting
    // various devices count by type with different form factor and os type.
    // See `SyncDeviceInfoObserver` for description of each value.
    //   Output type: float
    //   Output length: 10
    //   Additional arg:
    //     `wait_for_device_info_in_seconds`: Number of seconds to wait for
    //     sync device info before timeout. If 0, then does not wait for sync
    //     and times out immediately if device info is not available.
    //   InputContext arg:
    //     `active_days_limit`: Number of days after which the device is
    //     considered not active after last sync. Must be INT.
    FILL_SYNC_DEVICE_INFO = 5;

    // Output is a tensor of length 1 consisting of the device RAM in MB.
    //   Output type: float
    //   Output length: 1
    FILL_DEVICE_RAM_MB = 6;

    // Output is a tensor of length 1 describing the device OS level.
    //   Output type: float
    //   Output length: 1
    FILL_DEVICE_OS_VERSION_NUMBER = 7;

    // Output is a tensor of length 1 giving pixels per inch for the current
    // device used by the user.
    //   Output type: float
    //   Output length: 1
    FILL_DEVICE_PPI = 8;

    // Fills metrics about a given tab. A `tab_id` and `session_tag` are
    // expected from input_context.
    //   Output type: float
    //   Output length: `TabSessionSource::kNumInputs`
    FILL_TAB_METRICS = 9;

    // Fills a random number between [0, 1).
    //   Output type: float
    //   Output length: 1
    FILL_RANDOM = 10;

    // Fills various metrics from the shopping service. Currently only
    // supports shopping bookmark count.
    //   Output type: float
    //   Output length: 1
    FILL_FROM_SHOPPING_SERVICE = 11;
  }

  // The fill type of the custom input.
  optional FillPolicy fill_policy = 2;

  // Fallback used when the current chrome version does not support this fill
  // type. If this is not specified and the function is unavailable, the model
  // will not run due to missing input. The number of entries should be equal
  // to the |tensor_length|.
  repeated float default_value = 3;

  // Additional arguments, for fill types that need them.
  map<string, string> additional_args = 4;

  // The human readable name of the custom input.
  optional string name = 5;
}
// Configuration for storing signals in the SQL database.
message SignalFilterConfig {
  // Defines a single UKM event that should be stored.
  message UkmEvent {
    // Event hash of the UKM event.
    optional uint64 event_hash = 1;

    // Metric hashes of the event that should be stored in the database.
    // Providing the list of necessary metrics is required.
    // TODO: Support empty metric hash list, the database will store all the
    // metrics for the UKM event.
    repeated uint64 metric_hash_filter = 2;
  }

  // List of UKM events to store in the database.
  repeated UkmEvent ukm_events = 1;
}
// An input feature computed using a SQL query.
message SqlFeature {
  // The query to run. It should select a single float column, and can contain
  // '?' placeholders whose values are bound using the |bind_values| list.
  // TODO(ssid): Consider expanding this to return multiple input tensor
  // features.
  optional string sql = 1;

  // List of signals needed in the storage for the query.
  optional SignalFilterConfig signal_filter = 2;

  // Used to bind value for the SQL query.
  message BindValue {
    // The bind field numbers, in range of 0 to n-1, for n question marks in
    // the SQL query.
    repeated int32 bind_field_index = 1;

    // Used to call Bind*() in sql::Statement.
    enum ParamType {
      UNKNOWN = 0;
      NULL = 1;
      BOOL = 2;
      INT = 3;
      INT64 = 4;
      DOUBLE = 5;
      STRING = 6;
      TIME = 7;
    }
    optional ParamType param_type = 2;

    // Value of the input to bind the query. The custom function should return
    // the specified param type. The |tensor_length| should be 0 since these
    // inputs can only be used for SQL bind values.
    optional CustomInput value = 3;
  }
  repeated BindValue bind_values = 3;

  // The human readable name of the ukm event and metric.
  optional string name = 4;
}
// A single feature used as an input to the ML model. Exactly one of the
// alternatives in the oneof describes the feature.
message InputFeature {
  oneof Feature {
    // Input feature backed by an UMAFeature.
    UMAFeature uma_feature = 1;

    // Input feature backed by a custom input.
    CustomInput custom_input = 2;

    // Input feature computed using a SQL query.
    SqlFeature sql_feature = 3;
  }
}
// Contains a list of training output generators. The ML model pipeline can
// iterate on different output candidates and select the final output
// generator.
message TrainingOutputs {
  repeated TrainingOutput outputs = 1;

  // Config for triggering the training outputs data collection for the
  // current model.
  message TriggerConfig {
    // Describes how the training outputs are collected.
    enum DecisionType {
      // By default considered as PERIODIC type.
      UNKNOWN = 0;

      // The on demand scheduler will trigger training data collection when
      // the client asks for a model execution with input context.
      ONDEMAND = 1;

      // The periodic scheduler will trigger training data collection every
      // day. Currently this period is fixed on the client to 1 day.
      PERIODIC = 2;
    }
    optional DecisionType decision_type = 1;

    message ObservationTrigger {
      oneof trigger {
        // The delay, in seconds, to collect output tensors after input
        // tensors are collected. For example, output labels can be collected
        // one week after input tensors are collected. Set to 0 if output
        // tensors need to be collected in the same time period as input
        // tensors.
        uint64 delay_sec = 1;

        // The user action or histogram to trigger a training data output
        // collection. Note: Only the name and type should be used with
        // bucket_duration = 0.
        // TODO(crbug.com/40239034): Figure out how to include the trigger as
        // one of the outputs automatically.
        UMAOutput uma_trigger = 2;
      }
    }

    // List of triggers, whichever is hit first is used to upload the training
    // data.
    repeated ObservationTrigger observation_trigger = 2;

    // Only for PERIODIC trigger. The prediction and observation times can be
    // exact or flexible.
    //
    // Prediction time:
    //   - Exact: forces the prediction time to be the time at which the
    //     segment selection or classification result was changed. The input
    //     features will be collected till the prediction time.
    //   - Flexible (default): allows the collector to pick any point in the
    //     past as the prediction time, usually the current time. The training
    //     data collection is triggered once a day with a rolling window
    //     whenever Chrome is active. This setting uploads more training data
    //     samples.
    optional bool use_exact_prediction_time = 3;

    // Only for PERIODIC trigger.
    //
    // Observation time:
    //   - Exact (default): used only in the exact prediction case; the
    //     observation starts exactly after prediction time.
    //   - Flexible: can be used to get the most recent user behavior by
    //     setting observation time to the time of upload, which could be
    //     later than the end of the observation period.
    optional bool use_flexible_observation_time = 4;
  }
  optional TriggerConfig trigger_config = 2;
}
// Generic description of how one training data output is generated.
// TODO(xingliu): Add more implementation details about how output training
// data is generated.
message TrainingOutput {
  oneof output {
    // The training data output is generated from UMA metrics.
    UMAOutput uma_output = 1;
  }
}
// Information needed to generate the training data output from one particular
// UMA metric.
message UMAOutput {
  // The UMA metric used to generate the training data output.
  optional UMAFeature uma_feature = 1;

  // The duration to trigger a training data collection, expressed in TimeUnit.
  // If not specified or 0, the training data will be generated immediately
  // after certain UMA is recorded.
  optional uint64 duration = 2;
}
// Metadata about a segmentation model for a given segment. Contains information
// on how to use the model such as collecting signals, interpreting results etc.
// Next tag: 17
message SegmentationModelMetadata {
  // Tags and names of obsolete fields; they must never be reused.
  //   12: optional int64 output_collection_delay_sec. The delay, in seconds,
  //       to collect output tensors after input tensors are collected. For
  //       example, output labels can be collected one week after input
  //       tensors are collected. If not specified, output tensors are
  //       collected in the same time period as input tensors.
  //   15: obsolete field.
  reserved 12, 15;
  reserved "output_collection_delay_sec";

  // The version information needed to validate segmentation models.
  optional VersionInfo version_info = 9;

  // DEPRECATED: Use |input_features.uma_feature| instead. Only one of
  // |features| or |input_features| can be used in the config, not both. An
  // ordered list of required features.
  repeated UMAFeature features = 1;

  // An ordered list of required features and custom inputs. Only one of
  // |features| or |input_features| can be used in the config, not both.
  repeated InputFeature input_features = 10;

  // A list of training data output definitions.
  optional TrainingOutputs training_outputs = 11;

  // The time unit to be used for the rest of this proto.
  optional TimeUnit time_unit = 2;

  // The size of each interval the data should be aggregated over.
  optional uint64 bucket_duration = 3;

  // For how long should data be stored for this model.
  optional int64 signal_storage_length = 4;

  // For how long do we have to have captured data for this model. If the
  // relevant signals have been captured for a shorter amount of time than this,
  // this model can never be selected.
  optional int64 min_signal_collection_length = 5;

  // Describes how long after a valid result has been calculated for this model
  // it is OK to cache the result without recalculating with updated data.
  optional int64 result_time_to_live = 6;

  message DiscreteMapping {
    // A mapping result from the raw continuous result to a discrete and
    // comparable value based on |rank|.
    message Entry {
      // The minimum result of the model to be allowed to choose this mapping.
      optional float min_result = 1;

      // A feature specific rank.
      optional int64 rank = 2;
    }

    // An ordered (based on their |min_result|) list of discrete mappings.
    // To map a model evaluation result to a DiscreteMapping, choose the
    // highest |min_result| that the evaluation result is at or above.
    // E.g. for these mappings: [(0.0, 0), (0.4, 1), (0.7, 2), (0.9, 3)], a
    // result of 0.7 would yield (0.7, 2), and 0.69 would yield (0.4, 1).
    repeated Entry entries = 1;
  }
  map<string, DiscreteMapping> discrete_mappings = 7;

  // The default key to use during the mapping process if no key has been
  // provided.
  optional string default_discrete_mapping = 8;

  // Whether the client should upload the input and output tensors through UKM.
  optional bool upload_tensors = 13;

  // Describes the return type of the model score. Used for recording
  // histograms.
  enum OutputDescription {
    UNKNOWN_RETURN_TYPE = 0;

    // Model returns either 0 or 1.
    RETURN_TYPE_HEURISTIC = 1;

    // Model returns an int corresponding to a specific subsegment. Assume
    // between 0 and 100.
    RETURN_TYPE_MULTISEGMENT = 2;

    // Model returns a float between 0 and 1.
    RETURN_TYPE_PROBABILITY = 3;

    // Model returns any integer value.
    RETURN_TYPE_INTEGER = 4;
  }

  // TODO(ritikagup@): Deprecate the field.
  optional OutputDescription return_type = 14;

  // Contains information about the model results. Supplied by the client. It
  // describes what the results should look like and how to interpret them.
  optional OutputConfig output_config = 16;
}