// chromium/components/segmentation_platform/public/proto/model_metadata.proto

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

syntax = "proto2";
option optimize_for = LITE_RUNTIME;

package segmentation_platform.proto;

import "components/segmentation_platform/public/proto/aggregation.proto";
import "components/segmentation_platform/public/proto/output_config.proto";
import "components/segmentation_platform/public/proto/types.proto";

// The version is used to verify whether the metadata provided by the server
// is supported by the current version of the code. Update the version number
// for any new feature added to the metadata proto, and add a log of the new
// changes in the current version in this file.
// Version 0 supports UMA features and aggregation in |features| field.
// Version 1 supports UMA features, custom inputs and sql features in
// |input_features| field.
// Version 2 supports training data output collection in |training_outputs|
// field.
// Version 3 supports trigger configurations for training data collection.
enum CurrentVersion {
  // The metadata version described by this file; see the change log above
  // for what each version added.
  METADATA_VERSION = 3;
}

// Version information for segmentation models. One field is written by the
// server and one by the client, so that each side can check compatibility
// with the other.
message VersionInfo {
  // Minimum model metadata version that is supported. Some newer
  // features/fields might not be available before this version. This field is
  // set on the server and read by the client to verify if model is valid.
  optional int32 metadata_min_version = 1;

  // Current model metadata version. This field is set by the client while
  // sending a model download request to optimization guide server so that the
  // server knows the capabilities of the client.
  // NOTE(review): presumably populated from CurrentVersion::METADATA_VERSION;
  // confirm against the client code.
  optional int32 metadata_cur_version = 2;
}

// Identifies the source of a model: whether it is a server side model or a
// client side model.
enum ModelSource {
  // The model source is unknown.
  UNKNOWN_MODEL_SOURCE = 0;
  SERVER_MODEL_SOURCE = 1;   // Represents server side model.
  DEFAULT_MODEL_SOURCE = 2;  // Represents client side model.
}

// Describes a feature derived from a histogram or user action: which signal
// to read (|type|, |name|, |name_hash|) and how to aggregate it into the
// result (|bucket_count|, |tensor_length|, |aggregation|).
message UMAFeature {
  // The type of signal this feature refers to.
  // Note: SignalType::UKM_EVENT type is only used for SignalStorageConfig and
  // should not be used as uma feature's signal type.
  optional SignalType type = 1;

  // The human readable name of the histogram or user action.
  optional string name = 2;

  // The hash of the histogram name or user action. Must match the result of
  // base::HashMetricName.
  optional fixed64 name_hash = 3;

  // Number of buckets to include in the result. If set to 0, no data will be
  // collected. This can be used to start storing data before it should be used.
  // See documentation for Aggregation for details.
  optional uint64 bucket_count = 4;

  // The required length of the calculated result. See documentation for
  // Aggregation for details.
  optional uint64 tensor_length = 5;

  // The type of aggregation to use for this particular feature.
  optional Aggregation aggregation = 6;

  // Only set if type == HISTOGRAM_ENUM.
  // Matches are only valid when the enum ID matches any of these.
  // Works like an OR condition, e.g.: [url, search, …] or just [url].
  repeated int32 enum_ids = 7;

  // Only set if aggregation == LATEST_OR_DEFAULT.
  // Value used for model if latest value requested is not available in the
  // database. The number of entries should be equal to the |tensor_length|.
  repeated float default_values = 8;
}

// Describes an input whose value is produced by one of the custom functions
// listed in FillPolicy. The result is used either directly as an input tensor
// to the model or as a bind value for a SQL feature (see |tensor_length|).
message CustomInput {
  // This parameter is required (by convention; the schema itself declares it
  // optional).
  // 1. If the param is directly used as the input tensor field to the model,
  // then this specifies the number of columns to fill in the tensor. In this
  // case the value should be float.
  // 2. If the param is used as a bind value for sql features, then this
  // specifies the number of sql bindings to fill in the sql query.
  optional int32 tensor_length = 1;

  // Used to distinguish between different types of custom inputs. Each value
  // names a custom function provided by the engine that fills in the input
  // feature to the model.
  enum FillPolicy {
    // The fill policy is unknown.
    UNKNOWN_FILL_POLICY = 0;
    // Output is the time at which model prediction is needed. Can be used to
    // bind TIME type param to queries.
    // Output type: Time
    // Output length: 1
    FILL_PREDICTION_TIME = 1;
    // Output is two timestamps, the beginning and the end of last x days. Can
    // be used to bind TIME type param to query within a time interval.
    // Output type: Time
    // Output length: 2
    // Additional arg:
    //   `bucket_count`: Required. Number of buckets to include in the result.
    TIME_RANGE_BEFORE_PREDICTION = 2;

    // Used to determine whether a given page is a product details page and can
    // be price tracked.
    PRICE_TRACKING_HINTS = 3;

    // This type of custom input is used directly to fill the input tensor to
    // the model or to another query.
    // Output type: ProcessedValue
    // Output length: 1
    // Additional arg:
    //   `name`: Optional. The name of the field to be looked up in input
    //    context. If missing then the |name| field is used.
    FILL_FROM_INPUT_CONTEXT = 4;

    // Output is a tensor of length 10 consisting of float values denoting
    // various devices count by type with different form factor and os type.
    // See `SyncDeviceInfoObserver` for description of each value.
    // Output type: float
    // Output length: 10
    // Additional arg:
    //   `wait_for_device_info_in_seconds`: Number of seconds to wait for sync
    //   device info before timeout. If 0, then does not wait for sync and times
    //   out immediately if device info is not available.
    // InputContext arg:
    //   `active_days_limit`: Number of days after which the device is
    //   considered not active after last sync. Must be INT.
    FILL_SYNC_DEVICE_INFO = 5;

    // Output is a tensor of length 1 consisting of the device RAM in MB.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_RAM_MB = 6;

    // Output is a tensor of length 1 describing device OS level.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_OS_VERSION_NUMBER = 7;

    // Output is a tensor of length 1 giving pixels per inch for the current
    // device used by the user.
    // Output type: float
    // Output length: 1
    FILL_DEVICE_PPI = 8;

    // Fills metrics about a given tab. A `tab_id` and `session_tag` are
    // expected from input_context.
    // Output type: float
    // Output length: `TabSessionSource::kNumInputs`
    FILL_TAB_METRICS = 9;

    // Fills a random number in the range [0, 1).
    // Output type: float
    // Output length: 1
    FILL_RANDOM = 10;

    // Fills various metrics from the shopping service. Currently only supports
    // shopping bookmark count.
    // Output type: float
    // Output length: 1
    FILL_FROM_SHOPPING_SERVICE = 11;
  }

  // The fill type of the custom input.
  optional FillPolicy fill_policy = 2;

  // If the current chrome version does not support this fill type, use this
  // value. If this is not specified and the function is unavailable, the model
  // will not run due to missing input. The number of entries should be equal to
  // the |tensor_length|.
  repeated float default_value = 3;

  // If the fill type needs additional arguments, use this value. The accepted
  // keys are listed under "Additional arg" on the relevant FillPolicy value.
  map<string, string> additional_args = 4;

  // The human readable name of the custom input.
  optional string name = 5;
}

// Configuration for storing signals in the SQL database.
message SignalFilterConfig {
  // Defines a single UKM event that should be stored.
  message UkmEvent {
    // Event hash of the UKM event.
    optional uint64 event_hash = 1;
    // List of metric hashes for the event, to store in the database. It is
    // required to provide the list of necessary metrics.
    // TODO: Support empty metric hash list, the database will store all the
    // metrics for the UKM event.
    repeated uint64 metric_hash_filter = 2;
  }
  // List of UKM events to store in the database.
  repeated UkmEvent ukm_events = 1;
}

// Describes an input feature computed by running a SQL query. Values for the
// query's '?' placeholders are supplied through |bind_values|.
message SqlFeature {
  // The query should select a single float column. The query can contain '?'
  // which can be used to bind values using |bind_values| list.
  // TODO(ssid): Consider expanding this to return multiple input tensor
  // features.
  optional string sql = 1;

  // List of signals needed in the storage for the query.
  optional SignalFilterConfig signal_filter = 2;

  // Used to bind value for the SQL query.
  message BindValue {
    // The bind field numbers, in range of 0 to n-1, for n question marks in the
    // SQL query.
    repeated int32 bind_field_index = 1;

    // Used to call Bind*() in sql::Statement.
    enum ParamType {
      UNKNOWN = 0;
      NULL = 1;
      BOOL = 2;
      INT = 3;
      INT64 = 4;
      DOUBLE = 5;
      STRING = 6;
      TIME = 7;
    }
    // The type that the custom function in |value| is expected to return.
    optional ParamType param_type = 2;

    // Value of the input to bind the query. The custom function should return
    // the specified param type. The |tensor_length| should be 0 since these
    // inputs can only be used for SQL bind values.
    // NOTE(review): the comment on CustomInput.tensor_length says that, for
    // bind values, it specifies the number of sql bindings to fill in, which
    // appears to conflict with "should be 0" here. Confirm which is current.
    optional CustomInput value = 3;
  }
  // Values to bind to the '?' placeholders in |sql|.
  repeated BindValue bind_values = 3;

  // The human readable name of the ukm event and metric.
  optional string name = 4;
}

// Contains a feature used as an input to the ML model. The feature is one of
// the kinds listed in the oneof below.
message InputFeature {
  // At most one of these is set; setting one clears the others.
  oneof Feature {
    // An UMAFeature type of input feature.
    UMAFeature uma_feature = 1;

    // A custom input type of input feature.
    CustomInput custom_input = 2;

    // Input feature computed using SQL query.
    SqlFeature sql_feature = 3;
  }
}

// Contains a list of training output generators. The ML model pipeline can
// iterate on different output candidates and select the final output generator.
message TrainingOutputs {
  // The candidate training output generators.
  repeated TrainingOutput outputs = 1;

  // Config for triggering the training outputs data collection for the current
  // model.
  message TriggerConfig {
    // Describes how the training outputs are collected.
    enum DecisionType {
      // By default considered as PERIODIC type.
      UNKNOWN = 0;
      // The on demand scheduler will trigger training data collection when the
      // client asks for a model execution with input context.
      ONDEMAND = 1;
      // The periodic scheduler will trigger training data collection every
      // day. Currently this period is fixed on the client to 1 day.
      PERIODIC = 2;
    }
    // Which scheduler triggers the training data collection.
    optional DecisionType decision_type = 1;

    // A single condition that triggers collection of the output tensors:
    // either a fixed delay or a UMA signal.
    message ObservationTrigger {
      oneof trigger {
        // The delay, in seconds, to collect output tensors after input tensors
        // are collected. For example, output labels can be collected one week
        // after input tensors are collected. Set to 0 if output tensors need to
        // be collected in the same time period as input tensors.
        uint64 delay_sec = 1;
        // The user action or histogram to trigger a training data output
        // collection. Note: Only the name and type should be used with
        // bucket_duration = 0.
        // NOTE(review): UMAOutput has no `bucket_duration` field (it has
        // `duration`, and its UMAFeature has `bucket_count`); confirm which
        // field is meant.
        // TODO(crbug.com/40239034): Figure out how to include the trigger as
        // one of the outputs automatically.
        UMAOutput uma_trigger = 2;
      }
    }
    // List of triggers, whichever is hit first is used to upload the training
    // data.
    repeated ObservationTrigger observation_trigger = 2;

    // Only for PERIODIC trigger. The prediction and observation times can be
    // exact or flexible. The exact prediction setting forces the prediction
    // time to be the time at which the segment selection or classification
    // result was changed. The input features will be collected till the
    // prediction time. Flexible prediction time setting allows the collector to
    // pick any point in the past as the prediction time, usually pick the
    // current time. The training data collection is triggered once a day with a
    // rolling window whenever Chrome is active. This setting uploads more
    // training data samples. By default the prediction time is FLEXIBLE. The
    // exact observation time setting will be used only in case of exact
    // prediction case and the observation starts exactly after prediction time.
    // Flexible observation can be used to get most recent user behavior by
    // setting observation time to the time of upload, which could be later than
    // end of the observation period. By default the observation time is EXACT.

    // Set to true to use EXACT prediction time; otherwise FLEXIBLE (the
    // default described above) is used.
    optional bool use_exact_prediction_time = 3;
    // Set to true to use FLEXIBLE observation time; otherwise EXACT (the
    // default described above) is used.
    optional bool use_flexible_observation_time = 4;
  }
  // Trigger configuration for this model's training data collection.
  optional TriggerConfig trigger_config = 2;
}

// Generic type to define how to generate the training data output.
// TODO(xingliu): Add more implementation details about how output training data
// is generated.
message TrainingOutput {
  // The kind of output generator. UMA output is currently the only member.
  oneof output {
    // Training data output is generated from UMA metrics.
    UMAOutput uma_output = 1;
  }
}

// Contains the information to generate the output for training data based on a
// particular UMA metric.
message UMAOutput {
  // The UMA metric to generate the training data output.
  optional UMAFeature uma_feature = 1;

  // The duration to trigger a training data collection, unit in TimeUnit. If
  // not specified or 0, the training data will be generated immediately after
  // certain UMA is recorded.
  // NOTE(review): the unit is presumably SegmentationModelMetadata.time_unit;
  // confirm.
  optional uint64 duration = 2;
}

// Metadata about a segmentation model for a given segment. Contains information
// on how to use the model such as collecting signals, interpreting results etc.
// Next tag: 17
message SegmentationModelMetadata {
  // Values for obsolete fields.
  reserved 15;

  // The version information needed to validate segmentation models.
  optional VersionInfo version_info = 9;

  // DEPRECATED: Use |input_features.uma_feature| instead. Only one of
  // |features| or |input_features| can be used in the config, not both. An
  // ordered list of required features.
  repeated UMAFeature features = 1;

  // An ordered list of required features and custom inputs. Only one of
  // |features| or |input_features| can be used in the config, not both.
  repeated InputFeature input_features = 10;

  // A list of training data output definitions.
  optional TrainingOutputs training_outputs = 11;

  // The time unit to be used for the rest of this proto.
  optional TimeUnit time_unit = 2;

  // The size of each interval the data should be aggregated over.
  optional uint64 bucket_duration = 3;

  // For how long should data be stored for this model.
  optional int64 signal_storage_length = 4;

  // For how long do we have to have captured data for this model. If the
  // relevant signals have been captured for a shorter amount of time than this,
  // this model can never be selected.
  optional int64 min_signal_collection_length = 5;

  // Describes how long after a valid result has been calculated for this model
  // it is OK to cache the result without recalculating with updated data.
  optional int64 result_time_to_live = 6;

  // A named table that converts the raw model result into a discrete rank.
  message DiscreteMapping {
    // A mapping result from the raw continuous result to a discrete and
    // comparable value based on |rank|.
    message Entry {
      // The minimum result of the model to be allowed to choose this mapping.
      optional float min_result = 1;

      // A feature specific rank.
      optional int64 rank = 2;
    }

    // An ordered (based on their |min_result|) list of discrete mappings.
    // To map a model evaluation result to a DiscreteMapping, choose the highest
    // |min_result| that the evaluation result is at or above.
    // E.g. for these mappings: [(0.0, 0), (0.4, 1), (0.7, 2), (0.9, 3)], a
    // result of 0.7 would yield (0.7, 2), and 0.69 would yield (0.4, 1).
    repeated Entry entries = 1;
  }
  // The available discrete mappings, looked up by key. See
  // |default_discrete_mapping| for the key used when none is provided.
  map<string, DiscreteMapping> discrete_mappings = 7;

  // The default key to use during the mapping process if no key has been
  // provided.
  optional string default_discrete_mapping = 8;

  // The delay, in seconds, to collect output tensors after input tensors are
  // collected. For example, output labels can be collected one week after input
  // tensors are collected. If not specified, output tensors are collected in
  // the same time period as input tensors.
  // DEPRECATED: optional int64 output_collection_delay_sec = 12;
  reserved 12;

  // Whether the client should upload the input and output tensors through UKM.
  optional bool upload_tensors = 13;

  // Describes the return type of the model score. Used for recording
  // histograms.
  enum OutputDescription {
    // The return type is unknown.
    UNKNOWN_RETURN_TYPE = 0;
    // Model returns either 0 or 1.
    RETURN_TYPE_HEURISTIC = 1;
    // Model returns an int corresponding to a specific subsegment. Assume
    // between 0 and 100.
    RETURN_TYPE_MULTISEGMENT = 2;
    // Model returns a float between 0 and 1.
    RETURN_TYPE_PROBABILITY = 3;
    // Model returns any integer value.
    RETURN_TYPE_INTEGER = 4;
  }
  // TODO(ritikagup@): Deprecate the field.
  optional OutputDescription return_type = 14;

  // Contains information about the model results. Supplied by the client. It
  // describes what the results should look like and how to interpret them.
  optional OutputConfig output_config = 16;
}