// Copyright 2022 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace tflite;
// TFLite metadata contains both human readable and machine readable information
// about what the model does and how to use the model. It can be used as a
// README file, which elaborates the details of the model, each input/output
// tensor, and each associated file.
//
// An important use case of TFLite metadata is the TFLite codegen tool, which
// automatically generates the model interface based on the properties of the
// model and the tensors. The model interface provides high-level APIs to
// interact with the model, such as preprocessing the input data and running
// inferences.
//
// Entries marked with "<Codegen usage>" are used in the TFLite codegen tool to
// generate the model interface. It is recommended to fill in at least those
// entries to boost the codegen performance.
// The Metadata schema is versioned by the Semantic versioning number, such as
// MAJOR.MINOR.PATCH. It tracks the schema changes according to the rules below:
// * Bump up the MAJOR number when making potentially backwards incompatible
// changes. It must be incremented if the new changes break the backwards
// compatibility. It may also include minor and patch level changes as
// needed. The true backwards compatibility is indicated by the file
// identifier.
// * Bump up the MINOR number when making backwards compatible updates for
// major features, such as supporting new content types or adding new
// processing units.
// * Bump up the PATCH number when making small backwards compatible changes,
// such as adding new fields or deprecating certain fields (not deleting
// them).
//
// ModelMetadata.min_parser_version indicates the minimum necessary metadata
// parser version to fully understand all fields in a given metadata flatbuffer.
//
// New fields and types will have associated comments with the schema version
// for which they were added.
//
// Schema Semantic version: 1.5.0
// This indicates the flatbuffer compatibility. The number will be bumped up
// when a breaking change is applied to the schema, such as removing fields or
// adding new fields to the middle of a table.
file_identifier "M001";
// History:
// 1.0.1 - Added VOCABULARY type to AssociatedFileType.
// 1.1.0 - Added BertTokenizerOptions to ProcessUnitOptions.
// Added SentencePieceTokenizerOptions to ProcessUnitOptions.
// Added input_process_units to SubGraphMetadata.
// Added output_process_units to SubGraphMetadata.
// 1.2.0 - Added input_tensor_group to SubGraphMetadata.
// Added output_tensor_group to SubGraphMetadata.
// 1.2.1 - Added RegexTokenizerOptions to ProcessUnitOptions.
// 1.3.0 - Added AudioProperties to ContentProperties.
// 1.4.0 - Added SCANN_INDEX_FILE type to AssociatedFileType.
// 1.4.1 - Added version to AssociatedFile.
// 1.5.0 - Added CustomMetadata in SubGraphMetadata.
// File extension of any written files.
file_extension "tflitemeta";
enum AssociatedFileType : byte {
UNKNOWN = 0,
// Files such as readme.txt.
DESCRIPTIONS = 1,
// Contains a list of labels (characters separated by "\n" or in lines) that
// annotate a certain axis of the tensor. For example,
// the label file in image classification. Those labels annotate the
// output tensor, such that each value in the output tensor is the
// probability of the corresponding category specified by the label. See the
// example label file used in image classification [1].
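//
// For illustration only (the labels below are made up), a label file for a
// 3-class classifier could simply contain:
//   cat
//   dog
//   bird
// so that axis indices 0, 1 and 2 map to "cat", "dog" and "bird" respectively.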
//
// <Codegen usage>:
// If an output tensor has an associated file as TENSOR_AXIS_LABELS, return
// the output as a mapping between the labels and probability in the model
// interface.
// If multiple files of the same type are present, the first one is used by
// default; additional ones are to be distinguished from one another by their
// specified locale.
//
// TODO: Add github example link.
TENSOR_AXIS_LABELS = 2,
// Contains a list of labels (characters separated by "\n" or in lines) that
// tensor values correspond to. For example, in
// the object detection model, one of the output tensors is the detected
// classes. And each value in the tensor refers to the index of a label in the
// category label file. See the example label file used in object detection
// [1].
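//
// For illustration only (the labels and the indexing are made-up assumptions):
// if the label file contains "person", "bicycle" and "car" on separate lines,
// and labels are indexed from 0, an output value of 2 maps to "car".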
//
// <Codegen usage>:
// If an output tensor has an associated file as TENSOR_VALUE_LABELS, convert
// the tensor values into labels, and return a list of string as the output.
// If multiple files of the same type are present, the first one is used by
// default; additional ones are to be distinguished from one another by their
// specified locale.
//
// TODO: Add github example link.
TENSOR_VALUE_LABELS = 3,
// Contains sigmoid-based score calibration parameters, formatted as CSV.
// Each line contains, for the corresponding index of an output tensor, the
// scale, slope, offset and (optional) min_score parameters to be used for
// sigmoid fitting (in this order and in `strtof`-compatible [1] format). Scale
// should be a non-negative value.
// A line may be left empty to default the calibrated score for that index to
// default_score.
// In summary, each line should thus contain 0, 3 or 4 comma-separated values.
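//
// For illustration only (all values are made up), a file covering three output
// indices could look like:
//   0.9,0.5,0.1
//   (an empty line)
//   1.0,0.3,-0.2,0.01
// where index 0 provides scale, slope and offset, index 1 falls back to
// default_score, and index 2 additionally provides min_score.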
//
// See the example score calibration file used in image classification [2].
//
// See documentation for ScoreCalibrationOptions for details.
//
// [1]: https://en.cppreference.com/w/c/string/byte/strtof
// TODO: Add github example link.
TENSOR_AXIS_SCORE_CALIBRATION = 4,
// Contains a list of unique words (characters separated by "\n" or in lines)
// that help to convert natural language words to embedding vectors.
//
// See the example vocab file used in text classification [1].
//
// TODO: Add github example link.
// Added in: 1.0.1
VOCABULARY = 5,
// TODO: introduce the ScaNN index file with links once the code
// is released.
// Contains an on-device ScaNN index file in LevelDB format.
// Added in: 1.4.0
SCANN_INDEX_FILE = 6,
}
table AssociatedFile {
// Name of this file. It needs to be exactly the same as the name of the actual
// file packed into the TFLite model as a zip file.
//
// <Codegen usage>:
// Locates the actual file packed in the TFLite model.
name:string;
// A description of what the file is.
description:string;
// Type of the associated file. There may be special pre/post processing for
// some types. For example, in image classification, a label file associated
// with the output tensor is used to convert the object index into a string.
//
// <Codegen usage>:
// Determines how to process the corresponding tensor.
type:AssociatedFileType;
// An optional locale for this associated file (if applicable). It is
// recommended to use an ISO 639-1 letter code (e.g. "en" for English),
// optionally completed by a two letter region code (e.g. "en-US" for US
// English and "en-CA" for Canadian English).
// Leverage this in order to specify e.g. multiple label files translated in
// different languages.
locale:string;
// Version of the file specified by model creators.
// Added in: 1.4.1
version:string;
}
// The basic content type for all tensors.
//
// <Codegen usage>:
// Input feature tensors:
// 1. Generates the method to load data from a TensorBuffer.
// 2. Creates the preprocessing logic. The default processing pipeline is:
// [NormalizeOp, QuantizeOp].
// Output feature tensors:
// 1. Generates the method to return the output data to a TensorBuffer.
// 2. Creates the post-processing logic. The default processing pipeline is:
// [DeQuantizeOp].
table FeatureProperties {
}
// The type of color space of an image.
enum ColorSpaceType : byte {
UNKNOWN = 0,
RGB = 1,
GRAYSCALE = 2,
}
table ImageSize {
width:uint;
height:uint;
}
// The properties for image tensors.
//
// <Codegen usage>:
// Input image tensors:
// 1. Generates the method to load an image from a TensorImage.
// 2. Creates the preprocessing logic. The default processing pipeline is:
// [ResizeOp, NormalizeOp, QuantizeOp].
// Output image tensors:
// 1. Generates the method to return the output data to a TensorImage.
// 2. Creates the post-processing logic. The default processing pipeline is:
// [DeQuantizeOp].
table ImageProperties {
// The color space of the image.
//
// <Codegen usage>:
// Determines how to convert the color space of an image provided by the user.
color_space:ColorSpaceType;
// Indicates the default value of image width and height if the tensor shape
// is dynamic. For a fixed-size tensor, this size will be consistent with the
// expected size.
default_size:ImageSize;
}
// The properties for tensors representing bounding boxes.
//
// <Codegen usage>:
// Input tensors: NA.
// Output tensors: parses the values into a data structure that represents
// bounding boxes. For example, in the generated wrapper for Android, it returns
// the output as android.graphics.Rect objects.
enum BoundingBoxType : byte {
UNKNOWN = 0,
// Represents the bounding box by using the combination of boundaries,
// {left, top, right, bottom}.
// The default order is {left, top, right, bottom}. Other orders can be
// indicated by BoundingBoxProperties.index.
BOUNDARIES = 1,
// Represents the bounding box by using the upper_left corner, width and
// height.
// The default order is {upper_left_x, upper_left_y, width, height}. Other
// orders can be indicated by BoundingBoxProperties.index.
UPPER_LEFT = 2,
// Represents the bounding box by using the center of the box, width and
// height. The default order is {center_x, center_y, width, height}. Other
// orders can be indicated by BoundingBoxProperties.index.
CENTER = 3,
}
// The properties for audio tensors.
// Added in: 1.3.0
table AudioProperties {
// The sample rate in Hz when the audio was captured.
sample_rate:uint;
// The channel count of the audio.
channels:uint;
}
enum CoordinateType : byte {
// The coordinates are float values from 0 to 1.
RATIO = 0,
// The coordinates are integers.
PIXEL = 1,
}
table BoundingBoxProperties {
// Denotes the order of the elements defined in each bounding box type. An
// empty index array represents the default order of each bounding box type.
// For example, to denote the default order of BOUNDARIES, {left, top, right,
// bottom}, the index should be {0, 1, 2, 3}. To denote the order {left,
// right, top, bottom}, the index should be {0, 2, 1, 3}.
//
// The index array can be applied to all bounding box types to adjust the
// order of their corresponding underlying elements.
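//
// As a purely illustrative sketch (not tied to any particular model): a
// detector that emits BOUNDARIES boxes in the order {top, left, bottom, right}
// would set index to {1, 0, 3, 2}.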
//
// <Codegen usage>:
// Indicates how to parse the bounding box values.
index:[uint];
// <Codegen usage>:
// Indicates how to parse the bounding box values.
type:BoundingBoxType;
// <Codegen usage>:
// Indicates how to convert the bounding box back to the original image in
// pixels.
coordinate_type:CoordinateType;
}
union ContentProperties {
FeatureProperties,
ImageProperties,
BoundingBoxProperties,
// Added in: 1.3.0
AudioProperties,
}
table ValueRange {
min:int;
max:int;
}
table Content {
// The properties that the content may have, indicating the type of the
// Content.
//
// <Codegen usage>:
// Indicates how to process the tensor.
content_properties:ContentProperties;
// The range of dimensions that the content corresponds to. A NULL
// "range" indicates that the content uses up all dimensions,
// except the batch axis, if applicable.
//
// Here are all the possible situations of how a tensor is composed.
// Case 1: The tensor is a single object, such as an image.
// For example, the input of an image classifier
// (https://www.tensorflow.org/lite/models/image_classification/overview),
// a tensor of shape [1, 224, 224, 3]. Dimensions 1 to 3 correspond to the
// image. Since dimension 0 is a batch axis, which can be ignored,
// "range" can be left as NULL.
//
// Case 2: The tensor contains multiple instances of the same object.
// For example, the output tensor of detected bounding boxes of an object
// detection model
// (https://www.tensorflow.org/lite/models/object_detection/overview).
// The tensor shape is [1, 10, 4]. Here is what the three dimensions
// represent:
// dimension 0: the batch axis.
// dimension 1: the 10 objects detected with the highest confidence.
// dimension 2: the bounding boxes of the 10 detected objects.
// The tensor is essentially 10 bounding boxes. In this case,
// "range" should be {min=2; max=2;}.
//
// The output tensor of scores of the above object detection model has shape
// [1, 10], where
// dimension 0: the batch axis;
// dimension 1: the scores of the 10 detected objects.
// Set "range" to the number of dimensions which is {min=2; max=2;} to denote
// that every element in the tensor is an individual content object, i.e. a
// score in this example.
//
// Another example is the pose estimation model
// (https://www.tensorflow.org/lite/models/pose_estimation/overview).
// The output tensor of heatmaps is in the shape of [1, 9, 9, 17].
// Here is what the four dimensions represent:
// dimension 0: the batch axis.
// dimension 1/2: the heatmap image.
// dimension 3: 17 body parts of a person.
// Even though the last axis is the body part, the real content of this tensor
// is the heatmap. "range" should be {min=1; max=2;}.
//
// Case 3: The tensor contains multiple different objects. (Not supported by
// Content at this point).
// Sometimes a tensor may contain multiple different objects, thus different
// contents. It is very common for regression models. For example, a model
// to predict the fuel efficiency
// (https://www.tensorflow.org/tutorials/keras/regression).
// The input tensor has shape [1, 9], consisting of 9 features, such as
// "Cylinders", "Displacement", "Weight", etc. In this case, dimension 1
// contains 9 different contents. However, since these sub-dimension objects
// barely need to be specifically processed, their contents are not recorded
// in the metadata. Though, the name of each dimension can be set through
// TensorMetadata.dimension_names.
//
// Note that if it is not case 3, a tensor can only have one content type.
//
// <Codegen usage>:
// Case 1: return a processed single object of certain content type.
// Case 2: return a list of processed objects of certain content type. The
// generated model interface has APIs to randomly access those objects from
// the output.
range:ValueRange;
}
// Parameters that are used when normalizing the tensor.
table NormalizationOptions{
// mean and std are normalization parameters. Tensor values are normalized
// on a per-channel basis, by the formula
// (x - mean) / std.
// If there is only one value in mean or std, we'll propagate the value to
// all channels.
//
// Quantized models share the same normalization parameters as their
// corresponding float models. For example, an image input tensor may have
// the normalization parameter of
// mean = 127.5f and std = 127.5f.
// The image value will be normalized from [0, 255] to [-1, 1].
// Then, for quantized models, the image data should be further quantized
// according to the quantization parameters. In the case of uint8, the image
// data will be scaled back to [0, 255], while for int8, the image data will
// be scaled to [-128, 127].
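//
// As a worked example of the mean = 127.5f, std = 127.5f case above: an input
// pixel value of 0 is normalized to (0 - 127.5) / 127.5 = -1, and a value of
// 255 is normalized to (255 - 127.5) / 127.5 = 1.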
//
// Both the normalization parameters and quantization parameters can be
// retrieved through the metadata extractor library.
// TODO: add link for the metadata extractor library.
// Per-channel mean of the possible values used in normalization.
//
// <Codegen usage>:
// Apply normalization to input tensors accordingly.
mean:[float];
// Per-channel standard dev. of the possible values used in normalization.
//
// <Codegen usage>:
// Apply normalization to input tensors accordingly.
std:[float];
}
// The different possible score transforms to apply to uncalibrated scores
// before applying score calibration.
enum ScoreTransformationType : byte {
// Identity function: g(x) = x.
IDENTITY = 0,
// Log function: g(x) = log(x).
LOG = 1,
// Inverse logistic function: g(x) = log(x) - log(1-x).
INVERSE_LOGISTIC = 2,
}
// Options to perform score calibration on an output tensor through sigmoid
// functions. One of the main purposes of score calibration is to make scores
// across classes comparable, so that a common threshold can be used for all
// output classes. This is meant for models producing class predictions as
// output, e.g. image classification or detection models.
//
// For each index in the output tensor, this applies:
// * `f(x) = scale / (1 + e^-(slope*g(x)+offset))` if `x > min_score` or if no
// `min_score` has been specified,
// * `f(x) = default_score` otherwise or if no scale, slope and offset have been
// specified.
// Where:
// * scale, slope, offset and (optional) min_score are index-specific parameters
// * g(x) is an index-independent transform among those defined in
// ScoreTransformationType
// * default_score is an index-independent parameter.
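//
// As a purely illustrative example (all numbers are made up): with
// g = IDENTITY, scale = 0.9, slope = 1.5, offset = -0.2 and no min_score, an
// uncalibrated score x = 0.7 is calibrated to
// f(0.7) = 0.9 / (1 + e^-(1.5*0.7 + (-0.2))) = 0.9 / (1 + e^-0.85), which is
// approximately 0.63.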
// An AssociatedFile with type TENSOR_AXIS_SCORE_CALIBRATION specifying the
// index-specific parameters must be associated with the corresponding
// TensorMetadata for score calibration to be applied.
//
// See the example score calibration file used in image classification [1].
// TODO: Add github example link.
table ScoreCalibrationOptions {
// The function to use for transforming the uncalibrated score before
// applying score calibration.
score_transformation:ScoreTransformationType;
// The default calibrated score to apply if the uncalibrated score is
// below min_score or if no parameters were specified for a given index.
default_score:float;
}
// Performs thresholding on output tensor values, in order to filter out
// low-confidence results.
table ScoreThresholdingOptions {
// The recommended global threshold below which results are considered
// low-confidence and should be filtered out.
global_score_threshold:float;
}
// Performs Bert tokenization as in tf.text.BertTokenizer
// (https://github.com/tensorflow/text/blob/3599f6fcd2b780a2dc413b90fb9315464f10b314/docs/api_docs/python/text/BertTokenizer.md)
// Added in: 1.1.0
table BertTokenizerOptions {
// The vocabulary files used in the BertTokenizer.
vocab_file:[AssociatedFile];
}
// Performs SentencePiece tokenization as in tf.text.SentencepieceTokenizer
// (https://github.com/tensorflow/text/blob/3599f6fcd2b780a2dc413b90fb9315464f10b314/docs/api_docs/python/text/SentencepieceTokenizer.md).
// Added in: 1.1.0
table SentencePieceTokenizerOptions {
// The SentencePiece model files used in the SentencePieceTokenizer.
sentencePiece_model:[AssociatedFile];
// The optional vocabulary model files used in the SentencePieceTokenizer.
vocab_file:[AssociatedFile];
}
// Splits strings by the occurrences of delim_regex_pattern and converts the
// tokens into ids. For example, given
// delim_regex_pattern: "\W+",
// string: "Words, words, words.",
// the tokens after split are: "Words", "words", "words", "".
// And then the tokens can be converted into ids according to the vocab_file.
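//
// For illustration only (the vocabulary below is a made-up assumption): if the
// vocab_file maps "Words" to 3, "words" to 7, and unknown tokens to 1, the
// example above would produce the ids 3, 7, 7, 1.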
// Added in: 1.2.1
table RegexTokenizerOptions {
delim_regex_pattern:string;
// The vocabulary files used to convert these tokens into ids.
vocab_file:[AssociatedFile];
}
// Options that are used when processing the tensor.
union ProcessUnitOptions {
NormalizationOptions,
ScoreCalibrationOptions,
ScoreThresholdingOptions,
// Added in: 1.1.0
BertTokenizerOptions,
// Added in: 1.1.0
SentencePieceTokenizerOptions,
// Added in: 1.2.1
RegexTokenizerOptions
}
// A process unit that is used to process the tensor out-of-graph.
table ProcessUnit {
options:ProcessUnitOptions;
}
// Statistics to describe a tensor.
table Stats {
// Max and min are not currently used in tflite.support codegen. They mainly
// serve as references for users to better understand the model. They can also
// be used to validate model pre/post processing results.
// If there is only one value in max or min, we'll propagate the value to
// all channels.
// Per-channel maximum value of the tensor.
max:[float];
// Per-channel minimum value of the tensor.
min:[float];
}
// Metadata of a group of tensors. It may contain several tensors that will be
// grouped together in codegen. For example, the TFLite object detection model
// example (https://www.tensorflow.org/lite/models/object_detection/overview)
// has four outputs: classes, scores, bounding boxes, and number of detections.
// If the four outputs are bundled together using TensorGroup (for example,
// named as "detection result"), the codegen tool will generate the class,
// `DetectionResult`, which contains the class, score, and bounding box. And the
// outputs of the model will be converted to a list of `DetectionResults` and
// the number of detections. Note that the number of detections is a single
// number, and is therefore inappropriate for the list of `DetectionResult`.
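//
// As a purely illustrative sketch (the tensor names below are assumptions, not
// requirements), the detection example could be expressed as:
//   name: "detection result"
//   tensor_names: ["location", "category", "score"]
// where each name matches the TensorMetadata.name of the corresponding output.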
// Added in: 1.2.0
table TensorGroup {
// Name of tensor group.
//
// <codegen usage>:
// Name of the joint class of the tensor group.
name:string;
// Names of the tensors to group together, corresponding to
// TensorMetadata.name.
//
// <codegen usage>:
// Determines which tensors will be added to this group. All tensors in the
// group should have the same number of elements specified by Content.range.
tensor_names:[string];
}
// Detailed information of an input or output tensor.
table TensorMetadata {
// Name of the tensor.
//
// <Codegen usage>:
// The name of this tensor in the generated model interface.
name:string;
// A description of the tensor.
description:string;
// A list of names of the dimensions in this tensor. The length of
// dimension_names needs to match the number of dimensions in this tensor.
//
// <Codegen usage>:
// The name of each dimension in the generated model interface. See "Case 2"
// in the comments of Content.range.
dimension_names:[string];
// The content that represents this tensor.
//
// <Codegen usage>:
// Determines how to process this tensor. See each item in ContentProperties
// for the default process units that will be applied to the tensor.
content:Content;
// The process units that are used to process the tensor out-of-graph.
//
// <Codegen usage>:
// Contains the parameters of the default processing pipeline for each content
// type, such as the normalization parameters in all content types. See the
// items under ContentProperties for the details of the default processing
// pipeline.
process_units:[ProcessUnit];
// The statistics of the tensor values.
stats:Stats;
// A list of associated files of this tensor.
//
// <Codegen usage>:
// Contains processing parameters of this tensor, such as normalization.
associated_files:[AssociatedFile];
}
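// Custom metadata attached to a subgraph, identified by a name and carrying an
// opaque byte payload whose format is defined by the metadata author.
// Added in: 1.5.0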
table CustomMetadata {
name:string;
data:[ubyte] (force_align: 16);
}
table SubGraphMetadata {
// Name of the subgraph.
//
// Note that, since TFLite only supports one subgraph at the moment, the
// Codegen tool will use the name in ModelMetadata in the generated model
// interface.
name:string;
// A description that explains details about what the subgraph does.
description:string;
// Metadata of all input tensors used in this subgraph. It matches exactly
// the input tensors specified by `SubGraph.inputs` in the TFLite
// schema.fbs file[2]. The number of `TensorMetadata` in the array should
// equal the number of indices in `SubGraph.inputs`.
//
// [2]: tensorflow/lite/schema/schema.fbs
// <Codegen usage>:
// Determines how to process the inputs.
input_tensor_metadata:[TensorMetadata];
// Metadata of all output tensors used in this subgraph. It matches exactly
// the output tensors specified by `SubGraph.outputs` in the TFLite
// schema.fbs file[2]. The number of `TensorMetadata` in the array should
// equal the number of indices in `SubGraph.outputs`.
//
// <Codegen usage>:
// Determines how to process the outputs.
output_tensor_metadata:[TensorMetadata];
// A list of associated files of this subgraph.
associated_files:[AssociatedFile];
// Input process units of the subgraph. Some models may have complex pre- and
// post-processing logic where the process units do not work on one tensor at
// a time, but in a way similar to a TFLite graph. For example, in the
// MobileBert model (https://www.tensorflow.org/lite/models/bert_qa/overview),
// the inputs are: ids / mask / segment ids;
// the outputs are: end logits / start logits.
// The preprocessing converts the query string and the context string to the
// model inputs, and the post-processing converts the model outputs to the
// answer string.
// Added in: 1.1.0
input_process_units:[ProcessUnit];
// Output process units of the subgraph.
// Added in: 1.1.0
output_process_units:[ProcessUnit];
// Metadata of all input tensor groups used in this subgraph.
//
// <codegen usage>:
// Bundles the corresponding elements of the underlying input tensors together
// into a class, and converts those individual tensors into a list of the
// class objects.
// Added in: 1.2.0
input_tensor_groups:[TensorGroup];
// Metadata of all output tensor groups used in this subgraph.
//
// <codegen usage>:
// Bundles the corresponding elements of the underlying output tensors
// together into a class, and converts those individual tensors into a list of
// the class objects.
// Added in: 1.2.0
output_tensor_groups:[TensorGroup];
// A list of custom metadata.
// Added in: 1.5.0.
custom_metadata:[CustomMetadata];
}
table ModelMetadata {
// Name of the model.
//
// <Codegen usage>:
// The name of the model in the generated model interface.
name:string;
// Model description in schema.
description:string;
// Version of the model specified by the model creators.
version:string;
// Note that the minimum required TFLite runtime version that the model is
// compatible with has already been added as a metadata entry in the tflite
// schema. We'll decide later whether to move it here and keep it with other
// metadata entries.
// Metadata of all the subgraphs of the model. The 0th is assumed to be the
// main subgraph.
//
// <Codegen usage>:
// Determines how to process the inputs and outputs.
subgraph_metadata:[SubGraphMetadata];
// The person who created this model.
author:string;
// Licenses that may apply to this model.
license:string;
// A list of associated files of this model.
associated_files:[AssociatedFile];
// The minimum metadata parser version that can fully understand the fields in
// the metadata flatbuffer. The version is effectively the largest version
// number among the versions of all the fields populated and the smallest
// compatible version indicated by the file identifier.
//
// This field is automatically populated by the MetadataPopulator when
// the metadata is populated into a TFLite model.
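//
// For example, a metadata flatbuffer that populates only fields available
// since 1.0.0 plus input_process_units (added in 1.1.0) would get a
// min_parser_version of "1.1.0".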
min_parser_version:string;
}
root_type ModelMetadata;