// chromium/third_party/mediapipe/src/mediapipe/calculators/tensor/audio_to_tensor_calculator.proto

// Copyright 2022 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";

package mediapipe;

import "mediapipe/framework/calculator.proto";

// Options for the calculator that converts audio buffers into tensors.
//
// The options cover the shape of the output audio tensor (channels, samples,
// overlap), resampling (source and target sample rates), optional fft output,
// zero padding, volume gain, and how input is treated and flushed in stream
// mode.
message AudioToTensorCalculatorOptions {
  // Registers these options as an extension of `mediapipe.CalculatorOptions`.
  extend mediapipe.CalculatorOptions {
    optional AudioToTensorCalculatorOptions ext = 448635064;
  }

  // The required number of channels the output audio tensor has.
  // If set to 1, multichannel signals will be automatically mixed down to mono.
  optional int64 num_channels = 1;

  // The required number of samples per channel the output audio tensor has.
  optional int64 num_samples = 2;

  // The number of overlapping samples per channel the output audio tensor has.
  // Defaults to 0.
  optional int64 num_overlapping_samples = 3 [default = 0];

  // The target number of samples per second (hertz) of the audio buffers that
  // will be converted into tensors.
  optional double target_sample_rate = 4;

  // Whether to treat the input audio stream as a continuous stream or a batch
  // of unrelated audio buffers. Defaults to true.
  // NOTE(review): going by the field name, true presumably selects the
  // continuous-stream behavior -- confirm against the calculator.
  optional bool stream_mode = 5 [default = true];

  // Set to false to disable checks for jitter in timestamp values. Useful with
  // live audio input. Defaults to true (checks enabled).
  optional bool check_inconsistent_timestamps = 6 [default = true];

  // Size of the fft in number of bins. If set, the calculator outputs fft
  // tensors.
  optional int64 fft_size = 7;

  // The amount of padding samples to add before the audio after resampling.
  // Note that the timestamps shift. Currently, only zero padding is supported.
  optional int64 padding_samples_before = 8;

  // The amount of padding samples to add after the audio after resampling.
  // Currently, only zero padding is supported.
  optional int64 padding_samples_after = 9;

  // Determines the "flushing" behavior in stream mode.
  enum FlushMode {
    // Unspecified (causes an error). Only takes effect when set explicitly,
    // because `flush_mode` defaults to ENTIRE_TAIL_AT_TIMESTAMP_MAX.
    NONE = 0;
    // Emit a packet with the entire remainder at `Timestamp::Max`.
    ENTIRE_TAIL_AT_TIMESTAMP_MAX = 1;
    // Continue emitting framed packets with relevant timestamps.
    PROCEED_AS_USUAL = 2;
  }

  // The flushing behavior to use in stream mode; see `FlushMode`.
  // Defaults to ENTIRE_TAIL_AT_TIMESTAMP_MAX.
  optional FlushMode flush_mode = 10 [default = ENTIRE_TAIL_AT_TIMESTAMP_MAX];

  // Determines which of the dc and nyquist components the output dft tensor
  // contains.
  enum DftTensorFormat {
    // Zero value. Only takes effect when set explicitly, because
    // `dft_tensor_format` defaults to WITH_NYQUIST.
    DFT_TENSOR_FORMAT_UNKNOWN = 0;
    // The output dft tensor without dc and nyquist components.
    WITHOUT_DC_AND_NYQUIST = 1;
    // The output dft tensor contains the nyquist component as the last
    // two values.
    WITH_NYQUIST = 2;
    // The output dft tensor contains the dc component as the first two values
    // and the nyquist component as the last two values.
    WITH_DC_AND_NYQUIST = 3;
  }
  // The format of the output dft tensor; see `DftTensorFormat`.
  // Defaults to WITH_NYQUIST.
  // NOTE(review): presumably only relevant when `fft_size` is set -- confirm
  // against the calculator.
  optional DftTensorFormat dft_tensor_format = 11 [default = WITH_NYQUIST];

  // The volume gain, measured in dB.
  // Scale the input audio amplitude by 10^(volume_gain_db/20).
  optional double volume_gain_db = 12;

  // The source number of samples per second (hertz) of the input audio buffers.
  optional double source_sample_rate = 13;
}