// Copyright 2022 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package mediapipe;
import "mediapipe/framework/calculator.proto";
message AudioToTensorCalculatorOptions {
extend mediapipe.CalculatorOptions {
optional AudioToTensorCalculatorOptions ext = 448635064;
}
// The required number of channels the output audio tensor has.
// If set to 1, multichannel signals will be automatically mixed down to mono.
optional int64 num_channels = 1;
// The required number of samples per channel the output audio tensor has.
optional int64 num_samples = 2;
// The number of overlapping samples per channel the output audio tensor has.
optional int64 num_overlapping_samples = 3 [default = 0];
// The target number of samples per second (hertz) of the audio buffers that
// will be converted into tensors.
optional double target_sample_rate = 4;
// Whether to treat the input audio stream as a continuous stream or a batch
// of unrelated audio buffers.
optional bool stream_mode = 5 [default = true];
// Set to false to disable checks for jitter in timestamp values. Useful with
// live audio input.
optional bool check_inconsistent_timestamps = 6 [default = true];
// Size of the fft in number of bins. If set, the calculator outputs fft
// tensors.
optional int64 fft_size = 7;
// The amount of padding samples to add before the audio after resampling.
// Note that the timestamps shift. Currently, only zero padding is supported.
optional int64 padding_samples_before = 8;
// The amount of padding samples to add after the audio after resampling.
// Currently, only zero padding is supported.
optional int64 padding_samples_after = 9;
// Determines the "flushing" behavior in stream mode.
enum FlushMode {
// Unspecified (causes an error). Won't be used because of the default.
NONE = 0;
// Emit a packet with the entire remainder at `Timestamp::Max`.
ENTIRE_TAIL_AT_TIMESTAMP_MAX = 1;
// Continue emitting framed packets with relevant timestamps.
PROCEED_AS_USUAL = 2;
}
optional FlushMode flush_mode = 10 [default = ENTIRE_TAIL_AT_TIMESTAMP_MAX];
enum DftTensorFormat {
DFT_TENSOR_FORMAT_UNKNOWN = 0;
// The output dft tensor without dc and nyquist components.
WITHOUT_DC_AND_NYQUIST = 1;
// The output dft tensor contains the nyquist component as the last
// two values.
WITH_NYQUIST = 2;
// The output dft tensor contains the dc component as the first two values
// and the nyquist component as the last two values.
WITH_DC_AND_NYQUIST = 3;
}
optional DftTensorFormat dft_tensor_format = 11 [default = WITH_NYQUIST];
// The volume gain, measured in dB.
// Scale the input audio amplitude by 10^(volume_gain_db/20).
optional double volume_gain_db = 12;
// The source number of samples per second (hertz) of the input audio buffers.
optional double source_sample_rate = 13;
}