chromium/content/browser/speech/network_speech_recognition_engine_impl.h

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_
#define CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_

#include <stdint.h>

#include <memory>
#include <string>
#include <string_view>
#include <vector>

#include "base/memory/raw_ptr.h"
#include "base/memory/ref_counted.h"
#include "base/sequence_checker.h"
#include "components/speech/audio_encoder.h"
#include "components/speech/chunked_byte_buffer.h"
#include "components/speech/downstream_loader.h"
#include "components/speech/downstream_loader_client.h"
#include "components/speech/upstream_loader.h"
#include "components/speech/upstream_loader_client.h"
#include "content/browser/speech/speech_recognition_engine.h"
#include "content/common/content_export.h"
#include "content/public/browser/speech_recognition_session_preamble.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "media/mojo/mojom/speech_recognition_grammar.mojom.h"
#include "media/mojo/mojom/speech_recognition_result.mojom.h"
#include "services/network/public/cpp/simple_url_loader_stream_consumer.h"

class AudioChunk;

namespace base {
class TimeDelta;
}

namespace network {
class SharedURLLoaderFactory;
}

namespace content {

// This is the network implementation for `SpeechRecognitionEngine`, which is
// supporting continuous recognition by means of interaction with the Google
// streaming speech recognition webservice.
//
// This class establishes two HTTPS connections with the webservice for each
// session, herein called "upstream" and "downstream". Audio chunks are sent on
// the upstream by means of a chunked HTTP POST upload. Recognition results are
// retrieved in a full-duplex fashion (i.e. while pushing audio on the upstream)
// on the downstream by means of a chunked HTTP GET request. Pairing between the
// two stream is handled through a randomly generated key, unique for each
// request, which is passed in the &pair= arg to both stream request URLs. In
// the case of a regular session, the upstream is closed when the audio capture
// ends (notified through a |AudioChunksEnded| call) and the downstream waits
// for a corresponding server closure (eventually some late results can come
// after closing the upstream). Both streams are guaranteed to be closed when
// |EndRecognition| call is issued.

class CONTENT_EXPORT NetworkSpeechRecognitionEngineImpl
    : public SpeechRecognitionEngine,
      public speech::UpstreamLoaderClient,
      public speech::DownstreamLoaderClient {};

}  // namespace content

#endif  // CONTENT_BROWSER_SPEECH_NETWORK_SPEECH_RECOGNITION_ENGINE_IMPL_H_