chromium/content/public/android/java/src/org/chromium/content/browser/SpeechRecognitionImpl.java

// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.content.browser;

import android.annotation.SuppressLint;
import android.content.ComponentName;
import android.content.Context;
import android.content.Intent;
import android.content.pm.PackageManager;
import android.content.pm.ResolveInfo;
import android.content.pm.ServiceInfo;
import android.os.Build;
import android.os.Bundle;
import android.speech.RecognitionListener;
import android.speech.RecognitionService;
import android.speech.RecognizerIntent;
import android.speech.SpeechRecognizer;

import org.jni_zero.CalledByNative;
import org.jni_zero.JNINamespace;
import org.jni_zero.NativeMethods;

import org.chromium.base.BuildInfo;
import org.chromium.base.ContextUtils;
import org.chromium.base.Log;
import org.chromium.base.PackageUtils;
import org.chromium.content.R;
import org.chromium.content_public.browser.SpeechRecognition;
import org.chromium.media.mojom.SpeechRecognitionErrorCode;
import org.chromium.ui.widget.Toast;

import java.util.ArrayList;
import java.util.List;

/** Implementation of {@link SpeechRecognition}. */
@JNINamespace("content")
public class SpeechRecognitionImpl {
    private static final String TAG = "SpeechRecog";

    // Constants describing the speech recognition providers we depend on.
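    // AGSA is the Google app ("Google Quick Search Box").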
    private static final String AGSA_PACKAGE_NAME = "com.google.android.googlequicksearchbox";
    private static final int AGSA_MIN_VERSION = 300207030;
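    // SSBG is Speech Services by Google, shipped in the Google TTS package.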
    private static final String SSBG_PACKAGE_NAME = "com.google.android.tts";

    // We track the recognition state to remember what events we need to send when recognition is
    // being aborted. Once Android's recognizer is cancelled, its listener won't yield any more
    // events, but we still need to call OnSoundEnd and OnAudioEnd if corresponding On*Start were
    // called before.
    private static final int STATE_IDLE = 0;
    private static final int STATE_AWAITING_SPEECH = 1;
    private static final int STATE_CAPTURING_SPEECH = 2;
    private int mState;

    // The speech recognition provider (if any) matching the package name and minimum version
    // constants above, as selected by initialize().
    private static ComponentName sRecognitionProvider;

    private final Intent mIntent;
    private final RecognitionListener mListener;
    private SpeechRecognizer mRecognizer;

    // Native pointer to C++ SpeechRecognizerImplAndroid.
    private long mNativeSpeechRecognizerImplAndroid;

    // Remember if we are using continuous recognition.
    private boolean mContinuous;

    // Internal class to handle events from Android's SpeechRecognizer and route them to native.
    class Listener implements RecognitionListener {

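        // Fired when the user starts speaking; mapped to the native onSoundStart event (the Web
        // Speech API soundstart).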
        @Override
        public void onBeginningOfSpeech() {
            if (mNativeSpeechRecognizerImplAndroid == 0) return;

            mState = STATE_CAPTURING_SPEECH;
            SpeechRecognitionImplJni.get()
                    .onSoundStart(mNativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl.this);
        }

        @Override
        public void onBufferReceived(byte[] buffer) {}

        @Override
        public void onEndOfSpeech() {
            // Ignore onEndOfSpeech in continuous mode to let terminate() take care of ending
            // events. The Android API documentation is vague as to when onEndOfSpeech is called in
            // continuous mode, whereas the Web Speech API defines stronger semantics for the
            // equivalent (onsoundend) event. Thus, the only way to provide a valid onsoundend
            // event is to trigger it when the last result is received or the session is aborted.
            if (!mContinuous) {
                if (mNativeSpeechRecognizerImplAndroid == 0) return;
                SpeechRecognitionImplJni.get()
                        .onSoundEnd(mNativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl.this);
                // Since Android doesn't have a dedicated event for when audio capture is finished,
                // we fire it after speech has ended.
                SpeechRecognitionImplJni.get()
                        .onAudioEnd(mNativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl.this);
                mState = STATE_IDLE;
            }
        }

        @Override
        public void onError(int error) {
            int code = SpeechRecognitionErrorCode.NONE;

            // Translate Android SpeechRecognizer errors to Web Speech API errors.
            switch (error) {
                case SpeechRecognizer.ERROR_AUDIO:
                    code = SpeechRecognitionErrorCode.AUDIO_CAPTURE;
                    break;
                case SpeechRecognizer.ERROR_CLIENT:
                    code = SpeechRecognitionErrorCode.ABORTED;
                    break;
                case SpeechRecognizer.ERROR_RECOGNIZER_BUSY:
                case SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS:
                    code = SpeechRecognitionErrorCode.NOT_ALLOWED;
                    break;
                case SpeechRecognizer.ERROR_NETWORK_TIMEOUT:
                case SpeechRecognizer.ERROR_NETWORK:
                case SpeechRecognizer.ERROR_SERVER:
                    code = SpeechRecognitionErrorCode.NETWORK;
                    break;
                case SpeechRecognizer.ERROR_NO_MATCH:
                    code = SpeechRecognitionErrorCode.NO_MATCH;
                    break;
                case SpeechRecognizer.ERROR_SPEECH_TIMEOUT:
                    code = SpeechRecognitionErrorCode.NO_SPEECH;
                    break;
                default:
                    assert false;
                    return;
            }

            terminate(code);
        }

        @Override
        public void onEvent(int event, Bundle bundle) {}

        @Override
        public void onPartialResults(Bundle bundle) {
            handleResults(bundle, true);
        }

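        // Fired when the recognizer is ready to capture audio; mapped to the native onAudioStart
        // event (the Web Speech API audiostart).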
        @Override
        public void onReadyForSpeech(Bundle bundle) {
            if (mNativeSpeechRecognizerImplAndroid == 0) return;

            mState = STATE_AWAITING_SPEECH;
            SpeechRecognitionImplJni.get()
                    .onAudioStart(mNativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl.this);
        }

        @Override
        public void onResults(Bundle bundle) {
            handleResults(bundle, false);
            // We assume that onResults is called only once, at the end of a session, thus we
            // terminate. If one day the recognition provider changes dictation mode behavior to
            // call onResults several times, we should terminate only if (!mContinuous).
            terminate(SpeechRecognitionErrorCode.NONE);
        }

        @Override
        public void onRmsChanged(float rms) {}

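        // Forwards the recognition hypotheses and confidence scores from the bundle to native.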
        private void handleResults(Bundle bundle, boolean provisional) {
            if (mNativeSpeechRecognizerImplAndroid == 0) return;

            if (mContinuous && provisional) {
                // In continuous mode, Android's recognizer sends final results as provisional.
                provisional = false;
            }

            ArrayList<String> list =
                    bundle.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION);
            String[] results = list.toArray(new String[list.size()]);

            float[] scores = bundle.getFloatArray(SpeechRecognizer.CONFIDENCE_SCORES);

            SpeechRecognitionImplJni.get()
                    .onRecognitionResults(
                            mNativeSpeechRecognizerImplAndroid,
                            SpeechRecognitionImpl.this,
                            results,
                            scores,
                            provisional);
        }
    }

    /**
     * This method must be called before any instance of SpeechRecognition can be created. It will
     * query Android's package manager to find a suitable speech recognition provider that supports
     * continuous recognition.
     */
    // TODO(crbug.com/40479664): Fix this properly.
    public static boolean initialize() {
        Context context = ContextUtils.getApplicationContext();
        if (!SpeechRecognizer.isRecognitionAvailable(context)) return false;

        sRecognitionProvider = createRecognitionProvider();
        if (sRecognitionProvider == null) return false;
        return true;
    }

    /** Returns null if there is no Google-provided RecognitionService available on the device. */
    private static ComponentName createRecognitionProvider() {
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S
                || BuildInfo.getInstance().isAutomotive) {
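            // On Android S+ and on automotive builds, use the recognition service from the Google
            // TTS package; no minimum version is enforced for it.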
            return getComponent(SSBG_PACKAGE_NAME, -1);
        } else {
            return getComponent(AGSA_PACKAGE_NAME, AGSA_MIN_VERSION);
        }
    }

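    /**
     * Returns the {@link ComponentName} of the {@link RecognitionService} exported by {@code
     * packageName}, or null if no such service is installed or its version is below {@code
     * packageMinVersion} (pass -1 to skip the version check).
     */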
    @SuppressLint("WrongConstant")
    private static ComponentName getComponent(String packageName, int packageMinVersion) {
        Context context = ContextUtils.getApplicationContext();
        PackageManager pm = context.getPackageManager();
        Intent intent = new Intent(RecognitionService.SERVICE_INTERFACE);
        final List<ResolveInfo> list = pm.queryIntentServices(intent, PackageManager.GET_SERVICES);

        for (ResolveInfo resolve : list) {
            ServiceInfo service = resolve.serviceInfo;

            if (!service.packageName.equals(packageName)) continue;

            // Verify the package version only when a minimum version is specified.
            if (packageMinVersion != -1) {
                if (PackageUtils.getPackageVersion(service.packageName) < packageMinVersion) {
                    continue;
                }
            }

            return new ComponentName(service.packageName, service.name);
        }

        return null;
    }

    private SpeechRecognitionImpl(long nativeSpeechRecognizerImplAndroid) {
        mContinuous = false;
        mNativeSpeechRecognizerImplAndroid = nativeSpeechRecognizerImplAndroid;
        mListener = new Listener();
        mIntent = new Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH);

        if (sRecognitionProvider != null) {
            mRecognizer =
                    SpeechRecognizer.createSpeechRecognizer(
                            ContextUtils.getApplicationContext(), sRecognitionProvider);
        } else {
            // It is possible to force-enable the speech recognition web platform feature (using a
            // command-line flag) even if initialize() failed to find a suitable provider, in which
            // case the first available speech recognition provider is used. Caveat: Continuous
            // mode may not work as expected with a different provider.
            mRecognizer =
                    SpeechRecognizer.createSpeechRecognizer(ContextUtils.getApplicationContext());
        }

        mRecognizer.setRecognitionListener(mListener);
    }

    // This function destroys everything when recognition is done, taking care to properly tear
    // down by calling On{Sound,Audio}End if corresponding On{Audio,Sound}Start were called.
    private void terminate(int error) {
        if (mNativeSpeechRecognizerImplAndroid == 0) return;

        if (mState != STATE_IDLE) {
            if (mState == STATE_CAPTURING_SPEECH) {
                SpeechRecognitionImplJni.get()
                        .onSoundEnd(mNativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl.this);
            }
            SpeechRecognitionImplJni.get()
                    .onAudioEnd(mNativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl.this);
            mState = STATE_IDLE;
        }

        if (error != SpeechRecognitionErrorCode.NONE) {
            SpeechRecognitionImplJni.get()
                    .onRecognitionError(
                            mNativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl.this, error);
        }

        try {
            mRecognizer.destroy();
        } catch (IllegalArgumentException e) {
            // Intentionally swallow the exception. destroy() incorrectly throws an
            // IllegalArgumentException on some Samsung devices, causing crashes.
            Log.w(TAG, "Destroy threw exception " + mRecognizer, e);
        }
        mRecognizer = null;
        SpeechRecognitionImplJni.get()
                .onRecognitionEnd(mNativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl.this);
        mNativeSpeechRecognizerImplAndroid = 0;
    }

    @CalledByNative
    private static SpeechRecognitionImpl createSpeechRecognition(
            long nativeSpeechRecognizerImplAndroid) {
        return new SpeechRecognitionImpl(nativeSpeechRecognizerImplAndroid);
    }

    @CalledByNative
    private void startRecognition(String language, boolean continuous, boolean interimResults) {
        if (mRecognizer == null) return;

        mContinuous = continuous;
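        // DICTATION_MODE is not part of the public RecognizerIntent API; it is understood by
        // Google's recognition provider and requests continuous (dictation-style) recognition.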
        mIntent.putExtra("android.speech.extra.DICTATION_MODE", continuous);
        mIntent.putExtra(RecognizerIntent.EXTRA_LANGUAGE, language);
        mIntent.putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, interimResults);
        try {
            mRecognizer.startListening(mIntent);
        } catch (SecurityException e) {
            Context context = ContextUtils.getApplicationContext();
            String msg =
                    context.getResources().getString(R.string.speech_recognition_service_not_found);
            Toast.makeText(context, msg, Toast.LENGTH_SHORT).show();
        }
    }

    @CalledByNative
    private void abortRecognition() {
        if (mRecognizer == null) return;

        mRecognizer.cancel();
        terminate(SpeechRecognitionErrorCode.ABORTED);
    }

    @CalledByNative
    private void stopRecognition() {
        if (mRecognizer == null) return;

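        // Clear the continuous flag so the remaining end-of-speech and result events are handled
        // as in non-continuous mode (see onEndOfSpeech and handleResults).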
        mContinuous = false;
        mRecognizer.stopListening();
    }

    @NativeMethods
    interface Natives {
        // Native JNI calls to content/browser/speech/speech_recognizer_impl_android.cc
        void onAudioStart(long nativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl caller);

        void onSoundStart(long nativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl caller);

        void onSoundEnd(long nativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl caller);

        void onAudioEnd(long nativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl caller);

        void onRecognitionResults(
                long nativeSpeechRecognizerImplAndroid,
                SpeechRecognitionImpl caller,
                String[] results,
                float[] scores,
                boolean provisional);

        void onRecognitionError(
                long nativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl caller, int error);

        void onRecognitionEnd(long nativeSpeechRecognizerImplAndroid, SpeechRecognitionImpl caller);
    }
}