// dictation.ts | Explore in Territory

// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

import {Context} from '/common/action_fulfillment/context_checker.js';
import {Macro} from '/common/action_fulfillment/macros/macro.js';
import {MacroName} from '/common/action_fulfillment/macros/macro_names.js';
import {TestImportManager} from '/common/testing/test_import_manager.js';

import {FocusHandler} from './focus_handler.js';
import {InputControllerImpl} from './input_controller_impl.js';
import {LocaleInfo} from './locale_info.js';
import {MetricsUtils} from './metrics_utils.js';
import {SpeechParser} from './parse/speech_parser.js';
import {HintContext, UIController, UIState} from './ui_controller.js';

// Short local aliases for Chrome extension API names used in this file.
// The `type` aliases are used only in type positions. The `import` aliases are
// also read as runtime values in this file (`StreamType.INPUT`,
// `SpeechRecognitionType.NETWORK`, `ToastType.DICTATION_MIC_MUTED`, ...), so
// they must stay `import` aliases rather than `type` aliases.
type ErrorEvent = chrome.speechRecognitionPrivate.SpeechRecognitionErrorEvent;
type ResultEvent = chrome.speechRecognitionPrivate.SpeechRecognitionResultEvent;
type StartOptions = chrome.speechRecognitionPrivate.StartOptions;
type StopEvent = chrome.speechRecognitionPrivate.SpeechRecognitionStopEvent;
import StreamType = chrome.audio.StreamType;
import SpeechRecognitionType = chrome.speechRecognitionPrivate.SpeechRecognitionType;
type PrefObject = chrome.settingsPrivate.PrefObject;
import ToastType = chrome.accessibilityPrivate.ToastType;

/**
 * Main class for the Chrome OS dictation feature.
 *
 * Connects the speechRecognitionPrivate, settingsPrivate and
 * accessibilityPrivate listeners to an input controller, a UI controller and a
 * speech parser, and tracks whether a Dictation session is currently active.
 * TODO(b/314204374): Eliminate instances of null.
 */
export class Dictation {
  private inputController_: InputControllerImpl|null = null;
  private uiController_: UIController|null = null;
  private speechParser_: SpeechParser|null = null;
  /** Whether or not Dictation is active. */
  private active_ = false;
  /** Played by `stopDictation_` when there is no pending interim text. */
  private cancelTone_: HTMLAudioElement|null =
      new Audio('dictation/earcons/null_selection.wav');
  /** Played once speech recognition has started. */
  private startTone_: HTMLAudioElement|null =
      new Audio('dictation/earcons/audio_initiate.wav');
  /** Played by `stopDictation_` when there is pending interim text. */
  private endTone_: HTMLAudioElement|null =
      new Audio('dictation/earcons/audio_end.wav');
  /**
   * Timeout applied after a final result; chosen in
   * `onSpeechRecognitionStarted_` from the recognition type.
   */
  private noSpeechTimeoutMs_: number = Dictation.Timeouts.NO_SPEECH_NETWORK_MS;
  /** Id of the pending auto-stop timer, or null if none is scheduled. */
  private stopTimeoutId_: number|null = null;
  /** Latest interim transcript that has not been committed or discarded. */
  private interimText_ = '';
  private chromeVoxEnabled_ = false;
  private speechRecognitionOptions_: StartOptions|null = null;
  private metricsUtils_: MetricsUtils|null = null;
  private focusHandler_: FocusHandler|null = null;
  // API Listeners //
  // Stored so that `onDictationDisabled` can remove exactly the functions that
  // `initialize_` registered.
  private speechRecognitionStopListener_:
      ((event: StopEvent) => void)|null = null;
  private speechRecognitionResultListener_:
      ((event: ResultEvent) => Promise<void>)|null = null;
  private speechRecognitionErrorListener_:
      ((event: ErrorEvent) => void)|null = null;
  private prefsListener_: ((prefs: PrefObject[]) => void)|null = null;
  private onToggleDictationListener_: ((active: boolean) => void)|null = null;
  /** When true, a failed `Macro.checkContext()` blocks the macro from running. */
  private isContextCheckingFeatureEnabled_ = false;
  /** Last non-repeat macro seen; replayed when the RepeatMacro is parsed. */
  private prevMacro_: Macro|null = null;

  constructor() {
    this.initialize_();
  }

  /** Sets up Dictation's speech recognizer and various listeners. */
  private initialize_(): void {
    this.focusHandler_ = new FocusHandler();
    this.inputController_ = new InputControllerImpl(
        () => this.stopDictation_(/*notify=*/ true), this.focusHandler_);
    this.uiController_ = new UIController();
    this.speechParser_ = new SpeechParser(this.inputController_);
    this.speechParser_.refresh();

    // Set default speech recognition properties. Locale will be updated when
    // `updateFromPrefs_` is called.
    this.speechRecognitionOptions_ = {
      locale: 'en-US',
      interimResults: true,
    };

    this.speechRecognitionStopListener_ = () =>
        this.onSpeechRecognitionStopped_();
    this.speechRecognitionResultListener_ = event =>
        this.onSpeechRecognitionResult_(event);
    this.speechRecognitionErrorListener_ = () =>
        this.onSpeechRecognitionError_();
    this.prefsListener_ = prefs => this.updateFromPrefs_(prefs);
    this.onToggleDictationListener_ = activated =>
        this.onToggleDictation_(activated);

    // Setup speechRecognitionPrivate API listeners.
    chrome.speechRecognitionPrivate.onStop.addListener(
        this.speechRecognitionStopListener_);
    chrome.speechRecognitionPrivate.onResult.addListener(
        this.speechRecognitionResultListener_);
    chrome.speechRecognitionPrivate.onError.addListener(
        this.speechRecognitionErrorListener_);

    // Read the current prefs once, then keep following changes.
    chrome.settingsPrivate.getAllPrefs(prefs => this.updateFromPrefs_(prefs));
    chrome.settingsPrivate.onPrefsChanged.addListener(this.prefsListener_);

    // Listen for Dictation toggles (activated / deactivated) from the Ash
    // Browser process.
    chrome.accessibilityPrivate.onToggleDictation.addListener(
        this.onToggleDictationListener_);

    const contextCheckingFeature =
        chrome.accessibilityPrivate.AccessibilityFeature
            .DICTATION_CONTEXT_CHECKING;
    chrome.accessibilityPrivate.isFeatureEnabled(
        contextCheckingFeature, enabled => {
          this.isContextCheckingFeatureEnabled_ = enabled;
        });
  }

  /**
   * Performs any destruction before dictation object is destroyed: removes
   * every listener registered by `initialize_` and the input controller's own
   * listeners.
   */
  onDictationDisabled(): void {
    if (this.speechRecognitionStopListener_) {
      chrome.speechRecognitionPrivate.onStop.removeListener(
          this.speechRecognitionStopListener_);
    }
    if (this.speechRecognitionResultListener_) {
      chrome.speechRecognitionPrivate.onResult.removeListener(
          this.speechRecognitionResultListener_);
    }
    if (this.speechRecognitionErrorListener_) {
      chrome.speechRecognitionPrivate.onError.removeListener(
          this.speechRecognitionErrorListener_);
    }
    if (this.prefsListener_) {
      chrome.settingsPrivate.onPrefsChanged.removeListener(this.prefsListener_);
    }
    if (this.onToggleDictationListener_) {
      chrome.accessibilityPrivate.onToggleDictation.removeListener(
          this.onToggleDictationListener_);
    }
    if (this.inputController_) {
      this.inputController_.removeListeners();
    }
  }

  /**
   * Called when Dictation is toggled. Starts a session only when asked to
   * activate while inactive; every other combination stops the session
   * without notifying the browser.
   * @param activated Whether Dictation was just activated.
   */
  private onToggleDictation_(activated: boolean): void {
    if (activated && !this.active_) {
      this.startDictation_();
    } else {
      this.stopDictation_(/*notify=*/ false);
    }
  }

  /**
   * Marks Dictation active, arms the "no focused IME" timeout and asks the
   * input controller to connect. Startup continues in
   * `verifyMicrophoneNotMuted_` once the connection callback fires.
   */
  private startDictation_(): void {
    this.active_ = true;
    if (this.chromeVoxEnabled_) {
      // Silence ChromeVox in case it was speaking. It can speak over the start
      // tone and also cause a feedback loop if the user is not using
      // headphones. This does not stop ChromeVox from speaking additional
      // utterances added to the queue later.
      chrome.accessibilityPrivate.silenceSpokenFeedback();
    }
    this.setStopTimeout_(
        Dictation.Timeouts.NO_FOCUSED_IME_MS,
        Dictation.StopReason.NO_FOCUSED_IME);
    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.inputController_!.connect(() => this.verifyMicrophoneNotMuted_());
  }

  /**
   * Checks if the microphone is muted. If it is, then we stop Dictation and
   * show a notification to the user. If the microphone isn't muted, then we
   * proceed to start speech recognition. Because this is async, this method
   * checks that startup state is still correct before proceeding.
   */
  private verifyMicrophoneNotMuted_(): void {
    if (!this.active_) {
      // `stopDictation_` returns early when Dictation is already inactive, so
      // this call only matters if that guard ever changes.
      this.stopDictation_(/*notify=*/ true);
      return;
    }

    // TODO(b:299677121): Determine if it's possible for no mics to be
    // available. If that scenario is possible, we may have to use
    // `chrome.audio.getDevices` and verify that there's at least one input
    // device.
    chrome.audio.getMute(StreamType.INPUT, (muted: boolean) => {
      if (muted) {
        this.stopDictation_(/*notify=*/ true);
        chrome.accessibilityPrivate.showToast(ToastType.DICTATION_MIC_MUTED);
        return;
      }

      this.maybeStartSpeechRecognition_();
    });
  }

  /**
   * Called when Dictation has set itself as the IME during start-up. Because
   * this is async, checks that startup state is still correct before starting
   * speech recognition.
   */
  private maybeStartSpeechRecognition_(): void {
    if (this.active_) {
      // TODO(b/314203187): Determine if not null assertion is acceptable.
      chrome.speechRecognitionPrivate.start(
          this.speechRecognitionOptions_!,
          (type: SpeechRecognitionType) =>
              this.onSpeechRecognitionStarted_(type));
    } else {
      // We are no longer starting up - perhaps a stop came
      // through during the async callbacks. Ensure cleanup
      // by calling stopDictation_().
      this.stopDictation_(/*notify=*/ true);
    }
  }

  /**
   * Stops Dictation and notifies the browser. Does nothing if Dictation is
   * already inactive. Otherwise it stops speech recognition, plays the end
   * tone (pending interim text) or the cancel tone (none), clears the stop
   * timer, commits the pending interim text, hides the UI and disconnects the
   * input controller.
   * @param notify True if we should notify the browser that Dictation
   * stopped.
   */
  private stopDictation_(notify: boolean): void {
    if (!this.active_) {
      return;
    }

    this.active_ = false;
    // Stop speech recognition.
    chrome.speechRecognitionPrivate.stop({}, () => {});
    if (this.interimText_) {
      // TODO(b/314203187): Determine if not null assertion is acceptable.
      this.endTone_!.play();
    } else {
      // TODO(b/314203187): Determine if not null assertion is acceptable.
      this.cancelTone_!.play();
    }

    // Clear any timeouts.
    this.clearStopTimeout_();

    // Commit before hiding the UI: `hideCommandsUI_` resets `interimText_`.
    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.inputController_!.commitText(this.interimText_);
    this.hideCommandsUI_();
    this.inputController_!.disconnect();
    Dictation.removeAsInputMethod();

    // Notify the browser that Dictation turned off.
    if (notify) {
      chrome.accessibilityPrivate.toggleDictation();
    }
  }

  /**
   * Sets the timeout to stop Dictation, replacing any timer that is already
   * pending.
   * @param durationMs Delay before Dictation is stopped.
   * @param reason Optional reason for why Dictation
   *     stopped automatically.
   */
  private setStopTimeout_(durationMs: number, reason?: Dictation.StopReason):
      void {
    this.clearStopTimeout_();
    this.stopTimeoutId_ = setTimeout(() => {
      this.stopDictation_(/*notify=*/ true);

      if (reason === Dictation.StopReason.NO_FOCUSED_IME) {
        chrome.accessibilityPrivate.showToast(
            ToastType.DICTATION_NO_FOCUSED_TEXT_FIELD);
      }
    }, durationMs);
  }

  /**
   * Called when the Speech Recognition engine receives a recognition event.
   * Ignored while Dictation is inactive. Otherwise re-arms the stop timer
   * (the no-speech timeout after a final result, the shorter no-new-speech
   * timeout after an interim one) and processes the transcript.
   */
  private async onSpeechRecognitionResult_(event: ResultEvent): Promise<void> {
    if (!this.active_) {
      return;
    }

    const transcript = event.transcript;
    const isFinal = event.isFinal;
    this.setStopTimeout_(
        isFinal ? this.noSpeechTimeoutMs_ :
                  Dictation.Timeouts.NO_NEW_SPEECH_MS);
    await this.processSpeechRecognitionResult_(transcript, isFinal);
  }

  /**
   * Processes a speech recognition result. Interim results are only shown in
   * the UI. Final results are parsed into a macro, which is context-checked,
   * run, and reported in the UI as succeeded or failed.
   * @param transcript The recognized text.
   * @param isFinal Whether this is a finalized transcript or an
   *     interim result.
   */
  private async processSpeechRecognitionResult_(
      transcript: string, isFinal: boolean): Promise<void> {
    if (!isFinal) {
      this.showInterimText_(transcript);
      return;
    }

    // TODO(b/314203187): Determine if not null assertion is acceptable.
    let macro = await this.speechParser_!.parse(transcript);
    if (!this.active_) {
      // Dictation was stopped while the transcript was being parsed.
      // `stopDictation_` has already hidden the UI and disconnected the input
      // controller, so running the macro now would act on a finished session
      // and re-show a bubble after Dictation turned off. Drop the result.
      return;
    }
    MetricsUtils.recordMacroRecognized(macro);
    macro = this.handleRepeat_(macro);

    // Check if the macro can execute.
    // TODO(crbug.com/1264544): Deal with ambiguous results here.
    const checkContextResult = macro.checkContext();
    if (!checkContextResult.canTryAction &&
        this.isContextCheckingFeatureEnabled_) {
      this.showMacroExecutionFailed_(
          macro, transcript, checkContextResult.failedContext);
      return;
    }

    // Try to run the macro.
    const runMacroResult = macro.run();
    if (!runMacroResult.isSuccess) {
      this.showMacroExecutionFailed_(macro, transcript);
      return;
    }
    if (macro.getName() === MacroName.LIST_COMMANDS) {
      // ListCommandsMacro opens a new tab, thereby changing the cursor focus
      // and ending the Dictation session.
      return;
    }

    // Provide feedback to the user that the macro executed successfully.
    this.showMacroExecuted_(macro, transcript);
  }

  /**
   * Called when Speech Recognition starts up and begins listening. Passed as
   * a callback to speechRecognitionPrivate.start().
   * @param type The type of speech recognition used.
   */
  private onSpeechRecognitionStarted_(type: SpeechRecognitionType): void {
    if (chrome.runtime.lastError) {
      // chrome.runtime.lastError will be set if the call to
      // speechRecognitionPrivate.start() caused an error. When this happens,
      // the speech recognition private API will turn the associated recognizer
      // off. To align with this, we should call `stopDictation_`.
      this.stopDictation_(/*notify=*/ true);
      return;
    }

    if (!this.active_) {
      return;
    }

    // Network and on-device recognition use different no-speech timeouts.
    this.noSpeechTimeoutMs_ = type === SpeechRecognitionType.NETWORK ?
        Dictation.Timeouts.NO_SPEECH_NETWORK_MS :
        Dictation.Timeouts.NO_SPEECH_ONDEVICE_MS;
    this.setStopTimeout_(this.noSpeechTimeoutMs_);

    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.startTone_!.play();
    this.clearInterimText_();

    // Record metrics.
    this.metricsUtils_ = new MetricsUtils(type, LocaleInfo.locale);
    this.metricsUtils_.recordSpeechRecognitionStarted();

    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.uiController_!.setState(
        UIState.STANDBY, {context: HintContext.STANDBY});
    this.focusHandler_!.refresh();
  }

  /**
   * Called when speech recognition stops or when speech recognition encounters
   * an error. Records the stop metric for the current session, if any, and
   * stops Dictation.
   */
  private onSpeechRecognitionStopped_(): void {
    if (this.metricsUtils_ !== null) {
      this.metricsUtils_.recordSpeechRecognitionStopped();
    }
    this.metricsUtils_ = null;

    // Stop dictation if it wasn't already stopped.
    this.stopDictation_(/*notify=*/ true);
  }

  /** Called on a speech recognition error; stops Dictation. */
  private onSpeechRecognitionError_(): void {
    // TODO: Dictation does not surface speech recognition errors to the user.
    // Informing the user of errors, for example lack of network connection or a
    // missing microphone, would be a useful feature.
    this.stopDictation_(/*notify=*/ true);
  }

  /**
   * Applies the prefs Dictation cares about: the dictation locale and whether
   * spoken feedback (ChromeVox) is enabled. All other prefs are ignored.
   * @param prefs Prefs to inspect.
   */
  private updateFromPrefs_(prefs: PrefObject[]): void {
    prefs.forEach(pref => {
      switch (pref.key) {
        case Dictation.DICTATION_LOCALE_PREF:
          if (pref.value) {
            const locale = pref.value;
            // TODO(b/314203187): Determine if not null assertion is acceptable.
            this.speechRecognitionOptions_!.locale = locale;
            LocaleInfo.locale = locale;
            this.speechParser_!.refresh();
          }
          break;
        case Dictation.SPOKEN_FEEDBACK_PREF:
          this.chromeVoxEnabled_ = Boolean(pref.value);
          // Use a longer hints timeout when ChromeVox is enabled.
          // TODO(b/314203187): Determine if not null assertion is acceptable.
          this.uiController_!.setHintsTimeoutDuration(this.chromeVoxEnabled_);
          break;
        default:
          return;
      }
    });
  }

  /** Shows the interim result in the UI and remembers it as pending text. */
  private showInterimText_(text: string): void {
    // TODO(crbug.com/40792919): Need to find a way to show interim text that is
    // only whitespace. Google Cloud Speech can return a newline character
    // although SODA does not seem to do that. The newline character looks wrong
    // here.
    this.interimText_ = text;
    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.uiController_!.setState(UIState.RECOGNIZING_TEXT, {text});
  }

  /** Clears the interim result in the UI. */
  private clearInterimText_(): void {
    this.interimText_ = '';
    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.uiController_!.setState(UIState.STANDBY);
  }

  /**
   * Shows that a macro was executed in the UI by putting a checkmark next to
   * the transcript. Text-input and new-line macros instead return the UI to
   * standby with a "text committed" hint context.
   * @param macro The macro that ran successfully.
   * @param transcript The transcript that produced the macro.
   */
  private showMacroExecuted_(macro: Macro, transcript: string): void {
    MetricsUtils.recordMacroSucceeded(macro);

    if (macro.getName() === MacroName.INPUT_TEXT_VIEW ||
        macro.getName() === MacroName.NEW_LINE) {
      this.clearInterimText_();
      // TODO(b/314203187): Determine if not null assertion is acceptable.
      this.uiController_!.setState(
          UIState.STANDBY, {context: HintContext.TEXT_COMMITTED});
      return;
    }
    this.interimText_ = '';
    const context = macro.getName() === MacroName.SELECT_ALL_TEXT ?
        HintContext.TEXT_SELECTED :
        HintContext.MACRO_SUCCESS;
    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.uiController_!.setState(
        UIState.MACRO_SUCCESS, {text: transcript, context});
  }

  /**
   * Shows a message in the UI that a command failed to execute.
   * TODO(crbug.com/40792919): Optionally use the MacroError to provide
   * additional context.
   * @param macro The macro that failed; used for metrics.
   * @param transcript The user's spoken transcript, shown so they
   *     understand the final speech recognized which might be helpful in
   *     understanding why the command failed.
   * @param failedContext The context check that failed, if any. When given,
   *     the message includes a reason derived from it.
   */
  private showMacroExecutionFailed_(
      macro: Macro, transcript: string, failedContext?: Context): void {
    MetricsUtils.recordMacroFailed(macro);

    this.interimText_ = '';
    let text = '';
    if (!failedContext) {
      text = chrome.i18n.getMessage(
          'dictation_command_failed_generic', [transcript]);
    } else {
      const reason = Dictation.getFailedContextReason(failedContext);
      text = chrome.i18n.getMessage(
          'dictation_command_failed_with_reason', [transcript, reason]);
    }

    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.uiController_!.setState(UIState.MACRO_FAIL, {
      text,
      context: HintContext.STANDBY,
    });
  }

  /**
   * Hides the commands UI bubble and discards any pending interim text.
   */
  /* eslint-disable-next-line @typescript-eslint/naming-convention */
  private hideCommandsUI_(): void {
    this.interimText_ = '';
    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.uiController_!.setState(UIState.HIDDEN);
  }

  /** Cancels the pending auto-stop timer, if there is one. */
  private clearStopTimeout_(): void {
    if (this.stopTimeoutId_ !== null) {
      clearTimeout(this.stopTimeoutId_);
      this.stopTimeoutId_ = null;
    }
  }

  /**
   * Removes AccessibilityCommon as an input method so it doesn't show up in
   * the shelf input method picker UI.
   */
  static removeAsInputMethod(): void {
    chrome.languageSettingsPrivate.removeInputMethod(
        InputControllerImpl.IME_ENGINE_ID);
  }

  /**
   * Used to set the NO_FOCUSED_IME_MS timeout for testing purposes only.
   * Note that this overwrites the shared `Dictation.Timeouts` value.
   */
  setNoFocusedImeTimeoutForTesting(duration: number): void {
    Dictation.Timeouts.NO_FOCUSED_IME_MS = duration;
  }

  /**
   * Resolves the RepeatMacro to the previously seen macro.
   * @param macro The macro that was just parsed.
   * @return `macro` itself when it is not the RepeatMacro (it is also
   *     remembered for later repeats) or when nothing has been remembered yet;
   *     otherwise the remembered macro.
   */
  private handleRepeat_(macro: Macro): Macro {
    if (macro.getName() !== MacroName.REPEAT) {
      // If this macro is not the RepeatMacro, save it and return the existing
      // macro.
      this.prevMacro_ = macro;
      return macro;
    }

    // Handle cases where `macro` is the RepeatMacro.
    if (!this.prevMacro_) {
      // If there is no previous macro, return the RepeatMacro.
      return macro;
    }

    // Otherwise, return the previous macro.
    return this.prevMacro_;
  }

  /** Disables Pumpkin for tests that use regex-based command parsing. */
  disablePumpkinForTesting(): void {
    // TODO(b/314203187): Determine if not null assertion is acceptable.
    this.speechParser_!.disablePumpkinForTesting();
  }

  /**
   * Returns the localized reason string for a failed context check.
   * @param context The context check that failed.
   * @return The i18n message associated with `context`.
   * @throws Error if `context` has no associated message.
   */
  static getFailedContextReason(context: Context): string {
    switch (context) {
      case Context.INACTIVE_INPUT_CONTROLLER:
        return chrome.i18n.getMessage(
            'dictation_context_error_reason_inactive_input_controller');
      case Context.EMPTY_EDITABLE:
        return chrome.i18n.getMessage(
            'dictation_context_error_reason_empty_editable');
      case Context.NO_SELECTION:
        return chrome.i18n.getMessage(
            'dictation_context_error_reason_no_selection');
      case Context.INVALID_INPUT:
        return chrome.i18n.getMessage(
            'dictation_context_error_reason_invalid_input');
      case Context.NO_PREVIOUS_MACRO:
        return chrome.i18n.getMessage(
            'dictation_context_error_reason_no_previous_macro');
      default:
        break;
    }

    throw new Error(
        'Cannot get error message for unsupported context: ' + context);
  }
}

export namespace Dictation {
  /** Milliseconds per second; keeps the durations below readable. */
  const SECOND_MS = 1000;

  /** Key of the pref that holds the Dictation locale. */
  export const DICTATION_LOCALE_PREF: string = 'settings.a11y.dictation_locale';

  /** Key of the pref that says whether ChromeVox is enabled. */
  export const SPOKEN_FEEDBACK_PREF: string = 'settings.accessibility';

  /**
   * Durations, in milliseconds, after which Dictation stops on its own. The
   * object is intentionally left mutable so individual entries can be
   * overridden at runtime.
   */
  export const Timeouts = {
    NO_SPEECH_NETWORK_MS: 10 * SECOND_MS,
    NO_SPEECH_ONDEVICE_MS: 20 * SECOND_MS,
    NO_NEW_SPEECH_MS: 5 * SECOND_MS,
    NO_FOCUSED_IME_MS: 1 * SECOND_MS,
  };

  /** Reasons for which Dictation stops automatically. */
  export enum StopReason {
    NO_FOCUSED_IME = 'Dictation stopped automatically: No focused IME',
  }
}

// Hands the Dictation class to TestImportManager. Presumably this is what
// makes the class reachable from test code — confirm against
// test_import_manager.js.
TestImportManager.exportForTesting(Dictation);
// chromium/chrome/browser/resources/chromeos/accessibility/accessibility_common/dictation/dictation.ts