// soda.ts | Explore in Territory

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

import {assertExists} from '../utils/assert.js';
import {Infer, z} from '../utils/schema.js';
import {lazyInit, sliceWhen} from '../utils/utils.js';

import {
  FinalResult,
  PartialResult,
  SodaEvent,
  SpeakerLabelCorrectionEvent,
  TimeDelta,
  TimingInfo,
} from './types.js';

// Schema for a time range whose two endpoints are expressed in milliseconds.
//
// Within this file the ranges are produced by `parseTimingInfo` and
// `flattenEvent`, which add the caller-supplied `offsetMs` to the values
// reported in the SODA event.
export const timeRangeSchema = z.object({
  startMs: z.number(),
  endMs: z.number(),
});
// Static type derived from `timeRangeSchema`.
export type TimeRange = Infer<typeof timeRangeSchema>;

// Schema for a single piece of transcribed text (one hypothesis part of a SODA
// result), tagged with `kind: 'textPart'` so it can be told apart from a
// `textSeparator` token.
export const textPartSchema = z.object({
  kind: z.literal('textPart'),
  // The transcribed text of this part.
  text: z.string(),
  // Time range of this part in milliseconds, or null when the timing is not
  // available.
  timeRange: z.nullable(timeRangeSchema),
  // Whether a space should precede this part when joining parts into plain
  // text. `Transcription.toPlainText` treats null the same as true.
  leadingSpace: z.nullable(z.boolean()),
  // Speaker label of this part. `flattenEvent` stores null here when speaker
  // label is not enabled.
  speakerLabel: z.autoNullOptional(z.string()),
  // Set to true only for parts that come from a partialResult, and left out
  // otherwise. Since the transcription saved to the disk is always made of
  // finalResult, omitting the field there saves some disk space.
  partial: z.optional(z.literal(true)),
});

// Static type derived from `textPartSchema`.
export type TextPart = Infer<typeof textPartSchema>;

// Schema for a marker token that carries no text and separates the text parts
// of one result from the next. `Transcription.toPlainText` renders it as a
// newline and `Transcription.getParagraphs` always splits on it.
export const textSeparatorSchema = z.object({
  kind: z.literal('textSeparator'),
});
// Static type derived from `textSeparatorSchema`.
export type TextSeparator = Infer<typeof textSeparatorSchema>;
// Shared separator value, so callers don't need to build a new object each
// time a separator is inserted.
export const textSeparator: TextSeparator = {
  kind: 'textSeparator',
};

// Schema for one token of a transcription: either a text part or a separator.
// The two variants are distinguished by their `kind` field.
export const textTokenSchema = z.union([textPartSchema, textSeparatorSchema]);
// Static type derived from `textTokenSchema`.
export type TextToken = Infer<typeof textTokenSchema>;

/**
 * Converts a `TimeDelta` (expressed in microseconds) to a number of
 * milliseconds.
 *
 * A null input is passed through as null; the overloads let callers that pass
 * a non-null value get a plain `number` back.
 */
function toMs(timeDelta: TimeDelta): number;
function toMs(timeDelta: TimeDelta|null): number|null;
function toMs(timeDelta: TimeDelta|null): number|null {
  return timeDelta === null ? null : Number(timeDelta.microseconds) / 1e3;
}

/**
 * Converts the timing info of a SODA event into a `TimeRange` in
 * milliseconds, shifted by `offsetMs`.
 *
 * @param timingInfo Timing info of the event, or null if there's none.
 * @param offsetMs Offset in milliseconds added to both ends of the range.
 * @return The shifted time range, or null when `timingInfo` is null.
 */
function parseTimingInfo(
  timingInfo: TimingInfo|null,
  offsetMs: number,
): TimeRange|null {
  if (timingInfo === null) {
    return null;
  }
  const startMs = toMs(timingInfo.audioStartTime) + offsetMs;
  const endMs = toMs(timingInfo.eventEndTime) + offsetMs;
  return {startMs, endMs};
}

/**
 * Flattens a final or partial SODA result into a list of `TextPart`.
 *
 * Each hypothesis part of the result becomes one `TextPart`. The alignment of
 * a part is treated as relative to the start of the event: a part spans from
 * its own alignment to the alignment of the following part, and the last part
 * ends where the event ends.
 *
 * @param ev The final or partial result to flatten.
 * @param offsetMs Offset in milliseconds added to the event timing.
 * @param speakerLabelEnabled Whether the speaker label of each part is kept.
 *     When false, `speakerLabel` of every returned part is null.
 * @param isPartialResult Whether `ev` is a partial result. When true every
 *     returned part has `partial` set to true.
 * @return The flattened parts, or an empty array when the result has no
 *     hypothesis parts.
 */
function flattenEvent(
  ev: FinalResult|PartialResult,
  offsetMs: number,
  speakerLabelEnabled: boolean,
  isPartialResult = false,
): TextPart[] {
  const {hypothesisPart: parts, timingEvent} = ev;

  const eventRange = parseTimingInfo(timingEvent, offsetMs);
  if (eventRange === null) {
    // TODO(pihsun): Check if this can actually happen.
    console.error('soda event has no timestamp', ev);
  }

  if (parts === null || parts.length === 0) {
    return [];
  }

  const lastIndex = parts.length - 1;
  return parts.map((part, index): TextPart => {
    // The time range stays null unless the event timing, the alignment of this
    // part, and the end of this part are all known.
    let timeRange: TimeRange|null = null;
    if (eventRange !== null && part.alignment !== null) {
      const relativeStartMs = toMs(part.alignment);
      // A part ends where the next part starts; the last part ends at the end
      // of the whole event.
      const relativeEndMs = index === lastIndex ?
        eventRange.endMs - eventRange.startMs :
        toMs(assertExists(parts[index + 1]).alignment);
      if (relativeEndMs !== null) {
        // TODO(pihsun): Have a "time" type so we don't have to remember
        // which number is in which unit.
        timeRange = {
          startMs: relativeStartMs + eventRange.startMs,
          endMs: relativeEndMs + eventRange.startMs,
        };
      }
    }

    return {
      kind: 'textPart',
      text: assertExists(part.text[0]),
      timeRange,
      leadingSpace: part.leadingSpace,
      speakerLabel: speakerLabelEnabled ? part.speakerLabel : null,
      partial: isPartialResult ? true : undefined,
    };
  });
}

/**
 * Transforms the raw soda events into a form that's more easily usable by UI.
 *
 * Events are fed in through `addEvent`, and the transcription accumulated so
 * far can be retrieved at any time with `getTranscription`.
 */
export class SodaEventTransformer {
  // Tokens from all final results received so far, with a `textSeparator`
  // between the tokens of consecutive non-empty results.
  private readonly tokens: TextToken[] = [];

  // The last tokens from the PartialResult in SodaEvent with partial result.
  // Reset to null whenever a final result arrives.
  private partialResultTokens: TextToken[]|null = null;

  /**
   * @param speakerLabelEnabled Whether speaker labels are kept on tokens and
   *     speaker label correction events are applied.
   */
  constructor(private readonly speakerLabelEnabled: boolean) {}

  /**
   * Returns the transcription built from all final results so far, followed
   * by the tokens of the latest partial result if there are any.
   *
   * The token array is copied, so later events don't change the length of an
   * already returned transcription. The token objects themselves are shared.
   */
  getTranscription(): Transcription {
    const tokens = [...this.tokens];
    const partialTokens = this.partialResultTokens;
    // An empty partial result contributes nothing, so no separator is added
    // for it either. Otherwise the transcription would end with a dangling
    // separator.
    if (partialTokens !== null && partialTokens.length > 0) {
      if (tokens.length > 0) {
        tokens.push(textSeparator);
      }
      tokens.push(...partialTokens);
    }
    return new Transcription(tokens);
  }

  /**
   * Applies a speaker label correction to the previously added final tokens.
   *
   * For each corrected part, the most recent token with the same start time
   * and the same text gets its speaker label replaced. Parts without a
   * timestamp or without a matching token are reported via `console.error`
   * and skipped.
   *
   * @param ev The speaker label correction event.
   * @param offsetMs Offset in milliseconds added to the alignment of each
   *     corrected part before it is compared with the token start time.
   */
  private handleSpeakerLabelCorrectionEvent(
    ev: SpeakerLabelCorrectionEvent,
    offsetMs: number,
  ) {
    if (!this.speakerLabelEnabled) {
      // Don't handle speaker label correction event when it's not enabled.
      return;
    }
    const {hypothesisParts} = ev;
    for (const correctionPart of hypothesisParts) {
      const speakerLabel = correctionPart.speakerLabel ?? null;
      const startMs = toMs(correctionPart.alignment);
      if (startMs === null) {
        console.error('speaker label correction event without timestamp', ev);
        continue;
      }
      // We search backward since it's more likely that the corrected token is
      // recent.
      // TODO(pihsun): assert that the tokens have increasing timestamp, and
      // binary search for efficiency.
      let found = false;
      for (let i = this.tokens.length - 1; i >= 0; i--) {
        const token = assertExists(this.tokens[i]);
        if (token.kind === 'textSeparator') {
          continue;
        }
        // NOTE(review): this is an exact equality on floating-point
        // milliseconds; confirm that the correction alignment always
        // reproduces the token's startMs exactly.
        if (token.timeRange?.startMs === startMs + offsetMs &&
            token.text === correctionPart.text[0]) {
          // TODO(pihsun): This inline updates this.tokens, which works now
          // since getTranscription always return a copy, but ideally we want
          // either immutable update, or signal/proxy with nested change
          // detection, or have a clearer boundary on which values (especially
          // object/array) should be immutably updated for lit change
          // detection.
          token.speakerLabel = speakerLabel;
          found = true;
          break;
        }
      }
      if (!found) {
        console.error(
          'speaker label correction event without corresponding previous part?',
          ev,
        );
      }
    }
  }

  /**
   * Adds a SODA event.
   * An offset can be passed to shift the timestamp in the event, since the
   * transcription can be stopped and started while recording.
   *
   * @param event The SODA event.
   * @param offsetMs Offset of the start of the SODA session in milliseconds.
   */
  addEvent(event: SodaEvent, offsetMs: number): void {
    if ('partialResult' in event) {
      this.partialResultTokens = flattenEvent(
        event.partialResult,
        offsetMs,
        this.speakerLabelEnabled,
        /* isPartialResult= */ true,
      );
      // Don't update tokens since it'll be added in getTranscription.
      return;
    }
    if ('finalResult' in event) {
      // New final result, remove the partial result event.
      this.partialResultTokens = null;
      const {finalResult} = event;
      const newTokens =
        flattenEvent(finalResult, offsetMs, this.speakerLabelEnabled);
      if (newTokens.length === 0) {
        // A result without any text adds nothing. Skipping it also avoids
        // leaving a dangling separator or two separators in a row.
        return;
      }
      if (this.tokens.length > 0) {
        this.tokens.push(textSeparator);
      }
      this.tokens.push(...newTokens);
      return;
    }
    if ('labelCorrectionEvent' in event) {
      this.handleSpeakerLabelCorrectionEvent(
        event.labelCorrectionEvent,
        offsetMs,
      );
      return;
    }
    console.error('unknown event type', event);
  }
}

// Schema converting between a `Transcription` (or null) and its serialized
// form, an object holding the list of text tokens.
export const transcriptionSchema = z.transform(
  z.object({
    // The transcription, as a list of text tokens.
    //
    // Transcription can be turned on and off while recording, so `textTokens`
    // might cover only the parts of the recording where it was on.
    //
    // When transcription was never on during the recording, `textTokens` is
    // null (so that playback view can show a different state).
    textTokens: z.nullable(z.array(textTokenSchema)),
  }),
  {
    // Presumably used by the schema library to recognize a decoded value;
    // see `z.transform` in ../utils/schema.js to confirm.
    test(input) {
      return input instanceof Transcription;
    },
    // A null token list decodes to null rather than to an empty
    // `Transcription`.
    decode({textTokens}) {
      return textTokens === null ? null : new Transcription(textTokens);
    },
    // The inverse of `decode`: null encodes to a null token list.
    encode(val) {
      return {textTokens: val === null ? null : val.textTokens};
    },
  },
);

// Maximum length, in UTF-16 code units, of the string returned by
// `Transcription.toShortDescription`, including the trailing '...'.
const MAX_DESCRIPTION_LENGTH = 512;

/**
 * A transcription, represented as a list of text tokens.
 *
 * The derived representations below are wrapped in `lazyInit` (presumably
 * computed on first use and cached; see ../utils/utils.js).
 */
export class Transcription {
  /**
   * @param textTokens The tokens of the transcription, in order.
   */
  constructor(readonly textTokens: TextToken[]) {}

  /**
   * Returns whether the transcription has no tokens at all.
   */
  isEmpty(): boolean {
    return this.textTokens.length === 0;
  }

  /**
   * Concatenates textTokens into the string representation of the
   * transcription.
   *
   * Separators become newlines. Text parts within the same paragraph are
   * joined with a space unless the part has `leadingSpace` set to false.
   *
   * This is also used to export the transcription into a txt file.
   *
   * TODO(pihsun): Have a different function for exporting to text format and
   * when exporting representation used for summary input.
   * TODO(pihsun): Include speaker label in the output.
   */
  toPlainText = lazyInit((): string => {
    const ret: string[] = [];
    let startOfParagraph = true;
    // TODO(pihsun): This currently don't include the speaker label, but since
    // the speaker label is a little bit not accurate on the start of sentence,
    // including it might make the result weird.
    for (const token of this.textTokens) {
      if (token.kind === 'textSeparator') {
        ret.push('\n');
        startOfParagraph = true;
        continue;
      }
      // No space before the first part of a paragraph. A null leadingSpace is
      // treated as true.
      if (!startOfParagraph && (token.leadingSpace ?? true)) {
        ret.push(' ');
      }
      ret.push(token.text);
      startOfParagraph = false;
    }
    return ret.join('');
  });

  /**
   * Returns the plain text of the transcription, truncated to at most
   * `MAX_DESCRIPTION_LENGTH` UTF-16 code units.
   *
   * A truncated description ends with '...'. The cut never falls inside a
   * surrogate pair, so the result doesn't contain a lone surrogate introduced
   * by the truncation.
   */
  toShortDescription = lazyInit((): string => {
    // Defensive guard: the declared type doesn't allow null here.
    if (this.textTokens === null) {
      return '';
    }
    const transcription = this.toPlainText();
    const ellipsis = '...';
    let cutoff = MAX_DESCRIPTION_LENGTH - ellipsis.length;
    if (transcription.length <= cutoff) {
      return transcription;
    }
    // If the last kept code unit is a high surrogate, the matching low
    // surrogate would be cut off. Drop the high surrogate too instead of
    // leaving half of a character in the description.
    const lastCodeUnit = transcription.charCodeAt(cutoff - 1);
    if (lastCodeUnit >= 0xd800 && lastCodeUnit <= 0xdbff) {
      cutoff -= 1;
    }
    return transcription.substring(0, cutoff) + ellipsis;
  });

  /**
   * Gets the list of speaker label in the transcription.
   *
   * The returned label is ordered by the first appearance of the label in the
   * transcription. Labels of tokens from partial results are not included.
   */
  getSpeakerLabels = lazyInit((): string[] => {
    const speakerLabels = new Set<string>();
    for (const token of this.textTokens) {
      if (token.kind === 'textPart' && token.speakerLabel !== null &&
          !token.partial) {
        speakerLabels.add(token.speakerLabel);
      }
    }
    return Array.from(speakerLabels);
  });

  /**
   * Splits the transcription into several paragraphs.
   *
   * Each paragraph have continuous timestamp, and a single speaker label.
   * Separator tokens are not part of any returned paragraph.
   */
  getParagraphs = lazyInit((): TextPart[][] => {
    const slicedTokens = sliceWhen(this.textTokens, (a, b) => {
      if (a.kind === 'textSeparator' || b.kind === 'textSeparator') {
        return true;
      }
      // Two neighboring parts that both lack a time range are kept together.
      if (a.timeRange === null && b.timeRange === null) {
        return false;
      }
      if (a.timeRange?.endMs !== b.timeRange?.startMs) {
        // TODO(pihsun): This currently is not used since we already
        // split across result border, and within the same result the
        // time ranges are always continuous.
        return true;
      }
      if (a.partial !== b.partial) {
        return true;
      }
      // The speaker label only splits paragraphs for non-partial parts.
      if (!a.partial && a.speakerLabel !== b.speakerLabel) {
        return true;
      }
      return false;
    });

    // Drop the slices that contain a separator, so that only slices made
    // purely of text parts remain.
    return slicedTokens.filter((tokens) => {
      return tokens.every((t) => t.kind === 'textPart');
    });
  });
}
// chromium/ash/webui/recorder_app_ui/resources/core/soda/soda.ts