chromium/ash/webui/recorder_app_ui/resources/core/recording_data_manager.ts

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

import {MAX_SPEAKER_COLORS} from '../components/styles/speaker_label.js';

import {SAMPLE_RATE, SAMPLES_PER_SLICE} from './audio_constants.js';
import {DataDir} from './data_dir.js';
import {computed, ReadonlySignal, signal} from './reactive/signal.js';
import {Transcription, transcriptionSchema} from './soda/soda.js';
import {
  ExportAudioFormat,
  ExportSettings,
  ExportTranscriptionFormat,
} from './state/settings.js';
import {assert, assertExhaustive, assertExists} from './utils/assert.js';
import {AsyncJobQueue} from './utils/async_job_queue.js';
import {Infer, z} from './utils/schema.js';
import {ulid} from './utils/ulid.js';
import {asyncLazyInit, downloadFile} from './utils/utils.js';

/**
 * The base recording metadata.
 *
 * Since all metadata is read in the main page, cached in memory, and can be
 * edited, it should be small and take constant memory.
 */
const baseRecordingMetadataSchema = z.object({
  id: z.string(),
  title: z.string(),
  durationMs: z.number(),

  // Use number instead of Date to make metadata JSON-serializable.
  // The number here is same as the value returned by Date.now(), which is the
  // number of milliseconds elapsed since the epoch.
  recordedAt: z.number(),
});

type BaseRecordingMetadata = Infer<typeof baseRecordingMetadataSchema>;

// Note that if this value is changed, a recalculation should be triggered by
// bumping the version value in the versionedTimelineSegmentsSchema.
// TODO(pihsun): Test threshold and see what the normal background noise level
// is.
const NO_AUDIO_POWER_THRESHOLD = 10;

/**
 * The type of a segment in the timeline.
 *
 * The values are set to numbers to save space when written to disk, and the
 * values are written out explicitly since the on-disk format needs to stay
 * backward compatible.
 */
export enum TimelineSegmentKind {
  // No audio (power < NO_AUDIO_POWER_THRESHOLD)
  NO_AUDIO = 0,

  // Some audio, but either it's not speech or transcription is not enabled.
  AUDIO = 1,

  // Audio is speech (defined by the transcription timestamp), but speaker
  // label is not enabled.
  SPEECH = 2,

  // Audio is speech with the "first" to "fifth" speaker label. Note that this
  // is only used for the color, which cycles after the fifth speaker, so the
  // sixth speaker will be labeled as `SPEECH_SPEAKER_COLOR_1` here.
  SPEECH_SPEAKER_COLOR_1 = 3,
  SPEECH_SPEAKER_COLOR_2 = 4,
  SPEECH_SPEAKER_COLOR_3 = 5,
  SPEECH_SPEAKER_COLOR_4 = 6,
  SPEECH_SPEAKER_COLOR_5 = 7,
}

// Note that to avoid floating point calculation errors, all "time length" /
// "time offset" calculations for timeline segments are done in number of
// samples instead of seconds.
const timelineSegmentSchema = z.tuple([
  // The length of the segment in number of samples.
  z.number(),
  // The type of the segment.
  z.nativeEnum(TimelineSegmentKind),
]);

type TimelineSegment = Infer<typeof timelineSegmentSchema>;

const TIMELINE_SEGMENT_VERSION = 1;

const versionedTimelineSegmentsSchema = z.object({
  version: z.literal(TIMELINE_SEGMENT_VERSION),
  segments: z.array(timelineSegmentSchema),
});

type VersionedTimelineSegments = Infer<typeof versionedTimelineSegmentsSchema>;

/**
 * The recording metadata that are derived from other data.
 *
 * These are used in recording list.
 */
const derivedRecordingMetadataSchema = z.object({
  // Total number of speakers in the recording. `null` if the transcription
  // isn't enabled in the recording. `undefined` or missing field when it's not
  // computed yet.
  numSpeakers: z.optional(z.nullable(z.number())),

  // Length in number of samples, and type of each segment of the recording.
  // Used by the recording list.
  timelineSegments: z.catch(
    z.optional(versionedTimelineSegmentsSchema),
    undefined,
  ),

  // Description of the recording used by the recording list. This contains a
  // truncated version of the transcription with maximum length
  // `MAX_DESCRIPTION_LENGTH`.
  description: z.string(),

  // There was a field named `powerAverages` here with type number[]. If we
  // ever want to add a field with the same name back, we need to ensure the
  // new schema is not treated as compatible with the old data.
  // TODO(pihsun): Have some kind of "deprecated" field marker in the schema
  // that just decodes everything into undefined.
});

const recordingMetadataSchema = z.intersection([
  baseRecordingMetadataSchema,
  derivedRecordingMetadataSchema,
]);

export type RecordingMetadata = Infer<typeof recordingMetadataSchema>;

const audioPowerSchema = z.object({
  // Array of integers in [0, 255] representing the powers. Each data point
  // corresponds to a slice passed to the audio worklets with SAMPLES_PER_SLICE
  // audio samples.
  // TODO(pihsun): Compression. Use a UInt8Array and CompressionStream?
  powers: z.array(z.number()),
});

type AudioPower = Infer<typeof audioPowerSchema>;

/**
 * The recording create parameters without the id field, used for creating new
 * recordings.
 */
export type RecordingCreateParams = Omit<
  AudioPower&BaseRecordingMetadata&{transcription: Transcription | null}, 'id'>;

// TODO(pihsun): Type-safe wrapper for reading / writing specific type of file?
function metadataName(id: string) {
  return `${id}.meta.json`;
}

function audioPowerName(id: string) {
  return `${id}.powers.json`;
}

function transcriptionName(id: string) {
  return `${id}.transcription.json`;
}

function audioName(id: string) {
  return `${id}.webm`;
}

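/**
 * Run-length encodes the per-slice power values into NO_AUDIO / AUDIO
 * segments, using NO_AUDIO_POWER_THRESHOLD to decide whether a slice contains
 * audio.
 */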
function calculatePowerSegments(powers: number[]): TimelineSegment[] {
  const segments: TimelineSegment[] = [];
  for (const power of powers) {
    const label = power < NO_AUDIO_POWER_THRESHOLD ?
      TimelineSegmentKind.NO_AUDIO :
      TimelineSegmentKind.AUDIO;
    if (segments.length === 0 || assertExists(segments.at(-1))[1] !== label) {
      segments.push([SAMPLES_PER_SLICE, label]);
    } else {
      assertExists(segments.at(-1))[0] += SAMPLES_PER_SLICE;
    }
  }
  return segments;
}

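/**
 * Maps a speaker label to the corresponding timeline segment kind.
 *
 * A `null` label maps to plain SPEECH; otherwise the speaker's index in
 * `speakerLabels` picks a color kind, cycling after MAX_SPEAKER_COLORS
 * speakers.
 */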
function speakerLabelToTimelineSegmentKind(
  speakerLabels: string[],
  speakerLabel: string|null,
): TimelineSegmentKind {
  if (speakerLabel === null) {
    return TimelineSegmentKind.SPEECH;
  }
  const speakerLabelIdx = speakerLabels.indexOf(speakerLabel);
  assert(speakerLabelIdx !== -1);
  return assertExists(
    [
      TimelineSegmentKind.SPEECH_SPEAKER_COLOR_1,
      TimelineSegmentKind.SPEECH_SPEAKER_COLOR_2,
      TimelineSegmentKind.SPEECH_SPEAKER_COLOR_3,
      TimelineSegmentKind.SPEECH_SPEAKER_COLOR_4,
      TimelineSegmentKind.SPEECH_SPEAKER_COLOR_5,
    ][speakerLabelIdx % MAX_SPEAKER_COLORS],
  );
}

interface OverlayTimelineSegment {
  /**
   * Start time in number of samples.
   */
  start: number;
  /**
   * End time in number of samples.
   */
  end: number;
  /**
   * Type of the segment.
   */
  kind: TimelineSegmentKind;
}

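/**
 * Converts the transcription paragraphs into speech segments with start / end
 * offsets in number of samples, using the speaker label of the first part of
 * each paragraph to decide the segment kind. Paragraphs without timestamps
 * are skipped.
 */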
function calculateSpeechSegments(
  transcription: Transcription|null,
): OverlayTimelineSegment[] {
  if (transcription === null) {
    return [];
  }
  const speakerLabels = transcription.getSpeakerLabels();
  let currentSampleOffset = 0;
  const segments: OverlayTimelineSegment[] = [];
  for (const paragraph of transcription.getParagraphs()) {
    const firstPart = assertExists(paragraph[0]);
    const lastPart = assertExists(paragraph.at(-1));

    const startMs = firstPart.timeRange?.startMs ?? null;
    const endMs = lastPart.timeRange?.endMs ?? null;
    if (startMs === null || endMs === null) {
      // TODO(pihsun): Check if there's any possibility that the timestamp is
      // missing. Asserting at the type level that the timestamp exists, and
      // asserting that the ranges are increasing, would simplify a lot of
      // code.
      continue;
    }
    // The timestamps should be increasing.
    const start = Math.round((startMs / 1000) * SAMPLE_RATE);
    const end = Math.round((endMs / 1000) * SAMPLE_RATE);
    assert(currentSampleOffset <= start && start <= end);
    if (end > start) {
      segments.push({
        start,
        end,
        kind: speakerLabelToTimelineSegmentKind(
          speakerLabels,
          firstPart.speakerLabel,
        ),
      });
    }
    currentSampleOffset = end;
  }
  return segments;
}

/**
 * Overlays `overlaySegments` onto `baseSegments`, splitting segments where
 * necessary.
 *
 * The total length of the returned segments is the same as that of
 * `baseSegments`.
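 *
 * For example, overlaying a single SPEECH segment spanning samples [3, 7)
 * onto a base of [[10, AUDIO]] yields [[3, AUDIO], [4, SPEECH], [3, AUDIO]].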
 */
function overlaySegments(
  baseSegments: TimelineSegment[],
  overlaySegments: OverlayTimelineSegment[],
): TimelineSegment[] {
  let overlayIdx = 0;
  const ret: TimelineSegment[] = [];
  let time = 0;

  function forwardTime(end: number, kind: TimelineSegmentKind) {
    assert(end >= time);
    const length = end - time;
    if (length > 0) {
      ret.push([length, kind]);
    }
    time = end;
  }

  for (const [baseLength, baseKind] of baseSegments) {
    const baseEnd = time + baseLength;
    while (overlayIdx < overlaySegments.length) {
      const {start, end, kind} = assertExists(overlaySegments[overlayIdx]);
      if (start >= baseEnd) {
        // Next overlay after current segment.
        break;
      }
      if (start >= time) {
        forwardTime(start, baseKind);
      }
      if (end >= baseEnd) {
        // TODO(pihsun): This splits the overlay segment into multiple pieces
        // based on the boundaries of the baseSegments. We don't necessarily
        // need to do this, but the code is easier to write this way, and the
        // resulting number of segments stays within the same order of
        // magnitude.
        forwardTime(baseEnd, kind);
        break;
      }
      forwardTime(end, kind);
      overlayIdx++;
    }
    forwardTime(baseEnd, baseKind);
  }
  return ret;
}

// Note that this is not an exact upper limit on the number of segments
// returned from `simplifySegments`; the current heuristic results in at most
// `2 * SIMPLIFY_SEGMENT_RESOLUTION - 1` segments.
const SIMPLIFY_SEGMENT_RESOLUTION = 512;

/**
 * Simplifies the segments by merging shorter segments, so the total number of
 * segments is bounded by a constant regardless of the recording length.
 *
 * The heuristic to achieve this is:
 * * Define `threshold` to be `max(total length / SIMPLIFY_SEGMENT_RESOLUTION,
 *   1 second)`.
 * * Segments longer than the `threshold` are preserved as-is.
 * * Consecutive segments shorter than `threshold` are merged together until
 *   the merged group is longer than the `threshold`. The new kind will be the
 *   most frequent kind in the group.
 *
 * Note that this doesn't guarantee that all returned segments are longer than
 * `threshold`, as there might be some segments shorter than `threshold` left
 * between longer segments.
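 *
 * For example, with a `threshold` of 1 second, consecutive segments of
 * [0.3s AUDIO, 0.4s NO_AUDIO, 0.5s AUDIO] are merged into a single 1.2s
 * AUDIO segment, since AUDIO covers 0.8s of the group versus 0.4s for
 * NO_AUDIO.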
 */
function simplifySegments(segments: TimelineSegment[]): TimelineSegment[] {
  const totalLength =
    segments.map(([length]) => length).reduce((a, b) => a + b, 0);
  const threshold = Math.max(
    totalLength / SIMPLIFY_SEGMENT_RESOLUTION,
    SAMPLE_RATE,
  );

  const ret: TimelineSegment[] = [];
  let currentGroup: TimelineSegment[] = [];
  let currentGroupLength = 0;

  function endCurrentGroup() {
    if (currentGroup.length === 0) {
      return;
    }
    const kindLengths = new Map<TimelineSegmentKind, number>();
    for (const [length, kind] of currentGroup) {
      kindLengths.set(kind, (kindLengths.get(kind) ?? 0) + length);
    }
    let kind: TimelineSegmentKind|null = null;
    let maxLength = 0;
    for (const [k, length] of kindLengths.entries()) {
      if (length >= maxLength) {
        maxLength = length;
        kind = k;
      }
    }
    ret.push([currentGroupLength, assertExists(kind)]);
    currentGroup = [];
    currentGroupLength = 0;
  }

  function push(segment: TimelineSegment) {
    currentGroup.push(segment);
    currentGroupLength += segment[0];
  }

  for (const segment of segments) {
    const [length] = segment;
    if (length >= threshold) {
      endCurrentGroup();
    }
    push(segment);
    if (currentGroupLength >= threshold) {
      endCurrentGroup();
    }
  }
  endCurrentGroup();
  return ret;
}

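/**
 * Calculates the versioned timeline segments of a recording by overlaying the
 * speech segments derived from the transcription onto the power-based
 * segments, and then simplifying the result.
 */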
function calculateTimelineSegments(
  powers: number[],
  transcription: Transcription|null,
): VersionedTimelineSegments {
  const powerSegments = calculatePowerSegments(powers);
  const speechSegments = calculateSpeechSegments(transcription);

  const segments = simplifySegments(
    overlaySegments(powerSegments, speechSegments),
  );
  return {
    version: TIMELINE_SEGMENT_VERSION,
    segments,
  };
}

/**
 * Gets the default filename without extension for the exported file.
 *
 * The returned filename is passed to Chrome as a suggested name, and Chrome
 * will further remove any characters that are not safe for a filename, so we
 * don't need to filter any characters here.
 */
export function getDefaultFileNameWithoutExtension(
  meta: RecordingMetadata,
): string {
  // Replace all ":" with ".", since Chrome defaults to replacing ':' with '_',
  // and we want to align with the screen capture/recording default name, which
  // uses '.' as the separator.
  // Example default title: `Audio recording 2024-12-04 12:34:56`
  // Example default filename: `Audio recording 2024-12-04 12.34.56.<extension>`
  return meta.title.replaceAll(':', '.');
}

// TODO(pihsun): Use a Map when draft.ts supports it.
export type RecordingMetadataMap = Record<string, RecordingMetadata>;

/**
 * The recording data manager to manage the audio file (.webm) and metadata
 * file (.json) pairs.
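 *
 * A minimal usage sketch (`params` and `audioBlob` are placeholders, and a
 * `DataDir` instance is assumed to be available):
 *
 *   const manager = await RecordingDataManager.create(dataDir);
 *   const id = await manager.createRecording(params, audioBlob);
 *   const metadata = manager.getMetadata(id);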
 */
export class RecordingDataManager {
  /**
   * Cached metadata for all recordings.
   *
   * The cache might be inconsistent with the files on disk if there are
   * multiple recorder instances running, but since we only allow a single
   * instance of the recorder app, that shouldn't be an issue.
   *
   * TODO(pihsun): Check if there's any case where we'll have more than one
   * instance running.
   */
  private readonly cachedMetadataMap = signal<RecordingMetadataMap>({});

  /**
   * Queues for writing metadata to the disk. Each recording ID gets its own
   * queue.
   *
   * TODO(pihsun): Have an on-exit handler that waits for all writes to finish
   * before exiting the app.
   */
  private readonly metadataWriteQueues = new Map<string, AsyncJobQueue>();

  static async create(dataDir: DataDir): Promise<RecordingDataManager> {
    async function getMetadataFromFilename(
      name: string,
    ): Promise<RecordingMetadata> {
      const file = await dataDir.read(name);
      const text = await file.text();
      // TODO(shik): Add versioning and error handling.
      // Currently the JSON is not exposed in the real filesystem and we simply
      // use the parsed result.
      const data = recordingMetadataSchema.parseJson(text);
      return data;
    }

    const filenames = await dataDir.list();
    const metadataMap = Object.fromEntries(
      await Promise.all(
        filenames.filter((x) => x.endsWith('.meta.json')).map(async (x) => {
          const meta = await getMetadataFromFilename(x);
          return [meta.id, meta] as const;
        }),
      ),
    );
    return new RecordingDataManager(dataDir, metadataMap);
  }

  private constructor(
    private readonly dataDir: DataDir,
    metadata: RecordingMetadataMap,
  ) {
    this.cachedMetadataMap.value = metadata;

    for (const [id, meta] of Object.entries(metadata)) {
      this.fillDerivedMetadata(id, meta).catch((e) => {
        console.error(`error while filling derived data for ${id}`, e);
      });
    }
  }

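  /**
   * Fills in derived metadata fields that are missing or not yet computed
   * (`numSpeakers`, `timelineSegments`) and persists the updated metadata.
   */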
  private async fillDerivedMetadata(id: string, meta: RecordingMetadata) {
    const getTranscription = asyncLazyInit(() => this.getTranscription(id));
    const getPowers = asyncLazyInit(() => this.getAudioPower(id));
    let changed = false;

    if (meta.numSpeakers === undefined) {
      changed = true;
      const transcription = await getTranscription();
      meta = {
        ...meta,
        numSpeakers: transcription?.getSpeakerLabels().length ?? null,
      };
    }

    if (meta.timelineSegments === undefined) {
      changed = true;
      const transcription = await getTranscription();
      const {powers} = await getPowers();
      meta = {
        ...meta,
        timelineSegments: calculateTimelineSegments(powers, transcription),
      };
    }

    if (changed) {
      this.setMetadata(id, meta);
    }
  }

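  /**
   * Returns the metadata write queue for the given recording ID, creating one
   * on first use.
   */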
  private getWriteQueueFor(id: string) {
    const queue = this.metadataWriteQueues.get(id);
    if (queue !== undefined) {
      return queue;
    }
    const newQueue = new AsyncJobQueue('keepLatest');
    this.metadataWriteQueues.set(id, newQueue);
    return newQueue;
  }

  /**
   * @return The created recording id.
   */
  async createRecording(
    {transcription, powers, ...meta}: RecordingCreateParams,
    audio: Blob,
  ): Promise<string> {
    const id = ulid();
    const numSpeakers = transcription?.getSpeakerLabels().length ?? null;
    const description = transcription?.toShortDescription() ?? '';
    const timelineSegments = calculateTimelineSegments(powers, transcription);
    const fullMeta = {
      id,
      description,
      timelineSegments,
      numSpeakers,
      ...meta,
    };
    this.setMetadata(id, fullMeta);
    await Promise.all([
      this.dataDir.write(
        audioPowerName(id),
        audioPowerSchema.stringifyJson({powers}),
      ),
      this.dataDir.write(
        transcriptionName(id),
        transcriptionSchema.stringifyJson(transcription),
      ),
      this.dataDir.write(audioName(id), audio),
    ]);
    return id;
  }

  /**
   * Gets the current cached metadata list.
   */
  getAllMetadata(): ReadonlySignal<RecordingMetadataMap> {
    return this.cachedMetadataMap;
  }

  private getMetadataRaw(id: string): RecordingMetadata|null {
    return this.cachedMetadataMap.value[id] ?? null;
  }

  getMetadata(id: string): ReadonlySignal<RecordingMetadata|null> {
    return computed(() => this.getMetadataRaw(id));
  }

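  /**
   * Updates the cached metadata and asynchronously persists it to disk.
   */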
  setMetadata(id: string, meta: RecordingMetadata): void {
    this.cachedMetadataMap.mutate((m) => {
      m[id] = meta;
    });
    this.getWriteQueueFor(id).push(async () => {
      await this.dataDir.write(
        metadataName(id),
        recordingMetadataSchema.stringifyJson(meta),
      );
    });
  }

  async getAudioFile(id: string): Promise<File> {
    const name = audioName(id);
    const file = await this.dataDir.read(name);
    return file;
  }

  /**
   * Gets the transcription of the given recording.
   *
   * Since transcription can be enabled / disabled during the recording, the
   * returned transcription might only cover the parts of the recording for
   * which transcription was enabled.
   *
   * If transcription was never enabled while recording, this function returns
   * null, which is different from returning an empty transcription
   * (transcription was enabled but no speech was detected).
   */
  async getTranscription(id: string): Promise<Transcription|null> {
    const name = transcriptionName(id);
    const file = await this.dataDir.read(name);
    const text = await file.text();
    return transcriptionSchema.parseJson(text);
  }

  async getAudioPower(id: string): Promise<AudioPower> {
    const name = audioPowerName(id);
    const file = await this.dataDir.read(name);
    const text = await file.text();
    return audioPowerSchema.parseJson(text);
  }

  async clear(): Promise<void> {
    // TODO(pihsun): There's definitely a race condition between this and the
    // background writes in this.metadataWriteQueues, but since this is only
    // used in the /dev page, it might not be worth the extra effort to make it
    // 100% correct.
    await this.dataDir.clear();
    this.cachedMetadataMap.value = {};
  }

  remove(id: string): void {
    this.cachedMetadataMap.mutate((m) => {
      delete m[id];
    });
    this.getWriteQueueFor(id).push(async () => {
      await this.dataDir.remove(metadataName(id));
    });
    // TODO(pihsun): Since all of these file removals should be independent of
    // each other, there's not much reason to put this in a queue.
    void this.dataDir.remove(audioName(id));
    void this.dataDir.remove(transcriptionName(id));
    void this.dataDir.remove(audioPowerName(id));
  }

  private async exportAudio(id: string, format: ExportAudioFormat) {
    const metadata = this.getMetadataRaw(id);
    if (metadata === null) {
      return;
    }

    const file = await this.getAudioFile(id);
    switch (format) {
      case ExportAudioFormat.WEBM_ORIGINAL: {
        const filename = getDefaultFileNameWithoutExtension(metadata) + '.webm';
        downloadFile(filename, file);
        break;
      }
      default:
        assertExhaustive(format);
    }
  }

  private async exportTranscription(
    id: string,
    format: ExportTranscriptionFormat,
  ) {
    const metadata = this.getMetadataRaw(id);
    if (metadata === null) {
      return;
    }
    const transcription = await this.getTranscription(id);
    if (transcription === null || transcription.isEmpty()) {
      return;
    }

    switch (format) {
      case ExportTranscriptionFormat.TXT: {
        const text = transcription.toPlainText();
        const blob = new Blob([text], {type: 'text/plain'});
        const filename = getDefaultFileNameWithoutExtension(metadata) + '.txt';
        downloadFile(filename, blob);
        break;
      }
      default:
        assertExhaustive(format);
    }
  }

  async exportRecording(id: string, settings: ExportSettings): Promise<void> {
    if (settings.transcription) {
      await this.exportTranscription(id, settings.transcriptionFormat);
    }
    if (settings.audio) {
      await this.exportAudio(id, settings.audioFormat);
    }
  }
}