documents/dev/snippets/javascript/nuk-speech-to-text.md

Speech to Text

upload-transcribe.tsx

import {
  getTranscription,
  uploadAudioForTranscription,
} from "@/app/utils/speechToText";
import { Button } from "@/components/ui/button";
import FileUploadModal from "@/components/ui/file-upload-modal";
import { timeout } from "@/lib/utils";
import { Microphone } from "@mynaui/icons-react";

/**
 * Icon button that opens a single-file upload modal and sends the chosen
 * audio file for transcription. The resulting transcript is only logged to
 * the console.
 */
export default function UploadTranscribeModal() {
  return (
    <FileUploadModal
      maxFiles={1}
      handleUpload={async (files) => {
        const file = files[0];
        if (!file) return;

        console.log("transcribe", files);
        const res = await uploadAudioForTranscription(file);

        // uploadAudioForTranscription resolves to null when the upload fails;
        // without this guard `res.id` would throw inside the upload handler.
        if (!res?.id) {
          console.error("Audio upload failed; skipping transcription");
          return;
        }

        // Short pause before the first poll of the transcript endpoint.
        await timeout(1000);
        const transcript = await getTranscription(res.id);
        console.log(transcript);
      }}
    >
      <Button variant="secondary" size={"icon"}>
        <Microphone className="h-4 w-4" />
      </Button>
    </FileUploadModal>
  );
}

media-recorder-input.tsx

import { getTranscription, uploadAudioForTranscription } from "@/app/utils/speechToText";
import { ButtonVariant, LoadingButton } from "@/components/ui/button";
import { ID, timeout } from "@/lib/utils";
import { Microphone } from "@mynaui/icons-react";
import { useEffect, useRef, useState } from "react";

/**
 * Asks the browser for an audio-only media stream.
 *
 * @returns the stream, or `null` when the request throws (the error is logged).
 */
async function getMediaStream() {
  try {
    // Audio is the only constraint this app needs.
    return await navigator.mediaDevices.getUserMedia({ audio: true });
  } catch (err) {
    console.error(`The following getUserMedia error occurred: ${err}`);
    return null;
  }
}

/**
 * Uploads an audio file and then polls for its transcription.
 *
 * @param file audio file to transcribe
 * @returns whatever `getTranscription` resolves to, or `null` when the upload
 *   did not produce an id to poll with.
 */
async function getFileTranscription(file: File) {
  console.log("transcribe", file);
  const res = await uploadAudioForTranscription(file);

  // The upload helper resolves to null on failure; dereferencing `res.id`
  // unguarded would reject this promise with a TypeError.
  if (!res?.id) return null;

  // Short pause before the first poll of the transcript endpoint.
  await timeout(1000);
  return await getTranscription(res.id);
}

/** Lifecycle of the speech button, from "no microphone yet" to "waiting for the transcript". */
type SpeechState = "not_ready" | "ready" | "recording" | "transcribing";

/**
 * Button variant name to render for each speech state.
 * Typed as a Record keyed by SpeechState so the compiler rejects a missing or
 * misspelt state.
 */
const variant: Record<SpeechState, string> = {
  not_ready: "secondary",
  ready: "default",
  recording: "destructive",
  transcribing: "secondary",
};

/**
 * Microphone button that records audio and reports its transcription.
 *
 * Click flow: the first click requests microphone access, the next click
 * starts recording, and the following click stops recording and sends the
 * clip for transcription. While transcribing, clicks are ignored.
 *
 * @param onTranscribe called with the transcribed text when a transcription
 *   result is available.
 */
export default function SpeechInput({
  onTranscribe,
}: {
  onTranscribe: (text: string) => void;
}) {
  const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder>();
  const [speechState, setSpeechState] = useState<SpeechState>("not_ready");
  const [recordedClip, setRecordedClip] = useState<{
    url: string;
    file: File;
  }>();

  const isTranscribing = speechState === "transcribing";

  // Mutable scratch space that survives re-renders without triggering them.
  // NOTE(review): the clip is labelled "audio/mpeg" / ".mp3" regardless of
  // what the recorder actually produced — confirm the backend does not rely
  // on that label.
  const _ = useRef({
    chunks: [] as BlobPart[],
    type: "audio/mpeg",
  }).current;

  // Transcribe each newly recorded clip.
  // `onTranscribe` is deliberately left out of the dependency list so that a
  // new callback identity does not re-submit the same clip.
  useEffect(() => {
    if (!recordedClip) return;

    getFileTranscription(recordedClip.file)
      .then((result) => {
        if (result) onTranscribe(result.text);
      })
      .catch((err) => {
        // Without a handler a failed transcription was an unhandled rejection.
        console.error("Transcription failed", err);
      })
      .finally(() => {
        // Always leave the "transcribing" state, even on failure; otherwise
        // the button stays disabled-looking and ignores clicks forever.
        setSpeechState("ready");
      });

    // Release the object URL once this clip is replaced or the component
    // unmounts.
    return () => {
      window.URL.revokeObjectURL(recordedClip.url);
    };
  }, [recordedClip]);

  // Wire the recorder's events, and release the microphone when the recorder
  // is replaced or the component unmounts.
  useEffect(() => {
    if (!mediaRecorder) return;

    const handleData = (e: BlobEvent) => {
      _.chunks.push(e.data);
    };

    const handleStop = () => {
      console.log("MediaRecorder", mediaRecorder.state);

      const id = ID();
      const blob = new Blob(_.chunks, { type: _.type });
      const file = new File([blob], `${id}.mp3`);
      const url = window.URL.createObjectURL(blob);
      setRecordedClip({ url, file });
    };

    mediaRecorder.addEventListener("dataavailable", handleData);
    mediaRecorder.addEventListener("stop", handleStop);

    return () => {
      // Detach first so that stopping the tracks cannot produce a clip for a
      // component that is going away.
      mediaRecorder.removeEventListener("dataavailable", handleData);
      mediaRecorder.removeEventListener("stop", handleStop);
      mediaRecorder.stream.getTracks().forEach((track) => track.stop());
    };
  }, [mediaRecorder]);

  // Ask for microphone access and, if granted, create the recorder.
  const requestMediaRecorder = () => {
    getMediaStream().then((stream) => {
      if (!stream) return;
      setMediaRecorder(new MediaRecorder(stream));
      setSpeechState("ready");
    });
  };

  const onClick = () => {
    if (!mediaRecorder) {
      return requestMediaRecorder();
    }

    if (isTranscribing) {
      console.log("transcribing, please wait...");
      return;
    }

    if (speechState === "ready") {
      // Start every recording from an empty buffer.
      _.chunks = [];
      setSpeechState("recording");
      mediaRecorder.start();
      console.log("MediaRecorder", mediaRecorder.state);
    } else if (speechState === "recording") {
      // Stopping fires the recorder's "stop" event, which builds the clip.
      setSpeechState("transcribing");
      mediaRecorder.stop();
    }
  };

  return (
    <LoadingButton
      loading={isTranscribing}
      variant={variant[speechState] as ButtonVariant}
      size={isTranscribing ? "default" : "icon"}
      onClick={onClick}
    >
      {!isTranscribing && <Microphone className="h-4 w-4" />}
      {isTranscribing && "transcribing..."}
    </LoadingButton>
  );
}

speechToText.ts

import { timeout } from "@/lib/utils";
import { getMD5 } from ".";

/**
 * Sends an audio file to the transcription service as multipart form data.
 *
 * The form carries two fields: `id`, taken from `getMD5(file)`, and `file`.
 *
 * @param file audio file to upload
 * @returns the parsed JSON response body, or `null` when the request or the
 *   JSON parsing throws (the error is logged).
 */
export async function uploadAudioForTranscription(file: File) {
  const fileId = await getMD5(file);

  const payload = new FormData();
  payload.append("id", fileId);
  payload.append("file", file);

  try {
    const response = await fetch("https://nuk.scriptsync.app/_transcribe", {
      method: "POST",
      body: payload,
    });
    const body = await response.json();
    console.log("Speech file processing");
    return body;
  } catch (uploadError) {
    console.log("Error uploading file", uploadError);
    return null;
  }
}

/**
 * Polls the transcript endpoint until a transcription is available.
 *
 * @param id identifier of the uploaded audio, appended to the endpoint URL
 * @param interval milliseconds to wait between polls
 * @param tries number of additional polls allowed after the first request;
 *   zero, negative or fractional values can no longer poll without bound
 * @returns the response object once it has a truthy `text` field; `null` when
 *   the status is neither "processing" nor "queued", when the retries run
 *   out, or when the request or JSON parsing throws (the error is logged).
 */
export async function getTranscription(
  id: string,
  interval = 2000,
  tries = 30
) {
  try {
    for (let remaining = tries; ; remaining--) {
      const res = await fetch(`https://nuk.scriptsync.app/_transcript/${id}`);
      const data = (await res.json()) as Obj;

      if (data.text) {
        return data; // { text, segments, language }
      }

      const stillPending =
        data.status === "processing" || data.status === "queued";

      // Stop on an error/unknown status, or once the retry budget is spent.
      // `<= 0` (not `=== 0`) so a negative or fractional budget still ends.
      if (!stillPending || remaining <= 0) return null;

      await timeout(interval);
    }
  } catch (err) {
    console.log("Error getting file", err);
    return null;
  }
}