From 7c1a45f219a808c64c30298c84f3d77c07845079 Mon Sep 17 00:00:00 2001 From: chmjkb Date: Thu, 27 Feb 2025 15:31:20 +0100 Subject: [PATCH 01/35] wip --- docs/docs/computer-vision/_category_.json | 2 +- docs/docs/speech-to-text/_category_.json | 7 ++ docs/docs/speech-to-text/whisper.md | 111 ++++++++++++++++++++++ 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 docs/docs/speech-to-text/_category_.json create mode 100644 docs/docs/speech-to-text/whisper.md diff --git a/docs/docs/computer-vision/_category_.json b/docs/docs/computer-vision/_category_.json index 5aa6c0263c..1a78d5e75f 100644 --- a/docs/docs/computer-vision/_category_.json +++ b/docs/docs/computer-vision/_category_.json @@ -1,6 +1,6 @@ { "label": "Computer Vision", - "position": 3, + "position": 4, "link": { "type": "generated-index" } diff --git a/docs/docs/speech-to-text/_category_.json b/docs/docs/speech-to-text/_category_.json new file mode 100644 index 0000000000..554e3476a1 --- /dev/null +++ b/docs/docs/speech-to-text/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Speech To Text", + "position": 3, + "link": { + "type": "generated-index" + } +} diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md new file mode 100644 index 0000000000..b8a6b74af9 --- /dev/null +++ b/docs/docs/speech-to-text/whisper.md @@ -0,0 +1,111 @@ +--- +title: useSpeechToText +sidebar_position: 1 +--- + +With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows you to transform spoken language into written text. It is commonly used to implement features such as transcription or voice assistants. Currently, all the models are running on the XNNPack backend. We support `Whisper (tiny.en)` and `Moonshine` models. + +:::info +Currently, we do not support direct microphone input streaming to the model. Instead, in `v0.3.0`, we provide a method that accepts a URL to the audio file.
+::: + +## Reference + +```typescript +import { useSpeechToText } from 'react-native-executorch'; + +const model = useSpeechToText({ + modelName: 'moonshine', +}); + +const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3'; + +try { + const transcription = await model.transcribe(audioUrl); + console.log(transcription); +} catch (error) { + console.error(error); +} +``` + +### Arguments + +**`modelName`** +Can be either `'whisper'` or `'moonshine'`. The first one will use [Whisper](https://openai.com/index/whisper/), while the latter will run [Moonshine](https://github.com/usefulsensors/moonshine). For best performance, we recommend using Moonshine. + +### Returns + +| Field | Type | Description | +| -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- | +| `transcribe` | `(input: string) => Promise` | Starts a transcription process for the audio at the given `input` URL. The returned promise resolves with the output transcription when the model is finished. | +| `error` | <code>string &#124; null</code> | Contains the error message if the model failed to load. | +| `response` | <code>string &#124; null</code> | This property is updated with each generated token. | +| `isGenerating` | `boolean` | Indicates whether the model is currently processing an inference. | +| `isReady` | `boolean` | Indicates whether the model has successfully loaded and is ready for inference. | + +## Running the model + +To run the model, you can use the `transcribe` method. It accepts one argument, which is the URL to the audio file. The function returns a promise, which resolves with the generated tokens when everything succeeds. If the model fails, it will throw an error. If you want to stream tokens, you can also use the `.response` property, which is updated with each generated token, analogously to the `useLLM` hook.
+ + +## Example + +```typescript +import { Button, Text } from 'react-native'; +import { useSpeechToText } from 'react-native-executorch'; + +function App() { + const model = useSpeechToText({ + modelName: 'whisper', + }); + const audioUrl = 'file:///Users/.../never-gonna-give-you-up.mp3'; + + return ( +