From 7c1a45f219a808c64c30298c84f3d77c07845079 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Thu, 27 Feb 2025 15:31:20 +0100
Subject: [PATCH 01/35] wip

---
 docs/docs/computer-vision/_category_.json |   2 +-
 docs/docs/speech-to-text/_category_.json  |   7 ++
 docs/docs/speech-to-text/whisper.md       | 111 ++++++++++++++++++++++
 3 files changed, 119 insertions(+), 1 deletion(-)
 create mode 100644 docs/docs/speech-to-text/_category_.json
 create mode 100644 docs/docs/speech-to-text/whisper.md
diff --git a/docs/docs/computer-vision/_category_.json b/docs/docs/computer-vision/_category_.json
index 5aa6c0263c..1a78d5e75f 100644
--- a/docs/docs/computer-vision/_category_.json
+++ b/docs/docs/computer-vision/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Computer Vision",
-  "position": 3,
+  "position": 4,
   "link": {
     "type": "generated-index"
   }
diff --git a/docs/docs/speech-to-text/_category_.json b/docs/docs/speech-to-text/_category_.json
new file mode 100644
index 0000000000..554e3476a1
--- /dev/null
+++ b/docs/docs/speech-to-text/_category_.json
@@ -0,0 +1,7 @@
+{
+  "label": "Speech To Text",
+  "position": 3,
+  "link": {
+    "type": "generated-index"
+  }
+}
diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
new file mode 100644
index 0000000000..b8a6b74af9
--- /dev/null
+++ b/docs/docs/speech-to-text/whisper.md
@@ -0,0 +1,111 @@
+---
+title: useSpeechToText
+sidebar_position: 1
+---
+
+With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription, or voice assistants. Currently, all the models are running on XNNPack backend. We support `Whisper (tiny.en)` and `Moonshine` models.
+
+:::info
+Currently, we do not support direct microphone input streaming to the model. Instead, in  `v0.3.0`, we provide a method that accepts an URL to the audio file.
+:::
+
+## Reference
+
+```typescript
+import { useSpeechToText } from 'react-native-executorch';
+
+const model = useSpeechToText({
+  modelName: 'moonshine',
+});
+
+const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';
+
+try {
+  const transcription = await model.transcribe(audioUrl);
+  console.log(transcription);
+} catch (error) {
+  console.error(error);
+}
+```
+
+### Arguments
+
+**`modelName`**
+Can be either `'whisper'` or `'moonshine'`. The first one will use the [Whisper](https://openai.com/index/whisper/), while the latter will run [Moonshine](https://github.com/usefulsensors/moonshine). For best performance, we recommend using Moonshine.
+
+### Returns
+
+| Field          | Type                                                         | Description                                                                                              |
+| -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- |
+| `transcribe`      | `(input: string) => Promise<string>` | Starts a transcription process for an audio at given `input` URL. Resolves a promise with the output transcription when the model is finished. |
+| `error`        | <code>string &#124; null</code>                              | Contains the error message if the model failed to load.                                                  |
+| `response`        | <code>string &#124; null</code>                              | This property is updated with each generated token.                                                  |
+| `isGenerating` | `boolean`                                                    | Indicates whether the model is currently processing an inference.                                        |
+| `isReady`      | `boolean`                                                    | Indicates whether the model has successfully loaded and is ready for inference.                          |
+
+## Running the model
+
+To run the model, you can use the `transcribe` method. It accepts one argument, which is the URL to the audio file. The function returns a promise, which will return the generated tokens when everything succeeds. If the model fails, it will throw an error. If you want to stream tokens, you can also use the `.response` property which is updated with each generated token, analogously to the useLLM hook.
+
+
+## Example
+
+```typescript
+import { Button, Text } from 'react-native';
+import { useSpeechToText } from 'react-native-executorch';
+
+function App() {
+  const model = useSpeechToText({
+    modelName: 'whisper',
+  });
+  const audioUrl = 'file:///Users/.../never-gonna-give-you-up.mp3';
+
+  return (
+    <Button
+      onPress=(async () => {
+        const transcription = await model.transcribe(audioUrl);;
+      })
+    />
+    <Text>{model.response}</Text>
+  )
+  // ... Rest of your component
+}
+```
+## Supported models
+- [Whisper tiny.en](https://github.com/openai/whisper)
+- [Moonshine](https://github.com/usefulsensors/moonshine)
+
+## Benchmarks
+
+### Model size
+
+| Model             | XNNPACK [MB] |
+| ----------------- | ------------ |
+| Whisper | 231     |
+| Moonshine | 149   |
+
+### Memory usage
+
+| Model             | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
+| ----------------- | ---------------------- | ------------------ |
+| Whisper | ❌                    | 950                 |
+
+### Inference time
+
+:::warning warning
+Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization.
+:::
+
+#### Decoder
+
+| Model | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
+| ------ | -----------------------| ---------------------- | --------------------- | ---------------------------- |
+| Whisper (tiny.en) | 18.5 tokens/s | 12.4 tokens/s | 12.4 tokens/s | 20.0 tokens/s |
+
+#### Encoder
+| Model | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
+| ------ | -----------------------| ---------------------- | --------------------- | ---------------------------- |
+| Whisper (tiny.en) | 0.71s | 1.06s | 1.18s | 1.00s |
+
+#
+

From 12038d65274af1c2b37c2129ab8d45d563ea5c23 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Fri, 28 Feb 2025 10:56:40 +0100
Subject: [PATCH 02/35] wip

---
 docs/docs/speech-to-text/whisper.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
index b8a6b74af9..e5125ee1a4 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/whisper.md
@@ -37,7 +37,8 @@ Can be either `'whisper'` or `'moonshine'`. The first one will use the [Whisper]
 
 | Field          | Type                                                         | Description                                                                                              |
 | -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- |
-| `transcribe`      | `(input: string) => Promise<string>` | Starts a transcription process for an audio at given `input` URL. Resolves a promise with the output transcription when the model is finished. |
+| `transcribe`      | `(input: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16khz. Resolves a promise with the output transcription when the model is finished. |
+| `loadAudio`     |   `(url: string) => number[]`                                 | Loads audio file from given URL and returns a waveform, which serves as an input to `transcribe()`.         |
 | `error`        | <code>string &#124; null</code>                              | Contains the error message if the model failed to load.                                                  |
 | `response`        | <code>string &#124; null</code>                              | This property is updated with each generated token.                                                  |
 | `isGenerating` | `boolean`                                                    | Indicates whether the model is currently processing an inference.                                        |

From c66003920f494a9906711e7f776a40b31feef97b Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Fri, 28 Feb 2025 11:53:28 +0100
Subject: [PATCH 03/35] wip

---
 docs/docs/speech-to-text/whisper.md | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
index e5125ee1a4..212fa2c695 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/whisper.md
@@ -3,7 +3,7 @@ title: useSpeechToText
 sidebar_position: 1
 ---
 
-With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription, or voice assistants. Currently, all the models are running on XNNPack backend. We support `Whisper (tiny.en)` and `Moonshine` models.
+With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants. As of now, [every supported STT model](#supported-models) runs on the XNNPack backend.
 
 :::info
 Currently, we do not support direct microphone input streaming to the model. Instead, in  `v0.3.0`, we provide a method that accepts an URL to the audio file.
@@ -21,7 +21,8 @@ const model = useSpeechToText({
 const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';
 
 try {
-  const transcription = await model.transcribe(audioUrl);
+  const audio = await model.loadAudio(audioUrl);
+  const transcription = await model.transcribe(audio);
   console.log(transcription);
 } catch (error) {
   console.error(error);
@@ -29,15 +30,22 @@ try {
 ```
 
 ### Arguments
+**`encoderSource`**
+A string that specifies the location of a .pte file for the encoder. For further information on passing model sources, check out [Loading Models](https://docs.swmansion.com/react-native-executorch/docs/fundamentals/loading-models).
+
+`decoderSource`
+Analogous to the encoderSource, this takes in a string which is a source for the decoder part of the model.
+
+`tokenizerSource`
+A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do.
+
 
-**`modelName`**
-Can be either `'whisper'` or `'moonshine'`. The first one will use the [Whisper](https://openai.com/index/whisper/), while the latter will run [Moonshine](https://github.com/usefulsensors/moonshine). For best performance, we recommend using Moonshine.
 
 ### Returns
 
 | Field          | Type                                                         | Description                                                                                              |
 | -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- |
-| `transcribe`      | `(input: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16khz. Resolves a promise with the output transcription when the model is finished. |
+| `transcribe`      | `(input: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16khz. This can be obtained from the `.loadAudio` function. Resolves a promise with the output transcription when the model is finished. |
 | `loadAudio`     |   `(url: string) => number[]`                                 | Loads audio file from given URL and returns a waveform, which serves as an input to `transcribe()`.         |
 | `error`        | <code>string &#124; null</code>                              | Contains the error message if the model failed to load.                                                  |
 | `response`        | <code>string &#124; null</code>                              | This property is updated with each generated token.                                                  |
@@ -46,7 +54,7 @@ Can be either `'whisper'` or `'moonshine'`. The first one will use the [Whisper]
 
 ## Running the model
 
-To run the model, you can use the `transcribe` method. It accepts one argument, which is the URL to the audio file. The function returns a promise, which will return the generated tokens when everything succeeds. If the model fails, it will throw an error. If you want to stream tokens, you can also use the `.response` property which is updated with each generated token, analogously to the useLLM hook.
+To run the model, you can use the `transcribe` method. It accepts one argument, which is the waveform representation of the audio. The function returns a promise, which will return the generated tokens when everything succeeds. If the model fails, it will throw an error. If you want to stream tokens, you can also use the `.response` property which is updated with each generated token, analogously to the useLLM hook.
 
 
 ## Example
@@ -64,7 +72,8 @@ function App() {
   return (
     <Button
       onPress=(async () => {
-        const transcription = await model.transcribe(audioUrl);;
+        const waveform = await model.transcribe(audioUrl);
+        await model.transcribe(audio);
       })
     />
     <Text>{model.response}</Text>

From 36ef6e43727eec11d27d7990889e217b1659066e Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Mon, 3 Mar 2025 15:20:31 +0100
Subject: [PATCH 04/35] wip

---
 docs/docs/speech-to-text/whisper.md | 37 +++++++++++++++++++----------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
index 212fa2c695..4bcb3de914 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/whisper.md
@@ -6,15 +6,18 @@ sidebar_position: 1
 With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants. As of now, [every supported STT model](#supported-models) runs on the XNNPack backend.
 
 :::info
-Currently, we do not support direct microphone input streaming to the model. Instead, in  `v0.3.0`, we provide a method that accepts an URL to the audio file.
+Currently, we do not support direct microphone input streaming to the model. Instead, in  `v0.3.0`, we provide a way to transcribe an audio file.
 :::
 
 ## Reference
 
 ```typescript
-import { useSpeechToText } from 'react-native-executorch';
+import { useSpeechToText, MOONSINE_TOKENIZER_URL, MOONSHINE_ENCODER_URL, MOONSHINE_DECODER_URL } from 'react-native-executorch';
 
 const model = useSpeechToText({
+  encoderSource: MOONSHINE_ENCODER_URL,
+  decoderSource: MOONSHINE_DECODER_URL,
+  tokenizerSource: MOONSHINE_TOKENIZER_URL
   modelName: 'moonshine',
 });
 
@@ -22,7 +25,7 @@ const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';
 
 try {
   const audio = await model.loadAudio(audioUrl);
-  const transcription = await model.transcribe(audio);
+  const transcription = await model.transcribe();
   console.log(transcription);
 } catch (error) {
   console.error(error);
@@ -39,14 +42,17 @@ Analogous to the encoderSource, this takes in a string which is a source for the
 `tokenizerSource`
 A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do.
 
+`modelName`
+A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used.
+
 
 
 ### Returns
 
 | Field          | Type                                                         | Description                                                                                              |
 | -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- |
-| `transcribe`      | `(input: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16khz. This can be obtained from the `.loadAudio` function. Resolves a promise with the output transcription when the model is finished. |
-| `loadAudio`     |   `(url: string) => number[]`                                 | Loads audio file from given URL and returns a waveform, which serves as an input to `transcribe()`.         |
+| `transcribe`      | `(input: number[]?) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. This can be obtained from the `loadAudio` function. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
+| `loadAudio`     |   `(url: string) => number[]`                                 | Loads audio file from given URL and returns a waveform, which serves as an input to `transcribe()`. It also sets an internal state for the input, so when you call `loadAudio`, you don't need to pass anything to `transcribe`.         |
 | `error`        | <code>string &#124; null</code>                              | Contains the error message if the model failed to load.                                                  |
 | `response`        | <code>string &#124; null</code>                              | This property is updated with each generated token.                                                  |
 | `isGenerating` | `boolean`                                                    | Indicates whether the model is currently processing an inference.                                        |
@@ -54,17 +60,20 @@ A string that specifies the location to the tokenizer for the model. This works
 
 ## Running the model
 
-To run the model, you can use the `transcribe` method. It accepts one argument, which is the waveform representation of the audio. The function returns a promise, which will return the generated tokens when everything succeeds. If the model fails, it will throw an error. If you want to stream tokens, you can also use the `.response` property which is updated with each generated token, analogously to the useLLM hook.
+To run the model, you can use the `transcribe` method. It accepts one optional argument, which is the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio. This function returns a promise, which resolves to the generated transcription when everything succeeds. If the model fails during inference, it will throw an error. If you want to obtain tokens in a streaming fashion, you can also use the `.response` property, which is updated with each generated token, analogously to the `useLLM` hook.
 
 
 ## Example
 
 ```typescript
 import { Button, Text } from 'react-native';
-import { useSpeechToText } from 'react-native-executorch';
+import { useSpeechToText, WHISPER_TOKENIZER_URL, WHISPER_ENCODER_URL, WHISPER_DECODER_URL } from 'react-native-executorch';
 
 function App() {
   const model = useSpeechToText({
+    encoderSource: WHISPER_ENCODER_URL,
+    decoderSource: WHISPER_DECODER_URL,
+    tokenizerSource: WHISPER_TOKENIZER_URL
     modelName: 'whisper',
   });
   const audioUrl = 'file:///Users/.../never-gonna-give-you-up.mp3';
@@ -72,8 +81,9 @@ function App() {
   return (
     <Button
       onPress=(async () => {
-        const waveform = await model.transcribe(audioUrl);
-        await model.transcribe(audio);
+        // Alternatively, you can obtain audio from any other source and pass it to transcribe()
+        model.loadAudio(audioUrl);
+        await model.transcribe();
       })
     />
     <Text>{model.response}</Text>
@@ -99,23 +109,26 @@ function App() {
 | Model             | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
 | ----------------- | ---------------------- | ------------------ |
 | Whisper | ❌                    | 950                 |
+| Moonshine | ❌                    | 868                 |
 
 ### Inference time
 
 :::warning warning
-Times presented in the tables are measured as consecutive runs of the model. Initial run times may be up to 2x longer due to model loading and initialization.
+Given that Whisper accepts 30-second audio chunks, we employed a streaming algorithm to maintain consistency across long audio files. Therefore, data presented in this table may differ from what you experience in your apps.
 :::
 
 #### Decoder
 
 | Model | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
 | ------ | -----------------------| ---------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 18.5 tokens/s | 12.4 tokens/s | 12.4 tokens/s | 20.0 tokens/s |
+| Whisper (tiny.en) | 8.65 tokens/s | 5.41 tokens/s | 5,31 tokens/s | 20.0 tokens/s |
+| Moonshine | 13.23 tokens/s | 7.77 tokens/s | 7.61 tokens/s | 20.0 tokens/s |
 
 #### Encoder
 | Model | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
 | ------ | -----------------------| ---------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 0.71s | 1.06s | 1.18s | 1.00s |
+| Whisper (tiny.en) | 1.00s | 1.40s | 1.49s | 1.00s |
+| Moonshine | 0.48s | 0.69s | 0.69s | 1.00s |
 
 #
 

From 13583708cb470ce77aedba775a65567c8d961515 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Mon, 3 Mar 2025 15:39:26 +0100
Subject: [PATCH 05/35] wip

---
 docs/docs/speech-to-text/whisper.md | 64 ++++++++++++++---------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
index 4bcb3de914..224b2c2aeb 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/whisper.md
@@ -6,18 +6,18 @@ sidebar_position: 1
 With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants. As of now, [every supported STT model](#supported-models) runs on the XNNPack backend.
 
 :::info
-Currently, we do not support direct microphone input streaming to the model. Instead, in  `v0.3.0`, we provide a way to transcribe an audio file.
+Currently, we do not support direct microphone input streaming to the model. Instead, in v0.3.0, we provide a way to transcribe an audio file.
 :::
 
 ## Reference
 
 ```typescript
-import { useSpeechToText, MOONSINE_TOKENIZER_URL, MOONSHINE_ENCODER_URL, MOONSHINE_DECODER_URL } from 'react-native-executorch';
+import { useSpeechToText, MOONSHINE_TINY_TOKENIZER_URL, MOONSHINE_TINY_ENCODER_URL, MOONSHINE_TINY_DECODER_URL } from 'react-native-executorch';
 
 const model = useSpeechToText({
-  encoderSource: MOONSHINE_ENCODER_URL,
-  decoderSource: MOONSHINE_DECODER_URL,
-  tokenizerSource: MOONSHINE_TOKENIZER_URL
+  encoderSource: MOONSHINE_TINY_ENCODER_URL,
+  decoderSource: MOONSHINE_TINY_DECODER_URL,
+  tokenizerSource: MOONSHINE_TINY_TOKENIZER_URL,
   modelName: 'moonshine',
 });
 
@@ -49,14 +49,14 @@ A literal of `"moonshine" | "whisper"` which serves as an identifier for which m
 
 ### Returns
 
-| Field          | Type                                                         | Description                                                                                              |
-| -------------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------- |
-| `transcribe`      | `(input: number[]?) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. This can be obtained from the `loadAudio` function. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
-| `loadAudio`     |   `(url: string) => number[]`                                 | Loads audio file from given URL and returns a waveform, which serves as an input to `transcribe()`. It also sets an internal state for the input, so when you call `loadAudio`, you don't need to pass anything to `transcribe`.         |
-| `error`        | <code>string &#124; null</code>                              | Contains the error message if the model failed to load.                                                  |
-| `response`        | <code>string &#124; null</code>                              | This property is updated with each generated token.                                                  |
-| `isGenerating` | `boolean`                                                    | Indicates whether the model is currently processing an inference.                                        |
-| `isReady`      | `boolean`                                                    | Indicates whether the model has successfully loaded and is ready for inference.                          |
+| Field          | Type                                    | Description                                                                                                                                                                                                                                                                                                             |
+| -------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `transcribe`   | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. This can be obtained from the `loadAudio` function. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
+| `loadAudio`    | `(url: string) => number[]`             | Loads audio file from given URL and returns a waveform, which serves as an input to `transcribe()`. It also sets an internal state for the input, so when you call `loadAudio`, you don't need to pass anything to `transcribe`.                                                                                        |
+| `error`        | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                                                                                 |
+| `response`     | <code>string &#124; null</code>         | This property is updated with each generated token.                                                                                                                                                                                                                                                                     |
+| `isGenerating` | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                                                                       |
+| `isReady`      | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                                                                         |
 
 ## Running the model
 
@@ -67,13 +67,13 @@ To run the model, you can use the `transcribe` method. It accepts one optional a
 
 ```typescript
 import { Button, Text } from 'react-native';
-import { useSpeechToText, WHISPER_TOKENIZER_URL, WHISPER_ENCODER_URL, WHISPER_DECODER_URL } from 'react-native-executorch';
+import { useSpeechToText, WHISPER_TINY_TOKENIZER_URL, WHISPER_TINY_ENCODER_URL, WHISPER_TINY_DECODER_URL } from 'react-native-executorch';
 
 function App() {
   const model = useSpeechToText({
-    encoderSource: WHISPER_ENCODER_URL,
-    decoderSource: WHISPER_DECODER_URL,
-    tokenizerSource: WHISPER_TOKENIZER_URL
+    encoderSource: WHISPER_TINY_ENCODER_URL,
+    decoderSource: WHISPER_TINY_DECODER_URL,
+    tokenizerSource: WHISPER_TINY_TOKENIZER_URL,
     modelName: 'whisper',
   });
   const audioUrl = 'file:///Users/.../never-gonna-give-you-up.mp3';
@@ -92,8 +92,8 @@ function App() {
 }
 ```
 ## Supported models
-- [Whisper tiny.en](https://github.com/openai/whisper)
-- [Moonshine](https://github.com/usefulsensors/moonshine)
+- [Whisper (tiny.en)](https://github.com/openai/whisper)
+- [Moonshine (tiny)](https://github.com/usefulsensors/moonshine)
 
 ## Benchmarks
 
@@ -101,15 +101,15 @@ function App() {
 
 | Model             | XNNPACK [MB] |
 | ----------------- | ------------ |
-| Whisper | 231     |
-| Moonshine | 149   |
+| Whisper (tiny.en) | 231          |
+| Moonshine (tiny)  | 149          |
 
 ### Memory usage
 
 | Model             | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
 | ----------------- | ---------------------- | ------------------ |
-| Whisper | ❌                    | 950                 |
-| Moonshine | ❌                    | 868                 |
+| Whisper (tiny.en) | ❌                      | 950                |
+| Moonshine (tiny)  | ❌                      | 868                |
 
 ### Inference time
 
@@ -119,16 +119,14 @@ Given that Whisper accepts a 30 seconds audio chunks, we employed a streaming al
 
 #### Decoder
 
-| Model | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
-| ------ | -----------------------| ---------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 8.65 tokens/s | 5.41 tokens/s | 5,31 tokens/s | 20.0 tokens/s |
-| Moonshine | 13.23 tokens/s | 7.77 tokens/s | 7.61 tokens/s | 20.0 tokens/s |
+| Model             | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
+| ----------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
+| Whisper (tiny.en) | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | 20.0 tokens/s                |
+| Moonshine (tiny)  | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | 20.0 tokens/s                |
 
 #### Encoder
-| Model | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
-| ------ | -----------------------| ---------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 1.00s | 1.40s | 1.49s | 1.00s |
-| Moonshine | 0.48s | 0.69s | 0.69s | 1.00s |
-
-#
+| Model             | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
+| ----------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
+| Whisper (tiny.en) | 1.00s                   | 1.40s                   | 1.49s                 | 1.00s                        |
+| Moonshine (tiny)  | 0.48s                   | 0.69s                   | 0.69s                 | 1.00s                        |
 

From b64d90e7bbc7070a9dd29d271b8e4f37088f9306 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Mon, 3 Mar 2025 15:40:59 +0100
Subject: [PATCH 06/35] delete android benchmarks as they have to be redone

---
 docs/docs/speech-to-text/whisper.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
index 224b2c2aeb..536fbaacc2 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/whisper.md
@@ -121,12 +121,12 @@ Given that Whisper accepts a 30 seconds audio chunks, we employed a streaming al
 
 | Model             | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
 | ----------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | 20.0 tokens/s                |
-| Moonshine (tiny)  | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | 20.0 tokens/s                |
+| Whisper (tiny.en) | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                            |
+| Moonshine (tiny)  | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                            |
 
 #### Encoder
 | Model             | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
 | ----------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 1.00s                   | 1.40s                   | 1.49s                 | 1.00s                        |
-| Moonshine (tiny)  | 0.48s                   | 0.69s                   | 0.69s                 | 1.00s                        |
+| Whisper (tiny.en) | 1.00s                   | 1.40s                   | 1.49s                 | ❌                            |
+| Moonshine (tiny)  | 0.48s                   | 0.69s                   | 0.69s                 | ❌                            |
 

From 2f076a778891004cd98f00a06d868f100abe2066 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Mon, 3 Mar 2025 15:45:43 +0100
Subject: [PATCH 07/35] typo fix

---
 docs/docs/speech-to-text/whisper.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
index 536fbaacc2..99b8485ec4 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/whisper.md
@@ -51,7 +51,7 @@ A literal of `"moonshine" | "whisper"` which serves as an identifier for which m
 
 | Field          | Type                                    | Description                                                                                                                                                                                                                                                                                                             |
 | -------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `transcribe`   | `(input: number[]?) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. This can be obtained from the `loadAudio` function. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
+| `transcribe`   | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. This can be obtained from the `loadAudio` function. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
 | `loadAudio`    | `(url: string) => number[]`             | Loads audio file from given URL and returns a waveform, which serves as an input to `transcribe()`. It also sets an internal state for the input, so when you call `loadAudio`, you don't need to pass anything to `transcribe`.                                                                                        |
 | `error`        | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                                                                                 |
 | `response`     | <code>string &#124; null</code>         | This property is updated with each generated token.                                                                                                                                                                                                                                                                     |

From 7ad17d6db4cd20c9fea2fe409372d8eab6168c70 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Mon, 3 Mar 2025 15:47:59 +0100
Subject: [PATCH 08/35] fix lies

---
 docs/docs/speech-to-text/whisper.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
index 99b8485ec4..5a79e558b1 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/whisper.md
@@ -52,7 +52,7 @@ A literal of `"moonshine" | "whisper"` which serves as an identifier for which m
 | Field          | Type                                    | Description                                                                                                                                                                                                                                                                                                             |
 | -------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `transcribe`   | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. This can be obtained from the `loadAudio` function. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
-| `loadAudio`    | `(url: string) => number[]`             | Loads audio file from given URL and returns a waveform, which serves as an input to `transcribe()`. It also sets an internal state for the input, so when you call `loadAudio`, you don't need to pass anything to `transcribe`.                                                                                        |
+| `loadAudio`    | `(url: string) => void`                 | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                                                                                  |
 | `error`        | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                                                                                 |
 | `response`     | <code>string &#124; null</code>         | This property is updated with each generated token.                                                                                                                                                                                                                                                                     |
 | `isGenerating` | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                                                                       |

From c8c55735d71714ceb24238d815e95a0d6ddfa8f9 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Mon, 3 Mar 2025 15:52:35 +0100
Subject: [PATCH 09/35] cosmetics

---
 docs/docs/speech-to-text/whisper.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/whisper.md
index 5a79e558b1..cbd387635e 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/whisper.md
@@ -24,7 +24,7 @@ const model = useSpeechToText({
 const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';
 
 try {
-  const audio = await model.loadAudio(audioUrl);
+  await model.loadAudio(audioUrl);
   const transcription = await model.transcribe();
   console.log(transcription);
 } catch (error) {
@@ -43,20 +43,20 @@ Analogous to the encoderSource, this takes in a string which is a source for the
 A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do.
 
 `modelName`
-A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used
+A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used.
 
 
 
 ### Returns
 
-| Field          | Type                                    | Description                                                                                                                                                                                                                                                                                                             |
-| -------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `transcribe`   | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. This can be obtained from the `loadAudio` function. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
-| `loadAudio`    | `(url: string) => void`                 | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                                                                                  |
-| `error`        | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                                                                                 |
-| `response`     | <code>string &#124; null</code>         | This property is updated with each generated token.                                                                                                                                                                                                                                                                     |
-| `isGenerating` | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                                                                       |
-| `isReady`      | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                                                                         |
+| Field          | Type                                    | Description                                                                                                                                                                                                                                                         |
+| -------------- | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `transcribe`   | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
+| `loadAudio`    | `(url: string) => void`                 | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                              |
+| `error`        | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                             |
+| `response`     | <code>string &#124; null</code>         | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property.                                                                                                                                                                                                                 |
+| `isGenerating` | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                   |
+| `isReady`      | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                     |
 
 ## Running the model
 

From 389f6e1251df51ba7f2296c47ec2d9d7d15abc30 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 08:07:36 +0100
Subject: [PATCH 10/35] wip

---
 .../{whisper.md => speech-to-text.md}          | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)
 rename docs/docs/speech-to-text/{whisper.md => speech-to-text.md} (90%)

diff --git a/docs/docs/speech-to-text/whisper.md b/docs/docs/speech-to-text/speech-to-text.md
similarity index 90%
rename from docs/docs/speech-to-text/whisper.md
rename to docs/docs/speech-to-text/speech-to-text.md
index cbd387635e..d9dee32025 100644
--- a/docs/docs/speech-to-text/whisper.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -31,21 +31,27 @@ try {
   console.error(error);
 }
 ```
+### Streaming
+Given that STT models need to take in a specified sequence length, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm that uses overlapping audio chunks which might introduce some overhead, but gives way better results.
 
 ### Arguments
 **`encoderSource`**
 A string that specifies the location of a .pte file for the encoder. For further information on passing model sources, check out [Loading Models](https://docs.swmansion.com/react-native-executorch/docs/fundamentals/loading-models).
 
-`decoderSource`
-Analogous to the encoderSource, this takes in a string which is a source for the decoder part of the model.
+`decoderSource?`
+Analogous to the encoderSource, this takes in a string which is a source for the decoder part of the model. Defaults to our [HuggingFace repositories]().
 
-`tokenizerSource`
-A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do.
+`tokenizerSource?`
+A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do. Defaults to our [HuggingFace repositories]().
 
-`modelName`
-A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used.
+`modelName?`
+A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used. Defaults to our [HuggingFace repositories]().
 
+`overlapSeconds?`
+Specifies the length of overlap between each audio chunk.
 
+`windowSize?`
+Specifies the size of each audio chunk.
 
 ### Returns
 

From 19c7e9d966e28347635d1d2f5c1b070ee87e2ddd Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 08:16:26 +0100
Subject: [PATCH 11/35] change tokenizer urls

---
 docs/docs/speech-to-text/speech-to-text.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index d9dee32025..254a6a742d 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -12,12 +12,12 @@ Currently, we do not support direct microphone input streaming to the model. Ins
 ## Reference
 
 ```typescript
-import { useSpeechToText, MOONSHINE_TINY_TOKENIZER_URL, MOONSHINE_TINY_ENCODER_URL, MOONSHINE_TINY_DECODER_URL } from 'react-native-executorch';
+import { useSpeechToText, MOONSHINE_TOKENIZER_URL, MOONSHINE_TINY_ENCODER_URL, MOONSHINE_TINY_DECODER_URL } from 'react-native-executorch';
 
 const model = useSpeechToText({
   encoderSource: MOONSHINE_TINY_ENCODER_URL,
   decoderSource: MOONSHINE_TINY_DECODER_URL,
-  tokenizerSource: MOONSHINE_TINY_TOKENIZER_URL
+  tokenizerSource: MOONSHINE_TOKENIZER_URL,
   modelName: 'moonshine',
 });
 
@@ -73,13 +73,13 @@ To run the model, you can use the `transcribe` method. It accepts one optional a
 
 ```typescript
 import { Button, Text } from 'react-native';
-import { useSpeechToText, WHISPER_TINY_TOKENIZER_URL, WHISPER_TINY_ENCODER_URL, WHISPER_TINY_DECODER_URL } from 'react-native-executorch';
+import { useSpeechToText, WHISPER_TOKENIZER_URL, WHISPER_TINY_ENCODER_URL, WHISPER_TINY_DECODER_URL } from 'react-native-executorch';
 
 function App() {
   const model = useSpeechToText({
     encoderSource: WHISPER_TINY_ENCODER_URL,
     decoderSource: WHISPER_TINY_DECODER_URL,
-    tokenizerSource: WHISPER_TINY_TOKENIZER_URL
+    tokenizerSource: WHISPER_TOKENIZER_URL,
     modelName: 'whisper',
   });
   const audioUrl = 'file:///Users/.../never-gonna-give-you-up.mp3';

From 80b6fd71c1b26c1f18a58f1980c776ca7e0b6dcf Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 08:21:36 +0100
Subject: [PATCH 12/35] Add info about constants, improve styling

---
 docs/docs/speech-to-text/speech-to-text.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 254a6a742d..416c07dfbf 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -35,22 +35,22 @@ try {
 Given that STT models need to take in a specified sequence length, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm that uses overlapping audio chunks which might introduce some overhead, but gives way better results.
 
 ### Arguments
-**`encoderSource`**
-A string that specifies the location of a .pte file for the encoder. For further information on passing model sources, check out [Loading Models](https://docs.swmansion.com/react-native-executorch/docs/fundamentals/loading-models).
+**`modelName`**
+A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used.
 
-`decoderSource?`
-Analogous to the encoderSource, this takes in a string which is a source for the decoder part of the model. Defaults to our [HuggingFace repositories]().
+**`encoderSource?`**
+A string that specifies the location of a .pte file for the encoder. For further information on passing model sources, check out [Loading Models](https://docs.swmansion.com/react-native-executorch/docs/fundamentals/loading-models). Defaults to (constants)[https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts] for given model.
 
-`tokenizerSource?`
-A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do. Defaults to our [HuggingFace repositories]().
+**`decoderSource?`**
+Analogous to the encoderSource, this takes in a string which is a source for the decoder part of the model. Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model.
 
-`modelName?`
-A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used. Defaults to our [HuggingFace repositories]().
+**`tokenizerSource?`**
+A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do. Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model.
 
-`overlapSeconds?`
+**`overlapSeconds?`**
 Specifies the length of overlap between each audio chunk.
 
-`windowSize?`
+**`windowSize?`**
 Specifies the size of each audio chunk.
 
 ### Returns

From 3ecc0025f5cc3f1a20c64394b426496160138cbb Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 08:31:09 +0100
Subject: [PATCH 13/35] add missing param, rename response to sequence

---
 docs/docs/speech-to-text/speech-to-text.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 416c07dfbf..304c2619cc 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -60,13 +60,14 @@ Specifies the size of each audio chunk.
 | `transcribe`   | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
 | `loadAudio`    | `(url: string) => void`                 | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                              |
 | `error`        | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                             |
-| `response`     | <code>string &#124; null</code>         | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property.                                                                                                                                                                                                                 |
+| `sequence`     | <code>string &#124; null</code>         | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property.                                                                                                                                                                                                                 |
 | `isGenerating` | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                   |
-| `isReady`      | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                     |
+| `isReady`      | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference. |
+| `downloadProgress`      | `number`                               | Tracks the progress of the model download process.                                                                                                                          |
 
 ## Running the model
 
-To run the model, you can use the `transcribe` method. It accepts one optional argument, which is the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio. This function returns a promise, which will return the generated tokens when everything succeeds. If the model fails during inference, it will throw an error. If you want to obtain tokens in streaming fashion, you can also use the `.response` property which is updated with each generated token, analogously to the useLLM hook.
+To run the model, you can use the `transcribe` method. It accepts one optional argument, which is the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio. This function returns a promise that resolves with the generated transcription once inference succeeds. If the model fails during inference, it will throw an error. If you want to obtain tokens in a streaming fashion, you can also use the `.sequence` property, which is updated with each generated token, analogously to the `useLLM` hook.
 
 
 ## Example
@@ -92,7 +93,7 @@ function App() {
         await model.transcribe();
       })
     />
-    <Text>{model.response}</Text>
+    <Text>{model.sequence}</Text>
   )
   // ... Rest of your component
 }

From 0e6193ca22c3deda7a9ee2eed3be54d6b098be1a Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 08:34:58 +0100
Subject: [PATCH 14/35] fix link

---
 docs/docs/speech-to-text/speech-to-text.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 304c2619cc..8d2e8d945b 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -39,7 +39,7 @@ Given that STT models need to take in a specified sequence length, there is a ne
 A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used.
 
 **`encoderSource?`**
-A string that specifies the location of a .pte file for the encoder. For further information on passing model sources, check out [Loading Models](https://docs.swmansion.com/react-native-executorch/docs/fundamentals/loading-models). Defaults to (constants)[https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts] for given model.
+A string that specifies the location of a .pte file for the encoder. For further information on passing model sources, check out [Loading Models](https://docs.swmansion.com/react-native-executorch/docs/fundamentals/loading-models). Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model.
 
 **`decoderSource?`**
 Analogous to the encoderSource, this takes in a string which is a source for the decoder part of the model. Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model.

From 95bae0223d31c9250dbfec0af5ff4ff24883e9e2 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 08:35:33 +0100
Subject: [PATCH 15/35] fix example

---
 docs/docs/speech-to-text/speech-to-text.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 8d2e8d945b..cd8e9c57d6 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -89,7 +89,7 @@ function App() {
     <Button
       onPress=(async () => {
         // Alternatively, you can obtain audio from any other source and pass it to transcribe()
-        model.loadAudio(audioUrl);
+        await model.loadAudio(audioUrl);
         await model.transcribe();
       })
     />

From 631c330f2a03afdf8e4a483d8ecdf2614dcdcfd2 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 10:28:03 +0100
Subject: [PATCH 16/35] add benchmarks to benchmarks section of the docs

---
 docs/docs/benchmarks/inference-time.md     | 24 ++++++++++++++++++----
 docs/docs/benchmarks/memory-usage.md       |  9 ++++++++
 docs/docs/benchmarks/model-size.md         |  9 ++++++++
 docs/docs/speech-to-text/speech-to-text.md | 20 +++++++++---------
 4 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/docs/docs/benchmarks/inference-time.md b/docs/docs/benchmarks/inference-time.md
index c1f91a3b7b..d0595b55ba 100644
--- a/docs/docs/benchmarks/inference-time.md
+++ b/docs/docs/benchmarks/inference-time.md
@@ -32,11 +32,27 @@ Times presented in the tables are measured as consecutive runs of the model. Ini
 
 | Model                 | iPhone 16 Pro (XNNPACK) [tokens/s] | iPhone 13 Pro (XNNPACK) [tokens/s] | iPhone SE 3 (XNNPACK) [tokens/s] | Samsung Galaxy S24 (XNNPACK) [tokens/s] | OnePlus 12 (XNNPACK) [tokens/s] |
 | --------------------- | ---------------------------------- | ---------------------------------- | -------------------------------- | --------------------------------------- | ------------------------------- |
-| LLAMA3_2_1B           | 16.1                               | 11.4                               | ❌                               | 15.6                                    | 19.3                            |
+| LLAMA3_2_1B           | 16.1                               | 11.4                               | ❌                                | 15.6                                    | 19.3                            |
 | LLAMA3_2_1B_SPINQUANT | 40.6                               | 16.7                               | 16.5                             | 40.3                                    | 48.2                            |
 | LLAMA3_2_1B_QLORA     | 31.8                               | 11.4                               | 11.2                             | 37.3                                    | 44.4                            |
-| LLAMA3_2_3B           | ❌                                 | ❌                                 | ❌                               | ❌                                      | 7.1                             |
-| LLAMA3_2_3B_SPINQUANT | 17.2                               | 8.2                                | ❌                               | 16.2                                    | 19.4                            |
-| LLAMA3_2_3B_QLORA     | 14.5                               | ❌                                 | ❌                               | 14.8                                    | 18.1                            |
+| LLAMA3_2_3B           | ❌                                  | ❌                                  | ❌                                | ❌                                       | 7.1                             |
+| LLAMA3_2_3B_SPINQUANT | 17.2                               | 8.2                                | ❌                                | 16.2                                    | 19.4                            |
+| LLAMA3_2_3B_QLORA     | 14.5                               | ❌                                  | ❌                                | 14.8                                    | 18.1                            |
 
 ❌ - Insufficient RAM.
+
+## Speech to text
+
+### Encoder
+
+| Model          | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
+| -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
+| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                            |
+| WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                            |
+
+### Decoder
+
+| Model          | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
+| -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
+| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                            |
+| WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                            |
diff --git a/docs/docs/benchmarks/memory-usage.md b/docs/docs/benchmarks/memory-usage.md
index 868a0884b6..235a724544 100644
--- a/docs/docs/benchmarks/memory-usage.md
+++ b/docs/docs/benchmarks/memory-usage.md
@@ -34,3 +34,12 @@ sidebar_position: 2
 | LLAMA3_2_3B           | 7.1                    | 7.3                |
 | LLAMA3_2_3B_SPINQUANT | 3.7                    | 3.8                |
 | LLAMA3_2_3B_QLORA     | 4                      | 4.1                |
+
+## Speech to text
+
+| Model          | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
+| -------------- | ---------------------- | ------------------ |
+| MOONSHINE_TINY | ❌                      | 868                |
+| WHISPER_TINY   | ❌                      | 950                |
+
+
diff --git a/docs/docs/benchmarks/model-size.md b/docs/docs/benchmarks/model-size.md
index a80f59d47f..c1eac882d7 100644
--- a/docs/docs/benchmarks/model-size.md
+++ b/docs/docs/benchmarks/model-size.md
@@ -34,3 +34,12 @@ sidebar_position: 1
 | LLAMA3_2_3B           | 6.43         |
 | LLAMA3_2_3B_SPINQUANT | 2.55         |
 | LLAMA3_2_3B_QLORA     | 2.65         |
+
+## Speech to text
+
+| Model          | XNNPACK [MB] |
+| -------------- | ------------ |
+| MOONSHINE_TINY | 148.9        |
+| WHISPER_TINY   | 231.0        |
+
+
diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index cd8e9c57d6..3ae83b9bb9 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -55,15 +55,15 @@ Specifies the size of each audio chunk.
 
 ### Returns
 
-| Field          | Type                                    | Description                                                                                                                                                                                                                                                         |
-| -------------- | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `transcribe`   | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
-| `loadAudio`    | `(url: string) => void`                 | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                              |
-| `error`        | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                             |
-| `sequence`     | <code>string &#124; null</code>         | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property.                                                                                                                                                                                                                 |
-| `isGenerating` | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                   |
-| `isReady`      | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference. |
-| `downloadProgress`      | `number`                               | Tracks the progress of the model download process.                                                                                                                          |
+| Field              | Type                                    | Description                                                                                                                                                                                                                                                         |
+| ------------------ | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `transcribe`       | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
+| `loadAudio`        | `(url: string) => void`                 | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                              |
+| `error`            | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                             |
+| `sequence`         | <code>string &#124; null</code>         | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property.                                                                                                                          |
+| `isGenerating`     | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                   |
+| `isReady`          | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                     |
+| `downloadProgress` | `number`                                | Tracks the progress of the model download process.                                                                                                                                                                                                                  |
 
 ## Running the model
 
@@ -83,7 +83,7 @@ function App() {
     tokenizerSource: WHISPER_TOKENIZER_URL
     modelName: 'whisper',
   });
-  const audioUrl = 'file:///Users/.../never-gonna-give-you-up.mp3';
+  const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';
 
   return (
     <Button

From dda1f256653b6104e92a7a46964372680553ed93 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 10:38:41 +0100
Subject: [PATCH 17/35] finished stt docs

---
 docs/docs/speech-to-text/speech-to-text.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 3ae83b9bb9..424345c093 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -9,6 +9,10 @@ With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Sp
 Currently, we do not support direct microphone input streaming to the model. Instead, in  v0.3.0, we provide a way to transcribe an audio file.
 :::
 
+:::caution
+It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/software-mansion/react-native-executorch-moonshine-tiny). You can also use [constants](https://github.com/software-mansion/react-native-executorch/tree/main/src/constants/modelUrls.ts) shipped with our library.
+:::
+
 ## Reference
 
 ```typescript

From ddbd3ced7329fd9372b7e33c1b13e91701ddd6a0 Mon Sep 17 00:00:00 2001
From: jakmro <jakub.mroz@swmansion.com>
Date: Wed, 5 Mar 2025 12:09:13 +0100
Subject: [PATCH 18/35] Shift sidebar by one

---
 docs/docs/benchmarks/_category_.json   | 2 +-
 docs/docs/hookless-api/_category_.json | 2 +-
 docs/docs/module-api/_category_.json   | 2 +-
 docs/docs/utils/_category_.json        | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/docs/benchmarks/_category_.json b/docs/docs/benchmarks/_category_.json
index 001b34959d..e790334696 100644
--- a/docs/docs/benchmarks/_category_.json
+++ b/docs/docs/benchmarks/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Benchmarks",
-  "position": 7,
+  "position": 8,
   "link": {
     "type": "generated-index"
   }
diff --git a/docs/docs/hookless-api/_category_.json b/docs/docs/hookless-api/_category_.json
index e96f518638..6c0a89084f 100644
--- a/docs/docs/hookless-api/_category_.json
+++ b/docs/docs/hookless-api/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Hookless API",
-  "position": 4,
+  "position": 5,
   "link": {
     "type": "generated-index"
   }
diff --git a/docs/docs/module-api/_category_.json b/docs/docs/module-api/_category_.json
index b04000182d..8cc82679ce 100644
--- a/docs/docs/module-api/_category_.json
+++ b/docs/docs/module-api/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Module API",
-  "position": 5,
+  "position": 6,
   "link": {
     "type": "generated-index"
   }
diff --git a/docs/docs/utils/_category_.json b/docs/docs/utils/_category_.json
index 4bbbc17380..fe7e29fe8c 100644
--- a/docs/docs/utils/_category_.json
+++ b/docs/docs/utils/_category_.json
@@ -1,6 +1,6 @@
 {
   "label": "Utils",
-  "position": 6,
+  "position": 7,
   "link": {
     "type": "generated-index"
   }

From fadcd1f9de8323bfb321a773bd248d5a7be585eb Mon Sep 17 00:00:00 2001
From: jakmro <jakub.mroz@swmansion.com>
Date: Wed, 5 Mar 2025 12:11:49 +0100
Subject: [PATCH 19/35] Fix formatting

---
 docs/docs/benchmarks/inference-time.md     | 16 ++++++++--------
 docs/docs/benchmarks/memory-usage.md       |  6 ++----
 docs/docs/benchmarks/model-size.md         |  2 --
 docs/docs/speech-to-text/speech-to-text.md | 22 +++++++++++++---------
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/docs/docs/benchmarks/inference-time.md b/docs/docs/benchmarks/inference-time.md
index d0595b55ba..de5a6628ee 100644
--- a/docs/docs/benchmarks/inference-time.md
+++ b/docs/docs/benchmarks/inference-time.md
@@ -32,12 +32,12 @@ Times presented in the tables are measured as consecutive runs of the model. Ini
 
 | Model                 | iPhone 16 Pro (XNNPACK) [tokens/s] | iPhone 13 Pro (XNNPACK) [tokens/s] | iPhone SE 3 (XNNPACK) [tokens/s] | Samsung Galaxy S24 (XNNPACK) [tokens/s] | OnePlus 12 (XNNPACK) [tokens/s] |
 | --------------------- | ---------------------------------- | ---------------------------------- | -------------------------------- | --------------------------------------- | ------------------------------- |
-| LLAMA3_2_1B           | 16.1                               | 11.4                               | ❌                                | 15.6                                    | 19.3                            |
+| LLAMA3_2_1B           | 16.1                               | 11.4                               | ❌                               | 15.6                                    | 19.3                            |
 | LLAMA3_2_1B_SPINQUANT | 40.6                               | 16.7                               | 16.5                             | 40.3                                    | 48.2                            |
 | LLAMA3_2_1B_QLORA     | 31.8                               | 11.4                               | 11.2                             | 37.3                                    | 44.4                            |
-| LLAMA3_2_3B           | ❌                                  | ❌                                  | ❌                                | ❌                                       | 7.1                             |
-| LLAMA3_2_3B_SPINQUANT | 17.2                               | 8.2                                | ❌                                | 16.2                                    | 19.4                            |
-| LLAMA3_2_3B_QLORA     | 14.5                               | ❌                                  | ❌                                | 14.8                                    | 18.1                            |
+| LLAMA3_2_3B           | ❌                                 | ❌                                 | ❌                               | ❌                                      | 7.1                             |
+| LLAMA3_2_3B_SPINQUANT | 17.2                               | 8.2                                | ❌                               | 16.2                                    | 19.4                            |
+| LLAMA3_2_3B_QLORA     | 14.5                               | ❌                                 | ❌                               | 14.8                                    | 18.1                            |
 
 ❌ - Insufficient RAM.
 
@@ -47,12 +47,12 @@ Times presented in the tables are measured as consecutive runs of the model. Ini
 
 | Model          | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
 | -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                            |
-| WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                            |
+| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |
+| WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                           |
 
 ### Decoder
 
 | Model          | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
 | -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                            |
-| WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                            |
+| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |
+| WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                           |
diff --git a/docs/docs/benchmarks/memory-usage.md b/docs/docs/benchmarks/memory-usage.md
index 235a724544..3f43fe4594 100644
--- a/docs/docs/benchmarks/memory-usage.md
+++ b/docs/docs/benchmarks/memory-usage.md
@@ -39,7 +39,5 @@ sidebar_position: 2
 
 | Model          | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
 | -------------- | ---------------------- | ------------------ |
-| MOONSHINE_TINY | ❌                      | 868                |
-| WHISPER_TINY   | ❌                      | 950                |
-
-
+| MOONSHINE_TINY | ❌                     | 868                |
+| WHISPER_TINY   | ❌                     | 950                |
diff --git a/docs/docs/benchmarks/model-size.md b/docs/docs/benchmarks/model-size.md
index c1eac882d7..f01a1ab34e 100644
--- a/docs/docs/benchmarks/model-size.md
+++ b/docs/docs/benchmarks/model-size.md
@@ -41,5 +41,3 @@ sidebar_position: 1
 | -------------- | ------------ |
 | MOONSHINE_TINY | 148.9        |
 | WHISPER_TINY   | 231.0        |
-
-
diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 424345c093..99b48aaa63 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -6,7 +6,7 @@ sidebar_position: 1
 With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants. As of now, [every supported STT model](#supported-models) runs on the XNNPack backend.
 
 :::info
-Currently, we do not support direct microphone input streaming to the model. Instead, in  v0.3.0, we provide a way to transcribe an audio file.
+Currently, we do not support direct microphone input streaming to the model. Instead, in v0.3.0, we provide a way to transcribe an audio file.
 :::
 
 :::caution
@@ -35,10 +35,13 @@ try {
   console.error(error);
 }
 ```
+
 ### Streaming
+
 Given that STT models need to take in a specified sequence length, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm that uses overlapping audio chunks which might introduce some overhead, but gives way better results.
 
 ### Arguments
+
 **`modelName`**
 A literal of `"moonshine" | "whisper"` which serves as an identifier for which model should be used.
 
@@ -73,7 +76,6 @@ Specifies the size of each audio chunk.
 
 To run the model, you can use the `transcribe` method. It accepts one optional argument, which is the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio. This function returns a promise, which will return the generated tokens when everything succeeds. If the model fails during inference, it will throw an error. If you want to obtain tokens in streaming fashion, you can also use the `.sequence` property which is updated with each generated token, analogously to the useLLM hook.
 
-
 ## Example
 
 ```typescript
@@ -102,7 +104,9 @@ function App() {
   // ... Rest of your component
 }
 ```
+
 ## Supported models
+
 - [Whisper (tiny.en)](https://github.com/openai/whisper)
 - [Moonshine (tiny)](https://github.com/usefulsensors/moonshine)
 
@@ -119,8 +123,8 @@ function App() {
 
 | Model             | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
 | ----------------- | ---------------------- | ------------------ |
-| Whisper (tiny.en) | ❌                      | 950                |
-| Moonshine (tiny)  | ❌                      | 868                |
+| Whisper (tiny.en) | ❌                     | 950                |
+| Moonshine (tiny)  | ❌                     | 868                |
 
 ### Inference time
 
@@ -132,12 +136,12 @@ Given that Whisper accepts a 30 seconds audio chunks, we employed a streaming al
 
 | Model             | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
 | ----------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                            |
-| Moonshine (tiny)  | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                            |
+| Whisper (tiny.en) | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                           |
+| Moonshine (tiny)  | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |
 
 #### Encoder
+
 | Model             | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
 | ----------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 1.00s                   | 1.40s                   | 1.49s                 | ❌                            |
-| Moonshine (tiny)  | 0.48s                   | 0.69s                   | 0.69s                 | ❌                            |
-
+| Whisper (tiny.en) | 1.00s                   | 1.40s                   | 1.49s                 | ❌                           |
+| Moonshine (tiny)  | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |

From 1cfa6679065f0a62aa0f87e3c9a2b311a8ec035c Mon Sep 17 00:00:00 2001
From: jakmro <jakub.mroz@swmansion.com>
Date: Wed, 5 Mar 2025 12:21:00 +0100
Subject: [PATCH 20/35] Change naming

---
 docs/docs/benchmarks/inference-time.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/docs/benchmarks/inference-time.md b/docs/docs/benchmarks/inference-time.md
index de5a6628ee..e60af1203a 100644
--- a/docs/docs/benchmarks/inference-time.md
+++ b/docs/docs/benchmarks/inference-time.md
@@ -43,16 +43,16 @@ Times presented in the tables are measured as consecutive runs of the model. Ini
 
 ## Speech to text
 
-### Encoder
+### Encoding
 
-| Model          | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
+| Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
 | -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
 | MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |
 | WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                           |
 
-### Decoder
+### Decoding
 
-| Model          | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
+| Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
 | -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
 | MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |
 | WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                           |

From a8ca420e30decf19563031a458f81a4d063dfc78 Mon Sep 17 00:00:00 2001
From: jakmro <jakub.mroz@swmansion.com>
Date: Wed, 5 Mar 2025 12:41:36 +0100
Subject: [PATCH 21/35] Styling fixes

---
 docs/docs/benchmarks/inference-time.md     |  4 +-
 docs/docs/benchmarks/memory-usage.md       |  2 +-
 docs/docs/benchmarks/model-size.md         |  2 +-
 docs/docs/speech-to-text/speech-to-text.md | 44 +++++++++++-----------
 4 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/docs/docs/benchmarks/inference-time.md b/docs/docs/benchmarks/inference-time.md
index e60af1203a..4bb685a51d 100644
--- a/docs/docs/benchmarks/inference-time.md
+++ b/docs/docs/benchmarks/inference-time.md
@@ -47,12 +47,12 @@ Times presented in the tables are measured as consecutive runs of the model. Ini
 
 | Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
 | -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |
 | WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                           |
+| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |
 
 ### Decoding
 
 | Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
 | -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |
 | WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                           |
+| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |
diff --git a/docs/docs/benchmarks/memory-usage.md b/docs/docs/benchmarks/memory-usage.md
index 3f43fe4594..c1d174fb86 100644
--- a/docs/docs/benchmarks/memory-usage.md
+++ b/docs/docs/benchmarks/memory-usage.md
@@ -39,5 +39,5 @@ sidebar_position: 2
 
 | Model          | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
 | -------------- | ---------------------- | ------------------ |
-| MOONSHINE_TINY | ❌                     | 868                |
 | WHISPER_TINY   | ❌                     | 950                |
+| MOONSHINE_TINY | ❌                     | 868                |
diff --git a/docs/docs/benchmarks/model-size.md b/docs/docs/benchmarks/model-size.md
index f01a1ab34e..78fc8ccaa4 100644
--- a/docs/docs/benchmarks/model-size.md
+++ b/docs/docs/benchmarks/model-size.md
@@ -39,5 +39,5 @@ sidebar_position: 1
 
 | Model          | XNNPACK [MB] |
 | -------------- | ------------ |
-| MOONSHINE_TINY | 148.9        |
 | WHISPER_TINY   | 231.0        |
+| MOONSHINE_TINY | 148.9        |
diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 99b48aaa63..82a78ff3c1 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -3,7 +3,7 @@ title: useSpeechToText
 sidebar_position: 1
 ---
 
-With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that allows to transform spoken language to written text. It is commonly used to implement features such as transcription or voice assistants. As of now, [every supported STT model](#supported-models) runs on the XNNPack backend.
+With the latest `v0.3.0` release we introduce a new hook - `useSpeechToText`. Speech to text is a task that transforms spoken language into written text. It is commonly used to implement features such as transcription or voice assistants. As of now, [all supported STT models](#supported-models) run on the XNNPACK backend.
 
 :::info
 Currently, we do not support direct microphone input streaming to the model. Instead, in v0.3.0, we provide a way to transcribe an audio file.
@@ -107,24 +107,26 @@ function App() {
 
 ## Supported models
 
-- [Whisper (tiny.en)](https://github.com/openai/whisper)
-- [Moonshine (tiny)](https://github.com/usefulsensors/moonshine)
+| Model                                                                 | Language |
+| --------------------------------------------------------------------- | -------- |
+| [Whisper tiny.en](https://huggingface.co/openai/whisper-tiny.en)      | English  |
+| [Moonshine tiny](https://huggingface.co/UsefulSensors/moonshine-tiny) | English  |
 
 ## Benchmarks
 
 ### Model size
 
-| Model             | XNNPACK [MB] |
-| ----------------- | ------------ |
-| Whisper (tiny.en) | 231          |
-| Moonshine tiny    | 149          |
+| Model          | XNNPACK [MB] |
+| -------------- | ------------ |
+| WHISPER_TINY   | 231.0        |
+| MOONSHINE_TINY | 148.9        |
 
 ### Memory usage
 
-| Model             | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
-| ----------------- | ---------------------- | ------------------ |
-| Whisper (tiny.en) | ❌                     | 950                |
-| Moonshine (tiny)  | ❌                     | 868                |
+| Model          | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
+| -------------- | ---------------------- | ------------------ |
+| WHISPER_TINY   | ❌                     | 950                |
+| MOONSHINE_TINY | ❌                     | 868                |
 
 ### Inference time
 
@@ -132,16 +134,16 @@ function App() {
 Given that Whisper accepts a 30 seconds audio chunks, we employed a streaming algorithm to maintain consistency across long audio files. Therefore, data presented in this table may differ from what you experience in your apps.
 :::
 
-#### Decoder
+#### Encoding
 
-| Model             | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
-| ----------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                           |
-| Moonshine (tiny)  | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |
+| Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
+| -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
+| WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                           |
+| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |
 
-#### Encoder
+#### Decoding
 
-| Model             | iPhone 16 Pro (XNNPack) | iPhone 13 Pro (XNNPack) | iPhone SE 3 (XNNPack) | Samsung Galaxy S24 (XNNPack) |
-| ----------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| Whisper (tiny.en) | 1.00s                   | 1.40s                   | 1.49s                 | ❌                           |
-| Moonshine (tiny)  | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |
+| Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
+| -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
+| WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                           |
+| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |

From 21964f602b39c474620ebce678bbe9ff4df45e84 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 13:22:19 +0100
Subject: [PATCH 22/35] Add missing comma

---
 docs/docs/speech-to-text/speech-to-text.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 82a78ff3c1..39f738e77e 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -21,7 +21,7 @@ import { useSpeechToText, MOONSHINE_TOKENIZER_URL, MOONSHINE_TINY_ENCODER_URL, M
 const model = useSpeechToText({
   encoderSource: MOONSHINE_TINY_ENCODER_URL,
   decoderSource: MOONSHINE_TINY_DECODER_URL,
-  tokenizerSource: MOONSHINE_TOKENIZER_URL
+  tokenizerSource: MOONSHINE_TOKENIZER_URL,
   modelName: 'moonshine',
 });
 
@@ -86,7 +86,7 @@ function App() {
   const model = useSpeechToText({
     encoderSource: WHISPER_TINY_ENCODER_URL,
     decoderSource: WHISPER_TINY_DECODER_URL,
-    tokenizerSource: WHISPER_TOKENIZER_URL
+    tokenizerSource: WHISPER_TOKENIZER_URL,
     modelName: 'whisper',
   });
   const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';

From 99769c064a0a5c235bb1cfa4cd52ab845f634063 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 13:33:46 +0100
Subject: [PATCH 23/35] Fix syntax issues in example code

---
 docs/docs/speech-to-text/speech-to-text.md | 40 +++++++++++++---------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/speech-to-text.md
index 39f738e77e..18aed72e90 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/speech-to-text.md
@@ -74,7 +74,7 @@ Specifies the size of each audio chunk.
 
 ## Running the model
 
-To run the model, you can use the `transcribe` method. It accepts one optional argument, which is the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio. This function returns a promise, which will return the generated tokens when everything succeeds. If the model fails during inference, it will throw an error. If you want to obtain tokens in streaming fashion, you can also use the `.sequence` property which is updated with each generated token, analogously to the useLLM hook.
+To run the model, you can use the `transcribe` method. It accepts one optional argument, which is the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio. This function returns a promise, which will return the generated tokens when everything succeeds. If the model fails during inference, it will throw an error. If you want to obtain tokens in streaming fashion, you can also use the `sequence` property which is updated with each generated token, analogously to the useLLM hook.
 
 ## Example
 
@@ -89,19 +89,25 @@ function App() {
     tokenizerSource: WHISPER_TOKENIZER_URL,
     modelName: 'whisper',
   });
+
   const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';
 
   return (
-    <Button
-      onPress=(async () => {
-        // Alternatively, you can obtain audio from any other source and pass it to transcribe()
-        await model.loadAudio(audioUrl);
-        await model.transcribe();
-      })
-    />
-    <Text>{model.sequence}</Text>
-  )
-  // ... Rest of your component
+    <View>
+      <Button
+        onPress={async () => {
+          try {
+            await model.loadAudio(audioUrl);
+            await model.transcribe();
+          } catch (error) {
+            console.error("Error transcribing audio:", error);
+          }
+        }}
+        title="Transcribe"
+      />
+      <Text>{model.sequence}</Text>
+    </View>
+  );
 }
 ```
 
@@ -125,8 +131,8 @@ function App() {
 
 | Model          | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
 | -------------- | ---------------------- | ------------------ |
-| WHISPER_TINY   | ❌                     | 950                |
-| MOONSHINE_TINY | ❌                     | 868                |
+| WHISPER_TINY   | ❌                      | 950                |
+| MOONSHINE_TINY | ❌                      | 868                |
 
 ### Inference time
 
@@ -138,12 +144,12 @@ Given that Whisper accepts a 30 seconds audio chunks, we employed a streaming al
 
 | Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
 | -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                           |
-| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |
+| WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                            |
+| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                            |
 
 #### Decoding
 
 | Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
 | -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                           |
-| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |
+| WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                            |
+| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                            |

From 1a3ee9bc6829df781b93395cb916f6b322c63a19 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Wed, 5 Mar 2025 15:04:43 +0100
Subject: [PATCH 24/35] add docs for hookless API

---
 docs/docs/hookless-api/SpeechToTextModule.md | 44 ++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 docs/docs/hookless-api/SpeechToTextModule.md

diff --git a/docs/docs/hookless-api/SpeechToTextModule.md b/docs/docs/hookless-api/SpeechToTextModule.md
new file mode 100644
index 0000000000..f0db11b23c
--- /dev/null
+++ b/docs/docs/hookless-api/SpeechToTextModule.md
@@ -0,0 +1,44 @@
+---
+title: SpeechToTextModule
+sidebar_position: 6
+---
+
+Hookless implementation of the [useSpeechToText](../speech-to-text/) hook.
+
+## Reference
+
+```typescript
+import { SpeechToTextModule } from 'react-native-executorch';
+
+const audioUrl = 'https://www.your-url.com/cool-music.mp3';
+
+// Loading the model
+const onSequenceUpdate = (sequence) => {
+    console.log(sequence);
+};
+await SpeechToTextModule.load('moonshine', onSequenceUpdate);
+
+// Loading the audio and running the model
+await SpeechToTextModule.loadAudio(audioUrl);
+const transcribedText = await SpeechToTextModule.transcribe();
+```
+
+### Methods
+
+| Method       | Type                                                                                                                                                                                                                                                                       | Description                                                                                                                                                                                                                                                                                                                               |
+| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `load`       | <code>(modelName: 'whisper' &#124 'moonshine, transcribeCallback?: (sequence: string) => void, modelDownloadProgressCalback?: (downloadProgress: number) => void, encoderSource?: ResourceSource, decoderSource?: ResourceSource, tokenizerSource?: ResourceSource)</code> | Loads the model specified with `modelName`, where `encoderSource`, `decoderSource`, `tokenizerSource` are strings specifying the location of the binaries for the models. `modelDownloadProgressCallback` allows you to monitor the current progress of the model download, while `transcribeCallback` is invoked with each generated token |
+| `transcribe` | `(waveform: number[]): Promise<string>`                                                                                                                                                                                                                                    | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished.                                                                       |
+| `loadAudio`  | `(url: string) => void`                                                                                                                                                                                                                                                    | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                                                                                                    |
+
+## Loading the model
+
+To load the model, use the `load` method. It accepts the `encoderSource`, `decoderSource`, `tokenizerSource` which are strings that specify the location of the binaries for the model. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void.
+
+## Running the model
+
+To run the model, you can use the `transcribe` method. It accepts one argument, which is an array of numbers representing a waveform at 16kHz sampling rate. The method returns a promise, which can resolve either to an error or a string containing the output text.
+
+## Obtaining the input
+
+To get the input, you can use the `loadAudio` method, which sets the internal input state of the model. Then you can just call `transcribe` without passing any args. It is also possible to pass inputs from other sources, as long as it is a float array containing the aforementioned waveform.

From 014eb120688751e5e225c7fa69624a74fe978734 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Thu, 6 Mar 2025 09:31:55 +0100
Subject: [PATCH 25/35] add encode and decode to hookless api

---
 docs/docs/hookless-api/SpeechToTextModule.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/docs/docs/hookless-api/SpeechToTextModule.md b/docs/docs/hookless-api/SpeechToTextModule.md
index f0db11b23c..0c8e8decec 100644
--- a/docs/docs/hookless-api/SpeechToTextModule.md
+++ b/docs/docs/hookless-api/SpeechToTextModule.md
@@ -25,11 +25,13 @@ const transcribedText = await SpeechToTextModule.transcribe();
 
 ### Methods
 
-| Method       | Type                                                                                                                                                                                                                                                                       | Description                                                                                                                                                                                                                                                                                                                               |
-| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Method       | Type                                                                                                                                                                                                                                                                       | Description                                                                                                                                                                                                                                                                                                                                 |
+| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `load`       | <code>(modelName: 'whisper' &#124 'moonshine, transcribeCallback?: (sequence: string) => void, modelDownloadProgressCalback?: (downloadProgress: number) => void, encoderSource?: ResourceSource, decoderSource?: ResourceSource, tokenizerSource?: ResourceSource)</code> | Loads the model specified with `modelName`, where `encoderSource`, `decoderSource`, `tokenizerSource` are strings specifying the location of the binaries for the models. `modelDownloadProgressCallback` allows you to monitor the current progress of the model download, while `transcribeCallback` is invoked with each generated token |
-| `transcribe` | `(waveform: number[]): Promise<string>`                                                                                                                                                                                                                                    | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished.                                                                       |
-| `loadAudio`  | `(url: string) => void`                                                                                                                                                                                                                                                    | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                                                                                                    |
+| `transcribe` | `(waveform?: number[]) => Promise<string>`                                                                                                                                                                                                                                 | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished.                                                                         |
+| `loadAudio`  | `(url: string) => void`                                                                                                                                                                                                                                                    | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                                                                                                      |
+| `encode`     | `(waveform: number[]) => Promise<number[]>`                                                                                                                                                                                                                                | Runs the encoding part of the model. Returns a float array representing the output of the encoder.                                                                                                                                                                                                                                          |
+| `decode`     | `(tokens: number[], encodings: number[]) => Promise<number[]>`                                                                                                                                                                                                             | Runs the decoder of the model. Returns a single token representing the next token in the output sequence.                                                                                                                                                                                                                                   |
 
 ## Loading the model
 

From fb3bdc370a584a70cc57bffb6b8cd18aa3ec4381 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Thu, 6 Mar 2025 10:42:52 +0100
Subject: [PATCH 26/35] remove Rick Astley :(

---
 docs/docs/hookless-api/SpeechToTextModule.md             | 2 +-
 .../{speech-to-text.md => useSpeechToText.md}            | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)
 rename docs/docs/speech-to-text/{speech-to-text.md => useSpeechToText.md} (89%)

diff --git a/docs/docs/hookless-api/SpeechToTextModule.md b/docs/docs/hookless-api/SpeechToTextModule.md
index 0c8e8decec..21696ab054 100644
--- a/docs/docs/hookless-api/SpeechToTextModule.md
+++ b/docs/docs/hookless-api/SpeechToTextModule.md
@@ -10,7 +10,7 @@ Hookless implementation of the [useSpeechToText](../speech-to-text/) hook.
 ```typescript
 import { SpeechToTextModule } from 'react-native-executorch';
 
-const audioUrl = 'https://www.your-url.com/cool-music.mp3';
+const audioUrl = 'https://www.your-url.com/cool-audio.mp3';
 
 // Loading the model
 const onSequenceUpdate = (sequence) => {
diff --git a/docs/docs/speech-to-text/speech-to-text.md b/docs/docs/speech-to-text/useSpeechToText.md
similarity index 89%
rename from docs/docs/speech-to-text/speech-to-text.md
rename to docs/docs/speech-to-text/useSpeechToText.md
index 18aed72e90..e138c87df4 100644
--- a/docs/docs/speech-to-text/speech-to-text.md
+++ b/docs/docs/speech-to-text/useSpeechToText.md
@@ -25,7 +25,7 @@ const model = useSpeechToText({
   modelName: 'moonshine',
 });
 
-const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';
+const audioUrl = 'https://your-url.com/your-audio.mp3';
 
 try {
   await model.loadAudio(audioUrl);
@@ -38,7 +38,7 @@ try {
 
 ### Streaming
 
-Given that STT models need to take in a specified sequence length, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm that uses overlapping audio chunks which might introduce some overhead, but gives way better results.
+Given that STT models take in a fixed-length sequence, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm that uses overlapping audio chunks, which might introduce some overhead but yields much better transcription results for longer audio.
 
 ### Arguments
 
@@ -74,7 +74,8 @@ Specifies the size of each audio chunk.
 
 ## Running the model
 
-To run the model, you can use the `transcribe` method. It accepts one optional argument, which is the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio. This function returns a promise, which will return the generated tokens when everything succeeds. If the model fails during inference, it will throw an error. If you want to obtain tokens in streaming fashion, you can also use the `sequence` property which is updated with each generated token, analogously to the useLLM hook.
+To run the model, you can use the `transcribe` method. It accepts one optional argument: the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio.
+This function returns a promise, which resolves to the generated tokens when successful. If the model fails during inference, it will throw an error. If you want to obtain tokens in a streaming fashion, you can also use the `sequence` property, which is updated with each generated token, similar to the [useLLM](../llms/useLLM.md) hook.
 
 ## Example
 
@@ -90,7 +91,7 @@ function App() {
     modelName: 'whisper',
   });
 
-  const audioUrl = 'https://your-url.com/never-gonna-give-you-up.mp3';
+  const audioUrl = 'https://your-url.com/your-audio.mp3';
 
   return (
     <View>

From cab80b5a260b594d6ca2dfc1e59615128a6cfff8 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Thu, 6 Mar 2025 12:16:59 +0100
Subject: [PATCH 27/35] rephrase overlapSeconds param

---
 docs/docs/speech-to-text/useSpeechToText.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/speech-to-text/useSpeechToText.md b/docs/docs/speech-to-text/useSpeechToText.md
index e138c87df4..d880d84287 100644
--- a/docs/docs/speech-to-text/useSpeechToText.md
+++ b/docs/docs/speech-to-text/useSpeechToText.md
@@ -55,7 +55,7 @@ Analogous to the encoderSource, this takes in a string which is a source for the
 A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do. Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model.
 
 **`overlapSeconds?`**
-Specifies the length of overlap between each audio chunk.
+Specifies the length of overlap between consecutive audio chunks.
 
 **`windowSize?`**
 Specifies the size of each audio chunk.

From 4130127e7f67f5ec303f0b823cef1e09a3e0474f Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Thu, 6 Mar 2025 12:33:06 +0100
Subject: [PATCH 28/35] remove inference time

---
 docs/docs/benchmarks/inference-time.md      | 16 ----------------
 docs/docs/speech-to-text/useSpeechToText.md | 18 ++----------------
 2 files changed, 2 insertions(+), 32 deletions(-)

diff --git a/docs/docs/benchmarks/inference-time.md b/docs/docs/benchmarks/inference-time.md
index 4bb685a51d..c1f91a3b7b 100644
--- a/docs/docs/benchmarks/inference-time.md
+++ b/docs/docs/benchmarks/inference-time.md
@@ -40,19 +40,3 @@ Times presented in the tables are measured as consecutive runs of the model. Ini
 | LLAMA3_2_3B_QLORA     | 14.5                               | ❌                                 | ❌                               | 14.8                                    | 18.1                            |
 
 ❌ - Insufficient RAM.
-
-## Speech to text
-
-### Encoding
-
-| Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
-| -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                           |
-| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                           |
-
-### Decoding
-
-| Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
-| -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                           |
-| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                           |
diff --git a/docs/docs/speech-to-text/useSpeechToText.md b/docs/docs/speech-to-text/useSpeechToText.md
index d880d84287..e6f26c778e 100644
--- a/docs/docs/speech-to-text/useSpeechToText.md
+++ b/docs/docs/speech-to-text/useSpeechToText.md
@@ -138,19 +138,5 @@ function App() {
 ### Inference time
 
 :::warning warning
-Given that Whisper accepts a 30 seconds audio chunks, we employed a streaming algorithm to maintain consistency across long audio files. Therefore, data presented in this table may differ from what you experience in your apps.
-:::
-
-#### Encoding
-
-| Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
-| -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| WHISPER_TINY   | 1.00s                   | 1.40s                   | 1.49s                 | ❌                            |
-| MOONSHINE_TINY | 0.48s                   | 0.69s                   | 0.69s                 | ❌                            |
-
-#### Decoding
-
-| Model          | iPhone 16 Pro (XNNPACK) | iPhone 13 Pro (XNNPACK) | iPhone SE 3 (XNNPACK) | Samsung Galaxy S24 (XNNPACK) |
-| -------------- | ----------------------- | ----------------------- | --------------------- | ---------------------------- |
-| WHISPER_TINY   | 8.65 tokens/s           | 5.41 tokens/s           | 5.31 tokens/s         | ❌                            |
-| MOONSHINE_TINY | 13.23 tokens/s          | 7.77 tokens/s           | 7.61 tokens/s         | ❌                            |
+Given that Whisper accepts a 30 seconds audio chunks, we employed a streaming algorithm to maintain consistency across long audio files. Therefore, the inference time for benchmarks are not there yet.
+:::
\ No newline at end of file

From decd2a65a8f4e4d8bdbb330c18927cc7c8abdc0e Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Thu, 6 Mar 2025 12:37:51 +0100
Subject: [PATCH 29/35] Add type definitions to SpeechToTextModule docs

---
 docs/docs/hookless-api/SpeechToTextModule.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/docs/hookless-api/SpeechToTextModule.md b/docs/docs/hookless-api/SpeechToTextModule.md
index 21696ab054..2440a6c774 100644
--- a/docs/docs/hookless-api/SpeechToTextModule.md
+++ b/docs/docs/hookless-api/SpeechToTextModule.md
@@ -33,6 +33,15 @@ const transcribedText = await SpeechToTextModule.transcribe();
 | `encode`     | `(waveform: number[]) => Promise<number[]>`                                                                                                                                                                                                                                | Runs the encoding part of the model. Returns a float array representing the output of the encoder.                                                                                                                                                                                                                                          |
 | `decode`     | `(tokens: number[], encodings: number[]) => Promise<number[]>`                                                                                                                                                                                                             | Runs the decoder of the model. Returns a single token representing a next token in the output sequence.                                                                                                                                                                                                                                     |
 
+<details>
+<summary>Type definitions</summary>
+
+```typescript
+type ResourceSource = string | number;
+```
+
+</details>
+
 ## Loading the model
 
 To load the model, use the `load` method. It accepts the `encoderSource`, `decoderSource`, `tokenizerSource` which are strings that specify the location of the binaries for the model. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void.

From f2c60e80fcc8d37d7da5dd90085a0bdc350dbfa0 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Thu, 6 Mar 2025 12:43:35 +0100
Subject: [PATCH 30/35] Add missing load() info

---
 docs/docs/hookless-api/SpeechToTextModule.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/hookless-api/SpeechToTextModule.md b/docs/docs/hookless-api/SpeechToTextModule.md
index 2440a6c774..2438c8431d 100644
--- a/docs/docs/hookless-api/SpeechToTextModule.md
+++ b/docs/docs/hookless-api/SpeechToTextModule.md
@@ -44,7 +44,7 @@ type ResourceSource = string | number;
 
 ## Loading the model
 
-To load the model, use the `load` method. It accepts the `encoderSource`, `decoderSource`, `tokenizerSource` which are strings that specify the location of the binaries for the model. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void.
+To load the model, use the `load` method. The required argument is `modelName`, which serves as an identifier for which model to use. It also accepts optional arguments such as `encoderSource`, `decoderSource`, `tokenizerSource` which are strings that specify the location of the binaries for the model. For more information, take a look at [loading models](../fundamentals/loading-models.md) page. This method returns a promise, which can resolve to an error or void.
 
 ## Running the model
 

From eb0f63c0f9a7938eb8e7e637528cee5ca2f275b0 Mon Sep 17 00:00:00 2001
From: Jakub Mroz <115979017+jakmro@users.noreply.github.com>
Date: Thu, 6 Mar 2025 12:38:14 +0000
Subject: [PATCH 31/35] Add and improve memory usage benchmarks

---
 docs/docs/benchmarks/memory-usage.md        | 4 ++--
 docs/docs/speech-to-text/useSpeechToText.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/docs/benchmarks/memory-usage.md b/docs/docs/benchmarks/memory-usage.md
index c1d174fb86..c105b6d286 100644
--- a/docs/docs/benchmarks/memory-usage.md
+++ b/docs/docs/benchmarks/memory-usage.md
@@ -39,5 +39,5 @@ sidebar_position: 2
 
 | Model          | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
 | -------------- | ---------------------- | ------------------ |
-| WHISPER_TINY   | ❌                     | 950                |
-| MOONSHINE_TINY | ❌                     | 868                |
+| WHISPER_TINY   | 900                    | 600                |
+| MOONSHINE_TINY | 650                    | 560                |
diff --git a/docs/docs/speech-to-text/useSpeechToText.md b/docs/docs/speech-to-text/useSpeechToText.md
index e6f26c778e..7721746dbe 100644
--- a/docs/docs/speech-to-text/useSpeechToText.md
+++ b/docs/docs/speech-to-text/useSpeechToText.md
@@ -132,8 +132,8 @@ function App() {
 
 | Model          | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
 | -------------- | ---------------------- | ------------------ |
-| WHISPER_TINY   | ❌                      | 950                |
-| MOONSHINE_TINY | ❌                      | 868                |
+| WHISPER_TINY   | 900                    | 600                |
+| MOONSHINE_TINY | 650                    | 560                |
 
 ### Inference time
 

From 3e7f804b50982fa21e307875e8d694c4f91689bd Mon Sep 17 00:00:00 2001
From: Jakub Mroz <115979017+jakmro@users.noreply.github.com>
Date: Thu, 6 Mar 2025 14:40:11 +0000
Subject: [PATCH 32/35] Fix links

---
 docs/docs/hookless-api/ClassificationModule.md  | 2 +-
 docs/docs/hookless-api/LLMModule.md             | 2 +-
 docs/docs/hookless-api/ObjectDetectionModule.md | 2 +-
 docs/docs/hookless-api/StyleTransferModule.md   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/docs/hookless-api/ClassificationModule.md b/docs/docs/hookless-api/ClassificationModule.md
index 732971db27..2e62cbd4ab 100644
--- a/docs/docs/hookless-api/ClassificationModule.md
+++ b/docs/docs/hookless-api/ClassificationModule.md
@@ -3,7 +3,7 @@ title: ClassificationModule
 sidebar_position: 1
 ---
 
-Hookless implementation of the [useClassification](../computer-vision/useClassification.mdx) hook.
+Hookless implementation of the [useClassification](../computer-vision/useClassification.md) hook.
 
 ## Reference
 
diff --git a/docs/docs/hookless-api/LLMModule.md b/docs/docs/hookless-api/LLMModule.md
index d52e2e0376..037b151bff 100644
--- a/docs/docs/hookless-api/LLMModule.md
+++ b/docs/docs/hookless-api/LLMModule.md
@@ -3,7 +3,7 @@ title: LLMModule
 sidebar_position: 3
 ---
 
-Hookless implementation of the [useLLM](../llms/running-llms.md) hook.
+Hookless implementation of the [useLLM](../llms/useLLM.md) hook.
 
 ## Reference
 
diff --git a/docs/docs/hookless-api/ObjectDetectionModule.md b/docs/docs/hookless-api/ObjectDetectionModule.md
index 2cc3504ef4..6c730b7fe0 100644
--- a/docs/docs/hookless-api/ObjectDetectionModule.md
+++ b/docs/docs/hookless-api/ObjectDetectionModule.md
@@ -3,7 +3,7 @@ title: ObjectDetectionModule
 sidebar_position: 5
 ---
 
-Hookless implementation of the [useObjectDetection](../computer-vision/useObjectDetection.mdx) hook.
+Hookless implementation of the [useObjectDetection](../computer-vision/useObjectDetection.md) hook.
 
 ## Reference
 
diff --git a/docs/docs/hookless-api/StyleTransferModule.md b/docs/docs/hookless-api/StyleTransferModule.md
index f084d8cad5..29c750bee3 100644
--- a/docs/docs/hookless-api/StyleTransferModule.md
+++ b/docs/docs/hookless-api/StyleTransferModule.md
@@ -3,7 +3,7 @@ title: StyleTransferModule
 sidebar_position: 4
 ---
 
-Hookless implementation of the [useStyleTransfer](../computer-vision/useStyleTransfer.mdx) hook.
+Hookless implementation of the [useStyleTransfer](../computer-vision/useStyleTransfer.md) hook.
 
 ## Reference
 

From aecc0c0d8be8e9dc8363b73ed0f0e95391bf216d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Kopci=C5=84ski?=
 <mateusz.kopcinski@swmansnion.com>
Date: Thu, 6 Mar 2025 15:47:23 +0100
Subject: [PATCH 33/35] documentation changes

---
 docs/docs/speech-to-text/useSpeechToText.md | 57 ++++++++-------------
 1 file changed, 22 insertions(+), 35 deletions(-)

diff --git a/docs/docs/speech-to-text/useSpeechToText.md b/docs/docs/speech-to-text/useSpeechToText.md
index 7721746dbe..8173ee0d4f 100644
--- a/docs/docs/speech-to-text/useSpeechToText.md
+++ b/docs/docs/speech-to-text/useSpeechToText.md
@@ -16,29 +16,26 @@ It is recommended to use models provided by us, which are available at our [Hugg
 ## Reference
 
 ```typescript
-import { useSpeechToText, MOONSHINE_TOKENIZER_URL, MOONSHINE_TINY_ENCODER_URL, MOONSHINE_TINY_DECODER_URL } from 'react-native-executorch';
+import { useSpeechToText } from 'react-native-executorch';
 
-const model = useSpeechToText({
-  encoderSource: MOONSHINE_TINY_ENCODER_URL,
-  decoderSource: MOONSHINE_TINY_DECODER_URL,
-  tokenizerSource: MOONSHINE_TOKENIZER_URL,
+const {transcribe, error} = useSpeechToText({
   modelName: 'moonshine',
 });
 
-const audioUrl = 'https://your-url.com/your-audio.mp3';
+const audioUrl = ...; // url with audio to transcribe
 
-try {
-  await model.loadAudio(audioUrl);
-  const transcription = await model.transcribe();
+await model.loadAudio(audioUrl);
+const transcription = await transcribe();
+if (error) {
+  console.log(error);
+} else {
   console.log(transcription);
-} catch (error) {
-  console.error(error);
 }
 ```
 
 ### Streaming
 
-Given that STT models take in a fixed length sequence, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm that uses overlapping audio chunks which might introduce some overhead, but yield way better transcription results for longer audio.
+Given that STT models can process audio no longer than 30 seconds, there is a need to chunk the input audio. Chunking audio may result in cutting speech mid-sentence, which might be hard to understand for the model. To make it work, we employed an algorithm (adapted for mobile devices from [whisper-streaming](https://aclanthology.org/2023.ijcnlp-demo.3.pdf)) that uses overlapping audio chunks. This might introduce some overhead, but allows for processing audio inputs of arbitrary length.
 
 ### Arguments
 
@@ -55,10 +52,10 @@ Analogous to the encoderSource, this takes in a string which is a source for the
 A string that specifies the location to the tokenizer for the model. This works just as the encoder and decoder do. Defaults to [constants](https://github.com/software-mansion/react-native-executorch/blob/main/src/constants/modelUrls.ts) for given model.
 
 **`overlapSeconds?`**
-Specifies the length of overlap between consecutive audio chunks.
+Specifies the length of overlap between consecutive audio chunks (expressed in seconds).
 
 **`windowSize?`**
-Specifies the size of each audio chunk.
+Specifies the size of each audio chunk (expressed in seconds).
 
 ### Returns
 
@@ -66,28 +63,24 @@ Specifies the size of each audio chunk.
 | ------------------ | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `transcribe`       | `(input?: number[]) => Promise<string>` | Starts a transcription process for a given input array, which should be a waveform at 16kHz. When no input is provided, it uses an internal state which is set by calling `loadAudio`. Resolves a promise with the output transcription when the model is finished. |
 | `loadAudio`        | `(url: string) => void`                 | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                              |
-| `error`            | <code>string &#124; null</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                             |
-| `sequence`         | <code>string &#124; null</code>         | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property.                                                                                                                          |
-| `isGenerating`     | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                   |
-| `isReady`          | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                     |
+| `error`            | <code>Error &#124; undefined</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                             |
+| `sequence`         | <code>string</code>         | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property.                                                                                                                          |
+| `isModelGenerating`     | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                   |
+| `isModelReady`          | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                     |
 | `downloadProgress` | `number`                                | Tracks the progress of the model download process.                                                                                                                                                                                                                  |
 
 ## Running the model
 
-To run the model, you can use the `transcribe` method. It accepts one optional argument: the waveform representation of the audio. If you called `loadAudio` beforehand, you don't need to pass anything to `transcribe`. However, you can still pass this argument if you want to use your own audio.
-This function returns a promise, which resolves to the generated tokens when successful. If the model fails during inference, it will throw an error. If you want to obtain tokens in a streaming fashion, you can also use the sequence property, which is updated with each generated token, similar to the [useLLM](../llms/useLLM.md) hook.
+Before running the model's `transcribe` method, be sure to obtain the waveform of the audio you wish to transcribe. You can either use the `loadAudio` method to load audio from a URL and save it in the model's internal state, or obtain the waveform on your own (remember to use a sampling rate of 16kHz!). In the latter case, just pass the obtained waveform as an argument to the `transcribe` method, which returns a promise resolving to the generated tokens when successful. If the model fails during inference, the `error` property contains details of the error. If you want to obtain tokens in a streaming fashion, you can also use the `sequence` property, which is updated with each generated token, similar to the [useLLM](../llms/useLLM.md) hook.
 
 ## Example
 
 ```typescript
 import { Button, Text } from 'react-native';
-import { useSpeechToText, WHISPER_TOKENIZER_URL, WHISPER_TINY_ENCODER_URL, WHISPER_TINY_DECODER_URL } from 'react-native-executorch';
+import { useSpeechToText } from 'react-native-executorch';
 
 function App() {
-  const model = useSpeechToText({
-    encoderSource: WHISPER_TINY_ENCODER_URL,
-    decoderSource: WHISPER_TINY_DECODER_URL,
-    tokenizerSource: WHISPER_TOKENIZER_URL,
+  const {loadAudio, transcribe, sequence} = useSpeechToText({
     modelName: 'whisper',
   });
 
@@ -98,15 +91,15 @@ function App() {
       <Button
         onPress={async () => {
           try {
-            await model.loadAudio(audioUrl);
-            await model.transcribe();
+            await loadAudio(audioUrl);
+            await transcribe();
           } catch (error) {
             console.error("Error transcribing audio:", error);
           }
-        }}
+        }
         title="Transcribe"
       />
-      <Text>{model.sequence}</Text>
+      <Text>{sequence}</Text>
     </View>
   );
 }
@@ -134,9 +127,3 @@ function App() {
 | -------------- | ---------------------- | ------------------ |
 | WHISPER_TINY   | 900                    | 600                |
 | MOONSHINE_TINY | 650                    | 560                |
-
-### Inference time
-
-:::warning warning
-Given that Whisper accepts a 30 seconds audio chunks, we employed a streaming algorithm to maintain consistency across long audio files. Therefore, the inference time for benchmarks are not there yet.
-:::
\ No newline at end of file

From 67a960d69cd513354881b2535a60c05dc16e0491 Mon Sep 17 00:00:00 2001
From: chmjkb <jakubchmura1607@gmail.com>
Date: Thu, 6 Mar 2025 15:55:32 +0100
Subject: [PATCH 34/35] cosmetic stuf

---
 docs/docs/speech-to-text/useSpeechToText.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/docs/speech-to-text/useSpeechToText.md b/docs/docs/speech-to-text/useSpeechToText.md
index 8173ee0d4f..53dd749b91 100644
--- a/docs/docs/speech-to-text/useSpeechToText.md
+++ b/docs/docs/speech-to-text/useSpeechToText.md
@@ -18,11 +18,11 @@ It is recommended to use models provided by us, which are available at our [Hugg
 ```typescript
 import { useSpeechToText } from 'react-native-executorch';
 
-const {transcribe, error} = useSpeechToText({
+const { transcribe, error } = useSpeechToText({
   modelName: 'moonshine',
 });
 
-const audioUrl = ...; // url with audio to transcribe
+const audioUrl = ...; // URL with audio to transcribe
 
 await model.loadAudio(audioUrl);
 const transcription = await transcribe();
@@ -80,11 +80,11 @@ import { Button, Text } from 'react-native';
 import { useSpeechToText } from 'react-native-executorch';
 
 function App() {
-  const {loadAudio, transcribe, sequence} = useSpeechToText({
+  const { loadAudio, transcribe, sequence } = useSpeechToText({
     modelName: 'whisper',
   });
 
-  const audioUrl = 'https://your-url.com/your-audio.mp3';
+  const audioUrl = ...; // URL with audio to transcribe
 
   return (
     <View>

From 61e1df4e4919c61a6dd83e2793b8aca8228a37ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Kopci=C5=84ski?=
 <mateusz.kopcinski@swmansnion.com>
Date: Thu, 6 Mar 2025 16:00:49 +0100
Subject: [PATCH 35/35]  docs changes

---
 docs/docs/speech-to-text/useSpeechToText.md | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/docs/docs/speech-to-text/useSpeechToText.md b/docs/docs/speech-to-text/useSpeechToText.md
index 53dd749b91..6cde2e04cf 100644
--- a/docs/docs/speech-to-text/useSpeechToText.md
+++ b/docs/docs/speech-to-text/useSpeechToText.md
@@ -18,13 +18,13 @@ It is recommended to use models provided by us, which are available at our [Hugg
 ```typescript
 import { useSpeechToText } from 'react-native-executorch';
 
-const { transcribe, error } = useSpeechToText({
+const { transcribe, error, loadAudio } = useSpeechToText({
   modelName: 'moonshine',
 });
 
 const audioUrl = ...; // URL with audio to transcribe
 
-await model.loadAudio(audioUrl);
+await loadAudio(audioUrl);
 const transcription = await transcribe();
 if (error) {
   console.log(error);
@@ -65,8 +65,8 @@ Specifies the size of each audio chunk (expressed in seconds).
 | `loadAudio`        | `(url: string) => void`                 | Loads audio file from given url. It sets an internal state which serves as an input to `transcribe()`.                                                                                                                                                              |
 | `error`            | <code>Error &#124; undefined</code>         | Contains the error message if the model failed to load.                                                                                                                                                                                                             |
 | `sequence`         | <code>string</code>         | This property is updated with each generated token. If you're looking to obtain tokens as they're generated, you should use this property.                                                                                                                          |
-| `isModelGenerating`     | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                   |
-| `isModelReady`          | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                     |
+| `isGenerating`     | `boolean`                               | Indicates whether the model is currently processing an inference.                                                                                                                                                                                                   |
+| `isReady`          | `boolean`                               | Indicates whether the model has successfully loaded and is ready for inference.                                                                                                                                                                                     |
 | `downloadProgress` | `number`                                | Tracks the progress of the model download process.                                                                                                                                                                                                                  |
 
 ## Running the model
@@ -80,7 +80,7 @@ import { Button, Text } from 'react-native';
 import { useSpeechToText } from 'react-native-executorch';
 
 function App() {
-  const { loadAudio, transcribe, sequence } = useSpeechToText({
+  const { loadAudio, transcribe, sequence, error } = useSpeechToText({
     modelName: 'whisper',
   });
 
@@ -90,16 +90,12 @@ function App() {
     <View>
       <Button
         onPress={async () => {
-          try {
-            await loadAudio(audioUrl);
-            await transcribe();
-          } catch (error) {
-            console.error("Error transcribing audio:", error);
-          }
+          await loadAudio(audioUrl);
+          await transcribe();
         }
         title="Transcribe"
       />
-      <Text>{sequence}</Text>
+      <Text>{error ? error : sequence}</Text>
     </View>
   );
 }