4 changes: 4 additions & 0 deletions .cspell-wordlist.txt
@@ -53,6 +53,10 @@ Lexend
finetuned
MINILM
MPNET
DISTILUSE
distiluse
Distil
torchao
QINT
FNUZ
wordlist
10 changes: 10 additions & 0 deletions apps/text-embeddings/app/text-embeddings/index.tsx
@@ -18,6 +18,8 @@ import {
ALL_MPNET_BASE_V2,
MULTI_QA_MINILM_L6_COS_V1,
MULTI_QA_MPNET_BASE_DOT_V1,
DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
TextEmbeddingsProps,
} from 'react-native-executorch';

@@ -28,6 +30,14 @@ const MODELS: { label: string; value: TextEmbeddingModel }[] = [
{ label: 'MPNet Base', value: ALL_MPNET_BASE_V2 },
{ label: 'MultiQA MiniLM', value: MULTI_QA_MINILM_L6_COS_V1 },
{ label: 'MultiQA MPNet', value: MULTI_QA_MPNET_BASE_DOT_V1 },
{
label: 'Multilingual DistilUSE (8da4w)',
value: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
},
{
label: 'Multilingual DistilUSE (CoreML)',
value: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
},
];
import { useIsFocused } from '@react-navigation/native';
import { dotProduct } from '../../utils/math';
16 changes: 9 additions & 7 deletions docs/docs/02-benchmarks/inference-time.md
@@ -150,13 +150,15 @@ Average time to synthesize speech from an input text of approximately 60 tokens,
Benchmark times for text embeddings are highly dependent on the sentence length. The numbers below are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly.
:::

| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
| -------------------------- | :--------------------------: | :-----------------------: |
| ALL_MINILM_L6_V2 | 7 | 21 |
| ALL_MPNET_BASE_V2 | 24 | 90 |
| MULTI_QA_MINILM_L6_COS_V1 | 7 | 19 |
| MULTI_QA_MPNET_BASE_DOT_V1 | 24 | 88 |
| CLIP_VIT_BASE_PATCH32_TEXT | 14 | 39 |
| Model / Device | iPhone 17 Pro [ms] | OnePlus 12 [ms] |
| ---------------------------------------------------- | :----------------: | :-------------: |
| ALL_MINILM_L6_V2 (XNNPACK) | 7 | 21 |
| ALL_MPNET_BASE_V2 (XNNPACK) | 24 | 90 |
| MULTI_QA_MINILM_L6_COS_V1 (XNNPACK) | 7 | 19 |
| MULTI_QA_MPNET_BASE_DOT_V1 (XNNPACK) | 24 | 88 |
| CLIP_VIT_BASE_PATCH32_TEXT (XNNPACK) | 14 | 39 |
| DISTILUSE_BASE_MULTILINGUAL_CASED_V2 (XNNPACK 8da4w) | 16 | 15 |
| DISTILUSE_BASE_MULTILINGUAL_CASED_V2 (Core ML FP32) | 15 | - |
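
Tokenized input length dominates these timings, as the note above says. A minimal sketch of reproducing a single measurement in-app, assuming the `useTextEmbeddings` hook accepts one of the exported model objects via a `model` prop and exposes an async `forward(text)` method; check the hook reference for the exact API of your installed version.

```tsx
import { useTextEmbeddings, ALL_MINILM_L6_V2 } from 'react-native-executorch';

// Hypothetical helper: times one forward pass through the loaded model.
function useEmbeddingTimer() {
  const embeddings = useTextEmbeddings({ model: ALL_MINILM_L6_V2 });

  const timeOnce = async (sentence: string) => {
    const start = performance.now();
    await embeddings.forward(sentence);
    return performance.now() - start; // elapsed wall-clock time in ms
  };

  return { isReady: embeddings.isReady, timeOnce };
}
```

Averaging several runs after a warm-up pass gives numbers comparable to the table; the first call includes one-time setup and is not representative.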

## Image Embeddings

20 changes: 11 additions & 9 deletions docs/docs/02-benchmarks/memory-usage.md
@@ -98,13 +98,15 @@ The reported memory usage values include the memory footprint of the Phonemis pa

## Text Embeddings

| Model / Device | iPhone 17 Pro [MB] | OnePlus 12 [MB] |
| ------------------------------------ | :----------------: | :-------------: |
| ALL_MINILM_L6_V2 (XNNPACK) | 110 | 95 |
| ALL_MPNET_BASE_V2 (XNNPACK) | 455 | 405 |
| MULTI_QA_MINILM_L6_COS_V1 (XNNPACK) | 140 | 120 |
| MULTI_QA_MPNET_BASE_DOT_V1 (XNNPACK) | 455 | 435 |
| CLIP_VIT_BASE_PATCH32_TEXT (XNNPACK) | 280 | 200 |
| Model / Device | iPhone 17 Pro [MB] | OnePlus 12 [MB] |
| ---------------------------------------------------- | :----------------: | :-------------: |
| ALL_MINILM_L6_V2 (XNNPACK) | 110 | 95 |
| ALL_MPNET_BASE_V2 (XNNPACK) | 455 | 405 |
| MULTI_QA_MINILM_L6_COS_V1 (XNNPACK) | 140 | 120 |
| MULTI_QA_MPNET_BASE_DOT_V1 (XNNPACK) | 455 | 435 |
| CLIP_VIT_BASE_PATCH32_TEXT (XNNPACK) | 280 | 200 |
| DISTILUSE_BASE_MULTILINGUAL_CASED_V2 (XNNPACK 8da4w) | 36 | 44 |
| DISTILUSE_BASE_MULTILINGUAL_CASED_V2 (Core ML FP32) | 55 | - |

## Image Embeddings

@@ -120,8 +122,8 @@ output. When resize is enabled, expect higher memory usage and inference time
with higher resolutions.
:::

| Model / Device | iPhone 17 Pro [MB] | OnePlus 12 [MB] |
| --------------------------- | :----------------: | :-------------: |
| Model / Device | iPhone 17 Pro [MB] | OnePlus 12 [MB] |
| ---------------------------- | :----------------: | :-------------: |
| DEEPLABV3_RESNET50 (XNNPACK) | 660 | 930 |

## Instance Segmentation
16 changes: 9 additions & 7 deletions docs/docs/02-benchmarks/model-size.md
@@ -89,13 +89,15 @@ title: Model Size

## Text Embeddings

| Model | XNNPACK [MB] |
| -------------------------- | :----------: |
| ALL_MINILM_L6_V2 | 91 |
| ALL_MPNET_BASE_V2 | 438 |
| MULTI_QA_MINILM_L6_COS_V1 | 91 |
| MULTI_QA_MPNET_BASE_DOT_V1 | 438 |
| CLIP_VIT_BASE_PATCH32_TEXT | 254 |
| Model | Size [MB] |
| ------------------------------------------- | :-------: |
| ALL_MINILM_L6_V2 | 91 |
| ALL_MPNET_BASE_V2 | 438 |
| MULTI_QA_MINILM_L6_COS_V1 | 91 |
| MULTI_QA_MPNET_BASE_DOT_V1 | 438 |
| CLIP_VIT_BASE_PATCH32_TEXT | 254 |
| DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W | 393 |
| DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML | 541 |

## Image Embeddings

@@ -101,13 +101,14 @@ function App() {

## Supported models

| Model | Language | Max Tokens | Embedding Dimensions | Description |
| ----------------------------------------------------------------------------------------------------- | :------: | :--------: | :------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | English | 254 | 384 | All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. |
| [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) | English | 382 | 768 | All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. |
| [multi-qa-MiniLM-L6-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1) | English | 509 | 384 | This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs. |
| [multi-qa-mpnet-base-dot-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1) | English | 510 | 768 | This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs. |
| [clip-vit-base-patch32-text](https://huggingface.co/openai/clip-vit-base-patch32) | English | 74 | 512 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. CLIP allows to embed images and text into the same vector space. This allows to find similar images as well as to implement image search. This is the text encoder part of the CLIP model. To embed images checkout [clip-vit-base-patch32-image](../02-computer-vision/useImageEmbeddings.md#supported-models). |
| Model | Language | Max Tokens | Embedding Dimensions | Description |
| ------------------------------------------------------------------------------------------------------------------------- | :-----------: | :--------: | :------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | English | 254 | 384 | All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. |
| [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) | English | 382 | 768 | All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. |
| [multi-qa-MiniLM-L6-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1) | English | 509 | 384 | This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs. |
| [multi-qa-mpnet-base-dot-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1) | English | 510 | 768 | This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs. |
| [distiluse-base-multilingual-cased-v2](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2) | 50+ languages | 126 | 512 | Multilingual DistilBERT with a 768→512 projection head. Recommended when broad language coverage matters more than the peak English-only quality of the MiniLM/MPNet models. |
| [clip-vit-base-patch32-text](https://huggingface.co/openai/clip-vit-base-patch32) | English | 74 | 512 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. CLIP embeds images and text into the same vector space, which makes it possible to find similar images and to implement image search. This is the text encoder part of the CLIP model. To embed images, check out [clip-vit-base-patch32-image](../02-computer-vision/useImageEmbeddings.md#supported-models). |

**`Max Tokens`** - The maximum number of tokens that can be processed by the model. If the input text exceeds this limit, it will be truncated.
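
A hedged usage sketch for the new multilingual entry: embed a query and a passage written in different languages with the same model and compare them with cosine similarity. It assumes `useTextEmbeddings` accepts one of the exported model objects via a `model` prop and that `forward` resolves to an array of numbers; adjust to the actual hook API of your version.

```tsx
import {
  useTextEmbeddings,
  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
} from 'react-native-executorch';

// Cosine similarity; safe whether or not the embeddings come back normalized.
const cosine = (a: ArrayLike<number>, b: ArrayLike<number>) => {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
};

function useCrossLingualSimilarity() {
  const embeddings = useTextEmbeddings({
    model: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
  });

  // English query vs. German passage — both land in the same 512-dim space.
  const compare = async () => {
    const query = await embeddings.forward('How do I reset my password?');
    const passage = await embeddings.forward(
      'Das Passwort lässt sich in den Kontoeinstellungen zurücksetzen.'
    );
    return cosine(query, passage);
  };

  return { isReady: embeddings.isReady, compare };
}
```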

25 changes: 24 additions & 1 deletion packages/react-native-executorch/src/constants/modelUrls.ts
@@ -1,5 +1,5 @@
import { Platform } from 'react-native';
import { URL_PREFIX, VERSION_TAG } from './versions';
import { URL_PREFIX, VERSION_TAG, NEXT_VERSION_TAG } from './versions';

// LLMs

@@ -987,6 +987,9 @@ const MULTI_QA_MINILM_L6_COS_V1_MODEL = `${URL_PREFIX}-multi-qa-MiniLM-L6-cos-v1
const MULTI_QA_MINILM_L6_COS_V1_TOKENIZER = `${URL_PREFIX}-multi-qa-MiniLM-L6-cos-v1/${VERSION_TAG}/tokenizer.json`;
const MULTI_QA_MPNET_BASE_DOT_V1_MODEL = `${URL_PREFIX}-multi-qa-mpnet-base-dot-v1/${VERSION_TAG}/multi-qa-mpnet-base-dot-v1_xnnpack.pte`;
const MULTI_QA_MPNET_BASE_DOT_V1_TOKENIZER = `${URL_PREFIX}-multi-qa-mpnet-base-dot-v1/${VERSION_TAG}/tokenizer.json`;
const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W_MODEL = `${URL_PREFIX}-distiluse-base-multilingual-cased-v2/${NEXT_VERSION_TAG}/xnnpack/distiluse-base-multilingual-cased-v2_xnnpack_8da4w.pte`;
const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML_MODEL = `${URL_PREFIX}-distiluse-base-multilingual-cased-v2/${NEXT_VERSION_TAG}/coreml/distiluse-base-multilingual-cased-v2_coreml_fp32.pte`;
const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_TOKENIZER = `${URL_PREFIX}-distiluse-base-multilingual-cased-v2/${NEXT_VERSION_TAG}/tokenizer.json`;
const CLIP_VIT_BASE_PATCH32_TEXT_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${VERSION_TAG}/xnnpack/clip_vit_base_patch32_text_xnnpack_fp32.pte`;
const CLIP_VIT_BASE_PATCH32_TEXT_TOKENIZER = `${URL_PREFIX}-clip-vit-base-patch32/${VERSION_TAG}/tokenizer.json`;

@@ -1026,6 +1029,24 @@ export const MULTI_QA_MPNET_BASE_DOT_V1 = {
tokenizerSource: MULTI_QA_MPNET_BASE_DOT_V1_TOKENIZER,
} as const;

/**
* @category Models - Text Embeddings
*/
export const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W = {
modelName: 'distiluse-base-multilingual-cased-v2-8da4w',
modelSource: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W_MODEL,
tokenizerSource: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_TOKENIZER,
} as const;

/**
* @category Models - Text Embeddings
*/
export const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML = {
modelName: 'distiluse-base-multilingual-cased-v2-coreml',
modelSource: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML_MODEL,
tokenizerSource: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_TOKENIZER,
} as const;

/**
* @category Models - Text Embeddings
*/
@@ -1175,6 +1196,8 @@ export const MODEL_REGISTRY = {
ALL_MPNET_BASE_V2,
MULTI_QA_MINILM_L6_COS_V1,
MULTI_QA_MPNET_BASE_DOT_V1,
DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
CLIP_VIT_BASE_PATCH32_TEXT,
BK_SDM_TINY_VPRED_512,
BK_SDM_TINY_VPRED_256,
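The two new exports point at different backends: a quantized (8da4w) XNNPACK program that runs on both platforms, and a Core ML FP32 program that only applies on iOS. An app that wants a single symbol for both platforms could select per platform; a small sketch under that assumption, with a hypothetical alias name:

```ts
import { Platform } from 'react-native';
import {
  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
} from 'react-native-executorch';

// Hypothetical convenience alias: prefer the Core ML export on iOS and fall
// back to the portable 8da4w XNNPACK export elsewhere (Core ML is iOS-only).
export const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_DEFAULT = Platform.select({
  ios: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
  default: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
});
```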
2 changes: 2 additions & 0 deletions packages/react-native-executorch/src/types/textEmbeddings.ts
@@ -10,6 +10,8 @@ export type TextEmbeddingsModelName =
| 'all-mpnet-base-v2'
| 'multi-qa-minilm-l6-cos-v1'
| 'multi-qa-mpnet-base-dot-v1'
| 'distiluse-base-multilingual-cased-v2-8da4w'
| 'distiluse-base-multilingual-cased-v2-coreml'
| 'clip-vit-base-patch32-text';

/**