diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt
index f1d55b4c98..d611f986b6 100644
--- a/.cspell-wordlist.txt
+++ b/.cspell-wordlist.txt
@@ -53,6 +53,10 @@ Lexend
 finetuned
 MINILM
 MPNET
+DISTILUSE
+distiluse
+Distil
+torchao
 QINT
 FNUZ
 wordlist
diff --git a/apps/text-embeddings/app/text-embeddings/index.tsx b/apps/text-embeddings/app/text-embeddings/index.tsx
index e31097940c..772c9d39f2 100644
--- a/apps/text-embeddings/app/text-embeddings/index.tsx
+++ b/apps/text-embeddings/app/text-embeddings/index.tsx
@@ -18,6 +18,8 @@ import {
   ALL_MPNET_BASE_V2,
   MULTI_QA_MINILM_L6_COS_V1,
   MULTI_QA_MPNET_BASE_DOT_V1,
+  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
+  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
   TextEmbeddingsProps,
 } from 'react-native-executorch';

@@ -28,6 +30,14 @@ const MODELS: { label: string; value: TextEmbeddingModel }[] = [
   { label: 'MPNet Base', value: ALL_MPNET_BASE_V2 },
   { label: 'MultiQA MiniLM', value: MULTI_QA_MINILM_L6_COS_V1 },
   { label: 'MultiQA MPNet', value: MULTI_QA_MPNET_BASE_DOT_V1 },
+  {
+    label: 'Multilingual DistilUSE (8da4w)',
+    value: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
+  },
+  {
+    label: 'Multilingual DistilUSE (CoreML)',
+    value: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
+  },
 ];
 import { useIsFocused } from '@react-navigation/native';
 import { dotProduct } from '../../utils/math';
diff --git a/docs/docs/02-benchmarks/inference-time.md b/docs/docs/02-benchmarks/inference-time.md
index dbc06dc85f..6f23dd92d8 100644
--- a/docs/docs/02-benchmarks/inference-time.md
+++ b/docs/docs/02-benchmarks/inference-time.md
@@ -150,13 +150,15 @@ Average time to synthesize speech from an input text of approximately 60 tokens,
 Benchmark times for text embeddings are highly dependent on the sentence length. The numbers below are based on a sentence of around 80 tokens. For shorter or longer sentences, inference time may vary accordingly.
 :::

-| Model | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
-| -------------------------- | :--------------------------: | :-----------------------: |
-| ALL_MINILM_L6_V2 | 7 | 21 |
-| ALL_MPNET_BASE_V2 | 24 | 90 |
-| MULTI_QA_MINILM_L6_COS_V1 | 7 | 19 |
-| MULTI_QA_MPNET_BASE_DOT_V1 | 24 | 88 |
-| CLIP_VIT_BASE_PATCH32_TEXT | 14 | 39 |
+| Model / Device | iPhone 17 Pro [ms] | OnePlus 12 [ms] |
+| ---------------------------------------------------- | :----------------: | :-------------: |
+| ALL_MINILM_L6_V2 (XNNPACK) | 7 | 21 |
+| ALL_MPNET_BASE_V2 (XNNPACK) | 24 | 90 |
+| MULTI_QA_MINILM_L6_COS_V1 (XNNPACK) | 7 | 19 |
+| MULTI_QA_MPNET_BASE_DOT_V1 (XNNPACK) | 24 | 88 |
+| CLIP_VIT_BASE_PATCH32_TEXT (XNNPACK) | 14 | 39 |
+| DISTILUSE_BASE_MULTILINGUAL_CASED_V2 (XNNPACK 8da4w) | 16 | 15 |
+| DISTILUSE_BASE_MULTILINGUAL_CASED_V2 (Core ML FP32) | 15 | - |

 ## Image Embeddings
diff --git a/docs/docs/02-benchmarks/memory-usage.md b/docs/docs/02-benchmarks/memory-usage.md
index a225b724d1..88cd19698e 100644
--- a/docs/docs/02-benchmarks/memory-usage.md
+++ b/docs/docs/02-benchmarks/memory-usage.md
@@ -98,13 +98,15 @@ The reported memory usage values include the memory footprint of the Phonemis pa

 ## Text Embeddings

-| Model / Device | iPhone 17 Pro [MB] | OnePlus 12 [MB] |
-| ------------------------------------ | :----------------: | :-------------: |
-| ALL_MINILM_L6_V2 (XNNPACK) | 110 | 95 |
-| ALL_MPNET_BASE_V2 (XNNPACK) | 455 | 405 |
-| MULTI_QA_MINILM_L6_COS_V1 (XNNPACK) | 140 | 120 |
-| MULTI_QA_MPNET_BASE_DOT_V1 (XNNPACK) | 455 | 435 |
-| CLIP_VIT_BASE_PATCH32_TEXT (XNNPACK) | 280 | 200 |
+| Model / Device | iPhone 17 Pro [MB] | OnePlus 12 [MB] |
+| ---------------------------------------------------- | :----------------: | :-------------: |
+| ALL_MINILM_L6_V2 (XNNPACK) | 110 | 95 |
+| ALL_MPNET_BASE_V2 (XNNPACK) | 455 | 405 |
+| MULTI_QA_MINILM_L6_COS_V1 (XNNPACK) | 140 | 120 |
+| MULTI_QA_MPNET_BASE_DOT_V1 (XNNPACK) | 455 | 435 |
+| CLIP_VIT_BASE_PATCH32_TEXT (XNNPACK) | 280 | 200 |
+| DISTILUSE_BASE_MULTILINGUAL_CASED_V2 (XNNPACK 8da4w) | 36 | 44 |
+| DISTILUSE_BASE_MULTILINGUAL_CASED_V2 (Core ML FP32) | 55 | - |

 ## Image Embeddings

@@ -120,8 +122,8 @@ output.
 When resize is enabled, expect higher memory usage and inference time with higher resolutions.
 :::

-| Model / Device | iPhone 17 Pro [MB] | OnePlus 12 [MB] |
-| --------------------------- | :----------------: | :-------------: |
+| Model / Device | iPhone 17 Pro [MB] | OnePlus 12 [MB] |
+| ---------------------------- | :----------------: | :-------------: |
 | DEEPLABV3_RESNET50 (XNNPACK) | 660 | 930 |

 ## Instance Segmentation
diff --git a/docs/docs/02-benchmarks/model-size.md b/docs/docs/02-benchmarks/model-size.md
index f9f5e4701f..14c1777689 100644
--- a/docs/docs/02-benchmarks/model-size.md
+++ b/docs/docs/02-benchmarks/model-size.md
@@ -89,13 +89,15 @@ title: Model Size

 ## Text Embeddings

-| Model | XNNPACK [MB] |
-| -------------------------- | :----------: |
-| ALL_MINILM_L6_V2 | 91 |
-| ALL_MPNET_BASE_V2 | 438 |
-| MULTI_QA_MINILM_L6_COS_V1 | 91 |
-| MULTI_QA_MPNET_BASE_DOT_V1 | 438 |
-| CLIP_VIT_BASE_PATCH32_TEXT | 254 |
+| Model | Size [MB] |
+| ------------------------------------------- | :-------: |
+| ALL_MINILM_L6_V2 | 91 |
+| ALL_MPNET_BASE_V2 | 438 |
+| MULTI_QA_MINILM_L6_COS_V1 | 91 |
+| MULTI_QA_MPNET_BASE_DOT_V1 | 438 |
+| CLIP_VIT_BASE_PATCH32_TEXT | 254 |
+| DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W | 393 |
+| DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML | 541 |

 ## Image Embeddings
diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextEmbeddings.md b/docs/docs/03-hooks/01-natural-language-processing/useTextEmbeddings.md
index 84c8499808..2f92eb6e8f 100644
--- a/docs/docs/03-hooks/01-natural-language-processing/useTextEmbeddings.md
+++ b/docs/docs/03-hooks/01-natural-language-processing/useTextEmbeddings.md
@@ -101,13 +101,14 @@ function App() {

 ## Supported models

-| Model | Language | Max Tokens | Embedding Dimensions | Description |
-| ----------------------------------------------------------------------------------------------------- | :------: | :--------: | :------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | English | 254 | 384 | All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. |
-| [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) | English | 382 | 768 | All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. |
-| [multi-qa-MiniLM-L6-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1) | English | 509 | 384 | This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs. |
-| [multi-qa-mpnet-base-dot-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1) | English | 510 | 768 | This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs. |
-| [clip-vit-base-patch32-text](https://huggingface.co/openai/clip-vit-base-patch32) | English | 74 | 512 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. CLIP allows to embed images and text into the same vector space. This allows to find similar images as well as to implement image search. This is the text encoder part of the CLIP model. To embed images checkout [clip-vit-base-patch32-image](../02-computer-vision/useImageEmbeddings.md#supported-models). |
+| Model | Language | Max Tokens | Embedding Dimensions | Description |
+| ------------------------------------------------------------------------------------------------------------------------- | :-----------: | :--------: | :------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | English | 254 | 384 | All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. |
+| [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) | English | 382 | 768 | All-round model tuned for many use-cases. Trained on a large and diverse dataset of over 1 billion training pairs. |
+| [multi-qa-MiniLM-L6-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1) | English | 509 | 384 | This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs. |
+| [multi-qa-mpnet-base-dot-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-v1) | English | 510 | 768 | This model was tuned for semantic search: Given a query/question, it can find relevant passages. It was trained on a large and diverse set of (question, answer) pairs. |
+| [distiluse-base-multilingual-cased-v2](https://huggingface.co/sentence-transformers/distiluse-base-multilingual-cased-v2) | 50+ languages | 126 | 512 | Multilingual DistilBERT with a dense 768→512 projection head. Recommended when broad language coverage matters more than the English-only quality of MiniLM/MPNet. |
+| [clip-vit-base-patch32-text](https://huggingface.co/openai/clip-vit-base-patch32) | English | 74 | 512 | CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. CLIP embeds images and text into the same vector space, which makes it possible to find similar images and to implement image search. This is the text encoder part of the CLIP model. To embed images, check out [clip-vit-base-patch32-image](../02-computer-vision/useImageEmbeddings.md#supported-models). |

 **`Max Tokens`** - The maximum number of tokens that can be processed by the model. If the input text exceeds this limit, it will be truncated.
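Taken together, the rows above describe the main new capability: cross-lingual similarity in a shared embedding space. Below is a minimal sketch of that use case, assuming the `{ model }` prop and the string-in, vector-out `forward()` API used by the other `useTextEmbeddings` examples in these docs; the local `cosine` helper is used because the distiluse outputs are not documented here as pre-normalized.

```tsx
import { useEffect } from 'react';
import {
  useTextEmbeddings,
  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
} from 'react-native-executorch';

// Local helper: cosine similarity over any numeric vector type.
// Normalizing here avoids assuming the model output is unit-length.
const cosine = (a: ArrayLike<number>, b: ArrayLike<number>): number => {
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
};

function useCrossLingualCheck() {
  const model = useTextEmbeddings({
    model: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
  });

  useEffect(() => {
    if (!model.isReady) return;
    (async () => {
      // The same sentence in English and Spanish should land close
      // together in the shared multilingual embedding space.
      const en = await model.forward('Where is the train station?');
      const es = await model.forward('¿Dónde está la estación de tren?');
      console.log('cross-lingual similarity:', cosine(en, es));
    })();
  }, [model.isReady]);
}
```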
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 92cd95bd7a..2f05fe3a63 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -1,5 +1,5 @@
 import { Platform } from 'react-native';
-import { URL_PREFIX, VERSION_TAG } from './versions';
+import { URL_PREFIX, VERSION_TAG, NEXT_VERSION_TAG } from './versions';

 // LLMs

@@ -987,6 +987,9 @@ const MULTI_QA_MINILM_L6_COS_V1_MODEL = `${URL_PREFIX}-multi-qa-MiniLM-L6-cos-v1
 const MULTI_QA_MINILM_L6_COS_V1_TOKENIZER = `${URL_PREFIX}-multi-qa-MiniLM-L6-cos-v1/${VERSION_TAG}/tokenizer.json`;
 const MULTI_QA_MPNET_BASE_DOT_V1_MODEL = `${URL_PREFIX}-multi-qa-mpnet-base-dot-v1/${VERSION_TAG}/multi-qa-mpnet-base-dot-v1_xnnpack.pte`;
 const MULTI_QA_MPNET_BASE_DOT_V1_TOKENIZER = `${URL_PREFIX}-multi-qa-mpnet-base-dot-v1/${VERSION_TAG}/tokenizer.json`;
+const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W_MODEL = `${URL_PREFIX}-distiluse-base-multilingual-cased-v2/${NEXT_VERSION_TAG}/xnnpack/distiluse-base-multilingual-cased-v2_xnnpack_8da4w.pte`;
+const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML_MODEL = `${URL_PREFIX}-distiluse-base-multilingual-cased-v2/${NEXT_VERSION_TAG}/coreml/distiluse-base-multilingual-cased-v2_coreml_fp32.pte`;
+const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_TOKENIZER = `${URL_PREFIX}-distiluse-base-multilingual-cased-v2/${NEXT_VERSION_TAG}/tokenizer.json`;
 const CLIP_VIT_BASE_PATCH32_TEXT_MODEL = `${URL_PREFIX}-clip-vit-base-patch32/${VERSION_TAG}/xnnpack/clip_vit_base_patch32_text_xnnpack_fp32.pte`;
 const CLIP_VIT_BASE_PATCH32_TEXT_TOKENIZER = `${URL_PREFIX}-clip-vit-base-patch32/${VERSION_TAG}/tokenizer.json`;

@@ -1026,6 +1029,24 @@ export const MULTI_QA_MPNET_BASE_DOT_V1 = {
   tokenizerSource: MULTI_QA_MPNET_BASE_DOT_V1_TOKENIZER,
 } as const;

+/**
+ * @category Models - Text Embeddings
+ */
+export const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W = {
+  modelName: 'distiluse-base-multilingual-cased-v2-8da4w',
+  modelSource: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W_MODEL,
+  tokenizerSource: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Text Embeddings
+ */
+export const DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML = {
+  modelName: 'distiluse-base-multilingual-cased-v2-coreml',
+  modelSource: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML_MODEL,
+  tokenizerSource: DISTILUSE_BASE_MULTILINGUAL_CASED_V2_TOKENIZER,
+} as const;
+
 /**
  * @category Models - Text Embeddings
  */
@@ -1175,6 +1196,8 @@ export const MODEL_REGISTRY = {
   ALL_MPNET_BASE_V2,
   MULTI_QA_MINILM_L6_COS_V1,
   MULTI_QA_MPNET_BASE_DOT_V1,
+  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
+  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
   CLIP_VIT_BASE_PATCH32_TEXT,
   BK_SDM_TINY_VPRED_512,
   BK_SDM_TINY_VPRED_256,
diff --git a/packages/react-native-executorch/src/types/textEmbeddings.ts b/packages/react-native-executorch/src/types/textEmbeddings.ts
index 87b5d6375f..45f636c8e5 100644
--- a/packages/react-native-executorch/src/types/textEmbeddings.ts
+++ b/packages/react-native-executorch/src/types/textEmbeddings.ts
@@ -10,6 +10,8 @@ export type TextEmbeddingsModelName =
   | 'all-mpnet-base-v2'
   | 'multi-qa-minilm-l6-cos-v1'
   | 'multi-qa-mpnet-base-dot-v1'
+  | 'distiluse-base-multilingual-cased-v2-8da4w'
+  | 'distiluse-base-multilingual-cased-v2-coreml'
   | 'clip-vit-base-patch32-text';

 /**
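One point the diff leaves to the caller: the two new exports are not equally portable. The Core ML `.pte` targets iOS only (hence the `-` entries in the OnePlus 12 columns above), while the XNNPACK 8da4w export (8-bit dynamic activations, 4-bit weights, via torchao) runs on both platforms. A minimal app-side selection sketch follows, assuming only the exports added above plus React Native's `Platform`; the `DEFAULT_DISTILUSE` name is hypothetical and not part of this change.

```ts
import { Platform } from 'react-native';
import {
  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W,
  DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML,
} from 'react-native-executorch';

// Hypothetical app-side default: prefer the FP32 Core ML export on iOS,
// fall back to the quantized XNNPACK export everywhere else. Per the
// benchmark tables in this diff, 8da4w is the smaller download
// (393 MB vs 541 MB) and has the lower runtime footprint (~36-44 MB).
export const DEFAULT_DISTILUSE =
  Platform.OS === 'ios'
    ? DISTILUSE_BASE_MULTILINGUAL_CASED_V2_COREML
    : DISTILUSE_BASE_MULTILINGUAL_CASED_V2_8DA4W;
```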