From fe9b704e41b22d899fc04a4ba6ec9b556358946c Mon Sep 17 00:00:00 2001 From: Klaus Hueck Date: Tue, 11 Jun 2024 13:39:00 +0200 Subject: [PATCH 1/2] feat: add support for SOTA german embedding model with long context length jinaai/jina-embeddings-v2-base-de --- fastembed/text/jina_onnx_embedding.py | 8 ++++++++ tests/test_text_onnx_embeddings.py | 1 + 2 files changed, 9 insertions(+) diff --git a/fastembed/text/jina_onnx_embedding.py b/fastembed/text/jina_onnx_embedding.py index a5a0806d2..0b50696e9 100644 --- a/fastembed/text/jina_onnx_embedding.py +++ b/fastembed/text/jina_onnx_embedding.py @@ -24,6 +24,14 @@ "sources": {"hf": "xenova/jina-embeddings-v2-small-en"}, "model_file": "onnx/model.onnx", }, + { + "model": "jinaai/jina-embeddings-v2-base-de", + "dim": 768, + "description": "German embedding model supporting 8192 sequence length", + "size_in_GB": 0.16, + "sources": {"hf": "jinaai/jina-embeddings-v2-base-de"}, + "model_file": "onnx/model_fp16.onnx", + }, ] diff --git a/tests/test_text_onnx_embeddings.py b/tests/test_text_onnx_embeddings.py index 31751c96b..6a420b6c4 100644 --- a/tests/test_text_onnx_embeddings.py +++ b/tests/test_text_onnx_embeddings.py @@ -36,6 +36,7 @@ ), "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]), "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]), + "jinaai/jina-embeddings-v2-base-de": np.array([-0.0085, 0.0417, 0.0342, 0.0309, -0.0149]), "nomic-ai/nomic-embed-text-v1": np.array([0.0061, 0.0103, -0.0296, -0.0242, -0.0170]), "nomic-ai/nomic-embed-text-v1.5": np.array( [-1.6531514e-02, 8.5380634e-05, -1.8171231e-01, -3.9333291e-03, 1.2763254e-02] From e6d2ca12a49d74023921ae8c78a96bb7c9813cce Mon Sep 17 00:00:00 2001 From: George Date: Fri, 14 Jun 2024 13:25:12 +0200 Subject: [PATCH 2/2] Fix jina de model weight --- fastembed/text/jina_onnx_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastembed/text/jina_onnx_embedding.py b/fastembed/text/jina_onnx_embedding.py index 0b50696e9..4237847c4 100644 --- a/fastembed/text/jina_onnx_embedding.py +++ b/fastembed/text/jina_onnx_embedding.py @@ -28,7 +28,7 @@ "model": "jinaai/jina-embeddings-v2-base-de", "dim": 768, "description": "German embedding model supporting 8192 sequence length", - "size_in_GB": 0.16, + "size_in_GB": 0.32, "sources": {"hf": "jinaai/jina-embeddings-v2-base-de"}, "model_file": "onnx/model_fp16.onnx", },