Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions graphrag/config/models/vector_store_schema_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,6 @@ class VectorStoreSchemaConfig(BaseModel):
default="vector",
)

text_field: str = Field(
description="The text field to use.",
default="text",
)

attributes_field: str = Field(
description="The attributes field to use.",
default="attributes",
)

vector_size: int = Field(
description="The vector size to use.",
default=DEFAULT_VECTOR_SIZE,
Expand All @@ -52,8 +42,6 @@ def _validate_schema(self) -> None:
for field in [
self.id_field,
self.vector_field,
self.text_field,
self.attributes_field,
]:
if not is_valid_field_name(field):
msg = f"Unsafe or invalid field name: {field}"
Expand Down
7 changes: 1 addition & 6 deletions graphrag/index/operations/embed_text/embed_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,6 @@ async def _text_embed_with_vector_store(
)
batch = input.iloc[insert_batch_size * i : insert_batch_size * (i + 1)]
texts: list[str] = batch[embed_column].tolist()
titles: list[str] = batch[title].tolist()
ids: list[str] = batch[id_column].tolist()
result = await strategy_exec(texts, callbacks, cache, strategy_config)
if result.embeddings:
Expand All @@ -163,16 +162,12 @@ async def _text_embed_with_vector_store(

vectors = result.embeddings or []
documents: list[VectorStoreDocument] = []
- for doc_id, doc_text, doc_title, doc_vector in zip(
- ids, texts, titles, vectors, strict=True
- ):
+ for doc_id, doc_vector in zip(ids, vectors, strict=True):
if type(doc_vector) is np.ndarray:
doc_vector = doc_vector.tolist()
document = VectorStoreDocument(
id=doc_id,
text=doc_text,
vector=doc_vector,
attributes={"title": doc_title},
)
documents.append(document)

Expand Down
15 changes: 8 additions & 7 deletions graphrag/query/structured_search/basic_search/basic_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,15 @@ def build_context(
text_embedder=lambda t: self.text_embedder.embed(t),
k=k,
)
- related_text_list = [
- {
- text_id_col: self.text_id_map[f"{chunk.document.id}"],
- text_col: chunk.document.text,
- }
- for chunk in related_texts
+
+ text_unit_ids = {t.document.id for t in related_texts}
+ text_units_filtered = []
+ text_units_filtered = [
+ {text_id_col: t.id, text_col: t.text}
+ for t in self.text_units or []
+ if t.id in text_unit_ids
]
- related_text_df = pd.DataFrame(related_text_list)
+ related_text_df = pd.DataFrame(text_units_filtered)
else:
related_text_df = pd.DataFrame({
text_id_col: [],
Expand Down
15 changes: 0 additions & 15 deletions graphrag/vector_stores/azure_ai_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

"""A package containing the Azure AI Search vector store implementation."""

import json
from typing import Any

from azure.core.credentials import AzureKeyCredential
Expand All @@ -13,7 +12,6 @@
from azure.search.documents.indexes.models import (
HnswAlgorithmConfiguration,
HnswParameters,
SearchableField,
SearchField,
SearchFieldDataType,
SearchIndex,
Expand Down Expand Up @@ -121,13 +119,6 @@ def load_documents(
vector_search_dimensions=self.vector_size,
vector_search_profile_name=self.vector_search_profile_name,
),
SearchableField(
name=self.text_field, type=SearchFieldDataType.String
),
SimpleField(
name=self.attributes_field,
type=SearchFieldDataType.String,
),
],
vector_search=vector_search,
)
Expand All @@ -139,8 +130,6 @@ def load_documents(
{
self.id_field: doc.id,
self.vector_field: doc.vector,
self.text_field: doc.text,
self.attributes_field: json.dumps(doc.attributes),
}
for doc in documents
if doc.vector is not None
Expand All @@ -165,9 +154,7 @@ def similarity_search_by_vector(
VectorStoreSearchResult(
document=VectorStoreDocument(
id=doc.get(self.id_field, ""),
text=doc.get(self.text_field, ""),
vector=doc.get(self.vector_field, []),
attributes=(json.loads(doc.get(self.attributes_field, "{}"))),
),
# Cosine similarity between 0.333 and 1.000
# https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#scores-in-a-hybrid-search-results
Expand All @@ -192,7 +179,5 @@ def search_by_id(self, id: str) -> VectorStoreDocument:
response = self.db_connection.get_document(id)
return VectorStoreDocument(
id=response.get(self.id_field, ""),
text=response.get(self.text_field, ""),
vector=response.get(self.vector_field, []),
attributes=(json.loads(response.get(self.attributes_field, "{}"))),
)
8 changes: 1 addition & 7 deletions graphrag/vector_stores/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""Base classes for vector stores."""

from abc import ABC, abstractmethod
- from dataclasses import dataclass, field
+ from dataclasses import dataclass
from typing import Any

from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
Expand All @@ -18,12 +18,8 @@ class VectorStoreDocument:
id: str | int
"""unique id for the document"""

text: str | None
vector: list[float] | None

attributes: dict[str, Any] = field(default_factory=dict)
"""store any additional metadata, e.g. title, date ranges, etc"""


@dataclass
class VectorStoreSearchResult:
Expand Down Expand Up @@ -54,9 +50,7 @@ def __init__(

self.index_name = vector_store_schema_config.index_name
self.id_field = vector_store_schema_config.id_field
self.text_field = vector_store_schema_config.text_field
self.vector_field = vector_store_schema_config.vector_field
self.attributes_field = vector_store_schema_config.attributes_field
self.vector_size = vector_store_schema_config.vector_size

@abstractmethod
Expand Down
11 changes: 2 additions & 9 deletions graphrag/vector_stores/cosmosdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

"""A package containing the CosmosDB vector store implementation."""

import json
from typing import Any

from azure.cosmos import ContainerProxy, CosmosClient, DatabaseProxy
Expand Down Expand Up @@ -171,8 +170,6 @@ def load_documents(
doc_json = {
self.id_field: doc.id,
self.vector_field: doc.vector,
self.text_field: doc.text,
self.attributes_field: json.dumps(doc.attributes),
}
print("Storing document in CosmosDB:") # noqa: T201
print(doc_json) # noqa: T201
Expand All @@ -187,7 +184,7 @@ def similarity_search_by_vector(
raise ValueError(msg)

try:
- query = f"SELECT TOP {k} c.{self.id_field}, c.{self.text_field}, c.{self.vector_field}, c.{self.attributes_field}, VectorDistance(c.{self.vector_field}, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.{self.vector_field}, @embedding)" # noqa: S608
+ query = f"SELECT TOP {k} c.{self.id_field}, c.{self.vector_field}, VectorDistance(c.{self.vector_field}, @embedding) AS SimilarityScore FROM c ORDER BY VectorDistance(c.{self.vector_field}, @embedding)" # noqa: S608
query_params = [{"name": "@embedding", "value": query_embedding}]
items = list(
self._container_client.query_items(
Expand All @@ -199,7 +196,7 @@ def similarity_search_by_vector(
except (CosmosHttpResponseError, ValueError):
# Currently, the CosmosDB emulator does not support the VectorDistance function.
# For emulator or test environments - fetch all items and calculate distance locally
- query = f"SELECT c.{self.id_field}, c.{self.text_field}, c.{self.vector_field}, c.{self.attributes_field} FROM c" # noqa: S608
+ query = f"SELECT c.{self.id_field}, c.{self.vector_field} FROM c" # noqa: S608
items = list(
self._container_client.query_items(
query=query,
Expand Down Expand Up @@ -231,9 +228,7 @@ def cosine_similarity(a, b):
VectorStoreSearchResult(
document=VectorStoreDocument(
id=item.get(self.id_field, ""),
text=item.get(self.text_field, ""),
vector=item.get(self.vector_field, []),
attributes=(json.loads(item.get(self.attributes_field, "{}"))),
),
score=item.get("SimilarityScore", 0.0),
)
Expand Down Expand Up @@ -261,8 +256,6 @@ def search_by_id(self, id: str) -> VectorStoreDocument:
return VectorStoreDocument(
id=item.get(self.id_field, ""),
vector=item.get(self.vector_field, []),
text=item.get(self.text_field, ""),
attributes=(json.loads(item.get(self.attributes_field, "{}"))),
)

def clear(self) -> None:
Expand Down
20 changes: 5 additions & 15 deletions graphrag/vector_stores/lancedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@

"""The LanceDB vector storage implementation package."""

import json # noqa: I001
from typing import Any
import pyarrow as pa

import lancedb
import numpy as np
import pyarrow as pa

from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig
from graphrag.data_model.types import TextEmbedder

from graphrag.vector_stores.base import (
BaseVectorStore,
VectorStoreDocument,
VectorStoreSearchResult,
)
import lancedb


class LanceDBVectorStore(BaseVectorStore):
Expand All @@ -41,19 +41,15 @@ def load_documents(
"""Load documents into vector storage."""
# Step 1: Prepare data columns manually
ids = []
texts = []
vectors = []
attributes = []

for document in documents:
self.vector_size = (
len(document.vector) if document.vector else self.vector_size
)
if document.vector is not None and len(document.vector) == self.vector_size:
ids.append(document.id)
texts.append(document.text)
vectors.append(np.array(document.vector, dtype=np.float32))
attributes.append(json.dumps(document.attributes))

# Step 2: Handle empty case
if len(ids) == 0:
Expand All @@ -69,9 +65,7 @@ def load_documents(
# Step 4: Create PyArrow table (let schema be inferred)
data = pa.table({
self.id_field: pa.array(ids, type=pa.string()),
self.text_field: pa.array(texts, type=pa.string()),
self.vector_field: vector_column,
self.attributes_field: pa.array(attributes, type=pa.string()),
})

# NOTE: If modifying the next section of code, ensure that the schema remains the same.
Expand Down Expand Up @@ -127,9 +121,7 @@ def similarity_search_by_vector(
VectorStoreSearchResult(
document=VectorStoreDocument(
id=doc[self.id_field],
text=doc[self.text_field],
vector=doc[self.vector_field],
attributes=json.loads(doc[self.attributes_field]),
),
score=1 - abs(float(doc["_distance"])),
)
Expand All @@ -155,8 +147,6 @@ def search_by_id(self, id: str) -> VectorStoreDocument:
if doc:
return VectorStoreDocument(
id=doc[0][self.id_field],
text=doc[0][self.text_field],
vector=doc[0][self.vector_field],
attributes=json.loads(doc[0][self.attributes_field]),
)
- return VectorStoreDocument(id=id, text=None, vector=None)
+ return VectorStoreDocument(id=id, vector=None)
22 changes: 0 additions & 22 deletions tests/integration/vector_stores/test_azure_ai_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,6 @@ def vector_store_custom(self, mock_search_client, mock_index_client):
vector_store_schema_config=VectorStoreSchemaConfig(
index_name="test_vectors",
id_field="id_custom",
text_field="text_custom",
attributes_field="attributes_custom",
vector_field="vector_custom",
vector_size=5,
),
Expand All @@ -86,15 +84,11 @@ def sample_documents(self):
return [
VectorStoreDocument(
id="doc1",
text="This is document 1",
vector=[0.1, 0.2, 0.3, 0.4, 0.5],
attributes={"title": "Doc 1", "category": "test"},
),
VectorStoreDocument(
id="doc2",
text="This is document 2",
vector=[0.2, 0.3, 0.4, 0.5, 0.6],
attributes={"title": "Doc 2", "category": "test"},
),
]

Expand All @@ -110,26 +104,20 @@ async def test_vector_store_operations(
search_results = [
{
"id": "doc1",
"text": "This is document 1",
"vector": [0.1, 0.2, 0.3, 0.4, 0.5],
"attributes": '{"title": "Doc 1", "category": "test"}',
"@search.score": 0.9,
},
{
"id": "doc2",
"text": "This is document 2",
"vector": [0.2, 0.3, 0.4, 0.5, 0.6],
"attributes": '{"title": "Doc 2", "category": "test"}',
"@search.score": 0.8,
},
]
mock_search_client.search.return_value = search_results

mock_search_client.get_document.return_value = {
"id": "doc1",
"text": "This is document 1",
"vector": [0.1, 0.2, 0.3, 0.4, 0.5],
"attributes": '{"title": "Doc 1", "category": "test"}',
}

vector_store.load_documents(sample_documents)
Expand All @@ -154,8 +142,6 @@ def mock_embedder(text: str) -> list[float]:

doc = vector_store.search_by_id("doc1")
assert doc.id == "doc1"
assert doc.text == "This is document 1"
assert doc.attributes["title"] == "Doc 1"

async def test_empty_embedding(self, vector_store, mock_search_client):
"""Test similarity search by text with empty embedding."""
Expand Down Expand Up @@ -186,26 +172,20 @@ async def test_vector_store_customization(
search_results = [
{
vector_store_custom.id_field: "doc1",
vector_store_custom.text_field: "This is document 1",
vector_store_custom.vector_field: [0.1, 0.2, 0.3, 0.4, 0.5],
vector_store_custom.attributes_field: '{"title": "Doc 1", "category": "test"}',
"@search.score": 0.9,
},
{
vector_store_custom.id_field: "doc2",
vector_store_custom.text_field: "This is document 2",
vector_store_custom.vector_field: [0.2, 0.3, 0.4, 0.5, 0.6],
vector_store_custom.attributes_field: '{"title": "Doc 2", "category": "test"}',
"@search.score": 0.8,
},
]
mock_search_client.search.return_value = search_results

mock_search_client.get_document.return_value = {
vector_store_custom.id_field: "doc1",
vector_store_custom.text_field: "This is document 1",
vector_store_custom.vector_field: [0.1, 0.2, 0.3, 0.4, 0.5],
vector_store_custom.attributes_field: '{"title": "Doc 1", "category": "test"}',
}

vector_store_custom.load_documents(sample_documents)
Expand All @@ -230,5 +210,3 @@ def mock_embedder(text: str) -> list[float]:

doc = vector_store_custom.search_by_id("doc1")
assert doc.id == "doc1"
assert doc.text == "This is document 1"
assert doc.attributes["title"] == "Doc 1"
Loading
Loading