diff --git a/langchain-coordinode/langchain_coordinode/graph.py b/langchain-coordinode/langchain_coordinode/graph.py
index 8a19ff6..cc0f2e7 100644
--- a/langchain-coordinode/langchain_coordinode/graph.py
+++ b/langchain-coordinode/langchain_coordinode/graph.py
@@ -5,6 +5,7 @@
 import hashlib
 import json
 import re
+from collections.abc import Sequence
 from typing import Any
 
 from langchain_community.graphs.graph_store import GraphStore
@@ -194,6 +195,45 @@ def query(
         # cypher() returns List[Dict[str, Any]] directly — column name → value.
         return self._client.cypher(query, params=params or {})
 
+    def similarity_search(
+        self,
+        query_vector: Sequence[float],
+        k: int = 5,
+        label: str = "Chunk",
+        property: str = "embedding",
+    ) -> list[dict[str, Any]]:
+        """Find nodes whose ``property`` vector is closest to ``query_vector``.
+
+        Wraps ``CoordinodeClient.vector_search()``. The returned list contains
+        one dict per result with the keys ``node`` (node properties), ``id``
+        (internal integer node ID), and ``distance`` (cosine distance, lower =
+        more similar).
+
+        Args:
+            query_vector: Embedding vector to search for.
+            k: Maximum number of results to return.
+            label: Node label to search (default ``"Chunk"``).
+            property: Embedding property name (default ``"embedding"``).
+
+        Returns:
+            List of result dicts sorted by ascending distance.
+        """
+        # Use len() instead of truthiness check: numpy.ndarray (and other Sequence
+        # types) raise ValueError("The truth value of an array is ambiguous") when
+        # used in a boolean context. len() == 0 works for all sequence types.
+        if len(query_vector) == 0:
+            return []
+        results = sorted(
+            self._client.vector_search(
+                label=label,
+                property=property,
+                vector=query_vector,
+                top_k=k,
+            ),
+            key=lambda r: r.distance,
+        )
+        return [{"id": r.node.id, "node": r.node.properties, "distance": r.distance} for r in results]
+
     # ── Lifecycle ─────────────────────────────────────────────────────────
 
     def close(self) -> None:
diff --git a/tests/integration/adapters/test_langchain.py b/tests/integration/adapters/test_langchain.py
index 799e831..1ac62a1 100644
--- a/tests/integration/adapters/test_langchain.py
+++ b/tests/integration/adapters/test_langchain.py
@@ -133,6 +133,48 @@ def test_add_graph_documents_idempotent(graph, unique_tag):
     assert result[0]["cnt"] == 1
 
 
+# ── similarity_search ─────────────────────────────────────────────────────────
+
+
+def test_similarity_search_returns_results(graph, unique_tag):
+    """similarity_search() returns node dicts with id, node, and distance keys.
+
+    Seeds a :LCSim node with a known embedding, then searches for the closest
+    vector. The seeded node must appear in the top-k results.
+    """
+    # Derive a unique embedding from the test tag (same technique as llama-index
+    # test) to avoid collisions with other :LCSim nodes in the shared DB.
+    seed = list(bytes.fromhex(unique_tag))
+    vec = [float(seed[i % len(seed)]) / 255.0 for i in range(16)]
+
+    try:
+        seed_rows = graph.query(
+            "CREATE (n:LCSim {id: $id, embedding: $vec}) RETURN n AS nid",
+            params={"id": f"lcsim-{unique_tag}", "vec": vec},
+        )
+        # graph.query() wraps CoordinodeClient.cypher() which returns raw dict values.
+        # CoordiNode: CREATE ... RETURN n yields the internal integer node ID directly
+        # (NOT a node object). similarity_search() also returns {"id": r.node.id, ...}
+        # where r.node.id is the same integer. Direct equality comparison is correct.
+        seeded_internal_id = seed_rows[0]["nid"]
+
+        results = graph.similarity_search(vec, k=5, label="LCSim", property="embedding")
+
+        assert isinstance(results, list)
+        assert len(results) >= 1
+        assert all("id" in r and "node" in r and "distance" in r for r in results)
+        assert any(r["id"] == seeded_internal_id for r in results)
+        assert all(results[i]["distance"] <= results[i + 1]["distance"] for i in range(len(results) - 1))
+    finally:
+        graph.query("MATCH (n:LCSim {id: $id}) DELETE n", params={"id": f"lcsim-{unique_tag}"})
+
+
+def test_similarity_search_empty_vector_returns_empty(graph):
+    """similarity_search() with an empty vector list returns an empty list without error."""
+    results = graph.similarity_search([], k=5)
+    assert results == []
+
+
 def test_schema_refreshes_after_add(graph, unique_tag):
     """structured_schema is invalidated and re-fetched after add_graph_documents."""
     graph._schema = None  # force refresh
diff --git a/tests/integration/adapters/test_llama_index.py b/tests/integration/adapters/test_llama_index.py
index 971a2f2..140d106 100644
--- a/tests/integration/adapters/test_llama_index.py
+++ b/tests/integration/adapters/test_llama_index.py
@@ -72,6 +72,23 @@ def test_upsert_nodes_idempotent(store, tag):
     assert len(found) == 1
 
 
+def test_upsert_relations_idempotent(store, tag):
+    """Upserting the same relation twice must produce exactly one edge (MERGE idempotent)."""
+    src = EntityNode(label="LIIdempRel", name=f"IdempSrc-{tag}")
+    dst = EntityNode(label="LIIdempRel", name=f"IdempDst-{tag}")
+    store.upsert_nodes([src, dst])
+
+    rel = Relation(label="LI_IDEMP_REL", source_id=src.id, target_id=dst.id)
+    store.upsert_relations([rel])
+    store.upsert_relations([rel])  # second call must not duplicate
+
+    rows = store.structured_query(
+        "MATCH (a {id: $src})-[r:LI_IDEMP_REL]->(b {id: $dst}) RETURN count(r) AS cnt",
+        param_map={"src": src.id, "dst": dst.id},
+    )
+    assert rows[0]["cnt"] == 1, f"expected exactly 1 edge after double upsert, got: {rows}"
+
+
 def test_get_by_id(store, tag):
     node = EntityNode(label="LIGetById", name=f"ById-{tag}")
     node_id = node.id