From 6d009a714a90c3a915deb84d618a15ea35830a20 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 12 Apr 2026 12:23:55 +0300 Subject: [PATCH 1/7] feat: use MERGE for edges, wildcard patterns, type()/labels() functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CoordiNode v0.3.1+ supports wildcard [r] in MATCH patterns, type(r), labels(n), and MERGE for relationship patterns. Update adapters to use these instead of the old workarounds. LangChain adapter: - refresh_schema(): wildcard MATCH (a)-[r]->(b) with labels()/type() - _create_edge(): CREATE → MERGE (idempotent edge upsert) - _link_document_to_entities(): CREATE → MERGE for MENTIONS edges LlamaIndex adapter: - get_triplets(): wildcard [r] pattern; type(r) instead of r.__type__ - get_rel_map(): wildcard [r]; ignore_rels pushed into Cypher WHERE so LIMIT applies only to non-ignored edges - upsert_relations(): CREATE → MERGE (idempotent) - Remove _parse_edge_types_from_schema() (no longer needed) Tests: - count(*) → count(r) for relationship counting - cnt >= 1 → cnt == 1 for idempotent edge assertions - get_triplets() test uses wildcard (no relation_names) Closes #24 --- .gitignore | 3 +- .../langchain_coordinode/graph.py | 61 +++++++------ .../graph_stores/coordinode/base.py | 86 ++++--------------- tests/integration/adapters/test_langchain.py | 18 ++-- .../integration/adapters/test_llama_index.py | 4 +- 5 files changed, 57 insertions(+), 115 deletions(-) diff --git a/.gitignore b/.gitignore index f19cdd2..8f8786e 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,8 @@ venv/ env/ # Version files generated by hatch-vcs -coordinode/coordinode/_version.py +coordinode/_version.py langchain-coordinode/langchain_coordinode/_version.py llama-index-coordinode/llama_index/graph_stores/coordinode/_version.py GAPS.md +CLAUDE.md diff --git a/langchain-coordinode/langchain_coordinode/graph.py b/langchain-coordinode/langchain_coordinode/graph.py index 6814c95..2c97d26 
100644 --- a/langchain-coordinode/langchain_coordinode/graph.py +++ b/langchain-coordinode/langchain_coordinode/graph.py @@ -72,19 +72,18 @@ def refresh_schema(self) -> None: structured = _parse_schema(text) # Augment with relationship triples (start_label, type, end_label) via # Cypher — get_schema_text() only lists edge types without direction. - # CoordiNode: wildcard [r] returns no results; build typed pattern from - # the rel_props keys returned by _parse_schema(). - rel_types = list(structured.get("rel_props", {}).keys()) - if rel_types: - rel_filter = "|".join(_cypher_ident(t) for t in rel_types) - rows = self._client.cypher( - f"MATCH (a)-[r:{rel_filter}]->(b) " - "RETURN DISTINCT a.__label__ AS src, r.__type__ AS rel, b.__label__ AS dst" - ) + rows = self._client.cypher( + "MATCH (a)-[r]->(b) RETURN DISTINCT labels(a) AS src_labels, type(r) AS rel, labels(b) AS dst_labels" + ) + if rows: structured["relationships"] = [ - {"start": row["src"], "type": row["rel"], "end": row["dst"]} + { + "start": _first_label(row.get("src_labels")), + "type": row["rel"], + "end": _first_label(row.get("dst_labels")), + } for row in rows - if row.get("src") and row.get("rel") and row.get("dst") + if _first_label(row.get("src_labels")) and row.get("rel") and _first_label(row.get("dst_labels")) ] self._structured_schema = structured @@ -95,18 +94,14 @@ def add_graph_documents( ) -> None: """Store nodes and relationships extracted from ``GraphDocument`` objects. - Nodes are upserted by ``id`` (used as the ``name`` property) via - ``MERGE``, so repeated calls are safe for nodes. - - Relationships are created with unconditional ``CREATE`` because - CoordiNode does not yet support ``MERGE`` for edge patterns. Re-ingesting - the same ``GraphDocument`` will therefore produce duplicate edges. + Both nodes and relationships are upserted via ``MERGE``, so repeated + calls with the same data are idempotent. 
Args: graph_documents: List of ``langchain_community.graphs.graph_document.GraphDocument``. include_source: If ``True``, also store the source ``Document`` as a ``__Document__`` node linked to every extracted entity via - ``MENTIONS`` edges (also unconditional ``CREATE``). + ``MENTIONS`` edges. """ for doc in graph_documents: for node in doc.nodes: @@ -133,12 +128,10 @@ def _upsert_node(self, node: Any) -> None: ) def _create_edge(self, rel: Any) -> None: - """Create a relationship via unconditional CREATE. + """Upsert a relationship via MERGE (idempotent). - CoordiNode does not support MERGE for edge patterns. Re-ingesting the - same relationship will create a duplicate edge. SET r += $props is - skipped when props is empty because SET r += {} is not supported by all - server versions. + SET r += $props is skipped when props is empty because + SET r += {} is not supported by all server versions. """ src_label = _cypher_ident(rel.source.type or "Entity") dst_label = _cypher_ident(rel.target.type or "Entity") @@ -148,19 +141,19 @@ def _create_edge(self, rel: Any) -> None: self._client.cypher( f"MATCH (src:{src_label} {{name: $src}}) " f"MATCH (dst:{dst_label} {{name: $dst}}) " - f"CREATE (src)-[r:{rel_type}]->(dst) SET r += $props", + f"MERGE (src)-[r:{rel_type}]->(dst) SET r += $props", params={"src": rel.source.id, "dst": rel.target.id, "props": props}, ) else: self._client.cypher( f"MATCH (src:{src_label} {{name: $src}}) " f"MATCH (dst:{dst_label} {{name: $dst}}) " - f"CREATE (src)-[r:{rel_type}]->(dst)", + f"MERGE (src)-[r:{rel_type}]->(dst)", params={"src": rel.source.id, "dst": rel.target.id}, ) def _link_document_to_entities(self, doc: Any) -> None: - """Upsert a ``__Document__`` node and CREATE ``MENTIONS`` edges to all entities.""" + """Upsert a ``__Document__`` node and MERGE ``MENTIONS`` edges to all entities.""" src_id = getattr(doc.source, "id", None) or _stable_document_id(doc.source) self._client.cypher( "MERGE (d:__Document__ {id: $id}) SET 
d.page_content = $text", @@ -169,7 +162,7 @@ def _link_document_to_entities(self, doc: Any) -> None: for node in doc.nodes: label = _cypher_ident(node.type or "Entity") self._client.cypher( - f"MATCH (d:__Document__ {{id: $doc_id}}) MATCH (n:{label} {{name: $name}}) CREATE (d)-[:MENTIONS]->(n)", + f"MATCH (d:__Document__ {{id: $doc_id}}) MATCH (n:{label} {{name: $name}}) MERGE (d)-[:MENTIONS]->(n)", params={"doc_id": src_id, "name": node.id}, ) @@ -211,10 +204,7 @@ def _stable_document_id(source: Any) -> str: Combines ``page_content`` and sorted ``metadata`` items so the same document produces the same ``__Document__`` node ID across different - Python processes. This makes document-node creation stable when - ``include_source=True`` is used, but does not make re-ingest fully - idempotent because ``MENTIONS`` edges are not deduplicated until edge - ``MERGE``/dedup support is added to CoordiNode. + Python processes. """ content = getattr(source, "page_content", "") or "" metadata = getattr(source, "metadata", {}) or {} @@ -232,6 +222,15 @@ def _stable_document_id(source: Any) -> str: return hashlib.sha256(canonical.encode()).hexdigest()[:32] +def _first_label(labels: Any) -> str | None: + """Extract the first label from a labels() result (list of strings).""" + if isinstance(labels, list) and labels: + return str(labels[0]) + if isinstance(labels, str): + return labels + return None + + def _cypher_ident(name: str) -> str: """Escape a label/type name for use as a Cypher identifier.""" # ASCII-only word characters: letter/digit/underscore, not starting with digit. 
diff --git a/llama-index-coordinode/llama_index/graph_stores/coordinode/base.py b/llama-index-coordinode/llama_index/graph_stores/coordinode/base.py index 2d43788..1a6b3e1 100644 --- a/llama-index-coordinode/llama_index/graph_stores/coordinode/base.py +++ b/llama-index-coordinode/llama_index/graph_stores/coordinode/base.py @@ -117,13 +117,7 @@ def get_triplets( properties: dict[str, Any] | None = None, ids: list[str] | None = None, ) -> list[list[LabelledNode]]: - """Retrieve triplets (subject, predicate, object) as node triples. - - Note: - ``relation_names`` is **required**. CoordiNode does not support - untyped wildcard ``[r]`` relationship patterns — they silently return - no rows. Omitting ``relation_names`` raises ``NotImplementedError``. - """ + """Retrieve triplets (subject, predicate, object) as node triples.""" conditions: list[str] = [] params: dict[str, Any] = {} @@ -133,22 +127,15 @@ def get_triplets( conditions.append("(n.name IN $entity_names OR m.name IN $entity_names)") params["entity_names"] = entity_names if relation_names: - # Escape each type name to prevent Cypher injection rel_filter = "|".join(_cypher_ident(t) for t in relation_names) rel_pattern = f"[r:{rel_filter}]" else: - # CoordiNode: wildcard [r] pattern returns no results. - # Callers must supply relation_names for the query to work. - raise NotImplementedError( - "CoordinodePropertyGraphStore.get_triplets() requires relation_names — " - "CoordiNode does not support untyped wildcard [r] patterns" - ) + rel_pattern = "[r]" where = f"WHERE {' AND '.join(conditions)}" if conditions else "" - # CoordiNode: use r.__type__ instead of type(r) — type() returns null. 
cypher = ( f"MATCH (n)-{rel_pattern}->(m) {where} " - "RETURN n, r.__type__ AS rel_type, m, n.id AS _src_id, m.id AS _dst_id " + "RETURN n, type(r) AS rel_type, m, n.id AS _src_id, m.id AS _dst_id " "LIMIT 1000" ) result = self._client.cypher(cypher, params=params) @@ -189,27 +176,23 @@ def get_rel_map( if not graph_nodes: return [] - # CoordiNode: wildcard [r] pattern returns no results. Fetch all - # known edge types from the schema and build a typed pattern instead, - # e.g. [r:TYPE_A|TYPE_B|...]. - schema_text = self._client.get_schema_text() - edge_types = _parse_edge_types_from_schema(schema_text) - ignored = set(ignore_rels) if ignore_rels else set() - active_types = [t for t in edge_types if t not in ignored] - - if not active_types: - return [] - - rel_filter = "|".join(_cypher_ident(t) for t in active_types) node_ids = [n.id for n in graph_nodes] - safe_limit = int(limit) # coerce to int to prevent Cypher injection via non-integer input + safe_limit = int(limit) params: dict[str, object] = {"ids": node_ids} + # Push ignore_rels filter into the WHERE clause so LIMIT applies only + # to non-ignored edges and callers receive up to `limit` visible results. 
+ if ignore_rels: + params["ignored"] = list(ignore_rels) + ignore_clause = "AND type(r) NOT IN $ignored " + else: + ignore_clause = "" + cypher = ( - f"MATCH (n)-[r:{rel_filter}]->(m) " - f"WHERE n.id IN $ids " - f"RETURN n, r.__type__ AS _rel_type, m, n.id AS _src_id, m.id AS _dst_id " + "MATCH (n)-[r]->(m) " + f"WHERE n.id IN $ids {ignore_clause}" + f"RETURN n, type(r) AS _rel_type, m, n.id AS _src_id, m.id AS _dst_id " + f"LIMIT {safe_limit}" ) result = self._client.cypher(cypher, params=params) @@ -237,28 +220,21 @@ def upsert_nodes(self, nodes: list[LabelledNode]) -> None: self._client.cypher(cypher, params={"id": node.id, "props": props}) def upsert_relations(self, relations: list[Relation]) -> None: - """Upsert relationships into the graph.""" + """Upsert relationships into the graph (idempotent via MERGE).""" for rel in relations: props = rel.properties or {} label = _cypher_ident(rel.label) - # CoordiNode does not yet support MERGE for edge patterns; use CREATE. - # A WHERE NOT (src)-[:TYPE]->(dst) guard was tested but returns 0 - # rows silently in CoordiNode, making all CREATE statements no-ops. - # Until server-side MERGE or pattern predicates are supported, - # repeated calls will create duplicate edges. - # SET r += $props is skipped when props is empty — SET r += {} is - # not supported by all server versions. 
if props: cypher = ( f"MATCH (src {{id: $src_id}}) MATCH (dst {{id: $dst_id}}) " - f"CREATE (src)-[r:{label}]->(dst) SET r += $props" + f"MERGE (src)-[r:{label}]->(dst) SET r += $props" ) self._client.cypher( cypher, params={"src_id": rel.source_id, "dst_id": rel.target_id, "props": props}, ) else: - cypher = f"MATCH (src {{id: $src_id}}) MATCH (dst {{id: $dst_id}}) CREATE (src)-[r:{label}]->(dst)" + cypher = f"MATCH (src {{id: $src_id}}) MATCH (dst {{id: $dst_id}}) MERGE (src)-[r:{label}]->(dst)" self._client.cypher( cypher, params={"src_id": rel.source_id, "dst_id": rel.target_id}, @@ -376,29 +352,3 @@ def _node_label(node: LabelledNode) -> str: if isinstance(node, EntityNode): return node.label or "Entity" return "Node" - - -def _parse_edge_types_from_schema(schema_text: str) -> list[str]: - """Extract edge type names from CoordiNode schema text. - - Parses the "Edge types:" section produced by ``get_schema_text()``. - """ - edge_types: list[str] = [] - lines = iter(schema_text.splitlines()) - - # Advance to the "Edge types:" header. - for line in lines: - if line.strip().lower().startswith("edge types"): - break - - # Collect bullet items until the first blank line. - for line in lines: - stripped = line.strip() - if not stripped: - break - if stripped.startswith(("-", "*")): - name = stripped.lstrip("-* ").split("(")[0].strip() - if name: - edge_types.append(name) - - return edge_types diff --git a/tests/integration/adapters/test_langchain.py b/tests/integration/adapters/test_langchain.py index 2d5e274..799e831 100644 --- a/tests/integration/adapters/test_langchain.py +++ b/tests/integration/adapters/test_langchain.py @@ -97,21 +97,15 @@ def test_add_graph_documents_creates_relationship(graph, unique_tag): graph.add_graph_documents([doc]) # Verify the relationship was created, not just the source node. 
- # count(*) instead of count(r): CoordiNode returns 0 for relationship-variable counts result = graph.query( - "MATCH (a:LCPerson2 {name: $src})-[r:LC_RESEARCHES]->(b:LCConcept {name: $dst}) RETURN count(*) AS cnt", + "MATCH (a:LCPerson2 {name: $src})-[r:LC_RESEARCHES]->(b:LCConcept {name: $dst}) RETURN count(r) AS cnt", params={"src": f"Charlie-{unique_tag}", "dst": f"GraphRAG-{unique_tag}"}, ) - assert result[0]["cnt"] >= 1, f"relationship not found: {result}" + assert result[0]["cnt"] == 1, f"expected exactly 1 relationship: {result}" def test_add_graph_documents_idempotent(graph, unique_tag): - """Calling add_graph_documents twice must not raise. - - Nodes are idempotent (MERGE). Edges are NOT — CoordiNode does not yet - support MERGE for edges, so unconditional CREATE is used and duplicate - edges are expected after two ingests. - """ + """Calling add_graph_documents twice produces exactly one edge (MERGE idempotent).""" node_a = Node(id=f"Idempotent-{unique_tag}", type="LCIdempotent") node_b = Node(id=f"IdempTarget-{unique_tag}", type="LCIdempotent") rel = Relationship(source=node_a, target=node_b, type="LC_IDEMP_REL") @@ -131,12 +125,12 @@ def test_add_graph_documents_idempotent(graph, unique_tag): ) assert result[0]["cnt"] == 1 - # Edges: unconditional CREATE → count >= 1 (may be > 1 due to CoordiNode limitation) + # Edges: MERGE keeps count at 1 (idempotent) result = graph.query( - "MATCH (a:LCIdempotent {name: $src})-[r:LC_IDEMP_REL]->(b:LCIdempotent {name: $dst}) RETURN count(*) AS cnt", + "MATCH (a:LCIdempotent {name: $src})-[r:LC_IDEMP_REL]->(b:LCIdempotent {name: $dst}) RETURN count(r) AS cnt", params={"src": f"Idempotent-{unique_tag}", "dst": f"IdempTarget-{unique_tag}"}, ) - assert result[0]["cnt"] >= 1 + assert result[0]["cnt"] == 1 def test_schema_refreshes_after_add(graph, unique_tag): diff --git a/tests/integration/adapters/test_llama_index.py b/tests/integration/adapters/test_llama_index.py index bf90e74..771a324 100644 --- 
a/tests/integration/adapters/test_llama_index.py +++ b/tests/integration/adapters/test_llama_index.py @@ -96,11 +96,9 @@ def test_upsert_and_get_triplets(store, tag): ) store.upsert_relations([rel]) - # CoordiNode does not support wildcard [r] patterns yet — must pass relation_names. - # See: get_triplets() implementation note. + # Wildcard [r] works — no need to specify relation_names. triplets = store.get_triplets( entity_names=[f"Src-{tag}"], - relation_names=["LI_RESEARCHES"], ) assert isinstance(triplets, list) assert len(triplets) >= 1 From ab1ea64b1934a3b42a439cb46b00abcefed7a003 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 12 Apr 2026 12:38:21 +0300 Subject: [PATCH 2/7] docs(langchain): explain why refresh_schema uses no LIMIT on DISTINCT query --- langchain-coordinode/langchain_coordinode/graph.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/langchain-coordinode/langchain_coordinode/graph.py b/langchain-coordinode/langchain_coordinode/graph.py index 2c97d26..c9b2e51 100644 --- a/langchain-coordinode/langchain_coordinode/graph.py +++ b/langchain-coordinode/langchain_coordinode/graph.py @@ -72,6 +72,11 @@ def refresh_schema(self) -> None: structured = _parse_schema(text) # Augment with relationship triples (start_label, type, end_label) via # Cypher — get_schema_text() only lists edge types without direction. + # No LIMIT here intentionally: RETURN DISTINCT already collapses all edges + # to unique (src_label, rel_type, dst_label) combinations, so the result + # is bounded by the number of distinct relationship type triples, not by + # total edge count. Adding a LIMIT would silently drop relationship types + # that happen to appear beyond the limit, producing an incomplete schema. 
rows = self._client.cypher( "MATCH (a)-[r]->(b) RETURN DISTINCT labels(a) AS src_labels, type(r) AS rel, labels(b) AS dst_labels" ) From 69fa9916b29959d82d9fffd3518ad8326eaeeaac Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 12 Apr 2026 12:39:55 +0300 Subject: [PATCH 3/7] test(llama-index): add vector_query() integration tests --- .../integration/adapters/test_llama_index.py | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/integration/adapters/test_llama_index.py b/tests/integration/adapters/test_llama_index.py index 771a324..635fe1d 100644 --- a/tests/integration/adapters/test_llama_index.py +++ b/tests/integration/adapters/test_llama_index.py @@ -12,6 +12,7 @@ import pytest from llama_index.core.graph_stores.types import EntityNode, Relation +from llama_index.core.vector_stores.types import VectorStoreQuery from llama_index.graph_stores.coordinode import CoordinodePropertyGraphStore ADDR = os.environ.get("COORDINODE_ADDR", "localhost:7080") @@ -150,3 +151,40 @@ def test_delete_by_entity_name(store, tag): found = store.get(properties={"name": f"DelNamed-{tag}"}) assert len(found) == 0 + + +# ── Vector query ────────────────────────────────────────────────────────────── + + +def test_vector_query_returns_results(store, tag): + """vector_query() returns nodes and scores for an embedding that matches stored data. + + vector_query() without filters defaults to label="Chunk", so the seed node must use + that label to be found by the underlying vector_search() call. + """ + vec = [float(i) / 16 for i in range(16)] + # Seed a Chunk node with an embedding directly via Cypher. + # vector_query() defaults label to "Chunk" when no MetadataFilters are provided. 
+ store._client.cypher( + "CREATE (n:Chunk {id: $id, text: $text, embedding: $vec})", + params={"id": f"vec-{tag}", "text": "test chunk", "vec": vec}, + ) + try: + query = VectorStoreQuery(query_embedding=vec, similarity_top_k=1) + nodes, scores = store.vector_query(query) + + assert isinstance(nodes, list) + assert isinstance(scores, list) + assert len(nodes) >= 1 + assert len(scores) == len(nodes) + assert scores[0] >= 0.0 + finally: + store._client.cypher("MATCH (n:Chunk {id: $id}) DELETE n", params={"id": f"vec-{tag}"}) + + +def test_vector_query_empty_embedding_returns_empty(store): + """vector_query() with no query_embedding returns empty lists without error.""" + query = VectorStoreQuery(query_embedding=None, similarity_top_k=5) + nodes, scores = store.vector_query(query) + assert nodes == [] + assert scores == [] From 778e8c336588b13465993312c50f48cf9b0f8e63 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 12 Apr 2026 12:54:56 +0300 Subject: [PATCH 4/7] fix(langchain): use min() in _first_label for deterministic label selection openCypher does not guarantee a stable ordering for labels(), so using labels[0] produced nondeterministic schema entries across refresh_schema() calls. Replace with min(labels) to always select the lexicographically smallest label consistently. Also strengthen the vector_query() integration test: capture the seeded node's internal CoordiNode ID from CREATE RETURN and assert it appears in the returned ChunkNode list, proving the specific seeded node was found rather than any pre-existing Chunk. 
--- langchain-coordinode/langchain_coordinode/graph.py | 9 +++++++-- tests/integration/adapters/test_llama_index.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/langchain-coordinode/langchain_coordinode/graph.py b/langchain-coordinode/langchain_coordinode/graph.py index c9b2e51..9fa96fd 100644 --- a/langchain-coordinode/langchain_coordinode/graph.py +++ b/langchain-coordinode/langchain_coordinode/graph.py @@ -228,9 +228,14 @@ def _stable_document_id(source: Any) -> str: def _first_label(labels: Any) -> str | None: - """Extract the first label from a labels() result (list of strings).""" + """Extract a stable label from a labels() result (list of strings). + + openCypher does not guarantee a stable ordering for labels(), so using + labels[0] would produce nondeterministic schema entries across calls. + We return the lexicographically smallest label as a deterministic rule. + """ if isinstance(labels, list) and labels: - return str(labels[0]) + return str(min(labels)) if isinstance(labels, str): return labels return None diff --git a/tests/integration/adapters/test_llama_index.py b/tests/integration/adapters/test_llama_index.py index 635fe1d..5083f00 100644 --- a/tests/integration/adapters/test_llama_index.py +++ b/tests/integration/adapters/test_llama_index.py @@ -165,10 +165,13 @@ def test_vector_query_returns_results(store, tag): vec = [float(i) / 16 for i in range(16)] # Seed a Chunk node with an embedding directly via Cypher. # vector_query() defaults label to "Chunk" when no MetadataFilters are provided. - store._client.cypher( - "CREATE (n:Chunk {id: $id, text: $text, embedding: $vec})", + # Capture the internal CoordiNode node ID (returned as integer by RETURN n) so we + # can assert the specific seeded node is retrieved — not just any pre-existing Chunk. 
+ seed_rows = store._client.cypher( + "CREATE (n:Chunk {id: $id, text: $text, embedding: $vec}) RETURN n AS nid", params={"id": f"vec-{tag}", "text": "test chunk", "vec": vec}, ) + seeded_internal_id = str(seed_rows[0]["nid"]) try: query = VectorStoreQuery(query_embedding=vec, similarity_top_k=1) nodes, scores = store.vector_query(query) @@ -176,6 +179,9 @@ def test_vector_query_returns_results(store, tag): assert isinstance(nodes, list) assert isinstance(scores, list) assert len(nodes) >= 1 + # vector_search returns CoordiNode internal node IDs (ChunkNode.id_); + # verify our seeded node is the one found. + assert any(str(getattr(node, "id_", "")) == seeded_internal_id for node in nodes) assert len(scores) == len(nodes) assert scores[0] >= 0.0 finally: From f31a6976504ac9482aa3f5bbc591c7dba3079bb8 Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 12 Apr 2026 13:06:30 +0300 Subject: [PATCH 5/7] test(llama-index): move seed creation into try block for reliable cleanup Ensures the finally DELETE runs even if seeded_internal_id extraction fails after a successful CREATE. Addresses CodeRabbit nitpick on test robustness. --- .../integration/adapters/test_llama_index.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tests/integration/adapters/test_llama_index.py b/tests/integration/adapters/test_llama_index.py index 5083f00..fb4fcdc 100644 --- a/tests/integration/adapters/test_llama_index.py +++ b/tests/integration/adapters/test_llama_index.py @@ -163,16 +163,18 @@ def test_vector_query_returns_results(store, tag): that label to be found by the underlying vector_search() call. """ vec = [float(i) / 16 for i in range(16)] - # Seed a Chunk node with an embedding directly via Cypher. - # vector_query() defaults label to "Chunk" when no MetadataFilters are provided. 
- # Capture the internal CoordiNode node ID (returned as integer by RETURN n) so we - # can assert the specific seeded node is retrieved — not just any pre-existing Chunk. - seed_rows = store._client.cypher( - "CREATE (n:Chunk {id: $id, text: $text, embedding: $vec}) RETURN n AS nid", - params={"id": f"vec-{tag}", "text": "test chunk", "vec": vec}, - ) - seeded_internal_id = str(seed_rows[0]["nid"]) + # Seeding is inside the try block so that the finally cleanup always runs even if + # the CREATE succeeds but extracting seeded_internal_id raises (e.g., unexpected + # response format). vector_query() defaults label to "Chunk" when no + # MetadataFilters are provided. try: + # Capture the internal CoordiNode node ID (returned as integer by RETURN n) so + # we can assert the specific seeded node is retrieved — not just any Chunk. + seed_rows = store._client.cypher( + "CREATE (n:Chunk {id: $id, text: $text, embedding: $vec}) RETURN n AS nid", + params={"id": f"vec-{tag}", "text": "test chunk", "vec": vec}, + ) + seeded_internal_id = str(seed_rows[0]["nid"]) query = VectorStoreQuery(query_embedding=vec, similarity_top_k=1) nodes, scores = store.vector_query(query) From 543816f07b5ccd3212aa949fe52b363e5d89a2ee Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 12 Apr 2026 13:30:08 +0300 Subject: [PATCH 6/7] test(llama-index): clarify CoordiNode RETURN n and vector_search behaviour MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add detailed comments explaining: - In CoordiNode, CREATE ... 
RETURN n yields the internal integer node ID, not a property map — verified empirically (seed_rows[0]["nid"] == int) - vector_search returns Node(properties={}) so node.properties.get("id") is always None and cannot be used for node identification - ChunkNode.id_ == str(r.node.id) is the correct comparison target --- tests/integration/adapters/test_llama_index.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/integration/adapters/test_llama_index.py b/tests/integration/adapters/test_llama_index.py index fb4fcdc..8496257 100644 --- a/tests/integration/adapters/test_llama_index.py +++ b/tests/integration/adapters/test_llama_index.py @@ -168,8 +168,16 @@ def test_vector_query_returns_results(store, tag): # response format). vector_query() defaults label to "Chunk" when no # MetadataFilters are provided. try: - # Capture the internal CoordiNode node ID (returned as integer by RETURN n) so - # we can assert the specific seeded node is retrieved — not just any Chunk. + # In CoordiNode, `CREATE (n:...) RETURN n` returns the internal integer node ID, + # NOT a property map. This is CoordiNode-specific behaviour verified empirically: + # seed_rows[0]["nid"] → 90 (int) + # ChunkNode.id_ is set from vector_search's r.node.id (same internal integer), + # so comparing str(node.id_) == str(seed_rows[0]["nid"]) correctly identifies + # the specific seeded node. + # + # NOTE: vector_search returns Node(id=N, properties={}) — the properties dict is + # always EMPTY, so node.properties.get("id") would always be None and cannot be + # used for identification. 
seed_rows = store._client.cypher( "CREATE (n:Chunk {id: $id, text: $text, embedding: $vec}) RETURN n AS nid", params={"id": f"vec-{tag}", "text": "test chunk", "vec": vec}, @@ -181,8 +189,7 @@ def test_vector_query_returns_results(store, tag): assert isinstance(nodes, list) assert isinstance(scores, list) assert len(nodes) >= 1 - # vector_search returns CoordiNode internal node IDs (ChunkNode.id_); - # verify our seeded node is the one found. + # ChunkNode.id_ == str(r.node.id) == internal CoordiNode node ID captured above. assert any(str(getattr(node, "id_", "")) == seeded_internal_id for node in nodes) assert len(scores) == len(nodes) assert scores[0] >= 0.0 From f0e1ff3f954e83f11671fb385c058af514c72c6e Mon Sep 17 00:00:00 2001 From: Dmitry Prudnikov Date: Sun, 12 Apr 2026 13:57:04 +0300 Subject: [PATCH 7/7] fix(langchain): deduplicate relationship triples after _first_label normalization RETURN DISTINCT operates on raw label lists, but min()-based _first_label() can collapse different multi-label combos (e.g. ['Employee','Person'] and ['Person','Employee']) into the same (start, type, end) triple. Use a set for deduplication after normalization so each relationship triple appears once. Also make the vector_query() integration test embedding unique per test tag (derived from tag bytes) and increase similarity_top_k to 5 to prevent flaky results in shared integration DBs where another :Chunk may share the same vector. 
--- .../langchain_coordinode/graph.py | 20 ++++++++++++------- .../integration/adapters/test_llama_index.py | 10 ++++++++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/langchain-coordinode/langchain_coordinode/graph.py b/langchain-coordinode/langchain_coordinode/graph.py index 9fa96fd..8a19ff6 100644 --- a/langchain-coordinode/langchain_coordinode/graph.py +++ b/langchain-coordinode/langchain_coordinode/graph.py @@ -81,14 +81,20 @@ def refresh_schema(self) -> None: "MATCH (a)-[r]->(b) RETURN DISTINCT labels(a) AS src_labels, type(r) AS rel, labels(b) AS dst_labels" ) if rows: + # Deduplicate after _first_label() normalization: RETURN DISTINCT operates on + # raw label lists, but _first_label(min()) can collapse different multi-label + # combinations to the same (start, type, end) triple (e.g. ['Employee','Person'] + # and ['Person','Employee'] both min-normalize to 'Employee'). Use a set to + # ensure each relationship triple appears at most once. + triples: set[tuple[str, str, str]] = set() + for row in rows: + start = _first_label(row.get("src_labels")) + end = _first_label(row.get("dst_labels")) + rel = row.get("rel") + if start and rel and end: + triples.add((start, rel, end)) structured["relationships"] = [ - { - "start": _first_label(row.get("src_labels")), - "type": row["rel"], - "end": _first_label(row.get("dst_labels")), - } - for row in rows - if _first_label(row.get("src_labels")) and row.get("rel") and _first_label(row.get("dst_labels")) + {"start": start, "type": rel, "end": end} for start, rel, end in sorted(triples) ] self._structured_schema = structured diff --git a/tests/integration/adapters/test_llama_index.py b/tests/integration/adapters/test_llama_index.py index 8496257..971a2f2 100644 --- a/tests/integration/adapters/test_llama_index.py +++ b/tests/integration/adapters/test_llama_index.py @@ -162,7 +162,11 @@ def test_vector_query_returns_results(store, tag): vector_query() without filters defaults to label="Chunk", so the 
seed node must use that label to be found by the underlying vector_search() call. """ - vec = [float(i) / 16 for i in range(16)] + # Derive a unique embedding from the test tag so that no other :Chunk in the shared + # integration DB can have the same or closer vector, preventing flaky top-k results. + # tag is uuid4().hex[:8] → 8 hex chars → 4 bytes of entropy. + seed = list(bytes.fromhex(tag)) + vec = [float(seed[i % len(seed)]) / 255.0 for i in range(16)] # Seeding is inside the try block so that the finally cleanup always runs even if # the CREATE succeeds but extracting seeded_internal_id raises (e.g., unexpected # response format). vector_query() defaults label to "Chunk" when no @@ -183,7 +187,9 @@ def test_vector_query_returns_results(store, tag): params={"id": f"vec-{tag}", "text": "test chunk", "vec": vec}, ) seeded_internal_id = str(seed_rows[0]["nid"]) - query = VectorStoreQuery(query_embedding=vec, similarity_top_k=1) + # top_k=5: even if other :Chunk nodes exist with similar vectors, the unique + # tag-based embedding ensures ours is among the closest results. + query = VectorStoreQuery(query_embedding=vec, similarity_top_k=5) nodes, scores = store.vector_query(query) assert isinstance(nodes, list)