From 3af964c82137c3d520b736689e7213e5faeb9747 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Tue, 11 Nov 2025 16:20:24 -0800 Subject: [PATCH 1/5] draft search by similarity feature vec --- mp_api/client/routes/materials/similarity.py | 76 +++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/mp_api/client/routes/materials/similarity.py b/mp_api/client/routes/materials/similarity.py index cf1dc175e..037750aba 100644 --- a/mp_api/client/routes/materials/similarity.py +++ b/mp_api/client/routes/materials/similarity.py @@ -1,16 +1,31 @@ from __future__ import annotations -from emmet.core.similarity import SimilarityDoc +from typing import TYPE_CHECKING -from mp_api.client.core import BaseRester +from emmet.core.mpid import MPID, AlphaID +from emmet.core.similarity import CrystalNNSimilarity, SimilarityDoc, SimilarityEntry + +from mp_api.client.core import BaseRester, MPRestError from mp_api.client.core.utils import validate_ids +if TYPE_CHECKING: + from emmet.core.similarity import SimilarityScorer + from pymatgen.core import Structure + class SimilarityRester(BaseRester): suffix = "materials/similarity" document_model = SimilarityDoc # type: ignore primary_key = "material_id" + _fingerprinter: SimilarityScorer | None = None + + @property + def fingerprinter(self, structure: Structure) -> list[float]: + if self._fingerprinter is None: + self._fingerprinter = CrystalNNSimilarity() + return self._fingerprinter()._featurize_structure(structure).tolist() + def search( self, material_ids: str | list[str] | None = None, @@ -53,3 +68,60 @@ def search( fields=fields, **query_params, ) + + def find_similar( + self, + structure_or_mpid: Structure | str | MPID | AlphaID, + num_chunks: int | None = None, + chunk_size: int | None = 1000, + ) -> list[SimilarityEntry] | list[dict]: + """Find structures similar to a user-submitted structure. + + Arguments: + structure_or_mpid : pymatgen .Structure, or str, MPID, AlphaID + If a .Structure, the feature vector is computed on the fly + If a str, MPID, or AlphaID, attempts to retrieve a pre-computed + feature vector using the input as a material ID + num_chunks (int or None): Maximum number of chunks of data to yield. None will yield all possible. + chunk_size (int or None): Number of data entries per chunk. + + Returns: + ([SimilarityEntry] | [dict]) List of SimilarityEntry documents + (if `use_document_model`) or dict (otherwise) listing + structures most similar to the input structure. + """ + if isinstance(structure_or_mpid, str | MPID | AlphaID): + fmt_idx = AlphaID(structure_or_mpid).string + + docs = self.search(material_ids=[fmt_idx], fields=["feature_vector"]) + if not docs: + raise MPRestError(f"No similarity data available for {fmt_idx}") + feature_vector = docs[0]["feature_vector"] + else: + feature_vector = self.fingerprinter(structure_or_mpid) + + result = self._query_resource( + criteria={"feature_vector": feature_vector, "_limit": chunk_size}, + suburl="match", + use_document_model=False, # Return type is not exactly a SimilarityDoc, closer to SimilarityEntry + chunk_size=chunk_size, + num_chunks=num_chunks, + ).get("data", None) + + if result is None: + raise MPRestError( + "Could not find any structures similar to the input structure." + ) + + sim_docs = [ + { + "formula": entry["formula_pretty"], + "task_id": entry["material_id"], + "dissimilarity": 100 * (1.0 - entry["score"]), + } + for entry in result + ] + + if self.use_document_model: + return [SimilarityEntry(**doc) for doc in sim_docs] + return sim_docs From 855150cf6f7c43d7c46bb38ece39c93b880a0bda Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Thu, 4 Dec 2025 17:46:58 -0800 Subject: [PATCH 2/5] update commensurate with emmet pr --- .gemini/settings.json | 19 +++++++++++++++++++ mp_api/client/routes/materials/similarity.py | 11 ++++++++++- .../requirements-ubuntu-latest_py3.11.txt | 2 +- ...quirements-ubuntu-latest_py3.11_extras.txt | 2 +- .../requirements-ubuntu-latest_py3.12.txt | 2 +- ...quirements-ubuntu-latest_py3.12_extras.txt | 2 +- 6 files changed, 33 insertions(+), 5 deletions(-) create mode 100644 .gemini/settings.json diff --git a/.gemini/settings.json b/.gemini/settings.json new file mode 100644 index 000000000..d9ac4567e --- /dev/null +++ b/.gemini/settings.json @@ -0,0 +1,19 @@ +{ + "mcpServers": { + "Materials_Project_MCP": { + "command": "uv", + "args": [ + "run", + "--project", + "/Users/aaronkaplan/Library/CloudStorage/Dropbox/mp_soft_engr/software/mp_api", + "--with", + "fastmcp", + "--with-requirements", + "/Users/aaronkaplan/Library/CloudStorage/Dropbox/mp_soft_engr/software/mp_api/requirements/requirements-ubuntu-latest_py3.12_extras.txt", + "fastmcp", + "run", + "/Users/aaronkaplan/Library/CloudStorage/Dropbox/mp_soft_engr/software/mp_api/mp_api/mcp/server.py" + ] + } + } +} diff --git a/mp_api/client/routes/materials/similarity.py b/mp_api/client/routes/materials/similarity.py index 037750aba..283eae4f7 100644 --- a/mp_api/client/routes/materials/similarity.py +++ b/mp_api/client/routes/materials/similarity.py @@ -1,9 +1,12 @@ from __future__ import annotations +import zlib from typing import TYPE_CHECKING +import numpy as np from emmet.core.mpid import MPID, AlphaID from emmet.core.similarity import CrystalNNSimilarity, SimilarityDoc, SimilarityEntry +from pymatgen.core import Composition from mp_api.client.core import BaseRester, MPRestError from mp_api.client.core.utils import validate_ids @@ -101,7 +104,12 @@ def find_similar( feature_vector = self.fingerprinter(structure_or_mpid) result = self._query_resource( - criteria={"feature_vector": feature_vector, "_limit": chunk_size}, + criteria={ + "feature_vector_hex": zlib.compress( + np.array(feature_vector).tobytes() + ).hex(), + "_limit": chunk_size, + }, suburl="match", use_document_model=False, # Return type is not exactly a SimilarityDoc, closer to SimilarityEntry chunk_size=chunk_size, @@ -117,6 +125,7 @@ def find_similar( { "formula": entry["formula_pretty"], "task_id": entry["material_id"], + "nelements": len(Composition(entry["formula_pretty"]).elements), "dissimilarity": 100 * (1.0 - entry["score"]), } for entry in result diff --git a/requirements/requirements-ubuntu-latest_py3.11.txt b/requirements/requirements-ubuntu-latest_py3.11.txt index d3ff66cb4..6a3041a6d 100644 --- a/requirements/requirements-ubuntu-latest_py3.11.txt +++ b/requirements/requirements-ubuntu-latest_py3.11.txt @@ -24,7 +24,7 @@ contourpy==1.3.3 # via matplotlib cycler==0.12.1 # via matplotlib -emmet-core==0.86.0 +emmet-core==0.86.2rc0 # via mp-api (pyproject.toml) fonttools==4.60.1 # via matplotlib diff --git a/requirements/requirements-ubuntu-latest_py3.11_extras.txt b/requirements/requirements-ubuntu-latest_py3.11_extras.txt index 59ac2f166..631c069ca 100644 --- a/requirements/requirements-ubuntu-latest_py3.11_extras.txt +++ b/requirements/requirements-ubuntu-latest_py3.11_extras.txt @@ -62,7 +62,7 @@ dnspython==2.8.0 # pymongo docutils==0.21.2 # via sphinx -emmet-core[all]==0.86.0 +emmet-core[all]==0.86.2rc0 # via mp-api (pyproject.toml) execnet==2.1.1 # via pytest-xdist diff --git a/requirements/requirements-ubuntu-latest_py3.12.txt b/requirements/requirements-ubuntu-latest_py3.12.txt index 29ee10749..2af32c831 100644 --- a/requirements/requirements-ubuntu-latest_py3.12.txt +++ b/requirements/requirements-ubuntu-latest_py3.12.txt @@ -24,7 +24,7 @@ contourpy==1.3.3 # via matplotlib cycler==0.12.1 # via matplotlib -emmet-core==0.86.0 +emmet-core==0.86.2rc0 # via mp-api (pyproject.toml) fonttools==4.60.1 # via matplotlib diff --git a/requirements/requirements-ubuntu-latest_py3.12_extras.txt b/requirements/requirements-ubuntu-latest_py3.12_extras.txt index 9b4c609a5..668ba3056 100644 --- a/requirements/requirements-ubuntu-latest_py3.12_extras.txt +++ b/requirements/requirements-ubuntu-latest_py3.12_extras.txt @@ -62,7 +62,7 @@ dnspython==2.8.0 # pymongo docutils==0.21.2 # via sphinx -emmet-core[all]==0.86.0 +emmet-core[all]==0.86.2rc0 # via mp-api (pyproject.toml) execnet==2.1.1 # via pytest-xdist From 4849293236f6665947e5b5857d38586acc9436f1 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Thu, 4 Dec 2025 17:47:17 -0800 Subject: [PATCH 3/5] remove file --- .gemini/settings.json | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 .gemini/settings.json diff --git a/.gemini/settings.json b/.gemini/settings.json deleted file mode 100644 index d9ac4567e..000000000 --- a/.gemini/settings.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "mcpServers": { - "Materials_Project_MCP": { - "command": "uv", - "args": [ - "run", - "--project", - "/Users/aaronkaplan/Library/CloudStorage/Dropbox/mp_soft_engr/software/mp_api", - "--with", - "fastmcp", - "--with-requirements", - "/Users/aaronkaplan/Library/CloudStorage/Dropbox/mp_soft_engr/software/mp_api/requirements/requirements-ubuntu-latest_py3.12_extras.txt", - "fastmcp", - "run", - "/Users/aaronkaplan/Library/CloudStorage/Dropbox/mp_soft_engr/software/mp_api/mp_api/mcp/server.py" - ] - } - } -} From 78355e1a3ef5f592e09240893edc92fd940ab334 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Fri, 5 Dec 2025 09:57:49 -0800 Subject: [PATCH 4/5] tune up methods + add limit to number of returned results --- mp_api/client/routes/materials/similarity.py | 43 +++++++++++++++----- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/mp_api/client/routes/materials/similarity.py b/mp_api/client/routes/materials/similarity.py index 283eae4f7..b468ef34e 100644 --- a/mp_api/client/routes/materials/similarity.py +++ b/mp_api/client/routes/materials/similarity.py @@ -6,14 +6,16 @@ import numpy as np from emmet.core.mpid import MPID, AlphaID from emmet.core.similarity import CrystalNNSimilarity, SimilarityDoc, SimilarityEntry -from pymatgen.core import Composition +from pymatgen.core import Composition, Structure from mp_api.client.core import BaseRester, MPRestError from mp_api.client.core.utils import validate_ids if TYPE_CHECKING: from emmet.core.similarity import SimilarityScorer - from pymatgen.core import Structure + +# This limit seems to be associated with MongoDB vector search +MAX_VECTOR_SEARCH_RESULTS = 10_000 class SimilarityRester(BaseRester): @@ -23,11 +25,15 @@ class SimilarityRester(BaseRester): _fingerprinter: SimilarityScorer | None = None - @property - def fingerprinter(self, structure: Structure) -> list[float]: + def fingerprint_structure(self, structure: Structure) -> np.ndarray: + """Get the fingerprint of a user-submitted structures.""" if self._fingerprinter is None: self._fingerprinter = CrystalNNSimilarity() - return self._fingerprinter()._featurize_structure(structure).tolist() + return self._fingerprinter._featurize_structure(structure) + + def _get_hex_fingerprint(self, feature_vetor: np.ndarray) -> str: + """Convert feature vector fingerprint to compressed hex str.""" + return zlib.compress(feature_vetor.tobytes()).hex() def search( self, @@ -75,18 +81,24 @@ def search( def find_similar( self, structure_or_mpid: Structure | str | MPID | AlphaID, + top: int | None = 50, num_chunks: int | None = None, chunk_size: int | None = 1000, ) -> list[SimilarityEntry] | list[dict]: - """Find structures similar to a user-submitted structure. + """Find structures most similar to a user-submitted structure. Arguments: structure_or_mpid : pymatgen .Structure, or str, MPID, AlphaID If a .Structure, the feature vector is computed on the fly If a str, MPID, or AlphaID, attempts to retrieve a pre-computed feature vector using the input as a material ID + top : int + The number of most similar materials to return, defaults to 50. + Setting to None will return the maximum possible number of + most similar materials.. num_chunks (int or None): Maximum number of chunks of data to yield. None will yield all possible. chunk_size (int or None): Number of data entries per chunk. + The chunk_size is also used to limit the number of responses returned. Returns: ([SimilarityEntry] | [dict]) List of SimilarityEntry documents @@ -100,15 +112,24 @@ def find_similar( if not docs: raise MPRestError(f"No similarity data available for {fmt_idx}") feature_vector = docs[0]["feature_vector"] + + elif isinstance(structure_or_mpid, Structure): + feature_vector = self.fingerprint_structure(structure_or_mpid) + else: - feature_vector = self.fingerprinter(structure_or_mpid) + raise ValueError("Please submit a pymatgen Structure or MP ID.") + + top = top or MAX_VECTOR_SEARCH_RESULTS + if not isinstance(top, int) or top < 1: + raise ValueError( + f"Invalid number of possible top matches specified = {top}." + "Please specify a positive integer or `None` to return all results." + ) result = self._query_resource( criteria={ - "feature_vector_hex": zlib.compress( - np.array(feature_vector).tobytes() - ).hex(), - "_limit": chunk_size, + "feature_vector_hex": self._get_hex_fingerprint(feature_vector), + "_limit": top, }, suburl="match", use_document_model=False, # Return type is not exactly a SimilarityDoc, closer to SimilarityEntry From 2e8d9f8684812a415e64d1595280fb947b6afd00 Mon Sep 17 00:00:00 2001 From: esoteric-ephemera Date: Fri, 5 Dec 2025 13:28:06 -0800 Subject: [PATCH 5/5] revise similarity search to reduce to hex and norm --- mp_api/client/routes/materials/similarity.py | 18 ++++++++++-------- pyproject.toml | 4 ++-- .../requirements-ubuntu-latest_py3.11.txt | 2 +- ...equirements-ubuntu-latest_py3.11_extras.txt | 2 +- .../requirements-ubuntu-latest_py3.12.txt | 2 +- ...equirements-ubuntu-latest_py3.12_extras.txt | 2 +- 6 files changed, 16 insertions(+), 14 deletions(-) diff --git a/mp_api/client/routes/materials/similarity.py b/mp_api/client/routes/materials/similarity.py index b468ef34e..6cb600d59 100644 --- a/mp_api/client/routes/materials/similarity.py +++ b/mp_api/client/routes/materials/similarity.py @@ -1,17 +1,21 @@ from __future__ import annotations -import zlib from typing import TYPE_CHECKING -import numpy as np from emmet.core.mpid import MPID, AlphaID -from emmet.core.similarity import CrystalNNSimilarity, SimilarityDoc, SimilarityEntry +from emmet.core.similarity import ( + CrystalNNSimilarity, + SimilarityDoc, + SimilarityEntry, + _vector_to_hex_and_norm, +) from pymatgen.core import Composition, Structure from mp_api.client.core import BaseRester, MPRestError from mp_api.client.core.utils import validate_ids if TYPE_CHECKING: + import numpy as np from emmet.core.similarity import SimilarityScorer # This limit seems to be associated with MongoDB vector search @@ -31,10 +35,6 @@ def fingerprint_structure(self, structure: Structure) -> np.ndarray: self._fingerprinter = CrystalNNSimilarity() return self._fingerprinter._featurize_structure(structure) - def _get_hex_fingerprint(self, feature_vetor: np.ndarray) -> str: - """Convert feature vector fingerprint to compressed hex str.""" - return zlib.compress(feature_vetor.tobytes()).hex() - def search( self, material_ids: str | list[str] | None = None, @@ -126,9 +126,11 @@ def find_similar( "Please specify a positive integer or `None` to return all results." ) + vector_hex, vector_norm = _vector_to_hex_and_norm(feature_vector) result = self._query_resource( criteria={ - "feature_vector_hex": self._get_hex_fingerprint(feature_vector), + "feature_vector_hex": vector_hex, + "feature_vector_norm": vector_norm, "_limit": top, }, suburl="match", diff --git a/pyproject.toml b/pyproject.toml index afefc1e06..093f8312b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "typing-extensions>=3.7.4.1", "requests>=2.23.0", "monty>=2024.12.10", - "emmet-core>=0.85.1rc0", + "emmet-core>=0.86.2rc1", "smart_open", "boto3", "orjson >= 3.10,<4", @@ -33,7 +33,7 @@ dependencies = [ dynamic = ["version"] [project.optional-dependencies] -all = ["emmet-core[all]>=0.85.1rc0", "custodian", "mpcontribs-client>=5.10"] +all = ["emmet-core[all]>=0.86.2rc1", "custodian", "mpcontribs-client>=5.10"] test = [ "pre-commit", "pytest", diff --git a/requirements/requirements-ubuntu-latest_py3.11.txt b/requirements/requirements-ubuntu-latest_py3.11.txt index 6a3041a6d..11c93cf9d 100644 --- a/requirements/requirements-ubuntu-latest_py3.11.txt +++ b/requirements/requirements-ubuntu-latest_py3.11.txt @@ -24,7 +24,7 @@ contourpy==1.3.3 # via matplotlib cycler==0.12.1 # via matplotlib -emmet-core==0.86.2rc0 +emmet-core==0.86.2rc1 # via mp-api (pyproject.toml) fonttools==4.60.1 # via matplotlib diff --git a/requirements/requirements-ubuntu-latest_py3.11_extras.txt b/requirements/requirements-ubuntu-latest_py3.11_extras.txt index 631c069ca..c94daa297 100644 --- a/requirements/requirements-ubuntu-latest_py3.11_extras.txt +++ b/requirements/requirements-ubuntu-latest_py3.11_extras.txt @@ -62,7 +62,7 @@ dnspython==2.8.0 # pymongo docutils==0.21.2 # via sphinx -emmet-core[all]==0.86.2rc0 +emmet-core[all]==0.86.2rc1 # via mp-api (pyproject.toml) execnet==2.1.1 # via pytest-xdist diff --git a/requirements/requirements-ubuntu-latest_py3.12.txt b/requirements/requirements-ubuntu-latest_py3.12.txt index 2af32c831..8ecc55fff 100644 --- a/requirements/requirements-ubuntu-latest_py3.12.txt +++ b/requirements/requirements-ubuntu-latest_py3.12.txt @@ -24,7 +24,7 @@ contourpy==1.3.3 # via matplotlib cycler==0.12.1 # via matplotlib -emmet-core==0.86.2rc0 +emmet-core==0.86.2rc1 # via mp-api (pyproject.toml) fonttools==4.60.1 # via matplotlib diff --git a/requirements/requirements-ubuntu-latest_py3.12_extras.txt b/requirements/requirements-ubuntu-latest_py3.12_extras.txt index 668ba3056..c5f57a7d6 100644 --- a/requirements/requirements-ubuntu-latest_py3.12_extras.txt +++ b/requirements/requirements-ubuntu-latest_py3.12_extras.txt @@ -62,7 +62,7 @@ dnspython==2.8.0 # pymongo docutils==0.21.2 # via sphinx -emmet-core[all]==0.86.2rc0 +emmet-core[all]==0.86.2rc1 # via mp-api (pyproject.toml) execnet==2.1.1 # via pytest-xdist