From 7492accca961fdf522211249db798d91c5fe1022 Mon Sep 17 00:00:00 2001 From: Robert Forrest Date: Mon, 13 Apr 2026 17:17:24 +0100 Subject: [PATCH 1/2] fix: use lmdb_get to access already loaded data without re-reading --- .../reference/reference_dataset_serializer.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mattergen/evaluation/reference/reference_dataset_serializer.py b/mattergen/evaluation/reference/reference_dataset_serializer.py index 4ddaee79..46b471d9 100644 --- a/mattergen/evaluation/reference/reference_dataset_serializer.py +++ b/mattergen/evaluation/reference/reference_dataset_serializer.py @@ -136,7 +136,7 @@ def __init__(self, lmdb_path: Path, cleanup_dir: bool = False): """ self.env = lmdb_open(lmdb_path, readonly=True) self.num_entries_by_chemsys_reduced_formulas = ( - self._build_num_entries_by_chemsys_reduced_formulas(lmdb_path) + self._build_num_entries_by_chemsys_reduced_formulas() ) self.total_num_entries = sum( sum(d.values()) for d in self.num_entries_by_chemsys_reduced_formulas.values() @@ -144,14 +144,12 @@ def __init__(self, lmdb_path: Path, cleanup_dir: bool = False): # close the LMDB environment when this object is garbage collected weakref.finalize(self, self._cleanup, self.env, cleanup_dir) - def _build_num_entries_by_chemsys_reduced_formulas( - self, lmdb_path: Path - ) -> dict[str, dict[str, int]]: - chemical_systems = lmdb_read_metadata(lmdb_path, "chemical_systems") + def _build_num_entries_by_chemsys_reduced_formulas(self) -> dict[str, dict[str, int]]: result: defaultdict[str, dict[str, int]] = defaultdict(dict) with self.env.begin() as txn: + chemical_systems = lmdb_get(txn, "chemical_systems") for chemsys in chemical_systems: - reduced_formulas = lmdb_read_metadata(lmdb_path, f"{chemsys}.reduced_formulas") + reduced_formulas = lmdb_get(txn, f"{chemsys}.reduced_formulas") for reduced_formula in reduced_formulas: result[chemsys][reduced_formula] = lmdb_get( txn, f"{chemsys}.{reduced_formula}.length" From 8580e5034667a7e871f6b0bcfb1edc9bea7524b6 Mon Sep 17 00:00:00 2001 From: Robert Forrest Date: Mon, 13 Apr 2026 17:38:01 +0100 Subject: [PATCH 2/2] test: regression test for LMDBGZSerializer re-reading the same env --- .../test_reference_dataset_serializer.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 mattergen/tests/test_reference_dataset_serializer.py diff --git a/mattergen/tests/test_reference_dataset_serializer.py b/mattergen/tests/test_reference_dataset_serializer.py new file mode 100644 index 00000000..0124d520 --- /dev/null +++ b/mattergen/tests/test_reference_dataset_serializer.py @@ -0,0 +1,32 @@ +from pathlib import Path + +from pymatgen.core import Lattice, Structure +from pymatgen.entries.computed_entries import ComputedStructureEntry + +from mattergen.evaluation.reference.reference_dataset import ReferenceDataset +from mattergen.evaluation.reference.reference_dataset_serializer import LMDBGZSerializer + + +def test_deserialize_does_not_reopen_same_lmdb_while_environment_is_active( + tmp_path: Path, +) -> None: + serializer = LMDBGZSerializer() + dataset_path = tmp_path / "reference.lmdb.gz" + entry = ComputedStructureEntry( + structure=Structure( + lattice=Lattice.cubic(3.5), + species=["Fe", "O"], + coords=[[0, 0, 0], [0.5, 0.5, 0.5]], + ), + energy=0.0, + ) + serializer.serialize( + ReferenceDataset.from_entries("reference", [entry]), + dataset_path, + ) + + reference = serializer.deserialize(dataset_path) + assert reference.name == "reference" + assert reference.impl.chemical_systems == ("Fe-O",) + assert len(reference) == 1 + reference.impl.cleanup(cleanup_dir=True)