From c6b298dd23471e2251578db1adc6e441a13b34f4 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 28 Mar 2023 07:34:49 +0200 Subject: [PATCH 1/2] docs: update contribution guide to reflect schema changes --- CONTRIBUTING.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ca90ec2c3..1c592e61a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -51,7 +51,7 @@ targets: - id: Solubility # name of the column in a tabular dataset description: Experimental aqueous solubility value (LogS) # description of what this column means units: log(mol/L) # units of the values in this column (leave empty if unitless) - type: continuous # can be "categorical", "ordinal", "continuous" + type: continuous # can be "categorical", "ordinal", "continuous", "boolean" names: # names for the property (to sample from for building the prompts) - solubility - water solubility @@ -63,9 +63,13 @@ targets: - solubility - water solubility - solubility in water +benchmarks: # lists all benchmarks this dataset has been part of. 
split_column is a column in this dataframe with the value "train", "valid", "test" - indicating to which fold a specific entry belongs + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - id: InChI # column name - type: InChI # can be "SMILES", "SELFIES", "IUPAC", "OTHER" + type: InChI # can be "SMILES", "SELFIES", "IUPAC", "Other", "InChI", "InChIKey", see IdentifierEnum description: International Chemical Identifier # description (optional, except for "OTHER") license: CC0 1.0 # license under which the original dataset was published num_points: 10000 # number of datapoints in this dataset From 3e1481d10f7202dad090719242a5587c6baf2b25 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 28 Mar 2023 07:56:49 +0200 Subject: [PATCH 2/2] feat: add rxnsmiles as identifier --- CONTRIBUTING.md | 12 ++++++++++-- src/chemnlp/data_val/model.py | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1c592e61a..c2cb22fb8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -69,7 +69,7 @@ benchmarks: # lists all benchmarks this dataset has been part of. split_column i split_column: split identifiers: - id: InChI # column name - type: InChI # can be "SMILES", "SELFIES", "IUPAC", "Other", "InChI", "InChIKey", see IdentifierEnum + type: InChI # can be "SMILES", "SELFIES", "IUPAC", "Other", "InChI", "InChIKey", "RXNSMILES", "RXNSMILESWAdd", see IdentifierEnum description: International Chemical Identifier # description (optional, except for "OTHER") license: CC0 1.0 # license under which the original dataset was published num_points: 10000 # number of datapoints in this dataset @@ -147,8 +147,16 @@ Please indicate this in the `meta.yaml` under the field `split_col`. #### Identifiers -We ask you to add `uris` and `pubchem_aids` in case you find suitable references. +We ask you to add `uris` and `pubchem_aids` in case you find suitable references. 
We distinguish certain types of identifiers, for which you have to specify the correct strings. The currently allowed types are in the `IdentifierEnum` in `src/chemnlp/data_val/model.py`: +- `SMILES`: Use the canonical form ([RDKit](https://www.rdkit.org/docs/GettingStartedInPython.html)) +- `SELFIES`: [Self-referencing embedded strings](https://github.com/aspuru-guzik-group/selfies) +- `IUPAC`: IUPAC name; do not use it for non-standard, common names +- `InChI` +- `InChIKey`: The key derived from the `InChI` +- `RXNSMILES`: The [reaction SMILES](https://www.daylight.com/meetings/summerschool98/course/dave/smiles-react.html) containing only educt and product +- `RXNSMILESWAdd`: The reaction SMILES also containing solvent and additives +- `Other`: For all other identifiers ##### Uniform Resource Identifiers (URIs) diff --git a/src/chemnlp/data_val/model.py b/src/chemnlp/data_val/model.py index f9ed7996d..acb1aa372 100644 --- a/src/chemnlp/data_val/model.py +++ b/src/chemnlp/data_val/model.py @@ -15,6 +15,11 @@ class IdentifierEnum(YamlStrEnum): inchi = "InChI" inchikey = "InChIKey" other = "Other" + # we distinguish two RXN-SMILES variants. + # the simple one only includes educt and product + # the other one (rxnsmilesWAdd) also includes solvents etc. + rxnsmiles = "RXNSMILES" + rxnsmilesWAdd = "RXNSMILESWAdd" class Identifier(YamlModel):