diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b369103..851581f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,6 +33,7 @@ jobs: pytest --nbmake -n=auto --nbmake-kernel=smoketests --nbmake-timeout=1800 ./notebooks/introductory/basic/2*.ipynb pytest --nbmake -n=auto --nbmake-kernel=smoketests --nbmake-timeout=1800 ./notebooks/introductory/basic/3*.ipynb pytest --nbmake -n=auto --nbmake-kernel=smoketests --nbmake-timeout=1800 ./notebooks/introductory/meta/*.ipynb + pytest --nbmake -n=auto --nbmake-kernel=smoketests --nbmake-timeout=1800 ./notebooks/introductory/custom/*.ipynb migrate-and-advanved: diff --git a/notebooks/introductory/custom/1._Create_Modelpack_with_2step_linker.ipynb b/notebooks/introductory/custom/1._Create_Modelpack_with_2step_linker.ipynb new file mode 100644 index 0000000..2023e1b --- /dev/null +++ b/notebooks/introductory/custom/1._Create_Modelpack_with_2step_linker.ipynb @@ -0,0 +1,270 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a99ddc26", + "metadata": {}, + "source": [ + "# Two-step linker\n", + "\n", + "The two-step linker is a linker that attempt to learn and use type information for further context.\n", + "It essentially uses the same context manager as the regular linker, but only for types.\n", + "Since there's generally far fewer types, and they're all quite different, classification between them should be easier.\n", + "This should help with misidentifying wrong types of concepts that share the same name (e.g _infusion (procedure)_ vs _infusion (method of drug admiinstartion)_).\n", + "\n", + "Normally, changing the linker would mean you'd retrain the model.\n", + "That's because the new linker needs to be retrained.\n", + "However, if the train counts are correct, we should be able to infer the per-type traininng from existing per-concept trainining." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4a571a00", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Existing concept (and types), their names, and corrresponding training:\n", + "73211009 (set()) \t Diabetes Mellitus Diagnosed \t 2\n", + "396230008 (set()) \t Wagner Unverricht Syndrome \t 1\n", + "44132006 (set()) \t Abscess \t 2\n", + "128477000 (set()) \t Abscesses \t 2\n" + ] + } + ], + "source": [ + "# let's reuse the supervised trained model, but we'll change the linker\n", + "from medcat.cat import CAT\n", + "cat = CAT.load_model_pack(\"../basic/models/sup_trained_model.zip\")\n", + "print(\"Existing concept (and types), their names, and corrresponding training:\")\n", + "for tui, ci in cat.cdb.cui2info.items():\n", + " print(f\"{tui:16s} ({ci['type_ids']}) \\t {cat.cdb.get_name(tui):24s} \\t {ci['count_train']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "20905c11", + "metadata": {}, + "source": [ + "We notice that the current concepts don't have type IDs.\n", + "So we have to add them before we can use 2-step linking.\n", + "Let's just devide them into two types:\n", + "- Ones that start with A\n", + "- Ones that don't start with A\n", + "This is an arbitrary distinction, but should work for our example." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "68594a45", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Existing concept (and types), their names, and corrresponding training:\n", + "73211009 ({'NA'}) \t Diabetes Mellitus Diagnosed \t 2\n", + "396230008 ({'NA'}) \t Wagner Unverricht Syndrome \t 1\n", + "44132006 ({'SA'}) \t Abscess \t 2\n", + "128477000 ({'SA'}) \t Abscesses \t 2\n" + ] + } + ], + "source": [ + "from medcat.cdb.concepts import TypeInfo\n", + "cat.cdb.type_id2info.update({\n", + " \"SA\": TypeInfo(\n", + " type_id=\"SA\",\n", + " name=\"Starts with A\",\n", + " cuis=[cui for cui in cat.cdb.cui2info if cat.cdb.get_name(cui).startswith(\"A\")],\n", + " ),\n", + " \"NA\": TypeInfo(\n", + " type_id=\"NA\",\n", + " name=\"Does not start with A\",\n", + " cuis=[cui for cui in cat.cdb.cui2info if not cat.cdb.get_name(cui).startswith(\"A\")],\n", + " ),\n", + "})\n", + "# NOTE: we also need to update the concept info, so that the new types are included\n", + "for tui, ci in cat.cdb.cui2info.items():\n", + " for ti in cat.cdb.type_id2info.values():\n", + " if tui in ti.cuis:\n", + " ci[\"type_ids\"].add(ti.type_id)\n", + "\n", + "print(\"Existing concept (and types), their names, and corrresponding training:\")\n", + "for tui, ci in cat.cdb.cui2info.items():\n", + " print(f\"{tui:16s} ({ci['type_ids']}) \\t {cat.cdb.get_name(tui):24s} \\t {ci['count_train']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c0a19671", + "metadata": {}, + "source": [ + "Now we've got the necessary information to use a 2-step linker.\n", + "We just need to get the context vectors from the existing training." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6cf1577a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Existing concept (and types), their names, and corrresponding training:\n", + "73211009 ({'NA'}) \t Diabetes Mellitus Diagnosed \t 2\n", + "396230008 ({'NA'}) \t Wagner Unverricht Syndrome \t 1\n", + "44132006 ({'SA'}) \t Abscess \t 2\n", + "128477000 ({'SA'}) \t Abscesses \t 2\n", + "TYPE_ID:SA (set()) \t Starts with A \t 4\n", + "TYPE_ID:NA (set()) \t Does not start with A \t 3\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from medcat.components.linking.two_step_context_based_linker import TYPE_ID_PREFIX\n", + "# change linking components\n", + "cat.config.components.linking.comp_name = 'medcat2_two_step_linker'\n", + "# recreate pipe with 2-step linker, this won't be automatic\n", + "cat._recreate_pipe()\n", + "# update TypeID counts and context vectors\n", + "def get_context_vectors(tui: str) -> tuple[dict[str, np.ndarray | None], int]:\n", + " cvs = [((ci := cat.cdb.cui2info[cui])['context_vectors'], ci['count_train'])\n", + " for cui in cat.cdb.type_id2info[tui].cuis]\n", + " # print(\"FIRST CSV\", cvs[0][0]['xlong'].shape)\n", + " vec_types = set(\n", + " vt\n", + " for ccvs, _ in cvs if ccvs\n", + " for vt in ccvs)\n", + " out_vec: dict[str, np.ndarray | None] = {}\n", + " max_train = 0\n", + " for vtype in vec_types:\n", + " vecs: list[np.ndarray] = []\n", + " counts: list[int] = []\n", + " for ccvs, count in cvs:\n", + " if vtype in ccvs:\n", + " vecs.append(ccvs[vtype])\n", + " counts.append(count)\n", + " if vecs and counts:\n", + " out_vec[vtype] = np.sum([c * v for c, v in zip(counts, vecs)], axis=0) / sum(counts)\n", + " max_train = max(max_train, sum(counts))\n", + " return out_vec, max_train\n", + "\n", + "for tui, type_ci in cat.cdb.cui2info.items():\n", + " if not tui.startswith(TYPE_ID_PREFIX):\n", + " # ignore actual/regular CUIs\n", + " continue\n", + " type_ci['context_vectors'], type_ci['count_train'] = get_context_vectors(tui.removeprefix(TYPE_ID_PREFIX))\n", + " # print(\"Set CV\", type_ci['context_vectors'], type_ci['context_vectors']['xlong'].shape)\n", + "print(\"Existing concept (and types), their names, and corrresponding training:\")\n", + "for tui, ci in cat.cdb.cui2info.items():\n", + " print(f\"{tui:16s} ({ci['type_ids']}) \\t {cat.cdb.get_name(tui):24s} \\t {ci['count_train']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e54063d6", + "metadata": {}, + "source": [ + "NB: Ideally this is where you would start if you had done the entire process of model creation and training (e.g basics/1 through 3) with a 2-step linker approach.\n", + "\n", + "Now let's explore the model's output.\n", + "To tell the difference (since it's all happening under the hood), we need to enable logging." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c4febc81", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Narrowing down candidates for: 'abscess' from ['44132006', '128477000']\n", + "Adding per CUI to 128477000 (tokens 5..5) weights {'44132006': 0.7396468208767506, '128477000': 0.7396468208767506}\n", + "Linker started with entity: abscess\n", + "Mixing type similarity of 0.9929 and CUI similarity of 0.7396 with 0.7682 weight for CUI similarity\n", + "[Per CUI weights] CUI: 44132006, Name: abscess, Old sim: 0.993, New sim: 0.798\n", + "Mixing type similarity of 0.1344 and CUI similarity of 0.7396 with 0.7682 weight for CUI similarity\n", + "[Per CUI weights] CUI: 128477000, Name: abscess, Old sim: 0.134, New sim: 0.599\n", + "Considering CUI 44132006 with sim 0.798340\n", + "Narrowing down candidates for: 'abscess' from ['44132006', '128477000']\n", + "Adding per CUI to 128477000 (tokens 1..1) weights {'44132006': 0.4386427664494404, '128477000': 0.4386427664494404}\n", + "Linker started with entity: abscess\n", + "Mixing type similarity of 0.2137 and CUI similarity of 0.4386 with 0.4239 weight for CUI similarity\n", + "[Per CUI weights] CUI: 44132006, Name: abscess, Old sim: 0.214, New sim: 0.309\n", + "Mixing type similarity of 0.4625 and CUI similarity of 0.4386 with 0.4239 weight for CUI similarity\n", + "[Per CUI weights] CUI: 128477000, Name: abscess, Old sim: 0.462, New sim: 0.452\n", + "Considering CUI 128477000 with sim 0.452367\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 : {0: {'pretty_name': 'Abscess', 'cui': '44132006', 'type_ids': ['SA'], 'source_value': 'abscess', 'detected_name': 'abscess', 'acc': 0.7983395419982506, 'context_similarity': 0.7983395419982506, 'start': 43, 'end': 50, 'id': 0, 'meta_anns': {}, 'context_left': [], 'context_center': [], 'context_right': []}}\n", + "1 : {0: {'pretty_name': 'Abscesses', 'cui': '128477000', 'type_ids': ['SA'], 'source_value': 'abscess', 'detected_name': 'abscess', 'acc': 0.45236695339580785, 'context_similarity': 0.45236695339580785, 'start': 3, 'end': 10, 'id': 0, 'meta_anns': {}, 'context_left': [], 'context_center': [], 'context_right': []}}\n" + ] + } + ], + "source": [ + "abscess_text_morph = \"\"\"Histopathology reveals a well-encapsulated abscess with central necrosis and neutrophilic infiltration.\"\"\"\n", + "abscess_text_disorder = \"\"\"An abscess is a disorder, which is a clinical condition characterized by the formation of a painful and inflamed mass containing purulent material\"\"\"\n", + "\n", + "# enable logging\n", + "import logging\n", + "from medcat.components.linking.two_step_context_based_linker import logger as tsl_l\n", + "from medcat.components.linking.context_based_linker import logger as cbl_l\n", + "sh = logging.StreamHandler()\n", + "for l in [tsl_l, cbl_l]:\n", + " if not l.handlers:\n", + " l.addHandler(sh)\n", + " l.setLevel(logging.DEBUG)\n", + "\n", + "for text_num, text in enumerate([abscess_text_morph, abscess_text_disorder]):\n", + " ents = cat.get_entities(text)['entities']\n", + " print(text_num, \":\", ents)" + ] + }, + { + "cell_type": "markdown", + "id": "70ac5fb9", + "metadata": {}, + "source": [ + "As we saw above in the log, some extra steps were done to mix in the type context to the CUI context." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/introductory/custom/README.md b/notebooks/introductory/custom/README.md new file mode 100644 index 0000000..397f3ae --- /dev/null +++ b/notebooks/introductory/custom/README.md @@ -0,0 +1,2 @@ +This section uses more custom (non-default) components. +But the components are still a part of the core release. \ No newline at end of file