-
Notifications
You must be signed in to change notification settings - Fork 10
Remote model service option, and frontend fixes for overlapping, multiple annos per spans #264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| # Generated by Django 5.1.11 on 2025-12-11 11:47 | ||
|
|
||
| from django.db import migrations, models | ||
|
|
||
|
|
||
| class Migration(migrations.Migration): | ||
|
|
||
| dependencies = [ | ||
| ('api', '0092_exportedproject_cdb_search_filter_id_and_more'), | ||
| ] | ||
|
|
||
| operations = [ | ||
| migrations.AddField( | ||
| model_name='projectannotateentities', | ||
| name='model_service_url', | ||
| field=models.CharField(blank=True, help_text='URL of the remote MedCAT service API (e.g., http://medcat-service:8000)', max_length=500, null=True), | ||
| ), | ||
| migrations.AddField( | ||
| model_name='projectannotateentities', | ||
| name='use_model_service', | ||
| field=models.BooleanField(default=False, help_text='Use a remote MedCAT service API for document processing instead of local models'), | ||
| ), | ||
| migrations.AddField( | ||
| model_name='projectgroup', | ||
| name='model_service_url', | ||
| field=models.CharField(blank=True, help_text='URL of the remote MedCAT service API (e.g., http://medcat-service:8000)', max_length=500, null=True), | ||
| ), | ||
| migrations.AddField( | ||
| model_name='projectgroup', | ||
| name='use_model_service', | ||
| field=models.BooleanField(default=False, help_text='Use a remote MedCAT service API for document processing instead of local models'), | ||
| ), | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -458,15 +458,24 @@ class Meta: | |
| 'if a model pack is used for the project') | ||
| relations = models.ManyToManyField('Relation', blank=True, default=None, | ||
| help_text='Relations that will be available for this project') | ||
| use_model_service = models.BooleanField(default=False, | ||
| help_text='Use a remote MedCAT service API for document processing instead of local models'\ | ||
| '(note: interim model training is not supported for remote model service projects)') | ||
| model_service_url = models.CharField(max_length=500, blank=True, null=True, | ||
| help_text='URL of the remote MedCAT service API (e.g., http://medcat-service:8003)') | ||
|
|
||
| def save(self, *args, **kwargs): | ||
| if self.model_pack is None and (self.concept_db is None or self.vocab is None): | ||
| raise ValidationError('Must set at least the ModelPack or a Concept Database and Vocab Pair') | ||
| if self.model_pack and (self.concept_db is not None or self.vocab is not None): | ||
| raise ValidationError('Cannot set model pack and ConceptDB or a Vocab. You must use one or the other.') | ||
| if self.deid_model_annotation and self.model_pack is None: | ||
| raise ValidationError('Must set a DeID ModelPack for De-ID Model Annotation, cannot only set a cdb / vocab pair as' | ||
| ' not be a DeId model') | ||
| # If using remote model service, skip local model validation | ||
| if not self.use_model_service: | ||
| if self.model_pack is None and (self.concept_db is None or self.vocab is None): | ||
| raise ValidationError('Must set at least the ModelPack or a Concept Database and Vocab Pair') | ||
| if self.model_pack and (self.concept_db is not None or self.vocab is not None): | ||
| raise ValidationError('Cannot set model pack and ConceptDB or a Vocab. You must use one or the other.') | ||
| if self.deid_model_annotation and self.model_pack is None: | ||
| raise ValidationError('Must set a DeID ModelPack for De-ID Model Annotation, cannot only set a cdb / vocab pair as' | ||
| ' not be a DeId model') | ||
| elif self.use_model_service and not self.model_service_url: | ||
| raise ValidationError('When using model service, model_service_url must be set') | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: DeID flag silently ignored for remote model serviceWhen Additional Locations (2) |
||
| super().save(*args, **kwargs) | ||
|
|
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| import os | ||
| from typing import List | ||
|
|
||
| import requests | ||
| from background_task import background | ||
| from django.contrib.auth.models import User | ||
| from django.db import transaction | ||
|
|
@@ -20,6 +21,70 @@ | |
| logger = logging.getLogger('trainer') | ||
|
|
||
|
|
||
| class RemoteEntity: | ||
| """A simple class to mimic spaCy entity structure for remote API responses.""" | ||
| def __init__(self, entity_data, text): | ||
| self.cui = entity_data.get('cui', '') | ||
| self.start_char_index = entity_data.get('start', 0) | ||
| self.end_char_index = entity_data.get('end', 0) | ||
| self.text = entity_data.get('detected_name') or entity_data.get('source_value', '') | ||
| self.context_similarity = entity_data.get('context_similarity', entity_data.get('acc', 0.0)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Missing similarity score marks all annotations as deletedWhen using the remote model service, Additional Locations (1)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. intentional, remote model service models should set this |
||
| self._meta_anns = entity_data.get('meta_anns', {}) | ||
| self._text = text | ||
|
|
||
| def get_addon_data(self, key): | ||
| """Mimic get_addon_data for meta_cat_meta_anns.""" | ||
| if key == 'meta_cat_meta_anns': | ||
| return self._meta_anns | ||
| return {} | ||
|
|
||
|
|
||
| class RemoteSpacyDoc: | ||
| """A simple class to mimic spaCy document structure for remote API responses.""" | ||
| def __init__(self, linked_ents): | ||
| self.linked_ents = linked_ents | ||
|
|
||
|
|
||
| def call_remote_model_service(service_url, text): | ||
| """ | ||
| Call the remote MedCAT service API to process text. | ||
|
|
||
| Args: | ||
| service_url: Base URL of the remote service (e.g., http://medcat-service:8000) | ||
| text: Text to process | ||
|
|
||
| Returns: | ||
| RemoteSpacyDoc object with linked_ents | ||
| """ | ||
| # Ensure service_url doesn't end with / | ||
| service_url = service_url.rstrip('/') | ||
| api_url = f"{service_url}/api/process" | ||
|
|
||
| payload = { | ||
| "text": text | ||
| } | ||
|
|
||
| try: | ||
| response = requests.post(api_url, json=payload, timeout=60) | ||
| response.raise_for_status() | ||
| result = response.json() | ||
|
|
||
| # Extract entities from the response | ||
| entities_data = result.get('entities', {}) | ||
| linked_ents = [] | ||
|
|
||
| for _, entity_data in entities_data.items(): | ||
| linked_ents.append(RemoteEntity(entity_data, text)) | ||
|
|
||
| return RemoteSpacyDoc(linked_ents) | ||
| except requests.exceptions.RequestException as e: | ||
| logger.error(f"Error calling remote model service at {api_url}: {e}") | ||
| raise Exception(f"Failed to call remote model service: {str(e)}") from e | ||
| except Exception as e: | ||
| logger.error(f"Error processing remote model service response: {e}") | ||
| raise Exception(f"Failed to process remote model service response: {str(e)}") from e | ||
|
|
||
|
|
||
| def remove_annotations(document, project, partial=False): | ||
| try: | ||
| if partial: | ||
|
|
@@ -36,7 +101,27 @@ def remove_annotations(document, project, partial=False): | |
| logger.debug(f"Something went wrong: {e}") | ||
|
|
||
|
|
||
| def add_annotations(spacy_doc, user, project, document, existing_annotations, cat): | ||
| class SimpleFilters: | ||
| """Simple filter object for remote service when cat is not available.""" | ||
| def __init__(self, cuis=None, cuis_exclude=None): | ||
| self.cuis = cuis or set() | ||
| self.cuis_exclude = cuis_exclude or set() | ||
|
|
||
|
|
||
| def add_annotations(spacy_doc, user, project, document, existing_annotations, cat=None, filters=None, similarity_threshold=0.3): | ||
| """ | ||
| Add annotations from spacy_doc to the database. | ||
|
|
||
| Args: | ||
| spacy_doc: spaCy document with linked_ents or RemoteSpacyDoc | ||
| user: User object | ||
| project: ProjectAnnotateEntities object | ||
| document: Document object | ||
| existing_annotations: List of existing AnnotatedEntity objects | ||
| cat: CAT object (optional, required if filters not provided) | ||
| filters: SimpleFilters object (optional, used when cat is None) | ||
| similarity_threshold: float (optional, default 0.3, used when cat is None) | ||
| """ | ||
| spacy_doc.linked_ents.sort(key=lambda x: len(x.text), reverse=True) | ||
|
|
||
| tkns_in = [] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is no longer used. I suppose that's because we now DO allow entities that cover the same tokens.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure what you mean by its not used? its used
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The variable defined on this line is not used anywhere else. It used to be used. But with the removal of code lower down (when iterating over the entities), this is no longer ever used. |
||
|
|
@@ -57,9 +142,13 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca | |
| metataskvals2obj = {} | ||
| pass | ||
|
|
||
| def check_ents(ent): | ||
| return any((ea[0] < ent.start_char_index < ea[1]) or | ||
| (ea[0] < ent.end_char_index < ea[1]) for ea in existing_annos_intervals) | ||
| # Get filters and similarity threshold | ||
| if cat is not None: | ||
| filters_obj = cat.config.components.linking.filters | ||
| MIN_ACC = cat.config.components.linking.similarity_threshold | ||
| else: | ||
| filters_obj = filters or SimpleFilters() | ||
| MIN_ACC = similarity_threshold | ||
|
|
||
| def check_filters(cui, filters): | ||
| if cui in filters.cuis or not filters.cuis: | ||
|
|
@@ -68,15 +157,8 @@ def check_filters(cui, filters): | |
| return False | ||
|
|
||
| for ent in spacy_doc.linked_ents: | ||
| if not check_ents(ent) and check_filters(ent.cui, cat.config.components.linking.filters): | ||
| to_add = True | ||
| for tkn in ent: | ||
| if tkn in tkns_in: | ||
| to_add = False | ||
| if to_add: | ||
| for tkn in ent: | ||
| tkns_in.append(tkn) | ||
| ents.append(ent) | ||
| if check_filters(ent.cui, filters_obj): | ||
| ents.append(ent) | ||
|
|
||
| logger.debug('Found %s annotations to store', len(ents)) | ||
| for ent in ents: | ||
|
|
@@ -93,6 +175,7 @@ def check_filters(cui, filters): | |
|
|
||
| ann_ent = AnnotatedEntity.objects.filter(project=project, | ||
| document=document, | ||
| entity=entity, | ||
| start_ind=ent.start_char_index, | ||
| end_ind=ent.end_char_index).first() | ||
| if ann_ent is None: | ||
|
|
@@ -107,7 +190,6 @@ def check_filters(cui, filters): | |
| ann_ent.end_ind = ent.end_char_index | ||
| ann_ent.acc = ent.context_similarity | ||
|
|
||
| MIN_ACC = cat.config.components.linking.similarity_threshold | ||
| if ent.context_similarity < MIN_ACC: | ||
| ann_ent.deleted = True | ||
| ann_ent.validated = True | ||
|
|
@@ -157,7 +239,7 @@ def get_create_cdb_infos(cdb, concept, cui, cui_info_prop, code_prop, desc_prop, | |
|
|
||
|
|
||
| def create_annotation(source_val: str, selection_occurrence_index: int, cui: str, user: User, | ||
| project: ProjectAnnotateEntities, document, cat: CAT): | ||
| project: ProjectAnnotateEntities, document: Document): | ||
| text = document.text | ||
| id = None | ||
|
|
||
|
|
@@ -251,29 +333,58 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int): | |
| project = ProjectAnnotateEntities.objects.get(id=project_id) | ||
| docs = Document.objects.filter(id__in=doc_ids) | ||
|
|
||
| logger.info('Loading CAT object in bg process for project: %s', project.id) | ||
| cat = get_medcat(project=project) | ||
|
|
||
| # Set CAT filters | ||
| cat.config.components.linking.filters.cuis = project.cuis | ||
|
|
||
| for doc in docs: | ||
| logger.info(f'Running MedCAT model for project {project.id}:{project.name} over doc: {doc.id}') | ||
| if not project.deid_model_annotation: | ||
| spacy_doc = cat(doc.text) | ||
| else: | ||
| deid = DeIdModel(cat) | ||
| spacy_doc = deid(doc.text) | ||
| anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project) | ||
| with transaction.atomic(): | ||
| add_annotations(spacy_doc=spacy_doc, | ||
| user=user, | ||
| project=project, | ||
| document=doc, | ||
| cat=cat, | ||
| existing_annotations=anns) | ||
| # add doc to prepared_documents | ||
| project.prepared_documents.add(doc) | ||
| # Get CUI filters | ||
| cuis = set() | ||
| if project.cuis is not None and project.cuis: | ||
| cuis = set([str(cui).strip() for cui in project.cuis.split(",")]) | ||
| if project.cuis_file is not None and project.cuis_file: | ||
| try: | ||
| cuis.update(json.load(open(project.cuis_file.path))) | ||
| except FileNotFoundError: | ||
| logger.warning('Missing CUI filter file for project %s', project.id) | ||
|
|
||
| if project.use_model_service: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only 2 differences between these 2 if clauses seem to be:
Perhaps we can avoid some of the code duplication by:
Just a nice to have, really. |
||
| # Use remote model service | ||
| logger.info('Using remote model service in bg process for project: %s', project.id) | ||
| filters = SimpleFilters(cuis=cuis) | ||
| for doc in docs: | ||
| logger.info('Running remote MedCAT service for project %s:%s over doc: %s', project.id, project.name, doc.id) | ||
| spacy_doc = call_remote_model_service(project.model_service_url, doc.text) | ||
| anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project) | ||
| with transaction.atomic(): | ||
| add_annotations(spacy_doc=spacy_doc, | ||
| user=user, | ||
| project=project, | ||
| document=doc, | ||
| cat=None, | ||
| filters=filters, | ||
| similarity_threshold=0.3, | ||
| existing_annotations=anns) | ||
| project.prepared_documents.add(doc) | ||
| else: | ||
| # Use local medcat model | ||
| logger.info('Loading CAT object in bg process for project: %s', project.id) | ||
| cat = get_medcat(project=project) | ||
|
|
||
| # Set CAT filters | ||
| cat.config.components.linking.filters.cuis = cuis | ||
|
|
||
| for doc in docs: | ||
| logger.info('Running MedCAT model for project %s:%s over doc: %s', project.id, project.name, doc.id) | ||
| if not project.deid_model_annotation: | ||
| spacy_doc = cat(doc.text) | ||
| else: | ||
| deid = DeIdModel(cat) | ||
| spacy_doc = deid(doc.text) | ||
| anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project) | ||
| with transaction.atomic(): | ||
| add_annotations(spacy_doc=spacy_doc, | ||
| user=user, | ||
| project=project, | ||
| document=doc, | ||
| cat=cat, | ||
| existing_annotations=anns) | ||
| project.prepared_documents.add(doc) | ||
| project.save() | ||
| logger.info('Prepared all docs for project: %s, docs processed: %s', | ||
| project.id, project.prepared_documents) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a need for both the boolean and the URL? Couoldn't we just assume that if there's a URL, it needs to be used?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
agree - but in prep for when the admin - project setup is re-done, having a checkbox and some validation on a URL or some auto-generated URL etc. easier to keep explicit for now