Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions medcat-trainer/webapp/api/api/admin/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

_PROJECT_ANNO_ENTS_SETTINGS_FIELD_ORDER = (
'concept_db', 'vocab', 'model_pack', 'cdb_search_filter', 'deid_model_annotation', 'require_entity_validation', 'train_model_on_submit',
'add_new_entities', 'restrict_concept_lookup', 'terminate_available', 'irrelevant_available',
'use_model_service', 'model_service_url', 'add_new_entities', 'restrict_concept_lookup', 'terminate_available', 'irrelevant_available',
'enable_entity_annotation_comments', 'tasks', 'relations'
)

Expand Down Expand Up @@ -177,7 +177,7 @@ class ModelPackAdmin(admin.ModelAdmin):

def metacats(self, obj):
return ", ".join(str(m_c) for m_c in obj.meta_cats.all())

def save_model(self, request, obj, form, change):
obj.last_modified_by = request.user
super().save_model(request, obj, form, change)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 5.1.11 on 2025-12-11 11:47

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('api', '0092_exportedproject_cdb_search_filter_id_and_more'),
]

operations = [
migrations.AddField(
model_name='projectannotateentities',
name='model_service_url',
field=models.CharField(blank=True, help_text='URL of the remote MedCAT service API (e.g., http://medcat-service:8000)', max_length=500, null=True),
),
migrations.AddField(
model_name='projectannotateentities',
name='use_model_service',
field=models.BooleanField(default=False, help_text='Use a remote MedCAT service API for document processing instead of local models'),
),
migrations.AddField(
model_name='projectgroup',
name='model_service_url',
field=models.CharField(blank=True, help_text='URL of the remote MedCAT service API (e.g., http://medcat-service:8000)', max_length=500, null=True),
),
migrations.AddField(
model_name='projectgroup',
name='use_model_service',
field=models.BooleanField(default=False, help_text='Use a remote MedCAT service API for document processing instead of local models'),
),
]
23 changes: 16 additions & 7 deletions medcat-trainer/webapp/api/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,15 +458,24 @@ class Meta:
'if a model pack is used for the project')
relations = models.ManyToManyField('Relation', blank=True, default=None,
help_text='Relations that will be available for this project')
use_model_service = models.BooleanField(default=False,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a need for both the boolean and the URL? Couoldn't we just assume that if there's a URL, it needs to be used?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree - but in prep for when the admin - project setup is re-done, having a checkbox and some validation on a URL or some auto-generated URL etc. easier to keep explicit for now

help_text='Use a remote MedCAT service API for document processing instead of local models'\
'(note: interim model training is not supported for remote model service projects)')
model_service_url = models.CharField(max_length=500, blank=True, null=True,
help_text='URL of the remote MedCAT service API (e.g., http://medcat-service:8003)')

def save(self, *args, **kwargs):
if self.model_pack is None and (self.concept_db is None or self.vocab is None):
raise ValidationError('Must set at least the ModelPack or a Concept Database and Vocab Pair')
if self.model_pack and (self.concept_db is not None or self.vocab is not None):
raise ValidationError('Cannot set model pack and ConceptDB or a Vocab. You must use one or the other.')
if self.deid_model_annotation and self.model_pack is None:
raise ValidationError('Must set a DeID ModelPack for De-ID Model Annotation, cannot only set a cdb / vocab pair as'
' not be a DeId model')
# If using remote model service, skip local model validation
if not self.use_model_service:
if self.model_pack is None and (self.concept_db is None or self.vocab is None):
raise ValidationError('Must set at least the ModelPack or a Concept Database and Vocab Pair')
if self.model_pack and (self.concept_db is not None or self.vocab is not None):
raise ValidationError('Cannot set model pack and ConceptDB or a Vocab. You must use one or the other.')
if self.deid_model_annotation and self.model_pack is None:
raise ValidationError('Must set a DeID ModelPack for De-ID Model Annotation, cannot only set a cdb / vocab pair as'
' not be a DeId model')
elif self.use_model_service and not self.model_service_url:
raise ValidationError('When using model service, model_service_url must be set')
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: DeID flag silently ignored for remote model service

When use_model_service is True, the deid_model_annotation flag is completely ignored. The model validation skips the DeID check when using remote service, allowing users to enable both options. However, the document processing code only checks deid_model_annotation in the local model path, meaning DeID processing never occurs for remote service projects even when the flag is set. Users who configure a project expecting De-identification processing will silently get standard processing instead.

Additional Locations (2)

Fix in Cursor Fix in Web

super().save(*args, **kwargs)


Expand Down
187 changes: 149 additions & 38 deletions medcat-trainer/webapp/api/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from typing import List

import requests
from background_task import background
from django.contrib.auth.models import User
from django.db import transaction
Expand All @@ -20,6 +21,70 @@
logger = logging.getLogger('trainer')


class RemoteEntity:
"""A simple class to mimic spaCy entity structure for remote API responses."""
def __init__(self, entity_data, text):
self.cui = entity_data.get('cui', '')
self.start_char_index = entity_data.get('start', 0)
self.end_char_index = entity_data.get('end', 0)
self.text = entity_data.get('detected_name') or entity_data.get('source_value', '')
self.context_similarity = entity_data.get('context_similarity', entity_data.get('acc', 0.0))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Missing similarity score marks all annotations as deleted

When using the remote model service, RemoteEntity.context_similarity defaults to 0.0 if the API response doesn't include context_similarity or acc fields. Since the hardcoded threshold is 0.3, any entity without these fields will have ent.context_similarity < MIN_ACC evaluate to true, causing the annotation to be marked as deleted=True and validated=True. This silently marks all such annotations as deleted and validated (appearing as human-reviewed) when they should be presented for manual review.

Additional Locations (1)

Fix in Cursor Fix in Web

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

intentional, remote model service models should set this

self._meta_anns = entity_data.get('meta_anns', {})
self._text = text

def get_addon_data(self, key):
"""Mimic get_addon_data for meta_cat_meta_anns."""
if key == 'meta_cat_meta_anns':
return self._meta_anns
return {}


class RemoteSpacyDoc:
"""A simple class to mimic spaCy document structure for remote API responses."""
def __init__(self, linked_ents):
self.linked_ents = linked_ents


def call_remote_model_service(service_url, text):
"""
Call the remote MedCAT service API to process text.

Args:
service_url: Base URL of the remote service (e.g., http://medcat-service:8000)
text: Text to process

Returns:
RemoteSpacyDoc object with linked_ents
"""
# Ensure service_url doesn't end with /
service_url = service_url.rstrip('/')
api_url = f"{service_url}/api/process"

payload = {
"text": text
}

try:
response = requests.post(api_url, json=payload, timeout=60)
response.raise_for_status()
result = response.json()

# Extract entities from the response
entities_data = result.get('entities', {})
linked_ents = []

for _, entity_data in entities_data.items():
linked_ents.append(RemoteEntity(entity_data, text))

return RemoteSpacyDoc(linked_ents)
except requests.exceptions.RequestException as e:
logger.error(f"Error calling remote model service at {api_url}: {e}")
raise Exception(f"Failed to call remote model service: {str(e)}") from e
except Exception as e:
logger.error(f"Error processing remote model service response: {e}")
raise Exception(f"Failed to process remote model service response: {str(e)}") from e


def remove_annotations(document, project, partial=False):
try:
if partial:
Expand All @@ -36,7 +101,27 @@ def remove_annotations(document, project, partial=False):
logger.debug(f"Something went wrong: {e}")


def add_annotations(spacy_doc, user, project, document, existing_annotations, cat):
class SimpleFilters:
"""Simple filter object for remote service when cat is not available."""
def __init__(self, cuis=None, cuis_exclude=None):
self.cuis = cuis or set()
self.cuis_exclude = cuis_exclude or set()


def add_annotations(spacy_doc, user, project, document, existing_annotations, cat=None, filters=None, similarity_threshold=0.3):
"""
Add annotations from spacy_doc to the database.

Args:
spacy_doc: spaCy document with linked_ents or RemoteSpacyDoc
user: User object
project: ProjectAnnotateEntities object
document: Document object
existing_annotations: List of existing AnnotatedEntity objects
cat: CAT object (optional, required if filters not provided)
filters: SimpleFilters object (optional, used when cat is None)
similarity_threshold: float (optional, default 0.3, used when cat is None)
"""
spacy_doc.linked_ents.sort(key=lambda x: len(x.text), reverse=True)

tkns_in = []
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is no longer used. I suppose that's because we now DO allow entities that cover the same tokens.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure what you mean by its not used? its used views.py and utils.py

Copy link
Collaborator

@mart-r mart-r Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable defined on this line is not used anywhere else. It used to be used. But with the removal of code lower down (when iterating over the entities), this is no longer ever used.

Expand All @@ -57,9 +142,13 @@ def add_annotations(spacy_doc, user, project, document, existing_annotations, ca
metataskvals2obj = {}
pass

def check_ents(ent):
return any((ea[0] < ent.start_char_index < ea[1]) or
(ea[0] < ent.end_char_index < ea[1]) for ea in existing_annos_intervals)
# Get filters and similarity threshold
if cat is not None:
filters_obj = cat.config.components.linking.filters
MIN_ACC = cat.config.components.linking.similarity_threshold
else:
filters_obj = filters or SimpleFilters()
MIN_ACC = similarity_threshold

def check_filters(cui, filters):
if cui in filters.cuis or not filters.cuis:
Expand All @@ -68,15 +157,8 @@ def check_filters(cui, filters):
return False

for ent in spacy_doc.linked_ents:
if not check_ents(ent) and check_filters(ent.cui, cat.config.components.linking.filters):
to_add = True
for tkn in ent:
if tkn in tkns_in:
to_add = False
if to_add:
for tkn in ent:
tkns_in.append(tkn)
ents.append(ent)
if check_filters(ent.cui, filters_obj):
ents.append(ent)

logger.debug('Found %s annotations to store', len(ents))
for ent in ents:
Expand All @@ -93,6 +175,7 @@ def check_filters(cui, filters):

ann_ent = AnnotatedEntity.objects.filter(project=project,
document=document,
entity=entity,
start_ind=ent.start_char_index,
end_ind=ent.end_char_index).first()
if ann_ent is None:
Expand All @@ -107,7 +190,6 @@ def check_filters(cui, filters):
ann_ent.end_ind = ent.end_char_index
ann_ent.acc = ent.context_similarity

MIN_ACC = cat.config.components.linking.similarity_threshold
if ent.context_similarity < MIN_ACC:
ann_ent.deleted = True
ann_ent.validated = True
Expand Down Expand Up @@ -157,7 +239,7 @@ def get_create_cdb_infos(cdb, concept, cui, cui_info_prop, code_prop, desc_prop,


def create_annotation(source_val: str, selection_occurrence_index: int, cui: str, user: User,
project: ProjectAnnotateEntities, document, cat: CAT):
project: ProjectAnnotateEntities, document: Document):
text = document.text
id = None

Expand Down Expand Up @@ -251,29 +333,58 @@ def prep_docs(project_id: List[int], doc_ids: List[int], user_id: int):
project = ProjectAnnotateEntities.objects.get(id=project_id)
docs = Document.objects.filter(id__in=doc_ids)

logger.info('Loading CAT object in bg process for project: %s', project.id)
cat = get_medcat(project=project)

# Set CAT filters
cat.config.components.linking.filters.cuis = project.cuis

for doc in docs:
logger.info(f'Running MedCAT model for project {project.id}:{project.name} over doc: {doc.id}')
if not project.deid_model_annotation:
spacy_doc = cat(doc.text)
else:
deid = DeIdModel(cat)
spacy_doc = deid(doc.text)
anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project)
with transaction.atomic():
add_annotations(spacy_doc=spacy_doc,
user=user,
project=project,
document=doc,
cat=cat,
existing_annotations=anns)
# add doc to prepared_documents
project.prepared_documents.add(doc)
# Get CUI filters
cuis = set()
if project.cuis is not None and project.cuis:
cuis = set([str(cui).strip() for cui in project.cuis.split(",")])
if project.cuis_file is not None and project.cuis_file:
try:
cuis.update(json.load(open(project.cuis_file.path)))
except FileNotFoundError:
logger.warning('Missing CUI filter file for project %s', project.id)

if project.use_model_service:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only 2 differences between these 2 if clauses seem to be:

  • Getting the spacy_doc
  • Passing different variables to add_annotations

Perhaps we can avoid some of the code duplication by:

  • Using a functools.partial on add_annotations to pass required / changed kwargs
  • Using a callable for getting the spacy_doc
  • Change the logged message to be dynamic
  • Running everything else just once
    • While using the partial and the callable

Just a nice to have, really.

# Use remote model service
logger.info('Using remote model service in bg process for project: %s', project.id)
filters = SimpleFilters(cuis=cuis)
for doc in docs:
logger.info('Running remote MedCAT service for project %s:%s over doc: %s', project.id, project.name, doc.id)
spacy_doc = call_remote_model_service(project.model_service_url, doc.text)
anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project)
with transaction.atomic():
add_annotations(spacy_doc=spacy_doc,
user=user,
project=project,
document=doc,
cat=None,
filters=filters,
similarity_threshold=0.3,
existing_annotations=anns)
project.prepared_documents.add(doc)
else:
# Use local medcat model
logger.info('Loading CAT object in bg process for project: %s', project.id)
cat = get_medcat(project=project)

# Set CAT filters
cat.config.components.linking.filters.cuis = cuis

for doc in docs:
logger.info('Running MedCAT model for project %s:%s over doc: %s', project.id, project.name, doc.id)
if not project.deid_model_annotation:
spacy_doc = cat(doc.text)
else:
deid = DeIdModel(cat)
spacy_doc = deid(doc.text)
anns = AnnotatedEntity.objects.filter(document=doc).filter(project=project)
with transaction.atomic():
add_annotations(spacy_doc=spacy_doc,
user=user,
project=project,
document=doc,
cat=cat,
existing_annotations=anns)
project.prepared_documents.add(doc)
project.save()
logger.info('Prepared all docs for project: %s, docs processed: %s',
project.id, project.prepared_documents)
Expand Down
Loading