Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions mmif/serialize/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,29 +27,36 @@ class Annotation(FreezableMmifObject):
"""

def __init__(self, anno_obj: Union[bytes, str, dict] = None) -> None:
self._type: Union[str, ThingTypesBase] = ''
self._type: ThingTypesBase = ThingTypesBase('')
if not hasattr(self, 'properties'): # don't overwrite DocumentProperties on super() call
self.properties: AnnotationProperties = AnnotationProperties()
self._attribute_classes = pmap({'properties': AnnotationProperties})
self.disallow_additional_properties()
self._required_attributes = pvector(["_type", "properties"])
super().__init__(anno_obj)

def is_type(self, type: Union[str, ThingTypesBase]) -> bool:

def _deserialize(self, input_dict: dict) -> None:
self.at_type = input_dict.pop('_type')
super()._deserialize(input_dict)

def is_type(self, at_type: Union[str, ThingTypesBase]) -> bool:
"""
Check if the @type of this object matches.
"""
return str(self.at_type) == str(type)
return self.at_type == at_type

@property
def at_type(self) -> Union[str, ThingTypesBase]:
def at_type(self) -> ThingTypesBase:
# TODO (krim @ 8/19/20): should we always return string? leaving this to return
# different types can be confusing for sdk users.
return self._type

@at_type.setter
def at_type(self, at_type: Union[str, ThingTypesBase]) -> None:
self._type = at_type
if isinstance(at_type, str):
self._type = ThingTypesBase.from_str(at_type)
else:
self._type = at_type

@property
def id(self) -> str:
Expand Down Expand Up @@ -85,7 +92,7 @@ def add_property(self, name: str,
f"(\"{name}\": \"{str(value)}\"")

def is_document(self):
return self.at_type.endswith("Document")
return isinstance(self.at_type, DocumentTypesBase)


class Document(Annotation):
Expand Down
20 changes: 12 additions & 8 deletions mmif/serialize/mmif.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,8 @@ def get_alignments(self, at_type1: Union[str, ThingTypesBase], at_type2: Union[s
:return: a dict that keyed by view IDs (str) and has lists of alignment Annotation objects as values.
"""
v_and_a = {}
# at_type1 = ThingTypesBase.from_str(at_type1) if isinstance(at_type1, str) else at_type1
# at_type2 = ThingTypesBase.from_str(at_type2) if isinstance(at_type2, str) else at_type2
for alignment_view in self.get_all_views_contain(AnnotationTypes.Alignment):
alignments = []
# TODO (krim @ 11/7/20): maybe Alignment can have metadata on what types are aligned?
Expand All @@ -274,10 +276,12 @@ def get_alignments(self, at_type1: Union[str, ThingTypesBase], at_type2: Union[s
ann_id = cast(str, ann_id)
if ':' in ann_id:
view_id, ann_id = ann_id.split(':')
aligned_types.add(str(cast(Annotation, self[view_id][ann_id]).at_type))
aligned_type = cast(Annotation, self[view_id][ann_id]).at_type
else:
aligned_types.add(str(cast(Annotation, alignment_view[ann_id]).at_type))
if str(at_type1) in aligned_types and str(at_type2) in aligned_types:
aligned_type = cast(Annotation, alignment_view[ann_id]).at_type
aligned_types.add(aligned_type)
aligned_types = list(aligned_types) # because membership check for sets also checks hash() values
if at_type1 in aligned_types and at_type2 in aligned_types:
alignments.append(alignment)
if len(alignments) > 0:
v_and_a[alignment_view.id] = alignments
Expand Down Expand Up @@ -319,9 +323,9 @@ def get_all_views_contain(self, at_types: Union[ThingTypesBase, str, List[Union[
"""
if isinstance(at_types, list):
return [view for view in self.views
if all(map(lambda x: str(x) in view.metadata.contains, at_types))]
if all(map(lambda x: x in view.metadata.contains, at_types))]
else:
return [view for view in self.views if str(at_types) in view.metadata.contains]
return [view for view in self.views if at_types in view.metadata.contains]

def get_views_contain(self, at_types: Union[ThingTypesBase, str, List[Union[str, ThingTypesBase]]]) -> List[View]:
"""
Expand All @@ -340,11 +344,11 @@ def get_view_contains(self, at_types: Union[ThingTypesBase, str, List[Union[str,
# will return the *latest* view
# works as of python 3.6+ (checked by setup.py) because dicts are deterministically ordered by insertion order
for view in reversed(self.views):
if isinstance(at_types, str) or isinstance(at_types, ThingTypesBase):
if str(at_types) in view.metadata.contains:
if isinstance(at_types, list):
if all(map(lambda x: x in view.metadata.contains, at_types)):
return view
else:
if all(map(lambda x: str(x) in view.metadata.contains, at_types)):
if at_types in view.metadata.contains:
return view
return None

Expand Down
48 changes: 18 additions & 30 deletions mmif/serialize/view.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,8 @@ def prop_check(k, v, *props):
return any(k in prop and prop[k] == v for prop in props)

for annotation in self.annotations:
at_type_metadata = self.metadata.contains.get(str(annotation.at_type), {})
if not at_type or (at_type and str(annotation.at_type) == str(at_type)):
at_type_metadata = self.metadata.contains.get(annotation.at_type, {})
if not at_type or (at_type and annotation.at_type == at_type):
if all(map(lambda kv: prop_check(kv[0], kv[1], annotation.properties, at_type_metadata), properties.items())):
yield annotation

Expand Down Expand Up @@ -196,24 +196,6 @@ def __init__(self, viewmetadata_obj: Union[bytes, str, dict] = None) -> None:
# see MmifObject::_required_attributes in model.py
super().__init__(viewmetadata_obj)

def _find_match_hotfix(self, key: str) -> bool:
"""
Checks the existing types in the contains dict to see if
the type passed in as ``key`` has the same shortname.

FIXME: this will produce undesired results if there is a
shortname conflict in the view.

:param key: the type (shortname or IRI) to check
:return: whether ``key`` already has a match in the ``contains`` dict
"""
exists = False
for existing_type in self.contains.keys():
if key.split('/')[-1] == existing_type.split('/')[-1]:
exists = True
break
return exists

def new_contain(self, at_type: Union[str, ThingTypesBase], contain_dict: dict = None) -> Optional['Contain']:
"""
Adds a new element to the ``contains`` dictionary.
Expand All @@ -222,16 +204,12 @@ def new_contain(self, at_type: Union[str, ThingTypesBase], contain_dict: dict =
:param contain_dict: any metadata associated with the annotation type
:return: the generated :class:`Contain` object
"""
if isinstance(at_type, ThingTypesBase):
exists = self._find_match_hotfix(at_type.name) or self._find_match_hotfix(at_type.value)
final_key = at_type.value
else:
exists = self._find_match_hotfix(at_type)
final_key = at_type

if not exists:
if isinstance(at_type, str):
at_type = ThingTypesBase.from_str(at_type)

if at_type not in self.contains:
new_contain = Contain(contain_dict)
self.contains[final_key] = new_contain
self.contains[at_type] = new_contain
return new_contain

def add_parameters(self, param_dict: dict = None, **param_kwargs):
Expand Down Expand Up @@ -331,11 +309,21 @@ def append(self, value: Union[Annotation, Document], overwrite=False) -> None:


class ContainsDict(FreezableDataDict[Contain]):
_items: Dict[str, Contain]
_items: Dict[ThingTypesBase, Contain]

def _deserialize(self, input_dict: dict) -> None:
self._items = {key: Contain(value) for key, value in input_dict.items()}

def update(self, other: Union[dict, 'ContainsDict'], overwrite=False):
for k, v in other.items():
if isinstance(k, str):
k = ThingTypesBase.from_str(k)
self._append_with_key(k, v, overwrite=overwrite)

def get(self, key: Union[str, ThingTypesBase], default=None):
if isinstance(key, str):
key = ThingTypesBase.from_str(key)
return self._items.get(key, default)

def __contains__(self, item):
return item in list(self._items.keys())
31 changes: 20 additions & 11 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

name = "mmif-python"
version_fname = "VERSION"
vocabulary_templates_path = 'templates/python/vocabulary'
cmdclass = {}

# Used to have `import mmif` that imported `mmif` directory as a sibling, not `mmif` site-package,
Expand Down Expand Up @@ -44,32 +45,40 @@ def generate_subpack(parpack_name, subpack_name, init_contents=""):
return subpack_dir


def generate_vocab_enum(spec_version, clams_types, source_path) -> str:
def generate_vocab_enum(spec_version, clams_types, mod_name) -> str:
vocab_url = 'http://mmif.clams.ai/%s/vocabulary' % spec_version

template_file = os.path.join(vocabulary_templates_path, mod_name + '.txt')
if mod_name.startswith('annotation'):
base_class_name = 'AnnotationTypesBase'
elif mod_name.startswith('document'):
base_class_name = 'DocumentTypesBase'
else:
base_class_name = 'ClamsTypesBase'

file_out = io.StringIO()
with open(source_path, 'r') as file_in:
with open(template_file, 'r') as file_in:
for line in file_in.readlines():
file_out.write(line.replace('<VERSION>', spec_version))
for type_name in clams_types:
file_out.write(f" {type_name} = '{vocab_url}/{type_name}'\n")
file_out.write(f" {type_name} = {base_class_name}('{vocab_url}/{type_name}')\n")

string_out = file_out.getvalue()
file_out.close()
return string_out


def generate_vocabulary(spec_version, clams_types, source_path):
def generate_vocabulary(spec_version, clams_types):
"""
:param spec_version:
:param clams_types: the tree
:param source_path: the directory of source txt files
:param template_path: the directory of source txt files
:return:
"""
types = {
'thing_types': ['ThingTypesBase', 'ThingType'],
'annotation_types': ['AnnotationTypesBase', 'AnnotationTypes'],
'document_types': ['DocumentTypesBase', 'DocumentTypes']
'base_types': ['ThingTypesBase', 'ThingType', 'ClamsTypesBase', 'AnnotationTypesBase', 'DocumentTypesBase'],
'annotation_types': ['AnnotationTypes'],
'document_types': ['DocumentTypes']
}
vocabulary_dir = generate_subpack(
mmif_name, mmif_vocabulary_pkg,
Expand All @@ -88,11 +97,11 @@ def generate_vocabulary(spec_version, clams_types, source_path):
'annotation_types': [t for t in clams_types if 'Document' not in t and t != 'Thing'],

# extract thing type
'thing_types': clams_types[:1]
'base_types': clams_types[:1]
}

for mod_name, type_list in type_lists.items():
enum_contents = generate_vocab_enum(spec_version, type_list, os.path.join(source_path, mod_name+'.txt'))
enum_contents = generate_vocab_enum(spec_version, type_list, mod_name)
write_res_file(vocabulary_dir, mod_name+'.py', enum_contents)

return vocabulary_dir
Expand Down Expand Up @@ -151,7 +160,7 @@ def mod_run(self):
import yaml
yaml_file = io.BytesIO(get_spec_file_at_tag(gittag, mmif_vocab_res_oriname))
clams_types = [t['name'] for t in list(yaml.safe_load_all(yaml_file.read()))]
generate_vocabulary(spec_version, clams_types, 'vocabulary_files')
generate_vocabulary(spec_version, clams_types)

ori_run(self)

Expand Down
10 changes: 10 additions & 0 deletions templates/python/vocabulary/annotation_types.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Spec version <VERSION>
# This file is auto-generated by setup.py

from .base_types import AnnotationTypesBase

class AnnotationTypes(AnnotationTypesBase):
    """
    This class contains the CLAMS annotation types
    defined in the spec version <VERSION> as class variables.
    """
    # NOTE: this is a build-time template; setup.py appends one class
    # attribute per vocabulary type below (indented into this class body),
    # so the template must end with this class body still open.
110 changes: 110 additions & 0 deletions templates/python/vocabulary/base_types.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# This file is auto-generated by setup.py

class TypesBase(object):
    """
    Base class for arbitrary vocabulary types.

    Provides the common initializer, equality and hashing behavior,
    and (de-)serialization helpers shared by all vocabulary types.
    A type is identified by a URI split into ``base_uri`` and
    ``shortname`` at the last slash.
    """

    def __init__(self, type_uri: str):
        # rpartition yields ('', '', uri) when there is no slash, so a bare
        # name ends up with an empty base_uri, matching the URI-less case.
        self.base_uri, _, self.shortname = type_uri.rpartition('/')

    @classmethod
    def from_str(cls, string: str):
        """
        Builds a type object from a URI string, dispatching CLAMS-hosted
        URIs to the matching CLAMS subclass and everything else to ``cls``.
        """
        if 'mmif.clams.ai' not in string:
            return cls(string)
        clams_cls = DocumentTypesBase if string.endswith('Document') else AnnotationTypesBase
        return clams_cls(string)

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        # strings are promoted to type objects before comparison
        if isinstance(other, str):
            other = self.from_str(other)
        if not isinstance(other, TypesBase):
            return False
        return (self.base_uri, self.shortname) == (other.base_uri, other.shortname)

    def __repr__(self):
        return f'{self.base_uri}/{self.shortname}' if self.base_uri else self.shortname

    # aliases
    def __str__(self):
        return self.__repr__()

    def _serialize(self):
        return self.__repr__()


# Alias: the "Thing" hierarchy roots directly at the generic base class.
ThingTypesBase = TypesBase


class ClamsTypesBase(ThingTypesBase):
    """
    Base class for CLAMS vocabulary types.
    This class adds handling of MMIF specification versions
    in the initializer and comparators.
    """

    def __init__(self, type_uri: str):
        """
        Splits a CLAMS vocabulary URI of the shape
        ``{base_uri}/{version}/vocabulary/{shortname}`` into its parts.

        :param type_uri: a full CLAMS vocabulary URI
        :raises ValueError: when the URI is not hosted under ``mmif.clams.ai``
        """
        if 'mmif.clams.ai' in type_uri:
            # the literal "vocabulary" path segment is discarded
            self.base_uri, self.version, _, self.shortname = type_uri.rsplit('/', 3)
        else:
            raise ValueError(f'{type_uri} is not a CLAMS vocabulary URI')

    def __hash__(self):
        # __eq__ ignores patch-level (and, for regular versions, only compares
        # major.minor) differences in the version, so hashing the full URI
        # string (as the superclass does) would give equal objects different
        # hashes and break set/dict membership checks. Hash only the
        # version-independent parts to keep the hash/eq contract.
        return hash((self.base_uri, self.shortname))

    def __eq__(self, other):
        # plain strings are promoted to type objects before comparison
        if isinstance(other, str):
            other = ThingTypesBase.from_str(other)
        if isinstance(other, ClamsTypesBase):
            if '.' in self.version and '.' in other.version:
                # regular x.y.z versions: equal when major and minor match
                # (patch-level differences are deliberately ignored)
                s_major, s_minor, _ = self.version.split('.')
                o_major, o_minor, _ = other.version.split('.')
                if s_major != o_major or s_minor != o_minor:
                    return False
            else:
                # dummy version string used at development time must match exactly
                if self.version != other.version:
                    return False
            return self.base_uri == other.base_uri and self.shortname == other.shortname
        else:
            return False

    def __repr__(self):
        return f'{self.base_uri}/{self.version}/vocabulary/{self.shortname}'


class AnnotationTypesBase(ClamsTypesBase):
    """
    Base class for annotation-type vocabularies; subclass this to
    define your own custom annotation vocabulary.
    """
    pass


class DocumentTypesBase(ClamsTypesBase):
    """
    Base class for document-type vocabularies; subclass this to
    define your own custom document vocabulary.
    """
    pass


class ThingType(ThingTypesBase):
    """
    This class contains the topmost CLAMS thing type
    defined in the spec version <VERSION> as a class variable.
    """
    # NOTE: the ``<VERSION>`` placeholder above and the ``Thing`` class
    # attribute are filled in / appended by setup.py at build time, so this
    # template must end with this class body still open.
11 changes: 11 additions & 0 deletions templates/python/vocabulary/document_types.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Spec version <VERSION>
# This file is auto-generated by setup.py

from .base_types import DocumentTypesBase


class DocumentTypes(DocumentTypesBase):
    """
    This class contains the CLAMS document types
    defined in the spec version <VERSION> as class variables.
    """
    # NOTE: this is a build-time template; setup.py appends one class
    # attribute per vocabulary type below (indented into this class body),
    # so the template must end with this class body still open.
Loading