From e3ac8af13740cc70f2743374437a72fce0b0716b Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Wed, 18 Dec 2024 15:56:58 -0500 Subject: [PATCH 01/11] Begin ClinVar cache for clinical variant view in Gene Leads --- scripts/python/cache/clinvar_cache.py | 50 +++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 scripts/python/cache/clinvar_cache.py diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py new file mode 100644 index 00000000..ab2c9503 --- /dev/null +++ b/scripts/python/cache/clinvar_cache.py @@ -0,0 +1,50 @@ +import csv +import gzip + +def get_is_relevant(fields): + is_clinical_concern = False + is_robustly_reviewed = False + for field in fields: + [name, value] = field.split('=') + if name == 'CLNSIG' and value in ['Pathogenic', 'Likely_pathogenic', 'Pathogenic/Likely_pathogenic']: + is_clinical_concern = True + if name == 'CLNREVSTAT' and 'multiple_submitters' in value or value == 'reviewed_by_expert_panel': + is_robustly_reviewed = True + + return is_clinical_concern and is_robustly_reviewed + +def trim_info_fields(fields): + slim_fields = [] + slim_names = ['CLNDN', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] + for field in fields: + name = field.split('=')[0] + if name in slim_names: + slim_fields.append(field) + slim_info = ';'.join(slim_fields) + return slim_info + +output_rows = [] + +with open('clinvar_20241215.vcf') as file: + reader = csv.reader(file, delimiter="\t") + for row in reader: + if row[0][0] == '#': + continue + # print('row', row) + info = row[7] + fields = info.split(';') + is_relevant = get_is_relevant(fields) + if not is_relevant: + continue + slim_info = trim_info_fields(fields) + row[7] = slim_info + + output_rows.append('\t'.join(row)) + +content = '\n'.join(output_rows) + +output_path = 'clinvar_pathogenic_and_lp_20241215.vcf' +with open(output_path, "w") as f: + f.write(content) +with gzip.open(f"{output_path}.gz", "wt") as f: + f.write(content) From 4a5d9792bb5dcf1e6ad8a149d097a7f53746202c Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Thu, 19 Dec 2024 09:24:56 -0500 Subject: [PATCH 02/11] Refine clinical cache pipeline --- scripts/python/cache/clinvar_cache.py | 28 ++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py index ab2c9503..d34b6e33 100644 --- a/scripts/python/cache/clinvar_cache.py +++ b/scripts/python/cache/clinvar_cache.py @@ -1,25 +1,39 @@ import csv import gzip + +clinical_concerns = ['Likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Pathogenic'] +robust_review_statuses = [ + 'criteria_provided,_multiple_submitters,_no_conflicts', + 'reviewed_by_expert_panel', + 'practice_guideline' +] + def get_is_relevant(fields): is_clinical_concern = False is_robustly_reviewed = False for field in fields: [name, value] = field.split('=') - if name == 'CLNSIG' and value in ['Pathogenic', 'Likely_pathogenic', 'Pathogenic/Likely_pathogenic']: + + if name == 'CLNSIG' and value in clinical_concerns: is_clinical_concern = True - if name == 'CLNREVSTAT' and 'multiple_submitters' in value or value == 'reviewed_by_expert_panel': + + if name == 'CLNREVSTAT' and value in robust_review_statuses: is_robustly_reviewed = True return is_clinical_concern and is_robustly_reviewed def trim_info_fields(fields): slim_fields = [] - slim_names = ['CLNDN', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] + names_to_keep = ['CLNDN', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] for field in fields: - name = field.split('=')[0] - if name in slim_names: - slim_fields.append(field) + [name, value] = field.split('=') + if name in names_to_keep: + if name == 'CLNSIG': + value = clinical_concerns.index(value) + elif name == 'CLNREVSTAT': + value = robust_review_statuses.index(value) + slim_fields.append(f"{name}={value}") slim_info = ';'.join(slim_fields) return slim_info @@ -43,7 +57,7 @@ def trim_info_fields(fields): content = '\n'.join(output_rows) -output_path = 'clinvar_pathogenic_and_lp_20241215.vcf' +output_path = 'clinvar_priority_20241215.vcf' with open(output_path, "w") as f: f.write(content) with gzip.open(f"{output_path}.gz", "wt") as f: From 3e9c80bee263f2fa3d70584d5f7097420c01baaa Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Thu, 19 Dec 2024 10:25:53 -0500 Subject: [PATCH 03/11] Map disease names to IDs; 16 MB uncompressed, 1.7M gz -> 11, 1.4 --- scripts/python/cache/clinvar_cache.py | 35 ++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py index d34b6e33..04aa513c 100644 --- a/scripts/python/cache/clinvar_cache.py +++ b/scripts/python/cache/clinvar_cache.py @@ -1,4 +1,5 @@ import csv +import json import gzip @@ -9,6 +10,9 @@ 'practice_guideline' ] +# Disease names by MONDO IDs +disease_names_by_id = {} + def get_is_relevant(fields): is_clinical_concern = False is_robustly_reviewed = False @@ -25,15 +29,40 @@ def get_is_relevant(fields): def trim_info_fields(fields): slim_fields = [] - names_to_keep = ['CLNDN', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] + names_to_keep = ['CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] + disease_names = [] + disease_ids = [] for field in fields: [name, value] = field.split('=') + + if name == 'CLNDISDB': # "Clinical disease database" + entries = value.split('|') + for entry in entries: + full_values = entry.split(',') + for fv in full_values: + split_fv = fv.split(':') + db_name = split_fv[0] + db_value = split_fv[-1] + if db_name == 'MONDO': + disease_ids.append(db_value) + + elif name == 'CLNDN': # "Clinical disease name" + disease_names = value.split('|') + if name in names_to_keep: if name == 'CLNSIG': value = clinical_concerns.index(value) elif name == 'CLNREVSTAT': value = robust_review_statuses.index(value) slim_fields.append(f"{name}={value}") + + for (i, disease_id) in enumerate(disease_ids): + if disease_id not in disease_names_by_id: + disease_names_by_id[disease_id] = disease_names[i] + + disease_ids_string = ','.join(disease_ids) + slim_fields.insert(0, disease_ids_string) + slim_info = ';'.join(slim_fields) return slim_info @@ -57,6 +86,10 @@ def trim_info_fields(fields): content = '\n'.join(output_rows) +disease_map = json.dumps(disease_names_by_id) +content = disease_map + '\n' + content + + output_path = 'clinvar_priority_20241215.vcf' with open(output_path, "w") as f: f.write(content) From 690ed1686f04c75494e40f1e8f11b950b6704017 Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Thu, 19 Dec 2024 10:52:09 -0500 Subject: [PATCH 04/11] Remove unused columns, add column names to headers --- scripts/python/cache/clinvar_cache.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py index 04aa513c..a765d01f 100644 --- a/scripts/python/cache/clinvar_cache.py +++ b/scripts/python/cache/clinvar_cache.py @@ -80,14 +80,20 @@ def trim_info_fields(fields): if not is_relevant: continue slim_info = trim_info_fields(fields) - row[7] = slim_info + del row[5:7] + row[5] = slim_info output_rows.append('\t'.join(row)) content = '\n'.join(output_rows) disease_map = json.dumps(disease_names_by_id) -content = disease_map + '\n' + content +column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'INFO'] +headers = '\n'.join([ + '# disease_names_by_mondo_id = ' + disease_map, + '\t'.join(column_names) +]) +content = headers + '\n' + content output_path = 'clinvar_priority_20241215.vcf' From 21f3f48802ad230d8d349a53a4df917df39c49a2 Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Thu, 19 Dec 2024 13:43:46 -0500 Subject: [PATCH 05/11] Index variant types; 8.2 MB uncompressed, 1.4 gz -> 6.9, 1.3 --- scripts/python/cache/clinvar_cache.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py index a765d01f..1f1a05cd 100644 --- a/scripts/python/cache/clinvar_cache.py +++ b/scripts/python/cache/clinvar_cache.py @@ -12,6 +12,7 @@ # Disease names by MONDO IDs disease_names_by_id = {} +variant_types = [] def get_is_relevant(fields): is_clinical_concern = False @@ -49,12 +50,18 @@ def trim_info_fields(fields): elif name == 'CLNDN': # "Clinical disease name" disease_names = value.split('|') - if name in names_to_keep: + elif name == 'CLNVC': + if value not in variant_types: + variant_types.append(value) + variant_type = variant_types.index(value) + slim_fields.append(str(variant_type)) + + elif name in names_to_keep: if name == 'CLNSIG': value = clinical_concerns.index(value) elif name == 'CLNREVSTAT': value = robust_review_statuses.index(value) - slim_fields.append(f"{name}={value}") + slim_fields.append(str(value)) for (i, disease_id) in enumerate(disease_ids): if disease_id not in disease_names_by_id: @@ -63,7 +70,7 @@ def trim_info_fields(fields): disease_ids_string = ','.join(disease_ids) slim_fields.insert(0, disease_ids_string) - slim_info = ';'.join(slim_fields) + slim_info = '\t'.join(slim_fields) return slim_info output_rows = [] @@ -88,15 +95,16 @@ def trim_info_fields(fields): content = '\n'.join(output_rows) disease_map = json.dumps(disease_names_by_id) -column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'INFO'] +column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'DISEASE_IDS', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'ORIGIN', 'RS'] headers = '\n'.join([ '# disease_names_by_mondo_id = ' + disease_map, + '# variant_types = ' + str(variant_types), '\t'.join(column_names) ]) content = headers + '\n' + content -output_path = 'clinvar_priority_20241215.vcf' +output_path = 'clinvar_priority_20241215.tsv' with open(output_path, "w") as f: f.write(content) with gzip.open(f"{output_path}.gz", "wt") as f: From 8ae4f58cbc3772b23ccbd59d36d6f709fa352455 Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Thu, 19 Dec 2024 15:22:50 -0500 Subject: [PATCH 06/11] Index molecular consequences; 6.9 MB uncompressed, 1.3 gz -> 4.8, 1.3 --- scripts/python/cache/clinvar_cache.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py index 1f1a05cd..10f20eaa 100644 --- a/scripts/python/cache/clinvar_cache.py +++ b/scripts/python/cache/clinvar_cache.py @@ -13,6 +13,7 @@ # Disease names by MONDO IDs disease_names_by_id = {} variant_types = [] +molecular_consequences = [] def get_is_relevant(fields): is_clinical_concern = False @@ -56,6 +57,16 @@ def trim_info_fields(fields): variant_type = variant_types.index(value) slim_fields.append(str(variant_type)) + elif name == 'MC': + entries = value.split('|') + slim_mc = [] + for entry in entries: + if entry not in molecular_consequences: + molecular_consequences.append(entry) + molecular_consequence = molecular_consequences.index(entry) + slim_mc.append(str(molecular_consequence)) + slim_fields.append(','.join(slim_mc)) + elif name in names_to_keep: if name == 'CLNSIG': value = clinical_concerns.index(value) @@ -95,7 +106,7 @@ def trim_info_fields(fields): content = '\n'.join(output_rows) disease_map = json.dumps(disease_names_by_id) -column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'DISEASE_IDS', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'ORIGIN', 'RS'] +column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'DISEASE_IDS', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] headers = '\n'.join([ '# disease_names_by_mondo_id = ' + disease_map, '# variant_types = ' + str(variant_types), From ae5ba2fcd08e2de64621a1c22a5752dbc21b9622 Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Thu, 19 Dec 2024 17:03:04 -0500 Subject: [PATCH 07/11] Index disease IDs and names; 4.8 MB uncompressed, 1.3 gz -> 4.3, 1.3 --- scripts/python/cache/clinvar_cache.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py index 10f20eaa..e485837d 100644 --- a/scripts/python/cache/clinvar_cache.py +++ b/scripts/python/cache/clinvar_cache.py @@ -10,8 +10,7 @@ 'practice_guideline' ] -# Disease names by MONDO IDs -disease_names_by_id = {} +disease_ids_and_names = [] variant_types = [] molecular_consequences = [] @@ -32,6 +31,7 @@ def get_is_relevant(fields): def trim_info_fields(fields): slim_fields = [] names_to_keep = ['CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] + disease_indexes = [] disease_names = [] disease_ids = [] for field in fields: @@ -41,12 +41,16 @@ def trim_info_fields(fields): entries = value.split('|') for entry in entries: full_values = entry.split(',') + has_mondo = False for fv in full_values: split_fv = fv.split(':') db_name = split_fv[0] db_value = split_fv[-1] if db_name == 'MONDO': disease_ids.append(db_value) + has_mondo = True + if not has_mondo: + disease_ids.append('-1') elif name == 'CLNDN': # "Clinical disease name" disease_names = value.split('|') @@ -75,11 +79,17 @@ def trim_info_fields(fields): slim_fields.append(str(value)) for (i, disease_id) in enumerate(disease_ids): - if disease_id not in disease_names_by_id: - disease_names_by_id[disease_id] = disease_names[i] + if disease_id == '-1': + continue + disease_name = disease_names[i] + disease_id_and_name = disease_id + '|' + disease_name + if disease_id_and_name not in disease_ids_and_names: + disease_ids_and_names.append(disease_id_and_name) + disease_index = disease_ids_and_names.index(disease_id_and_name) + disease_indexes.append(str(disease_index)) - disease_ids_string = ','.join(disease_ids) - slim_fields.insert(0, disease_ids_string) + disease_indexes_string = ','.join(disease_indexes) + slim_fields.insert(0, disease_indexes_string) slim_info = '\t'.join(slim_fields) return slim_info @@ -105,10 +115,10 @@ def trim_info_fields(fields): content = '\n'.join(output_rows) -disease_map = json.dumps(disease_names_by_id) +disease_map = json.dumps(disease_ids_and_names) column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'DISEASE_IDS', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] headers = '\n'.join([ - '# disease_names_by_mondo_id = ' + disease_map, + '# disease_mondo_ids_and_names = ' + disease_map, '# variant_types = ' + str(variant_types), '\t'.join(column_names) ]) From f68574ae697aed08e769014f6411268b9d9ed334 Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Fri, 20 Dec 2024 07:22:00 -0500 Subject: [PATCH 08/11] Refine molecular consequence index; 4.3 MB uncompressed, 1.3 gz -> 4.1, 1.2 --- scripts/python/cache/clinvar_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py index e485837d..ba4d9b26 100644 --- a/scripts/python/cache/clinvar_cache.py +++ b/scripts/python/cache/clinvar_cache.py @@ -62,7 +62,7 @@ def trim_info_fields(fields): slim_fields.append(str(variant_type)) elif name == 'MC': - entries = value.split('|') + entries = value.split(',') slim_mc = [] for entry in entries: if entry not in molecular_consequences: @@ -120,6 +120,7 @@ def trim_info_fields(fields): headers = '\n'.join([ '# disease_mondo_ids_and_names = ' + disease_map, '# variant_types = ' + str(variant_types), + '# molecular_consequences = ' + str(molecular_consequences), '\t'.join(column_names) ]) content = headers + '\n' + content From e9c55b2f8969b45d158eedafc0bfee7d76f1c4ae Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Fri, 20 Dec 2024 08:00:35 -0500 Subject: [PATCH 09/11] Add basic documentation for ClinVar cache pipeline --- scripts/python/cache/clinvar_cache.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py index ba4d9b26..db878124 100644 --- a/scripts/python/cache/clinvar_cache.py +++ b/scripts/python/cache/clinvar_cache.py @@ -1,8 +1,13 @@ +"""Cache data on variants related to human health, from NCBI ClinVar + +Example: +python clinvar_cache.py +""" + import csv import json import gzip - clinical_concerns = ['Likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Pathogenic'] robust_review_statuses = [ 'criteria_provided,_multiple_submitters,_no_conflicts', @@ -96,6 +101,8 @@ def trim_info_fields(fields): output_rows = [] +# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20241215.vcf.gz +# Source: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/ with open('clinvar_20241215.vcf') as file: reader = csv.reader(file, delimiter="\t") for row in reader: From 6557daf33f3830f715e37d7f4687391f3d57d631 Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Fri, 20 Dec 2024 10:54:56 -0500 Subject: [PATCH 10/11] Update test of gene structure functionality --- test/offline/gene-structure.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/offline/gene-structure.test.js b/test/offline/gene-structure.test.js index f4a94119..8177e78c 100644 --- a/test/offline/gene-structure.test.js +++ b/test/offline/gene-structure.test.js @@ -22,7 +22,7 @@ describe('Ideogram gene structure functionality', function() { const apoeLabel = document.querySelector('#_c18_a3 path'); apoeLabel.dispatchEvent(new Event('mouseover')); const subparts = document.querySelectorAll('rect.subpart'); - assert.equal(subparts.length, 7); // spliced, without introns + assert.equal(subparts.length, 9); // spliced, without introns done(); }, 500); } From 631ddfa7469dae99743f9442d98f0968b86292ce Mon Sep 17 00:00:00 2001 From: Eric Weitz Date: Fri, 20 Dec 2024 11:01:12 -0500 Subject: [PATCH 11/11] Update test of toggling exon splice --- test/offline/gene-structure.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/offline/gene-structure.test.js b/test/offline/gene-structure.test.js index 8177e78c..dd4a2e3a 100644 --- a/test/offline/gene-structure.test.js +++ b/test/offline/gene-structure.test.js @@ -162,7 +162,7 @@ describe('Ideogram gene structure functionality', function() { const sKeydown = new KeyboardEvent('keydown', {key: 's'}); document.dispatchEvent(sKeydown); let subparts = document.querySelectorAll('rect.subpart'); - assert.equal(subparts.length, 10); // includes introns + assert.equal(subparts.length, 12); // includes introns document.dispatchEvent(sKeydown); setTimeout(async function() {