Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions scripts/python/cache/clinvar_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""Cache data on variants related to human health, from NCBI ClinVar

Example:
python clinvar_cache.py
"""

import csv
import json
import gzip

clinical_concerns = ['Likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Pathogenic']
robust_review_statuses = [
'criteria_provided,_multiple_submitters,_no_conflicts',
'reviewed_by_expert_panel',
'practice_guideline'
]

disease_ids_and_names = []
variant_types = []
molecular_consequences = []

def get_is_relevant(fields):
is_clinical_concern = False
is_robustly_reviewed = False
for field in fields:
[name, value] = field.split('=')

if name == 'CLNSIG' and value in clinical_concerns:
is_clinical_concern = True

if name == 'CLNREVSTAT' and value in robust_review_statuses:
is_robustly_reviewed = True

return is_clinical_concern and is_robustly_reviewed

def trim_info_fields(fields):
slim_fields = []
names_to_keep = ['CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS']
disease_indexes = []
disease_names = []
disease_ids = []
for field in fields:
[name, value] = field.split('=')

if name == 'CLNDISDB': # "Clinical disease database"
entries = value.split('|')
for entry in entries:
full_values = entry.split(',')
has_mondo = False
for fv in full_values:
split_fv = fv.split(':')
db_name = split_fv[0]
db_value = split_fv[-1]
if db_name == 'MONDO':
disease_ids.append(db_value)
has_mondo = True
if not has_mondo:
disease_ids.append('-1')

elif name == 'CLNDN': # "Clinical disease name"
disease_names = value.split('|')

elif name == 'CLNVC':
if value not in variant_types:
variant_types.append(value)
variant_type = variant_types.index(value)
slim_fields.append(str(variant_type))

elif name == 'MC':
entries = value.split(',')
slim_mc = []
for entry in entries:
if entry not in molecular_consequences:
molecular_consequences.append(entry)
molecular_consequence = molecular_consequences.index(entry)
slim_mc.append(str(molecular_consequence))
slim_fields.append(','.join(slim_mc))

elif name in names_to_keep:
if name == 'CLNSIG':
value = clinical_concerns.index(value)
elif name == 'CLNREVSTAT':
value = robust_review_statuses.index(value)
slim_fields.append(str(value))

for (i, disease_id) in enumerate(disease_ids):
if disease_id == '-1':
continue
disease_name = disease_names[i]
disease_id_and_name = disease_id + '|' + disease_name
if disease_id_and_name not in disease_ids_and_names:
disease_ids_and_names.append(disease_id_and_name)
disease_index = disease_ids_and_names.index(disease_id_and_name)
disease_indexes.append(str(disease_index))

disease_indexes_string = ','.join(disease_indexes)
slim_fields.insert(0, disease_indexes_string)

slim_info = '\t'.join(slim_fields)
return slim_info

output_rows = []

# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20241215.vcf.gz
# Source: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/
with open('clinvar_20241215.vcf') as file:
reader = csv.reader(file, delimiter="\t")
for row in reader:
if row[0][0] == '#':
continue
# print('row', row)
info = row[7]
fields = info.split(';')
is_relevant = get_is_relevant(fields)
if not is_relevant:
continue
slim_info = trim_info_fields(fields)
del row[5:7]
row[5] = slim_info

output_rows.append('\t'.join(row))

content = '\n'.join(output_rows)

disease_map = json.dumps(disease_ids_and_names)
column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'DISEASE_IDS', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS']
headers = '\n'.join([
'# disease_mondo_ids_and_names = ' + disease_map,
'# variant_types = ' + str(variant_types),
'# molecular_consequences = ' + str(molecular_consequences),
'\t'.join(column_names)
])
content = headers + '\n' + content


output_path = 'clinvar_priority_20241215.tsv'
with open(output_path, "w") as f:
f.write(content)
with gzip.open(f"{output_path}.gz", "wt") as f:
f.write(content)
4 changes: 2 additions & 2 deletions test/offline/gene-structure.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ describe('Ideogram gene structure functionality', function() {
const apoeLabel = document.querySelector('#_c18_a3 path');
apoeLabel.dispatchEvent(new Event('mouseover'));
const subparts = document.querySelectorAll('rect.subpart');
assert.equal(subparts.length, 7); // spliced, without introns
assert.equal(subparts.length, 9); // spliced, without introns
done();
}, 500);
}
Expand Down Expand Up @@ -162,7 +162,7 @@ describe('Ideogram gene structure functionality', function() {
const sKeydown = new KeyboardEvent('keydown', {key: 's'});
document.dispatchEvent(sKeydown);
let subparts = document.querySelectorAll('rect.subpart');
assert.equal(subparts.length, 10); // includes introns
assert.equal(subparts.length, 12); // includes introns

document.dispatchEvent(sKeydown);
setTimeout(async function() {
Expand Down