diff --git a/README.md b/README.md index 48377215..7fc1b721 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ It is recommended to start PostgreSQL using [Docker](https://www.docker.com/): ```shell docker run -e POSTGRES_USER=beacon \ -e POSTGRES_PASSWORD=beacon \ - -v "$PWD/data":/docker-entrypoint-initdb.d + -v "$PWD/data":/docker-entrypoint-initdb.d \ -e POSTGRES_DB=beacondb \ -p 5432:5432 postgres:11.6 ``` diff --git a/beacon_api/utils/db_load.py b/beacon_api/utils/db_load.py index bd100e6d..94b64d3a 100644 --- a/beacon_api/utils/db_load.py +++ b/beacon_api/utils/db_load.py @@ -302,83 +302,82 @@ async def insert_variants(self, dataset_id, variants, min_ac): async with self._conn.transaction(): LOG.info("Insert variants into the database") for variant in variants: - # params = (frequency, count, actual variant Type) - params = self._unpack(variant) - # Coordinates that are read from VCF are 1-based, - # cyvcf2 reads them as 0-based, and they are inserted into the DB as such - - # params may carry single variants [1] or packed variants [20, 15, 10, 1] - # The first check prunes for single variants, packed variants must be removed afterwards - if params[1][0] >= min_ac: - # Remove packed variants that don't meet the minimum allele count requirements - # Packed variants are always ordered from largest to smallest, this process starts - # popping values from the right (small) side until there are no more small values to pop - while params[1][-1] < min_ac: - params[0].pop() # aaf - params[1].pop() # ac - params[2].pop() # vt - params[3].pop() # alt - if len(params[5]) > 0: - params[5].pop() # bnd - - # Nothing interesting on the variant with no aaf - # because none of the samples have it - if variant.aaf > 0: - - # We Process Breakend Records into a different table for now - if params[5] != []: - # await self.insert_mates(dataset_id, variant, params) - # Most likely there will be only one BND per Record - for bnd in params[5]: - await self._conn.execute( - """INSERT INTO beacon_mate_table - (datasetId, chromosome, chromosomeStart, chromosomePos, - mate, mateStart, matePos, reference, alternate, alleleCount, - callCount, frequency, "end") - SELECT ($1), ($2), ($3), ($4), - ($5), ($6), ($7), ($8), t.alt, t.ac, ($11), t.freq, ($13) - FROM (SELECT unnest($9::varchar[]) alt, unnest($10::integer[]) ac, - unnest($12::float[]) freq) t - ON CONFLICT (datasetId, chromosome, mate, chromosomePos, matePos) - DO NOTHING""", - dataset_id, - variant.CHROM.replace("chr", ""), - variant.start, - variant.ID, - bnd[0].replace("chr", ""), - bnd[1], - bnd[6], - variant.REF, - params[3], - params[1], - params[4], - params[0], - variant.end, - ) - else: + # Nothing interesting on the variant with no aaf + # because none of the samples have it + if variant.aaf > 0: + # params = (frequency, count, actual variant Type) + params = self._unpack(variant) + # Coordinates that are read from VCF are 1-based, + # cyvcf2 reads them as 0-based, and they are inserted into the DB as such + + # params may carry single variants [1] or packed variants [20, 15, 10, 1] + # The first check prunes for single variants, packed variants must be removed afterwards + if params[1][0] >= min_ac: + # Remove packed variants that don't meet the minimum allele count requirements + # Packed variants are always ordered from largest to smallest, this process starts + # popping values from the right (small) side until there are no more small values to pop + while params[1][-1] < min_ac: + params[0].pop() # aaf + params[1].pop() # ac + params[2].pop() # vt + params[3].pop() # alt + if len(params[5]) > 0: + params[5].pop() # bnd + + # We Process Breakend Records into a different table for now + if params[5] != []: + # await self.insert_mates(dataset_id, variant, params) + # Most likely there will be only one BND per Record + for bnd in params[5]: await self._conn.execute( - """INSERT INTO beacon_data_table - (datasetId, chromosome, start, reference, alternate, - "end", aggregatedVariantType, alleleCount, callCount, frequency, variantType) - SELECT ($1), ($2), ($3), ($4), t.alt, ($6), ($7), t.ac, ($9), t.freq, t.vt - FROM (SELECT unnest($5::varchar[]) alt, unnest($8::integer[]) ac, - unnest($10::float[]) freq, unnest($11::varchar[]) as vt) t - ON CONFLICT (datasetId, chromosome, start, reference, alternate) + """INSERT INTO beacon_mate_table + (datasetId, chromosome, chromosomeStart, chromosomePos, + mate, mateStart, matePos, reference, alternate, alleleCount, + callCount, frequency, "end") + SELECT ($1), ($2), ($3), ($4), + ($5), ($6), ($7), ($8), t.alt, t.ac, ($11), t.freq, ($13) + FROM (SELECT unnest($9::varchar[]) alt, unnest($10::integer[]) ac, + unnest($12::float[]) freq) t + ON CONFLICT (datasetId, chromosome, mate, chromosomePos, matePos) DO NOTHING""", dataset_id, variant.CHROM.replace("chr", ""), variant.start, + variant.ID, + bnd[0].replace("chr", ""), + bnd[1], + bnd[6], variant.REF, params[3], - variant.end, - variant.var_type.upper(), params[1], params[4], params[0], - params[2], + variant.end, ) - - LOG.debug("Variants have been inserted") + else: + await self._conn.execute( + """INSERT INTO beacon_data_table + (datasetId, chromosome, start, reference, alternate, + "end", aggregatedVariantType, alleleCount, callCount, frequency, variantType) + SELECT ($1), ($2), ($3), ($4), t.alt, ($6), ($7), t.ac, ($9), t.freq, t.vt + FROM (SELECT unnest($5::varchar[]) alt, unnest($8::integer[]) ac, + unnest($10::float[]) freq, unnest($11::varchar[]) as vt) t + ON CONFLICT (datasetId, chromosome, start, reference, alternate) + DO NOTHING""", + dataset_id, + variant.CHROM.replace("chr", ""), + variant.start, + variant.REF, + params[3], + variant.end, + variant.var_type.upper(), + params[1], + params[4], + params[0], + params[2], + ) + + LOG.debug("Variants have been inserted") except Exception as e: LOG.error(f"AN ERROR OCCURRED WHILE ATTEMPTING TO INSERT VARIANTS -> {e}") diff --git a/requirements.txt b/requirements.txt index 1bc02c8c..a67d62e6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ aiohttp==3.7.4.post0 aiohttp-cors==0.7.0 -asyncpg==0.24.0 +asyncpg==0.25.0 jsonschema==3.2.0; python_version < '3.7' -jsonschema==4.0.1; python_version >= '3.7' +jsonschema==4.2.1; python_version >= '3.7' Cython==0.29.24 cyvcf2==0.10.1; python_version < '3.7' cyvcf2; python_version >= '3.7' uvloop==0.14.0; python_version < '3.7' uvloop==0.16.0; python_version >= '3.7' aiocache==0.11.1 -ujson==4.2.0 +ujson==4.3.0 aiomcache==0.6.0 -Authlib==0.15.4 +Authlib==0.15.5 gunicorn==20.1.0 diff --git a/setup.py b/setup.py index 0ab44670..89262e2a 100644 --- a/setup.py +++ b/setup.py @@ -37,39 +37,39 @@ "Programming Language :: Python :: 3.7", ], install_requires=[ - "asyncpg==0.24.0", + "asyncpg==0.25.0", "aiohttp==3.7.4.post0", - "Authlib==0.15.4", + "Authlib==0.15.5", "aiohttp-cors==0.7.0", "jsonschema==3.2.0; python_version < '3.7'", - "jsonschema==4.0.1; python_version >= '3.7'", + "jsonschema==4.2.1; python_version >= '3.7'", "gunicorn==20.1.0", "uvloop==0.14.0; python_version < '3.7'", "uvloop==0.16.0; python_version >= '3.7'", "cyvcf2==0.10.1; python_version < '3.7'", "cyvcf2; python_version >= '3.7'", "aiocache==0.11.1", - "ujson==4.2.0", + "ujson==4.3.0", "aiomcache==0.6.0", ], extras_require={ "vcf": [ "cyvcf2==0.10.1; python_version < '3.7'", - "numpy==1.21.2", + "numpy==1.21.4", "cyvcf2; python_version >= '3.7'", "Cython==0.29.24", ], "test": [ - "coverage==6.0", + "coverage==6.1.2", "pytest<6.3", "pytest-cov==3.0.0", "testfixtures==6.18.3", "tox==3.24.4", - "flake8==3.9.2", + "flake8==4.0.1", "flake8-docstrings==1.6.0", "asynctest==0.13.0", "aioresponses==0.7.2", - "black==21.9b0", + "black==21.11b1", ], "docs": ["sphinx >= 1.4", "sphinx_rtd_theme==1.0.0"], },