From 97a34a557d2dbb305ccae284e0eb78b3dc994dba Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 23 May 2021 21:06:28 +0100 Subject: [PATCH 1/4] feat: Transform dictionary to dataframe This is so we can reformat suitably for neo4j. --- exporter.py | 23 +++++ poetry.lock | 141 +++++++++++++++++++++++----- pyproject.toml | 1 + tests/conftest.py | 16 +++- tests/fixtures/fixture_exporter.py | 30 ++++++ tests/fixtures/fixture_extractor.py | 21 ++--- tests/unit/test_exporter.py | 9 ++ 7 files changed, 200 insertions(+), 41 deletions(-) create mode 100644 exporter.py create mode 100644 tests/fixtures/fixture_exporter.py create mode 100644 tests/unit/test_exporter.py diff --git a/exporter.py b/exporter.py new file mode 100644 index 0000000..a73fd20 --- /dev/null +++ b/exporter.py @@ -0,0 +1,23 @@ +import pandas as pd + + +def convert_dict_to_df(data: dict) -> pd.DataFrame: + """ + Converts a dictionary into a dataframe, with keys and values being a column each. + :param data: Dictionary to convert to a dataframe. + :return: Dataframe where dictionary keys and values are a column. 
+ """ + data = pd.DataFrame.from_dict(data=data, orient="index").reset_index() + data = data.rename(columns={"index": "table"}) + data = pd.melt( + frame=data, + id_vars="table", + var_name="original_column_name", + value_name="dependency", + ) + # remove nans + data = data[data["dependency"].notnull()] + # sort values + data = data.sort_values(by="table") + + return data[["table", "dependency"]] diff --git a/poetry.lock b/poetry.lock index 23fe942..8654be0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -147,17 +147,17 @@ optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] -name = "mccabe" -version = "0.6.1" -description = "McCabe checker, plugin for flake8" +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" category = "main" optional = false python-versions = "*" [[package]] -name = "iniconfig" -version = "1.1.1" -description = "iniconfig: brain-dead simple config-ini parsing" +name = "mccabe" +version = "0.6.1" +description = "McCabe checker, plugin for flake8" category = "main" optional = false python-versions = "*" @@ -240,6 +240,14 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "numpy" +version = "1.20.3" +description = "NumPy is the fundamental package for array computing with Python." 
+category = "main" +optional = false +python-versions = ">=3.7" + [[package]] name = "packaging" version = "20.9" @@ -251,6 +259,22 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [package.dependencies] pyparsing = ">=2.0.2" +[[package]] +name = "pandas" +version = "1.2.4" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.7.1" + +[package.dependencies] +numpy = ">=1.16.5" +python-dateutil = ">=2.7.3" +pytz = ">=2017.3" + +[package.extras] +test = ["pytest (>=5.0.1)", "pytest-xdist", "hypothesis (>=3.58)"] + [[package]] name = "pluggy" version = "0.13.1" @@ -278,6 +302,14 @@ pyyaml = ">=5.1" toml = "*" virtualenv = ">=20.0.8" +[[package]] +name = "py" +version = "1.10.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + [[package]] name = "pycodestyle" version = "2.7.0" @@ -294,14 +326,6 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -[[package]] -name = "py" -version = "1.10.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - [[package]] name = "pyparsing" version = "2.4.7" @@ -331,6 +355,25 @@ toml = "*" [package.extras] testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +[[package]] +name = "python-dateutil" +version = "2.8.1" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2021.1" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false 
+python-versions = "*" + [[package]] name = "pyyaml" version = "5.4.1" @@ -428,7 +471,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "daf7c9934fd96f7224a177fe729ebf64c36c6dc462f13c38592ce7058f224a2d" +content-hash = "bf0d8695b7bcb1c4144b9be6361e7cb68362475afe6f8f275f38b958e9bd4441" [metadata.files] appdirs = [ @@ -489,14 +532,14 @@ idna = [ {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"}, {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"}, ] -mccabe = [ - {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, - {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, -] iniconfig = [ {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, ] +mccabe = [ + {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, + {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, +] mo-dots = [ {file = "mo-dots-4.22.21108.tar.gz", hash = "sha256:f9f2f9a6e44e214959aecd55857b6df487b8e86daf51334e2e982d47ea33ed23"}, ] @@ -519,10 +562,54 @@ nodeenv = [ {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, ] +numpy = [ + {file = "numpy-1.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:70eb5808127284c4e5c9e836208e09d685a7978b6a216db85960b1a112eeace8"}, + {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6ca2b85a5997dabc38301a22ee43c82adcb53ff660b89ee88dded6b33687e1d8"}, + {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c5bf0e132acf7557fc9bb8ded8b53bbbbea8892f3c9a1738205878ca9434206a"}, + {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db250fd3e90117e0312b611574cd1b3f78bec046783195075cbd7ba9c3d73f16"}, + {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:637d827248f447e63585ca3f4a7d2dfaa882e094df6cfa177cc9cf9cd6cdf6d2"}, + {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:8b7bb4b9280da3b2856cb1fc425932f46fba609819ee1c62256f61799e6a51d2"}, + {file = "numpy-1.20.3-cp37-cp37m-win32.whl", hash = "sha256:67d44acb72c31a97a3d5d33d103ab06d8ac20770e1c5ad81bdb3f0c086a56cf6"}, + {file = "numpy-1.20.3-cp37-cp37m-win_amd64.whl", hash = "sha256:43909c8bb289c382170e0282158a38cf306a8ad2ff6dfadc447e90f9961bef43"}, + {file = "numpy-1.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f1452578d0516283c87608a5a5548b0cdde15b99650efdfd85182102ef7a7c17"}, + {file = "numpy-1.20.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6e51534e78d14b4a009a062641f465cfaba4fdcb046c3ac0b1f61dd97c861b1b"}, + {file = "numpy-1.20.3-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e515c9a93aebe27166ec9593411c58494fa98e5fcc219e47260d9ab8a1cc7f9f"}, + {file = "numpy-1.20.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1c09247ccea742525bdb5f4b5ceeacb34f95731647fe55774aa36557dbb5fa4"}, + {file = "numpy-1.20.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:66fbc6fed94a13b9801fb70b96ff30605ab0a123e775a5e7a26938b717c5d71a"}, + {file = 
"numpy-1.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ea9cff01e75a956dbee133fa8e5b68f2f92175233de2f88de3a682dd94deda65"}, + {file = "numpy-1.20.3-cp38-cp38-win32.whl", hash = "sha256:f39a995e47cb8649673cfa0579fbdd1cdd33ea497d1728a6cb194d6252268e48"}, + {file = "numpy-1.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:1676b0a292dd3c99e49305a16d7a9f42a4ab60ec522eac0d3dd20cdf362ac010"}, + {file = "numpy-1.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:830b044f4e64a76ba71448fce6e604c0fc47a0e54d8f6467be23749ac2cbd2fb"}, + {file = "numpy-1.20.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:55b745fca0a5ab738647d0e4db099bd0a23279c32b31a783ad2ccea729e632df"}, + {file = "numpy-1.20.3-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5d050e1e4bc9ddb8656d7b4f414557720ddcca23a5b88dd7cff65e847864c400"}, + {file = "numpy-1.20.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9c65473ebc342715cb2d7926ff1e202c26376c0dcaaee85a1fd4b8d8c1d3b2f"}, + {file = "numpy-1.20.3-cp39-cp39-win32.whl", hash = "sha256:16f221035e8bd19b9dc9a57159e38d2dd060b48e93e1d843c49cb370b0f415fd"}, + {file = "numpy-1.20.3-cp39-cp39-win_amd64.whl", hash = "sha256:6690080810f77485667bfbff4f69d717c3be25e5b11bb2073e76bb3f578d99b4"}, + {file = "numpy-1.20.3-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4e465afc3b96dbc80cf4a5273e5e2b1e3451286361b4af70ce1adb2984d392f9"}, + {file = "numpy-1.20.3.zip", hash = "sha256:e55185e51b18d788e49fe8305fd73ef4470596b33fc2c1ceb304566b99c71a69"}, +] packaging = [ {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, ] +pandas = [ + {file = "pandas-1.2.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = 
"sha256:c601c6fdebc729df4438ec1f62275d6136a0dd14d332fc0e8ce3f7d2aadb4dd6"}, + {file = "pandas-1.2.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:8d4c74177c26aadcfb4fd1de6c1c43c2bf822b3e0fc7a9b409eeaf84b3e92aaa"}, + {file = "pandas-1.2.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:b730add5267f873b3383c18cac4df2527ac4f0f0eed1c6cf37fcb437e25cf558"}, + {file = "pandas-1.2.4-cp37-cp37m-win32.whl", hash = "sha256:2cb7e8f4f152f27dc93f30b5c7a98f6c748601ea65da359af734dd0cf3fa733f"}, + {file = "pandas-1.2.4-cp37-cp37m-win_amd64.whl", hash = "sha256:2111c25e69fa9365ba80bbf4f959400054b2771ac5d041ed19415a8b488dc70a"}, + {file = "pandas-1.2.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:167693a80abc8eb28051fbd184c1b7afd13ce2c727a5af47b048f1ea3afefff4"}, + {file = "pandas-1.2.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:612add929bf3ba9d27b436cc8853f5acc337242d6b584203f207e364bb46cb12"}, + {file = "pandas-1.2.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:971e2a414fce20cc5331fe791153513d076814d30a60cd7348466943e6e909e4"}, + {file = "pandas-1.2.4-cp38-cp38-win32.whl", hash = "sha256:68d7baa80c74aaacbed597265ca2308f017859123231542ff8a5266d489e1858"}, + {file = "pandas-1.2.4-cp38-cp38-win_amd64.whl", hash = "sha256:bd659c11a4578af740782288cac141a322057a2e36920016e0fc7b25c5a4b686"}, + {file = "pandas-1.2.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9db70ffa8b280bb4de83f9739d514cd0735825e79eef3a61d312420b9f16b758"}, + {file = "pandas-1.2.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:298f0553fd3ba8e002c4070a723a59cdb28eda579f3e243bc2ee397773f5398b"}, + {file = "pandas-1.2.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52d2472acbb8a56819a87aafdb8b5b6d2b3386e15c95bde56b281882529a7ded"}, + {file = "pandas-1.2.4-cp39-cp39-win32.whl", hash = "sha256:d0877407359811f7b853b548a614aacd7dea83b0c0c84620a9a643f180060950"}, + {file = "pandas-1.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:2b063d41803b6a19703b845609c0b700913593de067b552a8b24dd8eeb8c9895"}, + {file = 
"pandas-1.2.4.tar.gz", hash = "sha256:649ecab692fade3cbfcf967ff936496b0cfba0af00a55dfaacd82bdda5cb2279"}, +] pluggy = [ {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, @@ -531,6 +618,10 @@ pre-commit = [ {file = "pre_commit-2.12.1-py2.py3-none-any.whl", hash = "sha256:70c5ec1f30406250b706eda35e868b87e3e4ba099af8787e3e8b4b01e84f4712"}, {file = "pre_commit-2.12.1.tar.gz", hash = "sha256:900d3c7e1bf4cf0374bb2893c24c23304952181405b4d88c9c40b72bda1bb8a9"}, ] +py = [ + {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, + {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, +] pycodestyle = [ {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"}, {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"}, @@ -539,10 +630,6 @@ pyflakes = [ {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] -py = [ - {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, - {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, -] pyparsing = [ {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, @@ -551,6 +638,14 @@ pytest = [ 
{file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, ] +python-dateutil = [ + {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"}, + {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"}, +] +pytz = [ + {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"}, + {file = "pytz-2021.1.tar.gz", hash = "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da"}, +] pyyaml = [ {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"}, {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"}, diff --git a/pyproject.toml b/pyproject.toml index 6ec76d9..9715158 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ flake8 = "^3.9.2" moz-sql-parser = "^4.40.21126" tqdm = "^4.60.0" pytest = "^6.2.4" +pandas = "^1.2.4" [tool.poetry.dev-dependencies] diff --git a/tests/conftest.py b/tests/conftest.py index 10f2b4a..47113dc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,10 +3,20 @@ pytest_plugins = [ "tests.fixtures.fixture_extractor", + "tests.fixtures.fixture_exporter", ] @pytest.fixture() -def query_user_activity(): - with open(file="data/reporting/user_activity.sql", mode="r") as f: - return f.read() +def extracted_analytics(): + return { + "analytics.repo": [ + "github_repos.commits", + "github_repos.languages", + "github_repos.licenses", + ], + "analytics.author": ["github_repos.commits"], + "analytics.committer": ["github_repos.commits"], + "analytics.commit": 
["github_repos.commits"], + "analytics.user": ["analytics.author", "analytics.committer"], + } diff --git a/tests/fixtures/fixture_exporter.py b/tests/fixtures/fixture_exporter.py new file mode 100644 index 0000000..a9f49ca --- /dev/null +++ b/tests/fixtures/fixture_exporter.py @@ -0,0 +1,30 @@ +import pytest +import pandas as pd + + +@pytest.fixture() +def dict_as_df(): + return pd.DataFrame( + data={ + "table": [ + "analytics.author", + "analytics.commit", + "analytics.committer", + "analytics.repo", + "analytics.repo", + "analytics.repo", + "analytics.user", + "analytics.user", + ], + "dependency": [ + "github_repos.commits", + "github_repos.commits", + "github_repos.commits", + "github_repos.commits", + "github_repos.languages", + "github_repos.licenses", + "analytics.author", + "analytics.committer", + ], + } + ) diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py index 56b49c0..3b160fa 100644 --- a/tests/fixtures/fixture_extractor.py +++ b/tests/fixtures/fixture_extractor.py @@ -1,6 +1,12 @@ import pytest +@pytest.fixture() +def query_user_activity(): + with open(file="data/reporting/user_activity.sql", mode="r") as f: + return f.read() + + @pytest.fixture() def cleaned_user_activity(): return ( @@ -37,18 +43,3 @@ def extracted_reporting(): "analytics.user", ] } - - -@pytest.fixture() -def extracted_analytics(): - return { - "analytics.repo": [ - "github_repos.commits", - "github_repos.languages", - "github_repos.licenses", - ], - "analytics.author": ["github_repos.commits"], - "analytics.committer": ["github_repos.commits"], - "analytics.commit": ["github_repos.commits"], - "analytics.user": ["analytics.author", "analytics.committer"], - } diff --git a/tests/unit/test_exporter.py b/tests/unit/test_exporter.py new file mode 100644 index 0000000..c0c70fc --- /dev/null +++ b/tests/unit/test_exporter.py @@ -0,0 +1,9 @@ +import exporter +import pandas.testing as pdt + + +def test_convert_dict_to_df(extracted_analytics, 
dict_as_df): + df = exporter.convert_dict_to_df(data=extracted_analytics) + pdt.assert_frame_equal( + left=df.reset_index(drop=True), right=dict_as_df.reset_index(drop=True) + ) From 7fb551e79d9bbaa57fb82ce0732e3defb9283a0c Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 23 May 2021 21:23:35 +0100 Subject: [PATCH 2/4] feat: Separate dataset and table names from string This is so we can assign labels to nodes based on the dataset. --- exporter.py | 24 +++++++++++++++ poetry.lock | 2 +- pyproject.toml | 1 + tests/fixtures/fixture_exporter.py | 48 ++++++++++++++++++++++++++++++ tests/unit/test_exporter.py | 8 +++++ 5 files changed, 82 insertions(+), 1 deletion(-) diff --git a/exporter.py b/exporter.py index a73fd20..b6b8ec7 100644 --- a/exporter.py +++ b/exporter.py @@ -1,4 +1,6 @@ +from typing import Union import pandas as pd +import numpy as np def convert_dict_to_df(data: dict) -> pd.DataFrame: @@ -21,3 +23,25 @@ def convert_dict_to_df(data: dict) -> pd.DataFrame: data = data.sort_values(by="table") return data[["table", "dependency"]] + + +def separate_dataset_table(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame: + """ + Separates string of `dataset.table` into dataset and table name. + :param data: Dataframe with columns to separate string entries into dataset and table name. + :return: Dataframe with columns for dataset and table name. 
+ """ + if isinstance(data, pd.Series): + cols = "table" + else: + cols = data.columns + + for col in cols: + col_names = [f"{col}_dataset", f"{col}_name"] + # remove backslashes and split on period + data[col] = data[col].str.replace(pat="\\", repl="", regex=True) + data[col_names] = data[col].str.split(pat=".", n=1, expand=True) + # remove full column + data = data.drop(columns=col) + + return data diff --git a/poetry.lock b/poetry.lock index 8654be0..63c1e20 100644 --- a/poetry.lock +++ b/poetry.lock @@ -471,7 +471,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "bf0d8695b7bcb1c4144b9be6361e7cb68362475afe6f8f275f38b958e9bd4441" +content-hash = "a8b7b42d96b152e9a8fd58b71161823bc914c37ee846e2e9a1121168e874d320" [metadata.files] appdirs = [ diff --git a/pyproject.toml b/pyproject.toml index 9715158..95de0a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ moz-sql-parser = "^4.40.21126" tqdm = "^4.60.0" pytest = "^6.2.4" pandas = "^1.2.4" +numpy = "^1.20.3" [tool.poetry.dev-dependencies] diff --git a/tests/fixtures/fixture_exporter.py b/tests/fixtures/fixture_exporter.py index a9f49ca..881d8ff 100644 --- a/tests/fixtures/fixture_exporter.py +++ b/tests/fixtures/fixture_exporter.py @@ -28,3 +28,51 @@ def dict_as_df(): ], } ) + + +@pytest.fixture() +def df_separate_dataset_table(): + return pd.DataFrame( + data={ + "table_dataset": [ + "analytics", + "analytics", + "analytics", + "analytics", + "analytics", + "analytics", + "analytics", + "analytics", + ], + "table_name": [ + "author", + "commit", + "committer", + "repo", + "repo", + "repo", + "user", + "user", + ], + "dependency_dataset": [ + "github_repos", + "github_repos", + "github_repos", + "github_repos", + "github_repos", + "github_repos", + "analytics", + "analytics", + ], + "dependency_name": [ + "commits", + "commits", + "commits", + "commits", + "languages", + "licenses", + 
"author", + "committer", + ], + } + ) diff --git a/tests/unit/test_exporter.py b/tests/unit/test_exporter.py index c0c70fc..741d334 100644 --- a/tests/unit/test_exporter.py +++ b/tests/unit/test_exporter.py @@ -7,3 +7,11 @@ def test_convert_dict_to_df(extracted_analytics, dict_as_df): pdt.assert_frame_equal( left=df.reset_index(drop=True), right=dict_as_df.reset_index(drop=True) ) + + +def test_separate_dataset_table(dict_as_df, df_separate_dataset_table): + df = exporter.separate_dataset_table(data=dict_as_df) + pdt.assert_frame_equal( + left=df.reset_index(drop=True), + right=df_separate_dataset_table.reset_index(drop=True), + ) From 2a831d17cb65d43e1fdc3067d899e25c4e6722e6 Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 23 May 2021 21:30:26 +0100 Subject: [PATCH 3/4] feat: Export different csvs by dataset This is so we can uniquely identify nodes and relationships when exporting to neo4j. --- exporter.py | 44 +++++++++++++++++++++++++++++++++++++++++++- poetry.lock | 2 +- pyproject.toml | 1 - 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/exporter.py b/exporter.py index b6b8ec7..e30c341 100644 --- a/exporter.py +++ b/exporter.py @@ -1,6 +1,5 @@ from typing import Union import pandas as pd -import numpy as np def convert_dict_to_df(data: dict) -> pd.DataFrame: @@ -45,3 +44,46 @@ def separate_dataset_table(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame data = data.drop(columns=col) return data + + +def export_unique_names(data: pd.DataFrame, path_or_buf: str): + """ + Concatenates and unions a dataframe so we get unique table names. This is so we can create nodes in neo4j. + :param data: Dataframe to get the names from. + :param path_or_buf: String of the directory to store files. 
+ :return: + """ + data_table = data[["table_dataset", "table_name"]] + data_dependency = data[["dependency_dataset", "dependency_name"]] + # rename so can union + data_dependency = data_dependency.rename( + columns={"dependency_dataset": "table_dataset", "dependency_name": "table_name"} + ) + frames = [data_table, data_dependency] + data_frames = pd.concat(objs=frames, axis="index") + + for ds in data_frames["table_dataset"].unique(): + df = data_frames[data_frames["table_layer"] == ds] + df = df.drop_duplicates(subset="table_name") + df.to_csv(path_or_buf=f"{path_or_buf}/{ds}_tables.csv", index=False) + + +def export_table_dependency(data: pd.DataFrame, path_or_buf: str): + """ + Filters a dataframe by its table and dependency levels so it can be exported into neo4j. + :param data: Dataframe to filter by table and dependency. + Requires column to be called 'table_dataset' and 'dependency_dataset'. + :param path_or_buf: String of the directory to store files. + :return: + """ + for t_ds in data["table_dataset"].unique(): + mask_t_ds = data["table_dataset"] == t_ds + for d_ds in data["dependency_dataset"].unique(): + mask_d_ds = data["dependency_dataset"] == d_ds + df_out = data.loc[ + (mask_t_ds & mask_d_ds), + ] + df_out = df_out.drop(columns=["table", "dependency"]) + df_out.to_csv( + path_or_buf=f"{path_or_buf}/{t_ds}_{d_ds}_dependency.csv", index=False + ) diff --git a/poetry.lock b/poetry.lock index 63c1e20..8654be0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -471,7 +471,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "a8b7b42d96b152e9a8fd58b71161823bc914c37ee846e2e9a1121168e874d320" +content-hash = "bf0d8695b7bcb1c4144b9be6361e7cb68362475afe6f8f275f38b958e9bd4441" [metadata.files] appdirs = [ diff --git a/pyproject.toml b/pyproject.toml index 95de0a3..9715158 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,6 @@ moz-sql-parser = 
"^4.40.21126" tqdm = "^4.60.0" pytest = "^6.2.4" pandas = "^1.2.4" -numpy = "^1.20.3" [tool.poetry.dev-dependencies] From 59acff580b99fc8edb29928a577ef66aa67eb6ea Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 23 May 2021 23:28:45 +0100 Subject: [PATCH 4/4] feat: Write module to extract table and dependencies This is so we can combine all the code we've written thus far into a module that can be run from the command line. Update README.md with new instructions for extracting table dependencies. --- .gitignore | 6 ++ .pre-commit-config.yaml | 1 - README.md | 7 +- exporter.py | 3 +- extractor.py | 2 + neo4j/.gitkeep | 0 poetry.lock | 2 +- pyproject.toml | 1 + {data => sql}/analytics/author.sql | 0 {data => sql}/analytics/commit.sql | 0 {data => sql}/analytics/committer.sql | 0 {data => sql}/analytics/repo.sql | 0 {data => sql}/analytics/user.sql | 0 {data => sql}/reporting/user_activity.sql | 0 sqlquerygraph.py | 94 +++++++++++++++++++++++ tests/fixtures/fixture_extractor.py | 2 +- tests/integration/test_extractor.py | 2 +- tests/unit/test_extractor.py | 2 +- 18 files changed, 114 insertions(+), 8 deletions(-) create mode 100644 neo4j/.gitkeep rename {data => sql}/analytics/author.sql (100%) rename {data => sql}/analytics/commit.sql (100%) rename {data => sql}/analytics/committer.sql (100%) rename {data => sql}/analytics/repo.sql (100%) rename {data => sql}/analytics/user.sql (100%) rename {data => sql}/reporting/user_activity.sql (100%) create mode 100644 sqlquerygraph.py diff --git a/.gitignore b/.gitignore index abcbf77..5e6ab21 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,9 @@ # django __pycache__/ + +# data +*.csv + +# outputs +*.html diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b595f61..a791996 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,6 @@ repos: rev: 3.9.2 hooks: - id: flake8 - args: ["src"] - repo: https://github.com/psf/black rev: 21.5b1 # Replace by any tag/version: 
https://github.com/psf/black/tags hooks: diff --git a/README.md b/README.md index c759593..074cef6 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ To run the code in here, ensure your system meets the following requirements: Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here. ### Set-up -For quickstart set-up of the project, run the below in your shell: +For quickstart set-up of the project, run the below in your shell/terminal: ```shell script # 1. read project-specific environment variables direnv allow @@ -34,6 +34,11 @@ poetry install pre-commit install ``` +To then extract the tables and their dependencies from the example SQL scripts in the `sql/` directory, run the following in your shell/terminal: +```shell script +python sqlquerygraph.py -sd 'sql' -ed 'neo4j' -rd 'github_repos' 'analytics' 'reporting' +``` + ### Run neo4j graph database We use [neo4j](https://neo4j.com/) for this project to visualise the dependencies between tables. To install neo4j locally using Docker Compose, follow the below instructions: 1. 
Install Docker diff --git a/exporter.py b/exporter.py index e30c341..052fa52 100644 --- a/exporter.py +++ b/exporter.py @@ -63,7 +63,7 @@ def export_unique_names(data: pd.DataFrame, path_or_buf: str): data_frames = pd.concat(objs=frames, axis="index") for ds in data_frames["table_dataset"].unique(): - df = data_frames[data_frames["table_layer"] == ds] + df = data_frames[data_frames["table_dataset"] == ds] df = df.drop_duplicates(subset="table_name") df.to_csv(path_or_buf=f"{path_or_buf}/{ds}_tables.csv", index=False) @@ -83,7 +83,6 @@ def export_table_dependency(data: pd.DataFrame, path_or_buf: str): df_out = data.loc[ (mask_t_ds & mask_d_ds), ] - df_out = df_out.drop(columns=["table", "dependency"]) df_out.to_csv( path_or_buf=f"{path_or_buf}/{t_ds}_{d_ds}_dependency.csv", index=False ) diff --git a/extractor.py b/extractor.py index 9a4b8bf..55baed4 100644 --- a/extractor.py +++ b/extractor.py @@ -170,6 +170,8 @@ def extract_table_dependencies_from_queries( ] tables = sorted(list(set(table_from + table_join))) + if verbose: + print(f"Extracted table names are {tables}...\n") # store in dictionary dicts[f"{self.schema}.{file_name}"] = tables diff --git a/neo4j/.gitkeep b/neo4j/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/poetry.lock b/poetry.lock index 8654be0..63c1e20 100644 --- a/poetry.lock +++ b/poetry.lock @@ -471,7 +471,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "bf0d8695b7bcb1c4144b9be6361e7cb68362475afe6f8f275f38b958e9bd4441" +content-hash = "a8b7b42d96b152e9a8fd58b71161823bc914c37ee846e2e9a1121168e874d320" [metadata.files] appdirs = [ diff --git a/pyproject.toml b/pyproject.toml index 9715158..95de0a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ moz-sql-parser = "^4.40.21126" tqdm = "^4.60.0" pytest = "^6.2.4" pandas = "^1.2.4" +numpy = "^1.20.3" [tool.poetry.dev-dependencies] diff --git 
a/data/analytics/author.sql b/sql/analytics/author.sql similarity index 100% rename from data/analytics/author.sql rename to sql/analytics/author.sql diff --git a/data/analytics/commit.sql b/sql/analytics/commit.sql similarity index 100% rename from data/analytics/commit.sql rename to sql/analytics/commit.sql diff --git a/data/analytics/committer.sql b/sql/analytics/committer.sql similarity index 100% rename from data/analytics/committer.sql rename to sql/analytics/committer.sql diff --git a/data/analytics/repo.sql b/sql/analytics/repo.sql similarity index 100% rename from data/analytics/repo.sql rename to sql/analytics/repo.sql diff --git a/data/analytics/user.sql b/sql/analytics/user.sql similarity index 100% rename from data/analytics/user.sql rename to sql/analytics/user.sql diff --git a/data/reporting/user_activity.sql b/sql/reporting/user_activity.sql similarity index 100% rename from data/reporting/user_activity.sql rename to sql/reporting/user_activity.sql diff --git a/sqlquerygraph.py b/sqlquerygraph.py new file mode 100644 index 0000000..7755888 --- /dev/null +++ b/sqlquerygraph.py @@ -0,0 +1,94 @@ +import os +import argparse + +from extractor import Extractor +import exporter + +import numpy as np +import pandas as pd + + +if __name__ == """__main__""": + argp = argparse.ArgumentParser() + argp.add_argument( + "-sd", + "--script_dir", + type=str, + help="Directory where we store subdirectories of our SQL queries", + ) + argp.add_argument( + "-d", + "--sub_dir", + default=None, + type=str, + help="Subdirectories within script_dir that you want to read SQL queries from. " + "If no value is inputted, then use all subdirectories in script_dir.", + ) + argp.add_argument( + "-rd", + "--reference_datasets", + nargs="*", + type=str, + default=[], + help="Datasets that contain tables in database to look-up against. 
" + "If no values is inputted, then take datasets specified in constants.py.", + ) + argp.add_argument("-ed", "--export_dir", type=str, help="Directory to store files.") + argp.add_argument( + "-v", + "--verbose", + default=False, + type=bool, + help="Boolean to output steps taken and query after cleaning. " + "Useful if want to check where function is failing.", + ) + args = argp.parse_args() + + # initialise empty array for storing dfs + arr = np.empty(shape=(0, 2)) + + if args.sub_dir is None: + subdir = os.listdir(path=args.script_dir) + else: + subdir = args.sub_dir + print(subdir) + + for i, dataset in enumerate(subdir): + print(f"Extracting {dataset} tables and their dependencies from scripts\n") + print("*******************************************\n") + + # create text to remove + dir_report = f"{args.script_dir}/{dataset}" + remove_txt = [] + for table in os.listdir(dir_report): + table_name, _ = os.path.splitext(p=table) + remove_txt.append(f"MERGE {dataset}.{table_name} USING (") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=f"{args.script_dir}/{dataset}", schema=dataset) + table_dependencies = extractor.extract_table_dependencies_from_queries( + reference_datasets=args.reference_datasets, + str_to_remove=remove_txt, + verbose=args.verbose, + ) + print(f"Converting {dataset} dictionaries to dataframes\n") + print("*******************************************\n") + df_tables = exporter.convert_dict_to_df(data=table_dependencies) + df_tables = df_tables.to_numpy() + arr = np.concatenate((arr, df_tables), axis=0) + + print("Splitting tables from their dependencies\n") + print("*******************************************\n") + df = pd.DataFrame(data=arr, columns=["table", "dependency"]) + df = exporter.separate_dataset_table(data=df) + + print("Exporting unique table names for nodes\n") + print("*******************************************\n") + 
exporter.export_unique_names(data=df, path_or_buf=args.export_dir) + + print("Exporting table dependencies for relationships\n") + print("*******************************************\n") + exporter.export_table_dependency(data=df, path_or_buf=args.export_dir) diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py index 3b160fa..6b7d59d 100644 --- a/tests/fixtures/fixture_extractor.py +++ b/tests/fixtures/fixture_extractor.py @@ -3,7 +3,7 @@ @pytest.fixture() def query_user_activity(): - with open(file="data/reporting/user_activity.sql", mode="r") as f: + with open(file="sql/reporting/user_activity.sql", mode="r") as f: return f.read() diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py index d53357a..aedc25f 100644 --- a/tests/integration/test_extractor.py +++ b/tests/integration/test_extractor.py @@ -13,7 +13,7 @@ def test_extract_table_dependencies_from_queries( extract = [extracted_analytics, extracted_reporting] for i, schema in enumerate(schemes): - dir_report = f"data/{schema}" + dir_report = f"sql/{schema}" remove_txt = [] for table in os.listdir(dir_report): table_name, _ = os.path.splitext(p=table) diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py index 8674c60..4fa1f25 100644 --- a/tests/unit/test_extractor.py +++ b/tests/unit/test_extractor.py @@ -3,7 +3,7 @@ def test_clean_query(query_user_activity, cleaned_user_activity): schema = "reporting" - dir_report = f"data/{schema}" + dir_report = f"sql/{schema}" extractor = Extractor(script_dir=dir_report, schema=schema) txt_remove = [ f"MERGE {schema}.user_activity USING (",