From 97a34a557d2dbb305ccae284e0eb78b3dc994dba Mon Sep 17 00:00:00 2001
From: avisionh <a_vision@hotmail.co.uk>
Date: Sun, 23 May 2021 21:06:28 +0100
Subject: [PATCH 1/4] feat: Transform dictionary to dataframe

This is so we can reformat suitably for neo4j.
---
 exporter.py                         |  23 +++++
 poetry.lock                         | 141 +++++++++++++++++++++++-----
 pyproject.toml                      |   1 +
 tests/conftest.py                   |  16 +++-
 tests/fixtures/fixture_exporter.py  |  30 ++++++
 tests/fixtures/fixture_extractor.py |  21 ++---
 tests/unit/test_exporter.py         |   9 ++
 7 files changed, 200 insertions(+), 41 deletions(-)
 create mode 100644 exporter.py
 create mode 100644 tests/fixtures/fixture_exporter.py
 create mode 100644 tests/unit/test_exporter.py

diff --git a/exporter.py b/exporter.py
new file mode 100644
index 0000000..a73fd20
--- /dev/null
+++ b/exporter.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+
+def convert_dict_to_df(data: dict) -> pd.DataFrame:
+    """
+    Converts a dictionary into a dataframe, with keys and values being a column each.
+    :param data: Dictionary to convert to a dataframe.
+    :return: Dataframe where dictionary keys and values are a column.
+    """
+    data = pd.DataFrame.from_dict(data=data, orient="index").reset_index()
+    data = data.rename(columns={"index": "table"})
+    data = pd.melt(
+        frame=data,
+        id_vars="table",
+        var_name="original_column_name",
+        value_name="dependency",
+    )
+    # remove nans
+    data = data[data["dependency"].notnull()]
+    # sort values
+    data = data.sort_values(by="table")
+
+    return data[["table", "dependency"]]
diff --git a/poetry.lock b/poetry.lock
index 23fe942..8654be0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -147,17 +147,17 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
 [[package]]
-name = "mccabe"
-version = "0.6.1"
-description = "McCabe checker, plugin for flake8"
+name = "iniconfig"
+version = "1.1.1"
+description = "iniconfig: brain-dead simple config-ini parsing"
 category = "main"
 optional = false
 python-versions = "*"
 
 [[package]]
-name = "iniconfig"
-version = "1.1.1"
-description = "iniconfig: brain-dead simple config-ini parsing"
+name = "mccabe"
+version = "0.6.1"
+description = "McCabe checker, plugin for flake8"
 category = "main"
 optional = false
 python-versions = "*"
@@ -240,6 +240,14 @@ category = "main"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "numpy"
+version = "1.20.3"
+description = "NumPy is the fundamental package for array computing with Python."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
 [[package]]
 name = "packaging"
 version = "20.9"
@@ -251,6 +259,22 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 [package.dependencies]
 pyparsing = ">=2.0.2"
 
+[[package]]
+name = "pandas"
+version = "1.2.4"
+description = "Powerful data structures for data analysis, time series, and statistics"
+category = "main"
+optional = false
+python-versions = ">=3.7.1"
+
+[package.dependencies]
+numpy = ">=1.16.5"
+python-dateutil = ">=2.7.3"
+pytz = ">=2017.3"
+
+[package.extras]
+test = ["pytest (>=5.0.1)", "pytest-xdist", "hypothesis (>=3.58)"]
+
 [[package]]
 name = "pluggy"
 version = "0.13.1"
@@ -278,6 +302,14 @@ pyyaml = ">=5.1"
 toml = "*"
 virtualenv = ">=20.0.8"
 
+[[package]]
+name = "py"
+version = "1.10.0"
+description = "library with cross-python path, ini-parsing, io, code, log facilities"
+category = "main"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+
 [[package]]
 name = "pycodestyle"
 version = "2.7.0"
@@ -294,14 +326,6 @@ category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
 
-[[package]]
-name = "py"
-version = "1.10.0"
-description = "library with cross-python path, ini-parsing, io, code, log facilities"
-category = "main"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-
 [[package]]
 name = "pyparsing"
 version = "2.4.7"
@@ -331,6 +355,25 @@ toml = "*"
 [package.extras]
 testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"]
 
+[[package]]
+name = "python-dateutil"
+version = "2.8.1"
+description = "Extensions to the standard Python datetime module"
+category = "main"
+optional = false
+python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
+
+[package.dependencies]
+six = ">=1.5"
+
+[[package]]
+name = "pytz"
+version = "2021.1"
+description = "World timezone definitions, modern and historical"
+category = "main"
+optional = false
+python-versions = "*"
+
 [[package]]
 name = "pyyaml"
 version = "5.4.1"
@@ -428,7 +471,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)",
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "daf7c9934fd96f7224a177fe729ebf64c36c6dc462f13c38592ce7058f224a2d"
+content-hash = "bf0d8695b7bcb1c4144b9be6361e7cb68362475afe6f8f275f38b958e9bd4441"
 
 [metadata.files]
 appdirs = [
@@ -489,14 +532,14 @@ idna = [
     {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"},
     {file = "idna-2.10.tar.gz", hash = "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6"},
 ]
-mccabe = [
-    {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"},
-    {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
-]
 iniconfig = [
     {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
     {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
 ]
+mccabe = [
+    {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"},
+    {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"},
+]
 mo-dots = [
     {file = "mo-dots-4.22.21108.tar.gz", hash = "sha256:f9f2f9a6e44e214959aecd55857b6df487b8e86daf51334e2e982d47ea33ed23"},
 ]
@@ -519,10 +562,54 @@ nodeenv = [
     {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"},
     {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"},
 ]
+numpy = [
+    {file = "numpy-1.20.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:70eb5808127284c4e5c9e836208e09d685a7978b6a216db85960b1a112eeace8"},
+    {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6ca2b85a5997dabc38301a22ee43c82adcb53ff660b89ee88dded6b33687e1d8"},
+    {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c5bf0e132acf7557fc9bb8ded8b53bbbbea8892f3c9a1738205878ca9434206a"},
+    {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db250fd3e90117e0312b611574cd1b3f78bec046783195075cbd7ba9c3d73f16"},
+    {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:637d827248f447e63585ca3f4a7d2dfaa882e094df6cfa177cc9cf9cd6cdf6d2"},
+    {file = "numpy-1.20.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:8b7bb4b9280da3b2856cb1fc425932f46fba609819ee1c62256f61799e6a51d2"},
+    {file = "numpy-1.20.3-cp37-cp37m-win32.whl", hash = "sha256:67d44acb72c31a97a3d5d33d103ab06d8ac20770e1c5ad81bdb3f0c086a56cf6"},
+    {file = "numpy-1.20.3-cp37-cp37m-win_amd64.whl", hash = "sha256:43909c8bb289c382170e0282158a38cf306a8ad2ff6dfadc447e90f9961bef43"},
+    {file = "numpy-1.20.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f1452578d0516283c87608a5a5548b0cdde15b99650efdfd85182102ef7a7c17"},
+    {file = "numpy-1.20.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:6e51534e78d14b4a009a062641f465cfaba4fdcb046c3ac0b1f61dd97c861b1b"},
+    {file = "numpy-1.20.3-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e515c9a93aebe27166ec9593411c58494fa98e5fcc219e47260d9ab8a1cc7f9f"},
+    {file = "numpy-1.20.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1c09247ccea742525bdb5f4b5ceeacb34f95731647fe55774aa36557dbb5fa4"},
+    {file = "numpy-1.20.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:66fbc6fed94a13b9801fb70b96ff30605ab0a123e775a5e7a26938b717c5d71a"},
+    {file = "numpy-1.20.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:ea9cff01e75a956dbee133fa8e5b68f2f92175233de2f88de3a682dd94deda65"},
+    {file = "numpy-1.20.3-cp38-cp38-win32.whl", hash = "sha256:f39a995e47cb8649673cfa0579fbdd1cdd33ea497d1728a6cb194d6252268e48"},
+    {file = "numpy-1.20.3-cp38-cp38-win_amd64.whl", hash = "sha256:1676b0a292dd3c99e49305a16d7a9f42a4ab60ec522eac0d3dd20cdf362ac010"},
+    {file = "numpy-1.20.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:830b044f4e64a76ba71448fce6e604c0fc47a0e54d8f6467be23749ac2cbd2fb"},
+    {file = "numpy-1.20.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:55b745fca0a5ab738647d0e4db099bd0a23279c32b31a783ad2ccea729e632df"},
+    {file = "numpy-1.20.3-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:5d050e1e4bc9ddb8656d7b4f414557720ddcca23a5b88dd7cff65e847864c400"},
+    {file = "numpy-1.20.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9c65473ebc342715cb2d7926ff1e202c26376c0dcaaee85a1fd4b8d8c1d3b2f"},
+    {file = "numpy-1.20.3-cp39-cp39-win32.whl", hash = "sha256:16f221035e8bd19b9dc9a57159e38d2dd060b48e93e1d843c49cb370b0f415fd"},
+    {file = "numpy-1.20.3-cp39-cp39-win_amd64.whl", hash = "sha256:6690080810f77485667bfbff4f69d717c3be25e5b11bb2073e76bb3f578d99b4"},
+    {file = "numpy-1.20.3-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4e465afc3b96dbc80cf4a5273e5e2b1e3451286361b4af70ce1adb2984d392f9"},
+    {file = "numpy-1.20.3.zip", hash = "sha256:e55185e51b18d788e49fe8305fd73ef4470596b33fc2c1ceb304566b99c71a69"},
+]
 packaging = [
     {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"},
     {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"},
 ]
+pandas = [
+    {file = "pandas-1.2.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c601c6fdebc729df4438ec1f62275d6136a0dd14d332fc0e8ce3f7d2aadb4dd6"},
+    {file = "pandas-1.2.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:8d4c74177c26aadcfb4fd1de6c1c43c2bf822b3e0fc7a9b409eeaf84b3e92aaa"},
+    {file = "pandas-1.2.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:b730add5267f873b3383c18cac4df2527ac4f0f0eed1c6cf37fcb437e25cf558"},
+    {file = "pandas-1.2.4-cp37-cp37m-win32.whl", hash = "sha256:2cb7e8f4f152f27dc93f30b5c7a98f6c748601ea65da359af734dd0cf3fa733f"},
+    {file = "pandas-1.2.4-cp37-cp37m-win_amd64.whl", hash = "sha256:2111c25e69fa9365ba80bbf4f959400054b2771ac5d041ed19415a8b488dc70a"},
+    {file = "pandas-1.2.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:167693a80abc8eb28051fbd184c1b7afd13ce2c727a5af47b048f1ea3afefff4"},
+    {file = "pandas-1.2.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:612add929bf3ba9d27b436cc8853f5acc337242d6b584203f207e364bb46cb12"},
+    {file = "pandas-1.2.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:971e2a414fce20cc5331fe791153513d076814d30a60cd7348466943e6e909e4"},
+    {file = "pandas-1.2.4-cp38-cp38-win32.whl", hash = "sha256:68d7baa80c74aaacbed597265ca2308f017859123231542ff8a5266d489e1858"},
+    {file = "pandas-1.2.4-cp38-cp38-win_amd64.whl", hash = "sha256:bd659c11a4578af740782288cac141a322057a2e36920016e0fc7b25c5a4b686"},
+    {file = "pandas-1.2.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9db70ffa8b280bb4de83f9739d514cd0735825e79eef3a61d312420b9f16b758"},
+    {file = "pandas-1.2.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:298f0553fd3ba8e002c4070a723a59cdb28eda579f3e243bc2ee397773f5398b"},
+    {file = "pandas-1.2.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:52d2472acbb8a56819a87aafdb8b5b6d2b3386e15c95bde56b281882529a7ded"},
+    {file = "pandas-1.2.4-cp39-cp39-win32.whl", hash = "sha256:d0877407359811f7b853b548a614aacd7dea83b0c0c84620a9a643f180060950"},
+    {file = "pandas-1.2.4-cp39-cp39-win_amd64.whl", hash = "sha256:2b063d41803b6a19703b845609c0b700913593de067b552a8b24dd8eeb8c9895"},
+    {file = "pandas-1.2.4.tar.gz", hash = "sha256:649ecab692fade3cbfcf967ff936496b0cfba0af00a55dfaacd82bdda5cb2279"},
+]
 pluggy = [
     {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"},
     {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"},
@@ -531,6 +618,10 @@ pre-commit = [
     {file = "pre_commit-2.12.1-py2.py3-none-any.whl", hash = "sha256:70c5ec1f30406250b706eda35e868b87e3e4ba099af8787e3e8b4b01e84f4712"},
     {file = "pre_commit-2.12.1.tar.gz", hash = "sha256:900d3c7e1bf4cf0374bb2893c24c23304952181405b4d88c9c40b72bda1bb8a9"},
 ]
+py = [
+    {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
+    {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
+]
 pycodestyle = [
     {file = "pycodestyle-2.7.0-py2.py3-none-any.whl", hash = "sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068"},
     {file = "pycodestyle-2.7.0.tar.gz", hash = "sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"},
@@ -539,10 +630,6 @@ pyflakes = [
     {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"},
     {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"},
 ]
-py = [
-    {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"},
-    {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"},
-]
 pyparsing = [
     {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"},
     {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"},
@@ -551,6 +638,14 @@ pytest = [
     {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"},
     {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"},
 ]
+python-dateutil = [
+    {file = "python-dateutil-2.8.1.tar.gz", hash = "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c"},
+    {file = "python_dateutil-2.8.1-py2.py3-none-any.whl", hash = "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a"},
+]
+pytz = [
+    {file = "pytz-2021.1-py2.py3-none-any.whl", hash = "sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798"},
+    {file = "pytz-2021.1.tar.gz", hash = "sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da"},
+]
 pyyaml = [
     {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"},
     {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"},
diff --git a/pyproject.toml b/pyproject.toml
index 6ec76d9..9715158 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ flake8 = "^3.9.2"
 moz-sql-parser = "^4.40.21126"
 tqdm = "^4.60.0"
 pytest = "^6.2.4"
+pandas = "^1.2.4"
 
 [tool.poetry.dev-dependencies]
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 10f2b4a..47113dc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,10 +3,20 @@
 
 pytest_plugins = [
     "tests.fixtures.fixture_extractor",
+    "tests.fixtures.fixture_exporter",
 ]
 
 
 @pytest.fixture()
-def query_user_activity():
-    with open(file="data/reporting/user_activity.sql", mode="r") as f:
-        return f.read()
+def extracted_analytics():
+    return {
+        "analytics.repo": [
+            "github_repos.commits",
+            "github_repos.languages",
+            "github_repos.licenses",
+        ],
+        "analytics.author": ["github_repos.commits"],
+        "analytics.committer": ["github_repos.commits"],
+        "analytics.commit": ["github_repos.commits"],
+        "analytics.user": ["analytics.author", "analytics.committer"],
+    }
diff --git a/tests/fixtures/fixture_exporter.py b/tests/fixtures/fixture_exporter.py
new file mode 100644
index 0000000..a9f49ca
--- /dev/null
+++ b/tests/fixtures/fixture_exporter.py
@@ -0,0 +1,30 @@
+import pytest
+import pandas as pd
+
+
+@pytest.fixture()
+def dict_as_df():
+    return pd.DataFrame(
+        data={
+            "table": [
+                "analytics.author",
+                "analytics.commit",
+                "analytics.committer",
+                "analytics.repo",
+                "analytics.repo",
+                "analytics.repo",
+                "analytics.user",
+                "analytics.user",
+            ],
+            "dependency": [
+                "github_repos.commits",
+                "github_repos.commits",
+                "github_repos.commits",
+                "github_repos.commits",
+                "github_repos.languages",
+                "github_repos.licenses",
+                "analytics.author",
+                "analytics.committer",
+            ],
+        }
+    )
diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py
index 56b49c0..3b160fa 100644
--- a/tests/fixtures/fixture_extractor.py
+++ b/tests/fixtures/fixture_extractor.py
@@ -1,6 +1,12 @@
 import pytest
 
 
+@pytest.fixture()
+def query_user_activity():
+    with open(file="data/reporting/user_activity.sql", mode="r") as f:
+        return f.read()
+
+
 @pytest.fixture()
 def cleaned_user_activity():
     return (
@@ -37,18 +43,3 @@ def extracted_reporting():
             "analytics.user",
         ]
     }
-
-
-@pytest.fixture()
-def extracted_analytics():
-    return {
-        "analytics.repo": [
-            "github_repos.commits",
-            "github_repos.languages",
-            "github_repos.licenses",
-        ],
-        "analytics.author": ["github_repos.commits"],
-        "analytics.committer": ["github_repos.commits"],
-        "analytics.commit": ["github_repos.commits"],
-        "analytics.user": ["analytics.author", "analytics.committer"],
-    }
diff --git a/tests/unit/test_exporter.py b/tests/unit/test_exporter.py
new file mode 100644
index 0000000..c0c70fc
--- /dev/null
+++ b/tests/unit/test_exporter.py
@@ -0,0 +1,9 @@
+import exporter
+import pandas.testing as pdt
+
+
+def test_convert_dict_to_df(extracted_analytics, dict_as_df):
+    df = exporter.convert_dict_to_df(data=extracted_analytics)
+    pdt.assert_frame_equal(
+        left=df.reset_index(drop=True), right=dict_as_df.reset_index(drop=True)
+    )

From 7fb551e79d9bbaa57fb82ce0732e3defb9283a0c Mon Sep 17 00:00:00 2001
From: avisionh <a_vision@hotmail.co.uk>
Date: Sun, 23 May 2021 21:23:35 +0100
Subject: [PATCH 2/4] feat: Separate dataset and table names from string

This is so we can assign labels to nodes based on the dataset.
---
 exporter.py                        | 24 +++++++++++++++
 poetry.lock                        |  2 +-
 pyproject.toml                     |  1 +
 tests/fixtures/fixture_exporter.py | 48 ++++++++++++++++++++++++++++++
 tests/unit/test_exporter.py        |  8 +++++
 5 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/exporter.py b/exporter.py
index a73fd20..b6b8ec7 100644
--- a/exporter.py
+++ b/exporter.py
@@ -1,4 +1,6 @@
+from typing import Union
 import pandas as pd
+import numpy as np
 
 
 def convert_dict_to_df(data: dict) -> pd.DataFrame:
@@ -21,3 +23,25 @@ def convert_dict_to_df(data: dict) -> pd.DataFrame:
     data = data.sort_values(by="table")
 
     return data[["table", "dependency"]]
+
+
+def separate_dataset_table(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
+    """
+    Separates string of <dataset>.<table_name> into dataset and table name.
+    :param data: Dataframe whose columns hold '<dataset>.<table_name>' strings to separate.
+    :return: Dataframe with columns for dataset and table name.
+    """
+    if isinstance(data, pd.Series):
+        cols = "table"
+    else:
+        cols = data.columns
+
+    for col in cols:
+        col_names = [f"{col}_dataset", f"{col}_name"]
+        # remove backslashes and split on period
+        data[col] = data[col].str.replace(pat="\\", repl="", regex=True)
+        data[col_names] = data[col].str.split(pat=".", n=1, expand=True)
+        # remove full column
+        data = data.drop(columns=col)
+
+    return data
diff --git a/poetry.lock b/poetry.lock
index 8654be0..63c1e20 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -471,7 +471,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)",
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "bf0d8695b7bcb1c4144b9be6361e7cb68362475afe6f8f275f38b958e9bd4441"
+content-hash = "a8b7b42d96b152e9a8fd58b71161823bc914c37ee846e2e9a1121168e874d320"
 
 [metadata.files]
 appdirs = [
diff --git a/pyproject.toml b/pyproject.toml
index 9715158..95de0a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ moz-sql-parser = "^4.40.21126"
 tqdm = "^4.60.0"
 pytest = "^6.2.4"
 pandas = "^1.2.4"
+numpy = "^1.20.3"
 
 [tool.poetry.dev-dependencies]
 
diff --git a/tests/fixtures/fixture_exporter.py b/tests/fixtures/fixture_exporter.py
index a9f49ca..881d8ff 100644
--- a/tests/fixtures/fixture_exporter.py
+++ b/tests/fixtures/fixture_exporter.py
@@ -28,3 +28,51 @@ def dict_as_df():
             ],
         }
     )
+
+
+@pytest.fixture()
+def df_separate_dataset_table():
+    return pd.DataFrame(
+        data={
+            "table_dataset": [
+                "analytics",
+                "analytics",
+                "analytics",
+                "analytics",
+                "analytics",
+                "analytics",
+                "analytics",
+                "analytics",
+            ],
+            "table_name": [
+                "author",
+                "commit",
+                "committer",
+                "repo",
+                "repo",
+                "repo",
+                "user",
+                "user",
+            ],
+            "dependency_dataset": [
+                "github_repos",
+                "github_repos",
+                "github_repos",
+                "github_repos",
+                "github_repos",
+                "github_repos",
+                "analytics",
+                "analytics",
+            ],
+            "dependency_name": [
+                "commits",
+                "commits",
+                "commits",
+                "commits",
+                "languages",
+                "licenses",
+                "author",
+                "committer",
+            ],
+        }
+    )
diff --git a/tests/unit/test_exporter.py b/tests/unit/test_exporter.py
index c0c70fc..741d334 100644
--- a/tests/unit/test_exporter.py
+++ b/tests/unit/test_exporter.py
@@ -7,3 +7,11 @@ def test_convert_dict_to_df(extracted_analytics, dict_as_df):
     pdt.assert_frame_equal(
         left=df.reset_index(drop=True), right=dict_as_df.reset_index(drop=True)
     )
+
+
+def test_separate_dataset_table(dict_as_df, df_separate_dataset_table):
+    df = exporter.separate_dataset_table(data=dict_as_df)
+    pdt.assert_frame_equal(
+        left=df.reset_index(drop=True),
+        right=df_separate_dataset_table.reset_index(drop=True),
+    )

From 2a831d17cb65d43e1fdc3067d899e25c4e6722e6 Mon Sep 17 00:00:00 2001
From: avisionh <a_vision@hotmail.co.uk>
Date: Sun, 23 May 2021 21:30:26 +0100
Subject: [PATCH 3/4] feat: Export different csvs by dataset

This is so we can uniquely identify nodes and relationships when exporting to neo4j.
---
 exporter.py    | 44 +++++++++++++++++++++++++++++++++++++++++++-
 poetry.lock    |  2 +-
 pyproject.toml |  1 -
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/exporter.py b/exporter.py
index b6b8ec7..e30c341 100644
--- a/exporter.py
+++ b/exporter.py
@@ -1,6 +1,5 @@
 from typing import Union
 import pandas as pd
-import numpy as np
 
 
 def convert_dict_to_df(data: dict) -> pd.DataFrame:
@@ -45,3 +44,46 @@ def separate_dataset_table(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame
         data = data.drop(columns=col)
 
     return data
+
+
+def export_unique_names(data: pd.DataFrame, path_or_buf: str):
+    """
+    Concatenates and unions a dataframe so we get unique table names. This is so we can create nodes in neo4j.
+    :param data: Dataframe to get the names from.
+    :param path_or_buf: String of the directory to store files.
+    :return:
+    """
+    data_table = data[["table_dataset", "table_name"]]
+    data_dependency = data[["dependency_dataset", "dependency_name"]]
+    # rename so can union
+    data_dependency = data_dependency.rename(
+        columns={"dependency_dataset": "table_dataset", "dependency_name": "table_name"}
+    )
+    frames = [data_table, data_dependency]
+    data_frames = pd.concat(objs=frames, axis="index")
+
+    for ds in data_frames["table_dataset"].unique():
+        df = data_frames[data_frames["table_layer"] == ds]
+        df = df.drop_duplicates(subset="table_name")
+        df.to_csv(path_or_buf=f"{path_or_buf}/{ds}_tables.csv", index=False)
+
+
+def export_table_dependency(data: pd.DataFrame, path_or_buf: str):
+    """
+    Filters a dataframe by its table and dependency levels so it can be exported into neo4j.
+    :param data: Dataframe to filter by table and dependency.
+                Requires columns named 'table_dataset' and 'dependency_dataset'.
+    :param path_or_buf: String of the directory to store files.
+    :return:
+    """
+    for t_ds in data["table_dataset"].unique():
+        mask_t_ds = data["table_dataset"] == t_ds
+        for d_ds in data["dependency_dataset"].unique():
+            mask_d_ds = data["dependency_dataset"] == d_ds
+            df_out = data.loc[
+                (mask_t_ds & mask_d_ds),
+            ]
+            df_out = df_out.drop(columns=["table", "dependency"])
+            df_out.to_csv(
+                path_or_buf=f"{path_or_buf}/{t_ds}_{d_ds}_dependency.csv", index=False
+            )
diff --git a/poetry.lock b/poetry.lock
index 63c1e20..8654be0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -471,7 +471,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)",
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "a8b7b42d96b152e9a8fd58b71161823bc914c37ee846e2e9a1121168e874d320"
+content-hash = "bf0d8695b7bcb1c4144b9be6361e7cb68362475afe6f8f275f38b958e9bd4441"
 
 [metadata.files]
 appdirs = [
diff --git a/pyproject.toml b/pyproject.toml
index 95de0a3..9715158 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,6 @@ moz-sql-parser = "^4.40.21126"
 tqdm = "^4.60.0"
 pytest = "^6.2.4"
 pandas = "^1.2.4"
-numpy = "^1.20.3"
 
 [tool.poetry.dev-dependencies]
 

From 59acff580b99fc8edb29928a577ef66aa67eb6ea Mon Sep 17 00:00:00 2001
From: avisionh <a_vision@hotmail.co.uk>
Date: Sun, 23 May 2021 23:28:45 +0100
Subject: [PATCH 4/4] feat: Write module to extract table and dependencies

This is so we can combine all the code we've written thus far into a module that can be run from the command line.

Update README.md with new instructions for extracting table dependencies.
---
 .gitignore                                |  6 ++
 .pre-commit-config.yaml                   |  1 -
 README.md                                 |  7 +-
 exporter.py                               |  3 +-
 extractor.py                              |  2 +
 neo4j/.gitkeep                            |  0
 poetry.lock                               |  2 +-
 pyproject.toml                            |  1 +
 {data => sql}/analytics/author.sql        |  0
 {data => sql}/analytics/commit.sql        |  0
 {data => sql}/analytics/committer.sql     |  0
 {data => sql}/analytics/repo.sql          |  0
 {data => sql}/analytics/user.sql          |  0
 {data => sql}/reporting/user_activity.sql |  0
 sqlquerygraph.py                          | 94 +++++++++++++++++++++++
 tests/fixtures/fixture_extractor.py       |  2 +-
 tests/integration/test_extractor.py       |  2 +-
 tests/unit/test_extractor.py              |  2 +-
 18 files changed, 114 insertions(+), 8 deletions(-)
 create mode 100644 neo4j/.gitkeep
 rename {data => sql}/analytics/author.sql (100%)
 rename {data => sql}/analytics/commit.sql (100%)
 rename {data => sql}/analytics/committer.sql (100%)
 rename {data => sql}/analytics/repo.sql (100%)
 rename {data => sql}/analytics/user.sql (100%)
 rename {data => sql}/reporting/user_activity.sql (100%)
 create mode 100644 sqlquerygraph.py

diff --git a/.gitignore b/.gitignore
index abcbf77..5e6ab21 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,9 @@
 
 # django
 __pycache__/
+
+# data
+*.csv
+
+# outputs
+*.html
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b595f61..a791996 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,7 +5,6 @@ repos:
     rev: 3.9.2
     hooks:
     -   id: flake8
-        args: ["src"]
 -   repo: https://github.com/psf/black
     rev: 21.5b1 # Replace by any tag/version: https://github.com/psf/black/tags
     hooks:
diff --git a/README.md b/README.md
index c759593..074cef6 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ To run the code in here, ensure your system meets the following requirements:
 Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here.
 
 ### Set-up
-For quickstart set-up of the project, run the below in your shell:
+For quickstart set-up of the project, run the below in your shell/terminal:
 ```shell script
 # 1. read project-specific environment variables
 direnv allow
@@ -34,6 +34,11 @@ poetry install
 pre-commit install
 ```
 
+To then extract the tables and their dependencies from the example SQL scripts in the `sql/` directory, run the following in your shell/terminal:
+```shell script
+python sqlquerygraph.py -sd 'sql' -ed 'neo4j' -rd 'github_repos' 'analytics' 'reporting'
+```
+
 ### Run neo4j graph database
 We use [neo4j](https://neo4j.com/) for this project to visualise the dependencies between tables. To install neo4j locally using Docker Compose, follow the below instructions:
 1. Install Docker
diff --git a/exporter.py b/exporter.py
index e30c341..052fa52 100644
--- a/exporter.py
+++ b/exporter.py
@@ -63,7 +63,7 @@ def export_unique_names(data: pd.DataFrame, path_or_buf: str):
     data_frames = pd.concat(objs=frames, axis="index")
 
     for ds in data_frames["table_dataset"].unique():
-        df = data_frames[data_frames["table_layer"] == ds]
+        df = data_frames[data_frames["table_dataset"] == ds]
         df = df.drop_duplicates(subset="table_name")
         df.to_csv(path_or_buf=f"{path_or_buf}/{ds}_tables.csv", index=False)
 
@@ -83,7 +83,6 @@ def export_table_dependency(data: pd.DataFrame, path_or_buf: str):
             df_out = data.loc[
                 (mask_t_ds & mask_d_ds),
             ]
-            df_out = df_out.drop(columns=["table", "dependency"])
             df_out.to_csv(
                 path_or_buf=f"{path_or_buf}/{t_ds}_{d_ds}_dependency.csv", index=False
             )
diff --git a/extractor.py b/extractor.py
index 9a4b8bf..55baed4 100644
--- a/extractor.py
+++ b/extractor.py
@@ -170,6 +170,8 @@ def extract_table_dependencies_from_queries(
             ]
             tables = sorted(list(set(table_from + table_join)))
 
+            if verbose:
+                print(f"Extracted table names are {tables}...\n")
             # store in dictionary
             dicts[f"{self.schema}.{file_name}"] = tables
 
diff --git a/neo4j/.gitkeep b/neo4j/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/poetry.lock b/poetry.lock
index 8654be0..63c1e20 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -471,7 +471,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)",
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.8"
-content-hash = "bf0d8695b7bcb1c4144b9be6361e7cb68362475afe6f8f275f38b958e9bd4441"
+content-hash = "a8b7b42d96b152e9a8fd58b71161823bc914c37ee846e2e9a1121168e874d320"
 
 [metadata.files]
 appdirs = [
diff --git a/pyproject.toml b/pyproject.toml
index 9715158..95de0a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ moz-sql-parser = "^4.40.21126"
 tqdm = "^4.60.0"
 pytest = "^6.2.4"
 pandas = "^1.2.4"
+numpy = "^1.20.3"
 
 [tool.poetry.dev-dependencies]
 
diff --git a/data/analytics/author.sql b/sql/analytics/author.sql
similarity index 100%
rename from data/analytics/author.sql
rename to sql/analytics/author.sql
diff --git a/data/analytics/commit.sql b/sql/analytics/commit.sql
similarity index 100%
rename from data/analytics/commit.sql
rename to sql/analytics/commit.sql
diff --git a/data/analytics/committer.sql b/sql/analytics/committer.sql
similarity index 100%
rename from data/analytics/committer.sql
rename to sql/analytics/committer.sql
diff --git a/data/analytics/repo.sql b/sql/analytics/repo.sql
similarity index 100%
rename from data/analytics/repo.sql
rename to sql/analytics/repo.sql
diff --git a/data/analytics/user.sql b/sql/analytics/user.sql
similarity index 100%
rename from data/analytics/user.sql
rename to sql/analytics/user.sql
diff --git a/data/reporting/user_activity.sql b/sql/reporting/user_activity.sql
similarity index 100%
rename from data/reporting/user_activity.sql
rename to sql/reporting/user_activity.sql
diff --git a/sqlquerygraph.py b/sqlquerygraph.py
new file mode 100644
index 0000000..7755888
--- /dev/null
+++ b/sqlquerygraph.py
@@ -0,0 +1,129 @@
+"""Extract table dependencies from SQL scripts and export them as CSV files."""
+
+import os
+import argparse
+
+from extractor import Extractor
+import exporter
+
+import numpy as np
+import pandas as pd
+
+
+def _str_to_bool(value: str) -> bool:
+    """
+    Parses a command-line string into a boolean.
+    Needed because type=bool turns any non-empty text, even "False", into True.
+    :param value: Text supplied on the command line, e.g. "true" or "0".
+    :return: Boolean that the text stands for.
+    """
+    text = value.strip().lower()
+    if text in ("true", "t", "yes", "y", "1"):
+        return True
+    if text in ("false", "f", "no", "n", "0"):
+        return False
+    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")
+
+
+if __name__ == "__main__":
+    argp = argparse.ArgumentParser()
+    argp.add_argument(
+        "-sd",
+        "--script_dir",
+        type=str,
+        required=True,
+        help="Directory where we store subdirectories of our SQL queries",
+    )
+    argp.add_argument(
+        "-d",
+        "--sub_dir",
+        # accept several names; a bare string would be iterated character by character
+        nargs="*",
+        default=None,
+        type=str,
+        help="Subdirectories within script_dir that you want to read SQL queries from. "
+        "If no value is inputted, then use all subdirectories in script_dir.",
+    )
+    argp.add_argument(
+        "-rd",
+        "--reference_datasets",
+        nargs="*",
+        type=str,
+        default=[],
+        help="Datasets that contain tables in database to look-up against. "
+        "If no values is inputted, then take datasets specified in constants.py.",
+    )
+    argp.add_argument(
+        "-ed",
+        "--export_dir",
+        type=str,
+        required=True,
+        help="Directory to store files.",
+    )
+    argp.add_argument(
+        "-v",
+        "--verbose",
+        # bare "-v" turns verbosity on; "-v False"/"-v True" are parsed as booleans
+        nargs="?",
+        const=True,
+        default=False,
+        type=_str_to_bool,
+        help="Boolean to output steps taken and query after cleaning. "
+        "Useful if want to check where function is failing.",
+    )
+    args = argp.parse_args()
+
+    # initialise empty array for storing dfs
+    arr = np.empty(shape=(0, 2))
+
+    if not args.sub_dir:
+        # keep directories only so stray files are not treated as datasets;
+        # sorted because os.listdir returns entries in arbitrary order
+        subdir = sorted(
+            name
+            for name in os.listdir(path=args.script_dir)
+            if os.path.isdir(os.path.join(args.script_dir, name))
+        )
+    else:
+        subdir = args.sub_dir
+    print(subdir)
+
+    for dataset in subdir:
+        print(f"Extracting {dataset} tables and their dependencies from scripts\n")
+        print("*******************************************\n")
+
+        # create text to remove
+        dir_report = f"{args.script_dir}/{dataset}"
+        remove_txt = []
+        for table in os.listdir(dir_report):
+            table_name, _ = os.path.splitext(p=table)
+            remove_txt.append(f"MERGE {dataset}.{table_name} USING (")
+        remove_txt.append(
+            ") ON FALSE WHEN NOT MATCHED THEN "
+            "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN "
+            "DELETE"
+        )
+        extractor = Extractor(script_dir=dir_report, schema=dataset)
+        table_dependencies = extractor.extract_table_dependencies_from_queries(
+            reference_datasets=args.reference_datasets,
+            str_to_remove=remove_txt,
+            verbose=args.verbose,
+        )
+        print(f"Converting {dataset} dictionaries to dataframes\n")
+        print("*******************************************\n")
+        df_tables = exporter.convert_dict_to_df(data=table_dependencies)
+        df_tables = df_tables.to_numpy()
+        arr = np.concatenate((arr, df_tables), axis=0)
+
+    print("Splitting tables from their dependencies\n")
+    print("*******************************************\n")
+    df = pd.DataFrame(data=arr, columns=["table", "dependency"])
+    df = exporter.separate_dataset_table(data=df)
+
+    print("Exporting unique table names for nodes\n")
+    print("*******************************************\n")
+    exporter.export_unique_names(data=df, path_or_buf=args.export_dir)
+
+    print("Exporting table dependencies for relationships\n")
+    print("*******************************************\n")
+    exporter.export_table_dependency(data=df, path_or_buf=args.export_dir)
diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py
index 3b160fa..6b7d59d 100644
--- a/tests/fixtures/fixture_extractor.py
+++ b/tests/fixtures/fixture_extractor.py
@@ -3,7 +3,7 @@
 
 @pytest.fixture()
 def query_user_activity():
-    with open(file="data/reporting/user_activity.sql", mode="r") as f:
+    with open(file="sql/reporting/user_activity.sql", mode="r") as f:
         return f.read()
 
 
diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py
index d53357a..aedc25f 100644
--- a/tests/integration/test_extractor.py
+++ b/tests/integration/test_extractor.py
@@ -13,7 +13,7 @@ def test_extract_table_dependencies_from_queries(
     extract = [extracted_analytics, extracted_reporting]
 
     for i, schema in enumerate(schemes):
-        dir_report = f"data/{schema}"
+        dir_report = f"sql/{schema}"
         remove_txt = []
         for table in os.listdir(dir_report):
             table_name, _ = os.path.splitext(p=table)
diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py
index 8674c60..4fa1f25 100644
--- a/tests/unit/test_extractor.py
+++ b/tests/unit/test_extractor.py
@@ -3,7 +3,7 @@
 
 def test_clean_query(query_user_activity, cleaned_user_activity):
     schema = "reporting"
-    dir_report = f"data/{schema}"
+    dir_report = f"sql/{schema}"
     extractor = Extractor(script_dir=dir_report, schema=schema)
     txt_remove = [
         f"MERGE {schema}.user_activity USING (",