From 8f5606191e3be524148daf19b2b1ddca4df09fda Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:05:52 +0100 Subject: [PATCH 01/14] feat: Write method to read SQL query This is so we can start parsing and extracting dependencies from it. --- extractor.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 extractor.py diff --git a/extractor.py b/extractor.py new file mode 100644 index 0000000..e69de29 From bf2e8d88b43bbf4e2c51b1ddbea018d589f73718 Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:11:24 +0100 Subject: [PATCH 02/14] feat: Add method to remove string from query This is so we have functionality to remove strings that make the query difficult to parse. --- extractor.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/extractor.py b/extractor.py index e69de29..fc4efe1 100644 --- a/extractor.py +++ b/extractor.py @@ -0,0 +1,54 @@ +from typing import Union + +import os +import re + + +class Extractor: + """ + Extract table names from SQL queries. + + :param script_dir: String of the directory where we store our SQL queries. + :param schema: String of the dataset/schema that the SQL queries creating the table belongs to. + """ + + def __init__(self, script_dir: str, schema: str): + self.script_dir = script_dir + self.schema = schema + + def read_query(self, file: str) -> str: + """ + Reads a SQL file in. + + :param file: String of the file to read query from. + :return: String of the SQL query from the file. + """ + _, file_extension = os.path.splitext(p=file) + if file_extension == ".sql": + with open(file=os.path.join(self.script_dir, file), mode="r") as f: + query = f.read() + return query + else: + raise Exception( + f"Passed in a {file_extension} file. \n" + f"Please pass in a .sql file instead." + ) + + @staticmethod + def clean_query(query: str, str_to_remove: Union[str, list]) -> str: + """ + Cleans a query so it can be parsed. 
+ + :param query: String of the query to clean. + :param str_to_remove: String or list of strings to remove from the query. + :return: String of the cleaned query to parse. + """ + # remove new lines and multiple spaces + query = query.replace("\n", " ") + query = re.sub(pattern=r"\s+", repl=" ", string=query) + + if str_to_remove is not None: + for txt in str_to_remove: + query = query.replace(txt, "") + + return query From 0dc191b4b6d627cd7208069aa716ecfdd3df45e4 Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:17:31 +0100 Subject: [PATCH 03/14] feat: Write method to parse SQL query This is so we can get it into a format that we can extract table names from. --- extractor.py | 17 ++++++++++ poetry.lock | 90 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/extractor.py b/extractor.py index fc4efe1..aedc420 100644 --- a/extractor.py +++ b/extractor.py @@ -3,6 +3,9 @@ import os import re +from moz_sql_parser import parse +from pprint import pprint + class Extractor: """ @@ -52,3 +55,17 @@ def clean_query(query: str, str_to_remove: Union[str, list]) -> str: query = query.replace(txt, "") return query + + @staticmethod + def parse_query(query: str, print_tree: bool = False) -> dict: + """ + Parse a query into a JSON parse-tree. + + :param query: String of the SQL query to parse as a JSON parse-tree. + :param print_tree: Boolean to print the JSON parse-tree. + :return: Dictionary of the query as a JSON parse-tree. + """ + query_json = parse(sql=query) + if print_tree: + pprint(object=query_json) + return query_json diff --git a/poetry.lock b/poetry.lock index 7ae4cb6..2126a4c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -124,6 +124,76 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "mo-dots" +version = "4.22.21108" +description = "More Dots! 
Dot-access to Python dicts like Javascript" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-future = "3.147.20327" +mo-imports = "3.149.20327" + +[[package]] +name = "mo-future" +version = "3.147.20327" +description = "More future! Make Python 2/3 compatibility a bit easier" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "mo-imports" +version = "3.149.20327" +description = "More Imports! - Delayed importing" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-future = "3.147.20327" + +[[package]] +name = "mo-kwargs" +version = "4.22.21108" +description = "More KWARGS! Let call parameters override kwargs" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" + +[[package]] +name = "mo-logs" +version = "4.23.21108" +description = "More Logs! Structured Logging and Exception Handling" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" +mo-imports = "3.149.20327" +mo-kwargs = "4.22.21108" + +[[package]] +name = "moz-sql-parser" +version = "4.40.21126" +description = "Extract Parse Tree from SQL" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" +mo-logs = "4.23.21108" + [[package]] name = "nodeenv" version = "1.6.0" @@ -248,7 +318,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "09173b5e497218e0d74966bdf08a78ebbfbc7322863b44a255d4e01538efec91" +content-hash = "6dbd677bb57cbbb6b000fc84f6c0fc06daa173f8e22a50b485746b6c566e1ef6" [metadata.files] appdirs = [ @@ -301,6 +371,24 @@ mccabe = [ {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = 
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] +mo-dots = [ + {file = "mo-dots-4.22.21108.tar.gz", hash = "sha256:f9f2f9a6e44e214959aecd55857b6df487b8e86daf51334e2e982d47ea33ed23"}, +] +mo-future = [ + {file = "mo-future-3.147.20327.tar.gz", hash = "sha256:4aafcf859e4657bc11b04ee9791321e2d44702604ae01cac7c412468cb6a513f"}, +] +mo-imports = [ + {file = "mo-imports-3.149.20327.tar.gz", hash = "sha256:32e3dc720c84765224d29ed7e9d6972a369d41d756e6f26ddb5b7aa01241331d"}, +] +mo-kwargs = [ + {file = "mo-kwargs-4.22.21108.tar.gz", hash = "sha256:9c9d00ab86f1f75013193807d90cad17cbc515384cf3b17b8aff3104a300f7ce"}, +] +mo-logs = [ + {file = "mo-logs-4.23.21108.tar.gz", hash = "sha256:de4136a7ce215ecbfd7a368588be0a3f1fd8a6521dc2d4aae57cc1c3ba299aab"}, +] +moz-sql-parser = [ + {file = "moz-sql-parser-4.40.21126.tar.gz", hash = "sha256:b3d37cc8ff118d86009aa12646791549537ec0ae8ac312efd4641289c8eee080"}, +] nodeenv = [ {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, diff --git a/pyproject.toml b/pyproject.toml index d59f726..2fff1b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ pre-commit = "^2.12.1" detect-secrets = "^1.1.0" direnv = "^2020.12.3" flake8 = "^3.9.2" +moz-sql-parser = "^4.40.21126" [tool.poetry.dev-dependencies] From 54d9c4720f160e2a6951a585aaaf00a6b5f2d5be Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:24:18 +0100 Subject: [PATCH 04/14] feat: Add recursive method to extract from nested dictionary This is so we can get the equivalent value of a passed in key for a nested dictionary. 
--- extractor.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/extractor.py b/extractor.py index aedc420..204ec74 100644 --- a/extractor.py +++ b/extractor.py @@ -69,3 +69,44 @@ def parse_query(query: str, print_tree: bool = False) -> dict: if print_tree: pprint(object=query_json) return query_json + + @staticmethod + def extract_from_json(obj: dict, key: str) -> list: + """ + Recursively fetch values from a nested JSON. + + For our purposes, extract where key is 'from' allows extraction of *most* table names after a `FROM` clause. + - It does not extract the table names when the name is nested in a subquery. + - Nor does it extract table names in `JOIN` clauses. + To achieve above two, need to extract where the key is 'value' and compare with actual table names. + This is because the values returned when key is 'value' are table names, column names etc. + Reference + - https://hackersandslackers.com/extract-data-from-complex-json-python/ + :param obj: Dictionary to extract values from. + :param key: String of the value you want to extract. + :return: List of values for the key. + """ + arr = [] + + def extract(obj: Union[dict, list], arr: list, key: str) -> list: + """ + Recursively search for values of key in a JSON tree. + + :param obj: Dictionary to extract values from. + :param arr: List to store extracted values to. + :param key: String of the dictionary key to extract associated value from. + :return: List of the extracted values. 
+ """ + if isinstance(obj, dict): + for k, v in obj.items(): + if isinstance(v, (dict, list)): + extract(obj=v, arr=arr, key=key) + elif k == key: + arr.append(v) + elif isinstance(obj, list): + for item in obj: + extract(obj=item, arr=arr, key=key) + return arr + + values = extract(obj=obj, arr=arr, key=key) + return values From fa1cd69b50300b5f8ba0b0fe3504701b750100ff Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:27:54 +0100 Subject: [PATCH 05/14] docs: Instruct how to set-up project environment This is so people can replicate the work here. --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index dd4e328..1020862 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,32 @@ [![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlqueryraph/actions) [![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) [![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) Parse your SQL queries and represent their structure as a graph. + +## Requirements +To run the code in here, ensure your system meets the following requirements: +- Unix-like operating system (macOS, Linux, ...); +- [`direnv`](https://direnv.net/) installed, including shell hooks; +- [`.envrc`](https://github.com/avisionh/sqlquerygraph/blob/main/.envrc) allowed/trusted by `direnv` to + use the environment variables - see [below](#allowingtrusting-envrc); +- Python 3.8 or above; and +- [Poetry](https://python-poetry.org/docs/) installed. + +Parse your SQL queries and represent their structure as a graph. 
+ +Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here. + +### Set-up +For quickstart set-up of the project, run the below in your shell: +```shell script +# 1. read project-specific environment variables +direnv allow + +# 2. activate virtual environment and install package dependencies +poetry shell +poetry install + +# 3. check adherence to good standards on every commit +pre-commit install +``` + +*** From 386ec6a9b51015f523199a7287db54246aa691c5 Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:51:36 +0100 Subject: [PATCH 06/14] feat: Write method to extract tables and dependencies from scripts This is so we can get it in a dictionary for visualising. --- README.md | 2 ++ extractor.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ poetry.lock | 19 ++++++++++++++- pyproject.toml | 1 + 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1020862..96d5ed9 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ Parse your SQL queries and represent their structure as a graph. +Currently, we implement the ability to represent how each of the tables in a set of SQL query scripts depend on each other. 
+ ## Requirements To run the code in here, ensure your system meets the following requirements: - Unix-like operating system (macOS, Linux, ...); diff --git a/extractor.py b/extractor.py index 204ec74..8b3ce5e 100644 --- a/extractor.py +++ b/extractor.py @@ -2,6 +2,7 @@ import os import re +from tqdm import tqdm from moz_sql_parser import parse from pprint import pprint @@ -110,3 +111,66 @@ def extract(obj: Union[dict, list], arr: list, key: str) -> list: values = extract(obj=obj, arr=arr, key=key) return values + + def extract_table_dependencies_from_queries( + self, + reference_datasets: list, + str_to_remove: Union[str, list] = None, + verbose: bool = False, + ) -> dict: + """ + Extracts the table names and their dependencies from a set of .sql files. + + :param reference_datasets: List of datasets/schema that the tables belong to. + :param str_to_remove: String or list of strings to remove from the query. + :param verbose: Boolean to output steps taken and query after cleaning. Useful for debugging. + :return: Dictionary of tables as keys and their dependent tables as values. + """ + queries, jsons, dicts = {}, {}, {} + reference_datasets = tuple([f"{txt}." 
for txt in reference_datasets]) + for file_name in tqdm(os.listdir(path=self.script_dir)): + + if verbose: + print(f"Reading query {file_name}...\n") + query = self.read_query(file=file_name) + queries[file_name] = query + + if str_to_remove is not None: + if verbose: + print( + f"Cleaning query {file_name} by removing {str_to_remove}...\n" + ) + queries[file_name] = self.clean_query( + query=queries[file_name], str_to_remove=str_to_remove + ) + + if verbose: + print(f"Cleaned query is {queries[file_name]}") + print(f"Parsing query {file_name}...\n") + jsons[file_name] = self.parse_query( + query=queries[file_name], print_tree=verbose + ) + + if verbose: + print(f"Extracting table names from {file_name}...\n") + # - from: tables after 'from' clause + # + though sometimes keys are not 'from' so need to + # + look at values associated to the 'value' key + # - value: tables after '... join' clauses + # + can also include tables after 'from' clause if they + # + are in a subquery + table_from = self.extract_from_json(obj=jsons[file_name], key="from") + + # keep only table elements and not table aliases - as defined by period + table_from = [txt for txt in table_from if "." 
in txt] + table_value = self.extract_from_json(obj=jsons[file_name], key="value") + # extract table values when it starts with `.` + table_join = [ + txt for txt in table_value if str(txt).startswith(reference_datasets) + ] + tables = list(set(table_from + table_join)) + + # store in dictionary + dicts[f"{self.schema}.{file_name}"] = tables + + return dicts diff --git a/poetry.lock b/poetry.lock index 2126a4c..cce25f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -276,6 +276,19 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "tqdm" +version = "4.60.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +telegram = ["requests"] + [[package]] name = "urllib3" version = "1.26.4" @@ -318,7 +331,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "6dbd677bb57cbbb6b000fc84f6c0fc06daa173f8e22a50b485746b6c566e1ef6" +content-hash = "339e8cc655e66a1e2c0f5b773e6608ae3323f3af7a95ea3c6f94d8dacd086f20" [metadata.files] appdirs = [ @@ -448,6 +461,10 @@ toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +tqdm = [ + {file = "tqdm-4.60.0-py2.py3-none-any.whl", hash = "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3"}, + {file = "tqdm-4.60.0.tar.gz", hash = "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae"}, +] urllib3 = [ {file = "urllib3-1.26.4-py2.py3-none-any.whl", hash = "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df"}, {file = "urllib3-1.26.4.tar.gz", 
hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"}, diff --git a/pyproject.toml b/pyproject.toml index 2fff1b8..f7a8f0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ detect-secrets = "^1.1.0" direnv = "^2020.12.3" flake8 = "^3.9.2" moz-sql-parser = "^4.40.21126" +tqdm = "^4.60.0" [tool.poetry.dev-dependencies] From 04fd17ca3350c585eecd3370f5eb07abd430b5b0 Mon Sep 17 00:00:00 2001 From: avisionh Date: Tue, 18 May 2021 23:01:18 +0100 Subject: [PATCH 07/14] data: Add scripts to create tables in BQ This is so we can have a layer of tables derived off another layer. --- data/analytics/author.sql | 15 +++++++++++++++ data/analytics/commit.sql | 20 ++++++++++++++++++++ data/analytics/committer.sql | 15 +++++++++++++++ data/analytics/repo.sql | 19 +++++++++++++++++++ data/analytics/user.sql | 18 ++++++++++++++++++ tests/conftest.py | 0 tests/fixtures/fixture_extractor.py | 0 tests/unit/test_extractor.py | 0 8 files changed, 87 insertions(+) create mode 100644 data/analytics/author.sql create mode 100644 data/analytics/commit.sql create mode 100644 data/analytics/committer.sql create mode 100644 data/analytics/repo.sql create mode 100644 data/analytics/user.sql create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/fixture_extractor.py create mode 100644 tests/unit/test_extractor.py diff --git a/data/analytics/author.sql b/data/analytics/author.sql new file mode 100644 index 0000000..de4024a --- /dev/null +++ b/data/analytics/author.sql @@ -0,0 +1,15 @@ +MERGE analytics.author +USING ( + SELECT + author.name AS name + ,author.email AS email + ,author.time_sec AS time_sec + ,author.tz_offset AS tz_offset + ,author.date.seconds AS date_seconds + ,author.date.nanos AS date_nanos + FROM `bigquery-public-data.github_repos.commits`) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/commit.sql b/data/analytics/commit.sql new file mode 100644 index 
0000000..f5015ae --- /dev/null +++ b/data/analytics/commit.sql @@ -0,0 +1,20 @@ +MERGE analytics.commit +USING ( + SELECT + commit + ,tree + ,parent + ,author.name AS author_name + ,author.time_sec AS author_timesec + ,committer.name AS committer_name + ,committer.time_sec AS committer_time_sec + ,subject + ,message + ,repo_name + ,difference_truncated + FROM `bigquery-public-data.github_repos.commits`) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/committer.sql b/data/analytics/committer.sql new file mode 100644 index 0000000..635c197 --- /dev/null +++ b/data/analytics/committer.sql @@ -0,0 +1,15 @@ +MERGE analytics.committer +USING ( + SELECT + committer.name AS name + ,committer.email AS email + ,committer.time_sec AS time_sec + ,committer.tz_offset AS tz_offset + ,committer.date.seconds AS date_seconds + ,committer.date.nanos AS date_nanos + FROM `bigquery-public-data.github_repos.commits`) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/repo.sql b/data/analytics/repo.sql new file mode 100644 index 0000000..2dab4e2 --- /dev/null +++ b/data/analytics/repo.sql @@ -0,0 +1,19 @@ +MERGE analytics.repo +USING ( + SELECT + a.repo_name + ,a.author.name + ,a.author.time_sec + ,b.language.name AS language + ,b.language.bytes AS repo_size + ,c.license + FROM `bigquery-public-data.github_repos.commits` AS a + LEFT JOIN `bigquery-public-data.github_repos.languages` AS b + ON a.repo_name = b.repo_name + LEFT JOIN `bigquery-public-data.github_repos.licenses` AS c + ON a.repo_name = c.repo_name) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/user.sql b/data/analytics/user.sql new file mode 100644 index 0000000..fee3f32 --- /dev/null +++ b/data/analytics/user.sql @@ -0,0 +1,18 @@ +MERGE analytics.user +USING ( + SELECT DISTINCT + name + ,email + ,'author' AS user_type + 
FROM analytics.author + UNION + SELECT DISTINCT + name + ,email + ,'committer' AS user_type + FROM analytics.committer) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py new file mode 100644 index 0000000..e69de29 From 079d553eb3488f260748d42cff58e9da989bc9ea Mon Sep 17 00:00:00 2001 From: avisionh Date: Wed, 19 May 2021 08:36:19 +0100 Subject: [PATCH 08/14] data: Create user activity table This is so we have a complicated query which uses ctes and subqueries, which will be good for testing. --- data/analytics/author.sql | 3 +- data/analytics/commit.sql | 3 +- data/analytics/committer.sql | 3 +- data/analytics/repo.sql | 7 ++-- data/analytics/user.sql | 3 +- data/reporting/user_activity.sql | 63 ++++++++++++++++++++++++++++++++ 6 files changed, 75 insertions(+), 7 deletions(-) create mode 100644 data/reporting/user_activity.sql diff --git a/data/analytics/author.sql b/data/analytics/author.sql index de4024a..9d8b7af 100644 --- a/data/analytics/author.sql +++ b/data/analytics/author.sql @@ -7,7 +7,8 @@ USING ( ,author.tz_offset AS tz_offset ,author.date.seconds AS date_seconds ,author.date.nanos AS date_nanos - FROM `bigquery-public-data.github_repos.commits`) + FROM `bigquery-public-data.github_repos.commits` +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git a/data/analytics/commit.sql b/data/analytics/commit.sql index f5015ae..eda3a63 100644 --- a/data/analytics/commit.sql +++ b/data/analytics/commit.sql @@ -12,7 +12,8 @@ USING ( ,message ,repo_name ,difference_truncated - FROM `bigquery-public-data.github_repos.commits`) + FROM `bigquery-public-data.github_repos.commits` +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git 
a/data/analytics/committer.sql b/data/analytics/committer.sql index 635c197..6c76d31 100644 --- a/data/analytics/committer.sql +++ b/data/analytics/committer.sql @@ -7,7 +7,8 @@ USING ( ,committer.tz_offset AS tz_offset ,committer.date.seconds AS date_seconds ,committer.date.nanos AS date_nanos - FROM `bigquery-public-data.github_repos.commits`) + FROM `bigquery-public-data.github_repos.commits` +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git a/data/analytics/repo.sql b/data/analytics/repo.sql index 2dab4e2..7269807 100644 --- a/data/analytics/repo.sql +++ b/data/analytics/repo.sql @@ -2,8 +2,8 @@ MERGE analytics.repo USING ( SELECT a.repo_name - ,a.author.name - ,a.author.time_sec + ,a.author.name AS author_name + ,a.author.time_sec AS author_time_sec ,b.language.name AS language ,b.language.bytes AS repo_size ,c.license @@ -11,7 +11,8 @@ USING ( LEFT JOIN `bigquery-public-data.github_repos.languages` AS b ON a.repo_name = b.repo_name LEFT JOIN `bigquery-public-data.github_repos.licenses` AS c - ON a.repo_name = c.repo_name) + ON a.repo_name = c.repo_name +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git a/data/analytics/user.sql b/data/analytics/user.sql index fee3f32..accd863 100644 --- a/data/analytics/user.sql +++ b/data/analytics/user.sql @@ -10,7 +10,8 @@ USING ( name ,email ,'committer' AS user_type - FROM analytics.committer) + FROM analytics.committer +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git a/data/reporting/user_activity.sql b/data/reporting/user_activity.sql new file mode 100644 index 0000000..19748c2 --- /dev/null +++ b/data/reporting/user_activity.sql @@ -0,0 +1,63 @@ +MERGE reporting.user_activity +USING ( + WITH cte_base AS + ( + SELECT + b.name + ,b.email + ,'commit' AS activity_type + ,COUNT(a.*) AS activity_count + FROM analytics.commit AS a + LEFT JOIN analytics.user AS b + ON a.committer_name = b.name + AND b.user_type = 'committer' + GROUP BY + b.name + ,b.email + ,a.repo_name + + UNION + + SELECT + a.author_name 
AS name + ,b.email + ,'repo' AS activity_type + ,COUNT(a.*) AS activity_count + FROM analytics.repo AS a + LEFT JOIN analytics.user AS b + ON a.author_name = b.name + GROUP BY + a.author_name + ,b.email + ) + + SELECT + name + ,email + ,activity_type + ,activity_count + FROM cte_base + UNION + SELECT + name + ,email + ,activity_type + ,activity_count + FROM + ( + SELECT + name + ,email + ,'total' AS activity_type + ,SUM(activity_count) AS activity_count + FROM cte_base + GROUP BY + name + ,email + ) +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; From 5ebe7511b77aedad33b952213b452a3791a8f3cd Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 08:49:21 +0100 Subject: [PATCH 09/14] tests: Add for main class method This is to ensure it works. --- .gitignore | 3 + extractor.py | 14 +- poetry.lock | 135 +++++++++++++++++- pyproject.toml | 1 + tests/{unit/test_extractor.py => __init__.py} | 0 tests/conftest.py | 12 ++ tests/fixtures/fixture_extractor.py | 12 ++ tests/integration/test_extractor.py | 26 ++++ 8 files changed, 195 insertions(+), 8 deletions(-) rename tests/{unit/test_extractor.py => __init__.py} (100%) create mode 100644 tests/integration/test_extractor.py diff --git a/.gitignore b/.gitignore index 0cb7c0a..abcbf77 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ # environment .env + +# django +__pycache__/ diff --git a/extractor.py b/extractor.py index 8b3ce5e..44640bb 100644 --- a/extractor.py +++ b/extractor.py @@ -20,18 +20,19 @@ def __init__(self, script_dir: str, schema: str): self.script_dir = script_dir self.schema = schema - def read_query(self, file: str) -> str: + def read_query(self, file: str) -> (str, str): """ Reads a SQL file in. + Note: Relies on SQL script being named the same as table or View it is creating. :param file: String of the file to read query from. - :return: String of the SQL query from the file. 
+ :return: Tuple of strings of the table name and SQL query from the file. """ - _, file_extension = os.path.splitext(p=file) + file_name, file_extension = os.path.splitext(p=file) if file_extension == ".sql": with open(file=os.path.join(self.script_dir, file), mode="r") as f: query = f.read() - return query + return file_name, query else: raise Exception( f"Passed in a {file_extension} file. \n" @@ -129,10 +130,9 @@ def extract_table_dependencies_from_queries( queries, jsons, dicts = {}, {}, {} reference_datasets = tuple([f"{txt}." for txt in reference_datasets]) for file_name in tqdm(os.listdir(path=self.script_dir)): - if verbose: print(f"Reading query {file_name}...\n") - query = self.read_query(file=file_name) + file_name, query = self.read_query(file=file_name) queries[file_name] = query if str_to_remove is not None: @@ -168,7 +168,7 @@ def extract_table_dependencies_from_queries( table_join = [ txt for txt in table_value if str(txt).startswith(reference_datasets) ] - tables = list(set(table_from + table_join)) + tables = sorted(list(set(table_from + table_join))) # store in dictionary dicts[f"{self.schema}.{file_name}"] = tables diff --git a/poetry.lock b/poetry.lock index cce25f4..23fe942 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6,6 +6,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "atomicwrites" +version = "1.4.0" +description = "Atomic file writes." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.2.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] + [[package]] name = "certifi" version = "2020.12.5" @@ -30,6 +52,14 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "colorama" +version = "0.4.4" +description = "Cross-platform colored terminal text." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "detect-secrets" version = "1.1.0" @@ -124,6 +154,14 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "mo-dots" version = "4.22.21108" @@ -202,6 +240,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "packaging" +version = "20.9" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +pyparsing = ">=2.0.2" + +[[package]] +name = "pluggy" +version = "0.13.1" +description = "plugin and hook calling mechanisms for python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +dev = ["pre-commit", "tox"] + [[package]] name = "pre-commit" version = "2.12.1" @@ -234,6 +294,43 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "py" +version = "1.10.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyparsing" +version = "2.4.7" +description = "Python parsing module" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "pytest" +version = "6.2.4" +description = "pytest: simple powerful testing with Python" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = 
"sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<1.0.0a1" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + [[package]] name = "pyyaml" version = "5.4.1" @@ -331,13 +428,21 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "339e8cc655e66a1e2c0f5b773e6608ae3323f3af7a95ea3c6f94d8dacd086f20" +content-hash = "daf7c9934fd96f7224a177fe729ebf64c36c6dc462f13c38592ce7058f224a2d" [metadata.files] appdirs = [ {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, ] +atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, + {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, +] certifi = [ {file = "certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"}, {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"}, @@ -350,6 +455,10 @@ chardet = [ {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, ] +colorama = [ + {file = 
"colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] detect-secrets = [ {file = "detect_secrets-1.1.0-py2.py3-none-any.whl", hash = "sha256:be8cca3dc65f6fd637f5dec9f583f1cf4a680dc1a580b3d2e65a5ac7a277456a"}, {file = "detect_secrets-1.1.0.tar.gz", hash = "sha256:68250b31bc108f665f05f0ecfb34f92423280e48e65adbb887fdf721ed909627"}, @@ -384,6 +493,10 @@ mccabe = [ {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] mo-dots = [ {file = "mo-dots-4.22.21108.tar.gz", hash = "sha256:f9f2f9a6e44e214959aecd55857b6df487b8e86daf51334e2e982d47ea33ed23"}, ] @@ -406,6 +519,14 @@ nodeenv = [ {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, ] +packaging = [ + {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, + {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, +] +pluggy = [ + {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, + {file = "pluggy-0.13.1.tar.gz", hash = 
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, +] pre-commit = [ {file = "pre_commit-2.12.1-py2.py3-none-any.whl", hash = "sha256:70c5ec1f30406250b706eda35e868b87e3e4ba099af8787e3e8b4b01e84f4712"}, {file = "pre_commit-2.12.1.tar.gz", hash = "sha256:900d3c7e1bf4cf0374bb2893c24c23304952181405b4d88c9c40b72bda1bb8a9"}, @@ -418,6 +539,18 @@ pyflakes = [ {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] +py = [ + {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, + {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, +] +pyparsing = [ + {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, + {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, +] +pytest = [ + {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, + {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, +] pyyaml = [ {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"}, {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"}, diff --git a/pyproject.toml b/pyproject.toml index f7a8f0d..6ec76d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ direnv = "^2020.12.3" flake8 = "^3.9.2" moz-sql-parser = "^4.40.21126" tqdm = "^4.60.0" +pytest = "^6.2.4" [tool.poetry.dev-dependencies] diff --git 
a/tests/unit/test_extractor.py b/tests/__init__.py similarity index 100% rename from tests/unit/test_extractor.py rename to tests/__init__.py diff --git a/tests/conftest.py b/tests/conftest.py index e69de29..10f2b4a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest + + +pytest_plugins = [ + "tests.fixtures.fixture_extractor", +] + + +@pytest.fixture() +def query_user_activity(): + with open(file="data/reporting/user_activity.sql", mode="r") as f: + return f.read() diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py index e69de29..8c6330e 100644 --- a/tests/fixtures/fixture_extractor.py +++ b/tests/fixtures/fixture_extractor.py @@ -0,0 +1,12 @@ +import pytest + + +@pytest.fixture() +def extracted_user_activity(): + return { + "reporting.user_activity": [ + "analytics.commit", + "analytics.repo", + "analytics.user", + ] + } diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py new file mode 100644 index 0000000..1399a56 --- /dev/null +++ b/tests/integration/test_extractor.py @@ -0,0 +1,26 @@ +import pytest +import os +from extractor import Extractor + + +# run multiple times to ensure value ordering is preserved +# if not preserved, then test will fail +@pytest.mark.parametrize("execution_number", range(3)) +def test_create_query_removal_text(execution_number, extracted_user_activity): + schema = "reporting" + dir_report = "data/reporting" + remove_txt = [] + for table in os.listdir(dir_report): + table_name, _ = os.path.splitext(p=table) + remove_txt.append(f"MERGE {schema}.{table_name} USING (") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=dir_report, schema="reporting") + output = extractor.extract_table_dependencies_from_queries( + reference_datasets=["reporting", "analytics", "github_repos"], + str_to_remove=remove_txt, + ) + assert output == 
extracted_user_activity From 0f6af375290a0440aec4ea1d64fd75bba22d14b8 Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 09:02:05 +0100 Subject: [PATCH 10/14] fix: Avoid repeating appending same text to list in test This is to improve efficiency. Also make documentation of reference_datasets argument clearer. --- extractor.py | 2 +- tests/integration/test_extractor.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/extractor.py b/extractor.py index 44640bb..cda05cf 100644 --- a/extractor.py +++ b/extractor.py @@ -122,7 +122,7 @@ def extract_table_dependencies_from_queries( """ Extracts the table names and their dependencies from a set of .sql files. - :param reference_datasets: List of datasets/schema that the tables belong to. + :param reference_datasets: List of datasets/schema of database. :param str_to_remove: String or list of strings to remove from the query. :param verbose: Boolean to output steps taken and query after cleaning. Useful for debugging. :return: Dictionary of tables as keys and their dependent tables as values. 
diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py index 1399a56..007e1a3 100644 --- a/tests/integration/test_extractor.py +++ b/tests/integration/test_extractor.py @@ -13,12 +13,12 @@ def test_create_query_removal_text(execution_number, extracted_user_activity): for table in os.listdir(dir_report): table_name, _ = os.path.splitext(p=table) remove_txt.append(f"MERGE {schema}.{table_name} USING (") - remove_txt.append( - ") ON FALSE WHEN NOT MATCHED THEN " - "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " - "DELETE" - ) - extractor = Extractor(script_dir=dir_report, schema="reporting") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=dir_report, schema=schema) output = extractor.extract_table_dependencies_from_queries( reference_datasets=["reporting", "analytics", "github_repos"], str_to_remove=remove_txt, From b4e3b3883c01dd6d3974ad3f218b5c411bfef1ac Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 21:54:32 +0100 Subject: [PATCH 11/14] feat: Test analytics dataset scripts This is so we can ensure class method works on a directory of SQL scripts. 
--- data/analytics/author.sql | 2 +- data/analytics/commit.sql | 2 +- data/analytics/committer.sql | 2 +- data/analytics/repo.sql | 6 ++-- tests/fixtures/fixture_extractor.py | 44 ++++++++++++++++++++++++++++- tests/integration/test_extractor.py | 41 +++++++++++++++------------ 6 files changed, 72 insertions(+), 25 deletions(-) diff --git a/data/analytics/author.sql b/data/analytics/author.sql index 9d8b7af..052593d 100644 --- a/data/analytics/author.sql +++ b/data/analytics/author.sql @@ -7,7 +7,7 @@ USING ( ,author.tz_offset AS tz_offset ,author.date.seconds AS date_seconds ,author.date.nanos AS date_nanos - FROM `bigquery-public-data.github_repos.commits` + FROM github_repos.commits ) ON FALSE WHEN NOT MATCHED THEN diff --git a/data/analytics/commit.sql b/data/analytics/commit.sql index eda3a63..0e15249 100644 --- a/data/analytics/commit.sql +++ b/data/analytics/commit.sql @@ -12,7 +12,7 @@ USING ( ,message ,repo_name ,difference_truncated - FROM `bigquery-public-data.github_repos.commits` + FROM github_repos.commits ) ON FALSE WHEN NOT MATCHED THEN diff --git a/data/analytics/committer.sql b/data/analytics/committer.sql index 6c76d31..9c9c1e7 100644 --- a/data/analytics/committer.sql +++ b/data/analytics/committer.sql @@ -7,7 +7,7 @@ USING ( ,committer.tz_offset AS tz_offset ,committer.date.seconds AS date_seconds ,committer.date.nanos AS date_nanos - FROM `bigquery-public-data.github_repos.commits` + FROM github_repos.commits ) ON FALSE WHEN NOT MATCHED THEN diff --git a/data/analytics/repo.sql b/data/analytics/repo.sql index 7269807..0750069 100644 --- a/data/analytics/repo.sql +++ b/data/analytics/repo.sql @@ -7,10 +7,10 @@ USING ( ,b.language.name AS language ,b.language.bytes AS repo_size ,c.license - FROM `bigquery-public-data.github_repos.commits` AS a - LEFT JOIN `bigquery-public-data.github_repos.languages` AS b + FROM github_repos.commits AS a + LEFT JOIN github_repos.languages AS b ON a.repo_name = b.repo_name - LEFT JOIN 
`bigquery-public-data.github_repos.licenses` AS c + LEFT JOIN github_repos.licenses AS c ON a.repo_name = c.repo_name ) ON FALSE diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py index 8c6330e..665b84e 100644 --- a/tests/fixtures/fixture_extractor.py +++ b/tests/fixtures/fixture_extractor.py @@ -2,7 +2,34 @@ @pytest.fixture() -def extracted_user_activity(): +def cleaned_user_activity(): + return ( + "WITH cte_base AS ( " + "SELECT b.name ,b.email ,'commit' AS activity_type ,COUNT(a.*) AS activity_count " + "FROM analytics.commit AS a " + "LEFT JOIN analytics.user AS b " + "ON a.committer_name = b.name " + "AND b.user_type = 'committer' " + "GROUP BY b.name ,b.email ,a.repo_name " + "UNION " + "SELECT a.author_name AS name ,b.email ,'repo' AS activity_type ,COUNT(a.*) AS activity_count " + "FROM analytics.repo AS a " + "LEFT JOIN analytics.user AS b " + "ON a.author_name = b.name " + "GROUP BY a.author_name ,b.email ) " + "SELECT name ,email ,activity_type ,activity_count " + "FROM cte_base " + "UNION " + "SELECT name ,email ,activity_type ,activity_count " + "FROM ( " + "SELECT name ,email ,'total' AS activity_type ,SUM(activity_count) AS activity_count " + "FROM cte_base " + "GROUP BY name ,email ) ;" + ) + + +@pytest.fixture() +def extracted_reporting(): return { "reporting.user_activity": [ "analytics.commit", @@ -10,3 +37,18 @@ def extracted_user_activity(): "analytics.user", ] } + + +@pytest.fixture() +def extracted_analytics(): + return { + "analytics.repo": [ + "github_repos.commits", + "github_repos.languages", + "github_repos.licenses", + ], + "analytics.author": ["github_repos.commits"], + "analytics.committer": ["github_repos.commits"], + "analytics.commit": ["github_repos.commits"], + "analytics.user": ["analytics.author", "analytics.committer"], + } diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py index 007e1a3..d53357a 100644 --- a/tests/integration/test_extractor.py +++ 
b/tests/integration/test_extractor.py @@ -6,21 +6,26 @@ # run multiple times to ensure value ordering is preserved # if not preserved, then test will fail @pytest.mark.parametrize("execution_number", range(3)) -def test_create_query_removal_text(execution_number, extracted_user_activity): - schema = "reporting" - dir_report = "data/reporting" - remove_txt = [] - for table in os.listdir(dir_report): - table_name, _ = os.path.splitext(p=table) - remove_txt.append(f"MERGE {schema}.{table_name} USING (") - remove_txt.append( - ") ON FALSE WHEN NOT MATCHED THEN " - "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " - "DELETE" - ) - extractor = Extractor(script_dir=dir_report, schema=schema) - output = extractor.extract_table_dependencies_from_queries( - reference_datasets=["reporting", "analytics", "github_repos"], - str_to_remove=remove_txt, - ) - assert output == extracted_user_activity +def test_extract_table_dependencies_from_queries( + execution_number, extracted_reporting, extracted_analytics +): + schemes = ["analytics", "reporting"] + extract = [extracted_analytics, extracted_reporting] + + for i, schema in enumerate(schemes): + dir_report = f"data/{schema}" + remove_txt = [] + for table in os.listdir(dir_report): + table_name, _ = os.path.splitext(p=table) + remove_txt.append(f"MERGE {schema}.{table_name} USING (") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=dir_report, schema=schema) + output = extractor.extract_table_dependencies_from_queries( + reference_datasets=["reporting", "analytics", "github_repos"], + str_to_remove=remove_txt, + ) + assert output == extract[i] From fb3e7992ada7fb7eac80030255aef264fe3c051d Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 22:06:14 +0100 Subject: [PATCH 12/14] test: Add unit-test for clean_query method This is so we can check this works also. 
--- extractor.py | 2 +- tests/fixtures/fixture_extractor.py | 4 ++-- tests/unit/test_extractor.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tests/unit/test_extractor.py diff --git a/extractor.py b/extractor.py index cda05cf..9a4b8bf 100644 --- a/extractor.py +++ b/extractor.py @@ -40,7 +40,7 @@ def read_query(self, file: str) -> (str, str): ) @staticmethod - def clean_query(query: str, str_to_remove: Union[str, list]) -> str: + def clean_query(query: str, str_to_remove: Union[str, list] = None) -> str: """ Cleans a query so it can be parsed. diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py index 665b84e..56b49c0 100644 --- a/tests/fixtures/fixture_extractor.py +++ b/tests/fixtures/fixture_extractor.py @@ -4,7 +4,7 @@ @pytest.fixture() def cleaned_user_activity(): return ( - "WITH cte_base AS ( " + " WITH cte_base AS ( " "SELECT b.name ,b.email ,'commit' AS activity_type ,COUNT(a.*) AS activity_count " "FROM analytics.commit AS a " "LEFT JOIN analytics.user AS b " @@ -24,7 +24,7 @@ def cleaned_user_activity(): "FROM ( " "SELECT name ,email ,'total' AS activity_type ,SUM(activity_count) AS activity_count " "FROM cte_base " - "GROUP BY name ,email ) ;" + "GROUP BY name ,email ) ; " ) diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py new file mode 100644 index 0000000..8674c60 --- /dev/null +++ b/tests/unit/test_extractor.py @@ -0,0 +1,17 @@ +from extractor import Extractor + + +def test_clean_query(query_user_activity, cleaned_user_activity): + schema = "reporting" + dir_report = f"data/{schema}" + extractor = Extractor(script_dir=dir_report, schema=schema) + txt_remove = [ + f"MERGE {schema}.user_activity USING (", + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE", + ] + cleaned_query = extractor.clean_query( + query=query_user_activity, str_to_remove=txt_remove + ) + assert cleaned_query == 
cleaned_user_activity From 7a236ed71dba8b7d708cdf36c155919862c184cc Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 23:38:29 +0100 Subject: [PATCH 13/14] test: Add pytest in CI pipeline This is so it can be run automatically. --- .github/workflows/pytesting.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pytesting.yml b/.github/workflows/pytesting.yml index 81bc57a..5090408 100644 --- a/.github/workflows/pytesting.yml +++ b/.github/workflows/pytesting.yml @@ -32,3 +32,7 @@ jobs: poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + poetry run pytest tests/unit + poetry run pytest tests/integration From 0d4881da4ff25e6ab222c70c38e8c81b9860cae3 Mon Sep 17 00:00:00 2001 From: avisionh Date: Fri, 21 May 2021 00:20:49 +0100 Subject: [PATCH 14/14] docs: Add acknowledgements to moz-sql-parser This is to show what it builds on. Include Python badge also and move ISSUES_TEMPLATE/ to .github/.
--- .../ISSUE_TEMPLATE}/bug_report.md | 0 .../ISSUE_TEMPLATE}/feature_request.md | 0 README.md | 13 +++++++++---- 3 files changed, 9 insertions(+), 4 deletions(-) rename {ISSUE_TEMPLATE => .github/ISSUE_TEMPLATE}/bug_report.md (100%) rename {ISSUE_TEMPLATE => .github/ISSUE_TEMPLATE}/feature_request.md (100%) diff --git a/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md similarity index 100% rename from ISSUE_TEMPLATE/bug_report.md rename to .github/ISSUE_TEMPLATE/bug_report.md diff --git a/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md similarity index 100% rename from ISSUE_TEMPLATE/feature_request.md rename to .github/ISSUE_TEMPLATE/feature_request.md diff --git a/README.md b/README.md index 96d5ed9..4a4ab24 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ # sqlquerygraph -[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlqueryraph/actions) [![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) [![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlquerygraph/actions) +[![](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/downloads/) +[![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) +[![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 
Parse your SQL queries and represent their structure as a graph. -Currently, we implement the ability of represent how each of the tables in a set of SQL query scripts depend on each other. +Currently, we implement the ability to represent how each of the tables in a set of SQL query scripts depends on the others. ## Requirements To run the code in here, ensure your system meets the following requirements: @@ -14,8 +18,6 @@ To run the code in here, ensure your system meets the following requirements: - Python 3.8 or above; and - [Poetry](https://python-poetry.org/docs/) installed. -Parse your SQL queries and represent their structure as a graph. - Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here. ### Set-up @@ -33,3 +35,6 @@ pre-commit install ``` *** + +## Acknowledgements +This builds on the excellent [moz-sql-parser](https://github.com/mozilla/moz-sql-parser) package.