diff --git a/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md similarity index 100% rename from ISSUE_TEMPLATE/bug_report.md rename to .github/ISSUE_TEMPLATE/bug_report.md diff --git a/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md similarity index 100% rename from ISSUE_TEMPLATE/feature_request.md rename to .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/workflows/pytesting.yml b/.github/workflows/pytesting.yml index 81bc57a..5090408 100644 --- a/.github/workflows/pytesting.yml +++ b/.github/workflows/pytesting.yml @@ -32,3 +32,7 @@ jobs: poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + poetry run pytest tests/unit + poetry run pytest tests/integration diff --git a/.gitignore b/.gitignore index 0cb7c0a..abcbf77 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ # environment .env + +# python bytecode cache +__pycache__/ diff --git a/README.md b/README.md index dd4e328..4a4ab24 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,40 @@ # sqlquerygraph -[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlqueryraph/actions) [![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) [![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlquerygraph/actions) 
+[![](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/downloads/) +[![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) +[![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) Parse your SQL queries and represent their structure as a graph. + +Currently, we implement the ability to represent how each of the tables in a set of SQL query scripts depends on the others. + +## Requirements +To run the code in here, ensure your system meets the following requirements: +- Unix-like operating system (macOS, Linux, ...); +- [`direnv`](https://direnv.net/) installed, including shell hooks; +- [`.envrc`](https://github.com/avisionh/sqlquerygraph/blob/main/.envrc) allowed/trusted by `direnv` to + use the environment variables - see [below](#allowingtrusting-envrc); +- Python 3.8 or above; and +- [Poetry](https://python-poetry.org/docs/) installed. + +Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here. + +### Set-up +For quickstart set-up of the project, run the below in your shell: +```shell script +# 1. read project-specific environment variables +direnv allow + +# 2. activate virtual environment and install package dependencies +poetry shell +poetry install + +# 3. check adherence to good standards on every commit +pre-commit install +``` + +*** + +## Acknowledgements +This builds on the excellent [moz-sql-parser](https://github.com/mozilla/moz-sql-parser) package. 
diff --git a/data/analytics/author.sql b/data/analytics/author.sql new file mode 100644 index 0000000..052593d --- /dev/null +++ b/data/analytics/author.sql @@ -0,0 +1,16 @@ +MERGE analytics.author +USING ( + SELECT + author.name AS name + ,author.email AS email + ,author.time_sec AS time_sec + ,author.tz_offset AS tz_offset + ,author.date.seconds AS date_seconds + ,author.date.nanos AS date_nanos + FROM github_repos.commits +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/commit.sql b/data/analytics/commit.sql new file mode 100644 index 0000000..0e15249 --- /dev/null +++ b/data/analytics/commit.sql @@ -0,0 +1,21 @@ +MERGE analytics.commit +USING ( + SELECT + commit + ,tree + ,parent + ,author.name AS author_name + ,author.time_sec AS author_timesec + ,committer.name AS committer_name + ,committer.time_sec AS committer_time_sec + ,subject + ,message + ,repo_name + ,difference_truncated + FROM github_repos.commits +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/committer.sql b/data/analytics/committer.sql new file mode 100644 index 0000000..9c9c1e7 --- /dev/null +++ b/data/analytics/committer.sql @@ -0,0 +1,16 @@ +MERGE analytics.committer +USING ( + SELECT + committer.name AS name + ,committer.email AS email + ,committer.time_sec AS time_sec + ,committer.tz_offset AS tz_offset + ,committer.date.seconds AS date_seconds + ,committer.date.nanos AS date_nanos + FROM github_repos.commits +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/repo.sql b/data/analytics/repo.sql new file mode 100644 index 0000000..0750069 --- /dev/null +++ b/data/analytics/repo.sql @@ -0,0 +1,20 @@ +MERGE analytics.repo +USING ( + SELECT + a.repo_name + ,a.author.name AS author_name + ,a.author.time_sec AS author_time_sec + ,b.language.name AS language + ,b.language.bytes AS repo_size 
+ ,c.license + FROM github_repos.commits AS a + LEFT JOIN github_repos.languages AS b + ON a.repo_name = b.repo_name + LEFT JOIN github_repos.licenses AS c + ON a.repo_name = c.repo_name +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/user.sql b/data/analytics/user.sql new file mode 100644 index 0000000..accd863 --- /dev/null +++ b/data/analytics/user.sql @@ -0,0 +1,19 @@ +MERGE analytics.user +USING ( + SELECT DISTINCT + name + ,email + ,'author' AS user_type + FROM analytics.author + UNION + SELECT DISTINCT + name + ,email + ,'committer' AS user_type + FROM analytics.committer +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/reporting/user_activity.sql b/data/reporting/user_activity.sql new file mode 100644 index 0000000..19748c2 --- /dev/null +++ b/data/reporting/user_activity.sql @@ -0,0 +1,63 @@ +MERGE reporting.user_activity +USING ( + WITH cte_base AS + ( + SELECT + b.name + ,b.email + ,'commit' AS activity_type + ,COUNT(a.*) AS activity_count + FROM analytics.commit AS a + LEFT JOIN analytics.user AS b + ON a.committer_name = b.name + AND b.user_type = 'committer' + GROUP BY + b.name + ,b.email + ,a.repo_name + + UNION + + SELECT + a.author_name AS name + ,b.email + ,'repo' AS activity_type + ,COUNT(a.*) AS activity_count + FROM analytics.repo AS a + LEFT JOIN analytics.user AS b + ON a.author_name = b.name + GROUP BY + a.author_name + ,b.email + ) + + SELECT + name + ,email + ,activity_type + ,activity_count + FROM cte_base + UNION + SELECT + name + ,email + ,activity_type + ,activity_count + FROM + ( + SELECT + name + ,email + ,'total' AS activity_type + ,SUM(activity_count) AS activity_count + FROM cte_base + GROUP BY + name + ,email + ) +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/extractor.py b/extractor.py new file mode 100644 index 0000000..9a4b8bf --- 
/dev/null +++ b/extractor.py @@ -0,0 +1,176 @@ +from typing import Union + +import os +import re +from tqdm import tqdm + +from moz_sql_parser import parse +from pprint import pprint + + +class Extractor: + """ + Extract table names from SQL queries. + + :param script_dir: String of the directory where we store our SQL queries. + :param schema: String of the dataset/schema that the SQL queries creating the table belongs to. + """ + + def __init__(self, script_dir: str, schema: str): + self.script_dir = script_dir + self.schema = schema + + def read_query(self, file: str) -> (str, str): + """ + Reads a SQL file in. + Note: Relies on SQL script being named the same as the table or view it is creating. + + :param file: String of the file to read query from. + :return: Tuple of strings of the table name and SQL query from the file. + """ + file_name, file_extension = os.path.splitext(p=file) + if file_extension == ".sql": + with open(file=os.path.join(self.script_dir, file), mode="r") as f: + query = f.read() + return file_name, query + else: + raise Exception( + f"Passed in a {file_extension} file. \n" + f"Please pass in a .sql file instead." + ) + + @staticmethod + def clean_query(query: str, str_to_remove: Union[str, list] = None) -> str: + """ + Cleans a query so it can be parsed. + + :param query: String of the query to clean. + :param str_to_remove: String or list of strings to remove from the query. + :return: String of the cleaned query to parse. + """ + # remove new lines and multiple spaces + query = query.replace("\n", " ") + query = re.sub(pattern=r"\s+", repl=" ", string=query) + + if str_to_remove is not None: + for txt in str_to_remove: + query = query.replace(txt, "") + + return query + + @staticmethod + def parse_query(query: str, print_tree: bool = False) -> dict: + """ + Parse a query into a JSON parse-tree. + + :param query: String of the SQL query to parse as a JSON parse-tree. + :param print_tree: Boolean to print the JSON parse-tree. 
+ :return: Dictionary of the query as a JSON parse-tree. + """ + query_json = parse(sql=query) + if print_tree: + pprint(object=query_json) + return query_json + + @staticmethod + def extract_from_json(obj: dict, key: str) -> list: + """ + Recursively fetch values from a nested JSON. + + For our purposes, extracting where the key is 'from' allows extraction of *most* table names after a `FROM` clause. + - It does not extract the table names when the name is nested in a subquery. + - Nor does it extract table names in `JOIN` clauses. + To achieve above two, need to extract where the key is 'value' and compare with actual table names. + This is because the values returned when key is 'value' are table names, column names etc. + Reference + - https://hackersandslackers.com/extract-data-from-complex-json-python/ + :param obj: Dictionary to extract values from. + :param key: String of the value you want to extract. + :return: List of values for the key. + """ + arr = [] + + def extract(obj: Union[dict, list], arr: list, key: str) -> list: + """ + Recursively search for values of key in a JSON tree. + + :param obj: Dictionary to extract values from. + :param arr: List to store extracted values to. + :param key: String of the dictionary key to extract associated value from. + :return: List of the extracted values. + """ + if isinstance(obj, dict): + for k, v in obj.items(): + if isinstance(v, (dict, list)): + extract(obj=v, arr=arr, key=key) + elif k == key: + arr.append(v) + elif isinstance(obj, list): + for item in obj: + extract(obj=item, arr=arr, key=key) + return arr + + values = extract(obj=obj, arr=arr, key=key) + return values + + def extract_table_dependencies_from_queries( + self, + reference_datasets: list, + str_to_remove: Union[str, list] = None, + verbose: bool = False, + ) -> dict: + """ + Extracts the table names and their dependencies from a set of .sql files. + + :param reference_datasets: List of datasets/schema of database. 
+ :param str_to_remove: String or list of strings to remove from the query. + :param verbose: Boolean to output steps taken and query after cleaning. Useful for debugging. + :return: Dictionary of tables as keys and their dependent tables as values. + """ + queries, jsons, dicts = {}, {}, {} + reference_datasets = tuple([f"{txt}." for txt in reference_datasets]) + for file_name in tqdm(os.listdir(path=self.script_dir)): + if verbose: + print(f"Reading query {file_name}...\n") + file_name, query = self.read_query(file=file_name) + queries[file_name] = query + + if str_to_remove is not None: + if verbose: + print( + f"Cleaning query {file_name} by removing {str_to_remove}...\n" + ) + queries[file_name] = self.clean_query( + query=queries[file_name], str_to_remove=str_to_remove + ) + + if verbose: + print(f"Cleaned query is {queries[file_name]}") + print(f"Parsing query {file_name}...\n") + jsons[file_name] = self.parse_query( + query=queries[file_name], print_tree=verbose + ) + + if verbose: + print(f"Extracting table names from {file_name}...\n") + # - from: tables after 'from' clause + # + though sometimes keys are not 'from' so need to + # + look at values associated to the 'value' key + # - value: tables after '... join' clauses + # + can also include tables after 'from' clause if they + # + are in a subquery + table_from = self.extract_from_json(obj=jsons[file_name], key="from") + + # keep only table elements and not table aliases - as defined by period + table_from = [txt for txt in table_from if "." 
in txt] + table_value = self.extract_from_json(obj=jsons[file_name], key="value") + # extract table values when it starts with `.` + table_join = [ + txt for txt in table_value if str(txt).startswith(reference_datasets) + ] + tables = sorted(list(set(table_from + table_join))) + + # store in dictionary + dicts[f"{self.schema}.{file_name}"] = tables + + return dicts diff --git a/poetry.lock b/poetry.lock index 7ae4cb6..23fe942 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6,6 +6,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "atomicwrites" +version = "1.4.0" +description = "Atomic file writes." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.2.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] + [[package]] name = "certifi" version = "2020.12.5" @@ -30,6 +52,14 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "colorama" +version = "0.4.4" +description = "Cross-platform colored terminal text." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "detect-secrets" version = "1.1.0" @@ -124,6 +154,84 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "mo-dots" +version = "4.22.21108" +description = "More Dots! Dot-access to Python dicts like Javascript" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-future = "3.147.20327" +mo-imports = "3.149.20327" + +[[package]] +name = "mo-future" +version = "3.147.20327" +description = "More future! Make Python 2/3 compatibility a bit easier" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "mo-imports" +version = "3.149.20327" +description = "More Imports! - Delayed importing" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-future = "3.147.20327" + +[[package]] +name = "mo-kwargs" +version = "4.22.21108" +description = "More KWARGS! Let call parameters override kwargs" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" + +[[package]] +name = "mo-logs" +version = "4.23.21108" +description = "More Logs! 
Structured Logging and Exception Handling" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" +mo-imports = "3.149.20327" +mo-kwargs = "4.22.21108" + +[[package]] +name = "moz-sql-parser" +version = "4.40.21126" +description = "Extract Parse Tree from SQL" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" +mo-logs = "4.23.21108" + [[package]] name = "nodeenv" version = "1.6.0" @@ -132,6 +240,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "packaging" +version = "20.9" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +pyparsing = ">=2.0.2" + +[[package]] +name = "pluggy" +version = "0.13.1" +description = "plugin and hook calling mechanisms for python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +dev = ["pre-commit", "tox"] + [[package]] name = "pre-commit" version = "2.12.1" @@ -164,6 +294,43 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "py" +version = "1.10.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyparsing" +version = "2.4.7" +description = "Python parsing module" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "pytest" +version = "6.2.4" +description = "pytest: simple powerful testing with Python" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == 
\"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<1.0.0a1" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + [[package]] name = "pyyaml" version = "5.4.1" @@ -206,6 +373,19 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "tqdm" +version = "4.60.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +telegram = ["requests"] + [[package]] name = "urllib3" version = "1.26.4" @@ -248,13 +428,21 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "09173b5e497218e0d74966bdf08a78ebbfbc7322863b44a255d4e01538efec91" +content-hash = "daf7c9934fd96f7224a177fe729ebf64c36c6dc462f13c38592ce7058f224a2d" [metadata.files] appdirs = [ {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, ] +atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, + {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, +] certifi = [ {file = 
"certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"}, {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"}, @@ -267,6 +455,10 @@ chardet = [ {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, ] +colorama = [ + {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] detect-secrets = [ {file = "detect_secrets-1.1.0-py2.py3-none-any.whl", hash = "sha256:be8cca3dc65f6fd637f5dec9f583f1cf4a680dc1a580b3d2e65a5ac7a277456a"}, {file = "detect_secrets-1.1.0.tar.gz", hash = "sha256:68250b31bc108f665f05f0ecfb34f92423280e48e65adbb887fdf721ed909627"}, @@ -301,10 +493,40 @@ mccabe = [ {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +mo-dots = [ + {file = "mo-dots-4.22.21108.tar.gz", hash = "sha256:f9f2f9a6e44e214959aecd55857b6df487b8e86daf51334e2e982d47ea33ed23"}, +] +mo-future = [ + {file = "mo-future-3.147.20327.tar.gz", hash = "sha256:4aafcf859e4657bc11b04ee9791321e2d44702604ae01cac7c412468cb6a513f"}, +] +mo-imports = [ + {file = "mo-imports-3.149.20327.tar.gz", hash = 
"sha256:32e3dc720c84765224d29ed7e9d6972a369d41d756e6f26ddb5b7aa01241331d"}, +] +mo-kwargs = [ + {file = "mo-kwargs-4.22.21108.tar.gz", hash = "sha256:9c9d00ab86f1f75013193807d90cad17cbc515384cf3b17b8aff3104a300f7ce"}, +] +mo-logs = [ + {file = "mo-logs-4.23.21108.tar.gz", hash = "sha256:de4136a7ce215ecbfd7a368588be0a3f1fd8a6521dc2d4aae57cc1c3ba299aab"}, +] +moz-sql-parser = [ + {file = "moz-sql-parser-4.40.21126.tar.gz", hash = "sha256:b3d37cc8ff118d86009aa12646791549537ec0ae8ac312efd4641289c8eee080"}, +] nodeenv = [ {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, ] +packaging = [ + {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, + {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, +] +pluggy = [ + {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, + {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, +] pre-commit = [ {file = "pre_commit-2.12.1-py2.py3-none-any.whl", hash = "sha256:70c5ec1f30406250b706eda35e868b87e3e4ba099af8787e3e8b4b01e84f4712"}, {file = "pre_commit-2.12.1.tar.gz", hash = "sha256:900d3c7e1bf4cf0374bb2893c24c23304952181405b4d88c9c40b72bda1bb8a9"}, @@ -317,6 +539,18 @@ pyflakes = [ {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] +py = [ + {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, + {file = 
"py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, +] +pyparsing = [ + {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, + {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, +] +pytest = [ + {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, + {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, +] pyyaml = [ {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"}, {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"}, @@ -360,6 +594,10 @@ toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +tqdm = [ + {file = "tqdm-4.60.0-py2.py3-none-any.whl", hash = "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3"}, + {file = "tqdm-4.60.0.tar.gz", hash = "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae"}, +] urllib3 = [ {file = "urllib3-1.26.4-py2.py3-none-any.whl", hash = "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df"}, {file = "urllib3-1.26.4.tar.gz", hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"}, diff --git a/pyproject.toml b/pyproject.toml index d59f726..6ec76d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,9 @@ pre-commit = "^2.12.1" detect-secrets = "^1.1.0" direnv = "^2020.12.3" flake8 = "^3.9.2" +moz-sql-parser = "^4.40.21126" +tqdm = "^4.60.0" +pytest = "^6.2.4" 
[tool.poetry.dev-dependencies] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..10f2b4a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest + + +pytest_plugins = [ + "tests.fixtures.fixture_extractor", +] + + +@pytest.fixture() +def query_user_activity(): + with open(file="data/reporting/user_activity.sql", mode="r") as f: + return f.read() diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py new file mode 100644 index 0000000..56b49c0 --- /dev/null +++ b/tests/fixtures/fixture_extractor.py @@ -0,0 +1,54 @@ +import pytest + + +@pytest.fixture() +def cleaned_user_activity(): + return ( + " WITH cte_base AS ( " + "SELECT b.name ,b.email ,'commit' AS activity_type ,COUNT(a.*) AS activity_count " + "FROM analytics.commit AS a " + "LEFT JOIN analytics.user AS b " + "ON a.committer_name = b.name " + "AND b.user_type = 'committer' " + "GROUP BY b.name ,b.email ,a.repo_name " + "UNION " + "SELECT a.author_name AS name ,b.email ,'repo' AS activity_type ,COUNT(a.*) AS activity_count " + "FROM analytics.repo AS a " + "LEFT JOIN analytics.user AS b " + "ON a.author_name = b.name " + "GROUP BY a.author_name ,b.email ) " + "SELECT name ,email ,activity_type ,activity_count " + "FROM cte_base " + "UNION " + "SELECT name ,email ,activity_type ,activity_count " + "FROM ( " + "SELECT name ,email ,'total' AS activity_type ,SUM(activity_count) AS activity_count " + "FROM cte_base " + "GROUP BY name ,email ) ; " + ) + + +@pytest.fixture() +def extracted_reporting(): + return { + "reporting.user_activity": [ + "analytics.commit", + "analytics.repo", + "analytics.user", + ] + } + + +@pytest.fixture() +def extracted_analytics(): + return { + "analytics.repo": [ + "github_repos.commits", + "github_repos.languages", + "github_repos.licenses", + ], + "analytics.author": ["github_repos.commits"], + 
"analytics.committer": ["github_repos.commits"], + "analytics.commit": ["github_repos.commits"], + "analytics.user": ["analytics.author", "analytics.committer"], + } diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py new file mode 100644 index 0000000..d53357a --- /dev/null +++ b/tests/integration/test_extractor.py @@ -0,0 +1,31 @@ +import pytest +import os +from extractor import Extractor + + +# run multiple times to ensure value ordering is preserved +# if not preserved, then test will fail +@pytest.mark.parametrize("execution_number", range(3)) +def test_extract_table_dependencies_from_queries( + execution_number, extracted_reporting, extracted_analytics +): + schemes = ["analytics", "reporting"] + extract = [extracted_analytics, extracted_reporting] + + for i, schema in enumerate(schemes): + dir_report = f"data/{schema}" + remove_txt = [] + for table in os.listdir(dir_report): + table_name, _ = os.path.splitext(p=table) + remove_txt.append(f"MERGE {schema}.{table_name} USING (") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=dir_report, schema=schema) + output = extractor.extract_table_dependencies_from_queries( + reference_datasets=["reporting", "analytics", "github_repos"], + str_to_remove=remove_txt, + ) + assert output == extract[i] diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py new file mode 100644 index 0000000..8674c60 --- /dev/null +++ b/tests/unit/test_extractor.py @@ -0,0 +1,17 @@ +from extractor import Extractor + + +def test_clean_query(query_user_activity, cleaned_user_activity): + schema = "reporting" + dir_report = f"data/{schema}" + extractor = Extractor(script_dir=dir_report, schema=schema) + txt_remove = [ + f"MERGE {schema}.user_activity USING (", + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE", + ] + cleaned_query = 
extractor.clean_query( + query=query_user_activity, str_to_remove=txt_remove + ) + assert cleaned_query == cleaned_user_activity