Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
4 changes: 4 additions & 0 deletions .github/workflows/pytesting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,7 @@ jobs:
poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
poetry run pytest tests/unit
poetry run pytest tests/integration
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@

# environment
.env

# django
__pycache__/
38 changes: 37 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,40 @@
# sqlquerygraph
[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlquerygraph/actions) [![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) [![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlquerygraph/actions)
[![](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/downloads/)
[![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph)
[![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

Parse your SQL queries and represent their structure as a graph.

Currently, we implement the ability to represent how each of the tables in a set of SQL query scripts depends on the others.

## Requirements
To run the code in here, ensure your system meets the following requirements:
- Unix-like operating system (macOS, Linux, ...);
- [`direnv`](https://direnv.net/) installed, including shell hooks;
- [`.envrc`](https://github.com/avisionh/sqlquerygraph/blob/main/.envrc) allowed/trusted by `direnv` to
use the environment variables - see [below](#allowingtrusting-envrc);
- Python 3.8 or above; and
- [Poetry](https://python-poetry.org/docs/) installed.

Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here.

### Set-up
For quickstart set-up of the project, run the below in your shell:
```shell script
# 1. read project-specific environment variables
direnv allow

# 2. activate virtual environment and install package dependencies
poetry shell
poetry install

# 3. check adherence to good standards on every commit
pre-commit install
```

***

## Acknowledgements
This builds on the excellent [moz-sql-parser](https://github.com/mozilla/moz-sql-parser) package.
16 changes: 16 additions & 0 deletions data/analytics/author.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* Full refresh of analytics.author: flattens the nested `author` record of
   github_repos.commits into one row per commit. ON FALSE guarantees no
   source/target rows ever match, so the MERGE inserts every source row and
   deletes every pre-existing target row (truncate-and-reload pattern). */
MERGE analytics.author
USING (
SELECT
author.name AS name
,author.email AS email
,author.time_sec AS time_sec
,author.tz_offset AS tz_offset
,author.date.seconds AS date_seconds
,author.date.nanos AS date_nanos
FROM github_repos.commits
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
21 changes: 21 additions & 0 deletions data/analytics/commit.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/* Full refresh of analytics.commit: one row per commit from
   github_repos.commits, keeping flattened author/committer name and time
   fields alongside the raw commit metadata. ON FALSE means no rows ever
   match, so the MERGE inserts every source row and deletes every
   pre-existing target row (truncate-and-reload pattern). */
MERGE analytics.commit
USING (
SELECT
commit
,tree
,parent
,author.name AS author_name
,author.time_sec AS author_timesec
,committer.name AS committer_name
,committer.time_sec AS committer_time_sec
,subject
,message
,repo_name
,difference_truncated
FROM github_repos.commits
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
16 changes: 16 additions & 0 deletions data/analytics/committer.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* Full refresh of analytics.committer: flattens the nested `committer`
   record of github_repos.commits into one row per commit. ON FALSE makes
   the MERGE a truncate-and-reload: every source row is inserted and every
   pre-existing target row is deleted. Mirrors analytics/author.sql. */
MERGE analytics.committer
USING (
SELECT
committer.name AS name
,committer.email AS email
,committer.time_sec AS time_sec
,committer.tz_offset AS tz_offset
,committer.date.seconds AS date_seconds
,committer.date.nanos AS date_nanos
FROM github_repos.commits
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
20 changes: 20 additions & 0 deletions data/analytics/repo.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/* Full refresh of analytics.repo: commits left-joined to per-repo language
   and license information, keyed on repo_name. ON FALSE makes the MERGE a
   truncate-and-reload: every source row is inserted and every pre-existing
   target row is deleted. */
MERGE analytics.repo
USING (
SELECT
a.repo_name
,a.author.name AS author_name
,a.author.time_sec AS author_time_sec
,b.language.name AS language
,b.language.bytes AS repo_size
,c.license
FROM github_repos.commits AS a
LEFT JOIN github_repos.languages AS b
ON a.repo_name = b.repo_name
LEFT JOIN github_repos.licenses AS c
ON a.repo_name = c.repo_name
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
19 changes: 19 additions & 0 deletions data/analytics/user.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/* Full refresh of analytics.user: distinct (name, email) pairs drawn from
   both analytics.author and analytics.committer, tagged with their
   user_type; a person appearing in both sources gets two rows. ON FALSE
   makes the MERGE a truncate-and-reload: every source row is inserted and
   every pre-existing target row is deleted. */
MERGE analytics.user
USING (
SELECT DISTINCT
name
,email
,'author' AS user_type
FROM analytics.author
UNION
SELECT DISTINCT
name
,email
,'committer' AS user_type
FROM analytics.committer
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
63 changes: 63 additions & 0 deletions data/reporting/user_activity.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/* Full refresh of reporting.user_activity: per-user activity counts of
   type 'commit' (from analytics.commit) and 'repo' (from analytics.repo),
   plus a derived 'total' row per user summing the per-type counts.
   ON FALSE makes the MERGE a truncate-and-reload: every source row is
   inserted and every pre-existing target row is deleted. */
MERGE reporting.user_activity
USING (
WITH cte_base AS
(
/* commit activity per committer.
   NOTE(review): grouped by a.repo_name, but repo_name is not selected, so
   a committer can appear once per repo with per-repo counts — confirm
   this is intended rather than a single count per user. */
SELECT
b.name
,b.email
,'commit' AS activity_type
,COUNT(a.*) AS activity_count
FROM analytics.commit AS a
LEFT JOIN analytics.user AS b
ON a.committer_name = b.name
AND b.user_type = 'committer'
GROUP BY
b.name
,b.email
,a.repo_name

UNION

/* repo activity per author */
SELECT
a.author_name AS name
,b.email
,'repo' AS activity_type
,COUNT(a.*) AS activity_count
FROM analytics.repo AS a
LEFT JOIN analytics.user AS b
ON a.author_name = b.name
GROUP BY
a.author_name
,b.email
)

SELECT
name
,email
,activity_type
,activity_count
FROM cte_base
UNION
/* append a per-user 'total' row aggregating all activity types */
SELECT
name
,email
,activity_type
,activity_count
FROM
(
SELECT
name
,email
,'total' AS activity_type
,SUM(activity_count) AS activity_count
FROM cte_base
GROUP BY
name
,email
)
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
176 changes: 176 additions & 0 deletions extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import os
import re
from pprint import pprint
from typing import Tuple, Union

from moz_sql_parser import parse
from tqdm import tqdm


class Extractor:
    """
    Extract table names from SQL queries.

    Note: Relies on each SQL script being named the same as the table or view
    it is creating.

    :param script_dir: String of the directory where we store our SQL queries.
    :param schema: String of the dataset/schema that the tables created by the
        SQL queries belong to.
    """

    def __init__(self, script_dir: str, schema: str):
        self.script_dir = script_dir
        self.schema = schema

    def read_query(self, file: str) -> Tuple[str, str]:
        """
        Reads a SQL file in.
        Note: Relies on SQL script being named the same as table or view it is creating.

        :param file: String of the file to read query from.
        :return: Tuple of strings of the table name and SQL query from the file.
        :raises ValueError: If the file does not have a `.sql` extension.
        """
        file_name, file_extension = os.path.splitext(p=file)
        # guard clause: only .sql files contain queries we can parse
        if file_extension != ".sql":
            raise ValueError(
                f"Passed in a {file_extension} file. \n"
                f"Please pass in a .sql file instead."
            )
        with open(file=os.path.join(self.script_dir, file), mode="r") as f:
            query = f.read()
        return file_name, query

    @staticmethod
    def clean_query(query: str, str_to_remove: Union[str, list] = None) -> str:
        """
        Cleans a query so it can be parsed.

        :param query: String of the query to clean.
        :param str_to_remove: String or list of strings to remove from the query.
        :return: String of the cleaned query to parse.
        """
        # remove new lines and multiple spaces
        query = query.replace("\n", " ")
        query = re.sub(pattern=r"\s+", repl=" ", string=query)

        if str_to_remove is not None:
            # a bare string would otherwise be iterated character-by-character,
            # deleting each letter everywhere instead of the substring once
            if isinstance(str_to_remove, str):
                str_to_remove = [str_to_remove]
            for txt in str_to_remove:
                query = query.replace(txt, "")

        return query

    @staticmethod
    def parse_query(query: str, print_tree: bool = False) -> dict:
        """
        Parse a query into a JSON parse-tree.

        :param query: String of the SQL query to parse as a JSON parse-tree.
        :param print_tree: Boolean to print the JSON parse-tree.
        :return: Dictionary of the query as a JSON parse-tree.
        """
        query_json = parse(sql=query)
        if print_tree:
            pprint(object=query_json)
        return query_json

    @staticmethod
    def extract_from_json(obj: dict, key: str) -> list:
        """
        Recursively fetch values from a nested JSON.

        For our purposes, extracting where key is 'from' allows extraction of *most* table names after a `FROM` clause.
        - It does not extract the table names when the name is nested in a subquery.
        - Nor does it extract table names in `<TYPE> JOIN` clauses.
        To achieve the above two, need to extract where the key is 'value' and compare with actual table names.
        This is because the values returned when key is 'value' are table names, column names etc.
        Reference
        - https://hackersandslackers.com/extract-data-from-complex-json-python/
        :param obj: Dictionary to extract values from.
        :param key: String of the value you want to extract.
        :return: List of values for the key.
        """
        arr = []

        def extract(obj: Union[dict, list], arr: list, key: str) -> list:
            """
            Recursively search for values of key in a JSON tree.

            :param obj: Dictionary or list to extract values from.
            :param arr: List to store extracted values to.
            :param key: String of the dictionary key to extract associated value from.
            :return: List of the extracted values.
            """
            if isinstance(obj, dict):
                for k, v in obj.items():
                    # recurse into containers; only leaf values are collected
                    if isinstance(v, (dict, list)):
                        extract(obj=v, arr=arr, key=key)
                    elif k == key:
                        arr.append(v)
            elif isinstance(obj, list):
                for item in obj:
                    extract(obj=item, arr=arr, key=key)
            return arr

        return extract(obj=obj, arr=arr, key=key)

    def extract_table_dependencies_from_queries(
        self,
        reference_datasets: list,
        str_to_remove: Union[str, list] = None,
        verbose: bool = False,
    ) -> dict:
        """
        Extracts the table names and their dependencies from a set of .sql files.

        :param reference_datasets: List of datasets/schemas of the database.
        :param str_to_remove: String or list of strings to remove from each query.
        :param verbose: Boolean to output steps taken and query after cleaning. Useful for debugging.
        :return: Dictionary of tables as keys and their dependent tables as values.
        """
        queries, jsons, dicts = {}, {}, {}
        # tuple of '<dataset>.' prefixes; str.startswith accepts a tuple of options
        reference_datasets = tuple(f"{txt}." for txt in reference_datasets)
        for file in tqdm(os.listdir(path=self.script_dir)):
            if verbose:
                print(f"Reading query {file}...\n")
            file_name, query = self.read_query(file=file)
            queries[file_name] = query

            if str_to_remove is not None:
                if verbose:
                    print(
                        f"Cleaning query {file_name} by removing {str_to_remove}...\n"
                    )
                queries[file_name] = self.clean_query(
                    query=queries[file_name], str_to_remove=str_to_remove
                )

            if verbose:
                print(f"Cleaned query is {queries[file_name]}")
                print(f"Parsing query {file_name}...\n")
            jsons[file_name] = self.parse_query(
                query=queries[file_name], print_tree=verbose
            )

            if verbose:
                print(f"Extracting table names from {file_name}...\n")
            # - from: tables after 'from' clause
            #   + though sometimes keys are not 'from' so need to
            #   + look at values associated to the 'value' key
            # - value: tables after '... join' clauses
            #   + can also include tables after 'from' clause if they
            #   + are in a subquery
            table_from = self.extract_from_json(obj=jsons[file_name], key="from")

            # keep only table elements and not table aliases - as defined by period
            table_from = [txt for txt in table_from if "." in txt]
            table_value = self.extract_from_json(obj=jsons[file_name], key="value")
            # extract table values when they start with `<schema>.`
            table_join = [
                txt for txt in table_value if str(txt).startswith(reference_datasets)
            ]
            # de-duplicate and sort for a deterministic output
            tables = sorted(set(table_from + table_join))

            # store in dictionary, keyed by the fully-qualified table name
            dicts[f"{self.schema}.{file_name}"] = tables

        return dicts
Loading