From 8f5606191e3be524148daf19b2b1ddca4df09fda Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:05:52 +0100 Subject: [PATCH 01/14] feat: Write method to read SQL query This is so we can start parsing and extracting dependencies from it. --- extractor.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 extractor.py diff --git a/extractor.py b/extractor.py new file mode 100644 index 0000000..e69de29 From bf2e8d88b43bbf4e2c51b1ddbea018d589f73718 Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:11:24 +0100 Subject: [PATCH 02/14] feat: Add method to remove string from query This is so we have functionality to remove strings that make the query difficult to parse. --- extractor.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/extractor.py b/extractor.py index e69de29..fc4efe1 100644 --- a/extractor.py +++ b/extractor.py @@ -0,0 +1,54 @@ +from typing import Union + +import os +import re + + +class Extractor: + """ + Extract table names from SQL queries. + + :param script_dir: String of the directory where we store our SQL queries. + :param schema: String of the dataset/schema that the SQL queries creating the table belongs to. + """ + + def __init__(self, script_dir: str, schema: str): + self.script_dir = script_dir + self.schema = schema + + def read_query(self, file: str) -> str: + """ + Reads a SQL file in. + + :param file: String of the file to read query from. + :return: String of the SQL query from the file. + """ + _, file_extension = os.path.splitext(p=file) + if file_extension == ".sql": + with open(file=os.path.join(self.script_dir, file), mode="r") as f: + query = f.read() + return query + else: + raise Exception( + f"Passed in a {file_extension} file. \n" + f"Please pass in a .sql file instead." + ) + + @staticmethod + def clean_query(query: str, str_to_remove: Union[str, list]) -> str: + """ + Cleans a query so it can be parsed. 
+ + :param query: String of the query to clean. + :param str_to_remove: String or list of strings to remove from the query. + :return: String of the cleaned query to parse. + """ + # remove new lines and multiple spaces + query = query.replace("\n", " ") + query = re.sub(pattern=r"\s+", repl=" ", string=query) + + if str_to_remove is not None: + for txt in str_to_remove: + query = query.replace(txt, "") + + return query From 0dc191b4b6d627cd7208069aa716ecfdd3df45e4 Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:17:31 +0100 Subject: [PATCH 03/14] feat: Write method to parse SQL query This is so we can get it into a format that we can extract table names from. --- extractor.py | 17 ++++++++++ poetry.lock | 90 +++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 1 + 3 files changed, 107 insertions(+), 1 deletion(-) diff --git a/extractor.py b/extractor.py index fc4efe1..aedc420 100644 --- a/extractor.py +++ b/extractor.py @@ -3,6 +3,9 @@ import os import re +from moz_sql_parser import parse +from pprint import pprint + class Extractor: """ @@ -52,3 +55,17 @@ def clean_query(query: str, str_to_remove: Union[str, list]) -> str: query = query.replace(txt, "") return query + + @staticmethod + def parse_query(query: str, print_tree: bool = False) -> dict: + """ + Parse a query into a JSON parse-tree. + + :param query: String of the SQL query to parse as a JSON parse-tree. + :param print_tree: Boolean to print the JSON parse-tree. + :return: Dictionary of the query as a JSON parse-tree. + """ + query_json = parse(sql=query) + if print_tree: + pprint(object=query_json) + return query_json diff --git a/poetry.lock b/poetry.lock index 7ae4cb6..2126a4c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -124,6 +124,76 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "mo-dots" +version = "4.22.21108" +description = "More Dots! 
Dot-access to Python dicts like Javascript" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-future = "3.147.20327" +mo-imports = "3.149.20327" + +[[package]] +name = "mo-future" +version = "3.147.20327" +description = "More future! Make Python 2/3 compatibility a bit easier" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "mo-imports" +version = "3.149.20327" +description = "More Imports! - Delayed importing" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-future = "3.147.20327" + +[[package]] +name = "mo-kwargs" +version = "4.22.21108" +description = "More KWARGS! Let call parameters override kwargs" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" + +[[package]] +name = "mo-logs" +version = "4.23.21108" +description = "More Logs! Structured Logging and Exception Handling" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" +mo-imports = "3.149.20327" +mo-kwargs = "4.22.21108" + +[[package]] +name = "moz-sql-parser" +version = "4.40.21126" +description = "Extract Parse Tree from SQL" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" +mo-logs = "4.23.21108" + [[package]] name = "nodeenv" version = "1.6.0" @@ -248,7 +318,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "09173b5e497218e0d74966bdf08a78ebbfbc7322863b44a255d4e01538efec91" +content-hash = "6dbd677bb57cbbb6b000fc84f6c0fc06daa173f8e22a50b485746b6c566e1ef6" [metadata.files] appdirs = [ @@ -301,6 +371,24 @@ mccabe = [ {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = 
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] +mo-dots = [ + {file = "mo-dots-4.22.21108.tar.gz", hash = "sha256:f9f2f9a6e44e214959aecd55857b6df487b8e86daf51334e2e982d47ea33ed23"}, +] +mo-future = [ + {file = "mo-future-3.147.20327.tar.gz", hash = "sha256:4aafcf859e4657bc11b04ee9791321e2d44702604ae01cac7c412468cb6a513f"}, +] +mo-imports = [ + {file = "mo-imports-3.149.20327.tar.gz", hash = "sha256:32e3dc720c84765224d29ed7e9d6972a369d41d756e6f26ddb5b7aa01241331d"}, +] +mo-kwargs = [ + {file = "mo-kwargs-4.22.21108.tar.gz", hash = "sha256:9c9d00ab86f1f75013193807d90cad17cbc515384cf3b17b8aff3104a300f7ce"}, +] +mo-logs = [ + {file = "mo-logs-4.23.21108.tar.gz", hash = "sha256:de4136a7ce215ecbfd7a368588be0a3f1fd8a6521dc2d4aae57cc1c3ba299aab"}, +] +moz-sql-parser = [ + {file = "moz-sql-parser-4.40.21126.tar.gz", hash = "sha256:b3d37cc8ff118d86009aa12646791549537ec0ae8ac312efd4641289c8eee080"}, +] nodeenv = [ {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, diff --git a/pyproject.toml b/pyproject.toml index d59f726..2fff1b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ pre-commit = "^2.12.1" detect-secrets = "^1.1.0" direnv = "^2020.12.3" flake8 = "^3.9.2" +moz-sql-parser = "^4.40.21126" [tool.poetry.dev-dependencies] From 54d9c4720f160e2a6951a585aaaf00a6b5f2d5be Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:24:18 +0100 Subject: [PATCH 04/14] feat: Add recursive method to extract from nested dictionary This is so we can get the equivalent value of a passed in key for a nested dictionary. 
--- extractor.py | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/extractor.py b/extractor.py index aedc420..204ec74 100644 --- a/extractor.py +++ b/extractor.py @@ -69,3 +69,44 @@ def parse_query(query: str, print_tree: bool = False) -> dict: if print_tree: pprint(object=query_json) return query_json + + @staticmethod + def extract_from_json(obj: dict, key: str) -> list: + """ + Recursively fetch values from a nested JSON. + + For our purposes, extract where key is 'from' allows extraction of *most* table names after a `FROM` clause. + - It does not extract the table names when the name is nested in a subquery. + - Nor does it extract table names in `JOIN` clauses. + To achieve above two, need to extract where the key is 'value' and compare with actual table names. + This is because the values returned when key is 'value' are table names, column names etc. + Reference + - https://hackersandslackers.com/extract-data-from-complex-json-python/ + :param obj: Dictionary to extract values from. + :param key: String of the value you want to extract. + :return: List of values for the key. + """ + arr = [] + + def extract(obj: Union[dict, list], arr: list, key: str) -> list: + """ + Recursively search for values of key in a JSON tree. + + :param obj: Dictionary to extract values from. + :param arr: List to store extracted values to. + :param key: String of the dictionary key to extract associated value from. + :return: List of the extracted values. 
+ """ + if isinstance(obj, dict): + for k, v in obj.items(): + if isinstance(v, (dict, list)): + extract(obj=v, arr=arr, key=key) + elif k == key: + arr.append(v) + elif isinstance(obj, list): + for item in obj: + extract(obj=item, arr=arr, key=key) + return arr + + values = extract(obj=obj, arr=arr, key=key) + return values From fa1cd69b50300b5f8ba0b0fe3504701b750100ff Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:27:54 +0100 Subject: [PATCH 05/14] docs: Instruct how to set-up project environment This is so people can replicate the work here. --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index dd4e328..1020862 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,32 @@ [![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlqueryraph/actions) [![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) [![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) Parse your SQL queries and represent their structure as a graph. + +## Requirements +To run the code in here, ensure your system meets the following requirements: +- Unix-like operating system (macOS, Linux, ...); +- [`direnv`](https://direnv.net/) installed, including shell hooks; +- [`.envrc`](https://github.com/avisionh/sqlquerygraph/blob/main/.envrc) allowed/trusted by `direnv` to + use the environment variables - see [below](#allowingtrusting-envrc); +- Python 3.8 or above; and +- [Poetry](https://python-poetry.org/docs/) installed. + +Parse your SQL queries and represent their structure as a graph. 
+ +Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here. + +### Set-up +For quickstart set-up of the project, run the below in your shell: +```shell script +# 1. read project-specific environment variables +direnv allow + +# 2. activate virtual environment and install package dependencies +poetry shell +poetry install + +# 3. check adherence to good standards on every commit +pre-commit install +``` + +*** From 386ec6a9b51015f523199a7287db54246aa691c5 Mon Sep 17 00:00:00 2001 From: avisionh Date: Sun, 16 May 2021 11:51:36 +0100 Subject: [PATCH 06/14] feat: Write method to extract tables and dependencies from scripts This is so we can get it in a dictionary for visualising. --- README.md | 2 ++ extractor.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ poetry.lock | 19 ++++++++++++++- pyproject.toml | 1 + 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1020862..96d5ed9 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ Parse your SQL queries and represent their structure as a graph. +Currently, we implement the ability to represent how each of the tables in a set of SQL query scripts depend on each other. 
+ ## Requirements To run the code in here, ensure your system meets the following requirements: - Unix-like operating system (macOS, Linux, ...); diff --git a/extractor.py b/extractor.py index 204ec74..8b3ce5e 100644 --- a/extractor.py +++ b/extractor.py @@ -2,6 +2,7 @@ import os import re +from tqdm import tqdm from moz_sql_parser import parse from pprint import pprint @@ -110,3 +111,66 @@ def extract(obj: Union[dict, list], arr: list, key: str) -> list: values = extract(obj=obj, arr=arr, key=key) return values + + def extract_table_dependencies_from_queries( + self, + reference_datasets: list, + str_to_remove: Union[str, list] = None, + verbose: bool = False, + ) -> dict: + """ + Extracts the table names and their dependencies from a set of .sql files. + + :param reference_datasets: List of datasets/schema that the tables belong to. + :param str_to_remove: String or list of strings to remove from the query. + :param verbose: Boolean to output steps taken and query after cleaning. Useful for debugging. + :return: Dictionary of tables as keys and their dependent tables as values. + """ + queries, jsons, dicts = {}, {}, {} + reference_datasets = tuple([f"{txt}." 
for txt in reference_datasets]) + for file_name in tqdm(os.listdir(path=self.script_dir)): + + if verbose: + print(f"Reading query {file_name}...\n") + query = self.read_query(file=file_name) + queries[file_name] = query + + if str_to_remove is not None: + if verbose: + print( + f"Cleaning query {file_name} by removing {str_to_remove}...\n" + ) + queries[file_name] = self.clean_query( + query=queries[file_name], str_to_remove=str_to_remove + ) + + if verbose: + print(f"Cleaned query is {queries[file_name]}") + print(f"Parsing query {file_name}...\n") + jsons[file_name] = self.parse_query( + query=queries[file_name], print_tree=verbose + ) + + if verbose: + print(f"Extracting table names from {file_name}...\n") + # - from: tables after 'from' clause + # + though sometimes keys are not 'from' so need to + # + look at values associated to the 'value' key + # - value: tables after '... join' clauses + # + can also include tables after 'from' clause if they + # + are in a subquery + table_from = self.extract_from_json(obj=jsons[file_name], key="from") + + # keep only table elements and not table aliases - as defined by period + table_from = [txt for txt in table_from if "." 
in txt] + table_value = self.extract_from_json(obj=jsons[file_name], key="value") + # extract table values when it starts with `.` + table_join = [ + txt for txt in table_value if str(txt).startswith(reference_datasets) + ] + tables = list(set(table_from + table_join)) + + # store in dictionary + dicts[f"{self.schema}.{file_name}"] = tables + + return dicts diff --git a/poetry.lock b/poetry.lock index 2126a4c..cce25f4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -276,6 +276,19 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "tqdm" +version = "4.60.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +telegram = ["requests"] + [[package]] name = "urllib3" version = "1.26.4" @@ -318,7 +331,7 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "6dbd677bb57cbbb6b000fc84f6c0fc06daa173f8e22a50b485746b6c566e1ef6" +content-hash = "339e8cc655e66a1e2c0f5b773e6608ae3323f3af7a95ea3c6f94d8dacd086f20" [metadata.files] appdirs = [ @@ -448,6 +461,10 @@ toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +tqdm = [ + {file = "tqdm-4.60.0-py2.py3-none-any.whl", hash = "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3"}, + {file = "tqdm-4.60.0.tar.gz", hash = "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae"}, +] urllib3 = [ {file = "urllib3-1.26.4-py2.py3-none-any.whl", hash = "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df"}, {file = "urllib3-1.26.4.tar.gz", 
hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"}, diff --git a/pyproject.toml b/pyproject.toml index 2fff1b8..f7a8f0d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ detect-secrets = "^1.1.0" direnv = "^2020.12.3" flake8 = "^3.9.2" moz-sql-parser = "^4.40.21126" +tqdm = "^4.60.0" [tool.poetry.dev-dependencies] From 04fd17ca3350c585eecd3370f5eb07abd430b5b0 Mon Sep 17 00:00:00 2001 From: avisionh Date: Tue, 18 May 2021 23:01:18 +0100 Subject: [PATCH 07/14] data: Add scripts to create tables in BQ This is so we can have a layer of tables derived off another layer. --- data/analytics/author.sql | 15 +++++++++++++++ data/analytics/commit.sql | 20 ++++++++++++++++++++ data/analytics/committer.sql | 15 +++++++++++++++ data/analytics/repo.sql | 19 +++++++++++++++++++ data/analytics/user.sql | 18 ++++++++++++++++++ tests/conftest.py | 0 tests/fixtures/fixture_extractor.py | 0 tests/unit/test_extractor.py | 0 8 files changed, 87 insertions(+) create mode 100644 data/analytics/author.sql create mode 100644 data/analytics/commit.sql create mode 100644 data/analytics/committer.sql create mode 100644 data/analytics/repo.sql create mode 100644 data/analytics/user.sql create mode 100644 tests/conftest.py create mode 100644 tests/fixtures/fixture_extractor.py create mode 100644 tests/unit/test_extractor.py diff --git a/data/analytics/author.sql b/data/analytics/author.sql new file mode 100644 index 0000000..de4024a --- /dev/null +++ b/data/analytics/author.sql @@ -0,0 +1,15 @@ +MERGE analytics.author +USING ( + SELECT + author.name AS name + ,author.email AS email + ,author.time_sec AS time_sec + ,author.tz_offset AS tz_offset + ,author.date.seconds AS date_seconds + ,author.date.nanos AS date_nanos + FROM `bigquery-public-data.github_repos.commits`) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/commit.sql b/data/analytics/commit.sql new file mode 100644 index 
0000000..f5015ae --- /dev/null +++ b/data/analytics/commit.sql @@ -0,0 +1,20 @@ +MERGE analytics.commit +USING ( + SELECT + commit + ,tree + ,parent + ,author.name AS author_name + ,author.time_sec AS author_timesec + ,committer.name AS committer_name + ,committer.time_sec AS committer_time_sec + ,subject + ,message + ,repo_name + ,difference_truncated + FROM `bigquery-public-data.github_repos.commits`) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/committer.sql b/data/analytics/committer.sql new file mode 100644 index 0000000..635c197 --- /dev/null +++ b/data/analytics/committer.sql @@ -0,0 +1,15 @@ +MERGE analytics.committer +USING ( + SELECT + committer.name AS name + ,committer.email AS email + ,committer.time_sec AS time_sec + ,committer.tz_offset AS tz_offset + ,committer.date.seconds AS date_seconds + ,committer.date.nanos AS date_nanos + FROM `bigquery-public-data.github_repos.commits`) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/repo.sql b/data/analytics/repo.sql new file mode 100644 index 0000000..2dab4e2 --- /dev/null +++ b/data/analytics/repo.sql @@ -0,0 +1,19 @@ +MERGE analytics.repo +USING ( + SELECT + a.repo_name + ,a.author.name + ,a.author.time_sec + ,b.language.name AS language + ,b.language.bytes AS repo_size + ,c.license + FROM `bigquery-public-data.github_repos.commits` AS a + LEFT JOIN `bigquery-public-data.github_repos.languages` AS b + ON a.repo_name = b.repo_name + LEFT JOIN `bigquery-public-data.github_repos.licenses` AS c + ON a.repo_name = c.repo_name) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/user.sql b/data/analytics/user.sql new file mode 100644 index 0000000..fee3f32 --- /dev/null +++ b/data/analytics/user.sql @@ -0,0 +1,18 @@ +MERGE analytics.user +USING ( + SELECT DISTINCT + name + ,email + ,'author' AS user_type + 
FROM analytics.author + UNION + SELECT DISTINCT + name + ,email + ,'committer' AS user_type + FROM analytics.committer) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py new file mode 100644 index 0000000..e69de29 From 079d553eb3488f260748d42cff58e9da989bc9ea Mon Sep 17 00:00:00 2001 From: avisionh Date: Wed, 19 May 2021 08:36:19 +0100 Subject: [PATCH 08/14] data: Create user activity table This is so we have a complicated query which uses ctes and subqueries, which will be good for testing. --- data/analytics/author.sql | 3 +- data/analytics/commit.sql | 3 +- data/analytics/committer.sql | 3 +- data/analytics/repo.sql | 7 ++-- data/analytics/user.sql | 3 +- data/reporting/user_activity.sql | 63 ++++++++++++++++++++++++++++++++ 6 files changed, 75 insertions(+), 7 deletions(-) create mode 100644 data/reporting/user_activity.sql diff --git a/data/analytics/author.sql b/data/analytics/author.sql index de4024a..9d8b7af 100644 --- a/data/analytics/author.sql +++ b/data/analytics/author.sql @@ -7,7 +7,8 @@ USING ( ,author.tz_offset AS tz_offset ,author.date.seconds AS date_seconds ,author.date.nanos AS date_nanos - FROM `bigquery-public-data.github_repos.commits`) + FROM `bigquery-public-data.github_repos.commits` +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git a/data/analytics/commit.sql b/data/analytics/commit.sql index f5015ae..eda3a63 100644 --- a/data/analytics/commit.sql +++ b/data/analytics/commit.sql @@ -12,7 +12,8 @@ USING ( ,message ,repo_name ,difference_truncated - FROM `bigquery-public-data.github_repos.commits`) + FROM `bigquery-public-data.github_repos.commits` +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git 
a/data/analytics/committer.sql b/data/analytics/committer.sql index 635c197..6c76d31 100644 --- a/data/analytics/committer.sql +++ b/data/analytics/committer.sql @@ -7,7 +7,8 @@ USING ( ,committer.tz_offset AS tz_offset ,committer.date.seconds AS date_seconds ,committer.date.nanos AS date_nanos - FROM `bigquery-public-data.github_repos.commits`) + FROM `bigquery-public-data.github_repos.commits` +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git a/data/analytics/repo.sql b/data/analytics/repo.sql index 2dab4e2..7269807 100644 --- a/data/analytics/repo.sql +++ b/data/analytics/repo.sql @@ -2,8 +2,8 @@ MERGE analytics.repo USING ( SELECT a.repo_name - ,a.author.name - ,a.author.time_sec + ,a.author.name AS author_name + ,a.author.time_sec AS author_time_sec ,b.language.name AS language ,b.language.bytes AS repo_size ,c.license @@ -11,7 +11,8 @@ USING ( LEFT JOIN `bigquery-public-data.github_repos.languages` AS b ON a.repo_name = b.repo_name LEFT JOIN `bigquery-public-data.github_repos.licenses` AS c - ON a.repo_name = c.repo_name) + ON a.repo_name = c.repo_name +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git a/data/analytics/user.sql b/data/analytics/user.sql index fee3f32..accd863 100644 --- a/data/analytics/user.sql +++ b/data/analytics/user.sql @@ -10,7 +10,8 @@ USING ( name ,email ,'committer' AS user_type - FROM analytics.committer) + FROM analytics.committer +) ON FALSE WHEN NOT MATCHED THEN INSERT ROW diff --git a/data/reporting/user_activity.sql b/data/reporting/user_activity.sql new file mode 100644 index 0000000..19748c2 --- /dev/null +++ b/data/reporting/user_activity.sql @@ -0,0 +1,63 @@ +MERGE reporting.user_activity +USING ( + WITH cte_base AS + ( + SELECT + b.name + ,b.email + ,'commit' AS activity_type + ,COUNT(a.*) AS activity_count + FROM analytics.commit AS a + LEFT JOIN analytics.user AS b + ON a.committer_name = b.name + AND b.user_type = 'committer' + GROUP BY + b.name + ,b.email + ,a.repo_name + + UNION + + SELECT + a.author_name 
AS name + ,b.email + ,'repo' AS activity_type + ,COUNT(a.*) AS activity_count + FROM analytics.repo AS a + LEFT JOIN analytics.user AS b + ON a.author_name = b.name + GROUP BY + a.author_name + ,b.email + ) + + SELECT + name + ,email + ,activity_type + ,activity_count + FROM cte_base + UNION + SELECT + name + ,email + ,activity_type + ,activity_count + FROM + ( + SELECT + name + ,email + ,'total' AS activity_type + ,SUM(activity_count) AS activity_count + FROM cte_base + GROUP BY + name + ,email + ) +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; From 5ebe7511b77aedad33b952213b452a3791a8f3cd Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 08:49:21 +0100 Subject: [PATCH 09/14] tests: Add for main class method This is to ensure it works. --- .gitignore | 3 + extractor.py | 14 +- poetry.lock | 135 +++++++++++++++++- pyproject.toml | 1 + tests/{unit/test_extractor.py => __init__.py} | 0 tests/conftest.py | 12 ++ tests/fixtures/fixture_extractor.py | 12 ++ tests/integration/test_extractor.py | 26 ++++ 8 files changed, 195 insertions(+), 8 deletions(-) rename tests/{unit/test_extractor.py => __init__.py} (100%) create mode 100644 tests/integration/test_extractor.py diff --git a/.gitignore b/.gitignore index 0cb7c0a..abcbf77 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ # environment .env + +# django +__pycache__/ diff --git a/extractor.py b/extractor.py index 8b3ce5e..44640bb 100644 --- a/extractor.py +++ b/extractor.py @@ -20,18 +20,19 @@ def __init__(self, script_dir: str, schema: str): self.script_dir = script_dir self.schema = schema - def read_query(self, file: str) -> str: + def read_query(self, file: str) -> (str, str): """ Reads a SQL file in. + Note: Relies on SQL script being named the same as table or View it is creating. :param file: String of the file to read query from. - :return: String of the SQL query from the file. 
+ :return: Tuple of strings of the table name and SQL query from the file. """ - _, file_extension = os.path.splitext(p=file) + file_name, file_extension = os.path.splitext(p=file) if file_extension == ".sql": with open(file=os.path.join(self.script_dir, file), mode="r") as f: query = f.read() - return query + return file_name, query else: raise Exception( f"Passed in a {file_extension} file. \n" @@ -129,10 +130,9 @@ def extract_table_dependencies_from_queries( queries, jsons, dicts = {}, {}, {} reference_datasets = tuple([f"{txt}." for txt in reference_datasets]) for file_name in tqdm(os.listdir(path=self.script_dir)): - if verbose: print(f"Reading query {file_name}...\n") - query = self.read_query(file=file_name) + file_name, query = self.read_query(file=file_name) queries[file_name] = query if str_to_remove is not None: @@ -168,7 +168,7 @@ def extract_table_dependencies_from_queries( table_join = [ txt for txt in table_value if str(txt).startswith(reference_datasets) ] - tables = list(set(table_from + table_join)) + tables = sorted(list(set(table_from + table_join))) # store in dictionary dicts[f"{self.schema}.{file_name}"] = tables diff --git a/poetry.lock b/poetry.lock index cce25f4..23fe942 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6,6 +6,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "atomicwrites" +version = "1.4.0" +description = "Atomic file writes." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.2.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] + [[package]] name = "certifi" version = "2020.12.5" @@ -30,6 +52,14 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "colorama" +version = "0.4.4" +description = "Cross-platform colored terminal text." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "detect-secrets" version = "1.1.0" @@ -124,6 +154,14 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "mo-dots" version = "4.22.21108" @@ -202,6 +240,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "packaging" +version = "20.9" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +pyparsing = ">=2.0.2" + +[[package]] +name = "pluggy" +version = "0.13.1" +description = "plugin and hook calling mechanisms for python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +dev = ["pre-commit", "tox"] + [[package]] name = "pre-commit" version = "2.12.1" @@ -234,6 +294,43 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "py" +version = "1.10.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyparsing" +version = "2.4.7" +description = "Python parsing module" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "pytest" +version = "6.2.4" +description = "pytest: simple powerful testing with Python" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = 
"sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<1.0.0a1" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + [[package]] name = "pyyaml" version = "5.4.1" @@ -331,13 +428,21 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "339e8cc655e66a1e2c0f5b773e6608ae3323f3af7a95ea3c6f94d8dacd086f20" +content-hash = "daf7c9934fd96f7224a177fe729ebf64c36c6dc462f13c38592ce7058f224a2d" [metadata.files] appdirs = [ {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, ] +atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, + {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, +] certifi = [ {file = "certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"}, {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"}, @@ -350,6 +455,10 @@ chardet = [ {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, ] +colorama = [ + {file = 
"colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] detect-secrets = [ {file = "detect_secrets-1.1.0-py2.py3-none-any.whl", hash = "sha256:be8cca3dc65f6fd637f5dec9f583f1cf4a680dc1a580b3d2e65a5ac7a277456a"}, {file = "detect_secrets-1.1.0.tar.gz", hash = "sha256:68250b31bc108f665f05f0ecfb34f92423280e48e65adbb887fdf721ed909627"}, @@ -384,6 +493,10 @@ mccabe = [ {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] mo-dots = [ {file = "mo-dots-4.22.21108.tar.gz", hash = "sha256:f9f2f9a6e44e214959aecd55857b6df487b8e86daf51334e2e982d47ea33ed23"}, ] @@ -406,6 +519,14 @@ nodeenv = [ {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, ] +packaging = [ + {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, + {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, +] +pluggy = [ + {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, + {file = "pluggy-0.13.1.tar.gz", hash = 
"sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, +] pre-commit = [ {file = "pre_commit-2.12.1-py2.py3-none-any.whl", hash = "sha256:70c5ec1f30406250b706eda35e868b87e3e4ba099af8787e3e8b4b01e84f4712"}, {file = "pre_commit-2.12.1.tar.gz", hash = "sha256:900d3c7e1bf4cf0374bb2893c24c23304952181405b4d88c9c40b72bda1bb8a9"}, @@ -418,6 +539,18 @@ pyflakes = [ {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] +py = [ + {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, + {file = "py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, +] +pyparsing = [ + {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, + {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, +] +pytest = [ + {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, + {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, +] pyyaml = [ {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"}, {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"}, diff --git a/pyproject.toml b/pyproject.toml index f7a8f0d..6ec76d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ direnv = "^2020.12.3" flake8 = "^3.9.2" moz-sql-parser = "^4.40.21126" tqdm = "^4.60.0" +pytest = "^6.2.4" [tool.poetry.dev-dependencies] diff --git 
a/tests/unit/test_extractor.py b/tests/__init__.py similarity index 100% rename from tests/unit/test_extractor.py rename to tests/__init__.py diff --git a/tests/conftest.py b/tests/conftest.py index e69de29..10f2b4a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest + + +pytest_plugins = [ + "tests.fixtures.fixture_extractor", +] + + +@pytest.fixture() +def query_user_activity(): + with open(file="data/reporting/user_activity.sql", mode="r") as f: + return f.read() diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py index e69de29..8c6330e 100644 --- a/tests/fixtures/fixture_extractor.py +++ b/tests/fixtures/fixture_extractor.py @@ -0,0 +1,12 @@ +import pytest + + +@pytest.fixture() +def extracted_user_activity(): + return { + "reporting.user_activity": [ + "analytics.commit", + "analytics.repo", + "analytics.user", + ] + } diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py new file mode 100644 index 0000000..1399a56 --- /dev/null +++ b/tests/integration/test_extractor.py @@ -0,0 +1,26 @@ +import pytest +import os +from extractor import Extractor + + +# run multiple times to ensure value ordering is preserved +# if not preserved, then test will fail +@pytest.mark.parametrize("execution_number", range(3)) +def test_create_query_removal_text(execution_number, extracted_user_activity): + schema = "reporting" + dir_report = "data/reporting" + remove_txt = [] + for table in os.listdir(dir_report): + table_name, _ = os.path.splitext(p=table) + remove_txt.append(f"MERGE {schema}.{table_name} USING (") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=dir_report, schema="reporting") + output = extractor.extract_table_dependencies_from_queries( + reference_datasets=["reporting", "analytics", "github_repos"], + str_to_remove=remove_txt, + ) + assert output == 
extracted_user_activity From 0f6af375290a0440aec4ea1d64fd75bba22d14b8 Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 09:02:05 +0100 Subject: [PATCH 10/14] fix: Avoid repeating appending same text to list in test This is to improve efficiency. Also make documentation of reference_datasets argument clearer. --- extractor.py | 2 +- tests/integration/test_extractor.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/extractor.py b/extractor.py index 44640bb..cda05cf 100644 --- a/extractor.py +++ b/extractor.py @@ -122,7 +122,7 @@ def extract_table_dependencies_from_queries( """ Extracts the table names and their dependencies from a set of .sql files. - :param reference_datasets: List of datasets/schema that the tables belong to. + :param reference_datasets: List of datasets/schema of database. :param str_to_remove: String or list of strings to remove from the query. :param verbose: Boolean to output steps taken and query after cleaning. Useful for debugging. :return: Dictionary of tables as keys and their dependent tables as values. 
diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py index 1399a56..007e1a3 100644 --- a/tests/integration/test_extractor.py +++ b/tests/integration/test_extractor.py @@ -13,12 +13,12 @@ def test_create_query_removal_text(execution_number, extracted_user_activity): for table in os.listdir(dir_report): table_name, _ = os.path.splitext(p=table) remove_txt.append(f"MERGE {schema}.{table_name} USING (") - remove_txt.append( - ") ON FALSE WHEN NOT MATCHED THEN " - "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " - "DELETE" - ) - extractor = Extractor(script_dir=dir_report, schema="reporting") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=dir_report, schema=schema) output = extractor.extract_table_dependencies_from_queries( reference_datasets=["reporting", "analytics", "github_repos"], str_to_remove=remove_txt, From b4e3b3883c01dd6d3974ad3f218b5c411bfef1ac Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 21:54:32 +0100 Subject: [PATCH 11/14] feat: Test analytics dataset scripts This is so we can ensure class method works on a directory of SQL scripts. 
--- data/analytics/author.sql | 2 +- data/analytics/commit.sql | 2 +- data/analytics/committer.sql | 2 +- data/analytics/repo.sql | 6 ++-- tests/fixtures/fixture_extractor.py | 44 ++++++++++++++++++++++++++++- tests/integration/test_extractor.py | 41 +++++++++++++++------------ 6 files changed, 72 insertions(+), 25 deletions(-) diff --git a/data/analytics/author.sql b/data/analytics/author.sql index 9d8b7af..052593d 100644 --- a/data/analytics/author.sql +++ b/data/analytics/author.sql @@ -7,7 +7,7 @@ USING ( ,author.tz_offset AS tz_offset ,author.date.seconds AS date_seconds ,author.date.nanos AS date_nanos - FROM `bigquery-public-data.github_repos.commits` + FROM github_repos.commits ) ON FALSE WHEN NOT MATCHED THEN diff --git a/data/analytics/commit.sql b/data/analytics/commit.sql index eda3a63..0e15249 100644 --- a/data/analytics/commit.sql +++ b/data/analytics/commit.sql @@ -12,7 +12,7 @@ USING ( ,message ,repo_name ,difference_truncated - FROM `bigquery-public-data.github_repos.commits` + FROM github_repos.commits ) ON FALSE WHEN NOT MATCHED THEN diff --git a/data/analytics/committer.sql b/data/analytics/committer.sql index 6c76d31..9c9c1e7 100644 --- a/data/analytics/committer.sql +++ b/data/analytics/committer.sql @@ -7,7 +7,7 @@ USING ( ,committer.tz_offset AS tz_offset ,committer.date.seconds AS date_seconds ,committer.date.nanos AS date_nanos - FROM `bigquery-public-data.github_repos.commits` + FROM github_repos.commits ) ON FALSE WHEN NOT MATCHED THEN diff --git a/data/analytics/repo.sql b/data/analytics/repo.sql index 7269807..0750069 100644 --- a/data/analytics/repo.sql +++ b/data/analytics/repo.sql @@ -7,10 +7,10 @@ USING ( ,b.language.name AS language ,b.language.bytes AS repo_size ,c.license - FROM `bigquery-public-data.github_repos.commits` AS a - LEFT JOIN `bigquery-public-data.github_repos.languages` AS b + FROM github_repos.commits AS a + LEFT JOIN github_repos.languages AS b ON a.repo_name = b.repo_name - LEFT JOIN 
`bigquery-public-data.github_repos.licenses` AS c + LEFT JOIN github_repos.licenses AS c ON a.repo_name = c.repo_name ) ON FALSE diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py index 8c6330e..665b84e 100644 --- a/tests/fixtures/fixture_extractor.py +++ b/tests/fixtures/fixture_extractor.py @@ -2,7 +2,34 @@ @pytest.fixture() -def extracted_user_activity(): +def cleaned_user_activity(): + return ( + "WITH cte_base AS ( " + "SELECT b.name ,b.email ,'commit' AS activity_type ,COUNT(a.*) AS activity_count " + "FROM analytics.commit AS a " + "LEFT JOIN analytics.user AS b " + "ON a.committer_name = b.name " + "AND b.user_type = 'committer' " + "GROUP BY b.name ,b.email ,a.repo_name " + "UNION " + "SELECT a.author_name AS name ,b.email ,'repo' AS activity_type ,COUNT(a.*) AS activity_count " + "FROM analytics.repo AS a " + "LEFT JOIN analytics.user AS b " + "ON a.author_name = b.name " + "GROUP BY a.author_name ,b.email ) " + "SELECT name ,email ,activity_type ,activity_count " + "FROM cte_base " + "UNION " + "SELECT name ,email ,activity_type ,activity_count " + "FROM ( " + "SELECT name ,email ,'total' AS activity_type ,SUM(activity_count) AS activity_count " + "FROM cte_base " + "GROUP BY name ,email ) ;" + ) + + +@pytest.fixture() +def extracted_reporting(): return { "reporting.user_activity": [ "analytics.commit", @@ -10,3 +37,18 @@ def extracted_user_activity(): "analytics.user", ] } + + +@pytest.fixture() +def extracted_analytics(): + return { + "analytics.repo": [ + "github_repos.commits", + "github_repos.languages", + "github_repos.licenses", + ], + "analytics.author": ["github_repos.commits"], + "analytics.committer": ["github_repos.commits"], + "analytics.commit": ["github_repos.commits"], + "analytics.user": ["analytics.author", "analytics.committer"], + } diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py index 007e1a3..d53357a 100644 --- a/tests/integration/test_extractor.py +++ 
b/tests/integration/test_extractor.py @@ -6,21 +6,26 @@ # run multiple times to ensure value ordering is preserved # if not preserved, then test will fail @pytest.mark.parametrize("execution_number", range(3)) -def test_create_query_removal_text(execution_number, extracted_user_activity): - schema = "reporting" - dir_report = "data/reporting" - remove_txt = [] - for table in os.listdir(dir_report): - table_name, _ = os.path.splitext(p=table) - remove_txt.append(f"MERGE {schema}.{table_name} USING (") - remove_txt.append( - ") ON FALSE WHEN NOT MATCHED THEN " - "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " - "DELETE" - ) - extractor = Extractor(script_dir=dir_report, schema=schema) - output = extractor.extract_table_dependencies_from_queries( - reference_datasets=["reporting", "analytics", "github_repos"], - str_to_remove=remove_txt, - ) - assert output == extracted_user_activity +def test_extract_table_dependencies_from_queries( + execution_number, extracted_reporting, extracted_analytics +): + schemes = ["analytics", "reporting"] + extract = [extracted_analytics, extracted_reporting] + + for i, schema in enumerate(schemes): + dir_report = f"data/{schema}" + remove_txt = [] + for table in os.listdir(dir_report): + table_name, _ = os.path.splitext(p=table) + remove_txt.append(f"MERGE {schema}.{table_name} USING (") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=dir_report, schema=schema) + output = extractor.extract_table_dependencies_from_queries( + reference_datasets=["reporting", "analytics", "github_repos"], + str_to_remove=remove_txt, + ) + assert output == extract[i] From fb3e7992ada7fb7eac80030255aef264fe3c051d Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 22:06:14 +0100 Subject: [PATCH 12/14] test: Add unit-test for clean_query method This is so we can check this works also. 
--- extractor.py | 2 +- tests/fixtures/fixture_extractor.py | 4 ++-- tests/unit/test_extractor.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tests/unit/test_extractor.py diff --git a/extractor.py b/extractor.py index cda05cf..9a4b8bf 100644 --- a/extractor.py +++ b/extractor.py @@ -40,7 +40,7 @@ def read_query(self, file: str) -> (str, str): ) @staticmethod - def clean_query(query: str, str_to_remove: Union[str, list]) -> str: + def clean_query(query: str, str_to_remove: Union[str, list] = None) -> str: """ Cleans a query so it can be parsed. diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py index 665b84e..56b49c0 100644 --- a/tests/fixtures/fixture_extractor.py +++ b/tests/fixtures/fixture_extractor.py @@ -4,7 +4,7 @@ @pytest.fixture() def cleaned_user_activity(): return ( - "WITH cte_base AS ( " + " WITH cte_base AS ( " "SELECT b.name ,b.email ,'commit' AS activity_type ,COUNT(a.*) AS activity_count " "FROM analytics.commit AS a " "LEFT JOIN analytics.user AS b " @@ -24,7 +24,7 @@ def cleaned_user_activity(): "FROM ( " "SELECT name ,email ,'total' AS activity_type ,SUM(activity_count) AS activity_count " "FROM cte_base " - "GROUP BY name ,email ) ;" + "GROUP BY name ,email ) ; " ) diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py new file mode 100644 index 0000000..8674c60 --- /dev/null +++ b/tests/unit/test_extractor.py @@ -0,0 +1,17 @@ +from extractor import Extractor + + +def test_clean_query(query_user_activity, cleaned_user_activity): + schema = "reporting" + dir_report = f"data/{schema}" + extractor = Extractor(script_dir=dir_report, schema=schema) + txt_remove = [ + f"MERGE {schema}.user_activity USING (", + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE", + ] + cleaned_query = extractor.clean_query( + query=query_user_activity, str_to_remove=txt_remove + ) + assert cleaned_query == 
cleaned_user_activity From 7a236ed71dba8b7d708cdf36c155919862c184cc Mon Sep 17 00:00:00 2001 From: avisionh Date: Thu, 20 May 2021 23:38:29 +0100 Subject: [PATCH 13/14] test: Add pytest in CI pipeline This is so it can be run automatically. --- .github/workflows/pytesting.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pytesting.yml b/.github/workflows/pytesting.yml index 81bc57a..5090408 100644 --- a/.github/workflows/pytesting.yml +++ b/.github/workflows/pytesting.yml @@ -32,3 +32,7 @@ jobs: poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + poetry run pytest tests/unit + poetry run pytest tests/integration From 0d4881da4ff25e6ab222c70c38e8c81b9860cae3 Mon Sep 17 00:00:00 2001 From: avisionh Date: Fri, 21 May 2021 00:20:49 +0100 Subject: [PATCH 14/14] docs: Add acknowledgements to moz-sql-parser This is to show what it builds on. Include Python badge also and move ISSUES_TEMPLATE/ to .github/.
--- .../ISSUE_TEMPLATE}/bug_report.md | 0 .../ISSUE_TEMPLATE}/feature_request.md | 0 README.md | 13 +++++++++---- 3 files changed, 9 insertions(+), 4 deletions(-) rename {ISSUE_TEMPLATE => .github/ISSUE_TEMPLATE}/bug_report.md (100%) rename {ISSUE_TEMPLATE => .github/ISSUE_TEMPLATE}/feature_request.md (100%) diff --git a/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md similarity index 100% rename from ISSUE_TEMPLATE/bug_report.md rename to .github/ISSUE_TEMPLATE/bug_report.md diff --git a/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md similarity index 100% rename from ISSUE_TEMPLATE/feature_request.md rename to .github/ISSUE_TEMPLATE/feature_request.md diff --git a/README.md b/README.md index 96d5ed9..4a4ab24 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ # sqlquerygraph -[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlqueryraph/actions) [![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) [![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlquerygraph/actions) +[![](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/downloads/) +[![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) +[![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 
Parse your SQL queries and represent their structure as a graph. -Currently, we implement the ability of represent how each of the tables in a set of SQL query scripts depend on each other. +Currently, we implement the ability to represent how each of the tables in a set of SQL query scripts depends on the others. ## Requirements To run the code in here, ensure your system meets the following requirements: @@ -14,8 +18,6 @@ To run the code in here, ensure your system meets the following requirements: - Python 3.8 or above; and - [Poetry](https://python-poetry.org/docs/) installed. -Parse your SQL queries and represent their structure as a graph. - Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here. ### Set-up @@ -33,3 +35,6 @@ pre-commit install ``` *** + +## Acknowledgements +This builds on the excellent [moz-sql-parser](https://github.com/mozilla/moz-sql-parser) package.