diff --git a/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md similarity index 100% rename from ISSUE_TEMPLATE/bug_report.md rename to .github/ISSUE_TEMPLATE/bug_report.md diff --git a/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md similarity index 100% rename from ISSUE_TEMPLATE/feature_request.md rename to .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/workflows/pytesting.yml b/.github/workflows/pytesting.yml index 81bc57a..5090408 100644 --- a/.github/workflows/pytesting.yml +++ b/.github/workflows/pytesting.yml @@ -32,3 +32,7 @@ jobs: poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + poetry run pytest tests/unit + poetry run pytest tests/integration diff --git a/.gitignore b/.gitignore index 0cb7c0a..abcbf77 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ # environment .env + +# python bytecode cache +__pycache__/ diff --git a/README.md b/README.md index dd4e328..4a4ab24 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,40 @@ # sqlquerygraph -[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlqueryraph/actions) [![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) [![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlquerygraph/actions) 
+[![](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/downloads/) +[![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) +[![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) Parse your SQL queries and represent their structure as a graph. + +Currently, we implement the ability to represent how each of the tables in a set of SQL query scripts depends on the others. + +## Requirements +To run the code in here, ensure your system meets the following requirements: +- Unix-like operating system (macOS, Linux, ...); +- [`direnv`](https://direnv.net/) installed, including shell hooks; +- [`.envrc`](https://github.com/avisionh/sqlquerygraph/blob/main/.envrc) allowed/trusted by `direnv` to + use the environment variables - see [below](#allowingtrusting-envrc); +- Python 3.8 or above; and +- [Poetry](https://python-poetry.org/docs/) installed. + +Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here. + +### Set-up +For quickstart set-up of the project, run the below in your shell: +```shell script +# 1. read project-specific environment variables +direnv allow + +# 2. activate virtual environment and install package dependencies +poetry shell +poetry install + +# 3. check adherence to good standards on every commit +pre-commit install +``` + +*** + +## Acknowledgements +This builds on the excellent [moz-sql-parser](https://github.com/mozilla/moz-sql-parser) package. 
diff --git a/data/analytics/author.sql b/data/analytics/author.sql new file mode 100644 index 0000000..052593d --- /dev/null +++ b/data/analytics/author.sql @@ -0,0 +1,16 @@ +MERGE analytics.author +USING ( + SELECT + author.name AS name + ,author.email AS email + ,author.time_sec AS time_sec + ,author.tz_offset AS tz_offset + ,author.date.seconds AS date_seconds + ,author.date.nanos AS date_nanos + FROM github_repos.commits +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/commit.sql b/data/analytics/commit.sql new file mode 100644 index 0000000..0e15249 --- /dev/null +++ b/data/analytics/commit.sql @@ -0,0 +1,21 @@ +MERGE analytics.commit +USING ( + SELECT + commit + ,tree + ,parent + ,author.name AS author_name + ,author.time_sec AS author_timesec + ,committer.name AS committer_name + ,committer.time_sec AS committer_time_sec + ,subject + ,message + ,repo_name + ,difference_truncated + FROM github_repos.commits +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/committer.sql b/data/analytics/committer.sql new file mode 100644 index 0000000..9c9c1e7 --- /dev/null +++ b/data/analytics/committer.sql @@ -0,0 +1,16 @@ +MERGE analytics.committer +USING ( + SELECT + committer.name AS name + ,committer.email AS email + ,committer.time_sec AS time_sec + ,committer.tz_offset AS tz_offset + ,committer.date.seconds AS date_seconds + ,committer.date.nanos AS date_nanos + FROM github_repos.commits +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/repo.sql b/data/analytics/repo.sql new file mode 100644 index 0000000..0750069 --- /dev/null +++ b/data/analytics/repo.sql @@ -0,0 +1,20 @@ +MERGE analytics.repo +USING ( + SELECT + a.repo_name + ,a.author.name AS author_name + ,a.author.time_sec AS author_time_sec + ,b.language.name AS language + ,b.language.bytes AS repo_size 
+ ,c.license + FROM github_repos.commits AS a + LEFT JOIN github_repos.languages AS b + ON a.repo_name = b.repo_name + LEFT JOIN github_repos.licenses AS c + ON a.repo_name = c.repo_name +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/analytics/user.sql b/data/analytics/user.sql new file mode 100644 index 0000000..accd863 --- /dev/null +++ b/data/analytics/user.sql @@ -0,0 +1,19 @@ +MERGE analytics.user +USING ( + SELECT DISTINCT + name + ,email + ,'author' AS user_type + FROM analytics.author + UNION + SELECT DISTINCT + name + ,email + ,'committer' AS user_type + FROM analytics.committer +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/data/reporting/user_activity.sql b/data/reporting/user_activity.sql new file mode 100644 index 0000000..19748c2 --- /dev/null +++ b/data/reporting/user_activity.sql @@ -0,0 +1,63 @@ +MERGE reporting.user_activity +USING ( + WITH cte_base AS + ( + SELECT + b.name + ,b.email + ,'commit' AS activity_type + ,COUNT(a.*) AS activity_count + FROM analytics.commit AS a + LEFT JOIN analytics.user AS b + ON a.committer_name = b.name + AND b.user_type = 'committer' + GROUP BY + b.name + ,b.email + ,a.repo_name + + UNION + + SELECT + a.author_name AS name + ,b.email + ,'repo' AS activity_type + ,COUNT(a.*) AS activity_count + FROM analytics.repo AS a + LEFT JOIN analytics.user AS b + ON a.author_name = b.name + GROUP BY + a.author_name + ,b.email + ) + + SELECT + name + ,email + ,activity_type + ,activity_count + FROM cte_base + UNION + SELECT + name + ,email + ,activity_type + ,activity_count + FROM + ( + SELECT + name + ,email + ,'total' AS activity_type + ,SUM(activity_count) AS activity_count + FROM cte_base + GROUP BY + name + ,email + ) +) +ON FALSE +WHEN NOT MATCHED THEN + INSERT ROW +WHEN NOT MATCHED BY SOURCE THEN + DELETE; diff --git a/extractor.py b/extractor.py new file mode 100644 index 0000000..9a4b8bf --- 
/dev/null +++ b/extractor.py @@ -0,0 +1,176 @@ +from typing import Union + +import os +import re +from tqdm import tqdm + +from moz_sql_parser import parse +from pprint import pprint + + +class Extractor: + """ + Extract table names from SQL queries. + + :param script_dir: String of the directory where we store our SQL queries. + :param schema: String of the dataset/schema that the SQL queries creating the table belongs to. + """ + + def __init__(self, script_dir: str, schema: str): + self.script_dir = script_dir + self.schema = schema + + def read_query(self, file: str) -> (str, str): + """ + Reads a SQL file in. + Note: Relies on SQL script being named the same as the table or view it is creating. + + :param file: String of the file to read query from. + :return: Tuple of strings of the table name and SQL query from the file. + """ + file_name, file_extension = os.path.splitext(p=file) + if file_extension == ".sql": + with open(file=os.path.join(self.script_dir, file), mode="r") as f: + query = f.read() + return file_name, query + else: + raise Exception( + f"Passed in a {file_extension} file. \n" + f"Please pass in a .sql file instead." + ) + + @staticmethod + def clean_query(query: str, str_to_remove: Union[str, list] = None) -> str: + """ + Cleans a query so it can be parsed. + + :param query: String of the query to clean. + :param str_to_remove: String or list of strings to remove from the query. + :return: String of the cleaned query to parse. + """ + # remove new lines and multiple spaces + query = query.replace("\n", " ") + query = re.sub(pattern=r"\s+", repl=" ", string=query) + + if str_to_remove is not None: + for txt in str_to_remove: + query = query.replace(txt, "") + + return query + + @staticmethod + def parse_query(query: str, print_tree: bool = False) -> dict: + """ + Parse a query into a JSON parse-tree. + + :param query: String of the SQL query to parse as a JSON parse-tree. + :param print_tree: Boolean to print the JSON parse-tree. 
+ :return: Dictionary of the query as a JSON parse-tree. + """ + query_json = parse(sql=query) + if print_tree: + pprint(object=query_json) + return query_json + + @staticmethod + def extract_from_json(obj: dict, key: str) -> list: + """ + Recursively fetch values from a nested JSON. + + For our purposes, extracting where the key is 'from' allows extraction of *most* table names after a `FROM` clause. + - It does not extract the table names when the name is nested in a subquery. + - Nor does it extract table names in `JOIN` clauses. + To achieve above two, need to extract where the key is 'value' and compare with actual table names. + This is because the values returned when key is 'value' are table names, column names etc. + Reference + - https://hackersandslackers.com/extract-data-from-complex-json-python/ + :param obj: Dictionary to extract values from. + :param key: String of the value you want to extract. + :return: List of values for the key. + """ + arr = [] + + def extract(obj: Union[dict, list], arr: list, key: str) -> list: + """ + Recursively search for values of key in a JSON tree. + + :param obj: Dictionary to extract values from. + :param arr: List to store extracted values to. + :param key: String of the dictionary key to extract associated value from. + :return: List of the extracted values. + """ + if isinstance(obj, dict): + for k, v in obj.items(): + if isinstance(v, (dict, list)): + extract(obj=v, arr=arr, key=key) + elif k == key: + arr.append(v) + elif isinstance(obj, list): + for item in obj: + extract(obj=item, arr=arr, key=key) + return arr + + values = extract(obj=obj, arr=arr, key=key) + return values + + def extract_table_dependencies_from_queries( + self, + reference_datasets: list, + str_to_remove: Union[str, list] = None, + verbose: bool = False, + ) -> dict: + """ + Extracts the table names and their dependencies from a set of .sql files. + + :param reference_datasets: List of datasets/schema of database. 
+ :param str_to_remove: String or list of strings to remove from the query. + :param verbose: Boolean to output steps taken and query after cleaning. Useful for debugging. + :return: Dictionary of tables as keys and their dependent tables as values. + """ + queries, jsons, dicts = {}, {}, {} + reference_datasets = tuple([f"{txt}." for txt in reference_datasets]) + for file_name in tqdm(os.listdir(path=self.script_dir)): + if verbose: + print(f"Reading query {file_name}...\n") + file_name, query = self.read_query(file=file_name) + queries[file_name] = query + + if str_to_remove is not None: + if verbose: + print( + f"Cleaning query {file_name} by removing {str_to_remove}...\n" + ) + queries[file_name] = self.clean_query( + query=queries[file_name], str_to_remove=str_to_remove + ) + + if verbose: + print(f"Cleaned query is {queries[file_name]}") + print(f"Parsing query {file_name}...\n") + jsons[file_name] = self.parse_query( + query=queries[file_name], print_tree=verbose + ) + + if verbose: + print(f"Extracting table names from {file_name}...\n") + # - from: tables after 'from' clause + # + though sometimes keys are not 'from' so need to + # + look at values associated to the 'value' key + # - value: tables after '... join' clauses + # + can also include tables after 'from' clause if they + # + are in a subquery + table_from = self.extract_from_json(obj=jsons[file_name], key="from") + + # keep only table elements and not table aliases - as defined by period + table_from = [txt for txt in table_from if "." 
in txt] + table_value = self.extract_from_json(obj=jsons[file_name], key="value") + # extract table values when it starts with `.` + table_join = [ + txt for txt in table_value if str(txt).startswith(reference_datasets) + ] + tables = sorted(list(set(table_from + table_join))) + + # store in dictionary + dicts[f"{self.schema}.{file_name}"] = tables + + return dicts diff --git a/poetry.lock b/poetry.lock index 7ae4cb6..23fe942 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6,6 +6,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "atomicwrites" +version = "1.4.0" +description = "Atomic file writes." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "attrs" +version = "21.2.0" +description = "Classes Without Boilerplate" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + +[package.extras] +dev = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface", "furo", "sphinx", "sphinx-notfound-page", "pre-commit"] +docs = ["furo", "sphinx", "zope.interface", "sphinx-notfound-page"] +tests = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins", "zope.interface"] +tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (>=4.3.0)", "six", "mypy", "pytest-mypy-plugins"] + [[package]] name = "certifi" version = "2020.12.5" @@ -30,6 +52,14 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +[[package]] +name = "colorama" +version = "0.4.4" +description = "Cross-platform colored terminal text." 
+category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" + [[package]] name = "detect-secrets" version = "1.1.0" @@ -124,6 +154,84 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "mo-dots" +version = "4.22.21108" +description = "More Dots! Dot-access to Python dicts like Javascript" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-future = "3.147.20327" +mo-imports = "3.149.20327" + +[[package]] +name = "mo-future" +version = "3.147.20327" +description = "More future! Make Python 2/3 compatibility a bit easier" +category = "main" +optional = false +python-versions = "*" + +[[package]] +name = "mo-imports" +version = "3.149.20327" +description = "More Imports! - Delayed importing" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-future = "3.147.20327" + +[[package]] +name = "mo-kwargs" +version = "4.22.21108" +description = "More KWARGS! Let call parameters override kwargs" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" + +[[package]] +name = "mo-logs" +version = "4.23.21108" +description = "More Logs! 
Structured Logging and Exception Handling" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" +mo-imports = "3.149.20327" +mo-kwargs = "4.22.21108" + +[[package]] +name = "moz-sql-parser" +version = "4.40.21126" +description = "Extract Parse Tree from SQL" +category = "main" +optional = false +python-versions = "*" + +[package.dependencies] +mo-dots = "4.22.21108" +mo-future = "3.147.20327" +mo-logs = "4.23.21108" + [[package]] name = "nodeenv" version = "1.6.0" @@ -132,6 +240,28 @@ category = "main" optional = false python-versions = "*" +[[package]] +name = "packaging" +version = "20.9" +description = "Core utilities for Python packages" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.dependencies] +pyparsing = ">=2.0.2" + +[[package]] +name = "pluggy" +version = "0.13.1" +description = "plugin and hook calling mechanisms for python" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +dev = ["pre-commit", "tox"] + [[package]] name = "pre-commit" version = "2.12.1" @@ -164,6 +294,43 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +[[package]] +name = "py" +version = "1.10.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[[package]] +name = "pyparsing" +version = "2.4.7" +description = "Python parsing module" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" + +[[package]] +name = "pytest" +version = "6.2.4" +description = "pytest: simple powerful testing with Python" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == 
\"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<1.0.0a1" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + [[package]] name = "pyyaml" version = "5.4.1" @@ -206,6 +373,19 @@ category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "tqdm" +version = "4.60.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +telegram = ["requests"] + [[package]] name = "urllib3" version = "1.26.4" @@ -248,13 +428,21 @@ testing = ["coverage (>=4)", "coverage-enable-subprocess (>=1)", "flaky (>=3)", [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "09173b5e497218e0d74966bdf08a78ebbfbc7322863b44a255d4e01538efec91" +content-hash = "daf7c9934fd96f7224a177fe729ebf64c36c6dc462f13c38592ce7058f224a2d" [metadata.files] appdirs = [ {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, ] +atomicwrites = [ + {file = "atomicwrites-1.4.0-py2.py3-none-any.whl", hash = "sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197"}, + {file = "atomicwrites-1.4.0.tar.gz", hash = "sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a"}, +] +attrs = [ + {file = "attrs-21.2.0-py2.py3-none-any.whl", hash = "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1"}, + {file = "attrs-21.2.0.tar.gz", hash = "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"}, +] certifi = [ {file = 
"certifi-2020.12.5-py2.py3-none-any.whl", hash = "sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830"}, {file = "certifi-2020.12.5.tar.gz", hash = "sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c"}, @@ -267,6 +455,10 @@ chardet = [ {file = "chardet-4.0.0-py2.py3-none-any.whl", hash = "sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5"}, {file = "chardet-4.0.0.tar.gz", hash = "sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa"}, ] +colorama = [ + {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, + {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, +] detect-secrets = [ {file = "detect_secrets-1.1.0-py2.py3-none-any.whl", hash = "sha256:be8cca3dc65f6fd637f5dec9f583f1cf4a680dc1a580b3d2e65a5ac7a277456a"}, {file = "detect_secrets-1.1.0.tar.gz", hash = "sha256:68250b31bc108f665f05f0ecfb34f92423280e48e65adbb887fdf721ed909627"}, @@ -301,10 +493,40 @@ mccabe = [ {file = "mccabe-0.6.1-py2.py3-none-any.whl", hash = "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42"}, {file = "mccabe-0.6.1.tar.gz", hash = "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"}, ] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +mo-dots = [ + {file = "mo-dots-4.22.21108.tar.gz", hash = "sha256:f9f2f9a6e44e214959aecd55857b6df487b8e86daf51334e2e982d47ea33ed23"}, +] +mo-future = [ + {file = "mo-future-3.147.20327.tar.gz", hash = "sha256:4aafcf859e4657bc11b04ee9791321e2d44702604ae01cac7c412468cb6a513f"}, +] +mo-imports = [ + {file = "mo-imports-3.149.20327.tar.gz", hash = 
"sha256:32e3dc720c84765224d29ed7e9d6972a369d41d756e6f26ddb5b7aa01241331d"}, +] +mo-kwargs = [ + {file = "mo-kwargs-4.22.21108.tar.gz", hash = "sha256:9c9d00ab86f1f75013193807d90cad17cbc515384cf3b17b8aff3104a300f7ce"}, +] +mo-logs = [ + {file = "mo-logs-4.23.21108.tar.gz", hash = "sha256:de4136a7ce215ecbfd7a368588be0a3f1fd8a6521dc2d4aae57cc1c3ba299aab"}, +] +moz-sql-parser = [ + {file = "moz-sql-parser-4.40.21126.tar.gz", hash = "sha256:b3d37cc8ff118d86009aa12646791549537ec0ae8ac312efd4641289c8eee080"}, +] nodeenv = [ {file = "nodeenv-1.6.0-py2.py3-none-any.whl", hash = "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"}, {file = "nodeenv-1.6.0.tar.gz", hash = "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b"}, ] +packaging = [ + {file = "packaging-20.9-py2.py3-none-any.whl", hash = "sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a"}, + {file = "packaging-20.9.tar.gz", hash = "sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5"}, +] +pluggy = [ + {file = "pluggy-0.13.1-py2.py3-none-any.whl", hash = "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d"}, + {file = "pluggy-0.13.1.tar.gz", hash = "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0"}, +] pre-commit = [ {file = "pre_commit-2.12.1-py2.py3-none-any.whl", hash = "sha256:70c5ec1f30406250b706eda35e868b87e3e4ba099af8787e3e8b4b01e84f4712"}, {file = "pre_commit-2.12.1.tar.gz", hash = "sha256:900d3c7e1bf4cf0374bb2893c24c23304952181405b4d88c9c40b72bda1bb8a9"}, @@ -317,6 +539,18 @@ pyflakes = [ {file = "pyflakes-2.3.1-py2.py3-none-any.whl", hash = "sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3"}, {file = "pyflakes-2.3.1.tar.gz", hash = "sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"}, ] +py = [ + {file = "py-1.10.0-py2.py3-none-any.whl", hash = "sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"}, + {file = 
"py-1.10.0.tar.gz", hash = "sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3"}, +] +pyparsing = [ + {file = "pyparsing-2.4.7-py2.py3-none-any.whl", hash = "sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"}, + {file = "pyparsing-2.4.7.tar.gz", hash = "sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1"}, +] +pytest = [ + {file = "pytest-6.2.4-py3-none-any.whl", hash = "sha256:91ef2131a9bd6be8f76f1f08eac5c5317221d6ad1e143ae03894b862e8976890"}, + {file = "pytest-6.2.4.tar.gz", hash = "sha256:50bcad0a0b9c5a72c8e4e7c9855a3ad496ca6a881a3641b4260605450772c54b"}, +] pyyaml = [ {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"}, {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"}, @@ -360,6 +594,10 @@ toml = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +tqdm = [ + {file = "tqdm-4.60.0-py2.py3-none-any.whl", hash = "sha256:daec693491c52e9498632dfbe9ccfc4882a557f5fa08982db1b4d3adbe0887c3"}, + {file = "tqdm-4.60.0.tar.gz", hash = "sha256:ebdebdb95e3477ceea267decfc0784859aa3df3e27e22d23b83e9b272bf157ae"}, +] urllib3 = [ {file = "urllib3-1.26.4-py2.py3-none-any.whl", hash = "sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df"}, {file = "urllib3-1.26.4.tar.gz", hash = "sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937"}, diff --git a/pyproject.toml b/pyproject.toml index d59f726..6ec76d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,9 @@ pre-commit = "^2.12.1" detect-secrets = "^1.1.0" direnv = "^2020.12.3" flake8 = "^3.9.2" +moz-sql-parser = "^4.40.21126" +tqdm = "^4.60.0" +pytest = "^6.2.4" 
[tool.poetry.dev-dependencies] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..10f2b4a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest + + +pytest_plugins = [ + "tests.fixtures.fixture_extractor", +] + + +@pytest.fixture() +def query_user_activity(): + with open(file="data/reporting/user_activity.sql", mode="r") as f: + return f.read() diff --git a/tests/fixtures/fixture_extractor.py b/tests/fixtures/fixture_extractor.py new file mode 100644 index 0000000..56b49c0 --- /dev/null +++ b/tests/fixtures/fixture_extractor.py @@ -0,0 +1,54 @@ +import pytest + + +@pytest.fixture() +def cleaned_user_activity(): + return ( + " WITH cte_base AS ( " + "SELECT b.name ,b.email ,'commit' AS activity_type ,COUNT(a.*) AS activity_count " + "FROM analytics.commit AS a " + "LEFT JOIN analytics.user AS b " + "ON a.committer_name = b.name " + "AND b.user_type = 'committer' " + "GROUP BY b.name ,b.email ,a.repo_name " + "UNION " + "SELECT a.author_name AS name ,b.email ,'repo' AS activity_type ,COUNT(a.*) AS activity_count " + "FROM analytics.repo AS a " + "LEFT JOIN analytics.user AS b " + "ON a.author_name = b.name " + "GROUP BY a.author_name ,b.email ) " + "SELECT name ,email ,activity_type ,activity_count " + "FROM cte_base " + "UNION " + "SELECT name ,email ,activity_type ,activity_count " + "FROM ( " + "SELECT name ,email ,'total' AS activity_type ,SUM(activity_count) AS activity_count " + "FROM cte_base " + "GROUP BY name ,email ) ; " + ) + + +@pytest.fixture() +def extracted_reporting(): + return { + "reporting.user_activity": [ + "analytics.commit", + "analytics.repo", + "analytics.user", + ] + } + + +@pytest.fixture() +def extracted_analytics(): + return { + "analytics.repo": [ + "github_repos.commits", + "github_repos.languages", + "github_repos.licenses", + ], + "analytics.author": ["github_repos.commits"], + 
"analytics.committer": ["github_repos.commits"], + "analytics.commit": ["github_repos.commits"], + "analytics.user": ["analytics.author", "analytics.committer"], + } diff --git a/tests/integration/test_extractor.py b/tests/integration/test_extractor.py new file mode 100644 index 0000000..d53357a --- /dev/null +++ b/tests/integration/test_extractor.py @@ -0,0 +1,31 @@ +import pytest +import os +from extractor import Extractor + + +# run multiple times to ensure value ordering is preserved +# if not preserved, then test will fail +@pytest.mark.parametrize("execution_number", range(3)) +def test_extract_table_dependencies_from_queries( + execution_number, extracted_reporting, extracted_analytics +): + schemes = ["analytics", "reporting"] + extract = [extracted_analytics, extracted_reporting] + + for i, schema in enumerate(schemes): + dir_report = f"data/{schema}" + remove_txt = [] + for table in os.listdir(dir_report): + table_name, _ = os.path.splitext(p=table) + remove_txt.append(f"MERGE {schema}.{table_name} USING (") + remove_txt.append( + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE" + ) + extractor = Extractor(script_dir=dir_report, schema=schema) + output = extractor.extract_table_dependencies_from_queries( + reference_datasets=["reporting", "analytics", "github_repos"], + str_to_remove=remove_txt, + ) + assert output == extract[i] diff --git a/tests/unit/test_extractor.py b/tests/unit/test_extractor.py new file mode 100644 index 0000000..8674c60 --- /dev/null +++ b/tests/unit/test_extractor.py @@ -0,0 +1,17 @@ +from extractor import Extractor + + +def test_clean_query(query_user_activity, cleaned_user_activity): + schema = "reporting" + dir_report = f"data/{schema}" + extractor = Extractor(script_dir=dir_report, schema=schema) + txt_remove = [ + f"MERGE {schema}.user_activity USING (", + ") ON FALSE WHEN NOT MATCHED THEN " + "INSERT ROW WHEN NOT MATCHED BY SOURCE THEN " + "DELETE", + ] + cleaned_query = 
extractor.clean_query( + query=query_user_activity, str_to_remove=txt_remove + ) + assert cleaned_query == cleaned_user_activity