Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
4 changes: 4 additions & 0 deletions .github/workflows/pytesting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,7 @@ jobs:
poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
poetry run pytest tests/unit
poetry run pytest tests/integration
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,6 @@

# environment
.env

# django
__pycache__/
38 changes: 37 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,40 @@
# sqlquerygraph
[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlquerygraph/actions) [![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph) [![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![build status](https://github.com/avisionh/sqlquerygraph/workflows/pytesting/badge.svg)](https://github.com/avisionh/sqlquerygraph/actions)
[![](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/downloads/)
[![CodeFactor](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph/badge)](https://www.codefactor.io/repository/github/avisionh/sqlquerygraph)
[![License: MIT](https://img.shields.io/badge/License-MIT-informational.svg)](https://opensource.org/licenses/MIT)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

Parse your SQL queries and represent their structure as a graph.

Currently, we implement the ability to represent how each of the tables in a set of SQL query scripts depends on the others.

## Requirements
To run the code in here, ensure your system meets the following requirements:
- Unix-like operating system (macOS, Linux, ...);
- [`direnv`](https://direnv.net/) installed, including shell hooks;
- [`.envrc`](https://github.com/avisionh/sqlquerygraph/blob/main/.envrc) allowed/trusted by `direnv` to
use the environment variables - see [below](#allowingtrusting-envrc);
- Python 3.8 or above; and
- [Poetry](https://python-poetry.org/docs/) installed.

Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here.

### Set-up
For quickstart set-up of the project, run the below in your shell:
```shell script
# 1. read project-specific environment variables
direnv allow

# 2. activate virtual environment and install package dependencies
poetry shell
poetry install

# 3. check adherence to good standards on every commit
pre-commit install
```

***

## Acknowledgements
This builds on the excellent [moz-sql-parser](https://github.com/mozilla/moz-sql-parser) package.
16 changes: 16 additions & 0 deletions data/analytics/author.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* Full refresh of analytics.author: flattens the nested `author` record of
   github_repos.commits into one row per commit. ON FALSE guarantees no
   source/target rows ever match, so the MERGE inserts every source row and
   deletes every pre-existing target row (truncate-and-reload pattern). */
MERGE analytics.author
USING (
SELECT
author.name AS name
,author.email AS email
,author.time_sec AS time_sec
,author.tz_offset AS tz_offset
,author.date.seconds AS date_seconds
,author.date.nanos AS date_nanos
FROM github_repos.commits
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
21 changes: 21 additions & 0 deletions data/analytics/commit.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/* Full refresh of analytics.commit: one row per commit from
   github_repos.commits, keeping flattened author/committer name and time
   fields alongside the raw commit metadata. ON FALSE means no rows ever
   match, so the MERGE inserts every source row and deletes every
   pre-existing target row (truncate-and-reload pattern). */
MERGE analytics.commit
USING (
SELECT
commit
,tree
,parent
,author.name AS author_name
,author.time_sec AS author_timesec
,committer.name AS committer_name
,committer.time_sec AS committer_time_sec
,subject
,message
,repo_name
,difference_truncated
FROM github_repos.commits
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
16 changes: 16 additions & 0 deletions data/analytics/committer.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/* Full refresh of analytics.committer: flattens the nested `committer`
   record of github_repos.commits into one row per commit. ON FALSE makes
   the MERGE a truncate-and-reload: every source row is inserted and every
   pre-existing target row is deleted. Mirrors analytics/author.sql. */
MERGE analytics.committer
USING (
SELECT
committer.name AS name
,committer.email AS email
,committer.time_sec AS time_sec
,committer.tz_offset AS tz_offset
,committer.date.seconds AS date_seconds
,committer.date.nanos AS date_nanos
FROM github_repos.commits
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
20 changes: 20 additions & 0 deletions data/analytics/repo.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/* Full refresh of analytics.repo: commits left-joined to per-repo language
   and license information, keyed on repo_name. ON FALSE makes the MERGE a
   truncate-and-reload: every source row is inserted and every pre-existing
   target row is deleted. */
MERGE analytics.repo
USING (
SELECT
a.repo_name
,a.author.name AS author_name
,a.author.time_sec AS author_time_sec
,b.language.name AS language
,b.language.bytes AS repo_size
,c.license
FROM github_repos.commits AS a
LEFT JOIN github_repos.languages AS b
ON a.repo_name = b.repo_name
LEFT JOIN github_repos.licenses AS c
ON a.repo_name = c.repo_name
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
19 changes: 19 additions & 0 deletions data/analytics/user.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
/* Full refresh of analytics.user: distinct (name, email) pairs drawn from
   both analytics.author and analytics.committer, tagged with their
   user_type; a person appearing in both sources gets two rows. ON FALSE
   makes the MERGE a truncate-and-reload: every source row is inserted and
   every pre-existing target row is deleted. */
MERGE analytics.user
USING (
SELECT DISTINCT
name
,email
,'author' AS user_type
FROM analytics.author
UNION
SELECT DISTINCT
name
,email
,'committer' AS user_type
FROM analytics.committer
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
63 changes: 63 additions & 0 deletions data/reporting/user_activity.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/* Full refresh of reporting.user_activity: per-user activity counts of
   type 'commit' (from analytics.commit) and 'repo' (from analytics.repo),
   plus a derived 'total' row per user summing the per-type counts.
   ON FALSE makes the MERGE a truncate-and-reload: every source row is
   inserted and every pre-existing target row is deleted. */
MERGE reporting.user_activity
USING (
WITH cte_base AS
(
/* commit activity per committer.
   NOTE(review): grouped by a.repo_name, but repo_name is not selected, so
   a committer can appear once per repo with per-repo counts — confirm
   this is intended rather than a single count per user. */
SELECT
b.name
,b.email
,'commit' AS activity_type
,COUNT(a.*) AS activity_count
FROM analytics.commit AS a
LEFT JOIN analytics.user AS b
ON a.committer_name = b.name
AND b.user_type = 'committer'
GROUP BY
b.name
,b.email
,a.repo_name

UNION

/* repo activity per author */
SELECT
a.author_name AS name
,b.email
,'repo' AS activity_type
,COUNT(a.*) AS activity_count
FROM analytics.repo AS a
LEFT JOIN analytics.user AS b
ON a.author_name = b.name
GROUP BY
a.author_name
,b.email
)

SELECT
name
,email
,activity_type
,activity_count
FROM cte_base
UNION
/* append a per-user 'total' row aggregating all activity types */
SELECT
name
,email
,activity_type
,activity_count
FROM
(
SELECT
name
,email
,'total' AS activity_type
,SUM(activity_count) AS activity_count
FROM cte_base
GROUP BY
name
,email
)
)
ON FALSE
WHEN NOT MATCHED THEN
INSERT ROW
WHEN NOT MATCHED BY SOURCE THEN
DELETE;
176 changes: 176 additions & 0 deletions extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import os
import re
from pprint import pprint
from typing import Tuple, Union

from moz_sql_parser import parse
from tqdm import tqdm


class Extractor:
    """
    Extract table names from SQL queries.

    Note: Relies on each SQL script being named the same as the table or view
    it is creating.

    :param script_dir: String of the directory where we store our SQL queries.
    :param schema: String of the dataset/schema that the tables created by the
        SQL queries belong to.
    """

    def __init__(self, script_dir: str, schema: str):
        self.script_dir = script_dir
        self.schema = schema

    def read_query(self, file: str) -> Tuple[str, str]:
        """
        Reads a SQL file in.
        Note: Relies on SQL script being named the same as table or view it is creating.

        :param file: String of the file to read query from.
        :return: Tuple of strings of the table name and SQL query from the file.
        :raises ValueError: If the file does not have a `.sql` extension.
        """
        file_name, file_extension = os.path.splitext(p=file)
        # guard clause: only .sql files contain queries we can parse
        if file_extension != ".sql":
            raise ValueError(
                f"Passed in a {file_extension} file. \n"
                f"Please pass in a .sql file instead."
            )
        with open(file=os.path.join(self.script_dir, file), mode="r") as f:
            query = f.read()
        return file_name, query

    @staticmethod
    def clean_query(query: str, str_to_remove: Union[str, list] = None) -> str:
        """
        Cleans a query so it can be parsed.

        :param query: String of the query to clean.
        :param str_to_remove: String or list of strings to remove from the query.
        :return: String of the cleaned query to parse.
        """
        # remove new lines and multiple spaces
        query = query.replace("\n", " ")
        query = re.sub(pattern=r"\s+", repl=" ", string=query)

        if str_to_remove is not None:
            # a bare string would otherwise be iterated character-by-character,
            # deleting each letter everywhere instead of the substring once
            if isinstance(str_to_remove, str):
                str_to_remove = [str_to_remove]
            for txt in str_to_remove:
                query = query.replace(txt, "")

        return query

    @staticmethod
    def parse_query(query: str, print_tree: bool = False) -> dict:
        """
        Parse a query into a JSON parse-tree.

        :param query: String of the SQL query to parse as a JSON parse-tree.
        :param print_tree: Boolean to print the JSON parse-tree.
        :return: Dictionary of the query as a JSON parse-tree.
        """
        query_json = parse(sql=query)
        if print_tree:
            pprint(object=query_json)
        return query_json

    @staticmethod
    def extract_from_json(obj: dict, key: str) -> list:
        """
        Recursively fetch values from a nested JSON.

        For our purposes, extracting where key is 'from' allows extraction of *most* table names after a `FROM` clause.
        - It does not extract the table names when the name is nested in a subquery.
        - Nor does it extract table names in `<TYPE> JOIN` clauses.
        To achieve the above two, need to extract where the key is 'value' and compare with actual table names.
        This is because the values returned when key is 'value' are table names, column names etc.
        Reference
        - https://hackersandslackers.com/extract-data-from-complex-json-python/
        :param obj: Dictionary to extract values from.
        :param key: String of the value you want to extract.
        :return: List of values for the key.
        """
        arr = []

        def extract(obj: Union[dict, list], arr: list, key: str) -> list:
            """
            Recursively search for values of key in a JSON tree.

            :param obj: Dictionary or list to extract values from.
            :param arr: List to store extracted values to.
            :param key: String of the dictionary key to extract associated value from.
            :return: List of the extracted values.
            """
            if isinstance(obj, dict):
                for k, v in obj.items():
                    # recurse into containers; only leaf values are collected
                    if isinstance(v, (dict, list)):
                        extract(obj=v, arr=arr, key=key)
                    elif k == key:
                        arr.append(v)
            elif isinstance(obj, list):
                for item in obj:
                    extract(obj=item, arr=arr, key=key)
            return arr

        return extract(obj=obj, arr=arr, key=key)

    def extract_table_dependencies_from_queries(
        self,
        reference_datasets: list,
        str_to_remove: Union[str, list] = None,
        verbose: bool = False,
    ) -> dict:
        """
        Extracts the table names and their dependencies from a set of .sql files.

        :param reference_datasets: List of datasets/schemas of the database.
        :param str_to_remove: String or list of strings to remove from each query.
        :param verbose: Boolean to output steps taken and query after cleaning. Useful for debugging.
        :return: Dictionary of tables as keys and their dependent tables as values.
        """
        queries, jsons, dicts = {}, {}, {}
        # tuple of '<dataset>.' prefixes; str.startswith accepts a tuple of options
        reference_datasets = tuple(f"{txt}." for txt in reference_datasets)
        for file in tqdm(os.listdir(path=self.script_dir)):
            if verbose:
                print(f"Reading query {file}...\n")
            file_name, query = self.read_query(file=file)
            queries[file_name] = query

            if str_to_remove is not None:
                if verbose:
                    print(
                        f"Cleaning query {file_name} by removing {str_to_remove}...\n"
                    )
                queries[file_name] = self.clean_query(
                    query=queries[file_name], str_to_remove=str_to_remove
                )

            if verbose:
                print(f"Cleaned query is {queries[file_name]}")
                print(f"Parsing query {file_name}...\n")
            jsons[file_name] = self.parse_query(
                query=queries[file_name], print_tree=verbose
            )

            if verbose:
                print(f"Extracting table names from {file_name}...\n")
            # - from: tables after 'from' clause
            #   + though sometimes keys are not 'from' so need to
            #   + look at values associated to the 'value' key
            # - value: tables after '... join' clauses
            #   + can also include tables after 'from' clause if they
            #   + are in a subquery
            table_from = self.extract_from_json(obj=jsons[file_name], key="from")

            # keep only table elements and not table aliases - as defined by period
            table_from = [txt for txt in table_from if "." in txt]
            table_value = self.extract_from_json(obj=jsons[file_name], key="value")
            # extract table values when they start with `<schema>.`
            table_join = [
                txt for txt in table_value if str(txt).startswith(reference_datasets)
            ]
            # de-duplicate and sort for a deterministic output
            tables = sorted(set(table_from + table_join))

            # store in dictionary, keyed by the fully-qualified table name
            dicts[f"{self.schema}.{file_name}"] = tables

        return dicts
Loading