Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@

# django
__pycache__/

# data
*.csv

# outputs
*.html
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ repos:
rev: 3.9.2
hooks:
- id: flake8
args: ["src"]
- repo: https://github.com/psf/black
rev: 21.5b1 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
Expand Down
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ To run the code in here, ensure your system meets the following requirements:
Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here.

### Set-up
For quickstart set-up of the project, run the below in your shell:
For quickstart set-up of the project, run the below in your shell/terminal:
```shell script
# 1. read project-specific environment variables
direnv allow
Expand All @@ -34,6 +34,11 @@ poetry install
pre-commit install
```

To then extract the tables and their dependencies from the example SQL scripts in the `sql/` directory, run the following in your shell/terminal:
```shell script
python sqlquerygraph.py -sd 'sql' -ed 'neo4j' -rd 'github_repos' 'analytics' 'reporting'
```

### Run neo4j graph database
We use [neo4j](https://neo4j.com/) for this project to visualise the dependencies between tables. To install neo4j locally using Docker Compose, follow the below instructions:
1. Install Docker
Expand Down
88 changes: 88 additions & 0 deletions exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from typing import Union
import pandas as pd


def convert_dict_to_df(data: dict) -> pd.DataFrame:
    """
    Converts a dictionary into a dataframe, with keys and values being a column each.
    :param data: Dictionary to convert to a dataframe (keys map to lists of values).
    :return: Dataframe with a 'table' column (keys) and a 'dependency' column (values).
    """
    # keys become the index, then an ordinary column named 'table';
    # melting stacks the per-key value lists into one 'dependency' column
    melted = (
        pd.DataFrame.from_dict(data=data, orient="index")
        .reset_index()
        .rename(columns={"index": "table"})
        .melt(
            id_vars="table",
            var_name="original_column_name",
            value_name="dependency",
        )
    )
    # from_dict pads shorter value lists with NaN - drop those padding rows
    melted = melted.loc[melted["dependency"].notnull()].sort_values(by="table")

    return melted[["table", "dependency"]]


def separate_dataset_table(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    """
    Separates strings of <dataset>.<table_name> into dataset and table name.

    Each input column is replaced by two new columns, '<col>_dataset' and
    '<col>_name', produced by stripping backslashes and splitting on the
    first period.

    :param data: Series (treated as a single 'table' column) or dataframe whose
        string entries should be split into dataset and table name.
    :return: Dataframe with '<col>_dataset' and '<col>_name' columns per input column.
    """
    # A bare Series becomes a one-column frame named 'table'.
    # (Previously cols was set to the string "table", so the loop below
    # iterated over its characters 't', 'a', 'b', ... - a bug.)
    if isinstance(data, pd.Series):
        data = data.to_frame(name="table")

    # snapshot the column labels - the loop mutates `data`
    for col in list(data.columns):
        col_names = [f"{col}_dataset", f"{col}_name"]
        # remove backslashes literally; pat="\\" with regex=True is an
        # invalid regular expression (trailing escape) and raised re.error
        data[col] = data[col].str.replace(pat="\\", repl="", regex=False)
        data[col_names] = data[col].str.split(pat=".", n=1, expand=True)
        # remove the original combined column
        data = data.drop(columns=col)

    return data


def export_unique_names(data: pd.DataFrame, path_or_buf: str):
    """
    Concatenates and unions a dataframe so we get unique table names. This is so we create nodes in neo4j.
    :param data: Dataframe to get the names from. Requires columns 'table_dataset',
        'table_name', 'dependency_dataset' and 'dependency_name'.
    :param path_or_buf: String of the directory to store files.
    :return:
    """
    # rename the dependency columns so both halves can be stacked into one frame
    dependencies = data[["dependency_dataset", "dependency_name"]].rename(
        columns={"dependency_dataset": "table_dataset", "dependency_name": "table_name"}
    )
    combined = pd.concat(
        objs=[data[["table_dataset", "table_name"]], dependencies], axis="index"
    )

    # write one CSV of de-duplicated table names per dataset
    for dataset in combined["table_dataset"].unique():
        subset = combined[combined["table_dataset"] == dataset]
        subset = subset.drop_duplicates(subset="table_name")
        subset.to_csv(path_or_buf=f"{path_or_buf}/{dataset}_tables.csv", index=False)


def export_table_dependency(data: pd.DataFrame, path_or_buf: str):
    """
    Filters a dataframe by its table and dependency levels so it can be exported into neo4j.
    :param data: Dataframe to filter by table and dependency.
        Requires columns to be called 'table_dataset' and 'dependency_dataset'.
    :param path_or_buf: String of the directory to store files.
    :return:
    """
    # Group on the (table_dataset, dependency_dataset) pairs actually present.
    # The previous nested loop over the two columns' unique() values took the
    # full cross product and so also wrote empty CSV files for dataset
    # combinations that had no rows.
    for (t_ds, d_ds), group in data.groupby(["table_dataset", "dependency_dataset"]):
        group.to_csv(
            path_or_buf=f"{path_or_buf}/{t_ds}_{d_ds}_dependency.csv", index=False
        )
2 changes: 2 additions & 0 deletions extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ def extract_table_dependencies_from_queries(
]
tables = sorted(list(set(table_from + table_join)))

if verbose:
print(f"Extracted table names are {tables}...\n")
# store in dictionary
dicts[f"{self.schema}.{file_name}"] = tables

Expand Down
Empty file added neo4j/.gitkeep
Empty file.
141 changes: 118 additions & 23 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ flake8 = "^3.9.2"
moz-sql-parser = "^4.40.21126"
tqdm = "^4.60.0"
pytest = "^6.2.4"
pandas = "^1.2.4"
numpy = "^1.20.3"

[tool.poetry.dev-dependencies]

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading