Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@

# django
__pycache__/

# data
*.csv

# outputs
*.html
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ repos:
rev: 3.9.2
hooks:
- id: flake8
args: ["src"]
- repo: https://github.com/psf/black
rev: 21.5b1 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
Expand Down
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ To run the code in here, ensure your system meets the following requirements:
Note there may be some Python IDE-specific requirements around loading environment variables, which are not considered here.

### Set-up
For quickstart set-up of the project, run the below in your shell:
For quickstart set-up of the project, run the below in your shell/terminal:
```shell script
# 1. read project-specific environment variables
direnv allow
Expand All @@ -34,6 +34,11 @@ poetry install
pre-commit install
```

To then extract the tables and their dependencies from the example SQL scripts in the `sql/` directory, run the following in your shell/terminal:
```shell script
python sqlquerygraph.py -sd 'sql' -ed 'neo4j' -rd 'github_repos' 'analytics' 'reporting'
```

### Run neo4j graph database
We use [neo4j](https://neo4j.com/) for this project to visualise the dependencies between tables. To install neo4j locally using Docker Compose, follow the below instructions:
1. Install Docker
Expand Down
88 changes: 88 additions & 0 deletions exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from typing import Union
import pandas as pd


def convert_dict_to_df(data: dict) -> pd.DataFrame:
    """
    Converts a dictionary into a dataframe, with keys and values being a column each.
    :param data: Dictionary to convert to a dataframe (keys map to lists of values).
    :return: Dataframe with a 'table' column (keys) and a 'dependency' column (values).
    """
    # keys become the index, then an ordinary column named 'table';
    # melting stacks the per-key value lists into one 'dependency' column
    melted = (
        pd.DataFrame.from_dict(data=data, orient="index")
        .reset_index()
        .rename(columns={"index": "table"})
        .melt(
            id_vars="table",
            var_name="original_column_name",
            value_name="dependency",
        )
    )
    # from_dict pads shorter value lists with NaN - drop those padding rows
    melted = melted.loc[melted["dependency"].notnull()].sort_values(by="table")

    return melted[["table", "dependency"]]


def separate_dataset_table(data: Union[pd.Series, pd.DataFrame]) -> pd.DataFrame:
    """
    Separates strings of <dataset>.<table_name> into dataset and table name.

    Each input column is replaced by two new columns, '<col>_dataset' and
    '<col>_name', produced by stripping backslashes and splitting on the
    first period.

    :param data: Series (treated as a single 'table' column) or dataframe whose
        string entries should be split into dataset and table name.
    :return: Dataframe with '<col>_dataset' and '<col>_name' columns per input column.
    """
    # A bare Series becomes a one-column frame named 'table'.
    # (Previously cols was set to the string "table", so the loop below
    # iterated over its characters 't', 'a', 'b', ... - a bug.)
    if isinstance(data, pd.Series):
        data = data.to_frame(name="table")

    # snapshot the column labels - the loop mutates `data`
    for col in list(data.columns):
        col_names = [f"{col}_dataset", f"{col}_name"]
        # remove backslashes literally; pat="\\" with regex=True is an
        # invalid regular expression (trailing escape) and raised re.error
        data[col] = data[col].str.replace(pat="\\", repl="", regex=False)
        data[col_names] = data[col].str.split(pat=".", n=1, expand=True)
        # remove the original combined column
        data = data.drop(columns=col)

    return data


def export_unique_names(data: pd.DataFrame, path_or_buf: str):
    """
    Concatenates and unions a dataframe so we get unique table names. This is so we create nodes in neo4j.
    :param data: Dataframe to get the names from. Requires columns 'table_dataset',
        'table_name', 'dependency_dataset' and 'dependency_name'.
    :param path_or_buf: String of the directory to store files.
    :return:
    """
    # rename the dependency columns so both halves can be stacked into one frame
    dependencies = data[["dependency_dataset", "dependency_name"]].rename(
        columns={"dependency_dataset": "table_dataset", "dependency_name": "table_name"}
    )
    combined = pd.concat(
        objs=[data[["table_dataset", "table_name"]], dependencies], axis="index"
    )

    # write one CSV of de-duplicated table names per dataset
    for dataset in combined["table_dataset"].unique():
        subset = combined[combined["table_dataset"] == dataset]
        subset = subset.drop_duplicates(subset="table_name")
        subset.to_csv(path_or_buf=f"{path_or_buf}/{dataset}_tables.csv", index=False)


def export_table_dependency(data: pd.DataFrame, path_or_buf: str):
    """
    Filters a dataframe by its table and dependency levels so it can be exported into neo4j.
    :param data: Dataframe to filter by table and dependency.
        Requires columns to be called 'table_dataset' and 'dependency_dataset'.
    :param path_or_buf: String of the directory to store files.
    :return:
    """
    # Group on the (table_dataset, dependency_dataset) pairs actually present.
    # The previous nested loop over the two columns' unique() values took the
    # full cross product and so also wrote empty CSV files for dataset
    # combinations that had no rows.
    for (t_ds, d_ds), group in data.groupby(["table_dataset", "dependency_dataset"]):
        group.to_csv(
            path_or_buf=f"{path_or_buf}/{t_ds}_{d_ds}_dependency.csv", index=False
        )
2 changes: 2 additions & 0 deletions extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ def extract_table_dependencies_from_queries(
]
tables = sorted(list(set(table_from + table_join)))

if verbose:
print(f"Extracted table names are {tables}...\n")
# store in dictionary
dicts[f"{self.schema}.{file_name}"] = tables

Expand Down
Empty file added neo4j/.gitkeep
Empty file.
141 changes: 118 additions & 23 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ flake8 = "^3.9.2"
moz-sql-parser = "^4.40.21126"
tqdm = "^4.60.0"
pytest = "^6.2.4"
pandas = "^1.2.4"
numpy = "^1.20.3"

[tool.poetry.dev-dependencies]

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading