From 88461a17f2f8914a81bbf7f5df2a8b6d35392e81 Mon Sep 17 00:00:00 2001 From: "Miroslav Chomut (CZ)" Date: Thu, 17 Oct 2024 10:42:32 +0200 Subject: [PATCH 1/3] #22 Update schemas --- conf/topics.json | 185 ++++++++++++++++++++--------------------- scripts/notebook.ipynb | 25 +++--- 2 files changed, 104 insertions(+), 106 deletions(-) diff --git a/conf/topics.json b/conf/topics.json index d1c9a1e..0a8006c 100644 --- a/conf/topics.json +++ b/conf/topics.json @@ -1,122 +1,119 @@ { "run.topic": { + "type": "object", "properties": { - "app_id_snow": { - "description": "Application ID or ServiceNow identifier", - "type": "string" - }, - "data_definition_id": { - "description": "Identifier for the data definition", - "type": "string" - }, - "environment": { - "description": "Environment", - "type": "string" - }, - "guid": { - "description": "Unique identifier for the event (GUID)", - "type": "string" + "event_id": { + "type": "string", + "description": "Unique identifier for the event (GUID), generated for each unique event, for de-duplication purposes" }, "job_ref": { - "description": "Identifier of the job in it’s respective system.", - "type": "string" + "type": "string", + "description": "Identifier of the job in its respective system (e.g. Spark Application Id, Glue Job Id, EMR Step Id, etc)." }, - "message": { - "description": "Pipeline status message.", - "type": "string" + "tenant_id": { + "type": "string", + "description": "Application ID (4 letter code) or ServiceNow identifier related to the pipeline/domain/process owner (tenant of the tool)" }, "source_app": { - "description": "Source application name", - "type": "string" - }, - "status": { - "description": "Status of the run. 
Does not speak of the quality.", - "enum": [ - "Finished", - "Failed", - "Killed" - ], - "type": "string" + "type": "string", + "description": "Standardized source application name (aqueduct, unify, lum, etc)" }, - "timestamp_end": { - "description": "End timestamp of the run in epoch milliseconds", - "type": "number" + "source_app_version": { + "type": "string", + "description": "Source application version (SemVer preferred)" + }, + "environment": { + "type": "string", + "description": "Environment (dev, uat, pre-prod, prod, test or others)" }, "timestamp_start": { - "description": "Start timestamp of the run in epoch milliseconds", - "type": "number" + "type": "number", + "description": "Start timestamp of the run in epoch milliseconds" + }, + "timestamp_end": { + "type": "number", + "description": "End timestamp of the run in epoch milliseconds" + }, + "jobs": { + "type": "array", + "description": "List of individual jobs within the run", + "element_type": "object", + "object_schema": { + "catalog_id": { + "type": "string", + "description": "Identifier for the data definition (Glue/Hive) database and table name for example" + }, + "status": { + "type": "string", + "enum": ["succeeded", "failed", "killed", "skipped"], + "description": "Status of the job." + }, + "timestamp_start": { + "type": "number", + "description": "Start timestamp of a job that is a part of a run in epoch milliseconds" + }, + "timestamp_end": { + "type": "number", + "description": "End timestamp of a job that is a part of a run in epoch milliseconds" + }, + "message": { + "type": "string", + "description": "Job status/error message." 
+ } + } } }, - "required": [ - "guid", - "app_id_snow", - "source_app", - "timestamp_start", - "timestamp_end", - "data_definition_id", - "status" - ], - "type": "object" + "required": ["event_id", "job_ref", "tenant_id", "source_app", "source_app_version", "environment", "timestamp_start", "timestamp_end", "jobs", "jobs.catalog_id", "jobs.status", " jobs.timestamp_start", "jobs.timestamp_end"] }, "edla.change.topic": { + "type": "object", "properties": { - "app_id_snow": { - "description": "Application ID or ServiceNow identifier", - "type": "string" + "event_id": { + "type": "string", + "description": "Unique identifier for the event (GUID)" }, - "data_definition_id": { - "description": "Identifier for the data definition", - "type": "string" + "tenant_id": { + "type": "string", + "description": "Application ID or ServiceNow identifier" }, - "environment": { - "description": "Environment", - "type": "string" + "source_app": { + "type": "string", + "description": " Standardized source application name (aqueduct, unify, lum, etc)" }, - "format": { - "description": "Format of the data", - "type": "string" + "source_app_version": { + "type": "string", + "description": "Source application version (SemVer preferred)" }, - "guid": { - "description": "Unique identifier for the event (GUID)", - "type": "string" + "environment": { + "type": "string", + "description": "Environment (dev, uat, pre-prod, prod, test or others)" }, - "location": { - "description": "Location of the data", - "type": "string" + "timestamp_event": { + "type": "number", + "description": "Timestamp of the event in epoch milliseconds" + }, + "catalog_id": { + "type": "string", + "description": "Identifier for the data definition (Glue/Hive) database and table name for example " }, "operation": { - "description": "Operation performed", - "enum": [ - "CREATE", - "UPDATE", - "ARCHIVE" - ], - "type": "string" - }, - "schema_link": { - "description": "Link to the data schema", - "type": "string" + "type": 
"string", + "enum": ["overwrite", "append", "archive", "delete"], + "description": "Operation performed" }, - "source_app": { - "description": "Source application name", - "type": "string" + "location": { + "type": "string", + "description": "Location of the data" }, - "timestamp_event": { - "description": "Timestamp of the event in epoch milliseconds", - "type": "number" + "format": { + "type": "string", + "description": "Format of the data (parquet, delta, crunch, etc)." + }, + "format_options": { + "type": "object", + "description": "When possible, add additional options related to the format" } }, - "required": [ - "guid", - "app_id_snow", - "source_app", - "timestamp_event", - "data_definition_id", - "operation", - "location", - "format", - "schema_link" - ], - "type": "object" + "required": ["event_id", "tenant_id", "source_app", "source_app_version", "environment", "timestamp_event", "catalog_id", "operation", "format"] } -} \ No newline at end of file +} diff --git a/scripts/notebook.ipynb b/scripts/notebook.ipynb index 13ea40b..738fe10 100644 --- a/scripts/notebook.ipynb +++ b/scripts/notebook.ipynb @@ -84,17 +84,18 @@ " \"pathParameters\": {\"topic_name\": \"edla.change.topic\"},\n", " \"headers\": {\"bearer\": jwtToken},\n", " \"body\": json.dumps({\n", - " \"app_id_snow\": \"app-1234\",\n", - " \"data_definition_id\": \"data-def-5678\",\n", - " \"environment\": \"DEV\",\n", - " \"format\": \"JSON\",\n", - " \"guid\": \"550e8400-e29b-41d4-a716-446655440000\",\n", - " \"location\": \"s3://data-lake/customer_data\",\n", - " \"operation\": \"CREATE\",\n", - " \"schema_link\": \"https://schema-registry.company.com/schemas/data-def-5678\",\n", - " \"source_app\": \"DataIngestionApp\",\n", - " \"timestamp_event\": 1657896543210\n", - " })\n", + " \"event_id\": \"JupyterEventId\",\n", + " \"tenant_id\": \"JupyterTenantId\",\n", + " \"source_app\": \"JupyterSrc\",\n", + " \"source_app_version\": \"v2024-10-17\",\n", + " \"environment\": \"JupyterEnv\",\n", + " 
\"timestamp_event\": 20241017,\n", + " \"catalog_id\": \"TestCatalog\",\n", + " \"operation\": \"delete\",\n", + " \"location\": \"UnitTest\",\n", + " \"format\": \"TestFormat\",\n", + " \"format_options\": {\"Foo\" : \"Bar\"}\n", + " })\n", "}, {})" ] }, @@ -128,7 +129,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.2" + "version": "3.12.6" } }, "nbformat": 4, From c932b90f1a2d849d57576d199b4df4d03d4172bc Mon Sep 17 00:00:00 2001 From: "Miroslav Chomut (CZ)" Date: Wed, 23 Oct 2024 10:09:56 +0200 Subject: [PATCH 2/3] Update scripts/notebook.ipynb Co-authored-by: Ruslan Yushchenko --- scripts/notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/notebook.ipynb b/scripts/notebook.ipynb index 738fe10..c95fbe0 100644 --- a/scripts/notebook.ipynb +++ b/scripts/notebook.ipynb @@ -89,7 +89,7 @@ " \"source_app\": \"JupyterSrc\",\n", " \"source_app_version\": \"v2024-10-17\",\n", " \"environment\": \"JupyterEnv\",\n", - " \"timestamp_event\": 20241017,\n", + " \"timestamp_event\": 1729602770000,\n", " \"catalog_id\": \"TestCatalog\",\n", " \"operation\": \"delete\",\n", " \"location\": \"UnitTest\",\n", From 8f91757fa904aa4c16718e87797cfec6267f6f2b Mon Sep 17 00:00:00 2001 From: "Miroslav Chomut (CZ)" Date: Wed, 23 Oct 2024 10:44:41 +0200 Subject: [PATCH 3/3] topics requirements update --- conf/topics.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/topics.json b/conf/topics.json index 0a8006c..2f83383 100644 --- a/conf/topics.json +++ b/conf/topics.json @@ -63,7 +63,7 @@ } } }, - "required": ["event_id", "job_ref", "tenant_id", "source_app", "source_app_version", "environment", "timestamp_start", "timestamp_end", "jobs"] }, 
"edla.change.topic": { "type": "object",