From 1648c9b3d1676c4724a9a0fbc27ede91989280ca Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Thu, 20 Aug 2020 16:23:42 +0200 Subject: [PATCH 01/18] testrun fix (spark.testing.memory=1g) --- atum/src/main/scala/za/co/absa/atum/utils/SparkTestBase.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/atum/src/main/scala/za/co/absa/atum/utils/SparkTestBase.scala b/atum/src/main/scala/za/co/absa/atum/utils/SparkTestBase.scala index 63697264..85d4f99b 100644 --- a/atum/src/main/scala/za/co/absa/atum/utils/SparkTestBase.scala +++ b/atum/src/main/scala/za/co/absa/atum/utils/SparkTestBase.scala @@ -30,6 +30,7 @@ trait SparkTestBase { .config("spark.driver.bindAddress", "127.0.0.1") .config("spark.driver.host", "127.0.0.1") .config("spark.ui.enabled", "false") + .config("spark.testing.memory", 1024*1024*1024) // otherwise may fail based on local machine settings .getOrCreate() } From b1f5e1726439c0c96700673e84d24518fee1b685 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Fri, 21 Aug 2020 13:41:02 +0200 Subject: [PATCH 02/18] spark-version update to v2.4.5, json4s update, aws sdk latest version added. --- .../scala/za/co/absa/atum/AtumImplicits.scala | 247 ++++++++++-------- pom.xml | 21 +- 2 files changed, 150 insertions(+), 118 deletions(-) diff --git a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala index f37996bd..c96bc86c 100644 --- a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala +++ b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala @@ -21,56 +21,58 @@ import org.apache.spark.sql.{Dataset, Row, SparkSession} import za.co.absa.atum.core.Atum.controlFrameworkState import za.co.absa.atum.core.{Atum, Constants, SparkEventListener, SparkQueryExecutionListener} import za.co.absa.atum.persistence._ +import za.co.absa.atum.persistence.hdfs.{ControlMeasuresHdfsLoaderJsonFile, ControlMeasuresHdfsStorerJsonFile} +import za.co.absa.atum.persistence.s3.{ControlMeasuresS3LoaderJsonFile, ControlMeasuresS3StorerJsonFile} /** - * The object contains implicit methods for Control Framework - * Minimalistic example of enabling control measurements tracking: - * {{{ - * import za.co.absa.atum.Atum - * import za.co.absa.atum.AtumImplicits._ - * - * ... - * - * spark.enableControlFrameworkTracking(sourceInfoFile = "/source/info/file/path") - * - * ... - * - * dataSet.setCheckpoint("Checkpoint Name") - * }}} - * - * You can use enableControlFrameworkTracking() without parameters if the _INFO file - * is in the path. - * - */ + * The object contains implicit methods for Control Framework + * Minimalistic example of enabling control measurements tracking: + * {{{ + * import za.co.absa.atum.Atum + * import za.co.absa.atum.AtumImplicits._ + * + * ... + * + * spark.enableControlFrameworkTracking(sourceInfoFile = "/source/info/file/path") + * + * ... + * + * dataSet.setCheckpoint("Checkpoint Name") + * }}} + * + * You can use enableControlFrameworkTracking() without parameters if the _INFO file + * is in the path. + * + */ object AtumImplicits { - type DefaultControlInfoStorer = ControlMeasuresStorerJsonFile - type DefaultControlInfoLoader = ControlMeasuresLoaderJsonFile + type DefaultControlInfoStorer = ControlMeasuresHdfsStorerJsonFile + type DefaultControlInfoLoader = ControlMeasuresHdfsLoaderJsonFile implicit def StringToPath(path: String): Path = new Path(path) /** - * The class contains implicit methods for [[org.apache.spark.sql.SparkSession]]. 
- */ + * The class contains implicit methods for [[org.apache.spark.sql.SparkSession]]. + */ implicit class SparkSessionWrapper(sparkSession: SparkSession) { /** - * Enable control measurements tracking. - * Input and output info file names will be inferred automatically based on data source and destination paths - * - */ + * Enable control measurements tracking. + * Input and output info file names will be inferred automatically based on data source and destination paths + * + */ def enableControlMeasuresTracking(): SparkSession = { enableControlMeasuresTracking(None, None) } /** - * Enable control measurements tracking. - * Both input and output info file paths need to be provided - * - * Example info file path name: "data/input/wikidata.csv.info" - * - * @param sourceInfoFile Pathname to a json-formatted info file containing control measurements - * @param destinationInfoFile Pathname to save the control measurement results to - */ + * Enable control measurements tracking. + * Both input and output info file paths need to be provided + * + * Example info file path name: "data/input/wikidata.csv.info" + * + * @param sourceInfoFile Pathname to a json-formatted info file containing control measurements + * @param destinationInfoFile Pathname to save the control measurement results to + */ def enableControlMeasuresTracking(sourceInfoFile: String = "", destinationInfoFile: String = ""): SparkSession = { val hadoopConfiguration = sparkSession.sparkContext.hadoopConfiguration @@ -81,14 +83,27 @@ object AtumImplicits { enableControlMeasuresTracking(loader, storer) } + // TODO need souceS3Location, dest s3location and possibly some s3 kms:sse, kmskeyId + def enableControlMeasuresTrackingForS3(sourceS3Location: Option[S3Location], + destinationS3Config: Option[(S3Location, S3KmsSettings)] + ): SparkSession = { + + val loader = sourceS3Location.map(new ControlMeasuresS3LoaderJsonFile(_)) + val storer = destinationS3Config.map { case (destLoc, kms) => + new ControlMeasuresS3StorerJsonFile(destLoc, kms) + } + + enableControlMeasuresTracking(loader, storer) + } + /** - * Enable control measurements tracking. - * This is a generic way to enable control measurements tracking enabling to provide a custom - * control measurements loader and storer objects - * - * @param loader An object responsible for loading data source control measurements - * @param storer An object responsible for storing the result control measurements - */ + * Enable control measurements tracking. + * This is a generic way to enable control measurements tracking enabling to provide a custom + * control measurements loader and storer objects + * + * @param loader An object responsible for loading data source control measurements + * @param storer An object responsible for storing the result control measurements + */ def enableControlMeasuresTracking(loader: Option[ControlMeasuresLoader], storer: Option[ControlMeasuresStorer]): SparkSession = sparkSession.synchronized { @@ -106,26 +121,26 @@ object AtumImplicits { } /** - * Explicitly disable control measurements tracking. - * After invoking this routine control measuress will not be tracked for the rest of the Spark Job - * - */ + * Explicitly disable control measurements tracking. 
+ * After invoking this routine control measuress will not be tracked for the rest of the Spark Job + * + */ def disableControlMeasuresTracking(): SparkSession = sparkSession.synchronized { Atum.dispose(sparkSession) sparkSession - } + } /** - * Sets control measurements file name for the source and destination data set. - * The file name should not contain path as it will be inferred from data source and destination. - * Use this only if info file paths and not specified when calling enableControlFrameworkTracking() - * - * Example info file name: "_INFO" - * - * @param fileName A file name for control measurements info - */ + * Sets control measurements file name for the source and destination data set. + * The file name should not contain path as it will be inferred from data source and destination. + * Use this only if info file paths and not specified when calling enableControlFrameworkTracking() + * + * Example info file name: "_INFO" + * + * @param fileName A file name for control measurements info + */ def setControlMeasuresFileName(fileName: String): SparkSession = { setControlMeasuresInputFileName(fileName) setControlMeasuresOutputFileName(fileName) @@ -133,59 +148,59 @@ object AtumImplicits { } /** - * Sets control measurements file name for the source data set. - * The file name should not contain path as it will be inferred from data source. - * Use this only if the input info file path and not specified when calling enableControlFrameworkTracking() - * - * Example info file name: "_INFO" - * - * @param fileName A file name for control measurements info - */ + * Sets control measurements file name for the source data set. + * The file name should not contain path as it will be inferred from data source. + * Use this only if the input info file path and not specified when calling enableControlFrameworkTracking() + * + * Example info file name: "_INFO" + * + * @param fileName A file name for control measurements info + */ def setControlMeasuresInputFileName(fileName: String): SparkSession = { Atum.setControlMeasuresInputFileName(fileName) sparkSession } /** - * Sets control measurements file name for the destination data set. - * The file name should not contain path as it will be inferred from data destination. - * Use this only if the output info file path and not specified when calling enableControlFrameworkTracking() - * - * Example info file name: "_INFO" - * - * @param fileName A file name for control measurements info - */ + * Sets control measurements file name for the destination data set. + * The file name should not contain path as it will be inferred from data destination. 
+ * Use this only if the output info file path and not specified when calling enableControlFrameworkTracking() + * + * Example info file name: "_INFO" + * + * @param fileName A file name for control measurements info + */ def setControlMeasuresOutputFileName(fileName: String): SparkSession = { Atum.setControlMeasuresOutputFileName(fileName) sparkSession } /** - * The method sets workflow name for the current job - * - * @param workflowName Name of the checkpoint - */ + * The method sets workflow name for the current job + * + * @param workflowName Name of the checkpoint + */ def setControlMeasuresWorkflow(workflowName: String): SparkSession = { Atum.setWorkflowName(workflowName) sparkSession } /** - * Check if Control Framework is initialized - * - * @return true is Control Framework is initialized - */ + * Check if Control Framework is initialized + * + * @return true is Control Framework is initialized + */ def isControlMeasuresTrackingEnabled: Boolean = { sparkSession.sessionState.conf contains Constants.InitFlagKey } /** - * The method notifies Menas of a job failure - * - * @param jobStep A job step name - * @param errorDescription An error description - * @param techDetails A technical details - */ + * The method notifies Menas of a job failure + * + * @param jobStep A job step name + * @param errorDescription An error description + * @param techDetails A technical details + */ def setControlMeasurementError(jobStep: String, errorDescription: String, techDetails: String): SparkSession = { val errorDescriptionTrunc = if (errorDescription.length > Constants.maxErrorMessageSize) errorDescription.substring(0, Constants.maxErrorMessageSize) @@ -204,16 +219,16 @@ object AtumImplicits { } /** - * The class contains implicit methods for [[org.apache.spark.sql.Dataset]]. - */ + * The class contains implicit methods for [[org.apache.spark.sql.Dataset]]. + */ implicit class DataSetWrapper(dataset: Dataset[Row]) { /** - * The method creates a new checkpoint by calculating control measurements of the dataset - * On first checkpoint Spark Session Key ControlFrameworkKeys.InfoFileVersionKey is updated - * to the info file stored version - * - * @param name Name of the checkpoint - */ + * The method creates a new checkpoint by calculating control measurements of the dataset + * On first checkpoint Spark Session Key ControlFrameworkKeys.InfoFileVersionKey is updated + * to the info file stored version + * + * @param name Name of the checkpoint + */ def setCheckpoint(name: String, persistInDatabase: Boolean = true): Dataset[Row] = { if (!(dataset.sparkSession.sessionState.conf contains Constants.InitFlagKey)) throw new IllegalStateException("Control framework tracking is not initialized.") @@ -227,12 +242,12 @@ object AtumImplicits { } /** - * The method returns the number of records in the dataframe calculated during the last checkpoint. - * If record count is absent in the checkpoint measurements, None is returned. - * - * This is useful to optimize out an additional df.count() invocation in a Spark job with - * enabled control measurements. - */ + * The method returns the number of records in the dataframe calculated during the last checkpoint. + * If record count is absent in the checkpoint measurements, None is returned. + * + * This is useful to optimize out an additional df.count() invocation in a Spark job with + * enabled control measurements. 
+ */ def lastCheckpointRowCount: Option[Long] = { if (!(dataset.sparkSession.sessionState.conf contains Constants.InitFlagKey)) throw new IllegalStateException("Control framework tracking is not initialized.") @@ -246,11 +261,11 @@ object AtumImplicits { } /** - * The method registers a column rename of a column that is used for control measurements - * - * @param oldName A job step name - * @param newName An error description - */ + * The method registers a column rename of a column that is used for control measurements + * + * @param oldName A job step name + * @param newName An error description + */ def registerColumnRename(oldName: String, newName: String): Dataset[Row] = { if (!(dataset.sparkSession.sessionState.conf contains Constants.InitFlagKey)) throw new IllegalStateException("Control framework tracking is not initialized.") @@ -259,10 +274,10 @@ object AtumImplicits { } /** - * The method registers a column drop when it is no longer needed for the column to calculate control measurements - * - * @param columnName A column to be dropped from measurements - */ + * The method registers a column drop when it is no longer needed for the column to calculate control measurements + * + * @param columnName A column to be dropped from measurements + */ def registerColumnDrop(columnName: String): Dataset[Row] = { if (!(dataset.sparkSession.sessionState.conf contains Constants.InitFlagKey)) throw new IllegalStateException("Control framework tracking is not initialized.") @@ -271,20 +286,20 @@ object AtumImplicits { } /** - * The method fetches the initial control measurements and puts version from info file - * to ControlFrameworkKeys.InfoFileVersionKey Spark Session Key - * - */ + * The method fetches the initial control measurements and puts version from info file + * to ControlFrameworkKeys.InfoFileVersionKey Spark Session Key + * + */ def loadControlInfoFile(): Dataset[Row] = { Atum.controlFrameworkState.initializeControlInfo(dataset) dataset } /** - * The method saves the info file to the specified destination path on HDFS - * - * @param outputPath A directory or a file name to save the info file to. - */ + * The method saves the info file to the specified destination path on HDFS + * + * @param outputPath A directory or a file name to save the info file to. 
+ */ def writeInfoFile(outputPath: String): Dataset[Row] = { Atum.controlFrameworkState.storeCurrentInfoFile(outputPath) dataset diff --git a/pom.xml b/pom.xml index b9b25c22..e8e2b588 100644 --- a/pom.xml +++ b/pom.xml @@ -97,16 +97,29 @@ http://github.com/AbsaOSS/atum/tree/master - 3.2.11 + 3.5.3 2.11 2.11.8 1.0 2.2.4 1.7.25 - 2.2.1 + 2.4.5 2.4.16 + 2.13.65 + + + + software.amazon.awssdk + bom + ${aws.java.sdk.version} + pom + import + + + + @@ -133,6 +146,10 @@ json4s-ext_${scala.compat.version} ${json4s.version} + + software.amazon.awssdk + s3 + From d0145caad453f1e576a21eb7b92245c60ad47f69 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Fri, 21 Aug 2020 14:45:37 +0200 Subject: [PATCH 03/18] tests enabled in examples, order for the sample measurements tests instilled --- examples/pom.xml | 2 +- .../scala/za/co/absa/atum/examples/SampleMeasurements1.scala | 2 ++ .../scala/za/co/absa/atum/examples/SampleMeasurements2.scala | 2 ++ .../main/scala/za/co/absa/atum/utils/SparkLocalMaster.scala | 4 ++++ ...ts1Runner.scala => SampleMeasurementsAllRunnerSpec.scala} | 5 ++++- 5 files changed, 13 insertions(+), 2 deletions(-) rename examples/src/test/scala/za/co/absa/atum/examples/{SampleMeasurements1Runner.scala => SampleMeasurementsAllRunnerSpec.scala} (79%) diff --git a/examples/pom.xml b/examples/pom.xml index fc19dbaf..e3ce2290 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -75,7 +75,7 @@ scalatest-maven-plugin ${scalatest.maven.version} - true + false diff --git a/examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements1.scala b/examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements1.scala index 5c511a62..c6d639e9 100644 --- a/examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements1.scala +++ b/examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements1.scala @@ -42,5 +42,7 @@ object SampleMeasurements1 { .setCheckpoint("checkpoint1") .write.mode(SaveMode.Overwrite) .parquet("data/output/stage1_job_results") + + spark.disableControlMeasuresTracking() } } diff --git a/examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements2.scala b/examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements2.scala index 71531eac..5a8ec454 100644 --- a/examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements2.scala +++ b/examples/src/main/scala/za/co/absa/atum/examples/SampleMeasurements2.scala @@ -47,5 +47,7 @@ object SampleMeasurements2 { .setCheckpoint("checkpoint2") .write.mode(SaveMode.Overwrite) .parquet("data/output/stage2_job_results") + + spark.disableControlMeasuresTracking() } } diff --git a/examples/src/main/scala/za/co/absa/atum/utils/SparkLocalMaster.scala b/examples/src/main/scala/za/co/absa/atum/utils/SparkLocalMaster.scala index 381a6374..2fb00c39 100644 --- a/examples/src/main/scala/za/co/absa/atum/utils/SparkLocalMaster.scala +++ b/examples/src/main/scala/za/co/absa/atum/utils/SparkLocalMaster.scala @@ -17,4 +17,8 @@ package za.co.absa.atum.utils trait SparkLocalMaster { System.getProperties.setProperty("spark.master", "local[*]") + + // in order to runSampleMeasuremts as tests, otherwise + // java.lang.IllegalArgumentException: System memory 259522560 must be at least 471859200... 
is thrown + System.getProperties.setProperty("spark.testing.memory", (1024*1024*1024).toString) // 1g } diff --git a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurements1Runner.scala b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala similarity index 79% rename from examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurements1Runner.scala rename to examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala index f30d71ac..5f50fae5 100644 --- a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurements1Runner.scala +++ b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala @@ -18,8 +18,11 @@ package za.co.absa.atum.examples import org.scalatest.FunSuite import za.co.absa.atum.utils._ -class SampleMeasurements1Runner extends FunSuite +class SampleMeasurementsAllRunnerSpec extends FunSuite with SparkJobRunnerMethods with SparkLocalMaster { + + // SampleMeasurement2 depends on SampleMeasurements1's output, so they must be run in this order runSparkJobAsTest[SampleMeasurements1.type] + runSparkJobAsTest[SampleMeasurements2.type] } From ac00d10899163a513ea653712512bf16e68dd133 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Fri, 21 Aug 2020 17:55:22 +0200 Subject: [PATCH 04/18] Loader from s3 crudely works on local host by reading my SAML profile --- .../scala/za/co/absa/atum/AtumImplicits.scala | 1 + .../atum/core/ControlFrameworkState.scala | 5 +- .../core/SparkQueryExecutionListener.scala | 2 +- .../ControlMeasuresHdfsLoaderJsonFile.scala} | 5 +- .../ControlMeasuresHdfsStorerJsonFile.scala} | 5 +- .../s3/ControlMeasuresS3LoaderJsonFile.scala | 53 +++++++++++++++++++ .../s3/ControlMeasuresS3StorerJsonFile.scala | 35 ++++++++++++ .../absa/atum/persistence/s3/S3Location.scala | 16 ++++++ .../atum/examples/SampleS3Measurements1.scala | 52 ++++++++++++++++++ ...a => SampleMeasurementsS3RunnerSpec.scala} | 5 +- 10 files changed, 170 insertions(+), 9 deletions(-) rename atum/src/main/scala/za/co/absa/atum/persistence/{ControlMeasuresLoaderJsonFile.scala => hdfs/ControlMeasuresHdfsLoaderJsonFile.scala} (84%) rename atum/src/main/scala/za/co/absa/atum/persistence/{ControlMeasuresStorerJsonFile.scala => hdfs/ControlMeasuresHdfsStorerJsonFile.scala} (86%) create mode 100644 atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala create mode 100644 atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala create mode 100644 atum/src/main/scala/za/co/absa/atum/persistence/s3/S3Location.scala create mode 100644 examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala rename examples/src/test/scala/za/co/absa/atum/examples/{SampleMeasurements2Runner.scala => SampleMeasurementsS3RunnerSpec.scala} (87%) diff --git a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala index c96bc86c..df5d846b 100644 --- a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala +++ b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala @@ -93,6 +93,7 @@ object AtumImplicits { new ControlMeasuresS3StorerJsonFile(destLoc, kms) } + Atum.log.info(s"enableControlMeasuresTracking(loader = $loader, storer = $storer)") // TODO remove debug enableControlMeasuresTracking(loader, storer) } diff --git a/atum/src/main/scala/za/co/absa/atum/core/ControlFrameworkState.scala b/atum/src/main/scala/za/co/absa/atum/core/ControlFrameworkState.scala index 
9663ca36..560a08a4 100644 --- a/atum/src/main/scala/za/co/absa/atum/core/ControlFrameworkState.scala +++ b/atum/src/main/scala/za/co/absa/atum/core/ControlFrameworkState.scala @@ -23,7 +23,8 @@ import za.co.absa.atum.AtumImplicits.DefaultControlInfoLoader import za.co.absa.atum.core.Atum.log import za.co.absa.atum.core.ControlType.Count import za.co.absa.atum.model.{RunError, RunState, _} -import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresStorer, ControlMeasuresStorerJsonFile} +import za.co.absa.atum.persistence.hdfs.ControlMeasuresHdfsStorerJsonFile +import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresStorer} import za.co.absa.atum.plugins.EventListener import za.co.absa.atum.utils.ExecutionPlanUtils.inferInputInfoFileName @@ -253,7 +254,7 @@ class ControlFrameworkState(sparkSession: SparkSession) { outputHDFSPathFileName } - val storer = new ControlMeasuresStorerJsonFile(hadoopConfiguration, outputFilePath) + val storer = new ControlMeasuresHdfsStorerJsonFile(hadoopConfiguration, outputFilePath) storer.store(accumulator.getControlMeasure) Atum.log.info(s"Control measurements saved to ${outputFilePath.toUri.toString}") } diff --git a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala index f5a8d42a..f77d7711 100644 --- a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala +++ b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala @@ -19,7 +19,7 @@ import java.io.{PrintWriter, StringWriter} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener -import za.co.absa.atum.persistence.ControlMeasuresStorerJsonFile +import za.co.absa.atum.persistence.hdfs.ControlMeasuresHdfsStorerJsonFile import za.co.absa.atum.utils.ExecutionPlanUtils.{inferOutputFileName, inferOutputInfoFileName} /** diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresLoaderJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonFile.scala similarity index 84% rename from atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresLoaderJsonFile.scala rename to atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonFile.scala index ca33aca1..5ba4c9fc 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresLoaderJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonFile.scala @@ -13,18 +13,19 @@ * limitations under the License. */ -package za.co.absa.atum.persistence +package za.co.absa.atum.persistence.hdfs import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import za.co.absa.atum.model.ControlMeasure +import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser} import za.co.absa.atum.utils.ControlUtils import scala.collection.JavaConverters._ /** A loader of control measurements from a JSON file stored in HDFS filesystem. 
*/ -class ControlMeasuresLoaderJsonFile(hadoopConfiguration: Configuration, path: Path) extends ControlMeasuresLoader { +class ControlMeasuresHdfsLoaderJsonFile(hadoopConfiguration: Configuration, path: Path) extends ControlMeasuresLoader { override def load(): ControlMeasure = { val fs = FileSystem.get(hadoopConfiguration) diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorerJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonFile.scala similarity index 86% rename from atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorerJsonFile.scala rename to atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonFile.scala index 5e325f51..291eba47 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorerJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonFile.scala @@ -13,16 +13,17 @@ * limitations under the License. */ -package za.co.absa.atum.persistence +package za.co.absa.atum.persistence.hdfs import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import za.co.absa.atum.model.ControlMeasure +import za.co.absa.atum.persistence.{ControlMeasuresParser, ControlMeasuresStorer} import za.co.absa.atum.utils.ARMImplicits /** A storer of control measurements to HDFS filesystem as a JSON file . */ -class ControlMeasuresStorerJsonFile(hadoopConfiguration: Configuration, path: Path) extends ControlMeasuresStorer { +class ControlMeasuresHdfsStorerJsonFile(hadoopConfiguration: Configuration, path: Path) extends ControlMeasuresStorer { override def store(controlInfo: ControlMeasure): Unit = { val serialized = ControlMeasuresParser asJson controlInfo saveDataToFile(serialized) diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala new file mode 100644 index 00000000..f6419ed7 --- /dev/null +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala @@ -0,0 +1,53 @@ +/* + * Copyright 2018-2019 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.atum.persistence.s3 + +import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider +import software.amazon.awssdk.services.s3.S3Client +import software.amazon.awssdk.services.s3.model.GetObjectRequest +import za.co.absa.atum.model.ControlMeasure +import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser, S3Location} +import za.co.absa.atum.utils.ControlUtils + +/** A loader of control measurements from a JSON file stored in AWS S3. 
*/ +class ControlMeasuresS3LoaderJsonFile(inputLocation: S3Location) extends ControlMeasuresLoader { + override def load(): ControlMeasure = { + + println(s"TODO loading from $inputLocation") + + // to run locally, we need credentials: + val samlCredentials = ProfileCredentialsProvider.create("saml") + println(s"samlCredentials = ${samlCredentials.resolveCredentials().accessKeyId()}, ${samlCredentials.resolveCredentials().secretAccessKey().take(5)}...") + + val s3Client = S3Client.builder() + .region(inputLocation.region) + .credentialsProvider(samlCredentials) // todo only for local? use default credentials instead? + .build() + + + // read + val getRequest = GetObjectRequest + .builder().bucket(inputLocation.bucketName).key(inputLocation.path) + .build() + + val controlInfoJson = s3Client.getObjectAsBytes(getRequest).asUtf8String() + + ControlUtils.preprocessControlMeasure(ControlMeasuresParser fromJson controlInfoJson) + } + override def getInfo: String = { + s"JSON deserializer from ${inputLocation.s3String()}" + } +} diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala new file mode 100644 index 00000000..61ea01aa --- /dev/null +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala @@ -0,0 +1,35 @@ +/* + * Copyright 2018-2019 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.atum.persistence.s3 + +import za.co.absa.atum.model.ControlMeasure +import za.co.absa.atum.persistence.{ControlMeasuresParser, ControlMeasuresStorer, S3KmsSettings, S3Location} + +/** A storer of control measurements to AWS S3 as a JSON file . */ +class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S3KmsSettings) extends ControlMeasuresStorer { + override def store(controlInfo: ControlMeasure): Unit = { + val serialized = ControlMeasuresParser asJson controlInfo + saveDataToFile(serialized) + } + + private def saveDataToFile(data: String): Unit = { + println(s"TODO writing to $data to $outputLocation") + } + + override def getInfo: String = { + s"JSON serializer for Storer to ${outputLocation.s3String()}" + } +} diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/S3Location.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/S3Location.scala new file mode 100644 index 00000000..dde5aa99 --- /dev/null +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/S3Location.scala @@ -0,0 +1,16 @@ +package za.co.absa.atum.persistence + +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.s3.model.ServerSideEncryption + +case class S3Location(bucketName: String, path: String, region: Region = Region.EU_WEST_1) { + /** + * Returns formatted S3 string, e.g. `s3://myBucket/path/to/somewhere` + * @param protocol http "s3" protocol, e.g. s3, s3n, s3a. Default = "s3". 
+ * @return formatted s3 string + */ + def s3String(protocol: String = "s3"): String = s"s3://$bucketName/$path" +} + +case class S3KmsSettings(kmsKeyId: String, serverSideEncryption: ServerSideEncryption = ServerSideEncryption.AWS_KMS) + diff --git a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala new file mode 100644 index 00000000..b5677d7c --- /dev/null +++ b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala @@ -0,0 +1,52 @@ +/* + * Copyright 2018-2019 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.atum.examples + +import org.apache.spark.sql.{SaveMode, SparkSession} +import za.co.absa.atum.AtumImplicits._ +import za.co.absa.atum.persistence.S3Location + +object SampleS3Measurements1 { + def main(args: Array[String]) { + val sparkBuilder = SparkSession.builder().appName("Sample S3 Measurements 1 Job") + val spark = sparkBuilder +// .master("local") + .getOrCreate() + + import spark.implicits._ + + // Initializing library to hook up to Apache Spark + spark.enableControlMeasuresTrackingForS3( + sourceS3Location = Some(S3Location("euw1-ctodatadev-dev-bigdatarnd-s3-poc", "atum/input/wikidata.csv.info")), + destinationS3Config = None + ) + .setControlMeasuresWorkflow("Job 1 S3 ") + + // A business logic of a spark job ... 
+ + spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv("data/input/wikidata.csv") + .as("source") + .filter($"total_response_size" > 1000) + .setCheckpoint("checkpoint1") + .write.mode(SaveMode.Overwrite) + .parquet("data/output_s3/stage1_job_results") + + spark.disableControlMeasuresTracking() + } +} diff --git a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurements2Runner.scala b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala similarity index 87% rename from examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurements2Runner.scala rename to examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala index 9fe0b80a..4189f4b5 100644 --- a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurements2Runner.scala +++ b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala @@ -18,8 +18,9 @@ package za.co.absa.atum.examples import org.scalatest.FunSuite import za.co.absa.atum.utils._ -class SampleMeasurements2Runner extends FunSuite +class SampleMeasurementsS3RunnerSpec extends FunSuite with SparkJobRunnerMethods with SparkLocalMaster { - runSparkJobAsTest[SampleMeasurements2.type] + + runSparkJobAsTest[SampleS3Measurements1.type] } From aa50232512e732551a2d94e8235f689e60db7921 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Mon, 24 Aug 2020 17:45:12 +0200 Subject: [PATCH 05/18] Storer to s3 crudely works on local host by reading my SAML profile and by explictly providing the kmsKeyId --- .../s3/ControlMeasuresS3StorerJsonFile.scala | 21 +++++- .../atum/examples/SampleS3Measurements2.scala | 64 +++++++++++++++++++ .../SampleMeasurementsS3RunnerSpec.scala | 1 + 3 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala index 61ea01aa..2558f534 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala @@ -15,6 +15,10 @@ package za.co.absa.atum.persistence.s3 +import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider +import software.amazon.awssdk.core.sync.RequestBody +import software.amazon.awssdk.services.s3.S3Client +import software.amazon.awssdk.services.s3.model.{PutObjectRequest, PutObjectResponse, ServerSideEncryption} import za.co.absa.atum.model.ControlMeasure import za.co.absa.atum.persistence.{ControlMeasuresParser, ControlMeasuresStorer, S3KmsSettings, S3Location} @@ -26,7 +30,22 @@ class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S } private def saveDataToFile(data: String): Unit = { - println(s"TODO writing to $data to $outputLocation") + // to run locally, we need credentials: + val samlCredentials = ProfileCredentialsProvider.create("saml") + println(s"samlCredentials = ${samlCredentials.resolveCredentials().accessKeyId()}, ${samlCredentials.resolveCredentials().secretAccessKey().take(5)}...") + + val s3Client = S3Client.builder() + .region(outputLocation.region) + .credentialsProvider(samlCredentials) // todo only for local? use default credentials instead? 
+ .build() + + val putRequest = PutObjectRequest.builder.bucket(outputLocation.bucketName).key(outputLocation.path) + .serverSideEncryption(kmsSettings.serverSideEncryption) + .ssekmsKeyId(kmsSettings.kmsKeyId) + .build() + + // may throw S3Exception or SdkClientException (base exception class = SdkException) + s3Client.putObject(putRequest, RequestBody.fromString(data)) // would throw S3Exception or similar } override def getInfo: String = { diff --git a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala new file mode 100644 index 00000000..90a03cc0 --- /dev/null +++ b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala @@ -0,0 +1,64 @@ +/* + * Copyright 2018-2019 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.atum.examples + +import org.apache.spark.sql.{SaveMode, SparkSession} +import software.amazon.awssdk.services.s3.S3Configuration +import za.co.absa.atum.AtumImplicits._ +import za.co.absa.atum.persistence.{S3KmsSettings, S3Location} + +object SampleS3Measurements2 { + def main(args: Array[String]) { + + // This example is intended to run AFTER SampleMeasurements1, otherwise it will fail on input file absence + + val sparkBuilder = SparkSession.builder().appName("Sample Measurements 2 Job") + //val spark = sparkBuilder.master("local").getOrCreate() + val spark = sparkBuilder.getOrCreate() + import spark.implicits._ + + val kmsKeyId = "todo put keyId here" // TODO must be removed/resuplied + val s3KmsSettings = + + // Initializing library to hook up to Apache Spark + // No need to specify datasetName and datasetVersion as it is stage 2 and it will be determined automatically + spark.enableControlMeasuresTrackingForS3( + sourceS3Location = None, + destinationS3Config = Some( + S3Location("euw1-ctodatadev-dev-bigdatarnd-s3-poc", "atum/output/wikidata.csv.info"), + S3KmsSettings(kmsKeyId) + ) + ) + .setControlMeasuresWorkflow("Job 2") + + val sourceDS = spark.read + .parquet("data/output_s3/stage1_job_results") + + // A business logic of a spark job ... 
+ + // An example - a column rename + // If the renamed column is one of control measurement columns, the rename need to be registered in Control Framework + sourceDS.as("target") + .withColumnRenamed("total_response_size", "trs") // Renaming the column + .registerColumnRename("total_response_size","trs") // Registering the rename, from now on the new name for the column is 'trs' + .filter($"trs" > 1000) + .setCheckpoint("checkpoint2") + .write.mode(SaveMode.Overwrite) + .parquet("data/output_s3/stage2_job_results") + + spark.disableControlMeasuresTracking() + } +} diff --git a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala index 4189f4b5..5cf399ee 100644 --- a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala +++ b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala @@ -23,4 +23,5 @@ class SampleMeasurementsS3RunnerSpec extends FunSuite with SparkLocalMaster { runSparkJobAsTest[SampleS3Measurements1.type] + runSparkJobAsTest[SampleS3Measurements2.type] } From 6ecf7f2ff49fe4b4ef096a2c4987b41840eb2ae4 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Tue, 25 Aug 2020 15:07:08 +0200 Subject: [PATCH 06/18] Unit tests for ControlMeasuresHdfsLoaderJsonFile | ControlMeasuresHdfsStorerJsonSpec --- .../persistence/ControlMeasuresParser.scala | 7 ++++ .../ControlMeasuresHdfsLoaderJsonFile.scala | 10 ++---- .../za/co/absa/atum/utils/FileUtils.scala | 12 +++++++ .../za/co/absa/atum/utils/HdfsFileUtils.scala | 18 ++++++++++ atum/src/test/resources/example_input.info | 29 +++++++++++++++ .../absa/atum/persistence/TestResources.scala | 21 +++++++++++ .../ControlMeasuresHdfsLoaderJsonSpec.scala | 19 ++++++++++ .../ControlMeasuresHdfsStorerJsonSpec.scala | 36 +++++++++++++++++++ 8 files changed, 145 insertions(+), 7 deletions(-) create mode 100644 atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala create mode 100644 atum/src/main/scala/za/co/absa/atum/utils/HdfsFileUtils.scala create mode 100644 atum/src/test/resources/example_input.info create mode 100644 atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala create mode 100644 atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala create mode 100644 atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresParser.scala b/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresParser.scala index 981c4f43..22c45055 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresParser.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresParser.scala @@ -29,6 +29,13 @@ object ControlMeasuresParser { ControlUtils.asJson[ControlMeasure](controlMeasure) } + /** + * The method returns a prettified JSON representation of a [[za.co.absa.atum.model.ControlMeasure]] object + */ + def asJsonPretty(controlMeasure: ControlMeasure): String = { + ControlUtils.asJsonPretty[ControlMeasure](controlMeasure) + } + /** * The method returns a [[za.co.absa.atum.model.ControlMeasure]] object parsed from JSON string. 
*/ diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonFile.scala index 5ba4c9fc..266b4ab1 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonFile.scala @@ -15,22 +15,18 @@ package za.co.absa.atum.persistence.hdfs -import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import za.co.absa.atum.model.ControlMeasure import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser} -import za.co.absa.atum.utils.ControlUtils - -import scala.collection.JavaConverters._ +import za.co.absa.atum.utils.{ControlUtils, HdfsFileUtils} /** A loader of control measurements from a JSON file stored in HDFS filesystem. */ class ControlMeasuresHdfsLoaderJsonFile(hadoopConfiguration: Configuration, path: Path) extends ControlMeasuresLoader { override def load(): ControlMeasure = { - val fs = FileSystem.get(hadoopConfiguration) - val stream = fs.open(path) - val controlInfoJson = try IOUtils.readLines(stream).asScala.mkString("\n") finally stream.close() + implicit val fs = FileSystem.get(hadoopConfiguration) + val controlInfoJson = HdfsFileUtils.readHdfsFileToString(path) ControlUtils.preprocessControlMeasure(ControlMeasuresParser fromJson controlInfoJson) } diff --git a/atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala b/atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala new file mode 100644 index 00000000..80d7ea5d --- /dev/null +++ b/atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala @@ -0,0 +1,12 @@ +package za.co.absa.atum.utils + +object FileUtils { + def readFileToString(path: String): String = { + val testTxtSource = scala.io.Source.fromFile(path) + val str = testTxtSource.mkString + testTxtSource.close() + + str + } + +} diff --git a/atum/src/main/scala/za/co/absa/atum/utils/HdfsFileUtils.scala b/atum/src/main/scala/za/co/absa/atum/utils/HdfsFileUtils.scala new file mode 100644 index 00000000..8820e80a --- /dev/null +++ b/atum/src/main/scala/za/co/absa/atum/utils/HdfsFileUtils.scala @@ -0,0 +1,18 @@ +package za.co.absa.atum.utils + +import org.apache.commons.io.IOUtils +import org.apache.hadoop.fs.{FileSystem, Path} + +import scala.collection.JavaConverters._ + +object HdfsFileUtils { + + def readHdfsFileToString(path: Path)(implicit fs: FileSystem): String = { + val stream = fs.open(path) + try + IOUtils.readLines(stream).asScala.mkString("\n") + finally + stream.close() + } + +} diff --git a/atum/src/test/resources/example_input.info b/atum/src/test/resources/example_input.info new file mode 100644 index 00000000..dba19355 --- /dev/null +++ b/atum/src/test/resources/example_input.info @@ -0,0 +1,29 @@ +{ + "metadata": { + "sourceApplication": "AtumTest", + "country": "CZ", + "historyType": "Snapshot", + "dataFilename": "example_input.csv", + "sourceType": "public", + "version": 1, + "informationDate": "01-01-2020", + "additionalInfo": { } + }, + "checkpoints": [ + { + "name": "checkpointA", + "processStartTime": "01-01-2020 08:00:00", + "processEndTime": "01-01-2020 08:00:10", + "workflowName": "wf1", + "order": 1, + "controls": [ + { + "controlName": "control1", + "controlType": "someControlType", + "controlCol": "column1", + "controlValue": "1234" + } + ] + } + ] +} diff --git 
a/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala b/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala new file mode 100644 index 00000000..434d6241 --- /dev/null +++ b/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala @@ -0,0 +1,21 @@ +package za.co.absa.atum.persistence + +import za.co.absa.atum.core.ControlType +import za.co.absa.atum.model.{Checkpoint, ControlMeasure, ControlMeasureMetadata, Measurement} + +object TestResources { + + object InputInfo { + val localPath: String = getClass.getResource("/example_input.info").getPath + + // conforms to the content of the Resource file `example_input.info` + val controlMeasure = ControlMeasure( + ControlMeasureMetadata("AtumTest", "CZ", "Snapshot", "example_input.csv", "public", 1, "01-01-2020", Map.empty), + runUniqueId = None, + List(Checkpoint("checkpointA", None, None, "01-01-2020 08:00:00", "01-01-2020 08:00:10", "wf1", 1, List( + Measurement("control1", "someControlType", "column1", "1234") + ))) + ) + } + +} diff --git a/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala new file mode 100644 index 00000000..c706c2a6 --- /dev/null +++ b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala @@ -0,0 +1,19 @@ +package za.co.absa.atum.persistence.hdfs + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.scalatest.{FlatSpec, Matchers} +import za.co.absa.atum.persistence.TestResources + +class ControlMeasuresHdfsLoaderJsonSpec extends FlatSpec with Matchers { + + val inputPath: String = TestResources.InputInfo.localPath + val expectedInputControlMeasure = TestResources.InputInfo.controlMeasure + + "ControlMeasuresHdfsLoaderJsonFile" should "load json file from HDFS" in { + val loadedControlMeasure = new ControlMeasuresHdfsLoaderJsonFile(new Configuration(), new Path(inputPath)).load() + + loadedControlMeasure shouldBe expectedInputControlMeasure + } + +} diff --git a/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala new file mode 100644 index 00000000..3da5de13 --- /dev/null +++ b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala @@ -0,0 +1,36 @@ +package za.co.absa.atum.persistence.hdfs + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.scalatest.{FlatSpec, Matchers} +import za.co.absa.atum.persistence.TestResources +import za.co.absa.atum.utils.{FileUtils, HdfsFileUtils} + +class ControlMeasuresHdfsStorerJsonSpec extends FlatSpec with Matchers { + + val expectedFilePath: String = TestResources.InputInfo.localPath + val inputControlMeasure = TestResources.InputInfo.controlMeasure + + val hadoopConfiguration = new Configuration() + implicit val fs = FileSystem.get(hadoopConfiguration) + + "ControlMeasuresHdfsStorerJsonFile" should "store json file to HDFS" in { + + val outputPath = new Path("/tmp/json-hdfs-storing-test") + fs.delete(outputPath, false) + + new ControlMeasuresHdfsStorerJsonFile(new Configuration(), outputPath).store(inputControlMeasure) + + val actualContent = HdfsFileUtils.readHdfsFileToString(outputPath) + val expectedContent = FileUtils.readFileToString(expectedFilePath) + + // some output may be prettified while other may 
not, we do not take this into account. + filterWhitespaces(actualContent) shouldBe filterWhitespaces(expectedContent) + + fs.delete(outputPath, false) + } + + private def filterWhitespaces(content: String): String = { + content.filterNot(_.isWhitespace) + } +} From 11b751bd6ff32570db9d8d0467f555f4c43000c4 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Thu, 27 Aug 2020 17:59:43 +0200 Subject: [PATCH 07/18] Unit tests for ControlMeasuresS3LoaderJsonSpec - org.mockito.plugins.MockMaker with "mock-maker-inline" allows mocking final classes. (hint: https://www.baeldung.com/mockito-final) --- atum/pom.xml | 8 +++ .../s3/ControlMeasuresS3LoaderJsonFile.scala | 23 +++------ .../s3/ControlMeasuresS3StorerJsonFile.scala | 15 +++--- .../za/co/absa/atum/utils/S3ClientUtils.scala | 51 +++++++++++++++++++ .../org.mockito.plugins.MockMaker | 1 + .../s3/ControlMeasuresS3LoaderJsonSpec.scala | 37 ++++++++++++++ pom.xml | 1 + 7 files changed, 113 insertions(+), 23 deletions(-) create mode 100644 atum/src/main/scala/za/co/absa/atum/utils/S3ClientUtils.scala create mode 100644 atum/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker create mode 100644 atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala diff --git a/atum/pom.xml b/atum/pom.xml index 0286bd49..338479ab 100644 --- a/atum/pom.xml +++ b/atum/pom.xml @@ -35,6 +35,14 @@ ${json4s.version} provided + + + org.mockito + mockito-core + ${mockito.version} + test + + diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala index f6419ed7..18756034 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala @@ -20,34 +20,27 @@ import software.amazon.awssdk.services.s3.S3Client import software.amazon.awssdk.services.s3.model.GetObjectRequest import za.co.absa.atum.model.ControlMeasure import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser, S3Location} -import za.co.absa.atum.utils.ControlUtils +import za.co.absa.atum.utils.{ControlUtils, S3ClientUtils} /** A loader of control measurements from a JSON file stored in AWS S3. */ class ControlMeasuresS3LoaderJsonFile(inputLocation: S3Location) extends ControlMeasuresLoader { override def load(): ControlMeasure = { + val s3Client: S3Client = getS3Client - println(s"TODO loading from $inputLocation") - - // to run locally, we need credentials: - val samlCredentials = ProfileCredentialsProvider.create("saml") - println(s"samlCredentials = ${samlCredentials.resolveCredentials().accessKeyId()}, ${samlCredentials.resolveCredentials().secretAccessKey().take(5)}...") - - val s3Client = S3Client.builder() - .region(inputLocation.region) - .credentialsProvider(samlCredentials) // todo only for local? use default credentials instead? 
- .build() - - - // read val getRequest = GetObjectRequest .builder().bucket(inputLocation.bucketName).key(inputLocation.path) .build() val controlInfoJson = s3Client.getObjectAsBytes(getRequest).asUtf8String() - ControlUtils.preprocessControlMeasure(ControlMeasuresParser fromJson controlInfoJson) } + override def getInfo: String = { s"JSON deserializer from ${inputLocation.s3String()}" } + + private[s3] def getS3Client: S3Client = S3ClientUtils.getS3Client(inputLocation.region) + + // S3ClientUtils.getS3ClientWithLocalProfile(inputLocation.region, "saml") // TODO remove + } diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala index 2558f534..f134cb72 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala @@ -21,6 +21,7 @@ import software.amazon.awssdk.services.s3.S3Client import software.amazon.awssdk.services.s3.model.{PutObjectRequest, PutObjectResponse, ServerSideEncryption} import za.co.absa.atum.model.ControlMeasure import za.co.absa.atum.persistence.{ControlMeasuresParser, ControlMeasuresStorer, S3KmsSettings, S3Location} +import za.co.absa.atum.utils.S3ClientUtils /** A storer of control measurements to AWS S3 as a JSON file . */ class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S3KmsSettings) extends ControlMeasuresStorer { @@ -30,14 +31,7 @@ class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S } private def saveDataToFile(data: String): Unit = { - // to run locally, we need credentials: - val samlCredentials = ProfileCredentialsProvider.create("saml") - println(s"samlCredentials = ${samlCredentials.resolveCredentials().accessKeyId()}, ${samlCredentials.resolveCredentials().secretAccessKey().take(5)}...") - - val s3Client = S3Client.builder() - .region(outputLocation.region) - .credentialsProvider(samlCredentials) // todo only for local? use default credentials instead? 
- .build() + val s3Client = getS3Client val putRequest = PutObjectRequest.builder.bucket(outputLocation.bucketName).key(outputLocation.path) .serverSideEncryption(kmsSettings.serverSideEncryption) @@ -51,4 +45,9 @@ class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S override def getInfo: String = { s"JSON serializer for Storer to ${outputLocation.s3String()}" } + + private[s3] def getS3Client: S3Client = S3ClientUtils.getS3Client(outputLocation.region) + + // S3ClientUtils.getS3ClientWithLocalProfile(inputLocation.region, "saml") // TODO remove + } diff --git a/atum/src/main/scala/za/co/absa/atum/utils/S3ClientUtils.scala b/atum/src/main/scala/za/co/absa/atum/utils/S3ClientUtils.scala new file mode 100644 index 00000000..97f84614 --- /dev/null +++ b/atum/src/main/scala/za/co/absa/atum/utils/S3ClientUtils.scala @@ -0,0 +1,51 @@ +package za.co.absa.atum.utils + +import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.s3.{S3Client, S3ClientBuilder} +import za.co.absa.atum.core.Atum.log + +object S3ClientUtils { + + def getS3ClientWithLocalProfile(region: Region, profileName: String): S3Client = { + val localProfileCredentials = ProfileCredentialsProvider.create(profileName) + log.debug(s"Credentials of local $profileName profile =" + + s" ${localProfileCredentials.resolveCredentials().accessKeyId()}, ${localProfileCredentials.resolveCredentials().secretAccessKey().take(5)}...") + + getS3Client(region, Some(localProfileCredentials)) + } + + def getS3Client(region: Region, credentialsProvider: Option[ProfileCredentialsProvider] = None): S3Client = { + S3Client.builder() + .region(region) + .applyCredentialsProviderIfDefined(credentialsProvider) + .build() + } + + implicit class S3ClientBuilderExt(s3ClientBuilder: S3ClientBuilder) { + /** + * Universal conditional S3ClientBuilder=>S3ClientBuilder apply method + * + * @param condition `fn` will be applied when true, not applied when false + * @param fn full + * @return original object + */ + def applyIf(condition: Boolean, fn: S3ClientBuilder => S3ClientBuilder): S3ClientBuilder = { + if (condition) { + fn(s3ClientBuilder) + } else + s3ClientBuilder + } + + /** + * Apply `optionalCredentialsProvider` if defined + * + * @param optionalCredentialsProvider + * @return original object + */ + def applyCredentialsProviderIfDefined(optionalCredentialsProvider: Option[ProfileCredentialsProvider]): S3ClientBuilder = { + optionalCredentialsProvider.fold(s3ClientBuilder)(s3ClientBuilder.credentialsProvider) + } + } + +} diff --git a/atum/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker b/atum/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker new file mode 100644 index 00000000..1f0955d4 --- /dev/null +++ b/atum/src/test/resources/mockito-extensions/org.mockito.plugins.MockMaker @@ -0,0 +1 @@ +mock-maker-inline diff --git a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala new file mode 100644 index 00000000..851dc796 --- /dev/null +++ b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala @@ -0,0 +1,37 @@ +package za.co.absa.atum.s3 + +import org.mockito.{ArgumentMatcher, ArgumentMatchers, Mockito} +import org.scalatest.mock.MockitoSugar +import org.scalatest.{FlatSpec, Matchers} +import software.amazon.awssdk.core.ResponseBytes +import 
software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.s3.S3Client +import software.amazon.awssdk.services.s3.model.{GetObjectRequest, GetObjectResponse} +import za.co.absa.atum.persistence.s3.ControlMeasuresS3LoaderJsonFile +import za.co.absa.atum.persistence.{S3Location, TestResources} +import za.co.absa.atum.utils.FileUtils + +class ControlMeasuresS3LoaderJsonSpec extends FlatSpec with Matchers with MockitoSugar { + + val inputPath: String = TestResources.InputInfo.localPath + val expectedInputControlMeasure = TestResources.InputInfo.controlMeasure + + "ControlMeasuresS3LoaderJsonFile" should "load json file from (mocked) S3" in { + val inputLocation = S3Location(bucketName = "bucket1", "path/to/json.info", region = Region.EU_WEST_2) + val mockedS3Client = mock[S3Client] + val mockedRequest: ResponseBytes[GetObjectResponse] = mock[ResponseBytes[GetObjectResponse]] + + val loader = new ControlMeasuresS3LoaderJsonFile(inputLocation){ + override def getS3Client: S3Client = mockedS3Client + } + + val mockedS3Data = FileUtils.readFileToString(inputPath) + + Mockito.when(mockedS3Client.getObjectAsBytes(ArgumentMatchers.any[GetObjectRequest]())).thenReturn(mockedRequest) + Mockito.when(mockedRequest.asUtf8String()).thenReturn(mockedS3Data) + + val loadedControlMeasure = loader.load() + loadedControlMeasure shouldBe expectedInputControlMeasure + } + +} diff --git a/pom.xml b/pom.xml index e8e2b588..3d41f645 100644 --- a/pom.xml +++ b/pom.xml @@ -106,6 +106,7 @@ 2.4.5 2.4.16 2.13.65 + 3.5.7 From 89def5c613ea1fb2e74e54c88f1303b732321c0e Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Fri, 28 Aug 2020 11:51:29 +0200 Subject: [PATCH 08/18] ScalaTest version update to 3.2.2, Mockito -> ScalaMockito --- atum/pom.xml | 11 +++++++++-- .../atum/BigDecimalToJsonSerializationSpec.scala | 5 +++-- .../co/absa/atum/CachingStorageLevelSpec.scala | 6 ++++-- .../ControlInfoToJsonSerializationSpec.scala | 6 +++--- .../co/absa/atum/ControlMeasurementsSpec.scala | 7 ++++--- .../scala/za/co/absa/atum/ControlUtilsSpec.scala | 8 ++++---- .../hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala | 5 +++-- .../hdfs/ControlMeasuresHdfsStorerJsonSpec.scala | 5 +++-- .../s3/ControlMeasuresS3LoaderJsonSpec.scala | 16 +++++++++++++--- .../absa/atum/utils/SparkJobRunnerMethods.scala | 4 ++-- .../SampleMeasurementsAllRunnerSpec.scala | 4 ++-- .../SampleMeasurementsS3RunnerSpec.scala | 9 +++++---- pom.xml | 4 ++-- 13 files changed, 57 insertions(+), 33 deletions(-) diff --git a/atum/pom.xml b/atum/pom.xml index 338479ab..c84f0646 100644 --- a/atum/pom.xml +++ b/atum/pom.xml @@ -38,8 +38,15 @@ org.mockito - mockito-core - ${mockito.version} + mockito-scala_${scala.compat.version} + ${mockito.scala.version} + test + + + + org.mockito + mockito-scala-scalatest_${scala.compat.version} + ${mockito.scala.version} test diff --git a/atum/src/test/scala/za/co/absa/atum/BigDecimalToJsonSerializationSpec.scala b/atum/src/test/scala/za/co/absa/atum/BigDecimalToJsonSerializationSpec.scala index cab11840..19153e76 100644 --- a/atum/src/test/scala/za/co/absa/atum/BigDecimalToJsonSerializationSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/BigDecimalToJsonSerializationSpec.scala @@ -15,13 +15,14 @@ package za.co.absa.atum -import org.scalatest.{FlatSpec, Matchers} import org.json4s._ import org.json4s.jackson.Serialization import org.json4s.jackson.Serialization.write +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers import za.co.absa.atum.model.Measurement -class 
BigDecimalToJsonSerializationSpec extends FlatSpec with Matchers { +class BigDecimalToJsonSerializationSpec extends AnyFlatSpec with Matchers { implicit val formats: Formats = Serialization.formats(NoTypeHints).withBigDecimal "write" should "serialize a scala.math.BigDecimal" in diff --git a/atum/src/test/scala/za/co/absa/atum/CachingStorageLevelSpec.scala b/atum/src/test/scala/za/co/absa/atum/CachingStorageLevelSpec.scala index 7080b9c7..14805769 100644 --- a/atum/src/test/scala/za/co/absa/atum/CachingStorageLevelSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/CachingStorageLevelSpec.scala @@ -16,11 +16,13 @@ package za.co.absa.atum import org.apache.spark.storage.StorageLevel -import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} +import org.scalatest.BeforeAndAfter +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers import za.co.absa.atum.core.Atum import za.co.absa.atum.utils.SparkTestBase -class CachingStorageLevelSpec extends FlatSpec with Matchers with SparkTestBase with BeforeAndAfter { +class CachingStorageLevelSpec extends AnyFlatSpec with Matchers with SparkTestBase with BeforeAndAfter { before { Atum.init(spark) diff --git a/atum/src/test/scala/za/co/absa/atum/ControlInfoToJsonSerializationSpec.scala b/atum/src/test/scala/za/co/absa/atum/ControlInfoToJsonSerializationSpec.scala index 76f66e8a..8994897d 100644 --- a/atum/src/test/scala/za/co/absa/atum/ControlInfoToJsonSerializationSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/ControlInfoToJsonSerializationSpec.scala @@ -15,15 +15,15 @@ package za.co.absa.atum -import org.scalatest.{FlatSpec, Matchers} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers import za.co.absa.atum.model.{Checkpoint, ControlMeasure, ControlMeasureMetadata, Measurement} import za.co.absa.atum.utils.{BuildProperties, ControlUtils} -import za.co.absa.atum.model._ /** * Unit tests for ControlInfo object serialization */ -class ControlInfoToJsonSerializationSpec extends FlatSpec with Matchers { +class ControlInfoToJsonSerializationSpec extends AnyFlatSpec with Matchers { val exampleCtrlInfo = ControlMeasure( metadata = ControlMeasureMetadata( sourceApplication = "FrontArena", diff --git a/atum/src/test/scala/za/co/absa/atum/ControlMeasurementsSpec.scala b/atum/src/test/scala/za/co/absa/atum/ControlMeasurementsSpec.scala index 5fcb9f67..5b27671c 100644 --- a/atum/src/test/scala/za/co/absa/atum/ControlMeasurementsSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/ControlMeasurementsSpec.scala @@ -16,13 +16,14 @@ package za.co.absa.atum import org.apache.spark.sql.types._ -import org.scalatest.{FlatSpec, Matchers} -import za.co.absa.atum.core.{Constants, ControlType, MeasurementProcessor} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import za.co.absa.atum.core.{ControlType, MeasurementProcessor} import za.co.absa.atum.model.Measurement import za.co.absa.atum.utils.SparkTestBase //noinspection ZeroIndexToHead -class ControlMeasurementsSpec extends FlatSpec with Matchers with SparkTestBase { +class ControlMeasurementsSpec extends AnyFlatSpec with Matchers with SparkTestBase { import spark.implicits._ diff --git a/atum/src/test/scala/za/co/absa/atum/ControlUtilsSpec.scala b/atum/src/test/scala/za/co/absa/atum/ControlUtilsSpec.scala index d1085b3a..0c77aa51 100644 --- a/atum/src/test/scala/za/co/absa/atum/ControlUtilsSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/ControlUtilsSpec.scala @@ -16,12 +16,12 @@ package 
za.co.absa.atum import org.apache.spark.sql.types._ -import org.scalatest._ -import za.co.absa.atum.utils.ControlUtils -import za.co.absa.atum.utils.SparkTestBase +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import za.co.absa.atum.utils.{ControlUtils, SparkTestBase} -class ControlUtilsSpec extends FlatSpec with Matchers with SparkTestBase { +class ControlUtilsSpec extends AnyFlatSpec with Matchers with SparkTestBase { import spark.implicits._ private val singleStringColumnDF = spark.sparkContext.parallelize(List("987987", "example", "example", "another example")).toDF diff --git a/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala index c706c2a6..efe4b329 100644 --- a/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsLoaderJsonSpec.scala @@ -2,10 +2,11 @@ package za.co.absa.atum.persistence.hdfs import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.scalatest.{FlatSpec, Matchers} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers import za.co.absa.atum.persistence.TestResources -class ControlMeasuresHdfsLoaderJsonSpec extends FlatSpec with Matchers { +class ControlMeasuresHdfsLoaderJsonSpec extends AnyFlatSpec with Matchers { val inputPath: String = TestResources.InputInfo.localPath val expectedInputControlMeasure = TestResources.InputInfo.controlMeasure diff --git a/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala index 3da5de13..a83c8604 100644 --- a/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala @@ -2,11 +2,12 @@ package za.co.absa.atum.persistence.hdfs import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} -import org.scalatest.{FlatSpec, Matchers} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers import za.co.absa.atum.persistence.TestResources import za.co.absa.atum.utils.{FileUtils, HdfsFileUtils} -class ControlMeasuresHdfsStorerJsonSpec extends FlatSpec with Matchers { +class ControlMeasuresHdfsStorerJsonSpec extends AnyFlatSpec with Matchers { val expectedFilePath: String = TestResources.InputInfo.localPath val inputControlMeasure = TestResources.InputInfo.controlMeasure diff --git a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala index 851dc796..7627004b 100644 --- a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala @@ -1,8 +1,9 @@ package za.co.absa.atum.s3 +import org.mockito.scalatest.IdiomaticMockito import org.mockito.{ArgumentMatcher, ArgumentMatchers, Mockito} -import org.scalatest.mock.MockitoSugar -import org.scalatest.{FlatSpec, Matchers} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers import software.amazon.awssdk.core.ResponseBytes import software.amazon.awssdk.regions.Region import 
software.amazon.awssdk.services.s3.S3Client @@ -11,12 +12,13 @@ import za.co.absa.atum.persistence.s3.ControlMeasuresS3LoaderJsonFile import za.co.absa.atum.persistence.{S3Location, TestResources} import za.co.absa.atum.utils.FileUtils -class ControlMeasuresS3LoaderJsonSpec extends FlatSpec with Matchers with MockitoSugar { +class ControlMeasuresS3LoaderJsonSpec extends AnyFlatSpec with Matchers with IdiomaticMockito { val inputPath: String = TestResources.InputInfo.localPath val expectedInputControlMeasure = TestResources.InputInfo.controlMeasure "ControlMeasuresS3LoaderJsonFile" should "load json file from (mocked) S3" in { + val inputLocation = S3Location(bucketName = "bucket1", "path/to/json.info", region = Region.EU_WEST_2) val mockedS3Client = mock[S3Client] val mockedRequest: ResponseBytes[GetObjectResponse] = mock[ResponseBytes[GetObjectResponse]] @@ -34,4 +36,12 @@ class ControlMeasuresS3LoaderJsonSpec extends FlatSpec with Matchers with Mockit loadedControlMeasure shouldBe expectedInputControlMeasure } + def argMatch[T](func: T => Boolean): T = { + ArgumentMatchers.argThat(new ArgumentMatcher[T] { + override def matches(param: T): Boolean = { + func(param) + } + }) + } + } diff --git a/examples/src/main/scala/za/co/absa/atum/utils/SparkJobRunnerMethods.scala b/examples/src/main/scala/za/co/absa/atum/utils/SparkJobRunnerMethods.scala index 2b2e2a53..37a3eab0 100644 --- a/examples/src/main/scala/za/co/absa/atum/utils/SparkJobRunnerMethods.scala +++ b/examples/src/main/scala/za/co/absa/atum/utils/SparkJobRunnerMethods.scala @@ -15,13 +15,13 @@ package za.co.absa.atum.utils -import org.scalatest.FunSuiteLike +import org.scalatest.funsuite.AnyFunSuiteLike import scala.reflect.ClassTag import scala.reflect.runtime.universe trait SparkJobRunnerMethods { - this: FunSuiteLike => + this: AnyFunSuiteLike => private def runSparkJob[T](implicit ct: ClassTag[T]): Unit = { type MainClass = {def main(args: Array[String]): Unit} diff --git a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala index 5f50fae5..1ebbeded 100644 --- a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala +++ b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala @@ -15,10 +15,10 @@ package za.co.absa.atum.examples -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite import za.co.absa.atum.utils._ -class SampleMeasurementsAllRunnerSpec extends FunSuite +class SampleMeasurementsAllRunnerSpec extends AnyFunSuite with SparkJobRunnerMethods with SparkLocalMaster { diff --git a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala index 5cf399ee..212b14d9 100644 --- a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala +++ b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala @@ -15,13 +15,14 @@ package za.co.absa.atum.examples -import org.scalatest.FunSuite +import org.scalatest.funsuite.AnyFunSuite import za.co.absa.atum.utils._ -class SampleMeasurementsS3RunnerSpec extends FunSuite +class SampleMeasurementsS3RunnerSpec extends AnyFunSuite with SparkJobRunnerMethods with SparkLocalMaster { - runSparkJobAsTest[SampleS3Measurements1.type] - runSparkJobAsTest[SampleS3Measurements2.type] + // todo reenable when s3 mock is ready +// 
runSparkJobAsTest[SampleS3Measurements1.type] +// runSparkJobAsTest[SampleS3Measurements2.type] } diff --git a/pom.xml b/pom.xml index 3d41f645..17b6fa87 100644 --- a/pom.xml +++ b/pom.xml @@ -101,12 +101,12 @@ 2.11 2.11.8 1.0 - 2.2.4 + 3.2.2 1.7.25 2.4.5 2.4.16 2.13.65 - 3.5.7 + 1.15.0 From 003bb245ead64a71cc29e21aba3b7249effc3646 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Fri, 28 Aug 2020 17:19:31 +0200 Subject: [PATCH 09/18] ControlMeasuresS3LoaderJsonSpec enhanced - now checking the GetObjectRequest that is used to query object on S3. --- .../s3/ControlMeasuresS3LoaderJsonSpec.scala | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala index 7627004b..e95dccfe 100644 --- a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala @@ -1,7 +1,8 @@ package za.co.absa.atum.s3 +import org.mockito.captor.{ArgCaptor, Captor} import org.mockito.scalatest.IdiomaticMockito -import org.mockito.{ArgumentMatcher, ArgumentMatchers, Mockito} +import org.mockito.{ArgumentMatchers, ArgumentMatchersSugar, Mockito} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers import software.amazon.awssdk.core.ResponseBytes @@ -14,7 +15,6 @@ import za.co.absa.atum.utils.FileUtils class ControlMeasuresS3LoaderJsonSpec extends AnyFlatSpec with Matchers with IdiomaticMockito { - val inputPath: String = TestResources.InputInfo.localPath val expectedInputControlMeasure = TestResources.InputInfo.controlMeasure "ControlMeasuresS3LoaderJsonFile" should "load json file from (mocked) S3" in { @@ -23,25 +23,28 @@ class ControlMeasuresS3LoaderJsonSpec extends AnyFlatSpec with Matchers with Idi val mockedS3Client = mock[S3Client] val mockedRequest: ResponseBytes[GetObjectResponse] = mock[ResponseBytes[GetObjectResponse]] - val loader = new ControlMeasuresS3LoaderJsonFile(inputLocation){ + val loader = new ControlMeasuresS3LoaderJsonFile(inputLocation) { override def getS3Client: S3Client = mockedS3Client } - val mockedS3Data = FileUtils.readFileToString(inputPath) + val inputFilePath: String = TestResources.InputInfo.localPath + val mockedS3Data = FileUtils.readFileToString(inputFilePath) + // mock S3 response Mockito.when(mockedS3Client.getObjectAsBytes(ArgumentMatchers.any[GetObjectRequest]())).thenReturn(mockedRequest) Mockito.when(mockedRequest.asUtf8String()).thenReturn(mockedS3Data) - val loadedControlMeasure = loader.load() - loadedControlMeasure shouldBe expectedInputControlMeasure - } - def argMatch[T](func: T => Boolean): T = { - ArgumentMatchers.argThat(new ArgumentMatcher[T] { - override def matches(param: T): Boolean = { - func(param) - } - }) + // verify request content + val getRequestCaptor: Captor[GetObjectRequest] = ArgCaptor[GetObjectRequest] + Mockito.verify(mockedS3Client).getObjectAsBytes(getRequestCaptor.capture) + val capturedGetRequest = getRequestCaptor.value + + capturedGetRequest.bucket shouldBe "bucket1" + capturedGetRequest.key shouldBe "path/to/json.info" + + // verify returned value + loadedControlMeasure shouldBe expectedInputControlMeasure } } From 3e2125d82a38200c38280c1bd26874cc2c9c76b3 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Mon, 31 Aug 2020 10:04:43 +0200 Subject: [PATCH 10/18] ControlMeasuresS3StorerJsonSpec added --- 
.../absa/atum/persistence/TestResources.scala | 4 ++ .../ControlMeasuresHdfsStorerJsonSpec.scala | 5 +- .../s3/ControlMeasuresS3LoaderJsonSpec.scala | 5 +- .../s3/ControlMeasuresS3StorerJsonSpec.scala | 58 +++++++++++++++++++ examples/pom.xml | 2 +- 5 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3StorerJsonSpec.scala diff --git a/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala b/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala index 434d6241..44feda9d 100644 --- a/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala +++ b/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala @@ -18,4 +18,8 @@ object TestResources { ) } + def filterWhitespaces(content: String): String = { + content.filterNot(_.isWhitespace) + } + } diff --git a/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala index a83c8604..c47db180 100644 --- a/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/persistence/hdfs/ControlMeasuresHdfsStorerJsonSpec.scala @@ -26,12 +26,9 @@ class ControlMeasuresHdfsStorerJsonSpec extends AnyFlatSpec with Matchers { val expectedContent = FileUtils.readFileToString(expectedFilePath) // some output may be prettified while other may not, we do not take this into account. - filterWhitespaces(actualContent) shouldBe filterWhitespaces(expectedContent) + TestResources.filterWhitespaces(actualContent) shouldBe TestResources.filterWhitespaces(expectedContent) fs.delete(outputPath, false) } - private def filterWhitespaces(content: String): String = { - content.filterNot(_.isWhitespace) - } } diff --git a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala index e95dccfe..5d1681bb 100644 --- a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala @@ -2,7 +2,7 @@ package za.co.absa.atum.s3 import org.mockito.captor.{ArgCaptor, Captor} import org.mockito.scalatest.IdiomaticMockito -import org.mockito.{ArgumentMatchers, ArgumentMatchersSugar, Mockito} +import org.mockito.{ArgumentMatchers, Mockito} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers import software.amazon.awssdk.core.ResponseBytes @@ -17,7 +17,7 @@ class ControlMeasuresS3LoaderJsonSpec extends AnyFlatSpec with Matchers with Idi val expectedInputControlMeasure = TestResources.InputInfo.controlMeasure - "ControlMeasuresS3LoaderJsonFile" should "load json file from (mocked) S3" in { + "ControlMeasuresS3LoaderJsonFile" should "load measurements from json file from (mocked) S3" in { val inputLocation = S3Location(bucketName = "bucket1", "path/to/json.info", region = Region.EU_WEST_2) val mockedS3Client = mock[S3Client] @@ -27,6 +27,7 @@ class ControlMeasuresS3LoaderJsonSpec extends AnyFlatSpec with Matchers with Idi override def getS3Client: S3Client = mockedS3Client } + // This file is mocked to be read from in S3 val inputFilePath: String = TestResources.InputInfo.localPath val mockedS3Data = FileUtils.readFileToString(inputFilePath) diff --git a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3StorerJsonSpec.scala 
b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3StorerJsonSpec.scala new file mode 100644 index 00000000..990a170f --- /dev/null +++ b/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3StorerJsonSpec.scala @@ -0,0 +1,58 @@ +package za.co.absa.atum.persistence.hdfs + +import org.mockito.captor.{ArgCaptor, Captor} +import org.mockito.scalatest.IdiomaticMockito +import org.mockito.{ArgumentMatchers, Mockito} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import software.amazon.awssdk.core.sync.RequestBody +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.s3.S3Client +import software.amazon.awssdk.services.s3.model.{PutObjectRequest, PutObjectResponse, ServerSideEncryption} +import za.co.absa.atum.persistence.s3.ControlMeasuresS3StorerJsonFile +import za.co.absa.atum.persistence.{S3KmsSettings, S3Location, TestResources} +import za.co.absa.atum.utils.FileUtils + +import scala.io.Source + +class ControlMeasuresS3StorerJsonSpec extends AnyFlatSpec with Matchers with IdiomaticMockito { + + val inputControlMeasure = TestResources.InputInfo.controlMeasure + + "ControlMeasuresS3StorerJsonFile" should "store measurements to json file to S3" in { + + val outputLocation = S3Location(bucketName = "bucket1", "path/to/json.info", region = Region.EU_WEST_2) + val kmsSettigns = S3KmsSettings("testingKeyId123") + val mockedS3Client = mock[S3Client] + + val storer = new ControlMeasuresS3StorerJsonFile(outputLocation, kmsSettigns) { + override def getS3Client: S3Client = mockedS3Client + } + + // mock S3 response + Mockito.when(mockedS3Client.putObject(ArgumentMatchers.any[PutObjectRequest], ArgumentMatchers.any[RequestBody])) + .thenReturn(mock[PutObjectResponse]) // anything non-throwing + val loadedControlMeasure = storer.store(inputControlMeasure) + + // verify request content + val putRequestCaptor: Captor[PutObjectRequest] = ArgCaptor[PutObjectRequest] + val requestBodyCaptor: Captor[RequestBody] = ArgCaptor[RequestBody] + + Mockito.verify(mockedS3Client).putObject(putRequestCaptor.capture, requestBodyCaptor.capture) + val (capturedPutRequest, capturedRequestBody) = (putRequestCaptor.value, requestBodyCaptor.value) + + capturedPutRequest.bucket shouldBe "bucket1" + capturedPutRequest.key shouldBe "path/to/json.info" + capturedPutRequest.ssekmsKeyId shouldBe "testingKeyId123" + capturedPutRequest.serverSideEncryption() shouldBe ServerSideEncryption.AWS_KMS + + // This expected request body content should be the same as content of this file ( ~inputControlMeasure) + val sameContentFile: String = TestResources.InputInfo.localPath + val expectedContent = FileUtils.readFileToString(sameContentFile) + + val requestDataContent = Source.fromInputStream(capturedRequestBody.contentStreamProvider().newStream()).mkString + TestResources.filterWhitespaces(requestDataContent) shouldBe TestResources.filterWhitespaces(expectedContent) + + } + +} diff --git a/examples/pom.xml b/examples/pom.xml index e3ce2290..fc19dbaf 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -75,7 +75,7 @@ scalatest-maven-plugin ${scalatest.maven.version} - false + true From 98f27b978eb00e868ad1b616f359a74eaf61acfa Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Mon, 31 Aug 2020 12:27:42 +0200 Subject: [PATCH 11/18] SampleS3Measurements1|2 now work with SAML profile and KMS Key ID loaded from environment property: TOOLING_KMS_KEY_ID cleanup --- .../scala/za/co/absa/atum/AtumImplicits.scala | 21 +++++--- 
.../s3/ControlMeasuresS3LoaderJsonFile.scala | 17 ++++--- .../s3/ControlMeasuresS3StorerJsonFile.scala | 26 ++++++---- .../za/co/absa/atum/utils/S3ClientUtils.scala | 51 ------------------- .../scala/za/co/absa/atum/utils/S3Utils.scala | 25 +++++++++ .../absa/atum/persistence/TestResources.scala | 1 - .../s3/ControlMeasuresS3LoaderJsonSpec.scala | 5 +- .../s3/ControlMeasuresS3StorerJsonSpec.scala | 3 ++ .../atum/examples/SampleS3Measurements1.scala | 9 ++-- .../atum/examples/SampleS3Measurements2.scala | 13 +++-- ...=> SampleMeasurementsHdfsRunnerSpec.scala} | 2 +- .../SampleMeasurementsS3RunnerSpec.scala | 5 +- 12 files changed, 88 insertions(+), 90 deletions(-) delete mode 100644 atum/src/main/scala/za/co/absa/atum/utils/S3ClientUtils.scala create mode 100644 atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala rename atum/src/test/scala/za/co/absa/atum/{ => persistence}/s3/ControlMeasuresS3LoaderJsonSpec.scala (91%) rename atum/src/test/scala/za/co/absa/atum/{ => persistence}/s3/ControlMeasuresS3StorerJsonSpec.scala (94%) rename examples/src/test/scala/za/co/absa/atum/examples/{SampleMeasurementsAllRunnerSpec.scala => SampleMeasurementsHdfsRunnerSpec.scala} (94%) diff --git a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala index df5d846b..7a50262e 100644 --- a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala +++ b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala @@ -15,15 +15,17 @@ package za.co.absa.atum -import scala.language.implicitConversions import org.apache.hadoop.fs.Path import org.apache.spark.sql.{Dataset, Row, SparkSession} +import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider} import za.co.absa.atum.core.Atum.controlFrameworkState -import za.co.absa.atum.core.{Atum, Constants, SparkEventListener, SparkQueryExecutionListener} +import za.co.absa.atum.core.{Atum, Constants} import za.co.absa.atum.persistence._ import za.co.absa.atum.persistence.hdfs.{ControlMeasuresHdfsLoaderJsonFile, ControlMeasuresHdfsStorerJsonFile} import za.co.absa.atum.persistence.s3.{ControlMeasuresS3LoaderJsonFile, ControlMeasuresS3StorerJsonFile} +import scala.language.implicitConversions + /** * The object contains implicit methods for Control Framework * Minimalistic example of enabling control measurements tracking: @@ -83,17 +85,24 @@ object AtumImplicits { enableControlMeasuresTracking(loader, storer) } - // TODO need souceS3Location, dest s3location and possibly some s3 kms:sse, kmskeyId + /** + * Enable S3-based control measurements tracking. 
+   *
+   * @param sourceS3Location S3 location to load the info file from
+   * @param destinationS3Config S3 location and KMS settings for saving the data to S3
+   * @param credentialsProvider If you do not have a specific credentials provider, use the default
+   *                            {@link software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider#create()}
+   * @return SparkSession with Atum tracking enabled
+   */
   def enableControlMeasuresTrackingForS3(sourceS3Location: Option[S3Location],
-                                         destinationS3Config: Option[(S3Location, S3KmsSettings)]
-                                         ): SparkSession = {
+                                         destinationS3Config: Option[(S3Location, S3KmsSettings)])
+                                        (implicit credentialsProvider: AwsCredentialsProvider): SparkSession = {

     val loader = sourceS3Location.map(new ControlMeasuresS3LoaderJsonFile(_))
     val storer = destinationS3Config.map { case (destLoc, kms) =>
       new ControlMeasuresS3StorerJsonFile(destLoc, kms)
     }

-    Atum.log.info(s"enableControlMeasuresTracking(loader = $loader, storer = $storer)") // TODO remove debug
     enableControlMeasuresTracking(loader, storer)
   }

diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala
index 18756034..690e35c2 100644
--- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala
+++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala
@@ -15,15 +15,20 @@ package za.co.absa.atum.persistence.s3

-import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider
+import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider}
 import software.amazon.awssdk.services.s3.S3Client
 import software.amazon.awssdk.services.s3.model.GetObjectRequest
 import za.co.absa.atum.model.ControlMeasure
 import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresParser, S3Location}
-import za.co.absa.atum.utils.{ControlUtils, S3ClientUtils}
+import za.co.absa.atum.utils.{ControlUtils, S3Utils}

-/** A loader of control measurements from a JSON file stored in AWS S3. */
-class ControlMeasuresS3LoaderJsonFile(inputLocation: S3Location) extends ControlMeasuresLoader {
+/**
+ * A loader of control measurements from a JSON file stored in AWS S3.
+ * @param inputLocation S3 location to read the json measurements from
+ * @param credentialsProvider a specific credentials provider (e.g. SAML profile). 
use [[DefaultCredentialsProvider]] when in doubt + */ +class ControlMeasuresS3LoaderJsonFile(inputLocation: S3Location) + (implicit credentialsProvider: AwsCredentialsProvider) extends ControlMeasuresLoader { override def load(): ControlMeasure = { val s3Client: S3Client = getS3Client @@ -39,8 +44,6 @@ class ControlMeasuresS3LoaderJsonFile(inputLocation: S3Location) extends Control s"JSON deserializer from ${inputLocation.s3String()}" } - private[s3] def getS3Client: S3Client = S3ClientUtils.getS3Client(inputLocation.region) - - // S3ClientUtils.getS3ClientWithLocalProfile(inputLocation.region, "saml") // TODO remove + private[s3] def getS3Client: S3Client = S3Utils.getS3Client(inputLocation.region, credentialsProvider) } diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala index f134cb72..dacd8a6e 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala @@ -15,16 +15,23 @@ package za.co.absa.atum.persistence.s3 -import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider +import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider} import software.amazon.awssdk.core.sync.RequestBody import software.amazon.awssdk.services.s3.S3Client -import software.amazon.awssdk.services.s3.model.{PutObjectRequest, PutObjectResponse, ServerSideEncryption} +import software.amazon.awssdk.services.s3.model.PutObjectRequest import za.co.absa.atum.model.ControlMeasure import za.co.absa.atum.persistence.{ControlMeasuresParser, ControlMeasuresStorer, S3KmsSettings, S3Location} -import za.co.absa.atum.utils.S3ClientUtils +import za.co.absa.atum.utils.S3Utils -/** A storer of control measurements to AWS S3 as a JSON file . */ -class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S3KmsSettings) extends ControlMeasuresStorer { +/** + * A storer of control measurements to a JSON file stored in AWS S3. + * + * @param outputLocation s3 location to save measurements data to + * @param kmsSettings KMS settings - server side encryption configuration + * @param credentialsProvider a specific credentials provider (e.g. SAML profile). 
use [[DefaultCredentialsProvider]] when in doubt + */ +class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S3KmsSettings) + (implicit credentialsProvider: AwsCredentialsProvider) extends ControlMeasuresStorer { override def store(controlInfo: ControlMeasure): Unit = { val serialized = ControlMeasuresParser asJson controlInfo saveDataToFile(serialized) @@ -38,16 +45,13 @@ class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S .ssekmsKeyId(kmsSettings.kmsKeyId) .build() - // may throw S3Exception or SdkClientException (base exception class = SdkException) - s3Client.putObject(putRequest, RequestBody.fromString(data)) // would throw S3Exception or similar + // would throw S3Exception or SdkClientException in case of failure (base exception class: SdkException) + s3Client.putObject(putRequest, RequestBody.fromString(data)) } override def getInfo: String = { s"JSON serializer for Storer to ${outputLocation.s3String()}" } - private[s3] def getS3Client: S3Client = S3ClientUtils.getS3Client(outputLocation.region) - - // S3ClientUtils.getS3ClientWithLocalProfile(inputLocation.region, "saml") // TODO remove - + private[s3] def getS3Client: S3Client = S3Utils.getS3Client(outputLocation.region, credentialsProvider) } diff --git a/atum/src/main/scala/za/co/absa/atum/utils/S3ClientUtils.scala b/atum/src/main/scala/za/co/absa/atum/utils/S3ClientUtils.scala deleted file mode 100644 index 97f84614..00000000 --- a/atum/src/main/scala/za/co/absa/atum/utils/S3ClientUtils.scala +++ /dev/null @@ -1,51 +0,0 @@ -package za.co.absa.atum.utils - -import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider -import software.amazon.awssdk.regions.Region -import software.amazon.awssdk.services.s3.{S3Client, S3ClientBuilder} -import za.co.absa.atum.core.Atum.log - -object S3ClientUtils { - - def getS3ClientWithLocalProfile(region: Region, profileName: String): S3Client = { - val localProfileCredentials = ProfileCredentialsProvider.create(profileName) - log.debug(s"Credentials of local $profileName profile =" + - s" ${localProfileCredentials.resolveCredentials().accessKeyId()}, ${localProfileCredentials.resolveCredentials().secretAccessKey().take(5)}...") - - getS3Client(region, Some(localProfileCredentials)) - } - - def getS3Client(region: Region, credentialsProvider: Option[ProfileCredentialsProvider] = None): S3Client = { - S3Client.builder() - .region(region) - .applyCredentialsProviderIfDefined(credentialsProvider) - .build() - } - - implicit class S3ClientBuilderExt(s3ClientBuilder: S3ClientBuilder) { - /** - * Universal conditional S3ClientBuilder=>S3ClientBuilder apply method - * - * @param condition `fn` will be applied when true, not applied when false - * @param fn full - * @return original object - */ - def applyIf(condition: Boolean, fn: S3ClientBuilder => S3ClientBuilder): S3ClientBuilder = { - if (condition) { - fn(s3ClientBuilder) - } else - s3ClientBuilder - } - - /** - * Apply `optionalCredentialsProvider` if defined - * - * @param optionalCredentialsProvider - * @return original object - */ - def applyCredentialsProviderIfDefined(optionalCredentialsProvider: Option[ProfileCredentialsProvider]): S3ClientBuilder = { - optionalCredentialsProvider.fold(s3ClientBuilder)(s3ClientBuilder.credentialsProvider) - } - } - -} diff --git a/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala b/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala new file mode 100644 index 00000000..4f3c0aef --- /dev/null +++ 
b/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala @@ -0,0 +1,25 @@ +package za.co.absa.atum.utils + +import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, ProfileCredentialsProvider} +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.s3.S3Client +import za.co.absa.atum.core.Atum.log + +object S3Utils { + + def getLocalProfileCredentialsProvider(credentialsProfileName: String): ProfileCredentialsProvider = { + val localProfileCredentials = ProfileCredentialsProvider.create(credentialsProfileName) + log.debug(s"Credentials of local $credentialsProfileName profile =" + + s" ${localProfileCredentials.resolveCredentials().accessKeyId()}, ${localProfileCredentials.resolveCredentials().secretAccessKey().take(5)}...") + + localProfileCredentials + } + + def getS3Client(region: Region, credentialsProvider: AwsCredentialsProvider): S3Client = { + S3Client.builder() + .region(region) + .credentialsProvider(credentialsProvider) + .build() + } + +} diff --git a/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala b/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala index 44feda9d..5cb3b040 100644 --- a/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala +++ b/atum/src/test/scala/za/co/absa/atum/persistence/TestResources.scala @@ -1,6 +1,5 @@ package za.co.absa.atum.persistence -import za.co.absa.atum.core.ControlType import za.co.absa.atum.model.{Checkpoint, ControlMeasure, ControlMeasureMetadata, Measurement} object TestResources { diff --git a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonSpec.scala similarity index 91% rename from atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala rename to atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonSpec.scala index 5d1681bb..4386c8cf 100644 --- a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3LoaderJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonSpec.scala @@ -1,15 +1,15 @@ -package za.co.absa.atum.s3 +package za.co.absa.atum.persistence.s3 import org.mockito.captor.{ArgCaptor, Captor} import org.mockito.scalatest.IdiomaticMockito import org.mockito.{ArgumentMatchers, Mockito} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers +import software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider import software.amazon.awssdk.core.ResponseBytes import software.amazon.awssdk.regions.Region import software.amazon.awssdk.services.s3.S3Client import software.amazon.awssdk.services.s3.model.{GetObjectRequest, GetObjectResponse} -import za.co.absa.atum.persistence.s3.ControlMeasuresS3LoaderJsonFile import za.co.absa.atum.persistence.{S3Location, TestResources} import za.co.absa.atum.utils.FileUtils @@ -23,6 +23,7 @@ class ControlMeasuresS3LoaderJsonSpec extends AnyFlatSpec with Matchers with Idi val mockedS3Client = mock[S3Client] val mockedRequest: ResponseBytes[GetObjectResponse] = mock[ResponseBytes[GetObjectResponse]] + implicit val credentialsProvider = DefaultCredentialsProvider.create() val loader = new ControlMeasuresS3LoaderJsonFile(inputLocation) { override def getS3Client: S3Client = mockedS3Client } diff --git a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3StorerJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonSpec.scala 
similarity index 94% rename from atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3StorerJsonSpec.scala rename to atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonSpec.scala index 990a170f..66f65e9d 100644 --- a/atum/src/test/scala/za/co/absa/atum/s3/ControlMeasuresS3StorerJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonSpec.scala @@ -5,6 +5,7 @@ import org.mockito.scalatest.IdiomaticMockito import org.mockito.{ArgumentMatchers, Mockito} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers +import software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider import software.amazon.awssdk.core.sync.RequestBody import software.amazon.awssdk.regions.Region import software.amazon.awssdk.services.s3.S3Client @@ -25,6 +26,8 @@ class ControlMeasuresS3StorerJsonSpec extends AnyFlatSpec with Matchers with Idi val kmsSettigns = S3KmsSettings("testingKeyId123") val mockedS3Client = mock[S3Client] + implicit val credentialsProvider = DefaultCredentialsProvider.create() + val storer = new ControlMeasuresS3StorerJsonFile(outputLocation, kmsSettigns) { override def getS3Client: S3Client = mockedS3Client } diff --git a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala index b5677d7c..471338b6 100644 --- a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala +++ b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala @@ -18,22 +18,25 @@ package za.co.absa.atum.examples import org.apache.spark.sql.{SaveMode, SparkSession} import za.co.absa.atum.AtumImplicits._ import za.co.absa.atum.persistence.S3Location +import za.co.absa.atum.utils.S3Utils object SampleS3Measurements1 { def main(args: Array[String]) { val sparkBuilder = SparkSession.builder().appName("Sample S3 Measurements 1 Job") val spark = sparkBuilder -// .master("local") + // .master("local") .getOrCreate() import spark.implicits._ + // This sample example relies on local credentials profile named "saml" with access to the s3 location defined below + implicit val samlCredentialsProvider = S3Utils.getLocalProfileCredentialsProvider("saml") + // Initializing library to hook up to Apache Spark spark.enableControlMeasuresTrackingForS3( sourceS3Location = Some(S3Location("euw1-ctodatadev-dev-bigdatarnd-s3-poc", "atum/input/wikidata.csv.info")), destinationS3Config = None - ) - .setControlMeasuresWorkflow("Job 1 S3 ") + ).setControlMeasuresWorkflow("Job 1 S3 ") // A business logic of a spark job ... 
diff --git a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala index 90a03cc0..51be89ec 100644 --- a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala +++ b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala @@ -16,9 +16,10 @@ package za.co.absa.atum.examples import org.apache.spark.sql.{SaveMode, SparkSession} -import software.amazon.awssdk.services.s3.S3Configuration import za.co.absa.atum.AtumImplicits._ +import za.co.absa.atum.core.Atum import za.co.absa.atum.persistence.{S3KmsSettings, S3Location} +import za.co.absa.atum.utils.S3Utils object SampleS3Measurements2 { def main(args: Array[String]) { @@ -30,8 +31,11 @@ object SampleS3Measurements2 { val spark = sparkBuilder.getOrCreate() import spark.implicits._ - val kmsKeyId = "todo put keyId here" // TODO must be removed/resuplied - val s3KmsSettings = + // This sample example relies on local credentials profile named "saml" with access to the s3 location defined below + // AND by having explicitly defined KMS Key ID + implicit val samlCredentialsProvider = S3Utils.getLocalProfileCredentialsProvider("saml") + val kmsKeyId = System.getenv("TOOLING_KMS_KEY_ID") // load from an environment property in order not to disclose it here + Atum.log.info(s"kmsKeyId from env loaded = ${kmsKeyId.take(10)}...") // Initializing library to hook up to Apache Spark // No need to specify datasetName and datasetVersion as it is stage 2 and it will be determined automatically @@ -41,8 +45,7 @@ object SampleS3Measurements2 { S3Location("euw1-ctodatadev-dev-bigdatarnd-s3-poc", "atum/output/wikidata.csv.info"), S3KmsSettings(kmsKeyId) ) - ) - .setControlMeasuresWorkflow("Job 2") + ) .setControlMeasuresWorkflow("Job 2") val sourceDS = spark.read .parquet("data/output_s3/stage1_job_results") diff --git a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsHdfsRunnerSpec.scala similarity index 94% rename from examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala rename to examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsHdfsRunnerSpec.scala index 1ebbeded..7d803e69 100644 --- a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsAllRunnerSpec.scala +++ b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsHdfsRunnerSpec.scala @@ -18,7 +18,7 @@ package za.co.absa.atum.examples import org.scalatest.funsuite.AnyFunSuite import za.co.absa.atum.utils._ -class SampleMeasurementsAllRunnerSpec extends AnyFunSuite +class SampleMeasurementsHdfsRunnerSpec extends AnyFunSuite with SparkJobRunnerMethods with SparkLocalMaster { diff --git a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala index 212b14d9..e43de762 100644 --- a/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala +++ b/examples/src/test/scala/za/co/absa/atum/examples/SampleMeasurementsS3RunnerSpec.scala @@ -22,7 +22,6 @@ class SampleMeasurementsS3RunnerSpec extends AnyFunSuite with SparkJobRunnerMethods with SparkLocalMaster { - // todo reenable when s3 mock is ready -// runSparkJobAsTest[SampleS3Measurements1.type] -// runSparkJobAsTest[SampleS3Measurements2.type] + runSparkJobAsTest[SampleS3Measurements1.type] + 
runSparkJobAsTest[SampleS3Measurements2.type] } From e993f78919193753e02642410a43afaa7eaa9622 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Mon, 31 Aug 2020 13:31:24 +0200 Subject: [PATCH 12/18] scaladoc fix (reformat revert) --- .../scala/za/co/absa/atum/AtumImplicits.scala | 228 +++++++++--------- 1 file changed, 114 insertions(+), 114 deletions(-) diff --git a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala index 7a50262e..2ffbe0e1 100644 --- a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala +++ b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala @@ -27,25 +27,25 @@ import za.co.absa.atum.persistence.s3.{ControlMeasuresS3LoaderJsonFile, ControlM import scala.language.implicitConversions /** - * The object contains implicit methods for Control Framework - * Minimalistic example of enabling control measurements tracking: - * {{{ - * import za.co.absa.atum.Atum - * import za.co.absa.atum.AtumImplicits._ - * - * ... - * - * spark.enableControlFrameworkTracking(sourceInfoFile = "/source/info/file/path") - * - * ... - * - * dataSet.setCheckpoint("Checkpoint Name") - * }}} - * - * You can use enableControlFrameworkTracking() without parameters if the _INFO file - * is in the path. - * - */ + * The object contains implicit methods for Control Framework + * Minimalistic example of enabling control measurements tracking: + * {{{ + * import za.co.absa.atum.Atum + * import za.co.absa.atum.AtumImplicits._ + * + * ... + * + * spark.enableControlFrameworkTracking(sourceInfoFile = "/source/info/file/path") + * + * ... + * + * dataSet.setCheckpoint("Checkpoint Name") + * }}} + * + * You can use enableControlFrameworkTracking() without parameters if the _INFO file + * is in the path. + * + */ object AtumImplicits { type DefaultControlInfoStorer = ControlMeasuresHdfsStorerJsonFile type DefaultControlInfoLoader = ControlMeasuresHdfsLoaderJsonFile @@ -53,28 +53,28 @@ object AtumImplicits { implicit def StringToPath(path: String): Path = new Path(path) /** - * The class contains implicit methods for [[org.apache.spark.sql.SparkSession]]. - */ + * The class contains implicit methods for [[org.apache.spark.sql.SparkSession]]. + */ implicit class SparkSessionWrapper(sparkSession: SparkSession) { /** - * Enable control measurements tracking. - * Input and output info file names will be inferred automatically based on data source and destination paths - * - */ + * Enable control measurements tracking. + * Input and output info file names will be inferred automatically based on data source and destination paths + * + */ def enableControlMeasuresTracking(): SparkSession = { enableControlMeasuresTracking(None, None) } /** - * Enable control measurements tracking. - * Both input and output info file paths need to be provided - * - * Example info file path name: "data/input/wikidata.csv.info" - * - * @param sourceInfoFile Pathname to a json-formatted info file containing control measurements - * @param destinationInfoFile Pathname to save the control measurement results to - */ + * Enable control measurements tracking. 
+     * Both input and output info file paths need to be provided
+     *
+     * Example info file path name: "data/input/wikidata.csv.info"
+     *
+     * @param sourceInfoFile      Pathname to a json-formatted info file containing control measurements
+     * @param destinationInfoFile Pathname to save the control measurement results to
+     */
    def enableControlMeasuresTracking(sourceInfoFile: String = "",
                                      destinationInfoFile: String = ""): SparkSession = {
      val hadoopConfiguration = sparkSession.sparkContext.hadoopConfiguration
@@ -107,13 +107,13 @@ object AtumImplicits {
    }

    /**
-      * Enable control measurements tracking.
-      * This is a generic way to enable control measurements tracking enabling to provide a custom
-      * control measurements loader and storer objects
-      *
-      * @param loader An object responsible for loading data source control measurements
-      * @param storer An object responsible for storing the result control measurements
-      */
+     * Enable control measurements tracking.
+     * This is a generic way to enable control measurements tracking, allowing you to provide custom
+     * control measurements loader and storer objects
+     *
+     * @param loader An object responsible for loading data source control measurements
+     * @param storer An object responsible for storing the result control measurements
+     */
    def enableControlMeasuresTracking(loader: Option[ControlMeasuresLoader],
                                      storer: Option[ControlMeasuresStorer]): SparkSession =
      sparkSession.synchronized {
@@ -131,26 +131,26 @@ object AtumImplicits {
    }

    /**
-      * Explicitly disable control measurements tracking.
-      * After invoking this routine control measuress will not be tracked for the rest of the Spark Job
-      *
-      */
+     * Explicitly disable control measurements tracking.
+     * After invoking this routine control measures will not be tracked for the rest of the Spark Job
+     *
+     */
    def disableControlMeasuresTracking(): SparkSession = sparkSession.synchronized {
      Atum.dispose(sparkSession)
      sparkSession
-  }
+    }

    /**
-      * Sets control measurements file name for the source and destination data set.
-      * The file name should not contain path as it will be inferred from data source and destination.
-      * Use this only if info file paths and not specified when calling enableControlFrameworkTracking()
-      *
-      * Example info file name: "_INFO"
-      *
-      * @param fileName A file name for control measurements info
-      */
+     * Sets control measurements file name for the source and destination data set.
+     * The file name should not contain path as it will be inferred from data source and destination.
+     * Use this only if info file paths are not specified when calling enableControlFrameworkTracking()
+     *
+     * Example info file name: "_INFO"
+     *
+     * @param fileName A file name for control measurements info
+     */
    def setControlMeasuresFileName(fileName: String): SparkSession = {
      setControlMeasuresInputFileName(fileName)
      setControlMeasuresOutputFileName(fileName)
      sparkSession
    }

    /**
-      * Sets control measurements file name for the source data set.
-      * The file name should not contain path as it will be inferred from data source.
+     * Use this only if the input info file path is not specified when calling enableControlFrameworkTracking()
+     *
+     * Example info file name: "_INFO"
+     *
+     * @param fileName A file name for control measurements info
+     */
    def setControlMeasuresInputFileName(fileName: String): SparkSession = {
      Atum.setControlMeasuresInputFileName(fileName)
      sparkSession
    }

    /**
-      * Sets control measurements file name for the destination data set.
-      * The file name should not contain path as it will be inferred from data destination.
-      * Use this only if the output info file path and not specified when calling enableControlFrameworkTracking()
-      *
-      * Example info file name: "_INFO"
-      *
-      * @param fileName A file name for control measurements info
-      */
+     * Sets control measurements file name for the destination data set.
+     * The file name should not contain path as it will be inferred from data destination.
+     * Use this only if the output info file path is not specified when calling enableControlFrameworkTracking()
+     *
+     * Example info file name: "_INFO"
+     *
+     * @param fileName A file name for control measurements info
+     */
    def setControlMeasuresOutputFileName(fileName: String): SparkSession = {
      Atum.setControlMeasuresOutputFileName(fileName)
      sparkSession
    }

    /**
-      * The method sets workflow name for the current job
-      *
-      * @param workflowName Name of the checkpoint
-      */
+     * The method sets workflow name for the current job
+     *
+     * @param workflowName Name of the workflow
+     */
    def setControlMeasuresWorkflow(workflowName: String): SparkSession = {
      Atum.setWorkflowName(workflowName)
      sparkSession
    }

    /**
-      * Check if Control Framework is initialized
-      *
-      * @return true is Control Framework is initialized
-      */
+     * Check if Control Framework is initialized
+     *
+     * @return true if Control Framework is initialized
+     */
    def isControlMeasuresTrackingEnabled: Boolean = {
      sparkSession.sessionState.conf contains Constants.InitFlagKey
    }

    /**
-      * The method notifies Menas of a job failure
-      *
-      * @param jobStep A job step name
-      * @param errorDescription An error description
-      * @param techDetails A technical details
-      */
+     * The method notifies Menas of a job failure
+     *
+     * @param jobStep A job step name
+     * @param errorDescription An error description
+     * @param techDetails Technical details
+     */
    def setControlMeasurementError(jobStep: String, errorDescription: String, techDetails: String): SparkSession = {
      val errorDescriptionTrunc = if (errorDescription.length > Constants.maxErrorMessageSize)
        errorDescription.substring(0, Constants.maxErrorMessageSize)
@@ -229,16 +229,16 @@ object AtumImplicits {
    }

    /**
-      * The class contains implicit methods for [[org.apache.spark.sql.Dataset]]. 
+ */ implicit class DataSetWrapper(dataset: Dataset[Row]) { /** - * The method creates a new checkpoint by calculating control measurements of the dataset - * On first checkpoint Spark Session Key ControlFrameworkKeys.InfoFileVersionKey is updated - * to the info file stored version - * - * @param name Name of the checkpoint - */ + * The method creates a new checkpoint by calculating control measurements of the dataset + * On first checkpoint Spark Session Key ControlFrameworkKeys.InfoFileVersionKey is updated + * to the info file stored version + * + * @param name Name of the checkpoint + */ def setCheckpoint(name: String, persistInDatabase: Boolean = true): Dataset[Row] = { if (!(dataset.sparkSession.sessionState.conf contains Constants.InitFlagKey)) throw new IllegalStateException("Control framework tracking is not initialized.") @@ -252,12 +252,12 @@ object AtumImplicits { } /** - * The method returns the number of records in the dataframe calculated during the last checkpoint. - * If record count is absent in the checkpoint measurements, None is returned. - * - * This is useful to optimize out an additional df.count() invocation in a Spark job with - * enabled control measurements. - */ + * The method returns the number of records in the dataframe calculated during the last checkpoint. + * If record count is absent in the checkpoint measurements, None is returned. + * + * This is useful to optimize out an additional df.count() invocation in a Spark job with + * enabled control measurements. + */ def lastCheckpointRowCount: Option[Long] = { if (!(dataset.sparkSession.sessionState.conf contains Constants.InitFlagKey)) throw new IllegalStateException("Control framework tracking is not initialized.") @@ -271,11 +271,11 @@ object AtumImplicits { } /** - * The method registers a column rename of a column that is used for control measurements - * - * @param oldName A job step name - * @param newName An error description - */ + * The method registers a column rename of a column that is used for control measurements + * + * @param oldName A job step name + * @param newName An error description + */ def registerColumnRename(oldName: String, newName: String): Dataset[Row] = { if (!(dataset.sparkSession.sessionState.conf contains Constants.InitFlagKey)) throw new IllegalStateException("Control framework tracking is not initialized.") @@ -284,10 +284,10 @@ object AtumImplicits { } /** - * The method registers a column drop when it is no longer needed for the column to calculate control measurements - * - * @param columnName A column to be dropped from measurements - */ + * The method registers a column drop when it is no longer needed for the column to calculate control measurements + * + * @param columnName A column to be dropped from measurements + */ def registerColumnDrop(columnName: String): Dataset[Row] = { if (!(dataset.sparkSession.sessionState.conf contains Constants.InitFlagKey)) throw new IllegalStateException("Control framework tracking is not initialized.") @@ -296,20 +296,20 @@ object AtumImplicits { } /** - * The method fetches the initial control measurements and puts version from info file - * to ControlFrameworkKeys.InfoFileVersionKey Spark Session Key - * - */ + * The method fetches the initial control measurements and puts version from info file + * to ControlFrameworkKeys.InfoFileVersionKey Spark Session Key + * + */ def loadControlInfoFile(): Dataset[Row] = { Atum.controlFrameworkState.initializeControlInfo(dataset) dataset } /** - * The method saves the info file to the 
specified destination path on HDFS
- *
- * @param outputPath A directory or a file name to save the info file to.
- */
+ * The method saves the info file to the specified destination path on HDFS
+ *
+ * @param outputPath A directory or a file name to save the info file to.
+ */
def writeInfoFile(outputPath: String): Dataset[Row] = {
Atum.controlFrameworkState.storeCurrentInfoFile(outputPath)
dataset

From 6c51d928ca340d636f06d84ef0893aa0f5442437 Mon Sep 17 00:00:00 2001
From: Daniel Kavan
Date: Mon, 31 Aug 2020 14:32:10 +0200
Subject: [PATCH 13/18] scala doc touchups README.md update with S3 specific example

---
README.md | 61 ++++++++++++++++---
.../scala/za/co/absa/atum/AtumImplicits.scala | 2 +-
.../s3/ControlMeasuresS3LoaderJsonFile.scala | 2 +-
.../s3/ControlMeasuresS3StorerJsonFile.scala | 16 +++--
.../s3/ControlMeasuresS3StorerJsonSpec.scala | 2 +-
5 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index bb4fb357..27293ca5 100644
--- a/README.md
+++ b/README.md
@@ -86,17 +86,17 @@ independently.

## Usage

-#### Coordinate for Maven POM dependency
+### Coordinate for Maven POM dependency

```xml

za.co.absa
atum
- 0.2.6
+ 0.3.0

```

-#### Initial info file generation example
+### Initial info file generation example

Atum provides a method for initial creation of info files from a Spark dataframe. It can be used as is or can serve as a reference implementation for calculating control measurements.

are not supported.

// The info file contents are available as a String object in strJson.
```

-#### An ETL job example
+### An ETL job example

For the full example please see **SampleMeasurements1** and **SampleMeasurements2** objects from the *atum.examples* project. It uses made-up Wikipedia data for computations. The source data has an info file containing the initial checkpoints,

In this example the data is read from 'data/input/mydata.csv' file. This data file has control measurements recorded in 'data/input/_INFO'. Two checkpoints are created. Any business logic can be inserted between reading the source data and saving it to Parquet format.

+### Storing Measurements in AWS S3
+Starting with version 0.3.0, persistence support for AWS S3 has been added.
+AWS S3 can be used both for loading the measurement data and for saving the measurements back.
+
+The following example demonstrates the setup:
+```scala
+import org.apache.spark.sql.SparkSession
+import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider, ProfileCredentialsProvider}
+import za.co.absa.atum.AtumImplicits._
+import za.co.absa.atum.persistence.{S3KmsSettings, S3Location}
+
+object S3Example {
+  def main(args: Array[String]) {
+    val spark = SparkSession
+      .builder()
+      .appName("Example S3 Atum init showcase")
+      .getOrCreate()
+
+    // Here we are using the default credentials provider, which relies on its default credentials provider chain to obtain the credentials
+    // (e.g. running in EMR/EC2 with the correct role assigned)
+    implicit val defaultCredentialsProvider: AwsCredentialsProvider = DefaultCredentialsProvider.create()
+    // Alternatively, one could pass a specific credentials provider. 
An example of using local profile named "saml" can be: + // implicit val samlCredentialsProvider = ProfileCredentialsProvider.create("saml") + + val sourceS3Location: S3Location = S3Location("my-bucket123", "atum/input/my_amazing_measures.csv.info") + + val kmsKeyId: String = "arn:aws:kms:eu-west-1:123456789012:key/12345678-90ab-cdef-1234-567890abcdef" // just example + val destinationS3Config: (S3Location, S3KmsSettings) = ( + S3Location("my-bucket123", "atum/output/my_amazing_measures2.csv.info"), + S3KmsSettings(kmsKeyId) + ) + + import spark.implicits._ + + // Initializing library to hook up to Apache Spark with S3 persistence + spark.enableControlMeasuresTrackingForS3( + sourceS3Location = Some(sourceS3Location), + destinationS3Config = Some(destinationS3Config) + ).setControlMeasuresWorkflow("A job with measurements saved to S3") + } +} + +``` +The rest of the processing logic and programatic approach to the library remains unchanged. + + +## Atum library routines The summary of common control framework routines you can use as Spark and Dataframe implicits are as follows: | Routine | Description | Example usage | | -------------- |:-------------------- |:---------------| -| enableControlMeasuresTracking(sourceInfoFile: *String*, destinationInfoFile: *String*) | Enable control measurements tracking. Source and destination info file paths can be omitted. If ommited, they will be automatically inferred from the input/output data sources. | spark.enableControlMeasurementsTracking() | +| enableControlMeasuresTracking(sourceInfoFile: *String*, destinationInfoFile: *String*) | Enable control measurements tracking. Source and destination info file paths can be omitted. If omitted, they will be automatically inferred from the input/output data sources. | spark.enableControlMeasurementsTracking() | +| enableControlMeasuresTrackingForS3(sourceS3Location: *Option[S3Location]*, destinationS3Config: *Option[(S3Location, S3KmsSettings)]*) | Enable control measurements tracking in S3. Source and destination parameters can be omitted. If omitted, the loading/storing part will not be used | spark.enableControlMeasuresTrackingForS3(optionalSourceS3Location, optionalDestinationS3Config) | | isControlMeasuresTrackingEnabled: *Boolean* | Retruns true if control measurements tracking is enabled. | if (spark.isControlMeasuresTrackingEnabled) {/*do something*/} | | disableControlMeasuresTracking() | Explicitly turn off control measurements tracking. | spark.disableControlMeasurementsTracking() | | setCheckpoint(name: *String*) | Calculates the control measurements and appends a new checkpoint. | df.setCheckpoint("Conformance Started") | @@ -204,7 +251,7 @@ The summary of common control framework routines you can use as Spark and Datafr | disableCaching() | Turns off caching that happens every time a checkpoint is generated. | disableCaching() | | setCachingStorageLevel(cacheStorageLevel: *StorageLevel*) | Specifies a Spark storage level to use for caching. Can be one of following: `NONE`, `DISK_ONLY`, `DISK_ONLY_2`, `MEMORY_ONLY`, `MEMORY_ONLY_2`, `MEMORY_ONLY_SER`, `MEMORY_ONLY_SER_2`, `MEMORY_AND_DISK`, `MEMORY_AND_DISK_2`, `MEMORY_AND_DISK_SER`, `MEMORY_AND_DISK_SER_2`, `MEMORY_AND_DISK_SER_2`, `OFF_HEAP`. | setCachingStorageLevel(StorageLevel.MEMORY_AND_DISK) | -### Control measurement types +## Control measurement types The control measurement of a column is a hash sum. It can be calculated differently depending on the column's data type and on business requirements. 
This table represents all currently supported measurement types: diff --git a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala index 2ffbe0e1..aa14dc7e 100644 --- a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala +++ b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala @@ -17,7 +17,7 @@ package za.co.absa.atum import org.apache.hadoop.fs.Path import org.apache.spark.sql.{Dataset, Row, SparkSession} -import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider} +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider import za.co.absa.atum.core.Atum.controlFrameworkState import za.co.absa.atum.core.{Atum, Constants} import za.co.absa.atum.persistence._ diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala index 690e35c2..421e4029 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3LoaderJsonFile.scala @@ -25,7 +25,7 @@ import za.co.absa.atum.utils.{ControlUtils, S3Utils} /** * A loader of control measurements from a JSON file stored in AWS S3. * @param inputLocation S3 location to read the json measurements from - * @param credentialsProvider a specific credentials provider (e.g. SAML profile). use [[DefaultCredentialsProvider]] when in doubt + * @param credentialsProvider a specific credentials provider (e.g. SAML profile). Consider using [[DefaultCredentialsProvider#create()]] when in doubt. */ class ControlMeasuresS3LoaderJsonFile(inputLocation: S3Location) (implicit credentialsProvider: AwsCredentialsProvider) extends ControlMeasuresLoader { diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala index dacd8a6e..10137d23 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala @@ -16,6 +16,7 @@ package za.co.absa.atum.persistence.s3 import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider} +import software.amazon.awssdk.core.exception.SdkException import software.amazon.awssdk.core.sync.RequestBody import software.amazon.awssdk.services.s3.S3Client import software.amazon.awssdk.services.s3.model.PutObjectRequest @@ -26,14 +27,21 @@ import za.co.absa.atum.utils.S3Utils /** * A storer of control measurements to a JSON file stored in AWS S3. * - * @param outputLocation s3 location to save measurements data to - * @param kmsSettings KMS settings - server side encryption configuration - * @param credentialsProvider a specific credentials provider (e.g. SAML profile). use [[DefaultCredentialsProvider]] when in doubt + * @param outputLocation s3 location to save measurements data to + * @param kmsSettings KMS settings - server side encryption configuration + * @param credentialsProvider a specific credentials provider (e.g. SAML profile). Consider using [[DefaultCredentialsProvider#create()]] when in doubt. 
*/ class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S3KmsSettings) (implicit credentialsProvider: AwsCredentialsProvider) extends ControlMeasuresStorer { + + /** + * Stores the `controlInfo` measurement to an S3 location. + * + * @param controlInfo measurements to store + * @throws SdkException when storing fails. + */ override def store(controlInfo: ControlMeasure): Unit = { - val serialized = ControlMeasuresParser asJson controlInfo + val serialized = ControlMeasuresParser asJson controlInfo saveDataToFile(serialized) } diff --git a/atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonSpec.scala b/atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonSpec.scala index 66f65e9d..ba5ee1cc 100644 --- a/atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonSpec.scala @@ -49,7 +49,7 @@ class ControlMeasuresS3StorerJsonSpec extends AnyFlatSpec with Matchers with Idi capturedPutRequest.ssekmsKeyId shouldBe "testingKeyId123" capturedPutRequest.serverSideEncryption() shouldBe ServerSideEncryption.AWS_KMS - // This expected request body content should be the same as content of this file ( ~inputControlMeasure) + // This expected request body content should be the same as content of this file (conforms to `inputControlMeasure`) val sameContentFile: String = TestResources.InputInfo.localPath val expectedContent = FileUtils.readFileToString(sameContentFile) From def1d730cf99d8daccf5e0cedc542fa0be56ef46 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Thu, 3 Sep 2020 09:47:15 +0200 Subject: [PATCH 14/18] storeCurrentInfoFile divided into HDFS and S3 version "implicit def StringToPath" changed to an implicit class wrapper. version update to 0.3.0-SNAPSHOT --- atum/pom.xml | 2 +- .../scala/za/co/absa/atum/AtumImplicits.scala | 17 ++++++++++++----- .../absa/atum/core/ControlFrameworkState.scala | 12 ++++++++++-- .../atum/core/SparkQueryExecutionListener.scala | 2 +- examples/pom.xml | 2 +- pom.xml | 2 +- 6 files changed, 26 insertions(+), 11 deletions(-) diff --git a/atum/pom.xml b/atum/pom.xml index c84f0646..18a91fa4 100644 --- a/atum/pom.xml +++ b/atum/pom.xml @@ -24,7 +24,7 @@ za.co.absa atum-parent - 0.2.7-SNAPSHOT + 0.3.0-SNAPSHOT diff --git a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala index aa14dc7e..f85d0d0c 100644 --- a/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala +++ b/atum/src/main/scala/za/co/absa/atum/AtumImplicits.scala @@ -50,7 +50,9 @@ object AtumImplicits { type DefaultControlInfoStorer = ControlMeasuresHdfsStorerJsonFile type DefaultControlInfoLoader = ControlMeasuresHdfsLoaderJsonFile - implicit def StringToPath(path: String): Path = new Path(path) + implicit class StringPathExt(path: String) { + def toPath: Path = new Path(path) + } /** * The class contains implicit methods for [[org.apache.spark.sql.SparkSession]]. @@ -67,7 +69,7 @@ object AtumImplicits { } /** - * Enable control measurements tracking. + * Enable control measurements tracking on HDFS. 
* Both input and output info file paths need to be provided * * Example info file path name: "data/input/wikidata.csv.info" @@ -79,8 +81,8 @@ object AtumImplicits { destinationInfoFile: String = ""): SparkSession = { val hadoopConfiguration = sparkSession.sparkContext.hadoopConfiguration - val loader = if (sourceInfoFile.isEmpty) None else Some(new DefaultControlInfoLoader(hadoopConfiguration, sourceInfoFile)) - val storer = if (destinationInfoFile.isEmpty) None else Some(new DefaultControlInfoStorer(hadoopConfiguration, destinationInfoFile)) + val loader = if (sourceInfoFile.isEmpty) None else Some(new DefaultControlInfoLoader(hadoopConfiguration, sourceInfoFile.toPath)) + val storer = if (destinationInfoFile.isEmpty) None else Some(new DefaultControlInfoStorer(hadoopConfiguration, destinationInfoFile.toPath)) enableControlMeasuresTracking(loader, storer) } @@ -311,7 +313,12 @@ object AtumImplicits { * @param outputPath A directory or a file name to save the info file to. */ def writeInfoFile(outputPath: String): Dataset[Row] = { - Atum.controlFrameworkState.storeCurrentInfoFile(outputPath) + Atum.controlFrameworkState.storeCurrentInfoFileOnHdfs(outputPath.toPath) + dataset + } + + def writeInfoFileOnS3(s3Location: S3Location, s3KmsSettings: S3KmsSettings)(implicit credentialsProvider: AwsCredentialsProvider): Dataset[Row] = { + Atum.controlFrameworkState.storeCurrentInfoFileOnS3(s3Location, s3KmsSettings) dataset } diff --git a/atum/src/main/scala/za/co/absa/atum/core/ControlFrameworkState.scala b/atum/src/main/scala/za/co/absa/atum/core/ControlFrameworkState.scala index 560a08a4..6f2535ec 100644 --- a/atum/src/main/scala/za/co/absa/atum/core/ControlFrameworkState.scala +++ b/atum/src/main/scala/za/co/absa/atum/core/ControlFrameworkState.scala @@ -19,12 +19,14 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.storage.StorageLevel +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider import za.co.absa.atum.AtumImplicits.DefaultControlInfoLoader import za.co.absa.atum.core.Atum.log import za.co.absa.atum.core.ControlType.Count import za.co.absa.atum.model.{RunError, RunState, _} import za.co.absa.atum.persistence.hdfs.ControlMeasuresHdfsStorerJsonFile -import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresStorer} +import za.co.absa.atum.persistence.s3.ControlMeasuresS3StorerJsonFile +import za.co.absa.atum.persistence.{ControlMeasuresLoader, ControlMeasuresStorer, S3KmsSettings, S3Location} import za.co.absa.atum.plugins.EventListener import za.co.absa.atum.utils.ExecutionPlanUtils.inferInputInfoFileName @@ -246,7 +248,13 @@ class ControlFrameworkState(sparkSession: SparkSession) { } } - private[atum] def storeCurrentInfoFile(outputHDFSPathFileName: Path, hadoopConfiguration: Configuration = sparkSession.sparkContext.hadoopConfiguration): Unit = { + private[atum] def storeCurrentInfoFileOnS3(s3Location: S3Location, s3KmsSettings: S3KmsSettings)(implicit credentialsProvider: AwsCredentialsProvider): Unit = { + val storer = new ControlMeasuresS3StorerJsonFile(s3Location, s3KmsSettings) + storer.store(accumulator.getControlMeasure) + Atum.log.info(s"Control measurements saved to ${s3Location.s3String()}") + } + + private[atum] def storeCurrentInfoFileOnHdfs(outputHDFSPathFileName: Path, hadoopConfiguration: Configuration = sparkSession.sparkContext.hadoopConfiguration): Unit = { val fs = FileSystem.get(hadoopConfiguration) val 
outputFilePath = if (fs.isDirectory(outputHDFSPathFileName)) { new Path(outputHDFSPathFileName, outputInfoFileName) diff --git a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala index f77d7711..7f0781fd 100644 --- a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala +++ b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala @@ -54,7 +54,7 @@ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecut // Write _INFO file to the output directory infoFilePath.foreach(path => { Atum.log.info(s"Infered _INFO Path = ${path.toUri.toString}") - cf.storeCurrentInfoFile(path, qe.sparkSession.sparkContext.hadoopConfiguration) + cf.storeCurrentInfoFileOnHdfs(path, qe.sparkSession.sparkContext.hadoopConfiguration) }) // Write _INFO file to a registered storer diff --git a/examples/pom.xml b/examples/pom.xml index fc19dbaf..ffd960f0 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -24,7 +24,7 @@ za.co.absa atum-parent - 0.2.7-SNAPSHOT + 0.3.0-SNAPSHOT diff --git a/pom.xml b/pom.xml index 17b6fa87..34cabef9 100644 --- a/pom.xml +++ b/pom.xml @@ -20,7 +20,7 @@ za.co.absa atum-parent - 0.2.7-SNAPSHOT + 0.3.0-SNAPSHOT pom From f6096868be5a3558975fe0fb7c777d68414a291f Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Thu, 3 Sep 2020 17:09:50 +0200 Subject: [PATCH 15/18] writeInfoFileForQueryForS3 vs writeInfoFileForQuery - based on used storer (s3 based or not). may expose the kmsKeyId this way, though! --- .../za/co/absa/atum/core/Accumulator.scala | 6 +++ .../core/SparkQueryExecutionListener.scala | 31 +++++++++-- .../persistence/ControlMeasuresStorer.scala | 5 ++ .../s3/ControlMeasuresS3StorerJsonFile.scala | 6 +-- .../absa/atum/persistence/s3/S3Location.scala | 2 +- .../absa/atum/utils/ExecutionPlanUtils.scala | 26 +++++++++- .../za/co/absa/atum/utils/FileUtils.scala | 6 +++ .../atum/utils/ExecutionPlanUtilsSuite.scala | 52 +++++++++++++++++++ .../za/co/absa/atum/utils/FileUtilsSpec.scala | 17 ++++++ .../atum/examples/SampleS3Measurements1.scala | 3 +- .../atum/examples/SampleS3Measurements2.scala | 3 +- 11 files changed, 145 insertions(+), 12 deletions(-) create mode 100644 atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala create mode 100644 atum/src/test/scala/za/co/absa/atum/utils/FileUtilsSpec.scala diff --git a/atum/src/main/scala/za/co/absa/atum/core/Accumulator.scala b/atum/src/main/scala/za/co/absa/atum/core/Accumulator.scala index ea60572b..ca043cef 100644 --- a/atum/src/main/scala/za/co/absa/atum/core/Accumulator.scala +++ b/atum/src/main/scala/za/co/absa/atum/core/Accumulator.scala @@ -60,6 +60,12 @@ class Accumulator() { } } + /** + * Ability to view the storer if set. + * @return + */ + private[atum] def getStorer: Option[ControlMeasuresStorer] = if (isStorerLoaded) Some(storer) else None + /** * The method returns Control Info object in which checkpoints are sorted by calculation order. 
*/ diff --git a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala index 7f0781fd..6bfd2999 100644 --- a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala +++ b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala @@ -19,22 +19,29 @@ import java.io.{PrintWriter, StringWriter} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener -import za.co.absa.atum.persistence.hdfs.ControlMeasuresHdfsStorerJsonFile +import software.amazon.awssdk.regions.Region +import za.co.absa.atum.persistence.{S3ControlMeasuresStorer, S3KmsSettings} import za.co.absa.atum.utils.ExecutionPlanUtils.{inferOutputFileName, inferOutputInfoFileName} /** - * The class is responsible for listening to DataSet save events and outputting correcpoiding control measurements. + * The class is responsible for listening to DataSet save events and outputting corresponding control measurements. */ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecutionListener { override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { if (funcName == "save") { - writeInfoFileForQuery(qe) + + cf.accumulator.getStorer match { + case Some(s3storer: S3ControlMeasuresStorer) => + writeInfoFileForQueryForS3(qe, s3storer.outputLocation.region, s3storer.kmsSettings) + case _ => writeInfoFileForQuery(qe) + } // Notify listeners cf.updateRunCheckpoints(saveInfoFile = true) cf.updateStatusSuccess() + updateSplineRef(qe) } } @@ -53,7 +60,23 @@ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecut // Write _INFO file to the output directory infoFilePath.foreach(path => { - Atum.log.info(s"Infered _INFO Path = ${path.toUri.toString}") + Atum.log.info(s"Inferred _INFO Path = ${path.toUri.toString}") + cf.storeCurrentInfoFileOnHdfs(path, qe.sparkSession.sparkContext.hadoopConfiguration) + }) + + // Write _INFO file to a registered storer + if (cf.accumulator.isStorerLoaded) { + cf.accumulator.store() + } + } + + /** Write _INFO file with control measurements to the output directory based on the query plan */ + private def writeInfoFileForQueryForS3(qe: QueryExecution, region: Region, kmsSettings: S3KmsSettings): Unit = { + val infoFilePath = inferOutputInfoFileName(qe, cf.outputInfoFileName) + + // Write _INFO file to the output directory + infoFilePath.foreach(path => { + Atum.log.info(s"Inferred _INFO Path = ${path.toUri.toString}") cf.storeCurrentInfoFileOnHdfs(path, qe.sparkSession.sparkContext.hadoopConfiguration) }) diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala b/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala index 483b4b27..0a84f3cf 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala @@ -22,3 +22,8 @@ trait ControlMeasuresStorer { def store(controlInfo: ControlMeasure): Unit def getInfo: String } + +trait S3ControlMeasuresStorer extends ControlMeasuresStorer { + def kmsSettings: S3KmsSettings + def outputLocation: S3Location +} diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala index 10137d23..cf74456c 100644 --- 
a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala @@ -21,7 +21,7 @@ import software.amazon.awssdk.core.sync.RequestBody import software.amazon.awssdk.services.s3.S3Client import software.amazon.awssdk.services.s3.model.PutObjectRequest import za.co.absa.atum.model.ControlMeasure -import za.co.absa.atum.persistence.{ControlMeasuresParser, ControlMeasuresStorer, S3KmsSettings, S3Location} +import za.co.absa.atum.persistence.{ControlMeasuresParser, S3ControlMeasuresStorer, S3KmsSettings, S3Location} import za.co.absa.atum.utils.S3Utils /** @@ -31,8 +31,8 @@ import za.co.absa.atum.utils.S3Utils * @param kmsSettings KMS settings - server side encryption configuration * @param credentialsProvider a specific credentials provider (e.g. SAML profile). Consider using [[DefaultCredentialsProvider#create()]] when in doubt. */ -class ControlMeasuresS3StorerJsonFile(outputLocation: S3Location, kmsSettings: S3KmsSettings) - (implicit credentialsProvider: AwsCredentialsProvider) extends ControlMeasuresStorer { +class ControlMeasuresS3StorerJsonFile(val outputLocation: S3Location, val kmsSettings: S3KmsSettings) + (implicit credentialsProvider: AwsCredentialsProvider) extends S3ControlMeasuresStorer { /** * Stores the `controlInfo` measurement to an S3 location. diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/S3Location.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/S3Location.scala index dde5aa99..4ecd496f 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/S3Location.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/S3Location.scala @@ -3,7 +3,7 @@ package za.co.absa.atum.persistence import software.amazon.awssdk.regions.Region import software.amazon.awssdk.services.s3.model.ServerSideEncryption -case class S3Location(bucketName: String, path: String, region: Region = Region.EU_WEST_1) { +case class S3Location(bucketName: String, path: String, region: Region) { /** * Returns formatted S3 string, e.g. `s3://myBucket/path/to/somewhere` * @param protocol http "s3" protocol, e.g. s3, s3n, s3a. Default = "s3". diff --git a/atum/src/main/scala/za/co/absa/atum/utils/ExecutionPlanUtils.scala b/atum/src/main/scala/za/co/absa/atum/utils/ExecutionPlanUtils.scala index 316683c5..17223c48 100644 --- a/atum/src/main/scala/za/co/absa/atum/utils/ExecutionPlanUtils.scala +++ b/atum/src/main/scala/za/co/absa/atum/utils/ExecutionPlanUtils.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, InsertIntoHadoopFsRelationCommand, LogicalRelation, SaveIntoDataSourceCommand} import org.apache.spark.sql.{Dataset, Row} import za.co.absa.atum.core.Constants +import za.co.absa.atum.utils.FileUtils.PathJoin /** * This object contains utils for traversing execution plan DAG to infer control measurement input/output paths @@ -90,7 +91,7 @@ object ExecutionPlanUtils { } /** - * The method returns output control measurements info file name inferred from the source dataset + * The method returns output control measurements info file name inferred from the source dataset on HDFS * * @param qe A query execution object where output path name will be searched * @param infoFileName A file name of an info file, e.g. 
"_INFO" @@ -112,7 +113,28 @@ object ExecutionPlanUtils { } } - /** + /** + * The method returns output control measurements info file name inferred from the source dataset on S3 + * + * @param qe A query execution object where output path name will be searched + * @param infoFileName A file name of an info file, e.g. "_INFO" + * + * @return The inferred output control measurements file path of the source dataset + */ + def inferOutputInfoFileNameOnS3(qe: QueryExecution, infoFileName: String = Constants.DefaultInfoFileName): Option[String] = { + qe.analyzed match { + case s: SaveIntoDataSourceCommand => + Some(s.options("path") / infoFileName) + case _ => + log.warn(s"Logical plan: ${qe.logical.treeString}") + log.warn(s"Analyzed plan: ${qe.analyzed.treeString}") + log.warn(s"Optimized plan: ${qe.optimizedPlan.treeString}") + log.error(s"Unable to infer output path for control measurements for query execution $qe.") + None + } + } + + /** * The method returns source file names of a DataSet execution plan by traversing the DAG. * Thanks za.co.absa.spline.core * diff --git a/atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala b/atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala index 80d7ea5d..b179f46d 100644 --- a/atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala +++ b/atum/src/main/scala/za/co/absa/atum/utils/FileUtils.scala @@ -9,4 +9,10 @@ object FileUtils { str } + implicit class PathJoin(path: String) { + def /(pathSegment: String): String = { + s"${path.stripSuffix("/")}/${pathSegment.stripPrefix("/")}" + } + } + } diff --git a/atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala b/atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala new file mode 100644 index 00000000..91fffa8d --- /dev/null +++ b/atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala @@ -0,0 +1,52 @@ +package za.co.absa.atum.utils + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand +import org.mockito.Mockito +import org.mockito.scalatest.IdiomaticMockito +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class ExecutionPlanUtilsSuite extends AnyFlatSpec with Matchers with IdiomaticMockito { + + val hadoopConf = new Configuration + + "inferOutputInfoFileName" should "derive output file name for HDFS from SaveIntoDataSourceCommand" in { + val qe = mock[QueryExecution] + Mockito.when(qe.analyzed).thenReturn( + SaveIntoDataSourceCommand(null, null, options = Map(("path", "/tmp")), null) + ) + + ExecutionPlanUtils.inferOutputFileName(qe, hadoopConf).get.simplePath shouldBe "/tmp" + } + + "inferOutputInfoFileName" should "derive output info file name for HDFS from SaveIntoDataSourceCommand" in { + val qe = mock[QueryExecution] + val myInfoName = "myInfo" + Mockito.when(qe.analyzed).thenReturn( + SaveIntoDataSourceCommand(null, null, options = Map(("path", "/tmp/here")), null) + ) + + ExecutionPlanUtils.inferOutputInfoFileName(qe, myInfoName).get.simplePath shouldBe "/tmp/here/myInfo" + } + + "inferOutputInfoFileNameOnS3" should "derive output info file name for S3 from SaveIntoDataSourceCommand" in { + val qe = mock[QueryExecution] + val myInfoName = "myInfo" + Mockito.when(qe.analyzed).thenReturn( + // training slash should get taken care of + SaveIntoDataSourceCommand(null, null, options = Map(("path", "/tmp/here2/")), null) + ) + + 
ExecutionPlanUtils.inferOutputInfoFileNameOnS3(qe, myInfoName).get shouldBe "/tmp/here2/myInfo" + } + + + implicit class SimplePath(path: Path) { + // disregarding hdfs nameserver prefix or local FS fallback (file://) + def simplePath: String = path.toUri.getPath + } + +} diff --git a/atum/src/test/scala/za/co/absa/atum/utils/FileUtilsSpec.scala b/atum/src/test/scala/za/co/absa/atum/utils/FileUtilsSpec.scala new file mode 100644 index 00000000..6e723046 --- /dev/null +++ b/atum/src/test/scala/za/co/absa/atum/utils/FileUtilsSpec.scala @@ -0,0 +1,17 @@ +package za.co.absa.atum.utils + +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class FileUtilsSpec extends AnyFlatSpec with Matchers { + + "PathJoin" should "join paths correctly" in { + + import za.co.absa.atum.utils.FileUtils.PathJoin + "/path/to" / "file" shouldBe "/path/to/file" + "/path/to/" / "file" shouldBe "/path/to/file" + "/path/to" / "/file" shouldBe "/path/to/file" + "/path/to/" / "/file" shouldBe "/path/to/file" + + } +} diff --git a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala index 471338b6..ea51455f 100644 --- a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala +++ b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements1.scala @@ -16,6 +16,7 @@ package za.co.absa.atum.examples import org.apache.spark.sql.{SaveMode, SparkSession} +import software.amazon.awssdk.regions.Region import za.co.absa.atum.AtumImplicits._ import za.co.absa.atum.persistence.S3Location import za.co.absa.atum.utils.S3Utils @@ -34,7 +35,7 @@ object SampleS3Measurements1 { // Initializing library to hook up to Apache Spark spark.enableControlMeasuresTrackingForS3( - sourceS3Location = Some(S3Location("euw1-ctodatadev-dev-bigdatarnd-s3-poc", "atum/input/wikidata.csv.info")), + sourceS3Location = Some(S3Location("my-bucket", "atum/input/wikidata.csv.info", Region.EU_WEST_1)), destinationS3Config = None ).setControlMeasuresWorkflow("Job 1 S3 ") diff --git a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala index 51be89ec..947119f9 100644 --- a/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala +++ b/examples/src/main/scala/za/co/absa/atum/examples/SampleS3Measurements2.scala @@ -16,6 +16,7 @@ package za.co.absa.atum.examples import org.apache.spark.sql.{SaveMode, SparkSession} +import software.amazon.awssdk.regions.Region import za.co.absa.atum.AtumImplicits._ import za.co.absa.atum.core.Atum import za.co.absa.atum.persistence.{S3KmsSettings, S3Location} @@ -42,7 +43,7 @@ object SampleS3Measurements2 { spark.enableControlMeasuresTrackingForS3( sourceS3Location = None, destinationS3Config = Some( - S3Location("euw1-ctodatadev-dev-bigdatarnd-s3-poc", "atum/output/wikidata.csv.info"), + S3Location("my-bucket", "atum/output/wikidata.csv.info", Region.EU_WEST_1), S3KmsSettings(kmsKeyId) ) ) .setControlMeasuresWorkflow("Job 2") From 60c1be417138e03d6e289b2b40b344bc2bf20b20 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Fri, 4 Sep 2020 10:12:57 +0200 Subject: [PATCH 16/18] writeInfoFileForQueryForS3 vs writeInfoFileForQuery - fix todo = s3Location work, remove extensive logging --- .../core/SparkQueryExecutionListener.scala | 36 +++++++++++++------ .../persistence/ControlMeasuresStorer.scala | 3 ++ 
.../s3/ControlMeasuresS3StorerJsonFile.scala | 2 +- .../scala/za/co/absa/atum/utils/S3Utils.scala | 22 ++++++++++++ 4 files changed, 51 insertions(+), 12 deletions(-) diff --git a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala index 6bfd2999..24919248 100644 --- a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala +++ b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala @@ -19,13 +19,15 @@ import java.io.{PrintWriter, StringWriter} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider import software.amazon.awssdk.regions.Region -import za.co.absa.atum.persistence.{S3ControlMeasuresStorer, S3KmsSettings} -import za.co.absa.atum.utils.ExecutionPlanUtils.{inferOutputFileName, inferOutputInfoFileName} +import za.co.absa.atum.persistence.{S3ControlMeasuresStorer, S3KmsSettings, S3Location} +import za.co.absa.atum.utils.ExecutionPlanUtils._ +import za.co.absa.atum.utils.S3Utils /** - * The class is responsible for listening to DataSet save events and outputting corresponding control measurements. - */ + * The class is responsible for listening to DataSet save events and outputting corresponding control measurements. + */ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecutionListener { override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { @@ -33,15 +35,22 @@ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecut cf.accumulator.getStorer match { case Some(s3storer: S3ControlMeasuresStorer) => - writeInfoFileForQueryForS3(qe, s3storer.outputLocation.region, s3storer.kmsSettings) - case _ => writeInfoFileForQuery(qe) + // todo remove extra logging? 
+ Atum.log.info("SparkQueryExecutionListener.onSuccess: S3ControlMeasuresStorer") + writeInfoFileForQueryForS3(qe, s3storer.outputLocation.region, s3storer.kmsSettings)(s3storer.credentialsProvider) + + case Some(otherStorer) => + Atum.log.info(s"SparkQueryExecutionListener.onSuccess: $otherStorer") + writeInfoFileForQuery(qe) + case None => + Atum.log.info(s"SparkQueryExecutionListener.onSuccess: ") + writeInfoFileForQuery(qe) } // Notify listeners cf.updateRunCheckpoints(saveInfoFile = true) cf.updateStatusSuccess() - updateSplineRef(qe) } } @@ -71,13 +80,18 @@ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecut } /** Write _INFO file with control measurements to the output directory based on the query plan */ - private def writeInfoFileForQueryForS3(qe: QueryExecution, region: Region, kmsSettings: S3KmsSettings): Unit = { - val infoFilePath = inferOutputInfoFileName(qe, cf.outputInfoFileName) + private def writeInfoFileForQueryForS3(qe: QueryExecution, region: Region, kmsSettings: S3KmsSettings)(implicit credentialsProvider: AwsCredentialsProvider): Unit = { + val infoFilePath = inferOutputInfoFileNameOnS3(qe, cf.outputInfoFileName) // Write _INFO file to the output directory infoFilePath.foreach(path => { - Atum.log.info(s"Inferred _INFO Path = ${path.toUri.toString}") - cf.storeCurrentInfoFileOnHdfs(path, qe.sparkSession.sparkContext.hadoopConfiguration) + + import S3Utils.StringS3LocationExt + val location = path.toS3Location(region) + + Atum.log.info(s"Inferred _INFO Location = $location") + + cf.storeCurrentInfoFileOnS3(location, kmsSettings) }) // Write _INFO file to a registered storer diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala b/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala index 0a84f3cf..2ea99afd 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/ControlMeasuresStorer.scala @@ -15,6 +15,7 @@ package za.co.absa.atum.persistence +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider import za.co.absa.atum.model.ControlMeasure /** Trait for control measurements saving to a persistent storage */ @@ -26,4 +27,6 @@ trait ControlMeasuresStorer { trait S3ControlMeasuresStorer extends ControlMeasuresStorer { def kmsSettings: S3KmsSettings def outputLocation: S3Location + + def credentialsProvider: AwsCredentialsProvider } diff --git a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala index cf74456c..03cb9dc7 100644 --- a/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala +++ b/atum/src/main/scala/za/co/absa/atum/persistence/s3/ControlMeasuresS3StorerJsonFile.scala @@ -32,7 +32,7 @@ import za.co.absa.atum.utils.S3Utils * @param credentialsProvider a specific credentials provider (e.g. SAML profile). Consider using [[DefaultCredentialsProvider#create()]] when in doubt. */ class ControlMeasuresS3StorerJsonFile(val outputLocation: S3Location, val kmsSettings: S3KmsSettings) - (implicit credentialsProvider: AwsCredentialsProvider) extends S3ControlMeasuresStorer { + (implicit val credentialsProvider: AwsCredentialsProvider) extends S3ControlMeasuresStorer { /** * Stores the `controlInfo` measurement to an S3 location. 
diff --git a/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala b/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala index 4f3c0aef..0544eb1a 100644 --- a/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala +++ b/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala @@ -4,6 +4,7 @@ import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, ProfileC import software.amazon.awssdk.regions.Region import software.amazon.awssdk.services.s3.S3Client import za.co.absa.atum.core.Atum.log +import za.co.absa.atum.persistence.S3Location object S3Utils { @@ -22,4 +23,25 @@ object S3Utils { .build() } + // todo test/move + // hint: https://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html#bucketnamingrules + val S3LocationRx = "s3(?:a|n)?://([-a-z0-9.]{3,63})/(.*)".r + + def isValidS3Path(path: String): Boolean = path match { + case S3LocationRx(_, _) => true + case _ => false + } + + implicit class StringS3LocationExt(path: String) { + + def toS3Location(withRegion: Region): S3Location = { + path match { + case S3LocationRx(bucketName, path) => S3Location(bucketName, path, withRegion) + case _ => throw new IllegalArgumentException(s"Could not parse S3 Location from $path using rx $S3LocationRx.") + } + } + + def isValidS3Path: Boolean = S3Utils.isValidS3Path(path) + } + } From 5d051b93718629cbab6d4dcef3acff151285aed2 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Mon, 7 Sep 2020 10:36:52 +0200 Subject: [PATCH 17/18] SparkQueryExecutionListerner.onSuccess - only writing info with a defined storer, no storing otherwise. toS3Location test added --- .../core/SparkQueryExecutionListener.scala | 16 +++++----- .../scala/za/co/absa/atum/utils/S3Utils.scala | 7 ----- .../za/co/absa/atum/utils/S3UtilsSpec.scala | 31 +++++++++++++++++++ 3 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 atum/src/test/scala/za/co/absa/atum/utils/S3UtilsSpec.scala diff --git a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala index 24919248..d62e1f59 100644 --- a/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala +++ b/atum/src/main/scala/za/co/absa/atum/core/SparkQueryExecutionListener.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.util.QueryExecutionListener import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider import software.amazon.awssdk.regions.Region -import za.co.absa.atum.persistence.{S3ControlMeasuresStorer, S3KmsSettings, S3Location} +import za.co.absa.atum.persistence.{S3ControlMeasuresStorer, S3KmsSettings} import za.co.absa.atum.utils.ExecutionPlanUtils._ import za.co.absa.atum.utils.S3Utils @@ -35,16 +35,15 @@ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecut cf.accumulator.getStorer match { case Some(s3storer: S3ControlMeasuresStorer) => - // todo remove extra logging? 
- Atum.log.info("SparkQueryExecutionListener.onSuccess: S3ControlMeasuresStorer") + Atum.log.debug(s"SparkQueryExecutionListener.onSuccess for S3ControlMeasuresStorer: writing to ${s3storer.outputLocation.s3String()}") writeInfoFileForQueryForS3(qe, s3storer.outputLocation.region, s3storer.kmsSettings)(s3storer.credentialsProvider) - case Some(otherStorer) => - Atum.log.info(s"SparkQueryExecutionListener.onSuccess: $otherStorer") + case Some(_) => + Atum.log.debug(s"SparkQueryExecutionListener.onSuccess: writing to HDFS") writeInfoFileForQuery(qe) + case None => - Atum.log.info(s"SparkQueryExecutionListener.onSuccess: ") - writeInfoFileForQuery(qe) + Atum.log.info("No storer is set, therefore no data will be written the automatically with DF-save to an _INFO file.") } // Notify listeners @@ -89,8 +88,7 @@ class SparkQueryExecutionListener(cf: ControlFrameworkState) extends QueryExecut import S3Utils.StringS3LocationExt val location = path.toS3Location(region) - Atum.log.info(s"Inferred _INFO Location = $location") - + Atum.log.debug(s"Inferred _INFO Location = $location") cf.storeCurrentInfoFileOnS3(location, kmsSettings) }) diff --git a/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala b/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala index 0544eb1a..22525a56 100644 --- a/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala +++ b/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala @@ -23,15 +23,9 @@ object S3Utils { .build() } - // todo test/move // hint: https://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html#bucketnamingrules val S3LocationRx = "s3(?:a|n)?://([-a-z0-9.]{3,63})/(.*)".r - def isValidS3Path(path: String): Boolean = path match { - case S3LocationRx(_, _) => true - case _ => false - } - implicit class StringS3LocationExt(path: String) { def toS3Location(withRegion: Region): S3Location = { @@ -41,7 +35,6 @@ object S3Utils { } } - def isValidS3Path: Boolean = S3Utils.isValidS3Path(path) } } diff --git a/atum/src/test/scala/za/co/absa/atum/utils/S3UtilsSpec.scala b/atum/src/test/scala/za/co/absa/atum/utils/S3UtilsSpec.scala new file mode 100644 index 00000000..52817c8e --- /dev/null +++ b/atum/src/test/scala/za/co/absa/atum/utils/S3UtilsSpec.scala @@ -0,0 +1,31 @@ +package za.co.absa.atum.utils + +import org.scalatest.flatspec.AnyFlatSpec +import software.amazon.awssdk.regions.Region +import za.co.absa.atum.persistence.S3Location +import S3Utils.StringS3LocationExt +import org.scalatest.matchers.should.Matchers + +class S3UtilsSpec extends AnyFlatSpec with Matchers { + + val region1 = Region.EU_WEST_1 + + val validPathsWithExpectedLocations = Seq( + // (path, expected parsed value) + ("s3://mybucket-123/path/to/file.ext", S3Location("mybucket-123", "path/to/file.ext", region1)), + ("s3n://mybucket-123/path/to/ends/with/slash/", S3Location("mybucket-123", "path/to/ends/with/slash/", region1)), + ("s3a://mybucket-123.asdf.cz/path-to-$_file!@#$.ext", S3Location("mybucket-123.asdf.cz", "path-to-$_file!@#$.ext", region1)) + ) + + val invalidPaths = Seq( + "s3x://mybucket-123/path/to/file/on/invalid/prefix", + "s3://bb/some/path/but/bucketname/too/short" + ) + + "S3Utils.StringS3LocationExt" should "parse S3 path from String using toS3Location" in { + validPathsWithExpectedLocations.foreach { case (path, expectedLocation) => + path.toS3Location(region1) shouldBe expectedLocation + } + } + +} From 3fa219825fc36192ac709dc441ae3bd5071a4f34 Mon Sep 17 00:00:00 2001 From: Daniel Kavan Date: Mon, 7 Sep 2020 16:15:32 +0200 Subject: [PATCH 18/18] 
isValidS3Path added to S3Utils, test updates --- .../scala/za/co/absa/atum/utils/S3Utils.scala | 4 ++++ .../atum/utils/ExecutionPlanUtilsSuite.scala | 11 ++++++----- .../za/co/absa/atum/utils/S3UtilsSpec.scala | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala b/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala index 22525a56..a55b6009 100644 --- a/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala +++ b/atum/src/main/scala/za/co/absa/atum/utils/S3Utils.scala @@ -35,6 +35,10 @@ object S3Utils { } } + def isValidS3Path: Boolean = path match { + case S3LocationRx(_, _) => true + case _ => false + } } } diff --git a/atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala b/atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala index 91fffa8d..28644135 100644 --- a/atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala +++ b/atum/src/test/scala/za/co/absa/atum/utils/ExecutionPlanUtilsSuite.scala @@ -13,6 +13,11 @@ class ExecutionPlanUtilsSuite extends AnyFlatSpec with Matchers with IdiomaticMo val hadoopConf = new Configuration + implicit class SimplePath(path: Path) { + // disregarding hdfs nameserver prefix or local FS fallback (file://) + def simplePath: String = path.toUri.getPath + } + "inferOutputInfoFileName" should "derive output file name for HDFS from SaveIntoDataSourceCommand" in { val qe = mock[QueryExecution] Mockito.when(qe.analyzed).thenReturn( @@ -36,7 +41,7 @@ class ExecutionPlanUtilsSuite extends AnyFlatSpec with Matchers with IdiomaticMo val qe = mock[QueryExecution] val myInfoName = "myInfo" Mockito.when(qe.analyzed).thenReturn( - // training slash should get taken care of + // trailing slash should get taken care of SaveIntoDataSourceCommand(null, null, options = Map(("path", "/tmp/here2/")), null) ) @@ -44,9 +49,5 @@ class ExecutionPlanUtilsSuite extends AnyFlatSpec with Matchers with IdiomaticMo } - implicit class SimplePath(path: Path) { - // disregarding hdfs nameserver prefix or local FS fallback (file://) - def simplePath: String = path.toUri.getPath - } } diff --git a/atum/src/test/scala/za/co/absa/atum/utils/S3UtilsSpec.scala b/atum/src/test/scala/za/co/absa/atum/utils/S3UtilsSpec.scala index 52817c8e..293e19f4 100644 --- a/atum/src/test/scala/za/co/absa/atum/utils/S3UtilsSpec.scala +++ b/atum/src/test/scala/za/co/absa/atum/utils/S3UtilsSpec.scala @@ -28,4 +28,20 @@ class S3UtilsSpec extends AnyFlatSpec with Matchers { } } + it should "fail parsing invalid S3 path from String using toS3Location" in { + invalidPaths.foreach { path => + assertThrows[IllegalArgumentException] { + path.toS3Location(region1) + } + } + } + + it should "check path using isValidS3Path" in { + validPathsWithExpectedLocations.map(_._1).foreach { path => + path.isValidS3Path shouldBe true + } + + invalidPaths.foreach(_.isValidS3Path shouldBe false) + } + }
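
Taken together, the patches above introduce the S3 loader/storer pair, the `toS3Location`/`isValidS3Path` string helpers, and the explicit `writeInfoFileOnS3` call. The following sketch is not part of the patch series; the bucket name, object paths and KMS key ARN are made up. It shows one way these pieces could be combined in a job that loads measurements from S3 and writes the resulting _INFO file explicitly instead of relying on the save-event listener:

```scala
import org.apache.spark.sql.SparkSession
import software.amazon.awssdk.auth.credentials.{AwsCredentialsProvider, DefaultCredentialsProvider}
import software.amazon.awssdk.regions.Region
import za.co.absa.atum.AtumImplicits._
import za.co.absa.atum.persistence.{S3KmsSettings, S3Location}
import za.co.absa.atum.utils.S3Utils.StringS3LocationExt

object ExplicitS3InfoFileJob {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Explicit S3 _INFO write").getOrCreate()

    // Default credentials provider chain (EMR/EC2 instance role, profile, env vars, ...)
    implicit val credentialsProvider: AwsCredentialsProvider = DefaultCredentialsProvider.create()

    // The new string extension parses a full s3:// URI into an S3Location
    val source: S3Location = "s3://my-bucket/atum/input/data.csv.info".toS3Location(Region.EU_WEST_1)

    // Load the initial measurements from S3; no storer is registered here,
    // so the _INFO file is written explicitly at the end rather than on DataFrame save
    spark.enableControlMeasuresTrackingForS3(
      sourceS3Location = Some(source),
      destinationS3Config = None
    ).setControlMeasuresWorkflow("Explicit S3 _INFO write")

    val df = spark.read.option("header", "true").csv("s3://my-bucket/atum/input/data.csv") // made-up input path

    df.setCheckpoint("Loaded")
      .writeInfoFileOnS3(
        "s3://my-bucket/atum/output/data.csv.info".toS3Location(Region.EU_WEST_1),
        S3KmsSettings("arn:aws:kms:eu-west-1:123456789012:key/example-key-id") // placeholder KMS key ARN
      )
  }
}
```

Compared to the listener-driven flow in SparkQueryExecutionListener, the explicit `writeInfoFileOnS3` call keeps the destination location and KMS settings local to the job code, which may be preferable when the output is not produced by a plain `df.write.save(...)`.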