Merged

Changes from all commits (33):
a52ea07  Controller (jozefbakus, Feb 17, 2023)
bca2ff4  Service + Model (jozefbakus, Feb 17, 2023)
ccabdfd  Model (jozefbakus, Feb 17, 2023)
2b57086  KafkaService (jozefbakus, Feb 17, 2023)
520e6d7  CheckpointService (jozefbakus, Feb 17, 2023)
63a56fd  HyperdriveOffsetService (jozefbakus, Feb 17, 2023)
c4ab308  HyperdriveService (jozefbakus, Feb 17, 2023)
d1fa3af  Refactoring (jozefbakus, Feb 17, 2023)
85c0380  Backend refactoring (jozefbakus, Feb 21, 2023)
5cba289  Front end (jozefbakus, Feb 21, 2023)
4e9b246  Lint (jozefbakus, Feb 21, 2023)
6c6f3b6  Lint (jozefbakus, Feb 21, 2023)
c7fb250  Removed unused code (jozefbakus, Feb 22, 2023)
5afa29f  CheckpointService + tests (jozefbakus, Feb 22, 2023)
71eaa22  Empty line (jozefbakus, Feb 22, 2023)
c2d99a7  KafkaService + tests (jozefbakus, Feb 22, 2023)
01093e0  HyperdriveOffsetService + tests (jozefbakus, Feb 22, 2023)
449996a  HyperdriveOffsetService + tests (jozefbakus, Feb 22, 2023)
0891835  HyperdriveOffsetService + tests (jozefbakus, Feb 22, 2023)
a3a3d15  HyperdriveService + tests (jozefbakus, Feb 23, 2023)
bfcd428  UI tests (jozefbakus, Feb 23, 2023)
aac15c7  Formatting (jozefbakus, Feb 23, 2023)
54d2150  PR fixes (jozefbakus, Feb 24, 2023)
732738b  PR fixes (jozefbakus, Feb 24, 2023)
cd92e9c  PR fixes (jozefbakus, Feb 24, 2023)
b612fd4  PR fixes (jozefbakus, Feb 24, 2023)
b6bef1c  PR fixes (jozefbakus, Feb 24, 2023)
4fd8e76  Merge with Develop (jozefbakus, Feb 24, 2023)
7a4aa2e  Formatting (jozefbakus, Feb 24, 2023)
d765916  PR Fixes (jozefbakus, Mar 9, 2023)
8669327  PR Fixes (jozefbakus, Mar 9, 2023)
bf47a0e  PR Fixes (jozefbakus, Mar 9, 2023)
806a29c  PR Fixes (jozefbakus, Mar 9, 2023)
@@ -0,0 +1,32 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.hyperdrive.trigger.api.rest.controllers

import org.springframework.web.bind.annotation._
import za.co.absa.hyperdrive.trigger.api.rest.services.HyperdriveService
import za.co.absa.hyperdrive.trigger.models._

import java.util.concurrent.CompletableFuture
import javax.inject.Inject
import scala.compat.java8.FutureConverters._
import scala.concurrent.ExecutionContext.Implicits.global

@RestController
class HyperdriveController @Inject() (hyperdriveService: HyperdriveService) {
@GetMapping(path = Array("/hyperdrive/workflows/{id}/ingestionStatus"))
def getIngestionStatus(@PathVariable id: Long): CompletableFuture[Seq[IngestionStatus]] =
hyperdriveService.getIngestionStatus(id).toJava.toCompletableFuture
}
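
For orientation, a minimal sketch of the Future-to-CompletableFuture bridge the controller relies on (names below are illustrative, not part of this PR): Spring MVC resolves a returned CompletableFuture asynchronously, while the service layer keeps returning plain Scala Futures.

import java.util.concurrent.CompletableFuture

import scala.compat.java8.FutureConverters._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future

object FutureBridgeSketch extends App {
  // stand-in for a service call such as hyperdriveService.getIngestionStatus(id)
  def serviceCall(): Future[Seq[String]] = Future(Seq("status"))

  // the same conversion as in HyperdriveController above
  val completable: CompletableFuture[Seq[String]] =
    serviceCall().toJava.toCompletableFuture

  println(completable.join()) // List(status)
}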
@@ -25,7 +25,7 @@ import org.springframework.stereotype.Service
import za.co.absa.hyperdrive.trigger.api.rest.utils.ScalaUtil.swap

import javax.inject.Inject
import scala.util.Try
import scala.util.{Success, Try}

trait CheckpointService {
type TopicPartitionOffsets = Map[String, Map[Int, Long]]
@@ -34,6 +34,10 @@ trait CheckpointService {
def getLatestOffsetFilePath(params: HdfsParameters)(
implicit ugi: UserGroupInformation
): Try[Option[(String, Boolean)]]

def getLatestCommittedOffset(params: HdfsParameters)(
implicit ugi: UserGroupInformation
): Try[Option[TopicPartitionOffsets]]
}

class HdfsParameters(
@@ -98,6 +102,16 @@ class CheckpointServiceImpl @Inject() (@Lazy hdfsService: HdfsService) extends C
}
}

override def getLatestCommittedOffset(
params: HdfsParameters
)(implicit ugi: UserGroupInformation): Try[Option[TopicPartitionOffsets]] = {
getLatestCommitBatchId(params.checkpointLocation).flatMap {
_.map { latestCommit =>
getOffsetsFromFile(new Path(s"${params.checkpointLocation}/$offsetsDirName/$latestCommit").toString)
}.getOrElse(Success(None))
}
}
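
  // For context, a minimal sketch of the checkpoint layout this method assumes
  // (Spark's standard structured-streaming checkpoint; the helper below is
  // illustrative, not part of this service):
  //   <checkpointLocation>/commits/0, 1, 2, ...  one file per committed batch
  //   <checkpointLocation>/offsets/0, 1, 2, ...  OffsetSeqLog entries, e.g.
  //     v1
  //     {"batchWatermarkMs":0,"batchTimestampMs":1677060000000}
  //     {"my.topic":{"0":1500,"1":1498}}
  // getLatestCommittedOffset reads offsets/<highest committed batch id>:
  private def latestCommittedBatchIdSketch(commitFileNames: Seq[String]): Option[Long] = {
    val ids = commitFileNames.flatMap(name => scala.util.Try(name.toLong).toOption)
    if (ids.isEmpty) None else Some(ids.max)
  }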

/**
* see org.apache.spark.sql.execution.streaming.OffsetSeqLog
* and org.apache.spark.sql.kafka010.JsonUtils
@@ -26,23 +26,28 @@ import org.springframework.context.annotation.Lazy
import org.springframework.stereotype.Service
import za.co.absa.hyperdrive.trigger.configuration.application.SparkConfig
import za.co.absa.hyperdrive.trigger.models.enums.JobTypes
import za.co.absa.hyperdrive.trigger.models.{JobInstanceParameters, SparkInstanceParameters}
import za.co.absa.hyperdrive.trigger.models.{BeginningEndOffsets, JobInstanceParameters, SparkInstanceParameters}

import java.util.Properties
import javax.inject.Inject
import scala.concurrent.{ExecutionContext, Future}
import scala.util.{Failure, Success}

trait HyperdriveOffsetComparisonService {
trait HyperdriveOffsetService {
def isNewJobInstanceRequired(jobParameters: JobInstanceParameters)(implicit ec: ExecutionContext): Future[Boolean]

def getNumberOfMessagesLeft(jobParameters: JobInstanceParameters)(
implicit ec: ExecutionContext
): Future[Option[(String, Map[Int, Long])]]
}

@Service
@Lazy
class HyperdriveOffsetComparisonServiceImpl @Inject() (sparkConfig: SparkConfig,
@Lazy checkpointService: CheckpointService,
@Lazy userGroupInformationService: UserGroupInformationService,
kafkaService: KafkaService
) extends HyperdriveOffsetComparisonService {
class HyperdriveOffsetServiceImpl @Inject() (sparkConfig: SparkConfig,
@Lazy checkpointService: CheckpointService,
@Lazy userGroupInformationService: UserGroupInformationService,
kafkaService: KafkaService
) extends HyperdriveOffsetService {
private val logger = LoggerFactory.getLogger(this.getClass)
private val HyperdriveCheckpointKey = "writer.common.checkpoint.location"
private val HyperdriveKafkaTopicKey = "reader.kafka.topic"
@@ -52,6 +57,58 @@ class HyperdriveOffsetComparisonServiceImpl @Inject() (sparkConfig: SparkConfig,
private val ListDelimiter = ','
private val defaultDeserializer = "org.apache.kafka.common.serialization.StringDeserializer"

/**
* @param jobParameters Parameters for the job instance. Should contain at least
* - reader.kafka.topic
* - reader.kafka.brokers
* - writer.common.checkpoint.location
* @param ec ExecutionContext
* @return the topic name and, per partition, the number of messages not yet ingested.
*/
def getNumberOfMessagesLeft(
jobParameters: JobInstanceParameters
)(implicit ec: ExecutionContext): Future[Option[(String, Map[Int, Long])]] = {
val kafkaParametersOpt = getKafkaParameters(jobParameters)
val hdfsParametersOpt: Option[HdfsParameters] = getResolvedAppArguments(jobParameters).flatMap(getHdfsParameters)

if (kafkaParametersOpt.isEmpty) {
logger.debug(s"Kafka parameters were not found in job definition $jobParameters")
}

Future(
for {
kafkaParameters <- kafkaParametersOpt
hdfsParameters <- hdfsParametersOpt
} yield {
val kafkaOffsets = kafkaService.getBeginningEndOffsets(kafkaParameters._1, kafkaParameters._2)
kafkaOffsets match {
case BeginningEndOffsets(_, start, end) if start.nonEmpty && end.nonEmpty && start.keySet == end.keySet =>
val ugi = userGroupInformationService.loginUserFromKeytab(hdfsParameters.principal, hdfsParameters.keytab)
val hdfsOffsetsTry = checkpointService.getLatestCommittedOffset(hdfsParameters)(ugi).map(_.map(_.head._2))

hdfsOffsetsTry match {
case Failure(_) => None
case Success(hdfsOffsetsOption) =>
val messagesLeft = kafkaOffsets.beginningOffsets.map { case (partition, kafkaBeginningOffset) =>
val kafkaEndOffset = kafkaOffsets.endOffsets(partition)
val numberOfMessages = hdfsOffsetsOption.flatMap(_.get(partition)) match {
case Some(hdfsOffset) if hdfsOffset > kafkaEndOffset => kafkaEndOffset - hdfsOffset
case Some(hdfsOffset) if hdfsOffset > kafkaBeginningOffset => kafkaEndOffset - hdfsOffset
case Some(hdfsOffset) if hdfsOffset <= kafkaBeginningOffset => kafkaEndOffset - kafkaBeginningOffset
case None => kafkaEndOffset - kafkaBeginningOffset
}
partition -> numberOfMessages
}
Some((kafkaOffsets.topic, messagesLeft))
}
case _ =>
logger.warn(s"Inconsistent response from Kafka for topic: ${kafkaOffsets.topic}")
None
}
}
).map(_.flatten)
}
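
  // Standalone sketch of the per-partition arithmetic above (an illustrative
  // helper, not part of this service). A committed offset that has aged out of
  // Kafka's retention is clamped to the beginning offset; a committed offset
  // ahead of the end offset yields a negative count, surfacing the
  // inconsistency instead of hiding it.
  private def messagesLeftSketch(beginning: Long, end: Long, committed: Option[Long]): Long =
    committed match {
      case Some(c) if c > beginning => end - c
      case _                        => end - beginning
    }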

/**
* @param jobParameters Parameters for the job instance. Should contain at least
* - reader.kafka.topic
@@ -0,0 +1,83 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.hyperdrive.trigger.api.rest.services

import org.slf4j.LoggerFactory
import org.springframework.stereotype.Service
import za.co.absa.hyperdrive.trigger.models.{IngestionStatus, TopicStatus}
import za.co.absa.hyperdrive.trigger.models.enums.JobTypes
import za.co.absa.hyperdrive.trigger.persistance.WorkflowRepository

import scala.concurrent.{ExecutionContext, Future}
import scala.util.{Failure, Success}

trait HyperdriveService {
protected val workflowRepository: WorkflowRepository
protected val jobTemplateService: JobTemplateService
protected val hyperdriveOffsetService: HyperdriveOffsetService

def getIngestionStatus(id: Long)(implicit ec: ExecutionContext): Future[Seq[IngestionStatus]]
}

@Service
class HyperdriveServiceImpl(
override protected val workflowRepository: WorkflowRepository,
override protected val jobTemplateService: JobTemplateService,
override protected val hyperdriveOffsetService: HyperdriveOffsetService
) extends HyperdriveService {
private val logger = LoggerFactory.getLogger(this.getClass)

override def getIngestionStatus(id: Long)(implicit ec: ExecutionContext): Future[Seq[IngestionStatus]] = {
workflowRepository.getWorkflow(id).flatMap { workflow =>
jobTemplateService
.resolveJobTemplate(workflow.dagDefinitionJoined)
.flatMap(resolvedJobs =>
Future.sequence(
resolvedJobs.map {
case resolvedJob if resolvedJob.jobParameters.jobType == JobTypes.Hyperdrive =>
hyperdriveOffsetService.getNumberOfMessagesLeft(resolvedJob.jobParameters).transformWith {
case Failure(exception) =>
logger.error(s"Failed to get number of messages left to ingest for a workflow: $id", exception)
Future.successful(
IngestionStatus(
jobName = resolvedJob.name,
jobType = resolvedJob.jobParameters.jobType.name,
topicStatus = None
)
)
case Success(messagesLeftOpt) =>
Future.successful(
IngestionStatus(
jobName = resolvedJob.name,
jobType = resolvedJob.jobParameters.jobType.name,
topicStatus = messagesLeftOpt.map(messagesLeft => TopicStatus(messagesLeft._1, messagesLeft._2))
)
)
}
case resolvedJob =>
Future.successful(
IngestionStatus(
jobName = resolvedJob.name,
jobType = resolvedJob.jobParameters.jobType.name,
topicStatus = None
)
)
}
)
)
}
}
}
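
The transformWith above is a fallback pattern: a failed offset lookup degrades to a status without topic information instead of failing the whole response. A minimal standalone sketch of the same pattern follows (names assumed, not from this PR); Future.recover would also work when the failure does not need to be inspected and logged.

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future
import scala.util.{Failure, Success}

// turn any failure of `f` into `fallback` instead of a failed Future
def withFallback[A](f: Future[A], fallback: A): Future[A] =
  f.transformWith {
    case Success(value) => Future.successful(value)
    case Failure(_)     => Future.successful(fallback)
  }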
@@ -22,6 +22,7 @@ import org.springframework.stereotype.Service
import org.springframework.util.ConcurrentLruCache
import za.co.absa.hyperdrive.trigger.api.rest.services.KafkaServiceImpl.{BeginningOffsets, EndOffsets, OffsetFunction}
import za.co.absa.hyperdrive.trigger.configuration.application.GeneralConfig
import za.co.absa.hyperdrive.trigger.models.BeginningEndOffsets

import java.util.Properties
import java.util.UUID.randomUUID
@@ -31,6 +32,7 @@ import scala.collection.JavaConverters._
trait KafkaService {
def getBeginningOffsets(topic: String, consumerProperties: Properties): Map[Int, Long]
def getEndOffsets(topic: String, consumerProperties: Properties): Map[Int, Long]
def getBeginningEndOffsets(topic: String, consumerProperties: Properties): BeginningEndOffsets
}

@Service
@@ -50,6 +52,14 @@ class KafkaServiceImpl @Inject() (generalConfig: GeneralConfig) extends KafkaSer
getOffsets(topic, consumerProperties, EndOffsets)
}

def getBeginningEndOffsets(topic: String, consumerProperties: Properties): BeginningEndOffsets = {
BeginningEndOffsets(
topic,
getOffsets(topic, consumerProperties, BeginningOffsets),

Collaborator:

This is a suggestion for future projects, because it would require reworking or unifying the entire architecture. I noticed that some services return Futures while others don't, even when they are blocking.

case class Offset(beginning: Long, end: Long)

def getBeginningEndOffset(topic: String, consumerProperties: Properties): Future[Map[Int, Offset]] =
  for {
    kafkaConsumer   <- consumerPool.getConsumer(consumerProperties)
    parts           <- listTopicPartitions(kafkaConsumer, topic)
    futureBeginning  = getBeginningOffsets(kafkaConsumer, parts)
    futureEnd        = getEndOffsets(kafkaConsumer, parts)
    begin           <- futureBeginning
    end             <- futureEnd
  } yield begin.map { case (part, bOff) => part -> Offset(bOff, end(part)) } // this could be done in a safer manner

This might improve performance.

Collaborator (Author):

I agree with you. Please create an issue for this one. In this PR I just followed the style that was already introduced.

Collaborator:

Will do!

getOffsets(topic, consumerProperties, EndOffsets)
)
}

def createKafkaConsumer(propertiesThreadId: (Properties, Long)): KafkaConsumer[String, String] = {
logger.info(
s"Creating new Kafka Consumer for thread id ${propertiesThreadId._2} and" +
@@ -0,0 +1,22 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.hyperdrive.trigger.models

case class BeginningEndOffsets(
topic: String,
beginningOffsets: Map[Int, Long],
endOffsets: Map[Int, Long]
)
@@ -0,0 +1,24 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.hyperdrive.trigger.models

case class IngestionStatus(
jobName: String,
jobType: String,
topicStatus: Option[TopicStatus]
)

case class TopicStatus(topic: String, messagesToIngest: Map[Int, Long])
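
For orientation, a hypothetical instance of these models (all values invented), which is what the new controller endpoint ultimately serializes:

val status = IngestionStatus(
  jobName = "ingest-my-topic",
  jobType = "Hyperdrive",
  topicStatus = Some(TopicStatus("my.topic", Map(0 -> 120L, 1 -> 98L)))
)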
@@ -34,7 +34,7 @@ import org.springframework.beans.factory.BeanFactory
import org.springframework.context.annotation.Lazy
import za.co.absa.hyperdrive.trigger.scheduler.executors.shell.ShellExecutor
import org.springframework.stereotype.Component
import za.co.absa.hyperdrive.trigger.api.rest.services.HyperdriveOffsetComparisonService
import za.co.absa.hyperdrive.trigger.api.rest.services.HyperdriveOffsetService
import za.co.absa.hyperdrive.trigger.configuration.application.{SchedulerConfig, SparkConfig}
import za.co.absa.hyperdrive.trigger.scheduler.notifications.NotificationSender

@@ -49,7 +49,7 @@ class Executors @Inject() (
beanFactory: BeanFactory,
implicit val sparkConfig: SparkConfig,
schedulerConfig: SchedulerConfig,
@Lazy hyperdriveOffsetComparisonService: HyperdriveOffsetComparisonService
@Lazy hyperdriveOffsetComparisonService: HyperdriveOffsetService
) {
private val logger = LoggerFactory.getLogger(this.getClass)
private implicit val executionContext: ExecutionContextExecutor =
@@ -16,7 +16,7 @@
package za.co.absa.hyperdrive.trigger.scheduler.executors.spark

import org.slf4j.LoggerFactory
import za.co.absa.hyperdrive.trigger.api.rest.services.HyperdriveOffsetComparisonService
import za.co.absa.hyperdrive.trigger.api.rest.services.HyperdriveOffsetService
import za.co.absa.hyperdrive.trigger.configuration.application.SparkConfig
import za.co.absa.hyperdrive.trigger.models.enums.JobStatuses
import za.co.absa.hyperdrive.trigger.models.{JobInstance, SparkInstanceParameters}
@@ -31,23 +31,23 @@ object HyperdriveExecutor {
jobParameters: SparkInstanceParameters,
updateJob: JobInstance => Future[Unit],
sparkClusterService: SparkClusterService,
offsetComparisonService: HyperdriveOffsetComparisonService
offsetService: HyperdriveOffsetService
)(implicit executionContext: ExecutionContext, sparkConfig: SparkConfig): Future[Unit] =
jobInstance.executorJobId match {
case None => submitJob(sparkClusterService, offsetComparisonService, jobInstance, jobParameters, updateJob)
case None => submitJob(sparkClusterService, offsetService, jobInstance, jobParameters, updateJob)
case Some(executorJobId) =>
SparkExecutor.updateJobStatus(executorJobId, jobInstance, updateJob, sparkClusterService)
}

private def submitJob(sparkClusterService: SparkClusterService,
offsetComparisonService: HyperdriveOffsetComparisonService,
offsetService: HyperdriveOffsetService,
jobInstance: JobInstance,
jobParameters: SparkInstanceParameters,
updateJob: JobInstance => Future[Unit]
)(implicit executionContext: ExecutionContext) = {
logger.debug("Using HyperdriveExecutor")
for {
newJobRequired <- offsetComparisonService.isNewJobInstanceRequired(jobParameters)
newJobRequired <- offsetService.isNewJobInstanceRequired(jobParameters)
_ <-
if (newJobRequired) sparkClusterService.submitJob(jobInstance, jobParameters, updateJob)
else updateJob(jobInstance.copy(jobStatus = JobStatuses.NoData))