deanq · deanq · Oct 28, 2024 · Oct 28, 2024 · Oct 29, 2024 · Oct 31, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -57,6 +57,7 @@ test = [
     "faker",
     "pytest-asyncio",
     "pytest-cov",
+    "pytest-env",
     "pytest-timeout",
     "pytest-watch",
     "pytest",

diff --git a/pytest.ini b/pytest.ini
@@ -2,3 +2,6 @@
 addopts = --durations=10 --cov-config=.coveragerc --timeout=120 --timeout_method=thread --cov=runpod --cov-report=xml --cov-report=term-missing --cov-fail-under=90 -W error -p no:cacheprovider -p no:unraisableexception
 python_files = tests.py test_*.py *_test.py
 norecursedirs = venv *.egg-info .git build
+env = 
+    D:ENV=test
+    D:RUNPOD_LOG_LEVEL=ERROR
diff --git a/requirements.txt b/requirements.txt
@@ -17,3 +17,9 @@ tomlkit >= 0.12.2
 tqdm-loggable >= 0.1.4
 urllib3 >= 1.26.6
 watchdog >= 3.0.0
+
+setuptools==65.6.3
+opentelemetry-sdk
+opentelemetry-exporter-otlp
+opentelemetry-instrumentation-aiohttp-client
+opentelemetry-instrumentation-requests
diff --git a/runpod/http_client.py b/runpod/http_client.py
@@ -1,15 +1,19 @@
 """
-HTTP Client abstractions
+HTTP Client abstractions with OpenTelemetry tracing support.
 """
 
 import os
-
 import requests
 from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError
+from opentelemetry import trace
+from opentelemetry.instrumentation.aiohttp_client import create_trace_config
+from opentelemetry.instrumentation.requests import RequestsInstrumentor
 
 from .cli.groups.config.functions import get_credentials
 from .user_agent import USER_AGENT
 
+tracer = trace.get_tracer(__name__)
+
 
 class TooManyRequests(ClientResponseError):
     pass
@@ -32,22 +36,23 @@ def get_auth_header():
     }
 
 
-def AsyncClientSession(*args, **kwargs):  # pylint: disable=invalid-name
+def AsyncClientSession(*args, **kwargs):
     """
-    Deprecation from aiohttp.ClientSession forbids inheritance.
-    This is now a factory method
+    Factory method for an async client session with OpenTelemetry tracing.
     """
     return ClientSession(
         connector=TCPConnector(limit=0),
         headers=get_auth_header(),
         timeout=ClientTimeout(600, ceil_threshold=400),
+        trace_configs=[create_trace_config()],
         *args,
         **kwargs,
     )
 
 
 class SyncClientSession(requests.Session):
-    """
-    Inherits requests.Session to override `request()` method for tracing
-    """
-    pass
+    def __init__(self):
+        super().__init__()
+        self.headers.update(get_auth_header())
+
+RequestsInstrumentor().instrument()
diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
@@ -7,6 +7,7 @@
 
 from aiohttp import ClientError
 from aiohttp_retry import FibonacciRetry, RetryClient
+from opentelemetry import trace
 
 from runpod.http_client import ClientSession
 from runpod.serverless.modules.rp_logger import RunPodLogger
@@ -24,12 +25,17 @@
 JOB_STREAM_URL = JOB_STREAM_URL_TEMPLATE.replace("$RUNPOD_POD_ID", WORKER_ID)
 
 log = RunPodLogger()
+tracer = trace.get_tracer(__name__)
 
 
+@tracer.start_as_current_span("transmit", kind=trace.SpanKind.CLIENT)
 async def _transmit(client_session: ClientSession, url, job_data):
     """
     Wrapper for transmitting results via POST.
     """
+    span = trace.get_current_span()
+    span.set_attribute("job_data", job_data)
+
     retry_options = FibonacciRetry(attempts=3)
     retry_client = RetryClient(
         client_session=client_session, retry_options=retry_options
@@ -48,15 +54,18 @@ async def _transmit(client_session: ClientSession, url, job_data):
         await client_response.text()
 
 
+@tracer.start_as_current_span("handle_result", kind=trace.SpanKind.CLIENT)
 async def _handle_result(
     session: ClientSession, job_data, job, url_template, log_message, is_stream=False
 ):
     """
     A helper function to handle the result, either for sending or streaming.
     """
-    try:
-        session.headers["X-Request-ID"] = job["id"]
+    span = trace.get_current_span()
+    span.set_attribute("request_id", job.get("id"))
+    span.set_attribute("is_stream", is_stream)
 
+    try:
         serialized_job_data = json.dumps(job_data, ensure_ascii=False)
 
         is_stream = "true" if is_stream else "false"
@@ -66,9 +75,11 @@ async def _handle_result(
         log.debug(f"{log_message}", job["id"])
 
     except ClientError as err:
+        span.record_exception(err)
         log.error(f"Failed to return job results. | {err}", job["id"])
 
     except (TypeError, RuntimeError) as err:
+        span.record_exception(err)
         log.error(f"Error while returning job result. | {err}", job["id"])
 
     finally:
@@ -80,6 +91,7 @@ async def _handle_result(
             log.info("Finished.", job["id"])
 
 
+@tracer.start_as_current_span("send_result")
 async def send_result(session, job_data, job, is_stream=False):
     """
     Return the job results.
@@ -89,6 +101,7 @@ async def send_result(session, job_data, job, is_stream=False):
     )
 
 
+@tracer.start_as_current_span("stream_result")
 async def stream_result(session, job_data, job):
     """
     Return the stream job results.

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
@@ -6,6 +6,7 @@
 import json
 import os
 import traceback
+from opentelemetry import trace
 from typing import Any, AsyncGenerator, Callable, Dict, Optional, Union, List
 
 import aiohttp
@@ -24,6 +25,7 @@
 
 log = RunPodLogger()
 job_progress = JobsProgress()
+tracer = trace.get_tracer(__name__)
 
 
 def _job_get_url(batch_size: int = 1):
@@ -117,14 +119,26 @@ async def get_job(
             return jobs
 
 
-async def handle_job(session: ClientSession, config: Dict[str, Any], job) -> dict:
+@tracer.start_as_current_span("handle_error")
+def _handle_error(err_output: Any, job: dict) -> None:
+    """Set ERROR status on the span opened by the decorator and debug-log `err_output`."""
+    span = trace.get_current_span()
+    span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output)))
+    log.debug(f"Handled error: {err_output}", job["id"])
+
+
+@tracer.start_as_current_span("handle_job")
+async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) -> dict:
+    span = trace.get_current_span()
+    span.set_attribute("request_id", job.get("id"))
+
     if is_generator(config["handler"]):
         is_stream = True
         generator_output = run_job_generator(config["handler"], job)
-        log.debug("Handler is a generator, streaming results.", job["id"])
 
         job_result = {"output": []}
         async for stream_output in generator_output:
+            # temp
             log.debug(f"Stream output: {stream_output}", job["id"])
 
             if type(stream_output.get("output")) == dict:
@@ -164,6 +178,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job) -> dic
     await send_result(session, job_result, job, is_stream=is_stream)
 
 
+@tracer.start_as_current_span("run_job")
 async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]:
     """
     Run the job using the handler.
@@ -175,6 +190,9 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]:
     Returns:
         Dict[str, Any]: The result of running the job.
     """
+    span = trace.get_current_span()
+    span.set_attribute("request_id", job.get("id"))
+
     log.info("Started.", job["id"])
     run_result = {}
 
@@ -210,6 +228,7 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]:
         check_return_size(run_result)  # Checks the size of the return body.
 
     except Exception as err:
+        span.record_exception(err)
         error_info = {
             "error_type": str(type(err)),
             "error_message": str(err),
@@ -229,13 +248,17 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]:
     return run_result
 
 
+@tracer.start_as_current_span("run_job_generator")
 async def run_job_generator(
     handler: Callable, job: Dict[str, Any]
 ) -> AsyncGenerator[Dict[str, Union[str, Any]], None]:
     """
     Run generator job used to stream output.
     Yields output partials from the generator.
     """
+    span = trace.get_current_span()
+    span.set_attribute("request_id", job.get("id"))
+
     is_async_gen = inspect.isasyncgenfunction(handler)
     log.debug(
         "Using Async Generator" if is_async_gen else "Using Standard Generator",
@@ -255,6 +278,7 @@ async def run_job_generator(
                 yield {"output": output_partial}
 
     except Exception as err:
+        span.record_exception(err)
         log.error(err, job["id"])
         yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"}
     finally:

diff --git a/runpod/serverless/modules/rp_ping.py b/runpod/serverless/modules/rp_ping.py
@@ -8,6 +8,7 @@
 import time
 
 import requests
+from opentelemetry import trace
 from urllib3.util.retry import Retry
 
 from runpod.http_client import SyncClientSession
@@ -16,7 +17,8 @@
 from runpod.version import __version__ as runpod_version
 
 log = RunPodLogger()
-jobs = JobsProgress()  # Contains the list of jobs that are currently running.
+job_progress = JobsProgress()  # Contains the list of jobs that are currently running.
+tracer = trace.get_tracer(__name__)
 
 
 class Heartbeat:
@@ -83,12 +85,18 @@ def ping_loop(self, test=False):
             if test:
                 return
 
+    @tracer.start_as_current_span("send_ping", kind=trace.SpanKind.CLIENT)
     def _send_ping(self):
         """
         Sends a heartbeat to the Runpod server.
         """
-        job_ids = jobs.get_job_list()
-        ping_params = {"job_id": job_ids, "runpod_version": runpod_version}
+        span = trace.get_current_span()
+        job_ids = []
+        for job in job_progress:
+            span.add_event("ping", {"request_id": job.id})
+            job_ids.append(job.id)
+
+        ping_params = {"job_id": ",".join(job_ids), "runpod_version": runpod_version}
 
         try:
             result = self._session.get(
@@ -100,4 +108,5 @@ def _send_ping(self):
             )
 
         except requests.RequestException as err:
+            span.record_exception(err)
             log.error(f"Ping Request Error: {err}, attempting to restart ping.")