From 022f846d81f516351e07d0ddc3949552d3d8b8a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 28 Oct 2024 14:48:10 -0700
Subject: [PATCH 01/53] tmp: disable existing tracers

---
 runpod/http_client.py | 50 ++++---------------------------------------
 1 file changed, 4 insertions(+), 46 deletions(-)

diff --git a/runpod/http_client.py b/runpod/http_client.py
index 145060bf..ab838094 100644
--- a/runpod/http_client.py
+++ b/runpod/http_client.py
@@ -8,7 +8,6 @@
 from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError
 
 from .cli.groups.config.functions import get_credentials
-from .tracer import create_aiohttp_tracer, create_request_tracer
 from .user_agent import USER_AGENT
 
 
@@ -33,62 +32,21 @@ def get_auth_header():
     }
 
 
-def AsyncClientSession(*args, **kwargs):  # pylint: disable=invalid-name
+def AsyncClientSession(*args, **kwargs):
     """
     Deprecation from aiohttp.ClientSession forbids inheritance.
     This is now a factory method
-    TODO: use httpx
     """
     return ClientSession(
         connector=TCPConnector(limit=0),
         headers=get_auth_header(),
         timeout=ClientTimeout(600, ceil_threshold=400),
-        trace_configs=[create_aiohttp_tracer()],
         *args,
         **kwargs,
     )
 
 
 class SyncClientSession(requests.Session):
-    """
-    Inherits requests.Session to override `request()` method for tracing
-    TODO: use httpx
-    """
-
-    def request(self, method, url, **kwargs):  # pylint: disable=arguments-differ
-        """
-        Override for tracing. Not using super().request()
-        to capture metrics for connection and transfer times
-        """
-        with create_request_tracer() as tracer:
-            # Separate out the kwargs that are not applicable to `requests.Request`
-            request_kwargs = {
-                k: v
-                for k, v in kwargs.items()
-                # contains the names of the arguments
-                if k in requests.Request.__init__.__code__.co_varnames
-            }
-
-            # Separate out the kwargs that are applicable to `requests.Request`
-            send_kwargs = {k: v for k, v in kwargs.items() if k not in request_kwargs}
-
-            # Create a PreparedRequest object to hold the request details
-            req = requests.Request(method, url, **request_kwargs)
-            prepped = self.prepare_request(req)
-            tracer.request = prepped  # Assign the request to the tracer
-
-            # Merge environment settings
-            settings = self.merge_environment_settings(
-                prepped.url,
-                send_kwargs.get("proxies"),
-                send_kwargs.get("stream"),
-                send_kwargs.get("verify"),
-                send_kwargs.get("cert"),
-            )
-            send_kwargs.update(settings)
-
-            # Send the request
-            response = self.send(prepped, **send_kwargs)
-            tracer.response = response  # Assign the response to the tracer
-
-            return response
+    def __init__(self):
+        super().__init__()
+        self.headers.update({"User-Agent": USER_AGENT,})

From 150f534d84b8e0a0aabc91d6bc01f8422a6060f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 28 Oct 2024 14:49:10 -0700
Subject: [PATCH 02/53] tmp: auto-instrumentations for OTEL

---
 requirements.txt   |  9 +++++
 runpod/__init__.py |  1 +
 runpod/otel.py     | 88 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 98 insertions(+)
 create mode 100644 runpod/otel.py

diff --git a/requirements.txt b/requirements.txt
index 4c7681fe..261bb588 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -17,3 +17,12 @@ tomlkit >= 0.12.2
 tqdm-loggable >= 0.1.4
 urllib3 >= 1.26.6
 watchdog >= 3.0.0
+
+setuptools==65.6.3
+opentelemetry-sdk
+opentelemetry-exporter-otlp
+opentelemetry-instrumentation-aiohttp-client
+opentelemetry-instrumentation-asyncio
+opentelemetry-instrumentation-requests
+opentelemetry-instrumentation-threading
+opentelemetry-instrumentation-urllib3
diff --git a/runpod/__init__.py b/runpod/__init__.py
index 6611587d..6ea28ade 100644
--- a/runpod/__init__.py
+++ b/runpod/__init__.py
@@ -3,6 +3,7 @@
 import logging
 import os
 
+from . import otel
 from . import serverless
 from .api.ctl_commands import (
     create_container_registry_auth,
diff --git a/runpod/otel.py b/runpod/otel.py
new file mode 100644
index 00000000..741a680d
--- /dev/null
+++ b/runpod/otel.py
@@ -0,0 +1,88 @@
+import os
+import typing
+import aiohttp
+from requests import PreparedRequest, Response
+
+from opentelemetry import trace
+from opentelemetry.sdk.trace import Resource, TracerProvider, Span
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+
+from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
+from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor
+from opentelemetry.instrumentation.requests import RequestsInstrumentor
+from opentelemetry.instrumentation.threading import ThreadingInstrumentor
+from opentelemetry.instrumentation.urllib3 import URLLib3Instrumentor
+
+from runpod.version import __version__ as runpod_version
+
+
+trace.set_tracer_provider(
+    TracerProvider(
+        resource=Resource.create(
+            {
+                "service.name": "runpod-python-sdk",
+                "service.version": runpod_version,
+                "application": "runpod-serverless",
+            }
+        )
+    )
+)
+
+tracer = trace.get_tracer_provider()
+
+if os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace":
+    tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+
+if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"):
+    tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
+
+
+# --- threading --- #
+ThreadingInstrumentor().instrument()
+
+
+# --- urllib3 --- #
+URLLib3Instrumentor().instrument()
+
+
+# --- asyncio --- #
+AsyncioInstrumentor().instrument()
+
+
+# --- requests --- #
+def requests_request_hook(span: Span, request_obj: PreparedRequest):
+    pass
+
+
+def requests_response_hook(
+    span: Span, request_obj: PreparedRequest, response: Response
+):
+    pass
+
+
+RequestsInstrumentor().instrument()
+
+
+# --- aiohttp --- #
+def aiohttp_request_hook(span: Span, params: aiohttp.TraceRequestStartParams):
+    if span and span.is_recording():
+        span.set_attribute(
+            "custom_user_attribute_from_request_hook", "aiohttp_request_hook"
+        )
+
+
+def aiohttp_response_hook(
+    span: Span,
+    params: typing.Union[
+        aiohttp.TraceRequestEndParams,
+        aiohttp.TraceRequestExceptionParams,
+    ],
+):
+    if span and span.is_recording():
+        span.set_attribute(
+            "custom_user_attribute_from_response_hook", "aiohttp_response_hook"
+        )
+
+
+AioHttpClientInstrumentor().instrument()

From ecda2d4699b00cfbf62acfb2032b7d93eb3801f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 28 Oct 2024 22:36:46 -0700
Subject: [PATCH 03/53] tmp: our collector can't support gRPC behind ALB

---
 runpod/otel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runpod/otel.py b/runpod/otel.py
index 741a680d..622e3461 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -6,7 +6,7 @@
 from opentelemetry import trace
 from opentelemetry.sdk.trace import Resource, TracerProvider, Span
 from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 
 from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
 from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor

From f03a03e7b443f4b56d743f63200e063ab873f99b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Wed, 30 Oct 2024 23:18:44 -0700
Subject: [PATCH 04/53] tmp: trace the entire JobScaler functions

---
 runpod/otel.py                        |  49 ++----
 runpod/serverless/modules/rp_http.py  |  23 +--
 runpod/serverless/modules/rp_job.py   | 117 ++++++++-------
 runpod/serverless/modules/rp_scale.py | 205 +++++++++++++++-----------
 4 files changed, 206 insertions(+), 188 deletions(-)

diff --git a/runpod/otel.py b/runpod/otel.py
index 622e3461..c328f05c 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -1,12 +1,16 @@
 import os
-import typing
-import aiohttp
-from requests import PreparedRequest, Response
 
 from opentelemetry import trace
-from opentelemetry.sdk.trace import Resource, TracerProvider, Span
-from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+from opentelemetry.sdk.resources import (
+    Resource,
+    SERVICE_NAME,
+    SERVICE_VERSION,
+    HOST_NAME,
+
+)
 
 from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
 from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor
@@ -21,9 +25,10 @@
     TracerProvider(
         resource=Resource.create(
             {
-                "service.name": "runpod-python-sdk",
-                "service.version": runpod_version,
                 "application": "runpod-serverless",
+                SERVICE_NAME: "runpod-python-sdk",
+                SERVICE_VERSION: runpod_version,
+                HOST_NAME: os.getenv("RUNPOD_POD_HOSTNAME"),
             }
         )
     )
@@ -51,38 +56,8 @@
 
 
 # --- requests --- #
-def requests_request_hook(span: Span, request_obj: PreparedRequest):
-    pass
-
-
-def requests_response_hook(
-    span: Span, request_obj: PreparedRequest, response: Response
-):
-    pass
-
-
 RequestsInstrumentor().instrument()
 
 
 # --- aiohttp --- #
-def aiohttp_request_hook(span: Span, params: aiohttp.TraceRequestStartParams):
-    if span and span.is_recording():
-        span.set_attribute(
-            "custom_user_attribute_from_request_hook", "aiohttp_request_hook"
-        )
-
-
-def aiohttp_response_hook(
-    span: Span,
-    params: typing.Union[
-        aiohttp.TraceRequestEndParams,
-        aiohttp.TraceRequestExceptionParams,
-    ],
-):
-    if span and span.is_recording():
-        span.set_attribute(
-            "custom_user_attribute_from_response_hook", "aiohttp_response_hook"
-        )
-
-
 AioHttpClientInstrumentor().instrument()
diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
index 3d82d35b..26925e47 100644
--- a/runpod/serverless/modules/rp_http.py
+++ b/runpod/serverless/modules/rp_http.py
@@ -5,6 +5,7 @@
 import json
 import os
 
+from opentelemetry.trace import get_tracer
 from aiohttp import ClientError
 from aiohttp_retry import FibonacciRetry, RetryClient
 
@@ -24,6 +25,7 @@
 JOB_STREAM_URL = JOB_STREAM_URL_TEMPLATE.replace("$RUNPOD_POD_ID", WORKER_ID)
 
 log = RunPodLogger()
+tracer = get_tracer(__name__)
 
 
 async def _transmit(client_session: ClientSession, url, job_data):
@@ -44,8 +46,9 @@ async def _transmit(client_session: ClientSession, url, job_data):
         "raise_for_status": True,
     }
 
-    async with retry_client.post(url, **kwargs) as client_response:
-        await client_response.text()
+    with tracer.start_as_current_span("rp_http.transmit"):
+        async with retry_client.post(url, **kwargs) as client_response:
+            await client_response.text()
 
 
 async def _handle_result(
@@ -55,7 +58,7 @@ async def _handle_result(
     A helper function to handle the result, either for sending or streaming.
     """
     try:
-        session.headers["X-Request-ID"] = job["id"]
+        session.headers["X-Request-ID"] = job["id"]  # legacy
 
         serialized_job_data = json.dumps(job_data, ensure_ascii=False)
 
@@ -84,15 +87,17 @@ async def send_result(session, job_data, job, is_stream=False):
     """
     Return the job results.
     """
-    await _handle_result(
-        session, job_data, job, JOB_DONE_URL, "Results sent.", is_stream=is_stream
-    )
+    with tracer.start_as_current_span("rp_http.send_result"):
+        await _handle_result(
+            session, job_data, job, JOB_DONE_URL, "Results sent.", is_stream=is_stream
+        )
 
 
 async def stream_result(session, job_data, job):
     """
     Return the stream job results.
     """
-    await _handle_result(
-        session, job_data, job, JOB_STREAM_URL, "Intermediate results sent."
-    )
+    with tracer.start_as_current_span("rp_http.stream_result"):
+        await _handle_result(
+            session, job_data, job, JOB_STREAM_URL, "Intermediate results sent."
+        )
diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index ddac4ec0..c244c22e 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -7,6 +7,7 @@
 import os
 import traceback
 from typing import Any, AsyncGenerator, Callable, Dict, Optional, Union, List
+from opentelemetry.trace import get_tracer
 
 from runpod.http_client import ClientSession, TooManyRequests
 from runpod.serverless.modules.rp_logger import RunPodLogger
@@ -22,6 +23,7 @@
 
 log = RunPodLogger()
 job_progress = JobsProgress()
+tracer = get_tracer(__name__)
 
 
 def _job_get_url(batch_size: int = 1):
@@ -160,53 +162,58 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]:
     log.info("Started.", job["id"])
     run_result = {}
 
-    try:
-        handler_return = handler(job)
-        job_output = (
-            await handler_return
-            if inspect.isawaitable(handler_return)
-            else handler_return
-        )
+    with tracer.start_as_current_span("rp_job.run_job") as span:
+        span.set_attribute("job.id", job.get("id"))
+        span.set_attribute("request_id", job.get("id"))  # legacy
 
-        log.debug(f"Handler output: {job_output}", job["id"])
+        try:
+            handler_return = handler(job)
+            job_output = (
+                await handler_return
+                if inspect.isawaitable(handler_return)
+                else handler_return
+            )
+
+            log.debug(f"Handler output: {job_output}", job["id"])
 
-        if isinstance(job_output, dict):
-            error_msg = job_output.pop("error", None)
-            refresh_worker = job_output.pop("refresh_worker", None)
-            run_result["output"] = job_output
+            if isinstance(job_output, dict):
+                error_msg = job_output.pop("error", None)
+                refresh_worker = job_output.pop("refresh_worker", None)
+                run_result["output"] = job_output
 
-            if error_msg:
-                run_result["error"] = error_msg
-            if refresh_worker:
-                run_result["stopPod"] = True
+                if error_msg:
+                    run_result["error"] = error_msg
+                if refresh_worker:
+                    run_result["stopPod"] = True
 
-        elif isinstance(job_output, bool):
-            run_result = {"output": job_output}
+            elif isinstance(job_output, bool):
+                run_result = {"output": job_output}
 
-        else:
-            run_result = {"output": job_output}
+            else:
+                run_result = {"output": job_output}
 
-        if run_result.get("output") == {}:
-            run_result.pop("output")
+            if run_result.get("output") == {}:
+                run_result.pop("output")
 
-        check_return_size(run_result)  # Checks the size of the return body.
+            check_return_size(run_result)  # Checks the size of the return body.
 
-    except Exception as err:
-        error_info = {
-            "error_type": str(type(err)),
-            "error_message": str(err),
-            "error_traceback": traceback.format_exc(),
-            "hostname": os.environ.get("RUNPOD_POD_HOSTNAME", "unknown"),
-            "worker_id": os.environ.get("RUNPOD_POD_ID", "unknown"),
-            "runpod_version": runpod_version,
-        }
+        except Exception as err:
+            span.record_exception(err)
+            error_info = {
+                "error_type": str(type(err)),
+                "error_message": str(err),
+                "error_traceback": traceback.format_exc(),
+                "hostname": os.environ.get("RUNPOD_POD_HOSTNAME", "unknown"),
+                "worker_id": os.environ.get("RUNPOD_POD_ID", "unknown"),
+                "runpod_version": runpod_version,
+            }
 
-        log.error("Captured Handler Exception", job["id"])
-        log.error(json.dumps(error_info, indent=4))
-        run_result = {"error": json.dumps(error_info)}
+            log.error("Captured Handler Exception", job["id"])
+            log.error(json.dumps(error_info, indent=4))
+            run_result = {"error": json.dumps(error_info)}
 
-    finally:
-        log.debug(f"run_job return: {run_result}", job["id"])
+        finally:
+            log.debug(f"run_job return: {run_result}", job["id"])
 
     return run_result
 
@@ -224,20 +231,22 @@ async def run_job_generator(
         job["id"],
     )
 
-    try:
-        job_output = handler(job)
-
-        if is_async_gen:
-            async for output_partial in job_output:
-                log.debug(f"Async Generator output: {output_partial}", job["id"])
-                yield {"output": output_partial}
-        else:
-            for output_partial in job_output:
-                log.debug(f"Generator output: {output_partial}", job["id"])
-                yield {"output": output_partial}
-
-    except Exception as err:
-        log.error(err, job["id"])
-        yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"}
-    finally:
-        log.info("Finished running generator.", job["id"])
+    with tracer.start_as_current_span("rp_job.run_job_generator") as span:
+        try:
+            job_output = handler(job)
+
+            if is_async_gen:
+                async for output_partial in job_output:
+                    log.debug(f"Async Generator output: {output_partial}", job["id"])
+                    yield {"output": output_partial}
+            else:
+                for output_partial in job_output:
+                    log.debug(f"Generator output: {output_partial}", job["id"])
+                    yield {"output": output_partial}
+
+        except Exception as err:
+            span.record_exception(err)
+            log.error(err, job["id"])
+            yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"}
+        finally:
+            log.info("Finished running generator.", job["id"])
diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index da4b0fd0..99b2b70a 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -4,8 +4,10 @@
 """
 
 import asyncio
+import os
 import signal
 from typing import Any, Dict
+from opentelemetry.trace import get_tracer
 
 from ...http_client import AsyncClientSession, ClientSession, TooManyRequests
 from .rp_job import get_job, handle_job
@@ -15,6 +17,7 @@
 log = RunPodLogger()
 job_list = JobsQueue()
 job_progress = JobsProgress()
+tracer = get_tracer(__name__)
 
 
 def _default_concurrency_modifier(current_concurrency: int) -> int:
@@ -54,16 +57,23 @@ def start(self):
         when the user sends a SIGTERM or SIGINT signal. This is typically
         the case when the worker is running in a container.
         """
-        try:
-            # Register signal handlers for graceful shutdown
-            signal.signal(signal.SIGTERM, self.handle_shutdown)
-            signal.signal(signal.SIGINT, self.handle_shutdown)
-        except ValueError:
-            log.warning("Signal handling is only supported in the main thread.")
+        with tracer.start_as_current_span("JobScaler.start") as span:
+            span.set_attributes({
+                "worker.hostname": os.getenv("RUNPOD_POD_HOSTNAME", "unknown"),
+                "worker.id": os.getenv("RUNPOD_POD_ID", "unknown"),
+                "endpoint.id": os.getenv("RUNPOD_ENDPOINT_ID", "unknown"),
+            })
 
-        # Start the main loop
-        # Run forever until the worker is signalled to shut down.
-        asyncio.run(self.run())
+            try:
+                # Register signal handlers for graceful shutdown
+                signal.signal(signal.SIGTERM, self.handle_shutdown)
+                signal.signal(signal.SIGINT, self.handle_shutdown)
+            except ValueError:
+                log.warning("Signal handling is only supported in the main thread.")
+
+            # Start the main loop
+            # Run forever until the worker is signalled to shut down.
+            asyncio.run(self.run())
 
     def handle_shutdown(self, signum, frame):
         """
@@ -81,16 +91,17 @@ def handle_shutdown(self, signum, frame):
         self.kill_worker()
 
     async def run(self):
-        # Create an async session that will be closed when the worker is killed.
-        async with AsyncClientSession() as session:
-            # Create tasks for getting and running jobs.
-            jobtake_task = asyncio.create_task(self.get_jobs(session))
-            jobrun_task = asyncio.create_task(self.run_jobs(session))
+        with tracer.start_as_current_span("JobScaler.run"):
+            # Create an async session that will be closed when the worker is killed.
+            async with AsyncClientSession() as session:
+                # Create tasks for getting and running jobs.
+                jobtake_task = asyncio.create_task(self.get_jobs(session))
+                jobrun_task = asyncio.create_task(self.run_jobs(session))
 
-            tasks = [jobtake_task, jobrun_task]
+                tasks = [jobtake_task, jobrun_task]
 
-            # Concurrently run both tasks and wait for both to finish.
-            await asyncio.gather(*tasks)
+                # Concurrently run both tasks and wait for both to finish.
+                await asyncio.gather(*tasks)
 
     def is_alive(self):
         """
@@ -114,50 +125,61 @@ async def get_jobs(self, session: ClientSession):
         Adds jobs to the JobsQueue
         """
         while self.is_alive():
-            log.debug(f"JobScaler.get_jobs | Jobs in progress: {job_progress.get_job_count()}")
-
-            self.current_concurrency = self.concurrency_modifier(
-                self.current_concurrency
-            )
-            log.debug(f"JobScaler.get_jobs | Concurrency set to: {self.current_concurrency}")
-
-            jobs_needed = self.current_concurrency - job_progress.get_job_count()
-            if jobs_needed <= 0:
-                log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.")
-                await asyncio.sleep(1)  # don't go rapidly
-                continue
-
-            try:
-                # Keep the connection to the blocking call up to 30 seconds
-                acquired_jobs = await asyncio.wait_for(
-                    get_job(session, jobs_needed), timeout=30
+            with tracer.start_as_current_span("JobScaler.get_jobs") as span:
+                self.current_concurrency = self.concurrency_modifier(
+                    self.current_concurrency
                 )
 
-                if not acquired_jobs:
-                    log.debug("JobScaler.get_jobs | No jobs acquired.")
+                jobs_needed = self.current_concurrency - job_progress.get_job_count()
+                if jobs_needed <= 0:
+                    log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.")
+                    await asyncio.sleep(1)  # don't go rapidly
                     continue
-
-                for job in acquired_jobs:
-                    await job_list.add_job(job)
-
-                log.info(f"Jobs in queue: {job_list.get_job_count()}")
-
-            except TooManyRequests:
-                log.debug(f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds.")
-                await asyncio.sleep(5)  # debounce for 5 seconds
-            except asyncio.CancelledError:
-                log.debug("JobScaler.get_jobs | Request was cancelled.")
-            except TimeoutError:
-                log.debug("JobScaler.get_jobs | Job acquisition timed out. Retrying.")
-            except TypeError as error:
-                log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.")
-            except Exception as error:
-                log.error(
-                    f"Failed to get job. | Error Type: {type(error).__name__} | Error Message: {str(error)}"
-                )
-            finally:
-                # Yield control back to the event loop
-                await asyncio.sleep(0)
+                
+                span.set_attributes({
+                    "jobs.current_concurrency": self.current_concurrency,
+                    "jobs.in_progress": job_progress.get_job_count(),
+                    "jobs.needed": jobs_needed,
+                })
+
+                try:
+                    # Keep the connection to the blocking call up to 30 seconds
+                    acquired_jobs = await asyncio.wait_for(
+                        get_job(session, jobs_needed), timeout=30
+                    )
+                    span.set_attribute("jobs.acquired", len(acquired_jobs))
+
+                    if not acquired_jobs:
+                        log.debug("JobScaler.get_jobs | No jobs acquired.")
+                        continue
+
+                    for job in acquired_jobs:
+                        await job_list.add_job(job)
+
+                    span.set_attribute("jobs.in_queue", len(job_list.get_job_count()))
+                    log.info(f"Jobs in queue: {job_list.get_job_count()}")
+
+                except TooManyRequests as error:
+                    span.record_exception(error)
+                    log.debug(f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds.")
+                    await asyncio.sleep(5)  # debounce for 5 seconds
+                except asyncio.CancelledError as error:
+                    span.record_exception(error)
+                    log.debug("JobScaler.get_jobs | Request was cancelled.")
+                except TimeoutError as error:
+                    span.record_exception(error)
+                    log.debug("JobScaler.get_jobs | Job acquisition timed out. Retrying.")
+                except TypeError as error:
+                    span.record_exception(error)
+                    log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.")
+                except Exception as error:
+                    span.record_exception(error)
+                    log.error(
+                        f"Failed to get job. | Error Type: {type(error).__name__} | Error Message: {str(error)}"
+                    )
+                finally:
+                    # Yield control back to the event loop
+                    await asyncio.sleep(0)
 
     async def run_jobs(self, session: ClientSession):
         """
@@ -168,27 +190,29 @@ async def run_jobs(self, session: ClientSession):
         tasks = []  # Store the tasks for concurrent job processing
 
         while self.is_alive() or not job_list.empty():
-            # Fetch as many jobs as the concurrency allows
-            while len(tasks) < self.current_concurrency and not job_list.empty():
-                job = await job_list.get_job()
+            with tracer.start_as_current_span("JobScaler.run_jobs") as span:
+                # Fetch as many jobs as the concurrency allows
+                while len(tasks) < self.current_concurrency and not job_list.empty():
+                    job = await job_list.get_job()
 
-                # Create a new task for each job and add it to the task list
-                task = asyncio.create_task(self.handle_job(session, job))
-                tasks.append(task)
+                    # Create a new task for each job and add it to the task list
+                    task = asyncio.create_task(self.handle_job(session, job))
+                    tasks.append(task)
 
-            # Wait for any job to finish
-            if tasks:
-                log.info(f"Jobs in progress: {len(tasks)}")
+                # Wait for any job to finish
+                if tasks:
+                    span.set_attribute("jobs.running", len(tasks))
+                    log.info(f"Jobs in progress: {len(tasks)}")
 
-                done, pending = await asyncio.wait(
-                    tasks, return_when=asyncio.FIRST_COMPLETED
-                )
+                    done, pending = await asyncio.wait(
+                        tasks, return_when=asyncio.FIRST_COMPLETED
+                    )
 
-                # Remove completed tasks from the list
-                tasks = [t for t in tasks if t not in done]
+                    # Remove completed tasks from the list
+                    tasks = [t for t in tasks if t not in done]
 
-            # Yield control back to the event loop
-            await asyncio.sleep(0)
+                # Yield control back to the event loop
+                await asyncio.sleep(0)
 
         # Ensure all remaining tasks finish before stopping
         await asyncio.gather(*tasks)
@@ -197,22 +221,27 @@ async def handle_job(self, session: ClientSession, job: dict):
         """
         Process an individual job. This function is run concurrently for multiple jobs.
         """
-        log.debug(f"JobScaler.handle_job | {job}")
-        job_progress.add(job)
+        with tracer.start_as_current_span("JobScaler.handle_job") as span:
+            span.set_attribute("job.id", job.get("id"))
+            span.set_attribute("request_id", job.get("id"))  # legacy
+
+            log.debug(f"JobScaler.handle_job | {job}")
+            job_progress.add(job)
 
-        try:
-            await handle_job(session, self.config, job)
+            try:
+                await handle_job(session, self.config, job)
 
-            if self.config.get("refresh_worker", False):
-                self.kill_worker()
+                if self.config.get("refresh_worker", False):
+                    self.kill_worker()
 
-        except Exception as err:
-            log.error(f"Error handling job: {err}", job["id"])
-            raise err
+            except Exception as err:
+                span.record_exception(err)
+                log.error(f"Error handling job: {err}", job["id"])
+                raise err
 
-        finally:
-            # Inform JobsQueue of a task completion
-            job_list.task_done()
+            finally:
+                # Inform JobsQueue of a task completion
+                job_list.task_done()
 
-            # Job is no longer in progress
-            job_progress.remove(job["id"])
+                # Job is no longer in progress
+                job_progress.remove(job["id"])

From 1b4d9e3a78a77632db19d15bf8ec3981da87696d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Wed, 30 Oct 2024 23:47:24 -0700
Subject: [PATCH 05/53] tmp: too many junk traces from the loop

---
 runpod/serverless/modules/rp_scale.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index 99b2b70a..5078db06 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -124,8 +124,8 @@ async def get_jobs(self, session: ClientSession):
 
         Adds jobs to the JobsQueue
         """
-        while self.is_alive():
-            with tracer.start_as_current_span("JobScaler.get_jobs") as span:
+        with tracer.start_as_current_span("JobScaler.get_jobs") as span:
+            while self.is_alive():
                 self.current_concurrency = self.concurrency_modifier(
                     self.current_concurrency
                 )
@@ -136,7 +136,7 @@ async def get_jobs(self, session: ClientSession):
                     await asyncio.sleep(1)  # don't go rapidly
                     continue
                 
-                span.set_attributes({
+                span.add_event("getting jobs", {
                     "jobs.current_concurrency": self.current_concurrency,
                     "jobs.in_progress": job_progress.get_job_count(),
                     "jobs.needed": jobs_needed,
@@ -147,7 +147,7 @@ async def get_jobs(self, session: ClientSession):
                     acquired_jobs = await asyncio.wait_for(
                         get_job(session, jobs_needed), timeout=30
                     )
-                    span.set_attribute("jobs.acquired", len(acquired_jobs))
+                    span.add_event("acquired jobs", {"jobs.acquired": len(acquired_jobs)})
 
                     if not acquired_jobs:
                         log.debug("JobScaler.get_jobs | No jobs acquired.")
@@ -156,7 +156,7 @@ async def get_jobs(self, session: ClientSession):
                     for job in acquired_jobs:
                         await job_list.add_job(job)
 
-                    span.set_attribute("jobs.in_queue", len(job_list.get_job_count()))
+                    span.add_event("queued jobs", {"jobs.in_queue", job_list.get_job_count()})
                     log.info(f"Jobs in queue: {job_list.get_job_count()}")
 
                 except TooManyRequests as error:
@@ -189,8 +189,8 @@ async def run_jobs(self, session: ClientSession):
         """
         tasks = []  # Store the tasks for concurrent job processing
 
-        while self.is_alive() or not job_list.empty():
-            with tracer.start_as_current_span("JobScaler.run_jobs") as span:
+        with tracer.start_as_current_span("JobScaler.run_jobs") as span:
+            while self.is_alive() or not job_list.empty():
                 # Fetch as many jobs as the concurrency allows
                 while len(tasks) < self.current_concurrency and not job_list.empty():
                     job = await job_list.get_job()
@@ -201,7 +201,7 @@ async def run_jobs(self, session: ClientSession):
 
                 # Wait for any job to finish
                 if tasks:
-                    span.set_attribute("jobs.running", len(tasks))
+                    span.add_event("running jobs", {"jobs.running": len(tasks)})
                     log.info(f"Jobs in progress: {len(tasks)}")
 
                     done, pending = await asyncio.wait(

From a2759af14a62cde1881d5972a9e380ea38b941d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Thu, 31 Oct 2024 00:20:45 -0700
Subject: [PATCH 06/53] tmp: still too many traces

---
 runpod/serverless/modules/rp_scale.py | 131 ++++++++++++++------------
 1 file changed, 69 insertions(+), 62 deletions(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index 5078db06..6dd9aa8d 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -20,6 +20,13 @@
 tracer = get_tracer(__name__)
 
 
+worker_attributes = {
+    "worker.hostname": os.getenv("RUNPOD_POD_HOSTNAME", "unknown"),
+    "worker.id": os.getenv("RUNPOD_POD_ID", "unknown"),
+    "endpoint.id": os.getenv("RUNPOD_ENDPOINT_ID", "unknown"),
+}
+
+
 def _default_concurrency_modifier(current_concurrency: int) -> int:
     """
     Default concurrency modifier.
@@ -57,23 +64,16 @@ def start(self):
         when the user sends a SIGTERM or SIGINT signal. This is typically
         the case when the worker is running in a container.
         """
-        with tracer.start_as_current_span("JobScaler.start") as span:
-            span.set_attributes({
-                "worker.hostname": os.getenv("RUNPOD_POD_HOSTNAME", "unknown"),
-                "worker.id": os.getenv("RUNPOD_POD_ID", "unknown"),
-                "endpoint.id": os.getenv("RUNPOD_ENDPOINT_ID", "unknown"),
-            })
-
-            try:
-                # Register signal handlers for graceful shutdown
-                signal.signal(signal.SIGTERM, self.handle_shutdown)
-                signal.signal(signal.SIGINT, self.handle_shutdown)
-            except ValueError:
-                log.warning("Signal handling is only supported in the main thread.")
+        try:
+            # Register signal handlers for graceful shutdown
+            signal.signal(signal.SIGTERM, self.handle_shutdown)
+            signal.signal(signal.SIGINT, self.handle_shutdown)
+        except ValueError:
+            log.warning("Signal handling is only supported in the main thread.")
 
-            # Start the main loop
-            # Run forever until the worker is signalled to shut down.
-            asyncio.run(self.run())
+        # Start the main loop
+        # Run forever until the worker is signalled to shut down.
+        asyncio.run(self.run())
 
     def handle_shutdown(self, signum, frame):
         """
@@ -91,17 +91,16 @@ def handle_shutdown(self, signum, frame):
         self.kill_worker()
 
     async def run(self):
-        with tracer.start_as_current_span("JobScaler.run"):
-            # Create an async session that will be closed when the worker is killed.
-            async with AsyncClientSession() as session:
-                # Create tasks for getting and running jobs.
-                jobtake_task = asyncio.create_task(self.get_jobs(session))
-                jobrun_task = asyncio.create_task(self.run_jobs(session))
+        # Create an async session that will be closed when the worker is killed.
+        async with AsyncClientSession() as session:
+            # Create tasks for getting and running jobs.
+            jobtake_task = asyncio.create_task(self.get_jobs(session))
+            jobrun_task = asyncio.create_task(self.run_jobs(session))
 
-                tasks = [jobtake_task, jobrun_task]
+            tasks = [jobtake_task, jobrun_task]
 
-                # Concurrently run both tasks and wait for both to finish.
-                await asyncio.gather(*tasks)
+            # Concurrently run both tasks and wait for both to finish.
+            await asyncio.gather(*tasks)
 
     def is_alive(self):
         """
@@ -124,30 +123,37 @@ async def get_jobs(self, session: ClientSession):
 
         Adds jobs to the JobsQueue
         """
-        with tracer.start_as_current_span("JobScaler.get_jobs") as span:
-            while self.is_alive():
-                self.current_concurrency = self.concurrency_modifier(
-                    self.current_concurrency
-                )
+        while self.is_alive():
+            self.current_concurrency = self.concurrency_modifier(
+                self.current_concurrency
+            )
+
+            jobs_needed = self.current_concurrency - job_progress.get_job_count()
+            if jobs_needed <= 0:
+                log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.")
+                await asyncio.sleep(1)  # don't go rapidly
+                continue
 
-                jobs_needed = self.current_concurrency - job_progress.get_job_count()
-                if jobs_needed <= 0:
-                    log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.")
-                    await asyncio.sleep(1)  # don't go rapidly
-                    continue
-                
-                span.add_event("getting jobs", {
-                    "jobs.current_concurrency": self.current_concurrency,
-                    "jobs.in_progress": job_progress.get_job_count(),
-                    "jobs.needed": jobs_needed,
-                })
+            with tracer.start_as_current_span("JobScaler.get_jobs") as span:
+                span.set_attributes(worker_attributes)
 
                 try:
+                    span.add_event(
+                        "getting jobs",
+                        {
+                            "jobs.current_concurrency": self.current_concurrency,
+                            "jobs.in_progress": job_progress.get_job_count(),
+                            "jobs.needed": jobs_needed,
+                        },
+                    )
+
                     # Keep the connection to the blocking call up to 30 seconds
                     acquired_jobs = await asyncio.wait_for(
                         get_job(session, jobs_needed), timeout=30
                     )
-                    span.add_event("acquired jobs", {"jobs.acquired": len(acquired_jobs)})
+                    span.add_event(
+                        "acquired jobs", {"jobs.acquired": len(acquired_jobs)}
+                    )
 
                     if not acquired_jobs:
                         log.debug("JobScaler.get_jobs | No jobs acquired.")
@@ -156,7 +162,9 @@ async def get_jobs(self, session: ClientSession):
                     for job in acquired_jobs:
                         await job_list.add_job(job)
 
-                    span.add_event("queued jobs", {"jobs.in_queue", job_list.get_job_count()})
+                    span.add_event(
+                        "queued jobs", {"jobs.in_queue", job_list.get_job_count()}
+                    )
                     log.info(f"Jobs in queue: {job_list.get_job_count()}")
 
                 except TooManyRequests as error:
@@ -189,30 +197,28 @@ async def run_jobs(self, session: ClientSession):
         """
         tasks = []  # Store the tasks for concurrent job processing
 
-        with tracer.start_as_current_span("JobScaler.run_jobs") as span:
-            while self.is_alive() or not job_list.empty():
-                # Fetch as many jobs as the concurrency allows
-                while len(tasks) < self.current_concurrency and not job_list.empty():
-                    job = await job_list.get_job()
+        while self.is_alive() or not job_list.empty():
+            # Fetch as many jobs as the concurrency allows
+            while len(tasks) < self.current_concurrency and not job_list.empty():
+                job = await job_list.get_job()
 
-                    # Create a new task for each job and add it to the task list
-                    task = asyncio.create_task(self.handle_job(session, job))
-                    tasks.append(task)
+                # Create a new task for each job and add it to the task list
+                task = asyncio.create_task(self.handle_job(session, job))
+                tasks.append(task)
 
-                # Wait for any job to finish
-                if tasks:
-                    span.add_event("running jobs", {"jobs.running": len(tasks)})
-                    log.info(f"Jobs in progress: {len(tasks)}")
+            # Wait for any job to finish
+            if tasks:
+                log.info(f"Jobs in progress: {len(tasks)}")
 
-                    done, pending = await asyncio.wait(
-                        tasks, return_when=asyncio.FIRST_COMPLETED
-                    )
+                done, pending = await asyncio.wait(
+                    tasks, return_when=asyncio.FIRST_COMPLETED
+                )
 
-                    # Remove completed tasks from the list
-                    tasks = [t for t in tasks if t not in done]
+                # Remove completed tasks from the list
+                tasks = [t for t in tasks if t not in done]
 
-                # Yield control back to the event loop
-                await asyncio.sleep(0)
+            # Yield control back to the event loop
+            await asyncio.sleep(0)
 
         # Ensure all remaining tasks finish before stopping
         await asyncio.gather(*tasks)
@@ -222,6 +228,7 @@ async def handle_job(self, session: ClientSession, job: dict):
         Process an individual job. This function is run concurrently for multiple jobs.
         """
         with tracer.start_as_current_span("JobScaler.handle_job") as span:
+            span.set_attributes(worker_attributes)
             span.set_attribute("job.id", job.get("id"))
             span.set_attribute("request_id", job.get("id"))  # legacy
 

From b3fae8f130707b32a6a0500f9013847e38c8d294 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Thu, 31 Oct 2024 09:27:46 -0700
Subject: [PATCH 07/53] tmp: correction

---
 runpod/serverless/modules/rp_scale.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index 6dd9aa8d..f9cc6956 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -151,14 +151,16 @@ async def get_jobs(self, session: ClientSession):
                     acquired_jobs = await asyncio.wait_for(
                         get_job(session, jobs_needed), timeout=30
                     )
-                    span.add_event(
-                        "acquired jobs", {"jobs.acquired": len(acquired_jobs)}
-                    )
 
                     if not acquired_jobs:
+                        span.add_event("acquired no jobs", {"jobs.acquired": 0})
                         log.debug("JobScaler.get_jobs | No jobs acquired.")
                         continue
 
+                    span.add_event(
+                        "acquired jobs", {"jobs.acquired": len(acquired_jobs)}
+                    )
+
                     for job in acquired_jobs:
                         await job_list.add_job(job)
 
@@ -169,14 +171,18 @@ async def get_jobs(self, session: ClientSession):
 
                 except TooManyRequests as error:
                     span.record_exception(error)
-                    log.debug(f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds.")
+                    log.debug(
+                        f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds."
+                    )
                     await asyncio.sleep(5)  # debounce for 5 seconds
                 except asyncio.CancelledError as error:
                     span.record_exception(error)
                     log.debug("JobScaler.get_jobs | Request was cancelled.")
                 except TimeoutError as error:
                     span.record_exception(error)
-                    log.debug("JobScaler.get_jobs | Job acquisition timed out. Retrying.")
+                    log.debug(
+                        "JobScaler.get_jobs | Job acquisition timed out. Retrying."
+                    )
                 except TypeError as error:
                     span.record_exception(error)
                     log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.")

From 6f0efac14bf5050cc1d78c11a5c8325aa1f6794e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 3 Nov 2024 13:05:40 -0800
Subject: [PATCH 08/53] tmp: only trace http requests from http_client.py

---
 runpod/http_client.py | 15 ++++++++++-----
 runpod/otel.py        | 14 ++++----------
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/runpod/http_client.py b/runpod/http_client.py
index ab838094..9fb2924c 100644
--- a/runpod/http_client.py
+++ b/runpod/http_client.py
@@ -1,15 +1,19 @@
 """
-HTTP Client abstractions
+HTTP Client abstractions with OpenTelemetry tracing support.
 """
 
 import os
-
 import requests
 from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError
+from opentelemetry import trace
+from opentelemetry.instrumentation.aiohttp_client import create_trace_config
+from opentelemetry.instrumentation.requests import RequestsInstrumentor
 
 from .cli.groups.config.functions import get_credentials
 from .user_agent import USER_AGENT
 
+tracer = trace.get_tracer(__name__)
+
 
 class TooManyRequests(ClientResponseError):
     pass
@@ -34,13 +38,13 @@ def get_auth_header():
 
 def AsyncClientSession(*args, **kwargs):
     """
-    Deprecation from aiohttp.ClientSession forbids inheritance.
-    This is now a factory method
+    Factory method for an async client session with OpenTelemetry tracing.
     """
     return ClientSession(
         connector=TCPConnector(limit=0),
         headers=get_auth_header(),
         timeout=ClientTimeout(600, ceil_threshold=400),
+        trace_configs=[create_trace_config()],
         *args,
         **kwargs,
     )
@@ -49,4 +53,5 @@ def AsyncClientSession(*args, **kwargs):
 class SyncClientSession(requests.Session):
     def __init__(self):
         super().__init__()
-        self.headers.update({"User-Agent": USER_AGENT,})
+        self.headers.update(get_auth_header())
+        RequestsInstrumentor().instrument_session(self)
diff --git a/runpod/otel.py b/runpod/otel.py
index c328f05c..c7b25d7e 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -7,14 +7,14 @@
 from opentelemetry.sdk.resources import (
     Resource,
     SERVICE_NAME,
+    SERVICE_NAMESPACE,
+    SERVICE_INSTANCE_ID,
     SERVICE_VERSION,
     HOST_NAME,
 
 )
 
-from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
 from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor
-from opentelemetry.instrumentation.requests import RequestsInstrumentor
 from opentelemetry.instrumentation.threading import ThreadingInstrumentor
 from opentelemetry.instrumentation.urllib3 import URLLib3Instrumentor
 
@@ -27,6 +27,8 @@
             {
                 "application": "runpod-serverless",
                 SERVICE_NAME: "runpod-python-sdk",
+                SERVICE_NAMESPACE: os.getenv("RUNPOD_ENDPOINT_ID"),
+                SERVICE_INSTANCE_ID: os.getenv("RUNPOD_POD_ID"),
                 SERVICE_VERSION: runpod_version,
                 HOST_NAME: os.getenv("RUNPOD_POD_HOSTNAME"),
             }
@@ -53,11 +55,3 @@
 
 # --- asyncio --- #
 AsyncioInstrumentor().instrument()
-
-
-# --- requests --- #
-RequestsInstrumentor().instrument()
-
-
-# --- aiohttp --- #
-AioHttpClientInstrumentor().instrument()

From 4a96d0fe6e7867499e750133fac4d239b669be91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 3 Nov 2024 14:19:21 -0800
Subject: [PATCH 09/53] tmp: trace to connect job queues, progress, handling,
 and reporting

---
 runpod/http_client.py                 |  2 +-
 runpod/serverless/modules/rp_http.py  | 60 +++++++++++------------
 runpod/serverless/modules/rp_job.py   | 14 +++---
 runpod/serverless/modules/rp_scale.py | 70 ++++++++++-----------------
 4 files changed, 64 insertions(+), 82 deletions(-)

diff --git a/runpod/http_client.py b/runpod/http_client.py
index 9fb2924c..97e98829 100644
--- a/runpod/http_client.py
+++ b/runpod/http_client.py
@@ -54,4 +54,4 @@ class SyncClientSession(requests.Session):
     def __init__(self):
         super().__init__()
         self.headers.update(get_auth_header())
-        RequestsInstrumentor().instrument_session(self)
+        RequestsInstrumentor().instrument(session=self)
\ No newline at end of file
diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
index 26925e47..be5e640a 100644
--- a/runpod/serverless/modules/rp_http.py
+++ b/runpod/serverless/modules/rp_http.py
@@ -5,7 +5,7 @@
 import json
 import os
 
-from opentelemetry.trace import get_tracer
+from opentelemetry.trace import get_tracer, SpanKind
 from aiohttp import ClientError
 from aiohttp_retry import FibonacciRetry, RetryClient
 
@@ -46,9 +46,8 @@ async def _transmit(client_session: ClientSession, url, job_data):
         "raise_for_status": True,
     }
 
-    with tracer.start_as_current_span("rp_http.transmit"):
-        async with retry_client.post(url, **kwargs) as client_response:
-            await client_response.text()
+    async with retry_client.post(url, **kwargs) as client_response:
+        await client_response.text()
 
 
 async def _handle_result(
@@ -57,47 +56,48 @@ async def _handle_result(
     """
     A helper function to handle the result, either for sending or streaming.
     """
-    try:
-        session.headers["X-Request-ID"] = job["id"]  # legacy
+    with tracer.start_as_current_span("handle_result", kind=SpanKind.INTERNAL) as span:
+        span.set_attribute("request_id", job.get("id"))
 
-        serialized_job_data = json.dumps(job_data, ensure_ascii=False)
+        try:
+            serialized_job_data = json.dumps(job_data, ensure_ascii=False)
 
-        is_stream = "true" if is_stream else "false"
-        url = url_template.replace("$ID", job["id"]) + f"&isStream={is_stream}"
+            is_stream = "true" if is_stream else "false"
+            url = url_template.replace("$ID", job["id"]) + f"&isStream={is_stream}"
 
-        await _transmit(session, url, serialized_job_data)
-        log.debug(f"{log_message}", job["id"])
+            await _transmit(session, url, serialized_job_data)
+            log.debug(f"{log_message}", job["id"])
 
-    except ClientError as err:
-        log.error(f"Failed to return job results. | {err}", job["id"])
+        except ClientError as err:
+            span.record_exception(err)
+            log.error(f"Failed to return job results. | {err}", job["id"])
 
-    except (TypeError, RuntimeError) as err:
-        log.error(f"Error while returning job result. | {err}", job["id"])
+        except (TypeError, RuntimeError) as err:
+            span.record_exception(err)
+            log.error(f"Error while returning job result. | {err}", job["id"])
 
-    finally:
-        # job_data status is used for local development with FastAPI
-        if (
-            url_template == JOB_DONE_URL
-            and job_data.get("status", None) != "IN_PROGRESS"
-        ):
-            log.info("Finished.", job["id"])
+        finally:
+            # job_data status is used for local development with FastAPI
+            if (
+                url_template == JOB_DONE_URL
+                and job_data.get("status", None) != "IN_PROGRESS"
+            ):
+                log.info("Finished.", job["id"])
 
 
 async def send_result(session, job_data, job, is_stream=False):
     """
     Return the job results.
     """
-    with tracer.start_as_current_span("rp_http.send_result"):
-        await _handle_result(
-            session, job_data, job, JOB_DONE_URL, "Results sent.", is_stream=is_stream
-        )
+    await _handle_result(
+        session, job_data, job, JOB_DONE_URL, "Results sent.", is_stream=is_stream
+    )
 
 
 async def stream_result(session, job_data, job):
     """
     Return the stream job results.
     """
-    with tracer.start_as_current_span("rp_http.stream_result"):
-        await _handle_result(
-            session, job_data, job, JOB_STREAM_URL, "Intermediate results sent."
-        )
+    await _handle_result(
+        session, job_data, job, JOB_STREAM_URL, "Intermediate results sent."
+    )
diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index c244c22e..e6b729b8 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -7,7 +7,7 @@
 import os
 import traceback
 from typing import Any, AsyncGenerator, Callable, Dict, Optional, Union, List
-from opentelemetry.trace import get_tracer
+from opentelemetry.trace import get_tracer, SpanKind
 
 from runpod.http_client import ClientSession, TooManyRequests
 from runpod.serverless.modules.rp_logger import RunPodLogger
@@ -107,11 +107,10 @@ async def get_job(
             return jobs
 
 
-async def handle_job(session: ClientSession, config: Dict[str, Any], job) -> dict:
+async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) -> dict:
     if is_generator(config["handler"]):
         is_stream = True
         generator_output = run_job_generator(config["handler"], job)
-        log.debug("Handler is a generator, streaming results.", job["id"])
 
         job_result = {"output": []}
         async for stream_output in generator_output:
@@ -162,9 +161,8 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]:
     log.info("Started.", job["id"])
     run_result = {}
 
-    with tracer.start_as_current_span("rp_job.run_job") as span:
-        span.set_attribute("job.id", job.get("id"))
-        span.set_attribute("request_id", job.get("id"))  # legacy
+    with tracer.start_as_current_span("run_job", kind=SpanKind.INTERNAL) as span:
+        span.set_attribute("request_id", job.get("id"))
 
         try:
             handler_return = handler(job)
@@ -231,7 +229,9 @@ async def run_job_generator(
         job["id"],
     )
 
-    with tracer.start_as_current_span("rp_job.run_job_generator") as span:
+    with tracer.start_as_current_span("run_job_generator", kind=SpanKind.INTERNAL) as span:
+        span.set_attribute("request_id", job.get("id"))
+
         try:
             job_output = handler(job)
 
diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index f9cc6956..34c10a1d 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -7,7 +7,8 @@
 import os
 import signal
 from typing import Any, Dict
-from opentelemetry.trace import get_tracer
+from uuid import uuid1  # traceable to machine's MAC address + timestamp
+from opentelemetry.trace import get_tracer, SpanKind
 
 from ...http_client import AsyncClientSession, ClientSession, TooManyRequests
 from .rp_job import get_job, handle_job
@@ -20,13 +21,6 @@
 tracer = get_tracer(__name__)
 
 
-worker_attributes = {
-    "worker.hostname": os.getenv("RUNPOD_POD_HOSTNAME", "unknown"),
-    "worker.id": os.getenv("RUNPOD_POD_ID", "unknown"),
-    "endpoint.id": os.getenv("RUNPOD_ENDPOINT_ID", "unknown"),
-}
-
-
 def _default_concurrency_modifier(current_concurrency: int) -> int:
     """
     Default concurrency modifier.
@@ -130,22 +124,20 @@ async def get_jobs(self, session: ClientSession):
 
             jobs_needed = self.current_concurrency - job_progress.get_job_count()
             if jobs_needed <= 0:
-                log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.")
+                log.debug("Queue is full. Retrying soon.")
                 await asyncio.sleep(1)  # don't go rapidly
                 continue
 
-            with tracer.start_as_current_span("JobScaler.get_jobs") as span:
-                span.set_attributes(worker_attributes)
+            with tracer.start_as_current_span("get_jobs", kind=SpanKind.CLIENT) as span:
+                span.set_attribute("batch_id", uuid1().hex)
 
                 try:
-                    span.add_event(
-                        "getting jobs",
-                        {
-                            "jobs.current_concurrency": self.current_concurrency,
-                            "jobs.in_progress": job_progress.get_job_count(),
-                            "jobs.needed": jobs_needed,
-                        },
-                    )
+                    # TODO: metrics
+                    # {
+                    #     "jobs.current_concurrency": self.current_concurrency,
+                    #     "jobs.in_progress": job_progress.get_job_count(),
+                    #     "jobs.needed": jobs_needed,
+                    # }
 
                     # Keep the connection to the blocking call up to 30 seconds
                     acquired_jobs = await asyncio.wait_for(
@@ -153,39 +145,30 @@ async def get_jobs(self, session: ClientSession):
                     )
 
                     if not acquired_jobs:
-                        span.add_event("acquired no jobs", {"jobs.acquired": 0})
-                        log.debug("JobScaler.get_jobs | No jobs acquired.")
+                        span.add_event("No jobs acquired")
+                        log.debug("No jobs acquired")
                         continue
 
-                    span.add_event(
-                        "acquired jobs", {"jobs.acquired": len(acquired_jobs)}
-                    )
+                    span.set_attribute("jobs_acquired_count", len(acquired_jobs))
 
                     for job in acquired_jobs:
-                        await job_list.add_job(job)
+                        with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span:
+                            job_span.set_attribute("request_id", job.get("id"))
+                            await job_list.add_job(job)
 
-                    span.add_event(
-                        "queued jobs", {"jobs.in_queue", job_list.get_job_count()}
-                    )
+                    # TODO: metrics {"jobs.queued", job_list.get_job_count()}
                     log.info(f"Jobs in queue: {job_list.get_job_count()}")
 
                 except TooManyRequests as error:
-                    span.record_exception(error)
-                    log.debug(
-                        f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds."
-                    )
+                    span.add_event("Too many requests. Debounce for 5 seconds.")
                     await asyncio.sleep(5)  # debounce for 5 seconds
                 except asyncio.CancelledError as error:
-                    span.record_exception(error)
-                    log.debug("JobScaler.get_jobs | Request was cancelled.")
+                    span.add_event("Request was cancelled")
                 except TimeoutError as error:
-                    span.record_exception(error)
-                    log.debug(
-                        "JobScaler.get_jobs | Job acquisition timed out. Retrying."
-                    )
+                    span.add_event("Job acquisition timed out")
                 except TypeError as error:
+                    # worker waking up produces a JSON error here
                     span.record_exception(error)
-                    log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.")
                 except Exception as error:
                     span.record_exception(error)
                     log.error(
@@ -214,6 +197,7 @@ async def run_jobs(self, session: ClientSession):
 
             # Wait for any job to finish
             if tasks:
+                # TODO: metrics {"jobs.in_progress", len(tasks)}
                 log.info(f"Jobs in progress: {len(tasks)}")
 
                 done, pending = await asyncio.wait(
@@ -233,18 +217,16 @@ async def handle_job(self, session: ClientSession, job: dict):
         """
         Process an individual job. This function is run concurrently for multiple jobs.
         """
-        with tracer.start_as_current_span("JobScaler.handle_job") as span:
-            span.set_attributes(worker_attributes)
-            span.set_attribute("job.id", job.get("id"))
-            span.set_attribute("request_id", job.get("id"))  # legacy
+        with tracer.start_as_current_span("handle_job", kind=SpanKind.CONSUMER) as span:
+            span.set_attribute("request_id", job.get("id"))
 
-            log.debug(f"JobScaler.handle_job | {job}")
             job_progress.add(job)
 
             try:
                 await handle_job(session, self.config, job)
 
                 if self.config.get("refresh_worker", False):
+                    span.add_event("refresh_worker")
                     self.kill_worker()
 
             except Exception as err:

From 83dc31b274728703be292dddf5d567a9aabd37d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 3 Nov 2024 15:05:24 -0800
Subject: [PATCH 10/53] tmp: remove unused instrumentations

---
 requirements.txt |  3 ---
 runpod/otel.py   | 15 ---------------
 2 files changed, 18 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 261bb588..f7bfe3b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,4 @@ setuptools==65.6.3
 opentelemetry-sdk
 opentelemetry-exporter-otlp
 opentelemetry-instrumentation-aiohttp-client
-opentelemetry-instrumentation-asyncio
 opentelemetry-instrumentation-requests
-opentelemetry-instrumentation-threading
-opentelemetry-instrumentation-urllib3
diff --git a/runpod/otel.py b/runpod/otel.py
index c7b25d7e..c2f21831 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -14,9 +14,6 @@
 
 )
 
-from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor
-from opentelemetry.instrumentation.threading import ThreadingInstrumentor
-from opentelemetry.instrumentation.urllib3 import URLLib3Instrumentor
 
 from runpod.version import __version__ as runpod_version
 
@@ -43,15 +40,3 @@
 
 if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"):
     tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
-
-
-# --- threading --- #
-ThreadingInstrumentor().instrument()
-
-
-# --- urllib3 --- #
-URLLib3Instrumentor().instrument()
-
-
-# --- asyncio --- #
-AsyncioInstrumentor().instrument()

From 25762a6c092acedc0b53217a3ac47e9ad3f379a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 3 Nov 2024 15:06:20 -0800
Subject: [PATCH 11/53] tmp: handle_job is child to queue_job

---
 runpod/serverless/modules/rp_http.py  | 2 +-
 runpod/serverless/modules/rp_scale.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
index be5e640a..8d7ed972 100644
--- a/runpod/serverless/modules/rp_http.py
+++ b/runpod/serverless/modules/rp_http.py
@@ -56,7 +56,7 @@ async def _handle_result(
     """
     A helper function to handle the result, either for sending or streaming.
     """
-    with tracer.start_as_current_span("handle_result", kind=SpanKind.INTERNAL) as span:
+    with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span:
         span.set_attribute("request_id", job.get("id"))
 
         try:
diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index 34c10a1d..4df00456 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -154,6 +154,7 @@ async def get_jobs(self, session: ClientSession):
                     for job in acquired_jobs:
                         with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span:
                             job_span.set_attribute("request_id", job.get("id"))
+                            job["trace"] = job_span.get_span_context()
                             await job_list.add_job(job)
 
                     # TODO: metrics {"jobs.queued", job_list.get_job_count()}
@@ -217,7 +218,7 @@ async def handle_job(self, session: ClientSession, job: dict):
         """
         Process an individual job. This function is run concurrently for multiple jobs.
         """
-        with tracer.start_as_current_span("handle_job", kind=SpanKind.CONSUMER) as span:
+        with tracer.start_as_current_span("handle_job", context=job.get("trace"), kind=SpanKind.CONSUMER) as span:
             span.set_attribute("request_id", job.get("id"))
 
             job_progress.add(job)

From 97dbb78ad160db651e7ef3feccbac71f0aa5ccf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 3 Nov 2024 15:13:42 -0800
Subject: [PATCH 12/53] tmp: disable http_client tracing temporarily

---
 runpod/http_client.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/runpod/http_client.py b/runpod/http_client.py
index 97e98829..8bf6dca2 100644
--- a/runpod/http_client.py
+++ b/runpod/http_client.py
@@ -6,8 +6,8 @@
 import requests
 from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError
 from opentelemetry import trace
-from opentelemetry.instrumentation.aiohttp_client import create_trace_config
-from opentelemetry.instrumentation.requests import RequestsInstrumentor
+# from opentelemetry.instrumentation.aiohttp_client import create_trace_config
+# from opentelemetry.instrumentation.requests import RequestsInstrumentor
 
 from .cli.groups.config.functions import get_credentials
 from .user_agent import USER_AGENT
@@ -44,7 +44,7 @@ def AsyncClientSession(*args, **kwargs):
         connector=TCPConnector(limit=0),
         headers=get_auth_header(),
         timeout=ClientTimeout(600, ceil_threshold=400),
-        trace_configs=[create_trace_config()],
+        # trace_configs=[create_trace_config()],
         *args,
         **kwargs,
     )
@@ -54,4 +54,4 @@ class SyncClientSession(requests.Session):
     def __init__(self):
         super().__init__()
         self.headers.update(get_auth_header())
-        RequestsInstrumentor().instrument(session=self)
\ No newline at end of file
+        # RequestsInstrumentor().instrument(session=self)
\ No newline at end of file

From f0f6997af593f349917a493dc0c8611e4407fb1c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 3 Nov 2024 15:24:46 -0800
Subject: [PATCH 13/53] tmp: job.get("trace").get_span_context()

---
 runpod/serverless/modules/rp_scale.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index 4df00456..744d1332 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -154,7 +154,7 @@ async def get_jobs(self, session: ClientSession):
                     for job in acquired_jobs:
                         with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span:
                             job_span.set_attribute("request_id", job.get("id"))
-                            job["trace"] = job_span.get_span_context()
+                            job["trace"] = job_span
                             await job_list.add_job(job)
 
                     # TODO: metrics {"jobs.queued", job_list.get_job_count()}
@@ -218,7 +218,8 @@ async def handle_job(self, session: ClientSession, job: dict):
         """
         Process an individual job. This function is run concurrently for multiple jobs.
         """
-        with tracer.start_as_current_span("handle_job", context=job.get("trace"), kind=SpanKind.CONSUMER) as span:
+        context = job.get("trace").get_span_context()
+        with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span:
             span.set_attribute("request_id", job.get("id"))
 
             job_progress.add(job)

From fec5a53eba21c12a85feb976eda5fec9965db1a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sun, 3 Nov 2024 15:52:33 -0800
Subject: [PATCH 14/53] tmp: correction to tracer span context

---
 runpod/serverless/modules/rp_scale.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index 744d1332..e16c8036 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -8,7 +8,7 @@
 import signal
 from typing import Any, Dict
 from uuid import uuid1  # traceable to machine's MAC address + timestamp
-from opentelemetry.trace import get_tracer, SpanKind
+from opentelemetry.trace import get_tracer, SpanKind, set_span_in_context, NonRecordingSpan
 
 from ...http_client import AsyncClientSession, ClientSession, TooManyRequests
 from .rp_job import get_job, handle_job
@@ -154,7 +154,7 @@ async def get_jobs(self, session: ClientSession):
                     for job in acquired_jobs:
                         with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span:
                             job_span.set_attribute("request_id", job.get("id"))
-                            job["trace"] = job_span
+                            job["context"] = job_span.get_span_context()
                             await job_list.add_job(job)
 
                     # TODO: metrics {"jobs.queued", job_list.get_job_count()}
@@ -218,7 +218,8 @@ async def handle_job(self, session: ClientSession, job: dict):
         """
         Process an individual job. This function is run concurrently for multiple jobs.
         """
-        context = job.get("trace").get_span_context()
+        context = set_span_in_context(NonRecordingSpan(job["context"]))
+
         with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span:
             span.set_attribute("request_id", job.get("id"))
 

From b3bced1d6b91b7e7686c879f5599a99c018bf9cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 4 Nov 2024 16:42:58 -0800
Subject: [PATCH 15/53] tmp: custom runpod namespace for process tags

---
 runpod/otel.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/runpod/otel.py b/runpod/otel.py
index c2f21831..6672c3ad 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -11,9 +11,10 @@
     SERVICE_INSTANCE_ID,
     SERVICE_VERSION,
     HOST_NAME,
-
 )
 
+RUNPOD_ENDPOINT_ID = "runpod.endpoint_id"
+RUNPOD_POD_ID = "runpod.pod_id"
 
 from runpod.version import __version__ as runpod_version
 
@@ -25,7 +26,9 @@
                 "application": "runpod-serverless",
                 SERVICE_NAME: "runpod-python-sdk",
                 SERVICE_NAMESPACE: os.getenv("RUNPOD_ENDPOINT_ID"),
+                RUNPOD_ENDPOINT_ID: os.getenv("RUNPOD_ENDPOINT_ID"),
                 SERVICE_INSTANCE_ID: os.getenv("RUNPOD_POD_ID"),
+                RUNPOD_POD_ID: os.getenv("RUNPOD_POD_ID"),
                 SERVICE_VERSION: runpod_version,
                 HOST_NAME: os.getenv("RUNPOD_POD_HOSTNAME"),
             }

From d19ac103239afa1c7bd2b7c1f9a1d997ea760b32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 4 Nov 2024 16:43:11 -0800
Subject: [PATCH 16/53] tmp: cleanup by black format

---
 runpod/serverless/modules/rp_scale.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index e16c8036..ef7b24ae 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -4,11 +4,15 @@
 """
 
 import asyncio
-import os
 import signal
 from typing import Any, Dict
 from uuid import uuid1  # traceable to machine's MAC address + timestamp
-from opentelemetry.trace import get_tracer, SpanKind, set_span_in_context, NonRecordingSpan
+from opentelemetry.trace import (
+    get_tracer,
+    set_span_in_context,
+    SpanKind,
+    NonRecordingSpan,
+)
 
 from ...http_client import AsyncClientSession, ClientSession, TooManyRequests
 from .rp_job import get_job, handle_job
@@ -152,7 +156,9 @@ async def get_jobs(self, session: ClientSession):
                     span.set_attribute("jobs_acquired_count", len(acquired_jobs))
 
                     for job in acquired_jobs:
-                        with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span:
+                        with tracer.start_as_current_span(
+                            "queue_job", kind=SpanKind.PRODUCER
+                        ) as job_span:
                             job_span.set_attribute("request_id", job.get("id"))
                             job["context"] = job_span.get_span_context()
                             await job_list.add_job(job)
@@ -220,7 +226,9 @@ async def handle_job(self, session: ClientSession, job: dict):
         """
         context = set_span_in_context(NonRecordingSpan(job["context"]))
 
-        with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span:
+        with tracer.start_as_current_span(
+            "handle_job", context=context, kind=SpanKind.CONSUMER
+        ) as span:
             span.set_attribute("request_id", job.get("id"))
 
             job_progress.add(job)

From c21cfe51918e97f622817d3289c23e4404574d66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 4 Nov 2024 16:59:01 -0800
Subject: [PATCH 17/53] tmp: otel tracing sls-core hooks

---
 runpod/serverless/core.py            | 156 ++++++++++++++++-----------
 runpod/serverless/modules/rp_http.py |   1 +
 2 files changed, 96 insertions(+), 61 deletions(-)

diff --git a/runpod/serverless/core.py b/runpod/serverless/core.py
index 657dbe64..43e8c4fc 100644
--- a/runpod/serverless/core.py
+++ b/runpod/serverless/core.py
@@ -9,12 +9,20 @@
 import typing
 from ctypes import CDLL, byref, c_char_p, c_int
 from typing import Any, Callable, Dict, List, Optional
+from uuid import uuid1  # traceable to machine's MAC address + timestamp
+from opentelemetry.trace import (
+    get_tracer,
+    set_span_in_context,
+    SpanKind,
+    NonRecordingSpan,
+)
 
 from runpod.serverless.modules import rp_job
 from runpod.serverless.modules.rp_logger import RunPodLogger
 from runpod.version import __version__ as runpod_version
 
 log = RunPodLogger()
+tracer = get_tracer(__name__)
 
 # _runpod_sls_get_jobs status codes
 STILL_WAITING = 0 
@@ -188,32 +196,40 @@ async def stream_output(self, job_id: str, job_output: bytes) -> bool:
         """
         send part of a streaming result to AI-API.
         """
-        json_data = self._json_serialize_job_data(job_output)
-        id_bytes = job_id.encode("utf-8")
-        return bool(
-            self._stream_output(
-                c_char_p(id_bytes),
-                c_int(len(id_bytes)),
-                c_char_p(json_data),
-                c_int(len(json_data)),
+        with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span:
+            span.set_attribute("request_id", job_id)
+            span.set_attribute("is_stream", True)
+
+            json_data = self._json_serialize_job_data(job_output)
+            id_bytes = job_id.encode("utf-8")
+            return bool(
+                self._stream_output(
+                    c_char_p(id_bytes),
+                    c_int(len(id_bytes)),
+                    c_char_p(json_data),
+                    c_int(len(json_data)),
+                )
             )
-        )
 
     def post_output(self, job_id: str, job_output: bytes) -> bool:
         """
         send the result of a job to AI-API.
         Returns True if the task was successfully stored, False otherwise.
         """
-        json_data = self._json_serialize_job_data(job_output)
-        id_bytes = job_id.encode("utf-8")
-        return bool(
-            self._post_output(
-                c_char_p(id_bytes),
-                c_int(len(id_bytes)),
-                c_char_p(json_data),
-                c_int(len(json_data)),
+        with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span:
+            span.set_attribute("request_id", job_id)
+            span.set_attribute("is_stream", False)
+
+            json_data = self._json_serialize_job_data(job_output)
+            id_bytes = job_id.encode("utf-8")
+            return bool(
+                self._post_output(
+                    c_char_p(id_bytes),
+                    c_int(len(id_bytes)),
+                    c_char_p(json_data),
+                    c_int(len(json_data)),
+                )
             )
-        )
 
     def finish_stream(self, job_id: str) -> bool:
         """
@@ -225,46 +241,53 @@ def finish_stream(self, job_id: str) -> bool:
 
 # -------------------------------- Process Job ------------------------------- #
 async def _process_job(
-    config: Dict[str, Any], job: Dict[str, Any], hook
+    config: Dict[str, Any], job: Dict[str, Any], hook: Hook
 ) -> Dict[str, Any]:
     """Process a single job."""
     handler = config["handler"]
 
     result = {}
-    try:
-        if inspect.isgeneratorfunction(handler) or inspect.isasyncgenfunction(handler):
-            log.debug("SLS Core | Running job as a generator.")
-            generator_output = rp_job.run_job_generator(handler, job)
-            aggregated_output: dict[str, typing.Any] = {"output": []}
 
-            async for part in generator_output:
-                log.trace(f"SLS Core | Streaming output: {part}", job["id"])
+    context = set_span_in_context(NonRecordingSpan(job["context"]))
 
-                if "error" in part:
-                    aggregated_output = part
-                    break
-                if config.get("return_aggregate_stream", False):
-                    aggregated_output["output"].append(part["output"])
+    with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span:
+        span.set_attribute("request_id", job.get("id"))
 
-                await hook.stream_output(job["id"], part)
+        try:
+            if inspect.isgeneratorfunction(handler) or inspect.isasyncgenfunction(handler):
+                log.debug("SLS Core | Running job as a generator.")
+                generator_output = rp_job.run_job_generator(handler, job)
+                aggregated_output: dict[str, typing.Any] = {"output": []}
 
-            log.debug("SLS Core | Finished streaming output.", job["id"])
-            hook.finish_stream(job["id"])
-            result = aggregated_output
+                async for part in generator_output:
+                    log.trace(f"SLS Core | Streaming output: {part}", job["id"])
 
-        else:
-            log.debug("SLS Core | Running job as a standard function.")
-            result = await rp_job.run_job(handler, job)
-            result = result.get("output", result)
+                    if "error" in part:
+                        aggregated_output = part
+                        break
+                    if config.get("return_aggregate_stream", False):
+                        aggregated_output["output"].append(part["output"])
 
-    except Exception as err:  # pylint: disable=broad-except
-        log.error(f"SLS Core | Error running job: {err}", job["id"])
-        result = {"error": str(err)}
+                    await hook.stream_output(job["id"], part)
 
-    finally:
-        log.debug(f"SLS Core | Posting output: {result}", job["id"])
-        hook.post_output(job["id"], result)
-        return result
+                log.debug("SLS Core | Finished streaming output.", job["id"])
+                hook.finish_stream(job["id"])
+                result = aggregated_output
+
+            else:
+                log.debug("SLS Core | Running job as a standard function.")
+                result = await rp_job.run_job(handler, job)
+                result = result.get("output", result)
+
+        except Exception as err:  # pylint: disable=broad-except
+            span.record_exception(err)
+            log.error(f"SLS Core | Error running job: {err}", job["id"])
+            result = {"error": str(err)}
+
+        finally:
+            log.debug(f"SLS Core | Posting output: {result}", job["id"])
+            hook.post_output(job["id"], result)
+            return result
 
 
 # ---------------------------------------------------------------------------- #
@@ -282,25 +305,36 @@ async def run(config: Dict[str, Any]) -> None:
 
     serverless_hook = Hook()
     while True:
-        try:
-            jobs = serverless_hook.get_jobs(max_concurrency, max_jobs)
-        except SlsCoreError as err:
-            log.error(f"SLS Core | Error getting jobs: {err}")
-            await asyncio.sleep(0.2) # sleep for a bit before trying again
-            continue
+        with tracer.start_as_current_span("get_jobs", kind=SpanKind.CLIENT) as span:
+            span.set_attribute("runpod.sls_core_enabled", True)
+            span.set_attribute("batch_id", uuid1().hex)
 
-        if len(jobs) == 0 or jobs is None:
-            await asyncio.sleep(0)
-            continue
+            try:
+                jobs = serverless_hook.get_jobs(max_concurrency, max_jobs)
+            except SlsCoreError as err:
+                span.record_exception(err)
+                log.error(f"SLS Core | Error getting jobs: {err}")
+                await asyncio.sleep(0.2) # sleep for a bit before trying again
+                continue
+
+            if len(jobs) == 0 or jobs is None:
+                span.add_event("No jobs acquired")
+                await asyncio.sleep(0)
+                continue
+
+            span.set_attribute("jobs_acquired_count", len(jobs))
+
+            for job in jobs:
+                with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span:
+                    job_span.set_attribute("request_id", job.get("id"))
+                    job["context"] = job_span.get_span_context()
+                    asyncio.create_task(
+                        _process_job(config, job, serverless_hook), name=job["id"]
+                    )
+                    await asyncio.sleep(0)
 
-        for job in jobs:
-            asyncio.create_task(
-                _process_job(config, job, serverless_hook), name=job["id"]
-            )
             await asyncio.sleep(0)
 
-        await asyncio.sleep(0)
-
 
 def main(config: Dict[str, Any]) -> None:
     """Run the worker in an asyncio event loop."""
diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
index 8d7ed972..2b0b3343 100644
--- a/runpod/serverless/modules/rp_http.py
+++ b/runpod/serverless/modules/rp_http.py
@@ -58,6 +58,7 @@ async def _handle_result(
     """
     with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span:
         span.set_attribute("request_id", job.get("id"))
+        span.set_attribute("is_stream", is_stream)
 
         try:
             serialized_job_data = json.dumps(job_data, ensure_ascii=False)

From 9bac6229db5f61e21c2acca2da3700b2a09baab2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 4 Nov 2024 21:24:17 -0800
Subject: [PATCH 18/53] tmp: using decorators where appropriate

---
 runpod/serverless/core.py            |  71 +++++++--------
 runpod/serverless/modules/rp_http.py |  61 ++++++-------
 runpod/serverless/modules/rp_job.py  | 130 ++++++++++++++-------------
 3 files changed, 131 insertions(+), 131 deletions(-)

diff --git a/runpod/serverless/core.py b/runpod/serverless/core.py
index 43e8c4fc..9a1e9f62 100644
--- a/runpod/serverless/core.py
+++ b/runpod/serverless/core.py
@@ -8,21 +8,16 @@
 import pathlib
 import typing
 from ctypes import CDLL, byref, c_char_p, c_int
+from opentelemetry import trace
 from typing import Any, Callable, Dict, List, Optional
 from uuid import uuid1  # traceable to machine's MAC address + timestamp
-from opentelemetry.trace import (
-    get_tracer,
-    set_span_in_context,
-    SpanKind,
-    NonRecordingSpan,
-)
 
 from runpod.serverless.modules import rp_job
 from runpod.serverless.modules.rp_logger import RunPodLogger
 from runpod.version import __version__ as runpod_version
 
 log = RunPodLogger()
-tracer = get_tracer(__name__)
+tracer = trace.get_tracer(__name__)
 
 # _runpod_sls_get_jobs status codes
 STILL_WAITING = 0 
@@ -192,44 +187,46 @@ def progress_update(self, job_id: str, json_data: bytes) -> bool:
             )
         )
 
+    @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER)
     async def stream_output(self, job_id: str, job_output: bytes) -> bool:
         """
         send part of a streaming result to AI-API.
         """
-        with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span:
-            span.set_attribute("request_id", job_id)
-            span.set_attribute("is_stream", True)
-
-            json_data = self._json_serialize_job_data(job_output)
-            id_bytes = job_id.encode("utf-8")
-            return bool(
-                self._stream_output(
-                    c_char_p(id_bytes),
-                    c_int(len(id_bytes)),
-                    c_char_p(json_data),
-                    c_int(len(json_data)),
-                )
+        span = trace.get_current_span()
+        span.set_attribute("request_id", job_id)
+        span.set_attribute("is_stream", True)
+
+        json_data = self._json_serialize_job_data(job_output)
+        id_bytes = job_id.encode("utf-8")
+        return bool(
+            self._stream_output(
+                c_char_p(id_bytes),
+                c_int(len(id_bytes)),
+                c_char_p(json_data),
+                c_int(len(json_data)),
             )
+        )
 
+    @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER)
     def post_output(self, job_id: str, job_output: bytes) -> bool:
         """
         send the result of a job to AI-API.
         Returns True if the task was successfully stored, False otherwise.
         """
-        with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span:
-            span.set_attribute("request_id", job_id)
-            span.set_attribute("is_stream", False)
-
-            json_data = self._json_serialize_job_data(job_output)
-            id_bytes = job_id.encode("utf-8")
-            return bool(
-                self._post_output(
-                    c_char_p(id_bytes),
-                    c_int(len(id_bytes)),
-                    c_char_p(json_data),
-                    c_int(len(json_data)),
-                )
+        span = trace.get_current_span()
+        span.set_attribute("request_id", job_id)
+        span.set_attribute("is_stream", False)
+
+        json_data = self._json_serialize_job_data(job_output)
+        id_bytes = job_id.encode("utf-8")
+        return bool(
+            self._post_output(
+                c_char_p(id_bytes),
+                c_int(len(id_bytes)),
+                c_char_p(json_data),
+                c_int(len(json_data)),
             )
+        )
 
     def finish_stream(self, job_id: str) -> bool:
         """
@@ -248,9 +245,9 @@ async def _process_job(
 
     result = {}
 
-    context = set_span_in_context(NonRecordingSpan(job["context"]))
+    context = trace.set_span_in_context(trace.NonRecordingSpan(job["context"]))
 
-    with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span:
+    with tracer.start_as_current_span("handle_job", context=context, kind=trace.SpanKind.CONSUMER) as span:
         span.set_attribute("request_id", job.get("id"))
 
         try:
@@ -305,7 +302,7 @@ async def run(config: Dict[str, Any]) -> None:
 
     serverless_hook = Hook()
     while True:
-        with tracer.start_as_current_span("get_jobs", kind=SpanKind.CLIENT) as span:
+        with tracer.start_as_current_span("get_jobs", kind=trace.SpanKind.CLIENT) as span:
             span.set_attribute("runpod.sls_core_enabled", True)
             span.set_attribute("batch_id", uuid1().hex)
 
@@ -325,7 +322,7 @@ async def run(config: Dict[str, Any]) -> None:
             span.set_attribute("jobs_acquired_count", len(jobs))
 
             for job in jobs:
-                with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span:
+                with tracer.start_as_current_span("queue_job", kind=trace.SpanKind.PRODUCER) as job_span:
                     job_span.set_attribute("request_id", job.get("id"))
                     job["context"] = job_span.get_span_context()
                     asyncio.create_task(
diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
index 2b0b3343..2423cfa4 100644
--- a/runpod/serverless/modules/rp_http.py
+++ b/runpod/serverless/modules/rp_http.py
@@ -5,9 +5,9 @@
 import json
 import os
 
-from opentelemetry.trace import get_tracer, SpanKind
 from aiohttp import ClientError
 from aiohttp_retry import FibonacciRetry, RetryClient
+from opentelemetry import trace
 
 from runpod.http_client import ClientSession
 from runpod.serverless.modules.rp_logger import RunPodLogger
@@ -25,7 +25,7 @@
 JOB_STREAM_URL = JOB_STREAM_URL_TEMPLATE.replace("$RUNPOD_POD_ID", WORKER_ID)
 
 log = RunPodLogger()
-tracer = get_tracer(__name__)
+tracer = trace.get_tracer(__name__)
 
 
 async def _transmit(client_session: ClientSession, url, job_data):
@@ -50,40 +50,41 @@ async def _transmit(client_session: ClientSession, url, job_data):
         await client_response.text()
 
 
+@tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER)
 async def _handle_result(
     session: ClientSession, job_data, job, url_template, log_message, is_stream=False
 ):
     """
     A helper function to handle the result, either for sending or streaming.
     """
-    with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span:
-        span.set_attribute("request_id", job.get("id"))
-        span.set_attribute("is_stream", is_stream)
-
-        try:
-            serialized_job_data = json.dumps(job_data, ensure_ascii=False)
-
-            is_stream = "true" if is_stream else "false"
-            url = url_template.replace("$ID", job["id"]) + f"&isStream={is_stream}"
-
-            await _transmit(session, url, serialized_job_data)
-            log.debug(f"{log_message}", job["id"])
-
-        except ClientError as err:
-            span.record_exception(err)
-            log.error(f"Failed to return job results. | {err}", job["id"])
-
-        except (TypeError, RuntimeError) as err:
-            span.record_exception(err)
-            log.error(f"Error while returning job result. | {err}", job["id"])
-
-        finally:
-            # job_data status is used for local development with FastAPI
-            if (
-                url_template == JOB_DONE_URL
-                and job_data.get("status", None) != "IN_PROGRESS"
-            ):
-                log.info("Finished.", job["id"])
+    span = trace.get_current_span()
+    span.set_attribute("request_id", job.get("id"))
+    span.set_attribute("is_stream", is_stream)
+
+    try:
+        serialized_job_data = json.dumps(job_data, ensure_ascii=False)
+
+        is_stream = "true" if is_stream else "false"
+        url = url_template.replace("$ID", job["id"]) + f"&isStream={is_stream}"
+
+        await _transmit(session, url, serialized_job_data)
+        log.debug(f"{log_message}", job["id"])
+
+    except ClientError as err:
+        span.record_exception(err)
+        log.error(f"Failed to return job results. | {err}", job["id"])
+
+    except (TypeError, RuntimeError) as err:
+        span.record_exception(err)
+        log.error(f"Error while returning job result. | {err}", job["id"])
+
+    finally:
+        # job_data status is used for local development with FastAPI
+        if (
+            url_template == JOB_DONE_URL
+            and job_data.get("status", None) != "IN_PROGRESS"
+        ):
+            log.info("Finished.", job["id"])
 
 
 async def send_result(session, job_data, job, is_stream=False):
diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index e6b729b8..1da93fa1 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -6,8 +6,8 @@
 import json
 import os
 import traceback
+from opentelemetry import trace
 from typing import Any, AsyncGenerator, Callable, Dict, Optional, Union, List
-from opentelemetry.trace import get_tracer, SpanKind
 
 from runpod.http_client import ClientSession, TooManyRequests
 from runpod.serverless.modules.rp_logger import RunPodLogger
@@ -23,7 +23,7 @@
 
 log = RunPodLogger()
 job_progress = JobsProgress()
-tracer = get_tracer(__name__)
+tracer = trace.get_tracer(__name__)
 
 
 def _job_get_url(batch_size: int = 1):
@@ -147,6 +147,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
     await send_result(session, job_result, job, is_stream=is_stream)
 
 
+@tracer.start_as_current_span("run_job")
 async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]:
     """
     Run the job using the handler.
@@ -158,64 +159,65 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]:
     Returns:
         Dict[str, Any]: The result of running the job.
     """
+    span = trace.get_current_span()
+    span.set_attribute("request_id", job.get("id"))
+
     log.info("Started.", job["id"])
     run_result = {}
 
-    with tracer.start_as_current_span("run_job", kind=SpanKind.INTERNAL) as span:
-        span.set_attribute("request_id", job.get("id"))
-
-        try:
-            handler_return = handler(job)
-            job_output = (
-                await handler_return
-                if inspect.isawaitable(handler_return)
-                else handler_return
-            )
+    try:
+        handler_return = handler(job)
+        job_output = (
+            await handler_return
+            if inspect.isawaitable(handler_return)
+            else handler_return
+        )
 
-            log.debug(f"Handler output: {job_output}", job["id"])
+        log.debug(f"Handler output: {job_output}", job["id"])
 
-            if isinstance(job_output, dict):
-                error_msg = job_output.pop("error", None)
-                refresh_worker = job_output.pop("refresh_worker", None)
-                run_result["output"] = job_output
+        if isinstance(job_output, dict):
+            error_msg = job_output.pop("error", None)
+            refresh_worker = job_output.pop("refresh_worker", None)
+            run_result["output"] = job_output
 
-                if error_msg:
-                    run_result["error"] = error_msg
-                if refresh_worker:
-                    run_result["stopPod"] = True
+            if error_msg:
+                run_result["error"] = error_msg
+            if refresh_worker:
+                run_result["stopPod"] = True
 
-            elif isinstance(job_output, bool):
-                run_result = {"output": job_output}
+        elif isinstance(job_output, bool):
+            run_result = {"output": job_output}
 
-            else:
-                run_result = {"output": job_output}
+        else:
+            run_result = {"output": job_output}
 
-            if run_result.get("output") == {}:
-                run_result.pop("output")
+        if run_result.get("output") == {}:
+            run_result.pop("output")
 
-            check_return_size(run_result)  # Checks the size of the return body.
+        check_return_size(run_result)  # Checks the size of the return body.
 
-        except Exception as err:
-            span.record_exception(err)
-            error_info = {
-                "error_type": str(type(err)),
-                "error_message": str(err),
-                "error_traceback": traceback.format_exc(),
-                "hostname": os.environ.get("RUNPOD_POD_HOSTNAME", "unknown"),
-                "worker_id": os.environ.get("RUNPOD_POD_ID", "unknown"),
-                "runpod_version": runpod_version,
-            }
+    except Exception as err:
+        span.record_exception(err)
+        error_info = {
+            "error_type": str(type(err)),
+            "error_message": str(err),
+            "error_traceback": traceback.format_exc(),
+            "hostname": os.environ.get("RUNPOD_POD_HOSTNAME", "unknown"),
+            "worker_id": os.environ.get("RUNPOD_POD_ID", "unknown"),
+            "runpod_version": runpod_version,
+        }
 
-            log.error("Captured Handler Exception", job["id"])
-            log.error(json.dumps(error_info, indent=4))
-            run_result = {"error": json.dumps(error_info)}
+        log.error("Captured Handler Exception", job["id"])
+        log.error(json.dumps(error_info, indent=4))
+        run_result = {"error": json.dumps(error_info)}
 
-        finally:
-            log.debug(f"run_job return: {run_result}", job["id"])
+    finally:
+        log.debug(f"run_job return: {run_result}", job["id"])
 
     return run_result
 
 
+@tracer.start_as_current_span("run_job_generator")
 async def run_job_generator(
     handler: Callable, job: Dict[str, Any]
 ) -> AsyncGenerator[Dict[str, Union[str, Any]], None]:
@@ -223,30 +225,30 @@ async def run_job_generator(
     Run generator job used to stream output.
     Yields output partials from the generator.
     """
+    span = trace.get_current_span()
+    span.set_attribute("request_id", job.get("id"))
+
     is_async_gen = inspect.isasyncgenfunction(handler)
     log.debug(
         "Using Async Generator" if is_async_gen else "Using Standard Generator",
         job["id"],
     )
 
-    with tracer.start_as_current_span("run_job_generator", kind=SpanKind.INTERNAL) as span:
-        span.set_attribute("request_id", job.get("id"))
-
-        try:
-            job_output = handler(job)
-
-            if is_async_gen:
-                async for output_partial in job_output:
-                    log.debug(f"Async Generator output: {output_partial}", job["id"])
-                    yield {"output": output_partial}
-            else:
-                for output_partial in job_output:
-                    log.debug(f"Generator output: {output_partial}", job["id"])
-                    yield {"output": output_partial}
-
-        except Exception as err:
-            span.record_exception(err)
-            log.error(err, job["id"])
-            yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"}
-        finally:
-            log.info("Finished running generator.", job["id"])
+    try:
+        job_output = handler(job)
+
+        if is_async_gen:
+            async for output_partial in job_output:
+                log.debug(f"Async Generator output: {output_partial}", job["id"])
+                yield {"output": output_partial}
+        else:
+            for output_partial in job_output:
+                log.debug(f"Generator output: {output_partial}", job["id"])
+                yield {"output": output_partial}
+
+    except Exception as err:
+        span.record_exception(err)
+        log.error(err, job["id"])
+        yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"}
+    finally:
+        log.info("Finished running generator.", job["id"])

From 4939bb1f68e0d261c5a57e69d4bd3367875a85d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 4 Nov 2024 22:15:47 -0800
Subject: [PATCH 19/53] tmp: cleanup otel resource definition

---
 runpod/otel.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/runpod/otel.py b/runpod/otel.py
index 6672c3ad..ed94ea9e 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -7,10 +7,7 @@
 from opentelemetry.sdk.resources import (
     Resource,
     SERVICE_NAME,
-    SERVICE_NAMESPACE,
-    SERVICE_INSTANCE_ID,
     SERVICE_VERSION,
-    HOST_NAME,
 )
 
 RUNPOD_ENDPOINT_ID = "runpod.endpoint_id"
@@ -23,14 +20,10 @@
     TracerProvider(
         resource=Resource.create(
             {
-                "application": "runpod-serverless",
-                SERVICE_NAME: "runpod-python-sdk",
-                SERVICE_NAMESPACE: os.getenv("RUNPOD_ENDPOINT_ID"),
                 RUNPOD_ENDPOINT_ID: os.getenv("RUNPOD_ENDPOINT_ID"),
-                SERVICE_INSTANCE_ID: os.getenv("RUNPOD_POD_ID"),
                 RUNPOD_POD_ID: os.getenv("RUNPOD_POD_ID"),
+                SERVICE_NAME: "runpod-python-sdk",
                 SERVICE_VERSION: runpod_version,
-                HOST_NAME: os.getenv("RUNPOD_POD_HOSTNAME"),
             }
         )
     )

From d14decd2238b06a46273d123c88a24f332d7db45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 5 Nov 2024 02:00:27 -0800
Subject: [PATCH 20/53] tmp: trace pings

---
 runpod/serverless/modules/rp_ping.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/runpod/serverless/modules/rp_ping.py b/runpod/serverless/modules/rp_ping.py
index 88fa1049..6575b94a 100644
--- a/runpod/serverless/modules/rp_ping.py
+++ b/runpod/serverless/modules/rp_ping.py
@@ -8,6 +8,7 @@
 import time
 
 import requests
+from opentelemetry import trace
 from urllib3.util.retry import Retry
 
 from runpod.http_client import SyncClientSession
@@ -17,6 +18,7 @@
 
 log = RunPodLogger()
 jobs = JobsProgress()  # Contains the list of jobs that are currently running.
+tracer = trace.get_tracer(__name__)
 
 
 class Heartbeat:
@@ -83,6 +85,7 @@ def ping_loop(self, test=False):
             if test:
                 return
 
+    @tracer.start_as_current_span("send_ping")
     def _send_ping(self):
         """
         Sends a heartbeat to the Runpod server.
@@ -90,6 +93,9 @@ def _send_ping(self):
         job_ids = jobs.get_job_list()
         ping_params = {"job_id": job_ids, "runpod_version": runpod_version}
 
+        span = trace.get_current_span()
+        span.set_attribute("job_id", job_ids)
+
         try:
             result = self._session.get(
                 self.PING_URL, params=ping_params, timeout=self.PING_INTERVAL * 2
@@ -100,4 +106,5 @@ def _send_ping(self):
             )
 
         except requests.RequestException as err:
+            span.record_exception(err)
             log.error(f"Ping Request Error: {err}, attempting to restart ping.")

From b82c6a8c87eaa75435bfcd0359b04e6b0b8a66fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 5 Nov 2024 15:56:53 -0800
Subject: [PATCH 21/53] tmp: refactored worker_state to trace request_id list
 in ping

---
 runpod/serverless/modules/rp_ping.py          |  8 +++----
 runpod/serverless/modules/worker_state.py     | 23 ++++++-------------
 .../test_modules/test_state.py                | 21 ++---------------
 3 files changed, 13 insertions(+), 39 deletions(-)

diff --git a/runpod/serverless/modules/rp_ping.py b/runpod/serverless/modules/rp_ping.py
index 6575b94a..18336c44 100644
--- a/runpod/serverless/modules/rp_ping.py
+++ b/runpod/serverless/modules/rp_ping.py
@@ -17,7 +17,7 @@
 from runpod.version import __version__ as runpod_version
 
 log = RunPodLogger()
-jobs = JobsProgress()  # Contains the list of jobs that are currently running.
+job_progress = JobsProgress()  # Contains the list of jobs that are currently running.
 tracer = trace.get_tracer(__name__)
 
 
@@ -90,11 +90,11 @@ def _send_ping(self):
         """
         Sends a heartbeat to the Runpod server.
         """
-        job_ids = jobs.get_job_list()
-        ping_params = {"job_id": job_ids, "runpod_version": runpod_version}
+        job_ids = job_progress.get_job_list()
+        ping_params = {"job_id": ",".join(job_ids), "runpod_version": runpod_version}
 
         span = trace.get_current_span()
-        span.set_attribute("job_id", job_ids)
+        span.set_attribute("request_id", job_ids)
 
         try:
             result = self._session.get(
diff --git a/runpod/serverless/modules/worker_state.py b/runpod/serverless/modules/worker_state.py
index 81e62799..349e7496 100644
--- a/runpod/serverless/modules/worker_state.py
+++ b/runpod/serverless/modules/worker_state.py
@@ -97,11 +97,11 @@ def add(self, element: Any):
 
     def remove(self, element: Any):
         """
-        Adds a Job object to the set.
+        Removes a Job object from the set.
 
-        If the added element is a string, then `Job(id=element)` is added
+        If the element is a string, it is treated as `Job(id=element)`
         
-        If the added element is a dict, that `Job(**element)` is added
+        If the element is a dict, it is treated as `Job(**element)`
         """
         if isinstance(element, str):
             element = Job(id=element)
@@ -126,14 +126,14 @@ def get(self, element: Any) -> Job:
             if job == element:
                 return job
 
-    def get_job_list(self) -> str:
+    def get_job_list(self) -> set[str]:
         """
-        Returns the list of job IDs as comma-separated string.
+        Returns the list of job IDs
         """
         if not self.get_job_count():
-            return None
+            return set()
 
-        return ",".join(str(job) for job in self)
+        return set(str(job) for job in self)
 
     def get_job_count(self) -> int:
         """
@@ -175,15 +175,6 @@ async def get_job(self) -> dict:
         """
         return await self.get()
 
-    def get_job_list(self) -> Optional[str]:
-        """
-        Returns the comma-separated list of jobs as a string. (read-only)
-        """
-        if self.empty():
-            return None
-
-        return ",".join(job.get("id") for job in self)
-
     def get_job_count(self) -> int:
         """
         Returns the number of jobs.
diff --git a/tests/test_serverless/test_modules/test_state.py b/tests/test_serverless/test_modules/test_state.py
index 6b26a64c..7057822f 100644
--- a/tests/test_serverless/test_modules/test_state.py
+++ b/tests/test_serverless/test_modules/test_state.py
@@ -154,23 +154,6 @@ async def test_get_job(self):
         assert next_job not in self.jobs
         assert next_job == job2
 
-    async def test_get_job_list(self):
-        """
-        Tests if get_job_list() returns comma-separated IDs
-        """
-        self.assertTrue(self.jobs.get_job_list() is None)
-
-        job1 = {"id": "123"}
-        await self.jobs.add_job(job1)
-
-        job2 = {"id": "456"}
-        await self.jobs.add_job(job2)
-
-        assert self.jobs.get_job_count() == 2
-        assert job1 in self.jobs
-        assert job2 in self.jobs
-        assert self.jobs.get_job_list() in ["123,456", "456,123"]
-
 
 class TestJobsProgress(unittest.TestCase):
     """Tests for JobsProgress class"""
@@ -223,7 +206,7 @@ def test_get_job(self):
         assert job1 in self.jobs
 
     def test_get_job_list(self):
-        self.assertTrue(self.jobs.get_job_list() is None)
+        assert not self.jobs.get_job_list()
 
         job1 = {"id": "123"}
         self.jobs.add(job1)
@@ -232,4 +215,4 @@ def test_get_job_list(self):
         self.jobs.add(job2)
 
         assert self.jobs.get_job_count() == 2
-        assert self.jobs.get_job_list() in ["123,456", "456,123"]
+        assert not self.jobs.get_job_list().difference(("123","456",))

From 6d4d0e2a7d4c451e7f87f4621152a0b382eb07a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 5 Nov 2024 22:57:12 -0800
Subject: [PATCH 22/53] tmp: add_event for each request_id in the send_ping

---
 runpod/serverless/modules/rp_ping.py      | 12 +++++++-----
 runpod/serverless/modules/worker_state.py |  4 ++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/runpod/serverless/modules/rp_ping.py b/runpod/serverless/modules/rp_ping.py
index 18336c44..3c268468 100644
--- a/runpod/serverless/modules/rp_ping.py
+++ b/runpod/serverless/modules/rp_ping.py
@@ -85,16 +85,18 @@ def ping_loop(self, test=False):
             if test:
                 return
 
-    @tracer.start_as_current_span("send_ping")
+    @tracer.start_as_current_span("send_ping", kind=trace.SpanKind.CLIENT)
     def _send_ping(self):
         """
         Sends a heartbeat to the Runpod server.
         """
-        job_ids = job_progress.get_job_list()
-        ping_params = {"job_id": ",".join(job_ids), "runpod_version": runpod_version}
-
         span = trace.get_current_span()
-        span.set_attribute("request_id", job_ids)
+        job_ids = []
+        for job in job_progress:
+            span.add_event("ping", {"request_id": job.id})
+            job_ids.append(job.id)
+
+        ping_params = {"job_id": ",".join(job_ids), "runpod_version": runpod_version}
 
         try:
             result = self._session.get(
diff --git a/runpod/serverless/modules/worker_state.py b/runpod/serverless/modules/worker_state.py
index 349e7496..56d84a8a 100644
--- a/runpod/serverless/modules/worker_state.py
+++ b/runpod/serverless/modules/worker_state.py
@@ -5,7 +5,7 @@
 import os
 import time
 import uuid
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Set
 from asyncio import Queue
 
 from .rp_logger import RunPodLogger
@@ -62,7 +62,7 @@ def __str__(self) -> str:
 # ---------------------------------------------------------------------------- #
 #                                    Tracker                                   #
 # ---------------------------------------------------------------------------- #
-class JobsProgress(set):
+class JobsProgress(Set[Job]):
     """Track the state of current jobs in progress."""
 
     _instance = None

From 764dd6afc5ebe9425970eb4bfcd81c235b25d71d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Wed, 6 Nov 2024 09:44:20 -0800
Subject: [PATCH 23/53] tmp: removed get_job_list

---
 runpod/serverless/modules/worker_state.py        | 12 ------------
 tests/test_serverless/test_modules/test_state.py | 12 ------------
 2 files changed, 24 deletions(-)

diff --git a/runpod/serverless/modules/worker_state.py b/runpod/serverless/modules/worker_state.py
index 56d84a8a..cf82b89c 100644
--- a/runpod/serverless/modules/worker_state.py
+++ b/runpod/serverless/modules/worker_state.py
@@ -72,9 +72,6 @@ def __new__(cls):
             JobsProgress._instance = set.__new__(cls)
         return JobsProgress._instance
 
-    def __repr__(self) -> str:
-        return f"<{self.__class__.__name__}>: {self.get_job_list()}"
-
     def add(self, element: Any):
         """
         Adds a Job object to the set.
@@ -126,15 +123,6 @@ def get(self, element: Any) -> Job:
             if job == element:
                 return job
 
-    def get_job_list(self) -> set[str]:
-        """
-        Returns the list of job IDs
-        """
-        if not self.get_job_count():
-            return set()
-
-        return set(str(job) for job in self)
-
     def get_job_count(self) -> int:
         """
         Returns the number of jobs.
diff --git a/tests/test_serverless/test_modules/test_state.py b/tests/test_serverless/test_modules/test_state.py
index 7057822f..af605dd9 100644
--- a/tests/test_serverless/test_modules/test_state.py
+++ b/tests/test_serverless/test_modules/test_state.py
@@ -204,15 +204,3 @@ def test_get_job(self):
 
         job1 = self.jobs.get(id)
         assert job1 in self.jobs
-
-    def test_get_job_list(self):
-        assert not self.jobs.get_job_list()
-
-        job1 = {"id": "123"}
-        self.jobs.add(job1)
-
-        job2 = {"id": "456"}
-        self.jobs.add(job2)
-
-        assert self.jobs.get_job_count() == 2
-        assert not self.jobs.get_job_list().difference(("123","456",))

From b9687c9f4d87a5681b64dd045f0d7ca5a9e4b78e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Wed, 6 Nov 2024 11:43:49 -0800
Subject: [PATCH 24/53] tmp: context propagation

---
 runpod/http_client.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/runpod/http_client.py b/runpod/http_client.py
index 8bf6dca2..146d317b 100644
--- a/runpod/http_client.py
+++ b/runpod/http_client.py
@@ -6,8 +6,8 @@
 import requests
 from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError
 from opentelemetry import trace
-# from opentelemetry.instrumentation.aiohttp_client import create_trace_config
-# from opentelemetry.instrumentation.requests import RequestsInstrumentor
+from opentelemetry.instrumentation.aiohttp_client import create_trace_config
+from opentelemetry.instrumentation.requests import RequestsInstrumentor
 
 from .cli.groups.config.functions import get_credentials
 from .user_agent import USER_AGENT
@@ -44,7 +44,7 @@ def AsyncClientSession(*args, **kwargs):
         connector=TCPConnector(limit=0),
         headers=get_auth_header(),
         timeout=ClientTimeout(600, ceil_threshold=400),
-        # trace_configs=[create_trace_config()],
+        trace_configs=[create_trace_config()],
         *args,
         **kwargs,
     )
@@ -54,4 +54,4 @@ class SyncClientSession(requests.Session):
     def __init__(self):
         super().__init__()
         self.headers.update(get_auth_header())
-        # RequestsInstrumentor().instrument(session=self)
\ No newline at end of file
+        RequestsInstrumentor().instrument(session=self)

From 0e666b56eb1a02c77e7a8d476f27109c3a5b69d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Wed, 20 Nov 2024 20:10:28 -0800
Subject: [PATCH 25/53] tmp: missed this merge

---
 runpod/serverless/modules/rp_scale.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index 6e902def..89212871 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -241,7 +241,7 @@ async def handle_job(self, session: ClientSession, job: dict):
             span.set_attribute("request_id", job.get("id"))
 
             try:
-                job_progress.add(job)
+                await job_progress.add(job)
 
                 await handle_job(session, self.config, job)
 

From e53d6358a0de479520379df3466e96fad5b293df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Wed, 20 Nov 2024 21:32:54 -0800
Subject: [PATCH 26/53] tmp: missed this merge

---
 runpod/serverless/modules/rp_scale.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index 89212871..1384d6ef 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -259,4 +259,4 @@ async def handle_job(self, session: ClientSession, job: dict):
                 job_list.task_done()
 
                 # Job is no longer in progress
-                job_progress.remove(job["id"])
+                await job_progress.remove(job["id"])

From c774bfde8344c43aa0b9dd26eb79bf629eff0660 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Thu, 21 Nov 2024 00:54:02 -0800
Subject: [PATCH 27/53] tmp: noop for disabled otel

---
 runpod/otel.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/runpod/otel.py b/runpod/otel.py
index ed94ea9e..276d7e8f 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -6,6 +6,7 @@
 from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
 from opentelemetry.sdk.resources import (
     Resource,
+    DEPLOYMENT_ENVIRONMENT,
     SERVICE_NAME,
     SERVICE_VERSION,
 )
@@ -20,6 +21,7 @@
     TracerProvider(
         resource=Resource.create(
             {
+                DEPLOYMENT_ENVIRONMENT: os.getenv("ENV"),
                 RUNPOD_ENDPOINT_ID: os.getenv("RUNPOD_ENDPOINT_ID"),
                 RUNPOD_POD_ID: os.getenv("RUNPOD_POD_ID"),
                 SERVICE_NAME: "runpod-python-sdk",
@@ -31,8 +33,12 @@
 
 tracer = trace.get_tracer_provider()
 
-if os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace":
-    tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
-
 if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"):
     tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
+
+elif os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace":
+    tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+
+else:
+    # Use NoOpTracerProvider to disable OTEL
+    trace.set_tracer_provider(trace.NoOpTracerProvider())

From 7c9ed5784a1e7b417be57fa653c8c8bf1991289b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Sat, 7 Dec 2024 22:37:00 -0800
Subject: [PATCH 28/53] tmp: proper spankind

---
 runpod/serverless/core.py            | 4 ++--
 runpod/serverless/modules/rp_http.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/runpod/serverless/core.py b/runpod/serverless/core.py
index 9a1e9f62..c70abeae 100644
--- a/runpod/serverless/core.py
+++ b/runpod/serverless/core.py
@@ -187,7 +187,7 @@ def progress_update(self, job_id: str, json_data: bytes) -> bool:
             )
         )
 
-    @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER)
+    @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.CLIENT)
     async def stream_output(self, job_id: str, job_output: bytes) -> bool:
         """
         send part of a streaming result to AI-API.
@@ -207,7 +207,7 @@ async def stream_output(self, job_id: str, job_output: bytes) -> bool:
             )
         )
 
-    @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER)
+    @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.CLIENT)
     def post_output(self, job_id: str, job_output: bytes) -> bool:
         """
         send the result of a job to AI-API.
diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
index 2423cfa4..02a8c059 100644
--- a/runpod/serverless/modules/rp_http.py
+++ b/runpod/serverless/modules/rp_http.py
@@ -50,7 +50,7 @@ async def _transmit(client_session: ClientSession, url, job_data):
         await client_response.text()
 
 
-@tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER)
+@tracer.start_as_current_span("handle_result", kind=trace.SpanKind.CLIENT)
 async def _handle_result(
     session: ClientSession, job_data, job, url_template, log_message, is_stream=False
 ):

From b030cecdb7c410a48654ec4d1f5ef418f49b284e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 9 Dec 2024 19:54:43 -0800
Subject: [PATCH 29/53] tmp: force sampling from this parent span

This tells ai-api to trace these requests despite the 1% sampling ratio.
---
 runpod/otel.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/runpod/otel.py b/runpod/otel.py
index 276d7e8f..3fa06ec1 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -2,7 +2,7 @@
 
 from opentelemetry import trace
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace import TracerProvider, sampling
 from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
 from opentelemetry.sdk.resources import (
     Resource,
@@ -19,6 +19,7 @@
 
 trace.set_tracer_provider(
     TracerProvider(
+        sampler=sampling.ALWAYS_ON,
         resource=Resource.create(
             {
                 DEPLOYMENT_ENVIRONMENT: os.getenv("ENV"),

From a800261d8f52ef0d119c390cf543fa2c8afa77d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 9 Dec 2024 20:11:57 -0800
Subject: [PATCH 30/53] tmp: revert

---
 runpod/http_client.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/runpod/http_client.py b/runpod/http_client.py
index cea3f117..146d317b 100644
--- a/runpod/http_client.py
+++ b/runpod/http_client.py
@@ -38,20 +38,20 @@ def get_auth_header():
 
 def AsyncClientSession(*args, **kwargs):
     """
-    Deprecation from aiohttp.ClientSession forbids inheritance.
-    This is now a factory method
+    Factory method for an async client session with OpenTelemetry tracing.
     """
     return ClientSession(
         connector=TCPConnector(limit=0),
         headers=get_auth_header(),
         timeout=ClientTimeout(600, ceil_threshold=400),
+        trace_configs=[create_trace_config()],
         *args,
         **kwargs,
     )
 
 
 class SyncClientSession(requests.Session):
-    """
-    Inherits requests.Session to override `request()` method for tracing
-    """
-    pass
+    def __init__(self):
+        super().__init__()
+        self.headers.update(get_auth_header())
+        RequestsInstrumentor().instrument(session=self)

From b8377b85c5244482045e123f30a736e346c2824d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 9 Dec 2024 20:44:15 -0800
Subject: [PATCH 31/53] tmp: capture job_output

---
 runpod/serverless/modules/rp_job.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index 3437da76..a8dab360 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -252,10 +252,18 @@ async def run_job_generator(
         if is_async_gen:
             async for output_partial in job_output:
                 log.debug(f"Async Generator output: {output_partial}", job["id"])
+                span.add_event(
+                    "Async generator output",
+                    attributes={"output_partial": str(output_partial)},
+                )
                 yield {"output": output_partial}
         else:
             for output_partial in job_output:
                 log.debug(f"Generator output: {output_partial}", job["id"])
+                span.add_event(
+                    "Async generator output",
+                    attributes={"output_partial": str(output_partial)},
+                )
                 yield {"output": output_partial}
 
     except Exception as err:

From f10693fbb9c338f530c434936d1b343a1b801b6c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Mon, 9 Dec 2024 23:45:43 -0800
Subject: [PATCH 32/53] tmp: check for "error" in a dict

---
 runpod/serverless/modules/rp_job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index a8dab360..be7fc14b 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -127,7 +127,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
         job_result = {"output": []}
         async for stream_output in generator_output:
             log.debug(f"Stream output: {stream_output}", job["id"])
-            if "error" in stream_output:
+            if stream_output.get("error"):
                 job_result = stream_output
                 break
             if config.get("return_aggregate_stream", False):

From 1aed002f06cd04f80a548aa045d798a6c1ff6f02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 07:50:00 -0800
Subject: [PATCH 33/53] tmp: capture and report error

---
 runpod/serverless/modules/rp_http.py |  2 ++
 runpod/serverless/modules/rp_job.py  | 16 +++++++---------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
index 02a8c059..242b7f4a 100644
--- a/runpod/serverless/modules/rp_http.py
+++ b/runpod/serverless/modules/rp_http.py
@@ -87,6 +87,7 @@ async def _handle_result(
             log.info("Finished.", job["id"])
 
 
+@tracer.start_as_current_span("send_result")
 async def send_result(session, job_data, job, is_stream=False):
     """
     Return the job results.
@@ -96,6 +97,7 @@ async def send_result(session, job_data, job, is_stream=False):
     )
 
 
+@tracer.start_as_current_span("stream_result")
 async def stream_result(session, job_data, job):
     """
     Return the stream job results.
diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index be7fc14b..a3eb1a71 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -119,7 +119,11 @@ async def get_job(
             return jobs
 
 
+@tracer.start_as_current_span("handle_job")
 async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) -> dict:
+    span = trace.get_current_span()
+    span.set_attribute("request_id", job.get("id"))
+
     if is_generator(config["handler"]):
         is_stream = True
         generator_output = run_job_generator(config["handler"], job)
@@ -128,7 +132,9 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
         async for stream_output in generator_output:
             log.debug(f"Stream output: {stream_output}", job["id"])
             if stream_output.get("error"):
-                job_result = stream_output
+                span.record_exception(stream_output)
+                span.set_status(trace.Status(trace.StatusCode.ERROR, str(stream_output)))
+                await send_result(session, stream_output, job, is_stream=is_stream)
                 break
             if config.get("return_aggregate_stream", False):
                 job_result["output"].append(stream_output["output"])
@@ -252,18 +258,10 @@ async def run_job_generator(
         if is_async_gen:
             async for output_partial in job_output:
                 log.debug(f"Async Generator output: {output_partial}", job["id"])
-                span.add_event(
-                    "Async generator output",
-                    attributes={"output_partial": str(output_partial)},
-                )
                 yield {"output": output_partial}
         else:
             for output_partial in job_output:
                 log.debug(f"Generator output: {output_partial}", job["id"])
-                span.add_event(
-                    "Async generator output",
-                    attributes={"output_partial": str(output_partial)},
-                )
                 yield {"output": output_partial}
 
     except Exception as err:

From 619d2c17c4e54b648f75d7bc50d1ee4499f204e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 08:08:14 -0800
Subject: [PATCH 34/53] tmp: record Stream output

---
 runpod/serverless/modules/rp_job.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index a3eb1a71..b7b60488 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -131,6 +131,10 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
         job_result = {"output": []}
         async for stream_output in generator_output:
             log.debug(f"Stream output: {stream_output}", job["id"])
+            span.add_event(
+                "Stream output",
+                attributes={"stream_output": str(stream_output)},
+            )
             if stream_output.get("error"):
                 span.record_exception(stream_output)
                 span.set_status(trace.Status(trace.StatusCode.ERROR, str(stream_output)))

From bd6d7dece42f5702e9292321ef6a20470e7a54e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 08:08:35 -0800
Subject: [PATCH 35/53] tmp: avoid confusion with rp_job.handle_job

---
 runpod/serverless/modules/rp_scale.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index e00fd94d..c69adc89 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -242,7 +242,7 @@ async def run_jobs(self, session: ClientSession):
                 job = await self.jobs_queue.get()
 
                 # Create a new task for each job and add it to the task list
-                task = asyncio.create_task(self.handle_job(session, job))
+                task = asyncio.create_task(self.run_job(session, job))
                 tasks.append(task)
 
             # Wait for any job to finish
@@ -263,14 +263,14 @@ async def run_jobs(self, session: ClientSession):
         # Ensure all remaining tasks finish before stopping
         await asyncio.gather(*tasks)
 
-    async def handle_job(self, session: ClientSession, job: dict):
+    async def run_job(self, session: ClientSession, job: dict):
         """
         Process an individual job. This function is run concurrently for multiple jobs.
         """
         context = set_span_in_context(NonRecordingSpan(job["context"]))
 
         with tracer.start_as_current_span(
-            "handle_job", context=context, kind=SpanKind.CONSUMER
+            "run_job", context=context, kind=SpanKind.CONSUMER
         ) as span:
 
             try:

From 85e4b4dde858c6fc9bb06270560e7ed0e62151be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 08:24:37 -0800
Subject: [PATCH 36/53] tmp: perform_job

---
 runpod/serverless/modules/rp_scale.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py
index c69adc89..10f417b7 100644
--- a/runpod/serverless/modules/rp_scale.py
+++ b/runpod/serverless/modules/rp_scale.py
@@ -242,7 +242,7 @@ async def run_jobs(self, session: ClientSession):
                 job = await self.jobs_queue.get()
 
                 # Create a new task for each job and add it to the task list
-                task = asyncio.create_task(self.run_job(session, job))
+                task = asyncio.create_task(self.perform_job(session, job))
                 tasks.append(task)
 
             # Wait for any job to finish
@@ -263,14 +263,14 @@ async def run_jobs(self, session: ClientSession):
         # Ensure all remaining tasks finish before stopping
         await asyncio.gather(*tasks)
 
-    async def run_job(self, session: ClientSession, job: dict):
+    async def perform_job(self, session: ClientSession, job: dict):
         """
         Process an individual job. This function is run concurrently for multiple jobs.
         """
         context = set_span_in_context(NonRecordingSpan(job["context"]))
 
         with tracer.start_as_current_span(
-            "run_job", context=context, kind=SpanKind.CONSUMER
+            "perform_job", context=context, kind=SpanKind.CONSUMER
         ) as span:
 
             try:

From db3b6603220e2b1d3e9d5fdf9cd2f2863f74183a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 08:41:58 -0800
Subject: [PATCH 37/53] tmp: capture proper error

---
 runpod/serverless/modules/rp_job.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index b7b60488..b78322c2 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -135,10 +135,10 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
                 "Stream output",
                 attributes={"stream_output": str(stream_output)},
             )
-            if stream_output.get("error"):
-                span.record_exception(stream_output)
-                span.set_status(trace.Status(trace.StatusCode.ERROR, str(stream_output)))
-                await send_result(session, stream_output, job, is_stream=is_stream)
+            if err_output := stream_output["output"].get("error"):
+                span.record_exception(err_output)
+                span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output)))
+                await send_result(session, stream_output["output"], job, is_stream=is_stream)
                 break
             if config.get("return_aggregate_stream", False):
                 job_result["output"].append(stream_output["output"])

From 962df33b3d67d66698de70e1494118103e2b4c1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 09:42:16 -0800
Subject: [PATCH 38/53] tmp: need to capture "error" in output better

---
 runpod/serverless/modules/rp_job.py | 38 ++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index b78322c2..c1428f56 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -119,6 +119,15 @@ async def get_job(
             return jobs
 
 
+@tracer.start_as_current_span("handle_error")
+def _handle_error(err_output: any, job: dict) -> bool:
+    span = trace.get_current_span()
+
+    span.record_exception(err_output)
+    span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output)))
+    log.debug(f"Handled error: {err_output}", job["id"])
+
+
 @tracer.start_as_current_span("handle_job")
 async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) -> dict:
     span = trace.get_current_span()
@@ -130,16 +139,33 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
 
         job_result = {"output": []}
         async for stream_output in generator_output:
-            log.debug(f"Stream output: {stream_output}", job["id"])
+            # temp
+            log.debug(f"Stream output: {stream_output['output']}", job["id"])
             span.add_event(
                 "Stream output",
-                attributes={"stream_output": str(stream_output)},
+                attributes={
+                    "stream_output": str(stream_output["output"]),
+                    "stream_output_type": str(type(stream_output["output"])),
+                },
             )
-            if err_output := stream_output["output"].get("error"):
-                span.record_exception(err_output)
-                span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output)))
-                await send_result(session, stream_output["output"], job, is_stream=is_stream)
+            # end temp
+
+            if type(stream_output["output"]) == dict:
+                if error_output := stream_output.get("error"):
+                    _handle_error(error_output, job)
+                    job_result = stream_output
+                    break
+
+            if type(stream_output["output"]) != str:
+                _handle_error(stream_output["output"], job)
+                job_result = stream_output
+                break
+
+            if "error" in stream_output:
+                _handle_error(stream_output, job)
+                job_result = stream_output
                 break
+
             if config.get("return_aggregate_stream", False):
                 job_result["output"].append(stream_output["output"])
 

From 8da936aa6c1225320bef0d481fba0291bf1282f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 10:07:06 -0800
Subject: [PATCH 39/53] tmp: record_exception fix

---
 runpod/serverless/modules/rp_job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index c1428f56..7259740b 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -123,7 +123,7 @@ async def get_job(
 def _handle_error(err_output: any, job: dict) -> bool:
     span = trace.get_current_span()
 
-    span.record_exception(err_output)
+    span.record_exception(Exception(str(err_output)))
     span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output)))
     log.debug(f"Handled error: {err_output}", job["id"])
 

From 940b355aef6c00ac217bda45c5fa72e545e3353a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 10:34:15 -0800
Subject: [PATCH 40/53] tmp: explicit

---
 runpod/serverless/modules/rp_job.py | 30 +++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index 7259740b..a93eb165 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -140,28 +140,42 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
         job_result = {"output": []}
         async for stream_output in generator_output:
             # temp
-            log.debug(f"Stream output: {stream_output['output']}", job["id"])
-            span.add_event(
-                "Stream output",
-                attributes={
-                    "stream_output": str(stream_output["output"]),
-                    "stream_output_type": str(type(stream_output["output"])),
-                },
-            )
+            log.debug(f"Stream output: {stream_output}", job["id"])
             # end temp
 
             if type(stream_output["output"]) == dict:
+                span.add_event(
+                    "Stream output is dict",
+                    attributes={
+                        "stream_output": str(stream_output.get("output")),
+                        "stream_output_type": str(type(stream_output.get("output"))),
+                    },
+                )
                 if error_output := stream_output.get("error"):
                     _handle_error(error_output, job)
                     job_result = stream_output
                     break
 
             if type(stream_output["output"]) != str:
+                span.add_event(
+                    "Stream output is not string",
+                    attributes={
+                        "stream_output": str(stream_output.get("output")),
+                        "stream_output_type": str(type(stream_output.get("output"))),
+                    },
+                )
                 _handle_error(stream_output["output"], job)
                 job_result = stream_output
                 break
 
             if "error" in stream_output:
+                span.add_event(
+                    "Stream output has `error`",
+                    attributes={
+                        "stream_output": str(stream_output),
+                        "stream_output_type": str(type(stream_output)),
+                    },
+                )
                 _handle_error(stream_output, job)
                 job_result = stream_output
                 break

From 7e024b281803b87b316aea76b9a886134b47b7f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 10:43:22 -0800
Subject: [PATCH 41/53] tmp: omg

---
 runpod/serverless/modules/rp_job.py | 30 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index a93eb165..82552e09 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -143,7 +143,19 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
             log.debug(f"Stream output: {stream_output}", job["id"])
             # end temp
 
-            if type(stream_output["output"]) == dict:
+            if stream_output.get("error"):
+                span.add_event(
+                    "Stream output has `error`",
+                    attributes={
+                        "stream_output": str(stream_output),
+                        "stream_output_type": str(type(stream_output)),
+                    },
+                )
+                _handle_error(stream_output, job)
+                job_result = stream_output
+                break
+
+            if type(stream_output.get("output")) == dict:
                 span.add_event(
                     "Stream output is dict",
                     attributes={
@@ -156,9 +168,9 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
                     job_result = stream_output
                     break
 
-            if type(stream_output["output"]) != str:
+            if type(stream_output.get("output")) != str:
                 span.add_event(
-                    "Stream output is not string",
+                    "Stream output is not string or dict",
                     attributes={
                         "stream_output": str(stream_output.get("output")),
                         "stream_output_type": str(type(stream_output.get("output"))),
@@ -168,18 +180,6 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
                 job_result = stream_output
                 break
 
-            if "error" in stream_output:
-                span.add_event(
-                    "Stream output has `error`",
-                    attributes={
-                        "stream_output": str(stream_output),
-                        "stream_output_type": str(type(stream_output)),
-                    },
-                )
-                _handle_error(stream_output, job)
-                job_result = stream_output
-                break
-
             if config.get("return_aggregate_stream", False):
                 job_result["output"].append(stream_output["output"])
 

From ccd01fd85fcd4c194f65045ba02a0263d5f5b68a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 10:56:36 -0800
Subject: [PATCH 42/53] tmp: fix

---
 runpod/serverless/modules/rp_job.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index 82552e09..965689b2 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -123,7 +123,6 @@ async def get_job(
 def _handle_error(err_output: any, job: dict) -> bool:
     span = trace.get_current_span()
 
-    span.record_exception(Exception(str(err_output)))
     span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output)))
     log.debug(f"Handled error: {err_output}", job["id"])
 

From 32eebaa31bf262ddc1d787447f81f3cdded9fa90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 11:03:43 -0800
Subject: [PATCH 43/53] tmp: shift error as job_result

---
 runpod/serverless/modules/rp_job.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index 965689b2..02f86ce9 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -142,7 +142,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
             log.debug(f"Stream output: {stream_output}", job["id"])
             # end temp
 
-            if stream_output.get("error"):
+            if error_output := stream_output.get("error"):
                 span.add_event(
                     "Stream output has `error`",
                     attributes={
@@ -151,7 +151,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
                     },
                 )
                 _handle_error(stream_output, job)
-                job_result = stream_output
+                job_result = error_output
                 break
 
             if type(stream_output.get("output")) == dict:

From 1e9777fc9a06771f45169a9af6abbdb8b84e78f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 11:16:51 -0800
Subject: [PATCH 44/53] tmp: seriously?

---
 runpod/serverless/modules/rp_job.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index 02f86ce9..97ac6774 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -142,30 +142,30 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
             log.debug(f"Stream output: {stream_output}", job["id"])
             # end temp
 
-            if error_output := stream_output.get("error"):
+            if type(stream_output.get("output")) == dict:
                 span.add_event(
-                    "Stream output has `error`",
+                    "Stream output has `output.error`",
                     attributes={
                         "stream_output": str(stream_output),
                         "stream_output_type": str(type(stream_output)),
+                        "stream_output_error": str(stream_output["output"].get("error")),
+                        "stream_output_error_type": str(type(stream_output["output"].get("error"))),
                     },
                 )
-                _handle_error(stream_output, job)
-                job_result = error_output
-                break
+                if stream_output["output"].get("error"):
+                    stream_output["error"] = stream_output["output"]["error"]
 
-            if type(stream_output.get("output")) == dict:
+            if stream_output.get("error"):
                 span.add_event(
-                    "Stream output is dict",
+                    "Stream output has `error`",
                     attributes={
-                        "stream_output": str(stream_output.get("output")),
-                        "stream_output_type": str(type(stream_output.get("output"))),
+                        "stream_output": str(stream_output),
+                        "stream_output_type": str(type(stream_output)),
                     },
                 )
-                if error_output := stream_output.get("error"):
-                    _handle_error(error_output, job)
-                    job_result = stream_output
-                    break
+                _handle_error(stream_output, job)
+                job_result = stream_output
+                break
 
             if type(stream_output.get("output")) != str:
                 span.add_event(

From e432baf48f31ae34a583c8d62bad9589f631fd5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 11:42:34 -0800
Subject: [PATCH 45/53] tmp: 1

---
 runpod/serverless/modules/rp_job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index 97ac6774..ad1ec303 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -153,7 +153,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
                     },
                 )
                 if stream_output["output"].get("error"):
-                    stream_output["error"] = stream_output["output"]["error"]
+                    stream_output = {"error": stream_output["output"]["error"]}
 
             if stream_output.get("error"):
                 span.add_event(

From 3dbf5f16904436edd4c1f521fc7ebe9d0e5ec16f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 11:58:25 -0800
Subject: [PATCH 46/53] tmp: trace the transmit job_data

---
 runpod/serverless/modules/rp_http.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py
index 242b7f4a..3d050f66 100644
--- a/runpod/serverless/modules/rp_http.py
+++ b/runpod/serverless/modules/rp_http.py
@@ -28,10 +28,14 @@
 tracer = trace.get_tracer(__name__)
 
 
+@tracer.start_as_current_span("transmit", kind=trace.SpanKind.CLIENT)
 async def _transmit(client_session: ClientSession, url, job_data):
     """
     Wrapper for transmitting results via POST.
     """
+    span = trace.get_current_span()
+    span.set_attribute("job_data", job_data)
+
     retry_options = FibonacciRetry(attempts=3)
     retry_client = RetryClient(
         client_session=client_session, retry_options=retry_options

From 86a357deda93e5058079f1dc42e350f98a71e0fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 10 Dec 2024 12:01:36 -0800
Subject: [PATCH 47/53] tmp: stringify the error object

---
 runpod/serverless/modules/rp_job.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py
index ad1ec303..7b99c18b 100644
--- a/runpod/serverless/modules/rp_job.py
+++ b/runpod/serverless/modules/rp_job.py
@@ -153,7 +153,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict)
                     },
                 )
                 if stream_output["output"].get("error"):
-                    stream_output = {"error": stream_output["output"]["error"]}
+                    stream_output = {"error": str(stream_output["output"]["error"])}
 
             if stream_output.get("error"):
                 span.add_event(

From bfd6a0b7be23245112531fa8806547b230ba696a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Fri, 13 Dec 2024 14:59:27 -0800
Subject: [PATCH 48/53] tmp: forced tracing by RUNPOD_LOG_LEVEL=TRACE

---
 runpod/otel.py | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/runpod/otel.py b/runpod/otel.py
index 3fa06ec1..69cff30d 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -1,45 +1,70 @@
 import os
+import logging
+from typing import List
 
 from opentelemetry import trace
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.trace import TracerProvider, sampling
-from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+from opentelemetry.sdk.trace.export import SpanExporter, BatchSpanProcessor, ConsoleSpanExporter
 from opentelemetry.sdk.resources import (
     Resource,
     DEPLOYMENT_ENVIRONMENT,
     SERVICE_NAME,
     SERVICE_VERSION,
 )
+from runpod.version import __version__ as runpod_version
+
+
+log = logging.getLogger(__name__)
+FMT = "%(filename)-20s:%(lineno)-4d %(asctime)s %(message)s"
+logging.basicConfig(level=logging.INFO, format=FMT, handlers=[logging.StreamHandler()])
+
 
+OTEL_COLLECTOR = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "")
+OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01"))
 RUNPOD_ENDPOINT_ID = "runpod.endpoint_id"
+RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "")
 RUNPOD_POD_ID = "runpod.pod_id"
+RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "")
+RUNPOD_ENV = os.getenv("ENV", "local")
 
-from runpod.version import __version__ as runpod_version
 
+if os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace":
+    log.setLevel(logging.TRACE)
+    sampler = sampling.ALWAYS_ON
+else:
+    sampler = sampling.TraceIdRatioBased(OTEL_SAMPLING_RATE)
 
-trace.set_tracer_provider(
-    TracerProvider(
-        sampler=sampling.ALWAYS_ON,
-        resource=Resource.create(
-            {
-                DEPLOYMENT_ENVIRONMENT: os.getenv("ENV"),
-                RUNPOD_ENDPOINT_ID: os.getenv("RUNPOD_ENDPOINT_ID"),
-                RUNPOD_POD_ID: os.getenv("RUNPOD_POD_ID"),
-                SERVICE_NAME: "runpod-python-sdk",
-                SERVICE_VERSION: runpod_version,
-            }
-        )
-    )
+otlp_provider = TracerProvider(
+    sampler=sampler,
+    resource=Resource.create(
+        {
+            DEPLOYMENT_ENVIRONMENT: RUNPOD_ENV,
+            RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE,
+            RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE,
+            SERVICE_NAME: "runpod-python-sdk",
+            SERVICE_VERSION: runpod_version,
+        }
+    ),
 )
 
-tracer = trace.get_tracer_provider()
+span_processors: List[SpanExporter] = []
+
+if RUNPOD_ENV.lower() == "local":
+    span_processors.append(ConsoleSpanExporter())
+
+if OTEL_COLLECTOR:
+    span_processors.append(OTLPSpanExporter())
 
-if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"):
-    tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
+    trace.set_tracer_provider(otlp_provider)
+    tracer = trace.get_tracer_provider()
 
-elif os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace":
-    tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+    for span_processor in span_processors:
+        tracer.add_span_processor(BatchSpanProcessor(span_processor))
+        log.debug(f"Span processor: {span_processor}")
 
 else:
     # Use NoOpTracerProvider to disable OTEL
     trace.set_tracer_provider(trace.NoOpTracerProvider())
+    tracer = trace.get_tracer_provider()
+    log.debug(f"No tracer is active")

From 1ef0f35b26c66c7ba53f30d184692b24864440ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 17 Dec 2024 01:52:03 -0800
Subject: [PATCH 49/53] tmp: otel.start() to activate

---
 runpod/__init__.py                            |   1 -
 runpod/otel.py                                | 100 ++++++++----------
 runpod/serverless/worker.py                   |   3 +
 .../test_serverless/test_modules/run_scale.py |   2 +
 4 files changed, 49 insertions(+), 57 deletions(-)

diff --git a/runpod/__init__.py b/runpod/__init__.py
index 6ea28ade..6611587d 100644
--- a/runpod/__init__.py
+++ b/runpod/__init__.py
@@ -3,7 +3,6 @@
 import logging
 import os
 
-from . import otel
 from . import serverless
 from .api.ctl_commands import (
     create_container_registry_auth,
diff --git a/runpod/otel.py b/runpod/otel.py
index 69cff30d..96bfd590 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -1,11 +1,9 @@
 import os
-import logging
-from typing import List
 
 from opentelemetry import trace
 from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.trace import TracerProvider, sampling
-from opentelemetry.sdk.trace.export import SpanExporter, BatchSpanProcessor, ConsoleSpanExporter
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
 from opentelemetry.sdk.resources import (
     Resource,
     DEPLOYMENT_ENVIRONMENT,
@@ -15,56 +13,46 @@
 from runpod.version import __version__ as runpod_version
 
 
-log = logging.getLogger(__name__)
-FMT = "%(filename)-20s:%(lineno)-4d %(asctime)s %(message)s"
-logging.basicConfig(level=logging.INFO, format=FMT, handlers=[logging.StreamHandler()])
-
-
-OTEL_COLLECTOR = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "")
-OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01"))
-RUNPOD_ENDPOINT_ID = "runpod.endpoint_id"
-RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "")
-RUNPOD_POD_ID = "runpod.pod_id"
-RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "")
-RUNPOD_ENV = os.getenv("ENV", "local")
-
-
-if os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace":
-    log.setLevel(logging.TRACE)
-    sampler = sampling.ALWAYS_ON
-else:
-    sampler = sampling.TraceIdRatioBased(OTEL_SAMPLING_RATE)
-
-otlp_provider = TracerProvider(
-    sampler=sampler,
-    resource=Resource.create(
-        {
-            DEPLOYMENT_ENVIRONMENT: RUNPOD_ENV,
-            RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE,
-            RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE,
-            SERVICE_NAME: "runpod-python-sdk",
-            SERVICE_VERSION: runpod_version,
-        }
-    ),
-)
-
-span_processors: List[SpanExporter] = []
-
-if RUNPOD_ENV.lower() == "local":
-    span_processors.append(ConsoleSpanExporter())
-
-if OTEL_COLLECTOR:
-    span_processors.append(OTLPSpanExporter())
-
-    trace.set_tracer_provider(otlp_provider)
-    tracer = trace.get_tracer_provider()
-
-    for span_processor in span_processors:
-        tracer.add_span_processor(BatchSpanProcessor(span_processor))
-        log.debug(f"Span processor: {span_processor}")
-
-else:
-    # Use NoOpTracerProvider to disable OTEL
-    trace.set_tracer_provider(trace.NoOpTracerProvider())
-    tracer = trace.get_tracer_provider()
-    log.debug(f"No tracer is active")
+def start():
+    OTEL_COLLECTOR = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "")
+    OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01"))
+
+    RUNPOD_ENV = os.getenv("ENV", "local").lower()
+    RUNPOD_LOG_LEVEL = os.getenv("RUNPOD_LOG_LEVEL", "").lower()
+
+    RUNPOD_ENDPOINT_ID = "runpod.endpoint_id"
+    RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "")
+    RUNPOD_POD_ID = "runpod.pod_id"
+    RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "")
+
+    if RUNPOD_LOG_LEVEL == "trace":
+        sampler = sampling.ALWAYS_ON
+    else:
+        sampler = sampling.TraceIdRatioBased(OTEL_SAMPLING_RATE)
+
+    tracer = TracerProvider(
+        sampler=sampler,
+        resource=Resource.create(
+            {
+                DEPLOYMENT_ENVIRONMENT: RUNPOD_ENV,
+                RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE,
+                RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE,
+                SERVICE_NAME: "runpod-python-sdk",
+                SERVICE_VERSION: runpod_version,
+            }
+        ),
+    )
+
+    if OTEL_COLLECTOR:
+        tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
+        trace.set_tracer_provider(tracer)
+        print("OpenTelemetry is on")
+
+    elif RUNPOD_ENV == "local":
+        tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+        trace.set_tracer_provider(tracer)
+        print("Console tracing is on")
+
+    else:
+        # Use NoOpTracerProvider to disable OTEL
+        trace.set_tracer_provider(trace.NoOpTracerProvider())
diff --git a/runpod/serverless/worker.py b/runpod/serverless/worker.py
index ec98347d..fa262755 100644
--- a/runpod/serverless/worker.py
+++ b/runpod/serverless/worker.py
@@ -7,6 +7,7 @@
 import os
 from typing import Any, Dict
 
+from runpod import otel
 from runpod.serverless.modules import rp_logger, rp_local, rp_ping, rp_scale
 
 log = rp_logger.RunPodLogger()
@@ -35,6 +36,8 @@ def run_worker(config: Dict[str, Any]) -> None:
     Args:
         config (Dict[str, Any]): Configuration parameters for the worker.
     """
+    otel.start()
+
     # Start pinging RunPod to show that the worker is alive.
     heartbeat.start_ping()
 
diff --git a/tests/test_serverless/test_modules/run_scale.py b/tests/test_serverless/test_modules/run_scale.py
index 5983c7a6..2150fea2 100644
--- a/tests/test_serverless/test_modules/run_scale.py
+++ b/tests/test_serverless/test_modules/run_scale.py
@@ -3,6 +3,7 @@
 from faker import Faker
 from typing import Any, Dict, Optional, List
 
+from runpod import otel
 from runpod.serverless.modules.rp_scale import JobScaler, RunPodLogger, JobsProgress
 
 fake = Faker()
@@ -60,4 +61,5 @@ async def fake_handle_job(session, config, job) -> dict:
         "jobs_handler": fake_handle_job,
     }
 )
+otel.start()
 job_scaler.start()

From ddff0a252f05d29812b3c9ba76a6cbcd8f2520b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Tue, 17 Dec 2024 02:02:09 -0800
Subject: [PATCH 50/53] tmp: print sampling strategy

---
 runpod/otel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runpod/otel.py b/runpod/otel.py
index 96bfd590..3ec2bea7 100644
--- a/runpod/otel.py
+++ b/runpod/otel.py
@@ -46,12 +46,12 @@ def start():
     if OTEL_COLLECTOR:
         tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
         trace.set_tracer_provider(tracer)
-        print("OpenTelemetry is on")
+        print(f"OpenTelemetry is on: {sampler.get_description()}")
 
     elif RUNPOD_ENV == "local":
         tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
         trace.set_tracer_provider(tracer)
-        print("Console tracing is on")
+        print(f"Console tracing is on: {sampler.get_description()}")
 
     else:
         # Use NoOpTracerProvider to disable OTEL

From 14899393b62d435d4ff66f955be2783ecb0018a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Wed, 18 Dec 2024 00:54:25 -0800
Subject: [PATCH 51/53] tmp: pytest-env

---
 pyproject.toml | 1 +
 pytest.ini     | 3 +++
 setup.py       | 1 +
 3 files changed, 5 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 0641259b..a8950274 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,6 +57,7 @@ test = [
     "faker",
     "pytest-asyncio",
     "pytest-cov",
+    "pytest-env",
     "pytest-timeout",
     "pytest-watch",
     "pytest",
diff --git a/pytest.ini b/pytest.ini
index 68e2f208..1fecc333 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -2,3 +2,6 @@
 addopts = --durations=10 --cov-config=.coveragerc --timeout=120 --timeout_method=thread --cov=runpod --cov-report=xml --cov-report=term-missing --cov-fail-under=90 -W error -p no:cacheprovider -p no:unraisableexception
 python_files = tests.py test_*.py *_test.py
 norecursedirs = venv *.egg-info .git build
+env = 
+    D:ENV=test
+    D:RUNPOD_LOG_LEVEL=ERROR
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 11fe7ce5..d9583e72 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,7 @@
         "nest_asyncio",
         "pytest",
         "pytest-cov",
+        "pytest-env",
         "pytest-timeout",
         "pytest-asyncio",
     ]

From 749a2c77a68c41a28bba5a931a5fcfa02b0e9504 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Thu, 19 Dec 2024 05:05:42 -0800
Subject: [PATCH 52/53] tmp: fix `Attempting to instrument while already
 instrumented`

---
 runpod/http_client.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/runpod/http_client.py b/runpod/http_client.py
index 146d317b..268d548b 100644
--- a/runpod/http_client.py
+++ b/runpod/http_client.py
@@ -54,4 +54,5 @@ class SyncClientSession(requests.Session):
     def __init__(self):
         super().__init__()
         self.headers.update(get_auth_header())
-        RequestsInstrumentor().instrument(session=self)
+
+RequestsInstrumentor().instrument()

From ed057a34555a3936a2f68614536f354a1b905ccc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= <dean.quinanola@runpod.io>
Date: Thu, 19 Dec 2024 05:06:43 -0800
Subject: [PATCH 53/53] tmp: otel scoped to serverless only

for now
---
 runpod/otel.py                                |  58 ---------
 runpod/serverless/modules/rp_tracer.py        | 117 ++++++++++++++++++
 runpod/serverless/worker.py                   |   5 +-
 .../test_serverless/test_modules/run_scale.py |   4 +-
 4 files changed, 121 insertions(+), 63 deletions(-)
 delete mode 100644 runpod/otel.py
 create mode 100644 runpod/serverless/modules/rp_tracer.py

diff --git a/runpod/otel.py b/runpod/otel.py
deleted file mode 100644
index 3ec2bea7..00000000
--- a/runpod/otel.py
+++ /dev/null
@@ -1,58 +0,0 @@
-import os
-
-from opentelemetry import trace
-from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-from opentelemetry.sdk.trace import TracerProvider, sampling
-from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
-from opentelemetry.sdk.resources import (
-    Resource,
-    DEPLOYMENT_ENVIRONMENT,
-    SERVICE_NAME,
-    SERVICE_VERSION,
-)
-from runpod.version import __version__ as runpod_version
-
-
-def start():
-    OTEL_COLLECTOR = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "")
-    OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01"))
-
-    RUNPOD_ENV = os.getenv("ENV", "local").lower()
-    RUNPOD_LOG_LEVEL = os.getenv("RUNPOD_LOG_LEVEL", "").lower()
-
-    RUNPOD_ENDPOINT_ID = "runpod.endpoint_id"
-    RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "")
-    RUNPOD_POD_ID = "runpod.pod_id"
-    RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "")
-
-    if RUNPOD_LOG_LEVEL == "trace":
-        sampler = sampling.ALWAYS_ON
-    else:
-        sampler = sampling.TraceIdRatioBased(OTEL_SAMPLING_RATE)
-
-    tracer = TracerProvider(
-        sampler=sampler,
-        resource=Resource.create(
-            {
-                DEPLOYMENT_ENVIRONMENT: RUNPOD_ENV,
-                RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE,
-                RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE,
-                SERVICE_NAME: "runpod-python-sdk",
-                SERVICE_VERSION: runpod_version,
-            }
-        ),
-    )
-
-    if OTEL_COLLECTOR:
-        tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
-        trace.set_tracer_provider(tracer)
-        print(f"OpenTelemetry is on: {sampler.get_description()}")
-
-    elif RUNPOD_ENV == "local":
-        tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
-        trace.set_tracer_provider(tracer)
-        print(f"Console tracing is on: {sampler.get_description()}")
-
-    else:
-        # Use NoOpTracerProvider to disable OTEL
-        trace.set_tracer_provider(trace.NoOpTracerProvider())
diff --git a/runpod/serverless/modules/rp_tracer.py b/runpod/serverless/modules/rp_tracer.py
new file mode 100644
index 00000000..da8a0925
--- /dev/null
+++ b/runpod/serverless/modules/rp_tracer.py
@@ -0,0 +1,117 @@
+import os
+
+from opentelemetry import trace
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.sdk.trace import TracerProvider, sampling
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
+from opentelemetry.sdk.resources import (
+    Resource,
+    DEPLOYMENT_ENVIRONMENT,
+    SERVICE_NAME,
+    SERVICE_VERSION,
+)
+from runpod.version import __version__ as runpod_version
+from .rp_logger import RunPodLogger
+
+
+log = RunPodLogger()
+
+# https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/
+OTEL_EXPORTER_OTLP_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "")
+
+# https://opentelemetry.io/docs/languages/sdk-configuration/general/#otel_service_name
+OTEL_SERVICE_NAME = os.getenv("OTEL_SERVICE_NAME", "serverless-worker")
+# Default sampling ratio for start(); parsed with float() once, at import time.
+OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01"))
+
+
+def start(
+    service_name: str = OTEL_SERVICE_NAME,
+    collector: str = OTEL_EXPORTER_OTLP_ENDPOINT,
+    rate: float = OTEL_SAMPLING_RATE,
+):
+    """
+    Initializes the OpenTelemetry global tracer provider.
+
+    Args:
+        service_name: The service name to associate with the OTEL spans.
+        collector: The base URL of the OTEL collector to report to. Defaults to
+            the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable.
+        rate: The sampling rate between 0.0 and 1.0. Defaults to the
+            `OTEL_SAMPLING_RATE` env var or 0.01 (1%)
+
+    Notes:
+        The env var `RUNPOD_LOG_LEVEL=trace` can be set to force mandatory tracing.
+        Otherwise, the sampling rate is used to control the amount of tracing.
+
+        If a collector is provided, the traces are exported to it.
+        Else if the environment is "local", the traces are printed to the console.
+        If neither of the above conditions is met, then tracing is disabled.
+    """
+    RUNPOD_ENV = get_deployment_env()
+    RUNPOD_LOG_LEVEL = os.getenv("RUNPOD_LOG_LEVEL", "").lower()
+
+    if RUNPOD_LOG_LEVEL == "trace":
+        sampler = sampling.ALWAYS_ON
+    else:
+        sampler = sampling.TraceIdRatioBased(rate)
+
+    tracer = TracerProvider(sampler=sampler, resource=get_resource(service_name, RUNPOD_ENV))
+
+    if collector:
+        # A bare OTLPSpanExporter() only reads env vars, so a `collector` passed by
+        # the caller must be handed over explicitly (with the OTLP/HTTP trace path).
+        if collector == OTEL_EXPORTER_OTLP_ENDPOINT:
+            exporter = OTLPSpanExporter()
+        else:
+            exporter = OTLPSpanExporter(endpoint=f"{collector.rstrip('/')}/v1/traces")
+        tracer.add_span_processor(BatchSpanProcessor(exporter))
+        trace.set_tracer_provider(tracer)
+        log.info(f"OpenTelemetry is on: {sampler.get_description()}")
+    elif RUNPOD_ENV == "local":
+        tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+        trace.set_tracer_provider(tracer)
+        log.info(f"Tracing prints to console: {sampler.get_description()}")
+    else:
+        # Use NoOpTracerProvider to disable OTEL
+        trace.set_tracer_provider(trace.NoOpTracerProvider())
+
+
+def get_resource(service_name: str, environment: str) -> Resource:
+    """
+    Builds the OpenTelemetry Resource describing this worker.
+
+    Besides the service name and the runpod package version, the resource
+    records the deployment environment and the RunPod endpoint and pod
+    identifiers read from the `RUNPOD_ENDPOINT_ID` and `RUNPOD_POD_ID`
+    environment variables (empty strings when unset).
+
+    Args:
+        service_name: Value stored under the service-name attribute.
+        environment: Value stored under the deployment-environment attribute.
+
+    Returns:
+        The Resource produced by `Resource.create` from those attributes.
+    """
+    # RunPod-specific attribute keys (plain strings, not OTEL constants).
+    endpoint_id_key = "runpod.endpoint_id"
+    pod_id_key = "runpod.pod_id"
+
+    attributes = {
+        DEPLOYMENT_ENVIRONMENT: environment,
+        endpoint_id_key: os.getenv("RUNPOD_ENDPOINT_ID", ""),
+        pod_id_key: os.getenv("RUNPOD_POD_ID", ""),
+        SERVICE_NAME: service_name,
+        SERVICE_VERSION: runpod_version,
+    }
+
+    return Resource.create(attributes)
+
+
+def get_deployment_env() -> str:
+    """Infer "dev", "prod" or "local" from the `RUNPOD_WEBHOOK_PING` URL."""
+    ping_url = os.getenv("RUNPOD_WEBHOOK_PING", "")
+    for domain, env_name in (("runpod.dev", "dev"), ("runpod.ai", "prod")):
+        if domain in ping_url:
+            return env_name
+    return "local"
diff --git a/runpod/serverless/worker.py b/runpod/serverless/worker.py
index fa262755..ed72e76f 100644
--- a/runpod/serverless/worker.py
+++ b/runpod/serverless/worker.py
@@ -7,8 +7,7 @@
 import os
 from typing import Any, Dict
 
-from runpod import otel
-from runpod.serverless.modules import rp_logger, rp_local, rp_ping, rp_scale
+from runpod.serverless.modules import rp_logger, rp_local, rp_ping, rp_scale, rp_tracer
 
 log = rp_logger.RunPodLogger()
 heartbeat = rp_ping.Heartbeat()
@@ -36,7 +35,7 @@ def run_worker(config: Dict[str, Any]) -> None:
     Args:
         config (Dict[str, Any]): Configuration parameters for the worker.
     """
-    otel.start()
+    rp_tracer.start()
 
     # Start pinging RunPod to show that the worker is alive.
     heartbeat.start_ping()
diff --git a/tests/test_serverless/test_modules/run_scale.py b/tests/test_serverless/test_modules/run_scale.py
index 2150fea2..1730505b 100644
--- a/tests/test_serverless/test_modules/run_scale.py
+++ b/tests/test_serverless/test_modules/run_scale.py
@@ -3,7 +3,7 @@
 from faker import Faker
 from typing import Any, Dict, Optional, List
 
-from runpod import otel
+from runpod.serverless.modules import rp_tracer
 from runpod.serverless.modules.rp_scale import JobScaler, RunPodLogger, JobsProgress
 
 fake = Faker()
@@ -61,5 +61,5 @@ async def fake_handle_job(session, config, job) -> dict:
         "jobs_handler": fake_handle_job,
     }
 )
-otel.start()
+rp_tracer.start()
 job_scaler.start()