From 022f846d81f516351e07d0ddc3949552d3d8b8a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 28 Oct 2024 14:48:10 -0700 Subject: [PATCH 01/53] tmp: disable existing tracers --- runpod/http_client.py | 50 ++++--------------------------------------- 1 file changed, 4 insertions(+), 46 deletions(-) diff --git a/runpod/http_client.py b/runpod/http_client.py index 145060bf..ab838094 100644 --- a/runpod/http_client.py +++ b/runpod/http_client.py @@ -8,7 +8,6 @@ from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError from .cli.groups.config.functions import get_credentials -from .tracer import create_aiohttp_tracer, create_request_tracer from .user_agent import USER_AGENT @@ -33,62 +32,21 @@ def get_auth_header(): } -def AsyncClientSession(*args, **kwargs): # pylint: disable=invalid-name +def AsyncClientSession(*args, **kwargs): """ Deprecation from aiohttp.ClientSession forbids inheritance. This is now a factory method - TODO: use httpx """ return ClientSession( connector=TCPConnector(limit=0), headers=get_auth_header(), timeout=ClientTimeout(600, ceil_threshold=400), - trace_configs=[create_aiohttp_tracer()], *args, **kwargs, ) class SyncClientSession(requests.Session): - """ - Inherits requests.Session to override `request()` method for tracing - TODO: use httpx - """ - - def request(self, method, url, **kwargs): # pylint: disable=arguments-differ - """ - Override for tracing. Not using super().request() - to capture metrics for connection and transfer times - """ - with create_request_tracer() as tracer: - # Separate out the kwargs that are not applicable to `requests.Request` - request_kwargs = { - k: v - for k, v in kwargs.items() - # contains the names of the arguments - if k in requests.Request.__init__.__code__.co_varnames - } - - # Separate out the kwargs that are applicable to `requests.Request` - send_kwargs = {k: v for k, v in kwargs.items() if k not in request_kwargs} - - # Create a PreparedRequest object to hold the request details - req = requests.Request(method, url, **request_kwargs) - prepped = self.prepare_request(req) - tracer.request = prepped # Assign the request to the tracer - - # Merge environment settings - settings = self.merge_environment_settings( - prepped.url, - send_kwargs.get("proxies"), - send_kwargs.get("stream"), - send_kwargs.get("verify"), - send_kwargs.get("cert"), - ) - send_kwargs.update(settings) - - # Send the request - response = self.send(prepped, **send_kwargs) - tracer.response = response # Assign the response to the tracer - - return response + def __init__(self): + super().__init__() + self.headers.update({"User-Agent": USER_AGENT,}) From 150f534d84b8e0a0aabc91d6bc01f8422a6060f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 28 Oct 2024 14:49:10 -0700 Subject: [PATCH 02/53] tmp: auto-instrumentations for OTEL --- requirements.txt | 9 +++++ runpod/__init__.py | 1 + runpod/otel.py | 88 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 runpod/otel.py diff --git a/requirements.txt b/requirements.txt index 4c7681fe..261bb588 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,12 @@ tomlkit >= 0.12.2 tqdm-loggable >= 0.1.4 urllib3 >= 1.26.6 watchdog >= 3.0.0 + +setuptools==65.6.3 +opentelemetry-sdk +opentelemetry-exporter-otlp +opentelemetry-instrumentation-aiohttp-client +opentelemetry-instrumentation-asyncio +opentelemetry-instrumentation-requests +opentelemetry-instrumentation-threading +opentelemetry-instrumentation-urllib3 diff --git a/runpod/__init__.py b/runpod/__init__.py index 6611587d..6ea28ade 100644 --- a/runpod/__init__.py +++ b/runpod/__init__.py @@ -3,6 +3,7 @@ import logging import os +from . import otel from . import serverless from .api.ctl_commands import ( create_container_registry_auth, diff --git a/runpod/otel.py b/runpod/otel.py new file mode 100644 index 00000000..741a680d --- /dev/null +++ b/runpod/otel.py @@ -0,0 +1,88 @@ +import os +import typing +import aiohttp +from requests import PreparedRequest, Response + +from opentelemetry import trace +from opentelemetry.sdk.trace import Resource, TracerProvider, Span +from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + +from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor +from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor +from opentelemetry.instrumentation.requests import RequestsInstrumentor +from opentelemetry.instrumentation.threading import ThreadingInstrumentor +from opentelemetry.instrumentation.urllib3 import URLLib3Instrumentor + +from runpod.version import __version__ as runpod_version + + +trace.set_tracer_provider( + TracerProvider( + resource=Resource.create( + { + "service.name": "runpod-python-sdk", + "service.version": runpod_version, + "application": "runpod-serverless", + } + ) + ) +) + +tracer = trace.get_tracer_provider() + +if os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace": + tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) + +if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"): + tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) + + +# --- threading --- # +ThreadingInstrumentor().instrument() + + +# --- urllib3 --- # +URLLib3Instrumentor().instrument() + + +# --- asyncio --- # +AsyncioInstrumentor().instrument() + + +# --- requests --- # +def requests_request_hook(span: Span, request_obj: PreparedRequest): + pass + + +def requests_response_hook( + span: Span, request_obj: PreparedRequest, response: Response +): + pass + + +RequestsInstrumentor().instrument() + + +# --- aiohttp --- # +def aiohttp_request_hook(span: Span, params: aiohttp.TraceRequestStartParams): + if span and span.is_recording(): + span.set_attribute( + "custom_user_attribute_from_request_hook", "aiohttp_request_hook" + ) + + +def aiohttp_response_hook( + span: Span, + params: typing.Union[ + aiohttp.TraceRequestEndParams, + aiohttp.TraceRequestExceptionParams, + ], +): + if span and span.is_recording(): + span.set_attribute( + "custom_user_attribute_from_response_hook", "aiohttp_response_hook" + ) + + +AioHttpClientInstrumentor().instrument() From ecda2d4699b00cfbf62acfb2032b7d93eb3801f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 28 Oct 2024 22:36:46 -0700 Subject: [PATCH 03/53] tmp: our collector can't support gRPC behind ALB --- runpod/otel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/otel.py b/runpod/otel.py index 741a680d..622e3461 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -6,7 +6,7 @@ from opentelemetry import trace from opentelemetry.sdk.trace import Resource, TracerProvider, Span from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor From f03a03e7b443f4b56d743f63200e063ab873f99b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 30 Oct 2024 23:18:44 -0700 Subject: [PATCH 04/53] tmp: trace the entire JobScaler functions --- runpod/otel.py | 49 ++---- runpod/serverless/modules/rp_http.py | 23 +-- runpod/serverless/modules/rp_job.py | 117 ++++++++------- runpod/serverless/modules/rp_scale.py | 205 +++++++++++++++----------- 4 files changed, 206 insertions(+), 188 deletions(-) diff --git a/runpod/otel.py b/runpod/otel.py index 622e3461..c328f05c 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -1,12 +1,16 @@ import os -import typing -import aiohttp -from requests import PreparedRequest, Response from opentelemetry import trace -from opentelemetry.sdk.trace import Resource, TracerProvider, Span -from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter +from opentelemetry.sdk.resources import ( + Resource, + SERVICE_NAME, + SERVICE_VERSION, + HOST_NAME, + +) from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor @@ -21,9 +25,10 @@ TracerProvider( resource=Resource.create( { - "service.name": "runpod-python-sdk", - "service.version": runpod_version, "application": "runpod-serverless", + SERVICE_NAME: "runpod-python-sdk", + SERVICE_VERSION: runpod_version, + HOST_NAME: os.getenv("RUNPOD_POD_HOSTNAME"), } ) ) @@ -51,38 +56,8 @@ # --- requests --- # -def requests_request_hook(span: Span, request_obj: PreparedRequest): - pass - - -def requests_response_hook( - span: Span, request_obj: PreparedRequest, response: Response -): - pass - - RequestsInstrumentor().instrument() # --- aiohttp --- # -def aiohttp_request_hook(span: Span, params: aiohttp.TraceRequestStartParams): - if span and span.is_recording(): - span.set_attribute( - "custom_user_attribute_from_request_hook", "aiohttp_request_hook" - ) - - -def aiohttp_response_hook( - span: Span, - params: typing.Union[ - aiohttp.TraceRequestEndParams, - aiohttp.TraceRequestExceptionParams, - ], -): - if span and span.is_recording(): - span.set_attribute( - "custom_user_attribute_from_response_hook", "aiohttp_response_hook" - ) - - AioHttpClientInstrumentor().instrument() diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py index 3d82d35b..26925e47 100644 --- a/runpod/serverless/modules/rp_http.py +++ b/runpod/serverless/modules/rp_http.py @@ -5,6 +5,7 @@ import json import os +from opentelemetry.trace import get_tracer from aiohttp import ClientError from aiohttp_retry import FibonacciRetry, RetryClient @@ -24,6 +25,7 @@ JOB_STREAM_URL = JOB_STREAM_URL_TEMPLATE.replace("$RUNPOD_POD_ID", WORKER_ID) log = RunPodLogger() +tracer = get_tracer(__name__) async def _transmit(client_session: ClientSession, url, job_data): @@ -44,8 +46,9 @@ async def _transmit(client_session: ClientSession, url, job_data): "raise_for_status": True, } - async with retry_client.post(url, **kwargs) as client_response: - await client_response.text() + with tracer.start_as_current_span("rp_http.transmit"): + async with retry_client.post(url, **kwargs) as client_response: + await client_response.text() async def _handle_result( @@ -55,7 +58,7 @@ async def _handle_result( A helper function to handle the result, either for sending or streaming. """ try: - session.headers["X-Request-ID"] = job["id"] + session.headers["X-Request-ID"] = job["id"] # legacy serialized_job_data = json.dumps(job_data, ensure_ascii=False) @@ -84,15 +87,17 @@ async def send_result(session, job_data, job, is_stream=False): """ Return the job results. """ - await _handle_result( - session, job_data, job, JOB_DONE_URL, "Results sent.", is_stream=is_stream - ) + with tracer.start_as_current_span("rp_http.send_result"): + await _handle_result( + session, job_data, job, JOB_DONE_URL, "Results sent.", is_stream=is_stream + ) async def stream_result(session, job_data, job): """ Return the stream job results. """ - await _handle_result( - session, job_data, job, JOB_STREAM_URL, "Intermediate results sent." - ) + with tracer.start_as_current_span("rp_http.stream_result"): + await _handle_result( + session, job_data, job, JOB_STREAM_URL, "Intermediate results sent." + ) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index ddac4ec0..c244c22e 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -7,6 +7,7 @@ import os import traceback from typing import Any, AsyncGenerator, Callable, Dict, Optional, Union, List +from opentelemetry.trace import get_tracer from runpod.http_client import ClientSession, TooManyRequests from runpod.serverless.modules.rp_logger import RunPodLogger @@ -22,6 +23,7 @@ log = RunPodLogger() job_progress = JobsProgress() +tracer = get_tracer(__name__) def _job_get_url(batch_size: int = 1): @@ -160,53 +162,58 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]: log.info("Started.", job["id"]) run_result = {} - try: - handler_return = handler(job) - job_output = ( - await handler_return - if inspect.isawaitable(handler_return) - else handler_return - ) + with tracer.start_as_current_span("rp_job.run_job") as span: + span.set_attribute("job.id", job.get("id")) + span.set_attribute("request_id", job.get("id")) # legacy - log.debug(f"Handler output: {job_output}", job["id"]) + try: + handler_return = handler(job) + job_output = ( + await handler_return + if inspect.isawaitable(handler_return) + else handler_return + ) + + log.debug(f"Handler output: {job_output}", job["id"]) - if isinstance(job_output, dict): - error_msg = job_output.pop("error", None) - refresh_worker = job_output.pop("refresh_worker", None) - run_result["output"] = job_output + if isinstance(job_output, dict): + error_msg = job_output.pop("error", None) + refresh_worker = job_output.pop("refresh_worker", None) + run_result["output"] = job_output - if error_msg: - run_result["error"] = error_msg - if refresh_worker: - run_result["stopPod"] = True + if error_msg: + run_result["error"] = error_msg + if refresh_worker: + run_result["stopPod"] = True - elif isinstance(job_output, bool): - run_result = {"output": job_output} + elif isinstance(job_output, bool): + run_result = {"output": job_output} - else: - run_result = {"output": job_output} + else: + run_result = {"output": job_output} - if run_result.get("output") == {}: - run_result.pop("output") + if run_result.get("output") == {}: + run_result.pop("output") - check_return_size(run_result) # Checks the size of the return body. + check_return_size(run_result) # Checks the size of the return body. - except Exception as err: - error_info = { - "error_type": str(type(err)), - "error_message": str(err), - "error_traceback": traceback.format_exc(), - "hostname": os.environ.get("RUNPOD_POD_HOSTNAME", "unknown"), - "worker_id": os.environ.get("RUNPOD_POD_ID", "unknown"), - "runpod_version": runpod_version, - } + except Exception as err: + span.record_exception(err) + error_info = { + "error_type": str(type(err)), + "error_message": str(err), + "error_traceback": traceback.format_exc(), + "hostname": os.environ.get("RUNPOD_POD_HOSTNAME", "unknown"), + "worker_id": os.environ.get("RUNPOD_POD_ID", "unknown"), + "runpod_version": runpod_version, + } - log.error("Captured Handler Exception", job["id"]) - log.error(json.dumps(error_info, indent=4)) - run_result = {"error": json.dumps(error_info)} + log.error("Captured Handler Exception", job["id"]) + log.error(json.dumps(error_info, indent=4)) + run_result = {"error": json.dumps(error_info)} - finally: - log.debug(f"run_job return: {run_result}", job["id"]) + finally: + log.debug(f"run_job return: {run_result}", job["id"]) return run_result @@ -224,20 +231,22 @@ async def run_job_generator( job["id"], ) - try: - job_output = handler(job) - - if is_async_gen: - async for output_partial in job_output: - log.debug(f"Async Generator output: {output_partial}", job["id"]) - yield {"output": output_partial} - else: - for output_partial in job_output: - log.debug(f"Generator output: {output_partial}", job["id"]) - yield {"output": output_partial} - - except Exception as err: - log.error(err, job["id"]) - yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"} - finally: - log.info("Finished running generator.", job["id"]) + with tracer.start_as_current_span("rp_job.run_job_generator") as span: + try: + job_output = handler(job) + + if is_async_gen: + async for output_partial in job_output: + log.debug(f"Async Generator output: {output_partial}", job["id"]) + yield {"output": output_partial} + else: + for output_partial in job_output: + log.debug(f"Generator output: {output_partial}", job["id"]) + yield {"output": output_partial} + + except Exception as err: + span.record_exception(err) + log.error(err, job["id"]) + yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"} + finally: + log.info("Finished running generator.", job["id"]) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index da4b0fd0..99b2b70a 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -4,8 +4,10 @@ """ import asyncio +import os import signal from typing import Any, Dict +from opentelemetry.trace import get_tracer from ...http_client import AsyncClientSession, ClientSession, TooManyRequests from .rp_job import get_job, handle_job @@ -15,6 +17,7 @@ log = RunPodLogger() job_list = JobsQueue() job_progress = JobsProgress() +tracer = get_tracer(__name__) def _default_concurrency_modifier(current_concurrency: int) -> int: @@ -54,16 +57,23 @@ def start(self): when the user sends a SIGTERM or SIGINT signal. This is typically the case when the worker is running in a container. """ - try: - # Register signal handlers for graceful shutdown - signal.signal(signal.SIGTERM, self.handle_shutdown) - signal.signal(signal.SIGINT, self.handle_shutdown) - except ValueError: - log.warning("Signal handling is only supported in the main thread.") + with tracer.start_as_current_span("JobScaler.start") as span: + span.set_attributes({ + "worker.hostname": os.getenv("RUNPOD_POD_HOSTNAME", "unknown"), + "worker.id": os.getenv("RUNPOD_POD_ID", "unknown"), + "endpoint.id": os.getenv("RUNPOD_ENDPOINT_ID", "unknown"), + }) - # Start the main loop - # Run forever until the worker is signalled to shut down. - asyncio.run(self.run()) + try: + # Register signal handlers for graceful shutdown + signal.signal(signal.SIGTERM, self.handle_shutdown) + signal.signal(signal.SIGINT, self.handle_shutdown) + except ValueError: + log.warning("Signal handling is only supported in the main thread.") + + # Start the main loop + # Run forever until the worker is signalled to shut down. + asyncio.run(self.run()) def handle_shutdown(self, signum, frame): """ @@ -81,16 +91,17 @@ def handle_shutdown(self, signum, frame): self.kill_worker() async def run(self): - # Create an async session that will be closed when the worker is killed. - async with AsyncClientSession() as session: - # Create tasks for getting and running jobs. - jobtake_task = asyncio.create_task(self.get_jobs(session)) - jobrun_task = asyncio.create_task(self.run_jobs(session)) + with tracer.start_as_current_span("JobScaler.run"): + # Create an async session that will be closed when the worker is killed. + async with AsyncClientSession() as session: + # Create tasks for getting and running jobs. + jobtake_task = asyncio.create_task(self.get_jobs(session)) + jobrun_task = asyncio.create_task(self.run_jobs(session)) - tasks = [jobtake_task, jobrun_task] + tasks = [jobtake_task, jobrun_task] - # Concurrently run both tasks and wait for both to finish. - await asyncio.gather(*tasks) + # Concurrently run both tasks and wait for both to finish. + await asyncio.gather(*tasks) def is_alive(self): """ @@ -114,50 +125,61 @@ async def get_jobs(self, session: ClientSession): Adds jobs to the JobsQueue """ while self.is_alive(): - log.debug(f"JobScaler.get_jobs | Jobs in progress: {job_progress.get_job_count()}") - - self.current_concurrency = self.concurrency_modifier( - self.current_concurrency - ) - log.debug(f"JobScaler.get_jobs | Concurrency set to: {self.current_concurrency}") - - jobs_needed = self.current_concurrency - job_progress.get_job_count() - if jobs_needed <= 0: - log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.") - await asyncio.sleep(1) # don't go rapidly - continue - - try: - # Keep the connection to the blocking call up to 30 seconds - acquired_jobs = await asyncio.wait_for( - get_job(session, jobs_needed), timeout=30 + with tracer.start_as_current_span("JobScaler.get_jobs") as span: + self.current_concurrency = self.concurrency_modifier( + self.current_concurrency ) - if not acquired_jobs: - log.debug("JobScaler.get_jobs | No jobs acquired.") + jobs_needed = self.current_concurrency - job_progress.get_job_count() + if jobs_needed <= 0: + log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.") + await asyncio.sleep(1) # don't go rapidly continue - - for job in acquired_jobs: - await job_list.add_job(job) - - log.info(f"Jobs in queue: {job_list.get_job_count()}") - - except TooManyRequests: - log.debug(f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds.") - await asyncio.sleep(5) # debounce for 5 seconds - except asyncio.CancelledError: - log.debug("JobScaler.get_jobs | Request was cancelled.") - except TimeoutError: - log.debug("JobScaler.get_jobs | Job acquisition timed out. Retrying.") - except TypeError as error: - log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.") - except Exception as error: - log.error( - f"Failed to get job. | Error Type: {type(error).__name__} | Error Message: {str(error)}" - ) - finally: - # Yield control back to the event loop - await asyncio.sleep(0) + + span.set_attributes({ + "jobs.current_concurrency": self.current_concurrency, + "jobs.in_progress": job_progress.get_job_count(), + "jobs.needed": jobs_needed, + }) + + try: + # Keep the connection to the blocking call up to 30 seconds + acquired_jobs = await asyncio.wait_for( + get_job(session, jobs_needed), timeout=30 + ) + span.set_attribute("jobs.acquired", len(acquired_jobs)) + + if not acquired_jobs: + log.debug("JobScaler.get_jobs | No jobs acquired.") + continue + + for job in acquired_jobs: + await job_list.add_job(job) + + span.set_attribute("jobs.in_queue", len(job_list.get_job_count())) + log.info(f"Jobs in queue: {job_list.get_job_count()}") + + except TooManyRequests as error: + span.record_exception(error) + log.debug(f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds.") + await asyncio.sleep(5) # debounce for 5 seconds + except asyncio.CancelledError as error: + span.record_exception(error) + log.debug("JobScaler.get_jobs | Request was cancelled.") + except TimeoutError as error: + span.record_exception(error) + log.debug("JobScaler.get_jobs | Job acquisition timed out. Retrying.") + except TypeError as error: + span.record_exception(error) + log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.") + except Exception as error: + span.record_exception(error) + log.error( + f"Failed to get job. | Error Type: {type(error).__name__} | Error Message: {str(error)}" + ) + finally: + # Yield control back to the event loop + await asyncio.sleep(0) async def run_jobs(self, session: ClientSession): """ @@ -168,27 +190,29 @@ async def run_jobs(self, session: ClientSession): tasks = [] # Store the tasks for concurrent job processing while self.is_alive() or not job_list.empty(): - # Fetch as many jobs as the concurrency allows - while len(tasks) < self.current_concurrency and not job_list.empty(): - job = await job_list.get_job() + with tracer.start_as_current_span("JobScaler.run_jobs") as span: + # Fetch as many jobs as the concurrency allows + while len(tasks) < self.current_concurrency and not job_list.empty(): + job = await job_list.get_job() - # Create a new task for each job and add it to the task list - task = asyncio.create_task(self.handle_job(session, job)) - tasks.append(task) + # Create a new task for each job and add it to the task list + task = asyncio.create_task(self.handle_job(session, job)) + tasks.append(task) - # Wait for any job to finish - if tasks: - log.info(f"Jobs in progress: {len(tasks)}") + # Wait for any job to finish + if tasks: + span.set_attribute("jobs.running", len(tasks)) + log.info(f"Jobs in progress: {len(tasks)}") - done, pending = await asyncio.wait( - tasks, return_when=asyncio.FIRST_COMPLETED - ) + done, pending = await asyncio.wait( + tasks, return_when=asyncio.FIRST_COMPLETED + ) - # Remove completed tasks from the list - tasks = [t for t in tasks if t not in done] + # Remove completed tasks from the list + tasks = [t for t in tasks if t not in done] - # Yield control back to the event loop - await asyncio.sleep(0) + # Yield control back to the event loop + await asyncio.sleep(0) # Ensure all remaining tasks finish before stopping await asyncio.gather(*tasks) @@ -197,22 +221,27 @@ async def handle_job(self, session: ClientSession, job: dict): """ Process an individual job. This function is run concurrently for multiple jobs. """ - log.debug(f"JobScaler.handle_job | {job}") - job_progress.add(job) + with tracer.start_as_current_span("JobScaler.handle_job") as span: + span.set_attribute("job.id", job.get("id")) + span.set_attribute("request_id", job.get("id")) # legacy + + log.debug(f"JobScaler.handle_job | {job}") + job_progress.add(job) - try: - await handle_job(session, self.config, job) + try: + await handle_job(session, self.config, job) - if self.config.get("refresh_worker", False): - self.kill_worker() + if self.config.get("refresh_worker", False): + self.kill_worker() - except Exception as err: - log.error(f"Error handling job: {err}", job["id"]) - raise err + except Exception as err: + span.record_exception(err) + log.error(f"Error handling job: {err}", job["id"]) + raise err - finally: - # Inform JobsQueue of a task completion - job_list.task_done() + finally: + # Inform JobsQueue of a task completion + job_list.task_done() - # Job is no longer in progress - job_progress.remove(job["id"]) + # Job is no longer in progress + job_progress.remove(job["id"]) From 1b4d9e3a78a77632db19d15bf8ec3981da87696d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 30 Oct 2024 23:47:24 -0700 Subject: [PATCH 05/53] tmp: too much junk traces from the loop --- runpod/serverless/modules/rp_scale.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index 99b2b70a..5078db06 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -124,8 +124,8 @@ async def get_jobs(self, session: ClientSession): Adds jobs to the JobsQueue """ - while self.is_alive(): - with tracer.start_as_current_span("JobScaler.get_jobs") as span: + with tracer.start_as_current_span("JobScaler.get_jobs") as span: + while self.is_alive(): self.current_concurrency = self.concurrency_modifier( self.current_concurrency ) @@ -136,7 +136,7 @@ async def get_jobs(self, session: ClientSession): await asyncio.sleep(1) # don't go rapidly continue - span.set_attributes({ + span.add_event("getting jobs", { "jobs.current_concurrency": self.current_concurrency, "jobs.in_progress": job_progress.get_job_count(), "jobs.needed": jobs_needed, @@ -147,7 +147,7 @@ async def get_jobs(self, session: ClientSession): acquired_jobs = await asyncio.wait_for( get_job(session, jobs_needed), timeout=30 ) - span.set_attribute("jobs.acquired", len(acquired_jobs)) + span.add_event("acquired jobs", {"jobs.acquired": len(acquired_jobs)}) if not acquired_jobs: log.debug("JobScaler.get_jobs | No jobs acquired.") @@ -156,7 +156,7 @@ async def get_jobs(self, session: ClientSession): for job in acquired_jobs: await job_list.add_job(job) - span.set_attribute("jobs.in_queue", len(job_list.get_job_count())) + span.add_event("queued jobs", {"jobs.in_queue", job_list.get_job_count()}) log.info(f"Jobs in queue: {job_list.get_job_count()}") except TooManyRequests as error: @@ -189,8 +189,8 @@ async def run_jobs(self, session: ClientSession): """ tasks = [] # Store the tasks for concurrent job processing - while self.is_alive() or not job_list.empty(): - with tracer.start_as_current_span("JobScaler.run_jobs") as span: + with tracer.start_as_current_span("JobScaler.run_jobs") as span: + while self.is_alive() or not job_list.empty(): # Fetch as many jobs as the concurrency allows while len(tasks) < self.current_concurrency and not job_list.empty(): job = await job_list.get_job() @@ -201,7 +201,7 @@ async def run_jobs(self, session: ClientSession): # Wait for any job to finish if tasks: - span.set_attribute("jobs.running", len(tasks)) + span.add_event("running jobs", {"jobs.running": len(tasks)}) log.info(f"Jobs in progress: {len(tasks)}") done, pending = await asyncio.wait( From a2759af14a62cde1881d5972a9e380ea38b941d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 31 Oct 2024 00:20:45 -0700 Subject: [PATCH 06/53] tmp: still too much traces --- runpod/serverless/modules/rp_scale.py | 131 ++++++++++++++------------ 1 file changed, 69 insertions(+), 62 deletions(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index 5078db06..6dd9aa8d 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -20,6 +20,13 @@ tracer = get_tracer(__name__) +worker_attributes = { + "worker.hostname": os.getenv("RUNPOD_POD_HOSTNAME", "unknown"), + "worker.id": os.getenv("RUNPOD_POD_ID", "unknown"), + "endpoint.id": os.getenv("RUNPOD_ENDPOINT_ID", "unknown"), +} + + def _default_concurrency_modifier(current_concurrency: int) -> int: """ Default concurrency modifier. @@ -57,23 +64,16 @@ def start(self): when the user sends a SIGTERM or SIGINT signal. This is typically the case when the worker is running in a container. """ - with tracer.start_as_current_span("JobScaler.start") as span: - span.set_attributes({ - "worker.hostname": os.getenv("RUNPOD_POD_HOSTNAME", "unknown"), - "worker.id": os.getenv("RUNPOD_POD_ID", "unknown"), - "endpoint.id": os.getenv("RUNPOD_ENDPOINT_ID", "unknown"), - }) - - try: - # Register signal handlers for graceful shutdown - signal.signal(signal.SIGTERM, self.handle_shutdown) - signal.signal(signal.SIGINT, self.handle_shutdown) - except ValueError: - log.warning("Signal handling is only supported in the main thread.") + try: + # Register signal handlers for graceful shutdown + signal.signal(signal.SIGTERM, self.handle_shutdown) + signal.signal(signal.SIGINT, self.handle_shutdown) + except ValueError: + log.warning("Signal handling is only supported in the main thread.") - # Start the main loop - # Run forever until the worker is signalled to shut down. - asyncio.run(self.run()) + # Start the main loop + # Run forever until the worker is signalled to shut down. + asyncio.run(self.run()) def handle_shutdown(self, signum, frame): """ @@ -91,17 +91,16 @@ def handle_shutdown(self, signum, frame): self.kill_worker() async def run(self): - with tracer.start_as_current_span("JobScaler.run"): - # Create an async session that will be closed when the worker is killed. - async with AsyncClientSession() as session: - # Create tasks for getting and running jobs. - jobtake_task = asyncio.create_task(self.get_jobs(session)) - jobrun_task = asyncio.create_task(self.run_jobs(session)) + # Create an async session that will be closed when the worker is killed. + async with AsyncClientSession() as session: + # Create tasks for getting and running jobs. + jobtake_task = asyncio.create_task(self.get_jobs(session)) + jobrun_task = asyncio.create_task(self.run_jobs(session)) - tasks = [jobtake_task, jobrun_task] + tasks = [jobtake_task, jobrun_task] - # Concurrently run both tasks and wait for both to finish. - await asyncio.gather(*tasks) + # Concurrently run both tasks and wait for both to finish. + await asyncio.gather(*tasks) def is_alive(self): """ @@ -124,30 +123,37 @@ async def get_jobs(self, session: ClientSession): Adds jobs to the JobsQueue """ - with tracer.start_as_current_span("JobScaler.get_jobs") as span: - while self.is_alive(): - self.current_concurrency = self.concurrency_modifier( - self.current_concurrency - ) + while self.is_alive(): + self.current_concurrency = self.concurrency_modifier( + self.current_concurrency + ) + + jobs_needed = self.current_concurrency - job_progress.get_job_count() + if jobs_needed <= 0: + log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.") + await asyncio.sleep(1) # don't go rapidly + continue - jobs_needed = self.current_concurrency - job_progress.get_job_count() - if jobs_needed <= 0: - log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.") - await asyncio.sleep(1) # don't go rapidly - continue - - span.add_event("getting jobs", { - "jobs.current_concurrency": self.current_concurrency, - "jobs.in_progress": job_progress.get_job_count(), - "jobs.needed": jobs_needed, - }) + with tracer.start_as_current_span("JobScaler.get_jobs") as span: + span.set_attributes(worker_attributes) try: + span.add_event( + "getting jobs", + { + "jobs.current_concurrency": self.current_concurrency, + "jobs.in_progress": job_progress.get_job_count(), + "jobs.needed": jobs_needed, + }, + ) + # Keep the connection to the blocking call up to 30 seconds acquired_jobs = await asyncio.wait_for( get_job(session, jobs_needed), timeout=30 ) - span.add_event("acquired jobs", {"jobs.acquired": len(acquired_jobs)}) + span.add_event( + "acquired jobs", {"jobs.acquired": len(acquired_jobs)} + ) if not acquired_jobs: log.debug("JobScaler.get_jobs | No jobs acquired.") @@ -156,7 +162,9 @@ async def get_jobs(self, session: ClientSession): for job in acquired_jobs: await job_list.add_job(job) - span.add_event("queued jobs", {"jobs.in_queue", job_list.get_job_count()}) + span.add_event( + "queued jobs", {"jobs.in_queue", job_list.get_job_count()} + ) log.info(f"Jobs in queue: {job_list.get_job_count()}") except TooManyRequests as error: @@ -189,30 +197,28 @@ async def run_jobs(self, session: ClientSession): """ tasks = [] # Store the tasks for concurrent job processing - with tracer.start_as_current_span("JobScaler.run_jobs") as span: - while self.is_alive() or not job_list.empty(): - # Fetch as many jobs as the concurrency allows - while len(tasks) < self.current_concurrency and not job_list.empty(): - job = await job_list.get_job() + while self.is_alive() or not job_list.empty(): + # Fetch as many jobs as the concurrency allows + while len(tasks) < self.current_concurrency and not job_list.empty(): + job = await job_list.get_job() - # Create a new task for each job and add it to the task list - task = asyncio.create_task(self.handle_job(session, job)) - tasks.append(task) + # Create a new task for each job and add it to the task list + task = asyncio.create_task(self.handle_job(session, job)) + tasks.append(task) - # Wait for any job to finish - if tasks: - span.add_event("running jobs", {"jobs.running": len(tasks)}) - log.info(f"Jobs in progress: {len(tasks)}") + # Wait for any job to finish + if tasks: + log.info(f"Jobs in progress: {len(tasks)}") - done, pending = await asyncio.wait( - tasks, return_when=asyncio.FIRST_COMPLETED - ) + done, pending = await asyncio.wait( + tasks, return_when=asyncio.FIRST_COMPLETED + ) - # Remove completed tasks from the list - tasks = [t for t in tasks if t not in done] + # Remove completed tasks from the list + tasks = [t for t in tasks if t not in done] - # Yield control back to the event loop - await asyncio.sleep(0) + # Yield control back to the event loop + await asyncio.sleep(0) # Ensure all remaining tasks finish before stopping await asyncio.gather(*tasks) @@ -222,6 +228,7 @@ async def handle_job(self, session: ClientSession, job: dict): Process an individual job. This function is run concurrently for multiple jobs. """ with tracer.start_as_current_span("JobScaler.handle_job") as span: + span.set_attributes(worker_attributes) span.set_attribute("job.id", job.get("id")) span.set_attribute("request_id", job.get("id")) # legacy From b3fae8f130707b32a6a0500f9013847e38c8d294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 31 Oct 2024 09:27:46 -0700 Subject: [PATCH 07/53] tmp: correction --- runpod/serverless/modules/rp_scale.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index 6dd9aa8d..f9cc6956 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -151,14 +151,16 @@ async def get_jobs(self, session: ClientSession): acquired_jobs = await asyncio.wait_for( get_job(session, jobs_needed), timeout=30 ) - span.add_event( - "acquired jobs", {"jobs.acquired": len(acquired_jobs)} - ) if not acquired_jobs: + span.add_event("acquired no jobs", {"jobs.acquired": 0}) log.debug("JobScaler.get_jobs | No jobs acquired.") continue + span.add_event( + "acquired jobs", {"jobs.acquired": len(acquired_jobs)} + ) + for job in acquired_jobs: await job_list.add_job(job) @@ -169,14 +171,18 @@ async def get_jobs(self, session: ClientSession): except TooManyRequests as error: span.record_exception(error) - log.debug(f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds.") + log.debug( + f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds." + ) await asyncio.sleep(5) # debounce for 5 seconds except asyncio.CancelledError as error: span.record_exception(error) log.debug("JobScaler.get_jobs | Request was cancelled.") except TimeoutError as error: span.record_exception(error) - log.debug("JobScaler.get_jobs | Job acquisition timed out. Retrying.") + log.debug( + "JobScaler.get_jobs | Job acquisition timed out. Retrying." + ) except TypeError as error: span.record_exception(error) log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.") From 6f0efac14bf5050cc1d78c11a5c8325aa1f6794e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 3 Nov 2024 13:05:40 -0800 Subject: [PATCH 08/53] tmp: only trace http requests from http_client.py --- runpod/http_client.py | 15 ++++++++++----- runpod/otel.py | 14 ++++---------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/runpod/http_client.py b/runpod/http_client.py index ab838094..9fb2924c 100644 --- a/runpod/http_client.py +++ b/runpod/http_client.py @@ -1,15 +1,19 @@ """ -HTTP Client abstractions +HTTP Client abstractions with OpenTelemetry tracing support. """ import os - import requests from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError +from opentelemetry import trace +from opentelemetry.instrumentation.aiohttp_client import create_trace_config +from opentelemetry.instrumentation.requests import RequestsInstrumentor from .cli.groups.config.functions import get_credentials from .user_agent import USER_AGENT +tracer = trace.get_tracer(__name__) + class TooManyRequests(ClientResponseError): pass @@ -34,13 +38,13 @@ def get_auth_header(): def AsyncClientSession(*args, **kwargs): """ - Deprecation from aiohttp.ClientSession forbids inheritance. - This is now a factory method + Factory method for an async client session with OpenTelemetry tracing. """ return ClientSession( connector=TCPConnector(limit=0), headers=get_auth_header(), timeout=ClientTimeout(600, ceil_threshold=400), + trace_configs=[create_trace_config()], *args, **kwargs, ) @@ -49,4 +53,5 @@ def AsyncClientSession(*args, **kwargs): class SyncClientSession(requests.Session): def __init__(self): super().__init__() - self.headers.update({"User-Agent": USER_AGENT,}) + self.headers.update(get_auth_header()) + RequestsInstrumentor().instrument_session(self) diff --git a/runpod/otel.py b/runpod/otel.py index c328f05c..c7b25d7e 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -7,14 +7,14 @@ from opentelemetry.sdk.resources import ( Resource, SERVICE_NAME, + SERVICE_NAMESPACE, + SERVICE_INSTANCE_ID, SERVICE_VERSION, HOST_NAME, ) -from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor -from opentelemetry.instrumentation.requests import RequestsInstrumentor from opentelemetry.instrumentation.threading import ThreadingInstrumentor from opentelemetry.instrumentation.urllib3 import URLLib3Instrumentor @@ -27,6 +27,8 @@ { "application": "runpod-serverless", SERVICE_NAME: "runpod-python-sdk", + SERVICE_NAMESPACE: os.getenv("RUNPOD_ENDPOINT_ID"), + SERVICE_INSTANCE_ID: os.getenv("RUNPOD_POD_ID"), SERVICE_VERSION: runpod_version, HOST_NAME: os.getenv("RUNPOD_POD_HOSTNAME"), } @@ -53,11 +55,3 @@ # --- asyncio --- # AsyncioInstrumentor().instrument() - - -# --- requests --- # -RequestsInstrumentor().instrument() - - -# --- aiohttp --- # -AioHttpClientInstrumentor().instrument() From 4a96d0fe6e7867499e750133fac4d239b669be91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 3 Nov 2024 14:19:21 -0800 Subject: [PATCH 09/53] tmp: trace to connect job queues, progress, handling, and reporting --- runpod/http_client.py | 2 +- runpod/serverless/modules/rp_http.py | 60 +++++++++++------------ runpod/serverless/modules/rp_job.py | 14 +++--- runpod/serverless/modules/rp_scale.py | 70 ++++++++++----------------- 4 files changed, 64 insertions(+), 82 deletions(-) diff --git a/runpod/http_client.py b/runpod/http_client.py index 9fb2924c..97e98829 100644 --- a/runpod/http_client.py +++ b/runpod/http_client.py @@ -54,4 +54,4 @@ class SyncClientSession(requests.Session): def __init__(self): super().__init__() self.headers.update(get_auth_header()) - RequestsInstrumentor().instrument_session(self) + RequestsInstrumentor().instrument(session=self) \ No newline at end of file diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py index 26925e47..be5e640a 100644 --- a/runpod/serverless/modules/rp_http.py +++ b/runpod/serverless/modules/rp_http.py @@ -5,7 +5,7 @@ import json import os -from opentelemetry.trace import get_tracer +from opentelemetry.trace import get_tracer, SpanKind from aiohttp import ClientError from aiohttp_retry import FibonacciRetry, RetryClient @@ -46,9 +46,8 @@ async def _transmit(client_session: ClientSession, url, job_data): "raise_for_status": True, } - with tracer.start_as_current_span("rp_http.transmit"): - async with retry_client.post(url, **kwargs) as client_response: - await client_response.text() + async with retry_client.post(url, **kwargs) as client_response: + await client_response.text() async def _handle_result( @@ -57,47 +56,48 @@ async def _handle_result( """ A helper function to handle the result, either for sending or streaming. """ - try: - session.headers["X-Request-ID"] = job["id"] # legacy + with tracer.start_as_current_span("handle_result", kind=SpanKind.INTERNAL) as span: + span.set_attribute("request_id", job.get("id")) - serialized_job_data = json.dumps(job_data, ensure_ascii=False) + try: + serialized_job_data = json.dumps(job_data, ensure_ascii=False) - is_stream = "true" if is_stream else "false" - url = url_template.replace("$ID", job["id"]) + f"&isStream={is_stream}" + is_stream = "true" if is_stream else "false" + url = url_template.replace("$ID", job["id"]) + f"&isStream={is_stream}" - await _transmit(session, url, serialized_job_data) - log.debug(f"{log_message}", job["id"]) + await _transmit(session, url, serialized_job_data) + log.debug(f"{log_message}", job["id"]) - except ClientError as err: - log.error(f"Failed to return job results. | {err}", job["id"]) + except ClientError as err: + span.record_exception(err) + log.error(f"Failed to return job results. | {err}", job["id"]) - except (TypeError, RuntimeError) as err: - log.error(f"Error while returning job result. | {err}", job["id"]) + except (TypeError, RuntimeError) as err: + span.record_exception(err) + log.error(f"Error while returning job result. | {err}", job["id"]) - finally: - # job_data status is used for local development with FastAPI - if ( - url_template == JOB_DONE_URL - and job_data.get("status", None) != "IN_PROGRESS" - ): - log.info("Finished.", job["id"]) + finally: + # job_data status is used for local development with FastAPI + if ( + url_template == JOB_DONE_URL + and job_data.get("status", None) != "IN_PROGRESS" + ): + log.info("Finished.", job["id"]) async def send_result(session, job_data, job, is_stream=False): """ Return the job results. """ - with tracer.start_as_current_span("rp_http.send_result"): - await _handle_result( - session, job_data, job, JOB_DONE_URL, "Results sent.", is_stream=is_stream - ) + await _handle_result( + session, job_data, job, JOB_DONE_URL, "Results sent.", is_stream=is_stream + ) async def stream_result(session, job_data, job): """ Return the stream job results. """ - with tracer.start_as_current_span("rp_http.stream_result"): - await _handle_result( - session, job_data, job, JOB_STREAM_URL, "Intermediate results sent." - ) + await _handle_result( + session, job_data, job, JOB_STREAM_URL, "Intermediate results sent." + ) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index c244c22e..e6b729b8 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -7,7 +7,7 @@ import os import traceback from typing import Any, AsyncGenerator, Callable, Dict, Optional, Union, List -from opentelemetry.trace import get_tracer +from opentelemetry.trace import get_tracer, SpanKind from runpod.http_client import ClientSession, TooManyRequests from runpod.serverless.modules.rp_logger import RunPodLogger @@ -107,11 +107,10 @@ async def get_job( return jobs -async def handle_job(session: ClientSession, config: Dict[str, Any], job) -> dict: +async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) -> dict: if is_generator(config["handler"]): is_stream = True generator_output = run_job_generator(config["handler"], job) - log.debug("Handler is a generator, streaming results.", job["id"]) job_result = {"output": []} async for stream_output in generator_output: @@ -162,9 +161,8 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]: log.info("Started.", job["id"]) run_result = {} - with tracer.start_as_current_span("rp_job.run_job") as span: - span.set_attribute("job.id", job.get("id")) - span.set_attribute("request_id", job.get("id")) # legacy + with tracer.start_as_current_span("run_job", kind=SpanKind.INTERNAL) as span: + span.set_attribute("request_id", job.get("id")) try: handler_return = handler(job) @@ -231,7 +229,9 @@ async def run_job_generator( job["id"], ) - with tracer.start_as_current_span("rp_job.run_job_generator") as span: + with tracer.start_as_current_span("run_job_generator", kind=SpanKind.INTERNAL) as span: + span.set_attribute("request_id", job.get("id")) + try: job_output = handler(job) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index f9cc6956..34c10a1d 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -7,7 +7,8 @@ import os import signal from typing import Any, Dict -from opentelemetry.trace import get_tracer +from uuid import uuid1 # traceable to machine's MAC address + timestamp +from opentelemetry.trace import get_tracer, SpanKind from ...http_client import AsyncClientSession, ClientSession, TooManyRequests from .rp_job import get_job, handle_job @@ -20,13 +21,6 @@ tracer = get_tracer(__name__) -worker_attributes = { - "worker.hostname": os.getenv("RUNPOD_POD_HOSTNAME", "unknown"), - "worker.id": os.getenv("RUNPOD_POD_ID", "unknown"), - "endpoint.id": os.getenv("RUNPOD_ENDPOINT_ID", "unknown"), -} - - def _default_concurrency_modifier(current_concurrency: int) -> int: """ Default concurrency modifier. @@ -130,22 +124,20 @@ async def get_jobs(self, session: ClientSession): jobs_needed = self.current_concurrency - job_progress.get_job_count() if jobs_needed <= 0: - log.debug("JobScaler.get_jobs | Queue is full. Retrying soon.") + log.debug("Queue is full. Retrying soon.") await asyncio.sleep(1) # don't go rapidly continue - with tracer.start_as_current_span("JobScaler.get_jobs") as span: - span.set_attributes(worker_attributes) + with tracer.start_as_current_span("get_jobs", kind=SpanKind.CLIENT) as span: + span.set_attribute("batch_id", uuid1().hex) try: - span.add_event( - "getting jobs", - { - "jobs.current_concurrency": self.current_concurrency, - "jobs.in_progress": job_progress.get_job_count(), - "jobs.needed": jobs_needed, - }, - ) + # TODO: metrics + # { + # "jobs.current_concurrency": self.current_concurrency, + # "jobs.in_progress": job_progress.get_job_count(), + # "jobs.needed": jobs_needed, + # } # Keep the connection to the blocking call up to 30 seconds acquired_jobs = await asyncio.wait_for( @@ -153,39 +145,30 @@ async def get_jobs(self, session: ClientSession): ) if not acquired_jobs: - span.add_event("acquired no jobs", {"jobs.acquired": 0}) - log.debug("JobScaler.get_jobs | No jobs acquired.") + span.add_event("No jobs acquired") + log.debug("No jobs acquired") continue - span.add_event( - "acquired jobs", {"jobs.acquired": len(acquired_jobs)} - ) + span.set_attribute("jobs_acquired_count", len(acquired_jobs)) for job in acquired_jobs: - await job_list.add_job(job) + with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span: + job_span.set_attribute("request_id", job.get("id")) + await job_list.add_job(job) - span.add_event( - "queued jobs", {"jobs.in_queue", job_list.get_job_count()} - ) + # TODO: metrics {"jobs.queued", job_list.get_job_count()} log.info(f"Jobs in queue: {job_list.get_job_count()}") except TooManyRequests as error: - span.record_exception(error) - log.debug( - f"JobScaler.get_jobs | Too many requests. Debounce for 5 seconds." - ) + span.add_event("Too many requests. Debounce for 5 seconds.") await asyncio.sleep(5) # debounce for 5 seconds except asyncio.CancelledError as error: - span.record_exception(error) - log.debug("JobScaler.get_jobs | Request was cancelled.") + span.add_event("Request was cancelled") except TimeoutError as error: - span.record_exception(error) - log.debug( - "JobScaler.get_jobs | Job acquisition timed out. Retrying." - ) + span.add_event("Job acquisition timed out") except TypeError as error: + # worker waking up produces a JSON error here span.record_exception(error) - log.debug(f"JobScaler.get_jobs | Unexpected error: {error}.") except Exception as error: span.record_exception(error) log.error( @@ -214,6 +197,7 @@ async def run_jobs(self, session: ClientSession): # Wait for any job to finish if tasks: + # TODO: metrics {"jobs.in_progress", len(tasks)} log.info(f"Jobs in progress: {len(tasks)}") done, pending = await asyncio.wait( @@ -233,18 +217,16 @@ async def handle_job(self, session: ClientSession, job: dict): """ Process an individual job. This function is run concurrently for multiple jobs. """ - with tracer.start_as_current_span("JobScaler.handle_job") as span: - span.set_attributes(worker_attributes) - span.set_attribute("job.id", job.get("id")) - span.set_attribute("request_id", job.get("id")) # legacy + with tracer.start_as_current_span("handle_job", kind=SpanKind.CONSUMER) as span: + span.set_attribute("request_id", job.get("id")) - log.debug(f"JobScaler.handle_job | {job}") job_progress.add(job) try: await handle_job(session, self.config, job) if self.config.get("refresh_worker", False): + span.add_event("refresh_worker") self.kill_worker() except Exception as err: From 83dc31b274728703be292dddf5d567a9aabd37d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 3 Nov 2024 15:05:24 -0800 Subject: [PATCH 10/53] tmp: remove unused instrumentations --- requirements.txt | 3 --- runpod/otel.py | 15 --------------- 2 files changed, 18 deletions(-) diff --git a/requirements.txt b/requirements.txt index 261bb588..f7bfe3b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,4 @@ setuptools==65.6.3 opentelemetry-sdk opentelemetry-exporter-otlp opentelemetry-instrumentation-aiohttp-client -opentelemetry-instrumentation-asyncio opentelemetry-instrumentation-requests -opentelemetry-instrumentation-threading -opentelemetry-instrumentation-urllib3 diff --git a/runpod/otel.py b/runpod/otel.py index c7b25d7e..c2f21831 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -14,9 +14,6 @@ ) -from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor -from opentelemetry.instrumentation.threading import ThreadingInstrumentor -from opentelemetry.instrumentation.urllib3 import URLLib3Instrumentor from runpod.version import __version__ as runpod_version @@ -43,15 +40,3 @@ if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"): tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) - - -# --- threading --- # -ThreadingInstrumentor().instrument() - - -# --- urllib3 --- # -URLLib3Instrumentor().instrument() - - -# --- asyncio --- # -AsyncioInstrumentor().instrument() From 25762a6c092acedc0b53217a3ac47e9ad3f379a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 3 Nov 2024 15:06:20 -0800 Subject: [PATCH 11/53] tmp: handle_job is child to queue_job --- runpod/serverless/modules/rp_http.py | 2 +- runpod/serverless/modules/rp_scale.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py index be5e640a..8d7ed972 100644 --- a/runpod/serverless/modules/rp_http.py +++ b/runpod/serverless/modules/rp_http.py @@ -56,7 +56,7 @@ async def _handle_result( """ A helper function to handle the result, either for sending or streaming. """ - with tracer.start_as_current_span("handle_result", kind=SpanKind.INTERNAL) as span: + with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span: span.set_attribute("request_id", job.get("id")) try: diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index 34c10a1d..4df00456 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -154,6 +154,7 @@ async def get_jobs(self, session: ClientSession): for job in acquired_jobs: with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span: job_span.set_attribute("request_id", job.get("id")) + job["trace"] = job_span.get_span_context() await job_list.add_job(job) # TODO: metrics {"jobs.queued", job_list.get_job_count()} @@ -217,7 +218,7 @@ async def handle_job(self, session: ClientSession, job: dict): """ Process an individual job. This function is run concurrently for multiple jobs. """ - with tracer.start_as_current_span("handle_job", kind=SpanKind.CONSUMER) as span: + with tracer.start_as_current_span("handle_job", context=job.get("trace"), kind=SpanKind.CONSUMER) as span: span.set_attribute("request_id", job.get("id")) job_progress.add(job) From 97dbb78ad160db651e7ef3feccbac71f0aa5ccf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 3 Nov 2024 15:13:42 -0800 Subject: [PATCH 12/53] tmp: disable http_client tracing temporarily --- runpod/http_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runpod/http_client.py b/runpod/http_client.py index 97e98829..8bf6dca2 100644 --- a/runpod/http_client.py +++ b/runpod/http_client.py @@ -6,8 +6,8 @@ import requests from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError from opentelemetry import trace -from opentelemetry.instrumentation.aiohttp_client import create_trace_config -from opentelemetry.instrumentation.requests import RequestsInstrumentor +# from opentelemetry.instrumentation.aiohttp_client import create_trace_config +# from opentelemetry.instrumentation.requests import RequestsInstrumentor from .cli.groups.config.functions import get_credentials from .user_agent import USER_AGENT @@ -44,7 +44,7 @@ def AsyncClientSession(*args, **kwargs): connector=TCPConnector(limit=0), headers=get_auth_header(), timeout=ClientTimeout(600, ceil_threshold=400), - trace_configs=[create_trace_config()], + # trace_configs=[create_trace_config()], *args, **kwargs, ) @@ -54,4 +54,4 @@ class SyncClientSession(requests.Session): def __init__(self): super().__init__() self.headers.update(get_auth_header()) - RequestsInstrumentor().instrument(session=self) \ No newline at end of file + # RequestsInstrumentor().instrument(session=self) \ No newline at end of file From f0f6997af593f349917a493dc0c8611e4407fb1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 3 Nov 2024 15:24:46 -0800 Subject: [PATCH 13/53] tmp: job.get("trace").get_span_context() --- runpod/serverless/modules/rp_scale.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index 4df00456..744d1332 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -154,7 +154,7 @@ async def get_jobs(self, session: ClientSession): for job in acquired_jobs: with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span: job_span.set_attribute("request_id", job.get("id")) - job["trace"] = job_span.get_span_context() + job["trace"] = job_span await job_list.add_job(job) # TODO: metrics {"jobs.queued", job_list.get_job_count()} @@ -218,7 +218,8 @@ async def handle_job(self, session: ClientSession, job: dict): """ Process an individual job. This function is run concurrently for multiple jobs. """ - with tracer.start_as_current_span("handle_job", context=job.get("trace"), kind=SpanKind.CONSUMER) as span: + context = job.get("trace").get_span_context() + with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span: span.set_attribute("request_id", job.get("id")) job_progress.add(job) From fec5a53eba21c12a85feb976eda5fec9965db1a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sun, 3 Nov 2024 15:52:33 -0800 Subject: [PATCH 14/53] tmp: correction to tracer span context --- runpod/serverless/modules/rp_scale.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index 744d1332..e16c8036 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -8,7 +8,7 @@ import signal from typing import Any, Dict from uuid import uuid1 # traceable to machine's MAC address + timestamp -from opentelemetry.trace import get_tracer, SpanKind +from opentelemetry.trace import get_tracer, SpanKind, set_span_in_context, NonRecordingSpan from ...http_client import AsyncClientSession, ClientSession, TooManyRequests from .rp_job import get_job, handle_job @@ -154,7 +154,7 @@ async def get_jobs(self, session: ClientSession): for job in acquired_jobs: with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span: job_span.set_attribute("request_id", job.get("id")) - job["trace"] = job_span + job["context"] = job_span.get_span_context() await job_list.add_job(job) # TODO: metrics {"jobs.queued", job_list.get_job_count()} @@ -218,7 +218,8 @@ async def handle_job(self, session: ClientSession, job: dict): """ Process an individual job. This function is run concurrently for multiple jobs. """ - context = job.get("trace").get_span_context() + context = set_span_in_context(NonRecordingSpan(job["context"])) + with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span: span.set_attribute("request_id", job.get("id")) From b3bced1d6b91b7e7686c879f5599a99c018bf9cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 4 Nov 2024 16:42:58 -0800 Subject: [PATCH 15/53] tmp: custom runpod namespace for process tags --- runpod/otel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/runpod/otel.py b/runpod/otel.py index c2f21831..6672c3ad 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -11,9 +11,10 @@ SERVICE_INSTANCE_ID, SERVICE_VERSION, HOST_NAME, - ) +RUNPOD_ENDPOINT_ID = "runpod.endpoint_id" +RUNPOD_POD_ID = "runpod.pod_id" from runpod.version import __version__ as runpod_version @@ -25,7 +26,9 @@ "application": "runpod-serverless", SERVICE_NAME: "runpod-python-sdk", SERVICE_NAMESPACE: os.getenv("RUNPOD_ENDPOINT_ID"), + RUNPOD_ENDPOINT_ID: os.getenv("RUNPOD_ENDPOINT_ID"), SERVICE_INSTANCE_ID: os.getenv("RUNPOD_POD_ID"), + RUNPOD_POD_ID: os.getenv("RUNPOD_POD_ID"), SERVICE_VERSION: runpod_version, HOST_NAME: os.getenv("RUNPOD_POD_HOSTNAME"), } From d19ac103239afa1c7bd2b7c1f9a1d997ea760b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 4 Nov 2024 16:43:11 -0800 Subject: [PATCH 16/53] tmp: cleanup by black format --- runpod/serverless/modules/rp_scale.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index e16c8036..ef7b24ae 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -4,11 +4,15 @@ """ import asyncio -import os import signal from typing import Any, Dict from uuid import uuid1 # traceable to machine's MAC address + timestamp -from opentelemetry.trace import get_tracer, SpanKind, set_span_in_context, NonRecordingSpan +from opentelemetry.trace import ( + get_tracer, + set_span_in_context, + SpanKind, + NonRecordingSpan, +) from ...http_client import AsyncClientSession, ClientSession, TooManyRequests from .rp_job import get_job, handle_job @@ -152,7 +156,9 @@ async def get_jobs(self, session: ClientSession): span.set_attribute("jobs_acquired_count", len(acquired_jobs)) for job in acquired_jobs: - with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span: + with tracer.start_as_current_span( + "queue_job", kind=SpanKind.PRODUCER + ) as job_span: job_span.set_attribute("request_id", job.get("id")) job["context"] = job_span.get_span_context() await job_list.add_job(job) @@ -220,7 +226,9 @@ async def handle_job(self, session: ClientSession, job: dict): """ context = set_span_in_context(NonRecordingSpan(job["context"])) - with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span: + with tracer.start_as_current_span( + "handle_job", context=context, kind=SpanKind.CONSUMER + ) as span: span.set_attribute("request_id", job.get("id")) job_progress.add(job) From c21cfe51918e97f622817d3289c23e4404574d66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 4 Nov 2024 16:59:01 -0800 Subject: [PATCH 17/53] tmp: otel tracing sls-core hooks --- runpod/serverless/core.py | 156 ++++++++++++++++----------- runpod/serverless/modules/rp_http.py | 1 + 2 files changed, 96 insertions(+), 61 deletions(-) diff --git a/runpod/serverless/core.py b/runpod/serverless/core.py index 657dbe64..43e8c4fc 100644 --- a/runpod/serverless/core.py +++ b/runpod/serverless/core.py @@ -9,12 +9,20 @@ import typing from ctypes import CDLL, byref, c_char_p, c_int from typing import Any, Callable, Dict, List, Optional +from uuid import uuid1 # traceable to machine's MAC address + timestamp +from opentelemetry.trace import ( + get_tracer, + set_span_in_context, + SpanKind, + NonRecordingSpan, +) from runpod.serverless.modules import rp_job from runpod.serverless.modules.rp_logger import RunPodLogger from runpod.version import __version__ as runpod_version log = RunPodLogger() +tracer = get_tracer(__name__) # _runpod_sls_get_jobs status codes STILL_WAITING = 0 @@ -188,32 +196,40 @@ async def stream_output(self, job_id: str, job_output: bytes) -> bool: """ send part of a streaming result to AI-API. """ - json_data = self._json_serialize_job_data(job_output) - id_bytes = job_id.encode("utf-8") - return bool( - self._stream_output( - c_char_p(id_bytes), - c_int(len(id_bytes)), - c_char_p(json_data), - c_int(len(json_data)), + with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span: + span.set_attribute("request_id", job_id) + span.set_attribute("is_stream", True) + + json_data = self._json_serialize_job_data(job_output) + id_bytes = job_id.encode("utf-8") + return bool( + self._stream_output( + c_char_p(id_bytes), + c_int(len(id_bytes)), + c_char_p(json_data), + c_int(len(json_data)), + ) ) - ) def post_output(self, job_id: str, job_output: bytes) -> bool: """ send the result of a job to AI-API. Returns True if the task was successfully stored, False otherwise. """ - json_data = self._json_serialize_job_data(job_output) - id_bytes = job_id.encode("utf-8") - return bool( - self._post_output( - c_char_p(id_bytes), - c_int(len(id_bytes)), - c_char_p(json_data), - c_int(len(json_data)), + with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span: + span.set_attribute("request_id", job_id) + span.set_attribute("is_stream", False) + + json_data = self._json_serialize_job_data(job_output) + id_bytes = job_id.encode("utf-8") + return bool( + self._post_output( + c_char_p(id_bytes), + c_int(len(id_bytes)), + c_char_p(json_data), + c_int(len(json_data)), + ) ) - ) def finish_stream(self, job_id: str) -> bool: """ @@ -225,46 +241,53 @@ def finish_stream(self, job_id: str) -> bool: # -------------------------------- Process Job ------------------------------- # async def _process_job( - config: Dict[str, Any], job: Dict[str, Any], hook + config: Dict[str, Any], job: Dict[str, Any], hook: Hook ) -> Dict[str, Any]: """Process a single job.""" handler = config["handler"] result = {} - try: - if inspect.isgeneratorfunction(handler) or inspect.isasyncgenfunction(handler): - log.debug("SLS Core | Running job as a generator.") - generator_output = rp_job.run_job_generator(handler, job) - aggregated_output: dict[str, typing.Any] = {"output": []} - async for part in generator_output: - log.trace(f"SLS Core | Streaming output: {part}", job["id"]) + context = set_span_in_context(NonRecordingSpan(job["context"])) - if "error" in part: - aggregated_output = part - break - if config.get("return_aggregate_stream", False): - aggregated_output["output"].append(part["output"]) + with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span: + span.set_attribute("request_id", job.get("id")) - await hook.stream_output(job["id"], part) + try: + if inspect.isgeneratorfunction(handler) or inspect.isasyncgenfunction(handler): + log.debug("SLS Core | Running job as a generator.") + generator_output = rp_job.run_job_generator(handler, job) + aggregated_output: dict[str, typing.Any] = {"output": []} - log.debug("SLS Core | Finished streaming output.", job["id"]) - hook.finish_stream(job["id"]) - result = aggregated_output + async for part in generator_output: + log.trace(f"SLS Core | Streaming output: {part}", job["id"]) - else: - log.debug("SLS Core | Running job as a standard function.") - result = await rp_job.run_job(handler, job) - result = result.get("output", result) + if "error" in part: + aggregated_output = part + break + if config.get("return_aggregate_stream", False): + aggregated_output["output"].append(part["output"]) - except Exception as err: # pylint: disable=broad-except - log.error(f"SLS Core | Error running job: {err}", job["id"]) - result = {"error": str(err)} + await hook.stream_output(job["id"], part) - finally: - log.debug(f"SLS Core | Posting output: {result}", job["id"]) - hook.post_output(job["id"], result) - return result + log.debug("SLS Core | Finished streaming output.", job["id"]) + hook.finish_stream(job["id"]) + result = aggregated_output + + else: + log.debug("SLS Core | Running job as a standard function.") + result = await rp_job.run_job(handler, job) + result = result.get("output", result) + + except Exception as err: # pylint: disable=broad-except + span.record_exception(err) + log.error(f"SLS Core | Error running job: {err}", job["id"]) + result = {"error": str(err)} + + finally: + log.debug(f"SLS Core | Posting output: {result}", job["id"]) + hook.post_output(job["id"], result) + return result # ---------------------------------------------------------------------------- # @@ -282,25 +305,36 @@ async def run(config: Dict[str, Any]) -> None: serverless_hook = Hook() while True: - try: - jobs = serverless_hook.get_jobs(max_concurrency, max_jobs) - except SlsCoreError as err: - log.error(f"SLS Core | Error getting jobs: {err}") - await asyncio.sleep(0.2) # sleep for a bit before trying again - continue + with tracer.start_as_current_span("get_jobs", kind=SpanKind.CLIENT) as span: + span.set_attribute("runpod.sls_core_enabled", True) + span.set_attribute("batch_id", uuid1().hex) - if len(jobs) == 0 or jobs is None: - await asyncio.sleep(0) - continue + try: + jobs = serverless_hook.get_jobs(max_concurrency, max_jobs) + except SlsCoreError as err: + span.record_exception(err) + log.error(f"SLS Core | Error getting jobs: {err}") + await asyncio.sleep(0.2) # sleep for a bit before trying again + continue + + if len(jobs) == 0 or jobs is None: + span.add_event("No jobs acquired") + await asyncio.sleep(0) + continue + + span.set_attribute("jobs_acquired_count", len(jobs)) + + for job in jobs: + with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span: + job_span.set_attribute("request_id", job.get("id")) + job["context"] = job_span.get_span_context() + asyncio.create_task( + _process_job(config, job, serverless_hook), name=job["id"] + ) + await asyncio.sleep(0) - for job in jobs: - asyncio.create_task( - _process_job(config, job, serverless_hook), name=job["id"] - ) await asyncio.sleep(0) - await asyncio.sleep(0) - def main(config: Dict[str, Any]) -> None: """Run the worker in an asyncio event loop.""" diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py index 8d7ed972..2b0b3343 100644 --- a/runpod/serverless/modules/rp_http.py +++ b/runpod/serverless/modules/rp_http.py @@ -58,6 +58,7 @@ async def _handle_result( """ with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span: span.set_attribute("request_id", job.get("id")) + span.set_attribute("is_stream", is_stream) try: serialized_job_data = json.dumps(job_data, ensure_ascii=False) From 9bac6229db5f61e21c2acca2da3700b2a09baab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 4 Nov 2024 21:24:17 -0800 Subject: [PATCH 18/53] tmp: using decorators where appropriate --- runpod/serverless/core.py | 71 +++++++-------- runpod/serverless/modules/rp_http.py | 61 ++++++------- runpod/serverless/modules/rp_job.py | 130 ++++++++++++++------------- 3 files changed, 131 insertions(+), 131 deletions(-) diff --git a/runpod/serverless/core.py b/runpod/serverless/core.py index 43e8c4fc..9a1e9f62 100644 --- a/runpod/serverless/core.py +++ b/runpod/serverless/core.py @@ -8,21 +8,16 @@ import pathlib import typing from ctypes import CDLL, byref, c_char_p, c_int +from opentelemetry import trace from typing import Any, Callable, Dict, List, Optional from uuid import uuid1 # traceable to machine's MAC address + timestamp -from opentelemetry.trace import ( - get_tracer, - set_span_in_context, - SpanKind, - NonRecordingSpan, -) from runpod.serverless.modules import rp_job from runpod.serverless.modules.rp_logger import RunPodLogger from runpod.version import __version__ as runpod_version log = RunPodLogger() -tracer = get_tracer(__name__) +tracer = trace.get_tracer(__name__) # _runpod_sls_get_jobs status codes STILL_WAITING = 0 @@ -192,44 +187,46 @@ def progress_update(self, job_id: str, json_data: bytes) -> bool: ) ) + @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER) async def stream_output(self, job_id: str, job_output: bytes) -> bool: """ send part of a streaming result to AI-API. """ - with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span: - span.set_attribute("request_id", job_id) - span.set_attribute("is_stream", True) - - json_data = self._json_serialize_job_data(job_output) - id_bytes = job_id.encode("utf-8") - return bool( - self._stream_output( - c_char_p(id_bytes), - c_int(len(id_bytes)), - c_char_p(json_data), - c_int(len(json_data)), - ) + span = trace.get_current_span() + span.set_attribute("request_id", job_id) + span.set_attribute("is_stream", True) + + json_data = self._json_serialize_job_data(job_output) + id_bytes = job_id.encode("utf-8") + return bool( + self._stream_output( + c_char_p(id_bytes), + c_int(len(id_bytes)), + c_char_p(json_data), + c_int(len(json_data)), ) + ) + @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER) def post_output(self, job_id: str, job_output: bytes) -> bool: """ send the result of a job to AI-API. Returns True if the task was successfully stored, False otherwise. """ - with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span: - span.set_attribute("request_id", job_id) - span.set_attribute("is_stream", False) - - json_data = self._json_serialize_job_data(job_output) - id_bytes = job_id.encode("utf-8") - return bool( - self._post_output( - c_char_p(id_bytes), - c_int(len(id_bytes)), - c_char_p(json_data), - c_int(len(json_data)), - ) + span = trace.get_current_span() + span.set_attribute("request_id", job_id) + span.set_attribute("is_stream", False) + + json_data = self._json_serialize_job_data(job_output) + id_bytes = job_id.encode("utf-8") + return bool( + self._post_output( + c_char_p(id_bytes), + c_int(len(id_bytes)), + c_char_p(json_data), + c_int(len(json_data)), ) + ) def finish_stream(self, job_id: str) -> bool: """ @@ -248,9 +245,9 @@ async def _process_job( result = {} - context = set_span_in_context(NonRecordingSpan(job["context"])) + context = trace.set_span_in_context(trace.NonRecordingSpan(job["context"])) - with tracer.start_as_current_span("handle_job", context=context, kind=SpanKind.CONSUMER) as span: + with tracer.start_as_current_span("handle_job", context=context, kind=trace.SpanKind.CONSUMER) as span: span.set_attribute("request_id", job.get("id")) try: @@ -305,7 +302,7 @@ async def run(config: Dict[str, Any]) -> None: serverless_hook = Hook() while True: - with tracer.start_as_current_span("get_jobs", kind=SpanKind.CLIENT) as span: + with tracer.start_as_current_span("get_jobs", kind=trace.SpanKind.CLIENT) as span: span.set_attribute("runpod.sls_core_enabled", True) span.set_attribute("batch_id", uuid1().hex) @@ -325,7 +322,7 @@ async def run(config: Dict[str, Any]) -> None: span.set_attribute("jobs_acquired_count", len(jobs)) for job in jobs: - with tracer.start_as_current_span("queue_job", kind=SpanKind.PRODUCER) as job_span: + with tracer.start_as_current_span("queue_job", kind=trace.SpanKind.PRODUCER) as job_span: job_span.set_attribute("request_id", job.get("id")) job["context"] = job_span.get_span_context() asyncio.create_task( diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py index 2b0b3343..2423cfa4 100644 --- a/runpod/serverless/modules/rp_http.py +++ b/runpod/serverless/modules/rp_http.py @@ -5,9 +5,9 @@ import json import os -from opentelemetry.trace import get_tracer, SpanKind from aiohttp import ClientError from aiohttp_retry import FibonacciRetry, RetryClient +from opentelemetry import trace from runpod.http_client import ClientSession from runpod.serverless.modules.rp_logger import RunPodLogger @@ -25,7 +25,7 @@ JOB_STREAM_URL = JOB_STREAM_URL_TEMPLATE.replace("$RUNPOD_POD_ID", WORKER_ID) log = RunPodLogger() -tracer = get_tracer(__name__) +tracer = trace.get_tracer(__name__) async def _transmit(client_session: ClientSession, url, job_data): @@ -50,40 +50,41 @@ async def _transmit(client_session: ClientSession, url, job_data): await client_response.text() +@tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER) async def _handle_result( session: ClientSession, job_data, job, url_template, log_message, is_stream=False ): """ A helper function to handle the result, either for sending or streaming. """ - with tracer.start_as_current_span("handle_result", kind=SpanKind.SERVER) as span: - span.set_attribute("request_id", job.get("id")) - span.set_attribute("is_stream", is_stream) - - try: - serialized_job_data = json.dumps(job_data, ensure_ascii=False) - - is_stream = "true" if is_stream else "false" - url = url_template.replace("$ID", job["id"]) + f"&isStream={is_stream}" - - await _transmit(session, url, serialized_job_data) - log.debug(f"{log_message}", job["id"]) - - except ClientError as err: - span.record_exception(err) - log.error(f"Failed to return job results. | {err}", job["id"]) - - except (TypeError, RuntimeError) as err: - span.record_exception(err) - log.error(f"Error while returning job result. | {err}", job["id"]) - - finally: - # job_data status is used for local development with FastAPI - if ( - url_template == JOB_DONE_URL - and job_data.get("status", None) != "IN_PROGRESS" - ): - log.info("Finished.", job["id"]) + span = trace.get_current_span() + span.set_attribute("request_id", job.get("id")) + span.set_attribute("is_stream", is_stream) + + try: + serialized_job_data = json.dumps(job_data, ensure_ascii=False) + + is_stream = "true" if is_stream else "false" + url = url_template.replace("$ID", job["id"]) + f"&isStream={is_stream}" + + await _transmit(session, url, serialized_job_data) + log.debug(f"{log_message}", job["id"]) + + except ClientError as err: + span.record_exception(err) + log.error(f"Failed to return job results. | {err}", job["id"]) + + except (TypeError, RuntimeError) as err: + span.record_exception(err) + log.error(f"Error while returning job result. | {err}", job["id"]) + + finally: + # job_data status is used for local development with FastAPI + if ( + url_template == JOB_DONE_URL + and job_data.get("status", None) != "IN_PROGRESS" + ): + log.info("Finished.", job["id"]) async def send_result(session, job_data, job, is_stream=False): diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index e6b729b8..1da93fa1 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -6,8 +6,8 @@ import json import os import traceback +from opentelemetry import trace from typing import Any, AsyncGenerator, Callable, Dict, Optional, Union, List -from opentelemetry.trace import get_tracer, SpanKind from runpod.http_client import ClientSession, TooManyRequests from runpod.serverless.modules.rp_logger import RunPodLogger @@ -23,7 +23,7 @@ log = RunPodLogger() job_progress = JobsProgress() -tracer = get_tracer(__name__) +tracer = trace.get_tracer(__name__) def _job_get_url(batch_size: int = 1): @@ -147,6 +147,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) await send_result(session, job_result, job, is_stream=is_stream) +@tracer.start_as_current_span("run_job") async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]: """ Run the job using the handler. @@ -158,64 +159,65 @@ async def run_job(handler: Callable, job: Dict[str, Any]) -> Dict[str, Any]: Returns: Dict[str, Any]: The result of running the job. """ + span = trace.get_current_span() + span.set_attribute("request_id", job.get("id")) + log.info("Started.", job["id"]) run_result = {} - with tracer.start_as_current_span("run_job", kind=SpanKind.INTERNAL) as span: - span.set_attribute("request_id", job.get("id")) - - try: - handler_return = handler(job) - job_output = ( - await handler_return - if inspect.isawaitable(handler_return) - else handler_return - ) + try: + handler_return = handler(job) + job_output = ( + await handler_return + if inspect.isawaitable(handler_return) + else handler_return + ) - log.debug(f"Handler output: {job_output}", job["id"]) + log.debug(f"Handler output: {job_output}", job["id"]) - if isinstance(job_output, dict): - error_msg = job_output.pop("error", None) - refresh_worker = job_output.pop("refresh_worker", None) - run_result["output"] = job_output + if isinstance(job_output, dict): + error_msg = job_output.pop("error", None) + refresh_worker = job_output.pop("refresh_worker", None) + run_result["output"] = job_output - if error_msg: - run_result["error"] = error_msg - if refresh_worker: - run_result["stopPod"] = True + if error_msg: + run_result["error"] = error_msg + if refresh_worker: + run_result["stopPod"] = True - elif isinstance(job_output, bool): - run_result = {"output": job_output} + elif isinstance(job_output, bool): + run_result = {"output": job_output} - else: - run_result = {"output": job_output} + else: + run_result = {"output": job_output} - if run_result.get("output") == {}: - run_result.pop("output") + if run_result.get("output") == {}: + run_result.pop("output") - check_return_size(run_result) # Checks the size of the return body. + check_return_size(run_result) # Checks the size of the return body. - except Exception as err: - span.record_exception(err) - error_info = { - "error_type": str(type(err)), - "error_message": str(err), - "error_traceback": traceback.format_exc(), - "hostname": os.environ.get("RUNPOD_POD_HOSTNAME", "unknown"), - "worker_id": os.environ.get("RUNPOD_POD_ID", "unknown"), - "runpod_version": runpod_version, - } + except Exception as err: + span.record_exception(err) + error_info = { + "error_type": str(type(err)), + "error_message": str(err), + "error_traceback": traceback.format_exc(), + "hostname": os.environ.get("RUNPOD_POD_HOSTNAME", "unknown"), + "worker_id": os.environ.get("RUNPOD_POD_ID", "unknown"), + "runpod_version": runpod_version, + } - log.error("Captured Handler Exception", job["id"]) - log.error(json.dumps(error_info, indent=4)) - run_result = {"error": json.dumps(error_info)} + log.error("Captured Handler Exception", job["id"]) + log.error(json.dumps(error_info, indent=4)) + run_result = {"error": json.dumps(error_info)} - finally: - log.debug(f"run_job return: {run_result}", job["id"]) + finally: + log.debug(f"run_job return: {run_result}", job["id"]) return run_result +@tracer.start_as_current_span("run_job_generator") async def run_job_generator( handler: Callable, job: Dict[str, Any] ) -> AsyncGenerator[Dict[str, Union[str, Any]], None]: @@ -223,30 +225,30 @@ async def run_job_generator( Run generator job used to stream output. Yields output partials from the generator. """ + span = trace.get_current_span() + span.set_attribute("request_id", job.get("id")) + is_async_gen = inspect.isasyncgenfunction(handler) log.debug( "Using Async Generator" if is_async_gen else "Using Standard Generator", job["id"], ) - with tracer.start_as_current_span("run_job_generator", kind=SpanKind.INTERNAL) as span: - span.set_attribute("request_id", job.get("id")) - - try: - job_output = handler(job) - - if is_async_gen: - async for output_partial in job_output: - log.debug(f"Async Generator output: {output_partial}", job["id"]) - yield {"output": output_partial} - else: - for output_partial in job_output: - log.debug(f"Generator output: {output_partial}", job["id"]) - yield {"output": output_partial} - - except Exception as err: - span.record_exception(err) - log.error(err, job["id"]) - yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"} - finally: - log.info("Finished running generator.", job["id"]) + try: + job_output = handler(job) + + if is_async_gen: + async for output_partial in job_output: + log.debug(f"Async Generator output: {output_partial}", job["id"]) + yield {"output": output_partial} + else: + for output_partial in job_output: + log.debug(f"Generator output: {output_partial}", job["id"]) + yield {"output": output_partial} + + except Exception as err: + span.record_exception(err) + log.error(err, job["id"]) + yield {"error": f"handler: {str(err)} \ntraceback: {traceback.format_exc()}"} + finally: + log.info("Finished running generator.", job["id"]) From 4939bb1f68e0d261c5a57e69d4bd3367875a85d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 4 Nov 2024 22:15:47 -0800 Subject: [PATCH 19/53] tmp: cleanup otel resource definition --- runpod/otel.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/runpod/otel.py b/runpod/otel.py index 6672c3ad..ed94ea9e 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -7,10 +7,7 @@ from opentelemetry.sdk.resources import ( Resource, SERVICE_NAME, - SERVICE_NAMESPACE, - SERVICE_INSTANCE_ID, SERVICE_VERSION, - HOST_NAME, ) RUNPOD_ENDPOINT_ID = "runpod.endpoint_id" @@ -23,14 +20,10 @@ TracerProvider( resource=Resource.create( { - "application": "runpod-serverless", - SERVICE_NAME: "runpod-python-sdk", - SERVICE_NAMESPACE: os.getenv("RUNPOD_ENDPOINT_ID"), RUNPOD_ENDPOINT_ID: os.getenv("RUNPOD_ENDPOINT_ID"), - SERVICE_INSTANCE_ID: os.getenv("RUNPOD_POD_ID"), RUNPOD_POD_ID: os.getenv("RUNPOD_POD_ID"), + SERVICE_NAME: "runpod-python-sdk", SERVICE_VERSION: runpod_version, - HOST_NAME: os.getenv("RUNPOD_POD_HOSTNAME"), } ) ) From d14decd2238b06a46273d123c88a24f332d7db45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 5 Nov 2024 02:00:27 -0800 Subject: [PATCH 20/53] tmp: trace pings --- runpod/serverless/modules/rp_ping.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/runpod/serverless/modules/rp_ping.py b/runpod/serverless/modules/rp_ping.py index 88fa1049..6575b94a 100644 --- a/runpod/serverless/modules/rp_ping.py +++ b/runpod/serverless/modules/rp_ping.py @@ -8,6 +8,7 @@ import time import requests +from opentelemetry import trace from urllib3.util.retry import Retry from runpod.http_client import SyncClientSession @@ -17,6 +18,7 @@ log = RunPodLogger() jobs = JobsProgress() # Contains the list of jobs that are currently running. +tracer = trace.get_tracer(__name__) class Heartbeat: @@ -83,6 +85,7 @@ def ping_loop(self, test=False): if test: return + @tracer.start_as_current_span("send_ping") def _send_ping(self): """ Sends a heartbeat to the Runpod server. @@ -90,6 +93,9 @@ def _send_ping(self): job_ids = jobs.get_job_list() ping_params = {"job_id": job_ids, "runpod_version": runpod_version} + span = trace.get_current_span() + span.set_attribute("job_id", job_ids) + try: result = self._session.get( self.PING_URL, params=ping_params, timeout=self.PING_INTERVAL * 2 @@ -100,4 +106,5 @@ def _send_ping(self): ) except requests.RequestException as err: + span.record_exception(err) log.error(f"Ping Request Error: {err}, attempting to restart ping.") From b82c6a8c87eaa75435bfcd0359b04e6b0b8a66fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 5 Nov 2024 15:56:53 -0800 Subject: [PATCH 21/53] tmp: refactored worker_state to trace request_id list in ping --- runpod/serverless/modules/rp_ping.py | 8 +++---- runpod/serverless/modules/worker_state.py | 23 ++++++------------- .../test_modules/test_state.py | 21 ++--------------- 3 files changed, 13 insertions(+), 39 deletions(-) diff --git a/runpod/serverless/modules/rp_ping.py b/runpod/serverless/modules/rp_ping.py index 6575b94a..18336c44 100644 --- a/runpod/serverless/modules/rp_ping.py +++ b/runpod/serverless/modules/rp_ping.py @@ -17,7 +17,7 @@ from runpod.version import __version__ as runpod_version log = RunPodLogger() -jobs = JobsProgress() # Contains the list of jobs that are currently running. +job_progress = JobsProgress() # Contains the list of jobs that are currently running. tracer = trace.get_tracer(__name__) @@ -90,11 +90,11 @@ def _send_ping(self): """ Sends a heartbeat to the Runpod server. """ - job_ids = jobs.get_job_list() - ping_params = {"job_id": job_ids, "runpod_version": runpod_version} + job_ids = job_progress.get_job_list() + ping_params = {"job_id": ",".join(job_ids), "runpod_version": runpod_version} span = trace.get_current_span() - span.set_attribute("job_id", job_ids) + span.set_attribute("request_id", job_ids) try: result = self._session.get( diff --git a/runpod/serverless/modules/worker_state.py b/runpod/serverless/modules/worker_state.py index 81e62799..349e7496 100644 --- a/runpod/serverless/modules/worker_state.py +++ b/runpod/serverless/modules/worker_state.py @@ -97,11 +97,11 @@ def add(self, element: Any): def remove(self, element: Any): """ - Adds a Job object to the set. + Removes a Job object from the set. - If the added element is a string, then `Job(id=element)` is added + If the element is a string, then `Job(id=element)` is recognized - If the added element is a dict, that `Job(**element)` is added + If the element is a dict, that `Job(**element)` is recognized """ if isinstance(element, str): element = Job(id=element) @@ -126,14 +126,14 @@ def get(self, element: Any) -> Job: if job == element: return job - def get_job_list(self) -> str: + def get_job_list(self) -> set[str]: """ - Returns the list of job IDs as comma-separated string. + Returns the list of job IDs """ if not self.get_job_count(): - return None + return set() - return ",".join(str(job) for job in self) + return set(str(job) for job in self) def get_job_count(self) -> int: """ @@ -175,15 +175,6 @@ async def get_job(self) -> dict: """ return await self.get() - def get_job_list(self) -> Optional[str]: - """ - Returns the comma-separated list of jobs as a string. (read-only) - """ - if self.empty(): - return None - - return ",".join(job.get("id") for job in self) - def get_job_count(self) -> int: """ Returns the number of jobs. diff --git a/tests/test_serverless/test_modules/test_state.py b/tests/test_serverless/test_modules/test_state.py index 6b26a64c..7057822f 100644 --- a/tests/test_serverless/test_modules/test_state.py +++ b/tests/test_serverless/test_modules/test_state.py @@ -154,23 +154,6 @@ async def test_get_job(self): assert next_job not in self.jobs assert next_job == job2 - async def test_get_job_list(self): - """ - Tests if get_job_list() returns comma-separated IDs - """ - self.assertTrue(self.jobs.get_job_list() is None) - - job1 = {"id": "123"} - await self.jobs.add_job(job1) - - job2 = {"id": "456"} - await self.jobs.add_job(job2) - - assert self.jobs.get_job_count() == 2 - assert job1 in self.jobs - assert job2 in self.jobs - assert self.jobs.get_job_list() in ["123,456", "456,123"] - class TestJobsProgress(unittest.TestCase): """Tests for JobsProgress class""" @@ -223,7 +206,7 @@ def test_get_job(self): assert job1 in self.jobs def test_get_job_list(self): - self.assertTrue(self.jobs.get_job_list() is None) + assert not self.jobs.get_job_list() job1 = {"id": "123"} self.jobs.add(job1) @@ -232,4 +215,4 @@ def test_get_job_list(self): self.jobs.add(job2) assert self.jobs.get_job_count() == 2 - assert self.jobs.get_job_list() in ["123,456", "456,123"] + assert not self.jobs.get_job_list().difference(("123","456",)) From 6d4d0e2a7d4c451e7f87f4621152a0b382eb07a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 5 Nov 2024 22:57:12 -0800 Subject: [PATCH 22/53] tmp: add_event for each request_id in the send_ping --- runpod/serverless/modules/rp_ping.py | 12 +++++++----- runpod/serverless/modules/worker_state.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/runpod/serverless/modules/rp_ping.py b/runpod/serverless/modules/rp_ping.py index 18336c44..3c268468 100644 --- a/runpod/serverless/modules/rp_ping.py +++ b/runpod/serverless/modules/rp_ping.py @@ -85,16 +85,18 @@ def ping_loop(self, test=False): if test: return - @tracer.start_as_current_span("send_ping") + @tracer.start_as_current_span("send_ping", kind=trace.SpanKind.CLIENT) def _send_ping(self): """ Sends a heartbeat to the Runpod server. """ - job_ids = job_progress.get_job_list() - ping_params = {"job_id": ",".join(job_ids), "runpod_version": runpod_version} - span = trace.get_current_span() - span.set_attribute("request_id", job_ids) + job_ids = [] + for job in job_progress: + span.add_event("ping", {"request_id": job.id}) + job_ids.append(job.id) + + ping_params = {"job_id": ",".join(job_ids), "runpod_version": runpod_version} try: result = self._session.get( diff --git a/runpod/serverless/modules/worker_state.py b/runpod/serverless/modules/worker_state.py index 349e7496..56d84a8a 100644 --- a/runpod/serverless/modules/worker_state.py +++ b/runpod/serverless/modules/worker_state.py @@ -5,7 +5,7 @@ import os import time import uuid -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Set from asyncio import Queue from .rp_logger import RunPodLogger @@ -62,7 +62,7 @@ def __str__(self) -> str: # ---------------------------------------------------------------------------- # # Tracker # # ---------------------------------------------------------------------------- # -class JobsProgress(set): +class JobsProgress(Set[Job]): """Track the state of current jobs in progress.""" _instance = None From 764dd6afc5ebe9425970eb4bfcd81c235b25d71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 6 Nov 2024 09:44:20 -0800 Subject: [PATCH 23/53] tmp: removed get_job_list --- runpod/serverless/modules/worker_state.py | 12 ------------ tests/test_serverless/test_modules/test_state.py | 12 ------------ 2 files changed, 24 deletions(-) diff --git a/runpod/serverless/modules/worker_state.py b/runpod/serverless/modules/worker_state.py index 56d84a8a..cf82b89c 100644 --- a/runpod/serverless/modules/worker_state.py +++ b/runpod/serverless/modules/worker_state.py @@ -72,9 +72,6 @@ def __new__(cls): JobsProgress._instance = set.__new__(cls) return JobsProgress._instance - def __repr__(self) -> str: - return f"<{self.__class__.__name__}>: {self.get_job_list()}" - def add(self, element: Any): """ Adds a Job object to the set. @@ -126,15 +123,6 @@ def get(self, element: Any) -> Job: if job == element: return job - def get_job_list(self) -> set[str]: - """ - Returns the list of job IDs - """ - if not self.get_job_count(): - return set() - - return set(str(job) for job in self) - def get_job_count(self) -> int: """ Returns the number of jobs. diff --git a/tests/test_serverless/test_modules/test_state.py b/tests/test_serverless/test_modules/test_state.py index 7057822f..af605dd9 100644 --- a/tests/test_serverless/test_modules/test_state.py +++ b/tests/test_serverless/test_modules/test_state.py @@ -204,15 +204,3 @@ def test_get_job(self): job1 = self.jobs.get(id) assert job1 in self.jobs - - def test_get_job_list(self): - assert not self.jobs.get_job_list() - - job1 = {"id": "123"} - self.jobs.add(job1) - - job2 = {"id": "456"} - self.jobs.add(job2) - - assert self.jobs.get_job_count() == 2 - assert not self.jobs.get_job_list().difference(("123","456",)) From b9687c9f4d87a5681b64dd045f0d7ca5a9e4b78e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 6 Nov 2024 11:43:49 -0800 Subject: [PATCH 24/53] tmp: context propagation --- runpod/http_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runpod/http_client.py b/runpod/http_client.py index 8bf6dca2..146d317b 100644 --- a/runpod/http_client.py +++ b/runpod/http_client.py @@ -6,8 +6,8 @@ import requests from aiohttp import ClientSession, ClientTimeout, TCPConnector, ClientResponseError from opentelemetry import trace -# from opentelemetry.instrumentation.aiohttp_client import create_trace_config -# from opentelemetry.instrumentation.requests import RequestsInstrumentor +from opentelemetry.instrumentation.aiohttp_client import create_trace_config +from opentelemetry.instrumentation.requests import RequestsInstrumentor from .cli.groups.config.functions import get_credentials from .user_agent import USER_AGENT @@ -44,7 +44,7 @@ def AsyncClientSession(*args, **kwargs): connector=TCPConnector(limit=0), headers=get_auth_header(), timeout=ClientTimeout(600, ceil_threshold=400), - # trace_configs=[create_trace_config()], + trace_configs=[create_trace_config()], *args, **kwargs, ) @@ -54,4 +54,4 @@ class SyncClientSession(requests.Session): def __init__(self): super().__init__() self.headers.update(get_auth_header()) - # RequestsInstrumentor().instrument(session=self) \ No newline at end of file + RequestsInstrumentor().instrument(session=self) From 0e666b56eb1a02c77e7a8d476f27109c3a5b69d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 20 Nov 2024 20:10:28 -0800 Subject: [PATCH 25/53] tmp: missed this merge --- runpod/serverless/modules/rp_scale.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index 6e902def..89212871 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -241,7 +241,7 @@ async def handle_job(self, session: ClientSession, job: dict): span.set_attribute("request_id", job.get("id")) try: - job_progress.add(job) + await job_progress.add(job) await handle_job(session, self.config, job) From e53d6358a0de479520379df3466e96fad5b293df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 20 Nov 2024 21:32:54 -0800 Subject: [PATCH 26/53] tmp: missed this merge --- runpod/serverless/modules/rp_scale.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index 89212871..1384d6ef 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -259,4 +259,4 @@ async def handle_job(self, session: ClientSession, job: dict): job_list.task_done() # Job is no longer in progress - job_progress.remove(job["id"]) + await job_progress.remove(job["id"]) From c774bfde8344c43aa0b9dd26eb79bf629eff0660 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 21 Nov 2024 00:54:02 -0800 Subject: [PATCH 27/53] tmp: noop for disabled otel --- runpod/otel.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/runpod/otel.py b/runpod/otel.py index ed94ea9e..276d7e8f 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -6,6 +6,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter from opentelemetry.sdk.resources import ( Resource, + DEPLOYMENT_ENVIRONMENT, SERVICE_NAME, SERVICE_VERSION, ) @@ -20,6 +21,7 @@ TracerProvider( resource=Resource.create( { + DEPLOYMENT_ENVIRONMENT: os.getenv("ENV"), RUNPOD_ENDPOINT_ID: os.getenv("RUNPOD_ENDPOINT_ID"), RUNPOD_POD_ID: os.getenv("RUNPOD_POD_ID"), SERVICE_NAME: "runpod-python-sdk", @@ -31,8 +33,12 @@ tracer = trace.get_tracer_provider() -if os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace": - tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) - if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"): tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) + +elif os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace": + tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) + +else: + # Use NoOpTracerProvider to disable OTEL + trace.set_tracer_provider(trace.NoOpTracerProvider()) From 7c9ed5784a1e7b417be57fa653c8c8bf1991289b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Sat, 7 Dec 2024 22:37:00 -0800 Subject: [PATCH 28/53] tmp: proper spankind --- runpod/serverless/core.py | 4 ++-- runpod/serverless/modules/rp_http.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/runpod/serverless/core.py b/runpod/serverless/core.py index 9a1e9f62..c70abeae 100644 --- a/runpod/serverless/core.py +++ b/runpod/serverless/core.py @@ -187,7 +187,7 @@ def progress_update(self, job_id: str, json_data: bytes) -> bool: ) ) - @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER) + @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.CLIENT) async def stream_output(self, job_id: str, job_output: bytes) -> bool: """ send part of a streaming result to AI-API. @@ -207,7 +207,7 @@ async def stream_output(self, job_id: str, job_output: bytes) -> bool: ) ) - @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER) + @tracer.start_as_current_span("handle_result", kind=trace.SpanKind.CLIENT) def post_output(self, job_id: str, job_output: bytes) -> bool: """ send the result of a job to AI-API. diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py index 2423cfa4..02a8c059 100644 --- a/runpod/serverless/modules/rp_http.py +++ b/runpod/serverless/modules/rp_http.py @@ -50,7 +50,7 @@ async def _transmit(client_session: ClientSession, url, job_data): await client_response.text() -@tracer.start_as_current_span("handle_result", kind=trace.SpanKind.SERVER) +@tracer.start_as_current_span("handle_result", kind=trace.SpanKind.CLIENT) async def _handle_result( session: ClientSession, job_data, job, url_template, log_message, is_stream=False ): From b030cecdb7c410a48654ec4d1f5ef418f49b284e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Dec 2024 19:54:43 -0800 Subject: [PATCH 29/53] tmp: force sampling from this parent span This tells ai-api to trace despite the 1% ratio --- runpod/otel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runpod/otel.py b/runpod/otel.py index 276d7e8f..3fa06ec1 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -2,7 +2,7 @@ from opentelemetry import trace from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace import TracerProvider, sampling from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter from opentelemetry.sdk.resources import ( Resource, @@ -19,6 +19,7 @@ trace.set_tracer_provider( TracerProvider( + sampler=sampling.ALWAYS_ON, resource=Resource.create( { DEPLOYMENT_ENVIRONMENT: os.getenv("ENV"), From a800261d8f52ef0d119c390cf543fa2c8afa77d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Dec 2024 20:11:57 -0800 Subject: [PATCH 30/53] tmp: revert --- runpod/http_client.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/runpod/http_client.py b/runpod/http_client.py index cea3f117..146d317b 100644 --- a/runpod/http_client.py +++ b/runpod/http_client.py @@ -38,20 +38,20 @@ def get_auth_header(): def AsyncClientSession(*args, **kwargs): """ - Deprecation from aiohttp.ClientSession forbids inheritance. - This is now a factory method + Factory method for an async client session with OpenTelemetry tracing. """ return ClientSession( connector=TCPConnector(limit=0), headers=get_auth_header(), timeout=ClientTimeout(600, ceil_threshold=400), + trace_configs=[create_trace_config()], *args, **kwargs, ) class SyncClientSession(requests.Session): - """ - Inherits requests.Session to override `request()` method for tracing - """ - pass + def __init__(self): + super().__init__() + self.headers.update(get_auth_header()) + RequestsInstrumentor().instrument(session=self) From b8377b85c5244482045e123f30a736e346c2824d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Dec 2024 20:44:15 -0800 Subject: [PATCH 31/53] tmp: capture job_output --- runpod/serverless/modules/rp_job.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index 3437da76..a8dab360 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -252,10 +252,18 @@ async def run_job_generator( if is_async_gen: async for output_partial in job_output: log.debug(f"Async Generator output: {output_partial}", job["id"]) + span.add_event( + "Async generator output", + attributes={"output_partial": str(output_partial)}, + ) yield {"output": output_partial} else: for output_partial in job_output: log.debug(f"Generator output: {output_partial}", job["id"]) + span.add_event( + "Async generator output", + attributes={"output_partial": str(output_partial)}, + ) yield {"output": output_partial} except Exception as err: From f10693fbb9c338f530c434936d1b343a1b801b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Dec 2024 23:45:43 -0800 Subject: [PATCH 32/53] tmp: check for "error" in a dict --- runpod/serverless/modules/rp_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index a8dab360..be7fc14b 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -127,7 +127,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) job_result = {"output": []} async for stream_output in generator_output: log.debug(f"Stream output: {stream_output}", job["id"]) - if "error" in stream_output: + if stream_output.get("error"): job_result = stream_output break if config.get("return_aggregate_stream", False): From 1aed002f06cd04f80a548aa045d798a6c1ff6f02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 07:50:00 -0800 Subject: [PATCH 33/53] tmp: capture and report error --- runpod/serverless/modules/rp_http.py | 2 ++ runpod/serverless/modules/rp_job.py | 16 +++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py index 02a8c059..242b7f4a 100644 --- a/runpod/serverless/modules/rp_http.py +++ b/runpod/serverless/modules/rp_http.py @@ -87,6 +87,7 @@ async def _handle_result( log.info("Finished.", job["id"]) +@tracer.start_as_current_span("send_result") async def send_result(session, job_data, job, is_stream=False): """ Return the job results. @@ -96,6 +97,7 @@ async def send_result(session, job_data, job, is_stream=False): ) +@tracer.start_as_current_span("stream_result") async def stream_result(session, job_data, job): """ Return the stream job results. diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index be7fc14b..a3eb1a71 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -119,7 +119,11 @@ async def get_job( return jobs +@tracer.start_as_current_span("handle_job") async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) -> dict: + span = trace.get_current_span() + span.set_attribute("request_id", job.get("id")) + if is_generator(config["handler"]): is_stream = True generator_output = run_job_generator(config["handler"], job) @@ -128,7 +132,9 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) async for stream_output in generator_output: log.debug(f"Stream output: {stream_output}", job["id"]) if stream_output.get("error"): - job_result = stream_output + span.record_exception(stream_output) + span.set_status(trace.Status(trace.StatusCode.ERROR, str(stream_output))) + await send_result(session, stream_output, job, is_stream=is_stream) break if config.get("return_aggregate_stream", False): job_result["output"].append(stream_output["output"]) @@ -252,18 +258,10 @@ async def run_job_generator( if is_async_gen: async for output_partial in job_output: log.debug(f"Async Generator output: {output_partial}", job["id"]) - span.add_event( - "Async generator output", - attributes={"output_partial": str(output_partial)}, - ) yield {"output": output_partial} else: for output_partial in job_output: log.debug(f"Generator output: {output_partial}", job["id"]) - span.add_event( - "Async generator output", - attributes={"output_partial": str(output_partial)}, - ) yield {"output": output_partial} except Exception as err: From 619d2c17c4e54b648f75d7bc50d1ee4499f204e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 08:08:14 -0800 Subject: [PATCH 34/53] tmp: record Stream output --- runpod/serverless/modules/rp_job.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index a3eb1a71..b7b60488 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -131,6 +131,10 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) job_result = {"output": []} async for stream_output in generator_output: log.debug(f"Stream output: {stream_output}", job["id"]) + span.add_event( + "Stream output", + attributes={"stream_output": str(stream_output)}, + ) if stream_output.get("error"): span.record_exception(stream_output) span.set_status(trace.Status(trace.StatusCode.ERROR, str(stream_output))) From bd6d7dece42f5702e9292321ef6a20470e7a54e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 08:08:35 -0800 Subject: [PATCH 35/53] tmp: avoid confusion with rp_job.handle_job --- runpod/serverless/modules/rp_scale.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index e00fd94d..c69adc89 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -242,7 +242,7 @@ async def run_jobs(self, session: ClientSession): job = await self.jobs_queue.get() # Create a new task for each job and add it to the task list - task = asyncio.create_task(self.handle_job(session, job)) + task = asyncio.create_task(self.run_job(session, job)) tasks.append(task) # Wait for any job to finish @@ -263,14 +263,14 @@ async def run_jobs(self, session: ClientSession): # Ensure all remaining tasks finish before stopping await asyncio.gather(*tasks) - async def handle_job(self, session: ClientSession, job: dict): + async def run_job(self, session: ClientSession, job: dict): """ Process an individual job. This function is run concurrently for multiple jobs. """ context = set_span_in_context(NonRecordingSpan(job["context"])) with tracer.start_as_current_span( - "handle_job", context=context, kind=SpanKind.CONSUMER + "run_job", context=context, kind=SpanKind.CONSUMER ) as span: try: From 85e4b4dde858c6fc9bb06270560e7ed0e62151be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 08:24:37 -0800 Subject: [PATCH 36/53] tmp: perform_job --- runpod/serverless/modules/rp_scale.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runpod/serverless/modules/rp_scale.py b/runpod/serverless/modules/rp_scale.py index c69adc89..10f417b7 100644 --- a/runpod/serverless/modules/rp_scale.py +++ b/runpod/serverless/modules/rp_scale.py @@ -242,7 +242,7 @@ async def run_jobs(self, session: ClientSession): job = await self.jobs_queue.get() # Create a new task for each job and add it to the task list - task = asyncio.create_task(self.run_job(session, job)) + task = asyncio.create_task(self.perform_job(session, job)) tasks.append(task) # Wait for any job to finish @@ -263,14 +263,14 @@ async def run_jobs(self, session: ClientSession): # Ensure all remaining tasks finish before stopping await asyncio.gather(*tasks) - async def run_job(self, session: ClientSession, job: dict): + async def perform_job(self, session: ClientSession, job: dict): """ Process an individual job. This function is run concurrently for multiple jobs. """ context = set_span_in_context(NonRecordingSpan(job["context"])) with tracer.start_as_current_span( - "run_job", context=context, kind=SpanKind.CONSUMER + "perform_job", context=context, kind=SpanKind.CONSUMER ) as span: try: From db3b6603220e2b1d3e9d5fdf9cd2f2863f74183a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 08:41:58 -0800 Subject: [PATCH 37/53] tmp: capture proper error --- runpod/serverless/modules/rp_job.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index b7b60488..b78322c2 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -135,10 +135,10 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) "Stream output", attributes={"stream_output": str(stream_output)}, ) - if stream_output.get("error"): - span.record_exception(stream_output) - span.set_status(trace.Status(trace.StatusCode.ERROR, str(stream_output))) - await send_result(session, stream_output, job, is_stream=is_stream) + if err_output := stream_output["output"].get("error"): + span.record_exception(err_output) + span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output))) + await send_result(session, stream_output["output"], job, is_stream=is_stream) break if config.get("return_aggregate_stream", False): job_result["output"].append(stream_output["output"]) From 962df33b3d67d66698de70e1494118103e2b4c1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 09:42:16 -0800 Subject: [PATCH 38/53] tmp: need to capture "error" in output better --- runpod/serverless/modules/rp_job.py | 38 ++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index b78322c2..c1428f56 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -119,6 +119,15 @@ async def get_job( return jobs +@tracer.start_as_current_span("handle_error") +def _handle_error(err_output: any, job: dict) -> bool: + span = trace.get_current_span() + + span.record_exception(err_output) + span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output))) + log.debug(f"Handled error: {err_output}", job["id"]) + + @tracer.start_as_current_span("handle_job") async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) -> dict: span = trace.get_current_span() @@ -130,16 +139,33 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) job_result = {"output": []} async for stream_output in generator_output: - log.debug(f"Stream output: {stream_output}", job["id"]) + # temp + log.debug(f"Stream output: {stream_output['output']}", job["id"]) span.add_event( "Stream output", - attributes={"stream_output": str(stream_output)}, + attributes={ + "stream_output": str(stream_output["output"]), + "stream_output_type": str(type(stream_output["output"])), + }, ) - if err_output := stream_output["output"].get("error"): - span.record_exception(err_output) - span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output))) - await send_result(session, stream_output["output"], job, is_stream=is_stream) + # end temp + + if type(stream_output["output"]) == dict: + if error_output := stream_output.get("error"): + _handle_error(error_output, job) + job_result = stream_output + break + + if type(stream_output["output"]) != str: + _handle_error(stream_output["output"], job) + job_result = stream_output + break + + if "error" in stream_output: + _handle_error(stream_output, job) + job_result = stream_output break + if config.get("return_aggregate_stream", False): job_result["output"].append(stream_output["output"]) From 8da936aa6c1225320bef0d481fba0291bf1282f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 10:07:06 -0800 Subject: [PATCH 39/53] tmp: record_exception fix --- runpod/serverless/modules/rp_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index c1428f56..7259740b 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -123,7 +123,7 @@ async def get_job( def _handle_error(err_output: any, job: dict) -> bool: span = trace.get_current_span() - span.record_exception(err_output) + span.record_exception(Exception(str(err_output))) span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output))) log.debug(f"Handled error: {err_output}", job["id"]) From 940b355aef6c00ac217bda45c5fa72e545e3353a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 10:34:15 -0800 Subject: [PATCH 40/53] tmp: explicit --- runpod/serverless/modules/rp_job.py | 30 +++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index 7259740b..a93eb165 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -140,28 +140,42 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) job_result = {"output": []} async for stream_output in generator_output: # temp - log.debug(f"Stream output: {stream_output['output']}", job["id"]) - span.add_event( - "Stream output", - attributes={ - "stream_output": str(stream_output["output"]), - "stream_output_type": str(type(stream_output["output"])), - }, - ) + log.debug(f"Stream output: {stream_output}", job["id"]) # end temp if type(stream_output["output"]) == dict: + span.add_event( + "Stream output is dict", + attributes={ + "stream_output": str(stream_output.get("output")), + "stream_output_type": str(type(stream_output.get("output"))), + }, + ) if error_output := stream_output.get("error"): _handle_error(error_output, job) job_result = stream_output break if type(stream_output["output"]) != str: + span.add_event( + "Stream output is not string", + attributes={ + "stream_output": str(stream_output.get("output")), + "stream_output_type": str(type(stream_output.get("output"))), + }, + ) _handle_error(stream_output["output"], job) job_result = stream_output break if "error" in stream_output: + span.add_event( + "Stream output has `error`", + attributes={ + "stream_output": str(stream_output), + "stream_output_type": str(type(stream_output)), + }, + ) _handle_error(stream_output, job) job_result = stream_output break From 7e024b281803b87b316aea76b9a886134b47b7f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 10:43:22 -0800 Subject: [PATCH 41/53] tmp: omg --- runpod/serverless/modules/rp_job.py | 30 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index a93eb165..82552e09 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -143,7 +143,19 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) log.debug(f"Stream output: {stream_output}", job["id"]) # end temp - if type(stream_output["output"]) == dict: + if stream_output.get("error"): + span.add_event( + "Stream output has `error`", + attributes={ + "stream_output": str(stream_output), + "stream_output_type": str(type(stream_output)), + }, + ) + _handle_error(stream_output, job) + job_result = stream_output + break + + if type(stream_output.get("output")) == dict: span.add_event( "Stream output is dict", attributes={ @@ -156,9 +168,9 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) job_result = stream_output break - if type(stream_output["output"]) != str: + if type(stream_output.get("output")) != str: span.add_event( - "Stream output is not string", + "Stream output is not string or dict", attributes={ "stream_output": str(stream_output.get("output")), "stream_output_type": str(type(stream_output.get("output"))), @@ -168,18 +180,6 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) job_result = stream_output break - if "error" in stream_output: - span.add_event( - "Stream output has `error`", - attributes={ - "stream_output": str(stream_output), - "stream_output_type": str(type(stream_output)), - }, - ) - _handle_error(stream_output, job) - job_result = stream_output - break - if config.get("return_aggregate_stream", False): job_result["output"].append(stream_output["output"]) From ccd01fd85fcd4c194f65045ba02a0263d5f5b68a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 10:56:36 -0800 Subject: [PATCH 42/53] tmp: fix --- runpod/serverless/modules/rp_job.py | 1 - 1 file changed, 1 deletion(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index 82552e09..965689b2 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -123,7 +123,6 @@ async def get_job( def _handle_error(err_output: any, job: dict) -> bool: span = trace.get_current_span() - span.record_exception(Exception(str(err_output))) span.set_status(trace.Status(trace.StatusCode.ERROR, str(err_output))) log.debug(f"Handled error: {err_output}", job["id"]) From 32eebaa31bf262ddc1d787447f81f3cdded9fa90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 11:03:43 -0800 Subject: [PATCH 43/53] tmp: shift error as job_result --- runpod/serverless/modules/rp_job.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index 965689b2..02f86ce9 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -142,7 +142,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) log.debug(f"Stream output: {stream_output}", job["id"]) # end temp - if stream_output.get("error"): + if error_output := stream_output.get("error"): span.add_event( "Stream output has `error`", attributes={ @@ -151,7 +151,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) }, ) _handle_error(stream_output, job) - job_result = stream_output + job_result = error_output break if type(stream_output.get("output")) == dict: From 1e9777fc9a06771f45169a9af6abbdb8b84e78f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 11:16:51 -0800 Subject: [PATCH 44/53] tmp: seriously? --- runpod/serverless/modules/rp_job.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index 02f86ce9..97ac6774 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -142,30 +142,30 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) log.debug(f"Stream output: {stream_output}", job["id"]) # end temp - if error_output := stream_output.get("error"): + if type(stream_output.get("output")) == dict: span.add_event( - "Stream output has `error`", + "Stream output has `output.error`", attributes={ "stream_output": str(stream_output), "stream_output_type": str(type(stream_output)), + "stream_output_error": str(stream_output["output"].get("error")), + "stream_output_error_type": str(type(stream_output["output"].get("error"))), }, ) - _handle_error(stream_output, job) - job_result = error_output - break + if stream_output["output"].get("error"): + stream_output["error"] = stream_output["output"]["error"] - if type(stream_output.get("output")) == dict: + if stream_output.get("error"): span.add_event( - "Stream output is dict", + "Stream output has `error`", attributes={ - "stream_output": str(stream_output.get("output")), - "stream_output_type": str(type(stream_output.get("output"))), + "stream_output": str(stream_output), + "stream_output_type": str(type(stream_output)), }, ) - if error_output := stream_output.get("error"): - _handle_error(error_output, job) - job_result = stream_output - break + _handle_error(stream_output, job) + job_result = stream_output + break if type(stream_output.get("output")) != str: span.add_event( From e432baf48f31ae34a583c8d62bad9589f631fd5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 11:42:34 -0800 Subject: [PATCH 45/53] tmp: 1 --- runpod/serverless/modules/rp_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index 97ac6774..ad1ec303 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -153,7 +153,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) }, ) if stream_output["output"].get("error"): - stream_output["error"] = stream_output["output"]["error"] + stream_output = {"error": stream_output["output"]["error"]} if stream_output.get("error"): span.add_event( From 3dbf5f16904436edd4c1f521fc7ebe9d0e5ec16f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 11:58:25 -0800 Subject: [PATCH 46/53] tmp: trace the transmit job_data --- runpod/serverless/modules/rp_http.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/runpod/serverless/modules/rp_http.py b/runpod/serverless/modules/rp_http.py index 242b7f4a..3d050f66 100644 --- a/runpod/serverless/modules/rp_http.py +++ b/runpod/serverless/modules/rp_http.py @@ -28,10 +28,14 @@ tracer = trace.get_tracer(__name__) +@tracer.start_as_current_span("transmit", kind=trace.SpanKind.CLIENT) async def _transmit(client_session: ClientSession, url, job_data): """ Wrapper for transmitting results via POST. """ + span = trace.get_current_span() + span.set_attribute("job_data", job_data) + retry_options = FibonacciRetry(attempts=3) retry_client = RetryClient( client_session=client_session, retry_options=retry_options From 86a357deda93e5058079f1dc42e350f98a71e0fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Dec 2024 12:01:36 -0800 Subject: [PATCH 47/53] tmp: stringify the error object --- runpod/serverless/modules/rp_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/serverless/modules/rp_job.py b/runpod/serverless/modules/rp_job.py index ad1ec303..7b99c18b 100644 --- a/runpod/serverless/modules/rp_job.py +++ b/runpod/serverless/modules/rp_job.py @@ -153,7 +153,7 @@ async def handle_job(session: ClientSession, config: Dict[str, Any], job: dict) }, ) if stream_output["output"].get("error"): - stream_output = {"error": stream_output["output"]["error"]} + stream_output = {"error": str(stream_output["output"]["error"])} if stream_output.get("error"): span.add_event( From bfd6a0b7be23245112531fa8806547b230ba696a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Fri, 13 Dec 2024 14:59:27 -0800 Subject: [PATCH 48/53] tmp: forced tracing by RUNPOD_LOG_LEVEL=TRACE --- runpod/otel.py | 65 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/runpod/otel.py b/runpod/otel.py index 3fa06ec1..69cff30d 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -1,45 +1,70 @@ import os +import logging +from typing import List from opentelemetry import trace from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.trace import TracerProvider, sampling -from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter +from opentelemetry.sdk.trace.export import SpanExporter, BatchSpanProcessor, ConsoleSpanExporter from opentelemetry.sdk.resources import ( Resource, DEPLOYMENT_ENVIRONMENT, SERVICE_NAME, SERVICE_VERSION, ) +from runpod.version import __version__ as runpod_version + + +log = logging.getLogger(__name__) +FMT = "%(filename)-20s:%(lineno)-4d %(asctime)s %(message)s" +logging.basicConfig(level=logging.INFO, format=FMT, handlers=[logging.StreamHandler()]) + +OTEL_COLLECTOR = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "") +OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01")) RUNPOD_ENDPOINT_ID = "runpod.endpoint_id" +RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "") RUNPOD_POD_ID = "runpod.pod_id" +RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "") +RUNPOD_ENV = os.getenv("ENV", "local") -from runpod.version import __version__ as runpod_version +if os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace": + log.setLevel(logging.TRACE) + sampler = sampling.ALWAYS_ON +else: + sampler = sampling.TraceIdRatioBased(OTEL_SAMPLING_RATE) -trace.set_tracer_provider( - TracerProvider( - sampler=sampling.ALWAYS_ON, - resource=Resource.create( - { - DEPLOYMENT_ENVIRONMENT: os.getenv("ENV"), - RUNPOD_ENDPOINT_ID: os.getenv("RUNPOD_ENDPOINT_ID"), - RUNPOD_POD_ID: os.getenv("RUNPOD_POD_ID"), - SERVICE_NAME: "runpod-python-sdk", - SERVICE_VERSION: runpod_version, - } - ) - ) +otlp_provider = TracerProvider( + sampler=sampler, + resource=Resource.create( + { + DEPLOYMENT_ENVIRONMENT: RUNPOD_ENV, + RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE, + RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE, + SERVICE_NAME: "runpod-python-sdk", + SERVICE_VERSION: runpod_version, + } + ), ) -tracer = trace.get_tracer_provider() +span_processors: List[SpanExporter] = [] + +if RUNPOD_ENV.lower() == "local": + span_processors.append(ConsoleSpanExporter()) + +if OTEL_COLLECTOR: + span_processors.append(OTLPSpanExporter()) -if os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT"): - tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) + trace.set_tracer_provider(otlp_provider) + tracer = trace.get_tracer_provider() -elif os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace": - tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) + for span_processor in span_processors: + tracer.add_span_processor(BatchSpanProcessor(span_processor)) + log.debug(f"Span processor: {span_processor}") else: # Use NoOpTracerProvider to disable OTEL trace.set_tracer_provider(trace.NoOpTracerProvider()) + tracer = trace.get_tracer_provider() + log.debug(f"No tracer is active") From 1ef0f35b26c66c7ba53f30d184692b24864440ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 17 Dec 2024 01:52:03 -0800 Subject: [PATCH 49/53] tmp: otel.start() to activate --- runpod/__init__.py | 1 - runpod/otel.py | 100 ++++++++---------- runpod/serverless/worker.py | 3 + .../test_serverless/test_modules/run_scale.py | 2 + 4 files changed, 49 insertions(+), 57 deletions(-) diff --git a/runpod/__init__.py b/runpod/__init__.py index 6ea28ade..6611587d 100644 --- a/runpod/__init__.py +++ b/runpod/__init__.py @@ -3,7 +3,6 @@ import logging import os -from . import otel from . import serverless from .api.ctl_commands import ( create_container_registry_auth, diff --git a/runpod/otel.py b/runpod/otel.py index 69cff30d..96bfd590 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -1,11 +1,9 @@ import os -import logging -from typing import List from opentelemetry import trace from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.trace import TracerProvider, sampling -from opentelemetry.sdk.trace.export import SpanExporter, BatchSpanProcessor, ConsoleSpanExporter +from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter from opentelemetry.sdk.resources import ( Resource, DEPLOYMENT_ENVIRONMENT, @@ -15,56 +13,46 @@ from runpod.version import __version__ as runpod_version -log = logging.getLogger(__name__) -FMT = "%(filename)-20s:%(lineno)-4d %(asctime)s %(message)s" -logging.basicConfig(level=logging.INFO, format=FMT, handlers=[logging.StreamHandler()]) - - -OTEL_COLLECTOR = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "") -OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01")) -RUNPOD_ENDPOINT_ID = "runpod.endpoint_id" -RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "") -RUNPOD_POD_ID = "runpod.pod_id" -RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "") -RUNPOD_ENV = os.getenv("ENV", "local") - - -if os.getenv("RUNPOD_LOG_LEVEL", "").lower() == "trace": - log.setLevel(logging.TRACE) - sampler = sampling.ALWAYS_ON -else: - sampler = sampling.TraceIdRatioBased(OTEL_SAMPLING_RATE) - -otlp_provider = TracerProvider( - sampler=sampler, - resource=Resource.create( - { - DEPLOYMENT_ENVIRONMENT: RUNPOD_ENV, - RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE, - RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE, - SERVICE_NAME: "runpod-python-sdk", - SERVICE_VERSION: runpod_version, - } - ), -) - -span_processors: List[SpanExporter] = [] - -if RUNPOD_ENV.lower() == "local": - span_processors.append(ConsoleSpanExporter()) - -if OTEL_COLLECTOR: - span_processors.append(OTLPSpanExporter()) - - trace.set_tracer_provider(otlp_provider) - tracer = trace.get_tracer_provider() - - for span_processor in span_processors: - tracer.add_span_processor(BatchSpanProcessor(span_processor)) - log.debug(f"Span processor: {span_processor}") - -else: - # Use NoOpTracerProvider to disable OTEL - trace.set_tracer_provider(trace.NoOpTracerProvider()) - tracer = trace.get_tracer_provider() - log.debug(f"No tracer is active") +def start(): + OTEL_COLLECTOR = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "") + OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01")) + + RUNPOD_ENV = os.getenv("ENV", "local").lower() + RUNPOD_LOG_LEVEL = os.getenv("RUNPOD_LOG_LEVEL", "").lower() + + RUNPOD_ENDPOINT_ID = "runpod.endpoint_id" + RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "") + RUNPOD_POD_ID = "runpod.pod_id" + RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "") + + if RUNPOD_LOG_LEVEL == "trace": + sampler = sampling.ALWAYS_ON + else: + sampler = sampling.TraceIdRatioBased(OTEL_SAMPLING_RATE) + + tracer = TracerProvider( + sampler=sampler, + resource=Resource.create( + { + DEPLOYMENT_ENVIRONMENT: RUNPOD_ENV, + RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE, + RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE, + SERVICE_NAME: "runpod-python-sdk", + SERVICE_VERSION: runpod_version, + } + ), + ) + + if OTEL_COLLECTOR: + tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) + trace.set_tracer_provider(tracer) + print("OpenTelemetry is on") + + elif RUNPOD_ENV == "local": + tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) + trace.set_tracer_provider(tracer) + print("Console tracing is on") + + else: + # Use NoOpTracerProvider to disable OTEL + trace.set_tracer_provider(trace.NoOpTracerProvider()) diff --git a/runpod/serverless/worker.py b/runpod/serverless/worker.py index ec98347d..fa262755 100644 --- a/runpod/serverless/worker.py +++ b/runpod/serverless/worker.py @@ -7,6 +7,7 @@ import os from typing import Any, Dict +from runpod import otel from runpod.serverless.modules import rp_logger, rp_local, rp_ping, rp_scale log = rp_logger.RunPodLogger() @@ -35,6 +36,8 @@ def run_worker(config: Dict[str, Any]) -> None: Args: config (Dict[str, Any]): Configuration parameters for the worker. """ + otel.start() + # Start pinging RunPod to show that the worker is alive. heartbeat.start_ping() diff --git a/tests/test_serverless/test_modules/run_scale.py b/tests/test_serverless/test_modules/run_scale.py index 5983c7a6..2150fea2 100644 --- a/tests/test_serverless/test_modules/run_scale.py +++ b/tests/test_serverless/test_modules/run_scale.py @@ -3,6 +3,7 @@ from faker import Faker from typing import Any, Dict, Optional, List +from runpod import otel from runpod.serverless.modules.rp_scale import JobScaler, RunPodLogger, JobsProgress fake = Faker() @@ -60,4 +61,5 @@ async def fake_handle_job(session, config, job) -> dict: "jobs_handler": fake_handle_job, } ) +otel.start() job_scaler.start() From ddff0a252f05d29812b3c9ba76a6cbcd8f2520b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 17 Dec 2024 02:02:09 -0800 Subject: [PATCH 50/53] tmp: print sampling strategy --- runpod/otel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runpod/otel.py b/runpod/otel.py index 96bfd590..3ec2bea7 100644 --- a/runpod/otel.py +++ b/runpod/otel.py @@ -46,12 +46,12 @@ def start(): if OTEL_COLLECTOR: tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) trace.set_tracer_provider(tracer) - print("OpenTelemetry is on") + print(f"OpenTelemetry is on: {sampler.get_description()}") elif RUNPOD_ENV == "local": tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) trace.set_tracer_provider(tracer) - print("Console tracing is on") + print(f"Console tracing is on: {sampler.get_description()}") else: # Use NoOpTracerProvider to disable OTEL From 14899393b62d435d4ff66f955be2783ecb0018a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Wed, 18 Dec 2024 00:54:25 -0800 Subject: [PATCH 51/53] tmp: pytest-env --- pyproject.toml | 1 + pytest.ini | 3 +++ setup.py | 1 + 3 files changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 0641259b..a8950274 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ test = [ "faker", "pytest-asyncio", "pytest-cov", + "pytest-env", "pytest-timeout", "pytest-watch", "pytest", diff --git a/pytest.ini b/pytest.ini index 68e2f208..1fecc333 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,3 +2,6 @@ addopts = --durations=10 --cov-config=.coveragerc --timeout=120 --timeout_method=thread --cov=runpod --cov-report=xml --cov-report=term-missing --cov-fail-under=90 -W error -p no:cacheprovider -p no:unraisableexception python_files = tests.py test_*.py *_test.py norecursedirs = venv *.egg-info .git build +env = + D:ENV=test + D:RUNPOD_LOG_LEVEL=ERROR \ No newline at end of file diff --git a/setup.py b/setup.py index 11fe7ce5..d9583e72 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ "nest_asyncio", "pytest", "pytest-cov", + "pytest-env", "pytest-timeout", "pytest-asyncio", ] From 749a2c77a68c41a28bba5a931a5fcfa02b0e9504 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 19 Dec 2024 05:05:42 -0800 Subject: [PATCH 52/53] tmp: fix `Attempting to instrument while already instrumented` --- runpod/http_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runpod/http_client.py b/runpod/http_client.py index 146d317b..268d548b 100644 --- a/runpod/http_client.py +++ b/runpod/http_client.py @@ -54,4 +54,5 @@ class SyncClientSession(requests.Session): def __init__(self): super().__init__() self.headers.update(get_auth_header()) - RequestsInstrumentor().instrument(session=self) + +RequestsInstrumentor().instrument() From ed057a34555a3936a2f68614536f354a1b905ccc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Thu, 19 Dec 2024 05:06:43 -0800 Subject: [PATCH 53/53] tmp: otel scoped to serverless only for now --- runpod/otel.py | 58 --------- runpod/serverless/modules/rp_tracer.py | 117 ++++++++++++++++++ runpod/serverless/worker.py | 5 +- .../test_serverless/test_modules/run_scale.py | 4 +- 4 files changed, 121 insertions(+), 63 deletions(-) delete mode 100644 runpod/otel.py create mode 100644 runpod/serverless/modules/rp_tracer.py diff --git a/runpod/otel.py b/runpod/otel.py deleted file mode 100644 index 3ec2bea7..00000000 --- a/runpod/otel.py +++ /dev/null @@ -1,58 +0,0 @@ -import os - -from opentelemetry import trace -from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.trace import TracerProvider, sampling -from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter -from opentelemetry.sdk.resources import ( - Resource, - DEPLOYMENT_ENVIRONMENT, - SERVICE_NAME, - SERVICE_VERSION, -) -from runpod.version import __version__ as runpod_version - - -def start(): - OTEL_COLLECTOR = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "") - OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01")) - - RUNPOD_ENV = os.getenv("ENV", "local").lower() - RUNPOD_LOG_LEVEL = os.getenv("RUNPOD_LOG_LEVEL", "").lower() - - RUNPOD_ENDPOINT_ID = "runpod.endpoint_id" - RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "") - RUNPOD_POD_ID = "runpod.pod_id" - RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "") - - if RUNPOD_LOG_LEVEL == "trace": - sampler = sampling.ALWAYS_ON - else: - sampler = sampling.TraceIdRatioBased(OTEL_SAMPLING_RATE) - - tracer = TracerProvider( - sampler=sampler, - resource=Resource.create( - { - DEPLOYMENT_ENVIRONMENT: RUNPOD_ENV, - RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE, - RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE, - SERVICE_NAME: "runpod-python-sdk", - SERVICE_VERSION: runpod_version, - } - ), - ) - - if OTEL_COLLECTOR: - tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) - trace.set_tracer_provider(tracer) - print(f"OpenTelemetry is on: {sampler.get_description()}") - - elif RUNPOD_ENV == "local": - tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) - trace.set_tracer_provider(tracer) - print(f"Console tracing is on: {sampler.get_description()}") - - else: - # Use NoOpTracerProvider to disable OTEL - trace.set_tracer_provider(trace.NoOpTracerProvider()) diff --git a/runpod/serverless/modules/rp_tracer.py b/runpod/serverless/modules/rp_tracer.py new file mode 100644 index 00000000..da8a0925 --- /dev/null +++ b/runpod/serverless/modules/rp_tracer.py @@ -0,0 +1,117 @@ +import os + +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.sdk.trace import TracerProvider, sampling +from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter +from opentelemetry.sdk.resources import ( + Resource, + DEPLOYMENT_ENVIRONMENT, + SERVICE_NAME, + SERVICE_VERSION, +) +from runpod.version import __version__ as runpod_version +from .rp_logger import RunPodLogger + + +log = RunPodLogger() + +# https://opentelemetry.io/docs/languages/sdk-configuration/otlp-exporter/ +OTEL_EXPORTER_OTLP_ENDPOINT = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "") + +# https://opentelemetry.io/docs/languages/sdk-configuration/general/#otel_service_name +OTEL_SERVICE_NAME = os.getenv("OTEL_SERVICE_NAME", "serverless-worker") + +OTEL_SAMPLING_RATE = float(os.getenv("OTEL_SAMPLING_RATE", "0.01")) + + +def start( + service_name: str = OTEL_SERVICE_NAME, + collector: str = OTEL_EXPORTER_OTLP_ENDPOINT, + rate: float = OTEL_SAMPLING_RATE, +): + """ + Initializes the OpenTelemetry global tracer provider. + + Args: + service_name: The service name to associate with the OTEL spans. + collector: The URL of the OTEL collector to report to. Defaults to + the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable. + rate: The sampling rate between 0.0 and 1.0. Defaults to the + `OTEL_SAMPLING_RATE` env var or 0.01 (1%) + + Notes: + The env var `RUNPOD_LOG_LEVEL=trace` can be set to force mandatory tracing. + Otherwise, the sampling rate is used to control the amount of tracing. + + If a collector is provided, the traces are exported to it. + Else if the environment is "local", the traces are printed to the console. + + If neither of the above conditions are met, then tracing is disabled. + """ + RUNPOD_ENV = get_deployment_env() + RUNPOD_LOG_LEVEL = os.getenv("RUNPOD_LOG_LEVEL", "").lower() + + if RUNPOD_LOG_LEVEL == "trace": + sampler = sampling.ALWAYS_ON + else: + sampler = sampling.TraceIdRatioBased(rate) + + tracer = TracerProvider( + sampler=sampler, + resource=get_resource(service_name, RUNPOD_ENV), + ) + + if collector: + tracer.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) + trace.set_tracer_provider(tracer) + log.info(f"OpenTelemetry is on: {sampler.get_description()}") + + elif RUNPOD_ENV == "local": + tracer.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) + trace.set_tracer_provider(tracer) + log.info(f"Tracing prints to console: {sampler.get_description()}") + + else: + # Use NoOpTracerProvider to disable OTEL + trace.set_tracer_provider(trace.NoOpTracerProvider()) + + +def get_resource(service_name: str, environment: str) -> Resource: + """ + Constructs and returns a Resource object for OpenTelemetry. + + The Resource object includes essential metadata such as deployment + environment, service name, service version, and unique identifiers + for the RunPod endpoint and pod. + + Args: + service_name: The name of the service to associate with the resource. + environment: The deployment environment (e.g., dev, prod, local). + + Returns: + A Resource object containing metadata for tracing and monitoring. + """ + RUNPOD_ENDPOINT_ID = "runpod.endpoint_id" + RUNPOD_ENDPOINT_ID_VALUE = os.getenv("RUNPOD_ENDPOINT_ID", "") + RUNPOD_POD_ID = "runpod.pod_id" + RUNPOD_POD_ID_VALUE = os.getenv("RUNPOD_POD_ID", "") + + return Resource.create( + { + DEPLOYMENT_ENVIRONMENT: environment, + RUNPOD_ENDPOINT_ID: RUNPOD_ENDPOINT_ID_VALUE, + RUNPOD_POD_ID: RUNPOD_POD_ID_VALUE, + SERVICE_NAME: service_name, + SERVICE_VERSION: runpod_version, + } + ) + + +def get_deployment_env() -> str: + RUNPOD_API_URL = os.getenv("RUNPOD_WEBHOOK_PING", "") + if "runpod.dev" in RUNPOD_API_URL: + return "dev" + if "runpod.ai" in RUNPOD_API_URL: + return "prod" + return "local" diff --git a/runpod/serverless/worker.py b/runpod/serverless/worker.py index fa262755..ed72e76f 100644 --- a/runpod/serverless/worker.py +++ b/runpod/serverless/worker.py @@ -7,8 +7,7 @@ import os from typing import Any, Dict -from runpod import otel -from runpod.serverless.modules import rp_logger, rp_local, rp_ping, rp_scale +from runpod.serverless.modules import rp_logger, rp_local, rp_ping, rp_scale, rp_tracer log = rp_logger.RunPodLogger() heartbeat = rp_ping.Heartbeat() @@ -36,7 +35,7 @@ def run_worker(config: Dict[str, Any]) -> None: Args: config (Dict[str, Any]): Configuration parameters for the worker. """ - otel.start() + rp_tracer.start() # Start pinging RunPod to show that the worker is alive. heartbeat.start_ping() diff --git a/tests/test_serverless/test_modules/run_scale.py b/tests/test_serverless/test_modules/run_scale.py index 2150fea2..1730505b 100644 --- a/tests/test_serverless/test_modules/run_scale.py +++ b/tests/test_serverless/test_modules/run_scale.py @@ -3,7 +3,7 @@ from faker import Faker from typing import Any, Dict, Optional, List -from runpod import otel +from runpod.serverless.modules import rp_tracer from runpod.serverless.modules.rp_scale import JobScaler, RunPodLogger, JobsProgress fake = Faker() @@ -61,5 +61,5 @@ async def fake_handle_job(session, config, job) -> dict: "jobs_handler": fake_handle_job, } ) -otel.start() +rp_tracer.start() job_scaler.start()