Skip to content

Commit f0ae76d

Browse files
committed
adding back br wait
1 parent c73d115 commit f0ae76d

File tree

4 files changed

+58
-57
lines changed

4 files changed

+58
-57
lines changed

batchtools/br.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from .prom_metrics import (
2222
PROMETHEUS_INSTANCE,
2323
IN_PROGRESS,
24+
# record helpers (emit both histogram + counter)
2425
record_batch_observation,
2526
record_queue_observation,
2627
record_wall_observation,
@@ -37,7 +38,7 @@ class CreateJobCommandArgs(argparse.Namespace):
3738
name: str = "job"
3839
job_id: str = uuid.uuid5(uuid.NAMESPACE_OID, f"{os.getpid()}-{time.time()}").hex
3940
job_delete: bool = True
40-
wait: bool = True
41+
# no more wait flag exposed to users; br always waits for completion
4142
timeout: int = 60 * 15 * 4
4243
max_sec: int = 60 * 15
4344
gpu_numreq: int = 1
@@ -80,12 +81,7 @@ def build_parser(cls, subparsers: SubParserFactory):
8081
default=CreateJobCommandArgs.job_delete,
8182
help="Delete job on completion",
8283
)
83-
p.add_argument(
84-
"--wait",
85-
action=argparse.BooleanOptionalAction,
86-
default=CreateJobCommandArgs.wait,
87-
help="Wait for job completion",
88-
)
84+
# removed --wait / --no-wait flag: br always waits for completion now
8985
p.add_argument(
9086
"--timeout",
9187
default=CreateJobCommandArgs.timeout,
@@ -185,11 +181,12 @@ def run(args: argparse.Namespace):
185181
queue_wait = None
186182
total_wall = None
187183

188-
if args.wait:
189-
result_phase, run_elapsed, queue_wait, total_wall = log_job_output(
190-
job_name=job_name, wait=True, timeout=args.timeout
191-
)
184+
# always wait for the job to complete
185+
result_phase, run_elapsed, queue_wait, total_wall = log_job_output(
186+
job_name=job_name, wait=True, timeout=args.timeout
187+
)
192188

189+
# Emit metrics if we captured any timing
193190
if (
194191
run_elapsed is not None
195192
or queue_wait is not None
@@ -220,21 +217,21 @@ def run(args: argparse.Namespace):
220217
IN_PROGRESS.labels(**labels, result=result_phase).dec()
221218

222219
group = {
223-
"instance": PROMETHEUS_INSTANCE, # oc project
224-
"job_name": job_name, # unique name
220+
"instance": PROMETHEUS_INSTANCE, # e.g. "ope-test"
221+
"job_name": job_name # e.g. "job-none-b799d46a6f995a9d8d58b6aff76abefc"
225222
}
226223

227224
push_registry_text(grouping_key=group)
228225

229226
except oc.OpenShiftPythonException as e:
230227
sys.exit(f"Error occurred while creating job: {e}")
231228

232-
if args.job_delete and args.wait:
229+
if args.job_delete:
233230
print(f"RUNDIR: jobs/{job_name}")
234231
oc_delete("job", job_name)
235232
else:
236233
print(
237-
f"User specified not to wait, or not to delete, so {job_name} must be deleted by user.\n"
234+
f"User specified not to delete, so {job_name} must be deleted by user.\n"
238235
f"You can do this by running:\n"
239236
f" batchtools bd {job_name} OR\n"
240237
f" oc delete job {job_name}"
@@ -306,7 +303,6 @@ def log_job_output(
306303
total_wall = time.monotonic() - start_poll # fallback
307304
run_elapsed = total_wall - (queue_wait or 0.0)
308305
else:
309-
# Never reached Running; keep convention for failures:
310306
run_elapsed = 0.0 if result_phase == "failed" else None
311307
if total_wall is None:
312308
total_wall = 0.0

batchtools/prom_metrics.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import subprocess
12
from datetime import datetime, timezone
23
from .helpers import is_on_project
34

@@ -17,8 +18,9 @@
1718

1819
LONG_JOB_BUCKETS = (1, 2, 5, 10, 20, 30, 60, 120, 180, 300, 600, 900, float("inf"))
1920

20-
PUSHGATEWAY_ADDR = os.getenv("PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091")
21-
21+
PUSHGATEWAY_ADDR = os.getenv(
22+
"PUSHGATEWAY_ADDR", "pushgateway.ope-test.svc:9091"
23+
)
2224

2325
def detect_instance() -> str:
2426
if shutil.which("oc") is None:
@@ -92,7 +94,6 @@ def detect_instance() -> str:
9294
registry=registry,
9395
)
9496

95-
9697
def now_rfc3339() -> str:
9798
return datetime.now(timezone.utc).isoformat()
9899

@@ -140,7 +141,6 @@ def generate_metrics_text() -> tuple[str, str]:
140141
payload = generate_latest(registry)
141142
return payload.decode("utf-8"), CONTENT_TYPE_LATEST
142143

143-
144144
def push_registry_text(grouping_key: dict[str, str] | None = None) -> None:
145145
if not PUSHGATEWAY_ADDR:
146146
body, _ = generate_metrics_text()

pushgateway/prom.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,4 +48,4 @@ spec:
4848
- name: http
4949
port: 9091
5050
targetPort: 9091
51-
type: ClusterIP
51+
type: ClusterIP

tests/test_prom_metrics.py

Lines changed: 41 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
import batchtools.prom_metrics as pm
2+
import types
23
from datetime import datetime
34
from unittest import mock
45

56

67
def test_now_rfc3339_parses_with_timezone():
78
s = pm.now_rfc3339()
9+
# ISO 8601 parseable and includes timezone info (+00:00)
810
dt = datetime.fromisoformat(s)
911
assert dt.tzinfo is not None
1012
assert s.endswith("+00:00")
1113

1214

1315
def _labels(job="job-x", gpu="none", queue="dummy-localqueue", instance="ns"):
14-
return {"job_name": job, "gpu": gpu, "queue": queue, "instance": instance}
16+
return {"job": job, "gpu": gpu, "queue": queue, "instance": instance}
1517

1618

1719
def _registry_text() -> str:
@@ -76,58 +78,61 @@ def test_generate_metrics_text_returns_valid_payload_and_ctype():
7678

7779

7880
def test_push_registry_text_no_url_prints_payload(capsys):
79-
# check "no PUSHGATEWAY_ADDR" branch
80-
with mock.patch.object(pm, "PUSHGATEWAY_ADDR", ""):
81+
# Temporarily clear the push URL so it prints instead of POSTing
82+
with mock.patch.object(pm, "PROMETHEUS_PUSH_URL", "", create=True):
8183
pm.push_registry_text()
82-
8384
out = capsys.readouterr().out
84-
assert "PROM: PUSHGATEWAY_ADDR not set" in out
85-
assert "# HELP" in out
86-
assert "# TYPE" in out
85+
assert "PROMETHEUS_PUSH_URL not set" in out
86+
assert "# HELP" in out # the metrics text is printed
8787

8888

8989
def test_push_registry_text_posts_success(capsys):
9090
with (
91-
mock.patch.object(pm, "PUSHGATEWAY_ADDR", "pushgateway.example:9091"),
92-
mock.patch.object(pm, "pushadd_to_gateway") as mock_push,
91+
mock.patch.object(
92+
pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
93+
),
94+
mock.patch.object(pm.subprocess, "run") as mock_run,
9395
):
94-
pm.push_registry_text(grouping_key={"instance": "test-ns"})
96+
mock_run.return_value = types.SimpleNamespace(returncode=0)
9597

96-
out = capsys.readouterr().out
97-
assert "PROM: metrics pushed to pushgateway=pushgateway.example:9091" in out
98+
pm.push_registry_text()
99+
out = capsys.readouterr().out
100+
assert "metrics successfully pushed" in out
98101

99-
# verify pushadd_to_gateway was called with expected args
100-
mock_push.assert_called_once()
101-
args, kwargs = mock_push.call_args
102-
assert args[0] == "pushgateway.example:9091"
103-
# job and registry are passed as keyword args
104-
assert kwargs.get("job") == "batchtools"
105-
assert kwargs.get("registry") is pm.registry
106-
assert kwargs.get("grouping_key") == {"instance": "test-ns"}
102+
# Verify we invoked curl with POST and sent our payload
103+
assert mock_run.called
104+
args, kwargs = mock_run.call_args
105+
argv = args[0]
106+
assert "curl" in argv[0]
107+
assert "-X" in argv and "POST" in argv
108+
assert "http://example/metrics" in argv
109+
110+
payload = kwargs.get("input", b"").decode("utf-8")
111+
assert "# HELP" in payload
112+
assert "# TYPE" in payload
107113

108114

109115
def test_push_registry_text_posts_failure(capsys):
110116
with (
111-
mock.patch.object(pm, "PUSHGATEWAY_ADDR", "pushgateway.example:9091"),
112-
mock.patch.object(pm, "pushadd_to_gateway", side_effect=RuntimeError("boom")),
117+
mock.patch.object(
118+
pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
119+
),
120+
mock.patch.object(pm.subprocess, "run") as mock_run,
113121
):
114-
pm.push_registry_text(grouping_key={"instance": "test-ns"})
115-
116-
out = capsys.readouterr().out
117-
assert "PROM: failed to push metrics to pushgateway pushgateway.example:9091" in out
118-
assert "boom" in out
122+
mock_run.return_value = types.SimpleNamespace(returncode=7)
119123

124+
pm.push_registry_text()
125+
out = capsys.readouterr().out
126+
assert "curl returned nonzero exit 7" in out
120127

121-
def test_push_registry_text_handles_generic_exception(capsys):
122-
class WeirdError(Exception):
123-
pass
124128

129+
def test_push_registry_text_handles_exception(capsys):
125130
with (
126-
mock.patch.object(pm, "PUSHGATEWAY_ADDR", "pushgateway.example:9091"),
127-
mock.patch.object(pm, "pushadd_to_gateway", side_effect=WeirdError("weird")),
131+
mock.patch.object(
132+
pm, "PROMETHEUS_PUSH_URL", "http://example/metrics", create=True
133+
),
134+
mock.patch.object(pm.subprocess, "run", side_effect=RuntimeError("boom")),
128135
):
129-
pm.push_registry_text(grouping_key={"instance": "test-ns"})
130-
136+
pm.push_registry_text()
131137
out = capsys.readouterr().out
132-
assert "PROM: failed to push metrics to pushgateway pushgateway.example:9091" in out
133-
assert "weird" in out
138+
assert "failed to push metrics via curl" in out

0 commit comments

Comments
 (0)