Merged
32 changes: 30 additions & 2 deletions README.md
@@ -245,6 +245,32 @@ Removes job directories and prunes git worktrees.
| `--keep` | clean | 0 | Number of recent jobs to keep |
| `--log` | peek | 0 | Number of log lines to show |

## Adding a new repo

1. Add a `[repos.<name>]` section to `~/.ptq/config.toml`:

```toml
[repos.torchtitan]
github_repo = "pytorch/torchtitan"
clone_url = "https://github.com/pytorch/torchtitan.git"
dir_name = "torchtitan"
smoke_test_import = "torchtitan"
repro_import_hint = "import torchtitan"
```

2. Create prompt templates in `prompts/`:
- `prompts/investigate_<name>.md` — issue investigation prompt
- `prompts/adhoc_<name>.md` — freeform task prompt

The prompt templates are where the real work is — they teach the agent about the repo's build system, directory layout, debugging tools, and testing conventions. See the existing `investigate.md` and `investigate_torchtitan.md` for examples.

Optional profile fields (all default to `false`/`null`):

| Field | Description |
|-------|-------------|
| `uses_custom_worktree_tool` | Use `tools/create_worktree.py` instead of `git worktree add` |
| `needs_cpp_build` | Run C++ rebuild after worktree creation |
| `lint_cmd` | Lint command to run before PRs |
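For instance, a hypothetical profile that opts into all three optional fields might look like this (the repo name and values are illustrative, not from a real config):

```toml
[repos.myrepo]
github_repo = "example/myrepo"
clone_url = "https://github.com/example/myrepo.git"
dir_name = "myrepo"
smoke_test_import = "myrepo"
repro_import_hint = "import myrepo"
uses_custom_worktree_tool = false  # plain `git worktree add` is fine
needs_cpp_build = true             # rebuild C++ after worktree creation
lint_cmd = "ruff check ."          # run before opening PRs
```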

## Project layout

```
@@ -277,8 +303,10 @@ pt_job_queue/
 │   ├── static/style.css             # Dark-theme styles
 │   └── templates/                   # Jinja2 templates (Pico CSS + htmx)
 ├── prompts/
-│   ├── investigate.md               # Issue investigation prompt
-│   └── adhoc.md                     # Freeform task prompt
+│   ├── investigate.md               # PyTorch issue investigation prompt
+│   ├── adhoc.md                     # PyTorch freeform task prompt
+│   ├── investigate_torchtitan.md    # TorchTitan issue investigation prompt
+│   └── adhoc_torchtitan.md          # TorchTitan freeform task prompt
 └── scripts/
     └── rebuild.sh
 ```
1 change: 1 addition & 0 deletions prompts/adhoc.md
@@ -11,6 +11,7 @@ You are performing a task on a PyTorch codebase.
- **PyTorch source** (edit here): `{workspace}/jobs/{job_id}/pytorch/`
- **Job artifacts** (write output here): `{workspace}/jobs/{job_id}/`
- **Rebuild script** (after C++ changes): `bash {workspace}/scripts/rebuild.sh {workspace}/jobs/{job_id}/pytorch`
- **Add-on repos** (available for cross-referencing): `{workspace}/torchtitan/`

## Task

66 changes: 66 additions & 0 deletions prompts/adhoc_torchtitan.md
@@ -0,0 +1,66 @@
# TorchTitan Task Agent

You are performing a task on a TorchTitan codebase.

## Job Info
- **Job ID**: {job_id}
- **Mode**: adhoc

## Environment
- **Python** (always use this): `{workspace}/jobs/{job_id}/.venv/bin/python`
- **TorchTitan source** (edit here): `{workspace}/jobs/{job_id}/torchtitan/`
- **Job artifacts** (write output here): `{workspace}/jobs/{job_id}/`

## Task

{task_description}

## Worklog

Maintain a running worklog at `{workspace}/jobs/{job_id}/worklog.md`. Append to it after each significant step (exploring, finding a clue, making a change, test results). Each entry should have a short heading and a few lines describing what you did and what you found. This lets the user check progress while you're still running.

## CRITICAL RULES

### Stay in your worktree
You MUST only read and write files within these directories:
- `{workspace}/jobs/{job_id}/` (your job directory — edits, scripts, artifacts)
- `{workspace}/pytorch/` (upstream PyTorch source — read and edit if the root cause is in PyTorch)
- `{workspace}/scripts/` (read-only)

NEVER `cd` outside these directories. All TorchTitan source is in YOUR worktree at `{workspace}/jobs/{job_id}/torchtitan/`.

### Always use your job's python
Run ALL python commands with `{workspace}/jobs/{job_id}/.venv/bin/python`. NEVER use bare `python`, `python3`, or any other python binary. NEVER use `conda`, `pip install`, or modify the environment.

### Syncing changes
- **Python changes**: Picked up automatically (editable install). No action needed.
- **C++ changes**: Not applicable; TorchTitan is pure Python, so there is no rebuild step.

## Debugging Tools

**Distributed training debugging**:
- Single-GPU debugging: `{workspace}/jobs/{job_id}/.venv/bin/torchrun --nproc_per_node=1 <script.py>`
- Multi-GPU: `{workspace}/jobs/{job_id}/.venv/bin/torchrun --nproc_per_node=N <script.py>`
- Enable debug logging: `TORCH_DISTRIBUTED_DEBUG=DETAIL <command>`
- Trace compilation: `TORCH_LOGS="output_code" <command>`

**CUDA errors**:
```
CUDA_LAUNCH_BLOCKING=1 PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer --tool memcheck {workspace}/jobs/{job_id}/.venv/bin/python <script.py>
```

## Output
Write these files to `{workspace}/jobs/{job_id}/`:

**report.md** — A concise summary of what you did and what you found.

**fix.diff** (if you made code changes) — Generate with:
```
cd {workspace}/jobs/{job_id}/torchtitan && git diff > {workspace}/jobs/{job_id}/fix.diff
```
If you also edited PyTorch source, generate a separate diff:
```
cd {workspace}/pytorch && git diff > {workspace}/jobs/{job_id}/pytorch-fix.diff
```

IMPORTANT: Always generate report.md before finishing. Generate fix.diff if you made any code changes.
1 change: 1 addition & 0 deletions prompts/investigate.md
@@ -11,6 +11,7 @@ You are investigating a PyTorch bug. Your goal is to reproduce, understand, and
- **PyTorch source** (edit here): `{workspace}/jobs/{job_id}/pytorch/`
- **Job artifacts** (write output here): `{workspace}/jobs/{job_id}/`
- **Rebuild script** (after C++ changes): `bash {workspace}/scripts/rebuild.sh {workspace}/jobs/{job_id}/pytorch`
- **Add-on repos** (available for cross-referencing): `{workspace}/torchtitan/`

## Issue Context

101 changes: 101 additions & 0 deletions prompts/investigate_torchtitan.md
@@ -0,0 +1,101 @@
# TorchTitan Issue Investigation Agent

You are investigating a TorchTitan bug. Your goal is to reproduce, understand, and fix the issue.

## Job Info
- **Job ID**: {job_id}
- **Issue**: pytorch/torchtitan#{issue_number}

## Environment
- **Python** (always use this): `{workspace}/jobs/{job_id}/.venv/bin/python`
- **TorchTitan source** (edit here): `{workspace}/jobs/{job_id}/torchtitan/`
- **Job artifacts** (write output here): `{workspace}/jobs/{job_id}/`

## Issue Context

{issue_context}

## Worklog

Maintain a running worklog at `{workspace}/jobs/{job_id}/worklog.md`. Append to it after each significant step (reproducing, finding a clue, making a fix attempt, test results). Each entry should have a short heading and a few lines describing what you did and what you found. This lets the user check progress while you're still running.

## CRITICAL RULES

### Stay in your worktree
You MUST only read and write files within these directories:
- `{workspace}/jobs/{job_id}/` (your job directory — edits, scripts, artifacts)
- `{workspace}/pytorch/` (upstream PyTorch source — read and edit if the root cause is in PyTorch)
- `{workspace}/scripts/` (read-only)

NEVER `cd` outside these directories. All TorchTitan source is in YOUR worktree at `{workspace}/jobs/{job_id}/torchtitan/`.

### Always use your job's python
Run ALL python commands with `{workspace}/jobs/{job_id}/.venv/bin/python`. NEVER use bare `python`, `python3`, or any other python binary. NEVER use `conda`, `pip install`, or modify the environment.

### Syncing changes
- **Python changes**: Picked up automatically (editable install). No action needed.
- **C++ changes**: Not applicable; TorchTitan is pure Python, so there is no rebuild step.

## Debugging Tools

**Distributed training debugging**:
- Single-GPU debugging: `{workspace}/jobs/{job_id}/.venv/bin/torchrun --nproc_per_node=1 <script.py>`
- Multi-GPU: `{workspace}/jobs/{job_id}/.venv/bin/torchrun --nproc_per_node=N <script.py>`
- Enable debug logging: `TORCH_DISTRIBUTED_DEBUG=DETAIL <command>`
- Trace compilation: `TORCH_LOGS="output_code" <command>`
- Disable async compile: `TORCHINDUCTOR_COMPILE_THREADS=1 <command>`

**CUDA errors**:
```
CUDA_LAUNCH_BLOCKING=1 PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer --tool memcheck {workspace}/jobs/{job_id}/.venv/bin/python <script.py>
```

## Instructions

### 1. Reproduce
- If a repro script exists at `{workspace}/jobs/{job_id}/repro.py`, run it:
```
{workspace}/jobs/{job_id}/.venv/bin/python {workspace}/jobs/{job_id}/repro.py
```
- If no repro script exists, write one based on the issue description and run it.
- For distributed issues, use `torchrun` with the appropriate number of processes.
- **You MUST confirm you can reproduce the reported failure before moving on.** If you cannot reproduce after reasonable attempts, stop and document in `report.md` that the issue could not be reproduced, including hardware, PyTorch version, TorchTitan version, and what you tried.

### 2. Investigate
- Read relevant TorchTitan source code in `{workspace}/jobs/{job_id}/torchtitan/`.
- Key source locations: `torchtitan/models/`, `torchtitan/parallelisms/`, `torchtitan/train.py`, `torchtitan/config_manager.py`
- **Also check upstream PyTorch** at `{workspace}/pytorch/` — TorchTitan bugs are often caused by changes in PyTorch (FSDP, tensor parallel, compile, distributed). Cross-reference if the stack trace touches `torch.*` internals.
- Trace the code path from the repro to the root cause.
- Understand how TorchTitan's parallelism wrappers, model definitions, and training loop interact.

### 3. Fix
- Edit source files in `{workspace}/jobs/{job_id}/torchtitan/` to fix the bug.
- If the root cause is in PyTorch, edit files in `{workspace}/pytorch/` instead.
- **Python-only changes**: picked up automatically.
- **C++ changes**: rebuild with `bash {workspace}/scripts/rebuild.sh {workspace}/pytorch`
- Make minimal, targeted changes.

### 4. Test
- Re-run the repro script to confirm the fix works.
- Write additional edge-case tests if appropriate.

### 5. Output
Write these files to `{workspace}/jobs/{job_id}/`:

**report.md** — A concise report covering:
- Summary of the issue
- Root cause analysis
- What the fix does
- Repro script — wrap in a collapsible `<details>` block with `<summary>Repro Script</summary>`, containing the full script as a fenced python code block followed by its output
- Test results

**fix.diff** — Generate with:
```
cd {workspace}/jobs/{job_id}/torchtitan && git diff > {workspace}/jobs/{job_id}/fix.diff
```
If you also edited PyTorch source, generate a separate diff:
```
cd {workspace}/pytorch && git diff > {workspace}/jobs/{job_id}/pytorch-fix.diff
```

IMPORTANT: Always generate both report.md and fix.diff before finishing.
42 changes: 29 additions & 13 deletions ptq/agent.py
@@ -3,13 +3,13 @@
 import re
 from pathlib import Path
 
-PROMPT_TEMPLATE = (
-    Path(__file__).parent.parent / "prompts" / "investigate.md"
-).read_text()
+from ptq.issue import format_issue_context
+from ptq.repo_profiles import get_profile
 
-ADHOC_PROMPT_TEMPLATE = (
-    Path(__file__).parent.parent / "prompts" / "adhoc.md"
-).read_text()
+PROMPTS_DIR = Path(__file__).parent.parent / "prompts"
+
+PROMPT_TEMPLATE = (PROMPTS_DIR / "investigate.md").read_text()
+ADHOC_PROMPT_TEMPLATE = (PROMPTS_DIR / "adhoc.md").read_text()
 
 RESERVED_HEADER_RE = re.compile(r"x-anthropic-\S+", re.IGNORECASE)
 
@@ -18,21 +18,33 @@
 MAX_OUTPUT_LINES = 30
 
 DEFAULT_MESSAGE = (
-    "Investigate and fix the PyTorch issue described in your system prompt."
+    "Investigate and fix the issue described in your system prompt."
 )
 
+_template_cache: dict[str, str] = {}
+
+
+def _load_template(filename: str) -> str:
+    if filename not in _template_cache:
+        _template_cache[filename] = (PROMPTS_DIR / filename).read_text()
+    return _template_cache[filename]
+
+
 def _sanitize_for_api(text: str) -> str:
     return RESERVED_HEADER_RE.sub("[redacted-header]", text)
 
 
 def build_system_prompt(
-    issue_data: dict, issue_number: int, job_id: str, workspace: str
+    issue_data: dict,
+    issue_number: int,
+    job_id: str,
+    workspace: str,
+    repo: str = "pytorch",
 ) -> str:
-    from ptq.issue import format_issue_context
-
+    profile = get_profile(repo)
+    template = _load_template(profile.prompt_template)
     return _sanitize_for_api(
-        PROMPT_TEMPLATE.format(
+        template.format(
             job_id=job_id,
             issue_number=issue_number,
             issue_context=format_issue_context(issue_data, issue_number),
Expand All @@ -41,9 +53,13 @@ def build_system_prompt(
)


def build_adhoc_prompt(message: str, job_id: str, workspace: str) -> str:
def build_adhoc_prompt(
message: str, job_id: str, workspace: str, repo: str = "pytorch"
) -> str:
profile = get_profile(repo)
template = _load_template(profile.adhoc_prompt_template)
return _sanitize_for_api(
ADHOC_PROMPT_TEMPLATE.format(
template.format(
job_id=job_id,
task_description=message,
workspace=workspace,
Expand Down
42 changes: 29 additions & 13 deletions ptq/application/job_service.py
@@ -3,6 +3,7 @@
 from ptq.domain.models import JobRecord, JobStatus
 from ptq.infrastructure.backends import backend_for_job
 from ptq.infrastructure.job_repository import JobRepository
+from ptq.repo_profiles import get_profile
 from ptq.ssh import Backend
 
 
@@ -30,20 +31,26 @@ def kill_job(repo: JobRepository, job_id: str) -> bool:
 def clean_single_job(repo: JobRepository, job_id: str) -> JobRecord:
     """Remove a job: kill agent, delete files, drop from DB. Returns the removed record."""
     job = repo.get(job_id)
+    profile = get_profile(job.repo)
     backend = backend_for_job(job)
     ws = backend.workspace
     job_dir = f"{ws}/jobs/{job_id}"
 
     if job.pid is not None and backend.is_pid_alive(job.pid):
         backend.kill_pid(job.pid)
 
-    backend.run(
-        f"cd {ws}/pytorch && {ws}/.venv/bin/python tools/create_worktree.py remove pytorch "
-        f"--parent-dir {job_dir}",
-        check=False,
-    )
+    if profile.uses_custom_worktree_tool:
+        backend.run(
+            f"cd {ws}/pytorch && {ws}/.venv/bin/python tools/create_worktree.py remove pytorch "
+            f"--parent-dir {job_dir}",
+            check=False,
+        )
+    else:
+        worktree_path = f"{job_dir}/{profile.dir_name}"
+        backend.run(f"git -C {ws}/{profile.dir_name} worktree remove --force {worktree_path}", check=False)
 
     backend.run(f"rm -rf {job_dir}", check=False)
-    backend.run(f"cd {ws}/pytorch && git worktree prune", check=False)
+    backend.run(f"cd {ws}/{profile.dir_name} && git worktree prune", check=False)
     repo.delete(job_id)
     return job
 
@@ -80,21 +87,30 @@ def clean_machine(
         return [], skipped_running
 
     ws = backend.workspace
-    backend.run(f"cd {ws}/pytorch && git worktree prune", check=False)
+    # Prune worktrees for all repos that have jobs being cleaned
+    repos_seen: set[str] = set()
 
     removed: list[str] = []
     for jid, job in to_remove:
+        profile = get_profile(job.repo)
+        repos_seen.add(job.repo)
         if job.pid is not None and backend.is_pid_alive(job.pid):
             backend.kill_pid(job.pid)
         job_dir = f"{ws}/jobs/{jid}"
-        backend.run(
-            f"cd {ws}/pytorch && {ws}/.venv/bin/python tools/create_worktree.py remove pytorch "
-            f"--parent-dir {job_dir}",
-            check=False,
-        )
+        if profile.uses_custom_worktree_tool:
+            backend.run(
+                f"cd {ws}/pytorch && {ws}/.venv/bin/python tools/create_worktree.py remove pytorch "
+                f"--parent-dir {job_dir}",
+                check=False,
+            )
+        else:
+            worktree_path = f"{job_dir}/{profile.dir_name}"
+            backend.run(f"git -C {ws}/{profile.dir_name} worktree remove --force {worktree_path}", check=False)
         backend.run(f"rm -rf {job_dir}")
         repo.delete(jid)
         removed.append(jid)
 
-    backend.run(f"cd {ws}/pytorch && git worktree prune", check=False)
+    for repo_name in repos_seen:
+        p = get_profile(repo_name)
+        backend.run(f"cd {ws}/{p.dir_name} && git worktree prune", check=False)
     return removed, skipped_running