Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 90 additions & 4 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,34 @@ def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str
return totals


def _archive_transcript(
    *,
    agent_id: str,
    current_session_id: str,
    start_time: float,
    output_dir: Optional[Path],
    task_id: str,
    session_index: int,
) -> None:
    """Archive the transcript for a session before starting a new one.

    In multi-session tasks with ``new_session: true``, each session's
    transcript must be saved separately before the agent's session state is
    cleaned up, so the grading engine can inspect the full conversation
    history across all sessions.

    The transcript file located by ``_load_transcript`` is copied to
    ``<output_dir>/<task_id>_session<session_index>.jsonl``. Archiving is
    best-effort: if no transcript file is found, no ``output_dir`` is given,
    or the directory creation / copy fails with ``OSError``, the problem is
    logged and the function returns without raising.

    Args:
        agent_id: Agent whose transcript should be archived.
        current_session_id: Session id of the transcript to archive.
        start_time: Task start timestamp, forwarded to ``_load_transcript``.
        output_dir: Directory to write the archive into; ``None`` disables
            archiving.
        task_id: Task identifier used as the archive filename prefix.
        session_index: Index embedded in the archive filename.
    """
    # Only the on-disk path is needed here; the parsed entries are discarded.
    _, transcript_path = _load_transcript(agent_id, current_session_id, start_time)
    if not transcript_path or not output_dir:
        logger.debug(
            "Skipping archive for session %d: no transcript file or output directory",
            session_index,
        )
        return

    import shutil as _shutil

    archive_dest = output_dir / f"{task_id}_session{session_index}.jsonl"
    try:
        # mkdir lives inside the try so that a directory-creation failure is
        # treated the same as a copy failure instead of aborting the task.
        output_dir.mkdir(parents=True, exist_ok=True)
        _shutil.copy2(transcript_path, archive_dest)
    except OSError as exc:
        logger.warning("Failed to archive session transcript: %s", exc)
    else:
        logger.info("Archived session %d transcript to %s", session_index, archive_dest)


def execute_openclaw_task(
*,
task: Task,
Expand Down Expand Up @@ -773,19 +801,42 @@ def execute_openclaw_task(
# Check if this is a multi-session task
sessions = task.frontmatter.get("sessions", [])
if sessions:
# Multi-session task: send each prompt in sequence
# Multi-session task: send each prompt in sequence.
# When a session entry has `new_session: true`, we start a fresh
# OpenClaw session so the agent has no conversation history from
# prior sessions — simulating a user returning after closing the
# agent. The workspace (and any files created) are preserved.
logger.info("📋 Multi-session task with %d sessions", len(sessions))
current_session_id = session_id
for i, session_entry in enumerate(sessions, 1):
# Extract prompt text from session entry (handle both string and dict formats)
is_new_session = False
if isinstance(session_entry, str):
session_prompt = session_entry
elif isinstance(session_entry, dict):
session_prompt = session_entry.get("prompt") or session_entry.get("message", "")
is_new_session = bool(session_entry.get("new_session", False))
else:
logger.warning("⚠️ Skipping invalid session entry: %s", session_entry)
continue

logger.info(" Session %d/%d", i, len(sessions))
if is_new_session:
# Archive the current transcript before starting a new session
# so we don't lose the conversation history from prior sessions.
_archive_transcript(
agent_id=agent_id,
current_session_id=current_session_id,
start_time=start_time,
output_dir=output_dir,
task_id=task.task_id,
session_index=i - 1,
)
# Clean up old session state so the agent starts with a blank slate
cleanup_agent_sessions(agent_id)
current_session_id = f"{task.task_id}_s{i}_{int(time.time() * 1000)}"
logger.info(" 🔄 Starting new session (session_id=%s)", current_session_id)

logger.info(" Session %d/%d (new_session=%s)", i, len(sessions), is_new_session)
elapsed = time.time() - start_time
remaining = timeout_seconds - elapsed
if remaining <= 0:
Expand All @@ -798,7 +849,7 @@ def execute_openclaw_task(
"--agent",
agent_id,
"--session-id",
session_id,
current_session_id,
"--message",
session_prompt,
]
Expand Down Expand Up @@ -860,7 +911,42 @@ def execute_openclaw_task(
except FileNotFoundError as exc:
stderr = f"openclaw command not found: {exc}"

transcript, transcript_path = _load_transcript(agent_id, session_id, start_time)
# For multi-session tasks that used new_session, the final transcript only
# contains the last session's conversation. Merge archived session
# transcripts (from _archive_transcript) so the grading engine sees the
# full history across all sessions.
has_new_session = sessions and any(
isinstance(s, dict) and s.get("new_session") for s in sessions
)
if has_new_session:
final_transcript, transcript_path = _load_transcript(
agent_id, current_session_id, start_time
)
# Load archived session transcripts and merge them in order
merged_transcript: List[Dict[str, Any]] = []
if output_dir:
for session_idx in range(len(sessions)):
archive_path = output_dir / f"{task.task_id}_session{session_idx}.jsonl"
if archive_path.exists():
try:
for line in archive_path.read_text(encoding="utf-8").splitlines():
if not line.strip():
continue
try:
merged_transcript.append(json.loads(line))
except json.JSONDecodeError:
pass
# Clean up per-session archive after merging
try:
archive_path.unlink()
except OSError:
pass
except OSError as exc:
logger.warning("Failed to read archived session transcript: %s", exc)
merged_transcript.extend(final_transcript)
transcript = merged_transcript
else:
transcript, transcript_path = _load_transcript(agent_id, session_id, start_time)
usage = _extract_usage_from_transcript(transcript)
execution_time = time.time() - start_time

Expand Down
71 changes: 71 additions & 0 deletions tasks/TASK_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@ workspace_files: []
# dest: input.txt
# - source: assets/config.json
# dest: config.json
# multi_session: true # Optional: set to true for multi-turn tasks
# sessions: # Optional: list of sequential user prompts
# - id: step_1
# prompt: |
# First prompt sent to the agent.
# - id: step_2
# prompt: |
# Second prompt, sent in the same conversation context.
# - id: fresh_start
# new_session: true # Start a new session (no conversation history)
# prompt: |
# Third prompt in a fresh session — agent has no memory of prior turns.
---

# Task Template
Expand Down Expand Up @@ -243,6 +255,62 @@ workspace_files:

---

## Multi-Session Tasks

{Optional: Define a sequence of prompts to test multi-turn conversation, iterative refinement, or cross-session memory.}

**YAML Frontmatter Format:**

```yaml
multi_session: true
sessions:
- id: first_turn
prompt: |
First message sent to the agent.
- id: follow_up
prompt: |
Follow-up message in the same conversation context.
The agent retains all prior conversation history.
- id: fresh_context
new_session: true
prompt: |
Message sent in a brand-new session. The agent has no
conversation history from prior turns, but workspace
files created earlier are still accessible.
```

**Fields:**

| Field | Type | Description |
|-------|------|-------------|
| `id` | string | Unique identifier for this session step |
| `prompt` | string | The user message sent to the agent |
| `new_session` | bool | If `true`, start a fresh OpenClaw session with no conversation history. Workspace files are preserved. Default: `false` |

**How It Works:**

1. Each session entry's `prompt` is sent to the agent in order
2. By default, all prompts share the same conversation context (the agent remembers previous turns)
3. When `new_session: true` is set, the agent's session is reset before sending the prompt — simulating a user returning after closing and reopening the agent
4. The workspace directory (including any files the agent created) is **never** reset between sessions
5. All session transcripts are merged, in session order, into a single transcript for grading

**When to Use Multi-Session:**

- **Iterative refinement**: User asks the agent to create something, then asks for modifications
- **Cross-session memory**: Test if the agent can persist and recall information across separate conversations
- **Follow-up instructions**: Test if the agent correctly applies new constraints to prior work
- **Realistic workflows**: Simulate a user who interacts with the agent over multiple conversations

**Guidelines:**

- Use `new_session: true` when you want to test the agent's ability to recover context from files (not conversation history)
- Keep the total number of sessions reasonable (2-5): all sessions share a single `timeout_seconds` budget, so too many sessions can exhaust it before the last prompt is sent
- Increase `timeout_seconds` proportionally to the number of sessions
- Ensure the Prompt section notes that this is a multi-session task

---

## Additional Notes

{Optional: Any additional context, edge cases, or implementation notes for task authors or developers.}
Expand Down Expand Up @@ -271,3 +339,6 @@ Before submitting a new task, verify:
- [ ] Weights in rubric sum to 100% (if applicable)
- [ ] Timeout is reasonable for the task complexity
- [ ] Workspace files are included in `assets/` (if needed)
- [ ] Multi-session prompts are in the `sessions` field (if applicable)
- [ ] `new_session: true` is set for sessions that should start fresh (if applicable)
- [ ] `timeout_seconds` accounts for multiple sessions (if applicable)
Loading
Loading