pinchbench · olearycrew · Apr 21, 2026 · Apr 15, 2026
diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
@@ -727,6 +727,34 @@ def _extract_usage_from_transcript(transcript: List[Dict[str, Any]]) -> Dict[str
     return totals
 
 
+def _archive_transcript(
+    *,
+    agent_id: str,
+    current_session_id: str,
+    start_time: float,
+    output_dir: Optional[Path],
+    task_id: str,
+    session_index: int,
+) -> None:
+    """Copy a session's transcript into ``output_dir`` before a new session starts.
+
+    In multi-session tasks with ``new_session: true``, each session's
+    transcript is saved separately before the agent's session state is
+    cleaned up, so the grading engine can inspect the conversation history
+    across all sessions.
+
+    Archiving is best-effort: when no transcript file is found or
+    ``output_dir`` is not set nothing is copied, and any ``OSError`` raised
+    while creating the directory or copying the file is logged as a warning
+    instead of being propagated.
+
+    Args:
+        agent_id: Agent whose transcript is looked up.
+        current_session_id: Session whose transcript should be archived.
+        start_time: Start timestamp, forwarded unchanged to
+            ``_load_transcript``.
+        output_dir: Destination directory; ``None`` disables archiving.
+        task_id: Task identifier used as the archive file-name prefix.
+        session_index: Index embedded in the archive file name
+            (``{task_id}_session{session_index}.jsonl``).
+    """
+    import shutil as _shutil
+
+    # Only the on-disk path is needed here; the parsed transcript is discarded.
+    _, transcript_path = _load_transcript(agent_id, current_session_id, start_time)
+    if not (transcript_path and output_dir):
+        return
+
+    archive_dest = output_dir / f"{task_id}_session{session_index}.jsonl"
+    try:
+        # mkdir lives inside the try so a directory-creation failure is
+        # handled like a copy failure instead of aborting the whole task.
+        output_dir.mkdir(parents=True, exist_ok=True)
+        _shutil.copy2(transcript_path, archive_dest)
+    except OSError as exc:
+        logger.warning("Failed to archive session transcript: %s", exc)
+    else:
+        logger.info("Archived session %d transcript to %s", session_index, archive_dest)
+
+
 def execute_openclaw_task(
     *,
     task: Task,
@@ -773,19 +801,42 @@ def execute_openclaw_task(
     # Check if this is a multi-session task
     sessions = task.frontmatter.get("sessions", [])
     if sessions:
-        # Multi-session task: send each prompt in sequence
+        # Multi-session task: send each prompt in sequence.
+        # When a session entry has `new_session: true`, we start a fresh
+        # OpenClaw session so the agent has no conversation history from
+        # prior sessions — simulating a user returning after closing the
+        # agent.  The workspace (and any files created) are preserved.
         logger.info("📋 Multi-session task with %d sessions", len(sessions))
+        current_session_id = session_id
         for i, session_entry in enumerate(sessions, 1):
             # Extract prompt text from session entry (handle both string and dict formats)
+            is_new_session = False
             if isinstance(session_entry, str):
                 session_prompt = session_entry
             elif isinstance(session_entry, dict):
                 session_prompt = session_entry.get("prompt") or session_entry.get("message", "")
+                is_new_session = bool(session_entry.get("new_session", False))
             else:
                 logger.warning("⚠️ Skipping invalid session entry: %s", session_entry)
                 continue
 
-            logger.info("   Session %d/%d", i, len(sessions))
+            if is_new_session:
+                # Archive the current transcript before starting a new session
+                # so we don't lose the conversation history from prior sessions.
+                _archive_transcript(
+                    agent_id=agent_id,
+                    current_session_id=current_session_id,
+                    start_time=start_time,
+                    output_dir=output_dir,
+                    task_id=task.task_id,
+                    session_index=i - 1,
+                )
+                # Clean up old session state so the agent starts with a blank slate
+                cleanup_agent_sessions(agent_id)
+                current_session_id = f"{task.task_id}_s{i}_{int(time.time() * 1000)}"
+                logger.info("   🔄 Starting new session (session_id=%s)", current_session_id)
+
+            logger.info("   Session %d/%d (new_session=%s)", i, len(sessions), is_new_session)
             elapsed = time.time() - start_time
             remaining = timeout_seconds - elapsed
             if remaining <= 0:
@@ -798,7 +849,7 @@ def execute_openclaw_task(
                         "--agent",
                         agent_id,
                         "--session-id",
-                        session_id,
+                        current_session_id,
                         "--message",
                         session_prompt,
                     ]
@@ -860,7 +911,42 @@ def execute_openclaw_task(
         except FileNotFoundError as exc:
             stderr = f"openclaw command not found: {exc}"
 
-    transcript, transcript_path = _load_transcript(agent_id, session_id, start_time)
+    # For multi-session tasks that used new_session, the final transcript only
+    # contains the last session's conversation.  Merge archived session
+    # transcripts (from _archive_transcript) so the grading engine sees the
+    # full history across all sessions.
+    has_new_session = sessions and any(
+        isinstance(s, dict) and s.get("new_session") for s in sessions
+    )
+    if has_new_session:
+        final_transcript, transcript_path = _load_transcript(
+            agent_id, current_session_id, start_time
+        )
+        # Load archived session transcripts and merge them in order
+        merged_transcript: List[Dict[str, Any]] = []
+        if output_dir:
+            for session_idx in range(len(sessions)):
+                archive_path = output_dir / f"{task.task_id}_session{session_idx}.jsonl"
+                if archive_path.exists():
+                    try:
+                        for line in archive_path.read_text(encoding="utf-8").splitlines():
+                            if not line.strip():
+                                continue
+                            try:
+                                merged_transcript.append(json.loads(line))
+                            except json.JSONDecodeError:
+                                pass
+                        # Clean up per-session archive after merging
+                        try:
+                            archive_path.unlink()
+                        except OSError:
+                            pass
+                    except OSError as exc:
+                        logger.warning("Failed to read archived session transcript: %s", exc)
+        merged_transcript.extend(final_transcript)
+        transcript = merged_transcript
+    else:
+        transcript, transcript_path = _load_transcript(agent_id, session_id, start_time)
     usage = _extract_usage_from_transcript(transcript)
     execution_time = time.time() - start_time
 

diff --git a/tasks/TASK_TEMPLATE.md b/tasks/TASK_TEMPLATE.md
@@ -10,6 +10,18 @@ workspace_files: []
 #     dest: input.txt
 #   - source: assets/config.json
 #     dest: config.json
+# multi_session: true  # Optional: set to true for multi-turn tasks
+# sessions:            # Optional: list of sequential user prompts
+#   - id: step_1
+#     prompt: |
+#       First prompt sent to the agent.
+#   - id: step_2
+#     prompt: |
+#       Second prompt, sent in the same conversation context.
+#   - id: fresh_start
+#     new_session: true  # Start a new session (no conversation history)
+#     prompt: |
+#       Third prompt in a fresh session — agent has no memory of prior turns.
 ---
 
 # Task Template
@@ -243,6 +255,62 @@ workspace_files:
 
 ---
 
+## Multi-Session Tasks
+
+{Optional: Define a sequence of prompts to test multi-turn conversation, iterative refinement, or cross-session memory.}
+
+**YAML Frontmatter Format:**
+
+```yaml
+multi_session: true
+sessions:
+  - id: first_turn
+    prompt: |
+      First message sent to the agent.
+  - id: follow_up
+    prompt: |
+      Follow-up message in the same conversation context.
+      The agent retains all prior conversation history.
+  - id: fresh_context
+    new_session: true
+    prompt: |
+      Message sent in a brand-new session. The agent has no
+      conversation history from prior turns, but workspace
+      files created earlier are still accessible.
+```
+
+**Fields:**
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `id` | string | Unique identifier for this session step |
+| `prompt` | string | The user message sent to the agent |
+| `new_session` | bool | If `true`, start a fresh OpenClaw session with no conversation history. Workspace files are preserved. Default: `false` |
+
+**How It Works:**
+
+1. Each session entry's `prompt` is sent to the agent in order
+2. By default, all prompts share the same conversation context (the agent remembers previous turns)
+3. When `new_session: true` is set, the agent's session is reset before the prompt is sent — simulating a user returning after closing and reopening the agent
+4. The workspace directory (including any files the agent created) is **never** reset between sessions
+5. All session transcripts are merged, in session order, into a single transcript for grading
+
+**When to Use Multi-Session:**
+
+- **Iterative refinement**: User asks the agent to create something, then asks for modifications
+- **Cross-session memory**: Test if the agent can persist and recall information across separate conversations
+- **Follow-up instructions**: Test if the agent correctly applies new constraints to prior work
+- **Realistic workflows**: Simulate a user who interacts with the agent over multiple conversations
+
+**Guidelines:**
+
+- Use `new_session: true` when you want to test the agent's ability to recover context from files (not conversation history)
+- Keep the total number of sessions reasonable (2-5) so the task does not exceed its timeout
+- Increase `timeout_seconds` proportionally to the number of sessions
+- Ensure the Prompt section notes that this is a multi-session task
+
+---
+
 ## Additional Notes
 
 {Optional: Any additional context, edge cases, or implementation notes for task authors or developers.}
@@ -271,3 +339,6 @@ Before submitting a new task, verify:
 - [ ] Weights in rubric sum to 100% (if applicable)
 - [ ] Timeout is reasonable for the task complexity
 - [ ] Workspace files are included in `assets/` (if needed)
+- [ ] Multi-session prompts are in the `sessions` field (if applicable)
+- [ ] `new_session: true` is set for sessions that should start fresh (if applicable)
+- [ ] `timeout_seconds` accounts for multiple sessions (if applicable)