diff --git a/.gitignore b/.gitignore index 17a9f02..550d57d 100644 --- a/.gitignore +++ b/.gitignore @@ -184,6 +184,10 @@ test_outputs/ eng_plans/ +# RL experiments +rl/experiments/ +rl/gameplay_logs/ + # RL training artifacts rl/models/*.zip !rl/models/cuttle_rl_final.zip diff --git a/Makefile b/Makefile index 0533d60..7356b3e 100644 --- a/Makefile +++ b/Makefile @@ -66,3 +66,38 @@ test-rl: source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python -c \ "from rl import config; config.TRAINING_CONFIG['total_timesteps'] = 10000; \ exec(open('rl/train.py').read())" + +debug-rl: + @echo "Running RL games with detailed logging..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/debug_gameplay.py + +analyze-rl: + @echo "Analyzing RL gameplay logs..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/analyze_logs.py + +view-rl: + @echo "Viewing RL gameplay logs..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/view_game.py + +hypersearch-rl: + @echo "Running hyperparameter search (full)..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/hyperparameter_search.py + +hypersearch-quick-rl: + @echo "Running quick hyperparameter search..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/hyperparameter_search.py --quick + +compare-rl: + @echo "Compare experiment results..." + @echo "Usage: make compare-rl DIR=rl/experiments/20260125_120000" + @if [ -z "$(DIR)" ]; then \ + echo "Error: DIR not specified"; \ + exit 1; \ + fi + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/compare_experiments.py $(DIR) + +monitor-rl: + @source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/monitor.py + +watch-rl: + @source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/monitor.py --watch diff --git a/game/game_state.py b/game/game_state.py index 4bfb6b6..e3672a6 100644 --- a/game/game_state.py +++ b/game/game_state.py @@ -826,6 +826,10 @@ def play_one_off( if card not in self.discard_pile: self._move_card_to_discard(card) + # One-off resolution is complete (counter accepted or effect applied). + self.resolving_one_off = False + self.one_off_card_to_counter = None + # Turn is finished after resolution return True, None diff --git a/game/rl_ai_player.py b/game/rl_ai_player.py index 74a3cf2..9df5a08 100644 --- a/game/rl_ai_player.py +++ b/game/rl_ai_player.py @@ -107,17 +107,10 @@ def _encode_game_state(self, game_state: GameState) -> np.ndarray: return self.env.env.unwrapped._encode_state() def _get_action_mask(self, legal_actions: List[Action]) -> np.ndarray: - """Get action mask for the current legal actions. - - Args: - legal_actions (List[Action]): List of legal actions. - - Returns: - np.ndarray: Boolean mask for valid actions. 
- """ - mask = np.zeros(50, dtype=bool) # Max 50 actions - mask[:len(legal_actions)] = True - return mask + """Get action mask for the current legal actions.""" + from rl.action_mapping import legal_action_mask_from_actions + + return legal_action_mask_from_actions(legal_actions) async def get_action( self, @@ -167,12 +160,14 @@ async def get_action( deterministic=True ) - # Ensure action index is valid - if action_index >= len(legal_actions): - action_index = 0 # Fallback to first legal action - - # Return the chosen action - return legal_actions[action_index] + action_index = int(action_index) + + from rl.action_mapping import build_action_map + + action_map = build_action_map(legal_actions) + if action_index not in action_map: + return legal_actions[0] + return action_map[action_index] except Exception as e: last_error = e diff --git a/requirements.txt b/requirements.txt index 2099c36..b03cf85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,33 @@ -astroid==3.2.4 -black==24.8.0 -click==8.1.7 -dill==0.3.9 -flake8==7.1.1 -fastapi==0.115.0 -isort==5.13.2 +astroid==4.0.3 +black==26.1.0 +click==8.3.1 +dill==0.4.1 +flake8==7.3.0 +fastapi==0.128.0 +isort==7.0.0 mccabe==0.7.0 -mypy==1.13.0 -mypy-extensions==1.0.0 -packaging==24.2 -pathspec==0.12.1 -platformdirs==4.3.6 -pdoc==14.7.0 -pycodestyle==2.12.1 -pyflakes==3.2.0 -pylint==3.2.7 -pytest==8.1.1 -pytest-timeout==2.3.1 -tomli==2.2.1 -tomlkit==0.13.2 -typing-extensions==4.12.2 -uvicorn[standard]==0.30.6 -ollama==0.4.6 -pytest-asyncio==0.23.8 +mypy==1.19.1 +mypy-extensions==1.1.0 +packaging==26.0 +pathspec==1.0.3 +platformdirs==4.5.1 +pdoc==16.0.0 +pycodestyle==2.14.0 +pyflakes==3.4.0 +pylint==4.0.4 +pytest==9.0.2 +pytest-timeout==2.4.0 +tomli==2.4.0 +tomlkit==0.14.0 +typing-extensions==4.15.0 +uvicorn[standard]==0.40.0 +ollama==0.6.1 +pytest-asyncio==1.3.0 # RL Training Dependencies -gymnasium==0.29.1 -stable-baselines3==2.2.1 -sb3-contrib==2.2.1 -torch>=2.0.0 -tensorboard==2.13.0 -numpy>=1.24.0 +gymnasium==1.2.3 +stable-baselines3==2.7.1 +sb3-contrib==2.7.1 +torch==2.10.0 +tensorboard==2.20.0 +numpy==2.4.1 diff --git a/rl/README.md b/rl/README.md index 36fe21a..267761f 100644 --- a/rl/README.md +++ b/rl/README.md @@ -1,312 +1,423 @@ # RL Training for Cuttle Game -Reinforcement Learning training setup for the Cuttle card game using **MaskablePPO** (Proximal Policy Optimization with action masking) from Stable Baselines3. +Reinforcement Learning training for the Cuttle card game using **MaskablePPO** (Proximal Policy Optimization with action masking) from Stable Baselines3. ## Quick Start ### Train a Model ```bash -# Full training (100K timesteps, ~1-2 minutes) +# Full training (500K timesteps, ~2-3 hours) make train-rl -# Or directly: -source cuttle-bot-3.12/bin/activate -PYTHONPATH=. python rl/train.py +# Quick test (10K timesteps, ~2-3 minutes) +make test-rl ``` ### Evaluate a Trained Model ```bash make eval-rl - -# Or directly: -source cuttle-bot-3.12/bin/activate -PYTHONPATH=. 
python rl/evaluate.py ``` ### Monitor Training ```bash make tensorboard -# Then open http://localhost:6006 +# Open http://localhost:6006 ``` -### Quick Test +## Hyperparameter Search + +Test multiple configurations to find the best settings: ```bash -# Quick test with 10K timesteps (~2-3 minutes) -make test-rl +# Quick search (50K steps each, ~1 hour total) +make hypersearch-quick-rl + +# Full search (200K steps each, ~3-4 hours) +make hypersearch-rl + +# Compare results +make compare-rl DIR=rl/experiments/ ``` -## File Structure +## Debugging Tools + +```bash +# Generate detailed gameplay logs +make debug-rl + +# Analyze action patterns +make analyze-rl +# View individual games interactively +make view-rl ``` -rl/ -├── README.md # This file -├── config.py # Hyperparameters and configuration -├── cuttle_env.py # Gymnasium environment wrapper -├── self_play_env.py # Self-play wrapper -├── train.py # Training script -├── evaluate.py # Evaluation script -├── models/ # Saved model checkpoints (gitignored) -│ └── cuttle_rl_final.zip -└── logs/ # TensorBoard logs (gitignored) + +--- + +## Architecture + +### Files + +| File | Purpose | +| -------------------------- | ----------------------------------------- | +| `config.py` | Hyperparameters and reward settings | +| `cuttle_env.py` | Gymnasium environment with action masking | +| `self_play_env.py` | Self-play wrapper for training | +| `train.py` | Main training script | +| `evaluate.py` | Model evaluation | +| `hyperparameter_search.py` | Automated config testing | +| `compare_experiments.py` | Result analysis | +| `debug_gameplay.py` | Generate debug logs | +| `analyze_logs.py` | Pattern analysis | +| `view_game.py` | Interactive game viewer | +| `game_logger.py` | Logging implementation | + +### Environment + +- **Observation space**: 610-dimensional vector encoding game state (hand, fields, scores, flags, discard, revealed) +- **Action space**: Discrete(8478) with fixed card-identity mapping (not per-turn legal-action indices) +- **Action masking**: Only legal actions are considered by the policy + +--- + +## Action Space Mapping (Fixed Indices) + +Action indices are stable across turns and map to a specific semantic action based on **card identity** (rank + suit), not on a per-turn legal-action list. The mapping lives in `rl/action_mapping.py`. + +### Card Identity Index + +Each card maps to `0..51` using a canonical order: + +``` +card_index = (rank_value - 1) * 4 + suit_value ``` -## Key Features +- `rank_value` comes from `game/card.py` `Rank` enum (Ace=1 .. King=13). +- `suit_value` comes from `Suit` enum (Clubs=0 .. Spades=3). -- **Action Masking**: Agent only considers legal moves (no invalid action penalties) -- **State Encoding**: 206-dimensional observation vector encoding full game state -- **Self-Play**: Trains against random opponent (extensible to previous model checkpoints) -- **Checkpointing**: Auto-saves model every 10K timesteps -- **TensorBoard Logging**: Real-time training metrics visualization +### Action Groups and Offsets -## Configuration +Action indices are grouped; each group has a fixed size and offset: -All configuration is in `config.py`: +1. Draw: `1` +2. Resolve one-off: `1` +3. Play points (card identity): `52` +4. Play face card (card identity): `52` +5. Play one-off (untargeted, card identity): `52` +6. Play one-off (targeted: attacker, target): `52 * 52` +7. Counter (two) (card identity): `52` +8. Take from discard (card identity): `52` +9. Discard from hand (four) (card identity): `52` +10. 
Discard revealed (seven) (card identity): `52` +11. Scuttle (attacker, target): `52 * 52` +12. Jack (attacker, target): `52 * 52` -### Training Hyperparameters +Total size: `8478`. + +For paired actions (scuttle/jack/targeted one-off), the pair index is: -```python -TRAINING_CONFIG = { - "total_timesteps": 100000, # Total training steps - "learning_rate": 3e-4, # Learning rate - "n_steps": 2048, # Steps per update - "batch_size": 64, # Minibatch size - "n_epochs": 10, # Epochs per update - "gamma": 0.99, # Discount factor - "gae_lambda": 0.95, # GAE parameter - "clip_range": 0.2, # PPO clip range - "ent_coef": 0.01, # Entropy coefficient -} +``` +pair_index = attacker_index * 52 + target_index ``` -### Reward Structure +### Mapping to Concrete Actions -```python -REWARD_CONFIG = { - "win": 100.0, # Win reward - "loss": -100.0, # Loss penalty - "stalemate": 0.0, # Draw reward - "progress_multiplier": 10.0, # Score progress multiplier - "turn_penalty": -1.0, # Per-turn penalty - "invalid_action_penalty": -50.0, # Shouldn't occur with masking -} +The environment builds a mapping each step using `game_state.get_legal_actions()`: + +1. `build_action_map(legal_actions)` converts each `Action` into a fixed index. +2. `action_masks()` marks only those indices as legal. +3. `step(action_index)` resolves the index back to the matching `Action`. + +### Illegal Action Handling + +If a predicted index does **not** map to a legal action: + +- The environment returns `invalid_action_penalty` and ends the episode early. +- This should not occur when action masking is properly applied. + +### Index Calculation Examples + +Assume `Rank` and `Suit` enum values from `game/card.py` (Clubs=0, Diamonds=1, Hearts=2, Spades=3) and: + +``` +card_index = (rank_value - 1) * 4 + suit_value ``` -### Environment Config +Also assume the group offsets from `rl/action_mapping.py`: -```python -ENV_CONFIG = { - "max_actions": 50, # Max actions per turn - "observation_dim": 206, # State vector size - "max_hand_size": 8, # Max cards in hand - "max_field_size": 10, # Max cards on field -} ``` +draw=0 +resolve=1 +points=2 +face=54 +one_off=106 +one_off_target=158 +counter=2862 +take_from_discard=2914 +discard_from_hand=2966 +discard_revealed=3018 +scuttle=3070 +jack=5774 +``` + +Examples: + +- **Play points**: `10 of Hearts` (rank=10, suit=Hearts=2) + `card_index = (10-1)*4 + 2 = 38` + `action_index = points_offset + card_index = 2 + 38 = 40` + +- **Seven one-off** (untargeted): `7 of Clubs` (rank=7, suit=Clubs=0) + `card_index = (7-1)*4 + 0 = 24` + `action_index = one_off_offset + card_index = 106 + 24 = 130` + +- **Seven from revealed pile** (discard revealed): `7 of Clubs` + `action_index = discard_revealed_offset + card_index = 3018 + 24 = 3042` -## How It Works +--- -### Action Masking +## Key Findings (might be outdated) -The environment uses **action masking** to ensure the agent only considers legal moves: +### Best Configuration: Baseline (Minimal Reward Shaping) -1. `get_legal_actions()` returns list of valid `Action` objects -2. Action mask is boolean array: `True` for legal actions, `False` for illegal -3. Model predicts action index into legal actions list -4. Mask prevents model from selecting invalid actions +After extensive hyperparameter search, the **simplest configuration** performed best: -**Benefits**: Faster training, no wasted exploration on invalid moves. 
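
> Editorial aside: the index arithmetic in the Action Space Mapping section above can be cross-checked with a minimal standalone sketch. It mirrors `rl/action_mapping.py` but substitutes plain integers for the game's `Rank`/`Suit` enums, using the numbering given above (Ace=1 .. King=13, Clubs=0 .. Spades=3); it is an illustration, not part of the patch.

```python
NUM_CARDS = 52
PAIR = NUM_CARDS * NUM_CARDS  # 2704

# Group sizes in canonical order; offsets are the cumulative sums.
GROUPS = [
    ("draw", 1), ("resolve", 1),
    ("points", NUM_CARDS), ("face", NUM_CARDS), ("one_off", NUM_CARDS),
    ("one_off_target", PAIR),
    ("counter", NUM_CARDS), ("take_from_discard", NUM_CARDS),
    ("discard_from_hand", NUM_CARDS), ("discard_revealed", NUM_CARDS),
    ("scuttle", PAIR), ("jack", PAIR),
]
OFFSETS, total = {}, 0
for name, size in GROUPS:
    OFFSETS[name] = total
    total += size
assert total == 8478  # full action-space size

def card_index(rank: int, suit: int) -> int:
    """Canonical 0..51 card identity (rank 1..13, suit 0..3)."""
    return (rank - 1) * 4 + suit

def pair_index(attacker: int, target: int) -> int:
    return attacker * NUM_CARDS + target

# Worked examples from the section above:
assert OFFSETS["points"] + card_index(10, 2) == 40              # 10 of Hearts for points
assert OFFSETS["one_off"] + card_index(7, 0) == 130             # 7 of Clubs, untargeted one-off
assert OFFSETS["discard_revealed"] + card_index(7, 0) == 3042   # discard revealed 7 of Clubs
# A paired action, e.g. scuttling a 9 of Spades with a 10 of Hearts:
assert OFFSETS["scuttle"] + pair_index(card_index(10, 2), card_index(9, 3)) == 5081
```
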
+```python +REWARD_CONFIG = { + "win": 100.0, + "loss": -100.0, + "progress_multiplier": 0.1, # Minimal + "turn_penalty": -0.01, # Minimal +} +``` -### State Encoding +**Why it works:** +- Sparse rewards (win/loss) let the agent learn actual game strategy +- Heavy reward shaping causes overfitting to intermediate rewards +- Agent learns to play the game, not exploit reward hacking -Game state is encoded as a **206-dimensional vector**: +### Hyperparameter Search Results -- **Hand cards** (136 dims): 8 slots × 17 dims (suit + rank) -- **Opponent hand size** (1 dim): Normalized -- **Player 0 field** (30 dims): 10 slots × 3 dims -- **Player 1 field** (30 dims): 10 slots × 3 dims -- **Scores & targets** (4 dims): Normalized scores -- **Game flags** (5 dims): Current player, resolving flags, deck/discard sizes +| Config | Eval Reward | Notes | +| ------------------ | ----------- | ----------------------------------- | +| **baseline** | -4.31 | ✅ Best - won games, longer episodes | +| aggressive_scoring | -9.07 | ❌ Crashed early | +| high_progress | -9.59 | ❌ Overfitted to progress | +| fast_learning | -9.86 | ❌ Unstable | +| conservative | -9.97 | ❌ Too slow | -### Training Flow +--- -1. Environment resets to new game -2. Agent observes state (206-dim vector) -3. Agent predicts action (with masking) -4. Action executed, reward calculated -5. Opponent takes turn (random, also masked) -6. Repeat until game ends -7. Model updates using PPO algorithm +## Important: Action Masking -## Usage Examples +### The Bug We Fixed -### Custom Training Run +MaskablePPO requires proper environment wrapping to use action masks: ```python -from rl import config -from rl.train import main +# WRONG - action masking doesn't work +env = CuttleRLEnvironment() +env = Monitor(env, LOG_DIR) -# Modify config -config.TRAINING_CONFIG["total_timesteps"] = 500000 -config.TRAINING_CONFIG["learning_rate"] = 1e-4 +# CORRECT - action masking works +from sb3_contrib.common.wrappers import ActionMasker -# Train -main() +def mask_fn(env): + while hasattr(env, 'env'): + if hasattr(env, 'action_masks'): + return env.action_masks() + env = env.env + return env.action_masks() + +env = CuttleRLEnvironment() +env = SelfPlayWrapper(env) +env = Monitor(env, LOG_DIR) +env = ActionMasker(env, mask_fn) # CRITICAL ``` -### Load and Use Model +### Why Action Masking Matters -```python -from sb3_contrib import MaskablePPO -from rl.cuttle_env import CuttleRLEnvironment +Without it: +- Agent attempts invalid actions (-10 penalty each) +- Episodes crash after 1-2 steps +- No learning occurs -# Load model -model = MaskablePPO.load("rl/models/cuttle_rl_final") +With it: +- Agent only sees legal actions +- Full games play out (50-150 steps) +- Agent learns actual strategy -# Create environment -env = CuttleRLEnvironment() -obs, info = env.reset() +--- + +## Configuration Reference -# Get action with masking -action_mask = env.action_masks() -action, _ = model.predict(obs, action_masks=action_mask, deterministic=True) +### Training Parameters -# Execute action -obs, reward, done, truncated, info = env.step(action) +```python +TRAINING_CONFIG = { + "total_timesteps": 500000, # How long to train + "learning_rate": 3e-4, # Adam optimizer LR + "n_steps": 2048, # Steps before policy update + "batch_size": 64, # Minibatch size + "n_epochs": 10, # Epochs per update + "gamma": 0.99, # Discount factor + "gae_lambda": 0.95, # GAE parameter + "clip_range": 0.2, # PPO clip range + "ent_coef": 0.01, # Entropy coefficient +} ``` -### Evaluate Custom Model +### Reward 
Parameters ```python -from rl.evaluate import evaluate_agent - -# Evaluate specific model -evaluate_agent("rl/models/cuttle_rl_100000_steps", n_episodes=50) +REWARD_CONFIG = { + "win": 100.0, # Terminal reward for winning + "loss": -100.0, # Terminal penalty for losing + "stalemate": -50.0, # Penalty for draw + "progress_multiplier": 0.1, # Intermediate reward multiplier + "turn_penalty": -0.01, # Per-turn penalty + "invalid_action_penalty": -10.0, # Safety check +} ``` -## Output Files +### Parameter Guidelines -### Models +| Parameter | Too Low | Recommended | Too High | +| --------------------- | ------------- | ------------- | ----------- | +| `learning_rate` | Slow learning | 1e-4 to 3e-4 | Unstable | +| `progress_multiplier` | No guidance | 0.1 to 1.0 | Overfitting | +| `turn_penalty` | Long games | -0.01 to -0.1 | Rushed play | -- `rl/models/cuttle_rl_final.zip` - Final trained model -- `rl/models/cuttle_rl_10000_steps.zip` - Checkpoint at 10K steps -- `rl/models/cuttle_rl_20000_steps.zip` - Checkpoint at 20K steps -- etc. +--- -### Logs +## Troubleshooting -- `rl/logs/` - TensorBoard logs - - View with: `make tensorboard` - - Metrics: reward, episode length, policy loss, value loss, etc. +### High Timeout Rate -## Dependencies +**Symptoms:** Games exceed 200 steps frequently -Required packages (already in main `requirements.txt`): +**Solutions:** +1. Increase `turn_penalty` slightly +2. Check for resolve loops in gameplay logs +3. Verify action masking is working -``` -gymnasium==0.29.1 -stable-baselines3==2.2.1 -sb3-contrib==2.2.1 -torch>=2.0.0 -tensorboard==2.15.1 -numpy>=1.24.0 -tqdm>=4.67.0 -rich>=14.2.0 -``` +### Negative Eval Rewards -## Troubleshooting +**Symptoms:** Agent loses more than wins -### Model Not Found +**Solutions:** +1. Train longer (500K+ timesteps) +2. Reduce reward shaping (use baseline config) +3. Check for invalid action penalties -``` -ERROR: Model not found at rl/models/cuttle_rl_final.zip -``` +### Short Eval Episodes -**Solution**: Train a model first with `make train-rl` +**Symptoms:** Episodes only 1-5 steps -### Import Errors +**Cause:** Action masking not working, or overfitted model -``` -ImportError: You must install tqdm and rich... -``` +**Solution:** Verify `ActionMasker` wrapper is applied correctly + +### Training Instability + +**Symptoms:** Reward oscillates wildly, NaN losses + +**Solutions:** +1. Reduce `learning_rate` by 10x +2. Increase `batch_size` +3. Check reward calculations for bugs + +--- + +## Development Notes + +### Testing Changes -**Solution**: Install missing packages: ```bash -source cuttle-bot-3.12/bin/activate -pip install tqdm rich -``` +# Run quick training test +make test-rl -### Games Taking Too Long +# Generate debug logs +make debug-rl -Untrained agents may play very long games (both players just drawing cards). This is normal! The agent needs more training to learn strategic play. +# Analyze patterns +make analyze-rl +``` -**Solution**: -- Train longer (increase `total_timesteps` in `config.py`) -- Adjust reward structure to encourage strategic moves -- Add episode length limits (see `cuttle_env.py`) +### Viewing Results -### Action Masking Not Working +```bash +# TensorBoard +tensorboard --logdir rl/logs -If you see "WARNING: Invalid action attempted", action masking may not be properly passed to the model. 
+# Compare experiments +make compare-rl DIR=rl/experiments/ -**Solution**: Ensure `action_masks` is passed to `model.predict()`: -```python -action_mask = env.action_masks() -action, _ = model.predict(obs, action_masks=action_mask) +# Interactive game viewer +make view-rl ``` -## Performance +### Output Locations -- **Training Speed**: ~1,200 FPS on modern CPU -- **100K Timesteps**: ~1-2 minutes -- **Model Size**: ~1-2 MB (compressed) -- **Memory Usage**: ~200-500 MB during training +| Output | Location | +| -------------- | ------------------- | +| Trained models | `rl/models/` | +| Training logs | `rl/logs/` | +| Experiments | `rl/experiments/` | +| Gameplay logs | `rl/gameplay_logs/` | -## Key Concepts +--- -### Action Masking +## Changelog -Action masking is **critical** for efficient training. Without it, the agent would waste time exploring invalid moves. The mask tells the model which actions are legal in the current state. +### 2026-01-25: Action Masking Fix -### Self-Play +**Problem:** MaskablePPO wasn't receiving action masks correctly. -Currently uses random opponent. Future improvements: -- Train against previous model checkpoints -- Use stronger opponents as agent improves -- Implement population-based training +**Impact:** +- Agents attempted invalid actions constantly +- Episodes crashed after 1-2 steps +- All previous training was invalid -### Reward Shaping +**Solution:** Added `ActionMasker` wrapper with proper environment unwrapping. -Rewards are designed to: -- Strongly reward winning (+100) -- Strongly penalize losing (-100) -- Provide intermediate feedback for score progress -- Slightly penalize each turn to encourage efficiency +**Files changed:** +- `train.py` - Added ActionMasker +- `hyperparameter_search.py` - Added ActionMasker +- `debug_gameplay.py` - Added ActionMasker -## Next Steps +### 2026-01-25: Hyperparameter Search -1. **Train Longer**: Increase `total_timesteps` to 1M+ for better strategy -2. **Tune Rewards**: Adjust `REWARD_CONFIG` to encourage specific behaviors -3. **Better Opponents**: Implement self-play with previous checkpoints -4. **Hyperparameter Tuning**: Experiment with learning rate, batch size, etc. -5. **Evaluation Metrics**: Add detailed analysis (action distribution, game length) +**Finding:** Baseline config (minimal reward shaping) outperforms all others. -## References +**Implication:** Sparse rewards work better than dense reward shaping for this game. 
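
> Editorial aside: "sparse" here means terminal win/loss rewards plus a small difference-based bonus for newly scored points, as in `_calculate_reward` in `rl/cuttle_env.py`. A minimal standalone sketch of that shape, with the adopted values inlined; the function name and signature are illustrative, not part of the codebase.

```python
from typing import Optional

def sparse_reward(
    prev_score: int,
    score: int,
    game_ended: bool,
    won: Optional[bool],
    progress_multiplier: float = 0.1,
    turn_penalty: float = -0.01,
) -> float:
    """Terminal outcome dominates; otherwise only newly scored points earn a small bonus."""
    if game_ended:
        if won is None:              # stalemate
            return -50.0
        return 100.0 if won else -100.0
    gain = score - prev_score        # difference-based: reward only new points
    return gain * progress_multiplier if gain > 0 else turn_penalty
```
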
+ +**Config adopted:** +```python +progress_multiplier: 0.1 # Was 10.0 +turn_penalty: -0.01 # Was -1.0 +``` -- **Detailed Documentation**: See `eng_plans/rl_implementation_summary.md` -- **Stable Baselines3**: https://stable-baselines3.readthedocs.io/ -- **MaskablePPO**: https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html -- **Gymnasium**: https://gymnasium.farama.org/ +### 2026-01-25: Debug Tools -## Notes +**Added:** +- `game_logger.py` - Step-by-step game logging +- `analyze_logs.py` - Automated pattern analysis +- `view_game.py` - Interactive game viewer +- `compare_experiments.py` - Experiment comparison -- Models and logs are gitignored (see `.gitignore`) -- Training is deterministic with fixed seeds -- Environment uses action masking - invalid actions should never occur -- State encoding is fixed-size (206 dims) for neural network compatibility +**Commands:** +- `make debug-rl` +- `make analyze-rl` +- `make view-rl` +- `make compare-rl` --- -**Last Updated**: 2025-10 \ No newline at end of file +## References + +- [Stable-Baselines3 Documentation](https://stable-baselines3.readthedocs.io/) +- [SB3-Contrib MaskablePPO](https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html) +- [PPO Paper](https://arxiv.org/abs/1707.06347) (Schulman et al., 2017) diff --git a/rl/action_mapping.py b/rl/action_mapping.py new file mode 100644 index 0000000..e2bd314 --- /dev/null +++ b/rl/action_mapping.py @@ -0,0 +1,150 @@ +"""Action mapping for Cuttle RL. + +Maps fixed action indices to game Action objects using card identity (rank/suit). +This avoids per-turn reindexing based on legal action list order. +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, Iterable, Optional + +import numpy as np + +from game.action import Action, ActionType +from game.card import Card + + +NUM_CARDS = 52 +PAIR_SIZE = NUM_CARDS * NUM_CARDS + + +@dataclass(frozen=True) +class ActionGroup: + name: str + size: int + + +ACTION_GROUPS = ( + ActionGroup("draw", 1), + ActionGroup("resolve", 1), + ActionGroup("points", NUM_CARDS), + ActionGroup("face", NUM_CARDS), + ActionGroup("one_off", NUM_CARDS), + ActionGroup("one_off_target", PAIR_SIZE), + ActionGroup("counter", NUM_CARDS), + ActionGroup("take_from_discard", NUM_CARDS), + ActionGroup("discard_from_hand", NUM_CARDS), + ActionGroup("discard_revealed", NUM_CARDS), + ActionGroup("scuttle", PAIR_SIZE), + ActionGroup("jack", PAIR_SIZE), +) + + +_OFFSETS: Dict[str, int] = {} +_running = 0 +for _group in ACTION_GROUPS: + _OFFSETS[_group.name] = _running + _running += _group.size + +ACTION_SPACE_SIZE = _running + + +def card_index(card: Card) -> int: + """Return canonical 0..51 index for a card based on rank/suit.""" + return (card.rank.value[1] - 1) * 4 + card.suit.value[1] + + +def _pair_index(attacker_idx: int, target_idx: int) -> int: + return attacker_idx * NUM_CARDS + target_idx + + +def action_to_index(action: Action) -> Optional[int]: + """Map a concrete Action to a fixed action index.""" + if action.action_type == ActionType.DRAW: + return _OFFSETS["draw"] + if action.action_type == ActionType.RESOLVE: + return _OFFSETS["resolve"] + if action.action_type == ActionType.POINTS: + if action.card is None: + return None + return _OFFSETS["points"] + card_index(action.card) + if action.action_type == ActionType.FACE_CARD: + if action.card is None: + return None + return _OFFSETS["face"] + card_index(action.card) + if action.action_type == ActionType.ONE_OFF: + if action.card is None: + return None + 
attacker_idx = card_index(action.card) + if action.target is None: + return _OFFSETS["one_off"] + attacker_idx + target_idx = card_index(action.target) + return _OFFSETS["one_off_target"] + _pair_index(attacker_idx, target_idx) + if action.action_type == ActionType.COUNTER: + if action.card is None: + return None + return _OFFSETS["counter"] + card_index(action.card) + if action.action_type == ActionType.TAKE_FROM_DISCARD: + if action.card is None: + return None + return _OFFSETS["take_from_discard"] + card_index(action.card) + if action.action_type == ActionType.DISCARD_FROM_HAND: + if action.card is None: + return None + return _OFFSETS["discard_from_hand"] + card_index(action.card) + if action.action_type == ActionType.DISCARD_REVEALED: + if action.card is None: + return None + return _OFFSETS["discard_revealed"] + card_index(action.card) + if action.action_type == ActionType.SCUTTLE: + if action.card is None or action.target is None: + return None + attacker_idx = card_index(action.card) + target_idx = card_index(action.target) + return _OFFSETS["scuttle"] + _pair_index(attacker_idx, target_idx) + if action.action_type == ActionType.JACK: + if action.card is None or action.target is None: + return None + attacker_idx = card_index(action.card) + target_idx = card_index(action.target) + return _OFFSETS["jack"] + _pair_index(attacker_idx, target_idx) + return None + + +def build_action_map(legal_actions: Iterable[Action]) -> Dict[int, Action]: + """Build a mapping from fixed action index to Action for the current state.""" + index_to_action: Dict[int, Action] = {} + for action in legal_actions: + idx = action_to_index(action) + if idx is None: + continue + if idx in index_to_action: + continue + index_to_action[idx] = action + return index_to_action + + +def legal_action_mask_from_actions( + legal_actions: Iterable[Action], + action_space_size: int = ACTION_SPACE_SIZE, +) -> np.ndarray: + """Return a boolean mask over the full action space for given legal actions.""" + mask = np.zeros(action_space_size, dtype=np.bool_) + for idx in build_action_map(legal_actions).keys(): + if 0 <= idx < action_space_size: + mask[idx] = True + return mask + + +def legal_action_mask(game_state) -> np.ndarray: + """Return a boolean mask over the full action space for a game state.""" + return legal_action_mask_from_actions( + game_state.get_legal_actions(), + action_space_size=ACTION_SPACE_SIZE, + ) + + +def action_index_to_action(game_state, action_index: int) -> Optional[Action]: + """Resolve a fixed action index into a concrete legal Action, if any.""" + action_map = build_action_map(game_state.get_legal_actions()) + return action_map.get(action_index) diff --git a/rl/analyze_logs.py b/rl/analyze_logs.py new file mode 100644 index 0000000..ed2221f --- /dev/null +++ b/rl/analyze_logs.py @@ -0,0 +1,140 @@ +"""Analyze RL gameplay logs to identify issues.""" +from __future__ import annotations + +import json +from collections import Counter, defaultdict +from pathlib import Path +from typing import Any, Dict, List + + +def analyze_logs(log_dir: str = "rl/gameplay_logs") -> None: + """Analyze gameplay logs to identify patterns and issues. 
+ + Args: + log_dir: Directory containing log files + """ + log_path = Path(log_dir) + + if not log_path.exists(): + print(f"❌ No logs found at {log_dir}") + print(" Run 'make debug-rl' first to generate logs") + return + + # Find all game logs + game_files = sorted(log_path.glob("game_*.json")) + + if not game_files: + print(f"❌ No game logs found in {log_dir}") + return + + print(f"\n{'='*70}") + print("RL GAMEPLAY ANALYSIS") + print(f"{'='*70}\n") + print(f"Analyzing {len(game_files)} games...\n") + + # Collect statistics + action_types = Counter() + action_patterns = defaultdict(list) + timeout_games = [] + quick_wins = [] + + for game_file in game_files: + with open(game_file, "r") as f: + game_data = json.load(f) + + game_id = game_data["game_id"] + steps = game_data["steps"] + outcome = game_data["outcome"] + + # Count action types + for step in steps: + action_type = step["action"]["type"] + action_types[action_type] += 1 + + # Detect patterns + recent_actions = [s["action"]["type"] for s in steps[-20:]] + pattern_key = " -> ".join(recent_actions[-5:]) if len(recent_actions) >= 5 else "" + action_patterns[pattern_key].append(game_id) + + # Categorize games + if outcome["reason"] == "timeout": + timeout_games.append({ + "id": game_id, + "steps": outcome["total_steps"], + "final_scores": outcome["final_scores"], + "recent_actions": recent_actions[-10:], + }) + elif outcome["total_steps"] < 50 and outcome["reason"] == "win": + quick_wins.append({ + "id": game_id, + "steps": outcome["total_steps"], + "winner": outcome["winner"], + }) + + # Print analysis + print("📊 ACTION TYPE DISTRIBUTION") + print("-" * 70) + total_actions = sum(action_types.values()) + for action_type, count in action_types.most_common(): + percentage = (count / total_actions) * 100 + bar = "█" * int(percentage / 2) + print(f" {action_type:20s} {count:5d} ({percentage:5.1f}%) {bar}") + + print(f"\n🔄 TIMEOUT GAMES: {len(timeout_games)}/{len(game_files)}") + print("-" * 70) + if timeout_games: + for game in timeout_games[:5]: # Show first 5 + print(f"\n Game {game['id']}:") + print(f" Steps: {game['steps']}") + print(f" Final scores: P0={game['final_scores']['player_0']}, " + f"P1={game['final_scores']['player_1']}") + print(f" Last 10 actions: {' -> '.join(game['recent_actions'])}") + + if len(timeout_games) > 5: + print(f"\n ... 
and {len(timeout_games) - 5} more timeout games") + + print(f"\n⚡ QUICK WINS: {len(quick_wins)}/{len(game_files)}") + print("-" * 70) + if quick_wins: + for game in quick_wins: + print(f" Game {game['id']}: Winner P{game['winner']} in {game['steps']} steps") + + # Detect stuck patterns + print(f"\n🔍 COMMON ACTION PATTERNS (last 5 moves)") + print("-" * 70) + common_patterns = [(p, len(games)) for p, games in action_patterns.items() + if len(games) > 1 and p] + common_patterns.sort(key=lambda x: x[1], reverse=True) + + for pattern, count in common_patterns[:10]: + print(f" [{count} games] {pattern}") + + # Recommendations + print(f"\n💡 RECOMMENDATIONS") + print("-" * 70) + + draw_percentage = (action_types.get("Draw", 0) / total_actions) * 100 + if draw_percentage > 40: + print(" ⚠️ HIGH DRAW RATE: Bot is drawing too often without playing cards") + print(" Consider adjusting reward to penalize excessive draws") + + if len(timeout_games) / len(game_files) > 0.5: + print(" ⚠️ HIGH TIMEOUT RATE: Games are not progressing") + print(" Bot may not understand how to play for points") + print(" Consider:") + print(" - Increase reward for playing point cards") + print(" - Add reward shaping for field control") + print(" - Reduce max_steps or add progress penalty") + + points_percentage = (action_types.get("Points", 0) / total_actions) * 100 + if points_percentage < 10: + print(" ⚠️ LOW POINTS PLAY: Bot rarely plays point cards") + print(" Increase reward for point cards significantly") + + print(f"\n{'='*70}\n") + print(f"📁 Full logs available at: {log_path.absolute()}") + print(f"{'='*70}\n") + + +if __name__ == "__main__": + analyze_logs() diff --git a/rl/compare_experiments.py b/rl/compare_experiments.py new file mode 100644 index 0000000..3177523 --- /dev/null +++ b/rl/compare_experiments.py @@ -0,0 +1,227 @@ +"""Compare results from multiple experiments.""" +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any, Dict, List + +import numpy as np + + +def load_monitor_data(monitor_file: Path) -> Dict[str, Any]: + """Load data from a stable-baselines3 monitor file. + + Args: + monitor_file: Path to monitor.csv file + + Returns: + Dictionary with episode rewards and lengths + """ + if not monitor_file.exists(): + return {"rewards": [], "lengths": [], "times": []} + + rewards = [] + lengths = [] + times = [] + + with open(monitor_file, "r") as f: + # Skip header lines + for _ in range(2): + f.readline() + + # Read data + for line in f: + if line.strip(): + parts = line.strip().split(',') + if len(parts) >= 3: + try: + rewards.append(float(parts[0])) + lengths.append(int(parts[1])) + times.append(float(parts[2])) + except ValueError: + continue + + return { + "rewards": rewards, + "lengths": lengths, + "times": times, + } + + +def analyze_experiment(exp_path: Path) -> Dict[str, Any]: + """Analyze results from a single experiment. 
+ + Args: + exp_path: Path to experiment directory + + Returns: + Dictionary with analysis results + """ + # Load config + config_file = exp_path / "config.json" + if not config_file.exists(): + return {"error": "No config.json found"} + + with open(config_file, "r") as f: + config = json.load(f) + + # Load training monitor data + train_monitor = exp_path / "train.monitor.csv" + train_data = load_monitor_data(train_monitor) + + # Load evaluation monitor data + eval_monitor = exp_path / "eval.monitor.csv" + eval_data = load_monitor_data(eval_monitor) + + # Calculate statistics + analysis = { + "name": config["name"], + "config": config, + "train": { + "total_episodes": len(train_data["rewards"]), + "mean_reward": float(np.mean(train_data["rewards"])) if train_data["rewards"] else 0.0, + "std_reward": float(np.std(train_data["rewards"])) if train_data["rewards"] else 0.0, + "mean_length": float(np.mean(train_data["lengths"])) if train_data["lengths"] else 0.0, + "final_100_mean_reward": float(np.mean(train_data["rewards"][-100:])) if len(train_data["rewards"]) >= 100 else 0.0, + }, + "eval": { + "total_episodes": len(eval_data["rewards"]), + "mean_reward": float(np.mean(eval_data["rewards"])) if eval_data["rewards"] else 0.0, + "std_reward": float(np.std(eval_data["rewards"])) if eval_data["rewards"] else 0.0, + "mean_length": float(np.mean(eval_data["lengths"])) if eval_data["lengths"] else 0.0, + "best_reward": float(max(eval_data["rewards"])) if eval_data["rewards"] else 0.0, + }, + } + + # Check for timeout issues + timeout_rate = sum(1 for length in train_data["lengths"] if length >= 200) / max(len(train_data["lengths"]), 1) + analysis["train"]["timeout_rate"] = float(timeout_rate) + + return analysis + + +def compare_experiments(experiments_dir: Path) -> None: + """Compare all experiments in a directory. + + Args: + experiments_dir: Directory containing experiment subdirectories + """ + if not experiments_dir.exists(): + print(f"❌ Directory not found: {experiments_dir}") + return + + # Find all experiment directories + exp_dirs = [d for d in experiments_dir.iterdir() if d.is_dir() and (d / "config.json").exists()] + + if not exp_dirs: + print(f"❌ No experiments found in {experiments_dir}") + return + + print(f"\n{'='*80}") + print("EXPERIMENT COMPARISON") + print(f"{'='*80}") + print(f"Found {len(exp_dirs)} experiments\n") + + # Analyze all experiments + analyses = [] + for exp_dir in sorted(exp_dirs): + analysis = analyze_experiment(exp_dir) + if "error" not in analysis: + analyses.append(analysis) + + if not analyses: + print("❌ No valid experiments to compare") + return + + # Sort by evaluation mean reward (best first) + analyses.sort(key=lambda x: x["eval"]["mean_reward"], reverse=True) + + # Print comparison table + print(f"{'Rank':<6} {'Name':<25} {'Train Reward':<15} {'Eval Reward':<15} {'Timeout %':<12} {'Avg Length':<12}") + print(f"{'-'*6} {'-'*25} {'-'*15} {'-'*15} {'-'*12} {'-'*12}") + + for i, analysis in enumerate(analyses, 1): + train_reward = analysis["train"]["final_100_mean_reward"] + eval_reward = analysis["eval"]["mean_reward"] + timeout_pct = analysis["train"]["timeout_rate"] * 100 + avg_length = analysis["train"]["mean_length"] + + print(f"{i:<6} {analysis['name']:<25} {train_reward:>14.2f} {eval_reward:>14.2f} {timeout_pct:>11.1f} {avg_length:>11.1f}") + + # Detailed analysis of top 3 + print(f"\n{'='*80}") + print("TOP 3 EXPERIMENTS (Detailed)") + print(f"{'='*80}\n") + + for i, analysis in enumerate(analyses[:3], 1): + print(f"{i}. 
{analysis['name']}") + print(f" {'-'*76}") + print(f" Description: {analysis['config']['description']}") + print(f"\n Training Config:") + for key, value in analysis['config']['training'].items(): + print(f" {key:20s}: {value}") + print(f"\n Reward Config:") + for key, value in analysis['config']['reward'].items(): + print(f" {key:20s}: {value}") + print(f"\n Training Results:") + print(f" Episodes: {analysis['train']['total_episodes']}") + print(f" Mean Reward: {analysis['train']['mean_reward']:.2f} ± {analysis['train']['std_reward']:.2f}") + print(f" Final 100 Reward: {analysis['train']['final_100_mean_reward']:.2f}") + print(f" Mean Episode Length: {analysis['train']['mean_length']:.1f}") + print(f" Timeout Rate: {analysis['train']['timeout_rate']*100:.1f}%") + print(f"\n Evaluation Results:") + print(f" Episodes: {analysis['eval']['total_episodes']}") + print(f" Mean Reward: {analysis['eval']['mean_reward']:.2f} ± {analysis['eval']['std_reward']:.2f}") + print(f" Best Reward: {analysis['eval']['best_reward']:.2f}") + print(f" Mean Episode Length: {analysis['eval']['mean_length']:.1f}") + print() + + # Recommendations + print(f"{'='*80}") + print("RECOMMENDATIONS") + print(f"{'='*80}\n") + + best = analyses[0] + + print(f"🏆 Best performing: {best['name']}") + print(f" Mean eval reward: {best['eval']['mean_reward']:.2f}") + + if best['train']['timeout_rate'] > 0.3: + print(f"\n⚠️ Warning: High timeout rate ({best['train']['timeout_rate']*100:.1f}%)") + print(" Consider:") + print(" - Increase progress_multiplier") + print(" - Increase turn_penalty") + print(" - Train for more timesteps") + + if best['eval']['mean_reward'] < 0: + print(f"\n⚠️ Warning: Negative mean reward") + print(" The agent is losing more than winning. Consider:") + print(" - Adjusting reward shaping") + print(" - Training for more timesteps") + print(" - Using a different learning rate") + + print(f"\n📊 View detailed metrics:") + print(f" tensorboard --logdir {experiments_dir.absolute()}") + + print(f"\n🎮 Test best model:") + best_model = experiments_dir / best['name'] / "best_model" / "best_model.zip" + if best_model.exists(): + print(f" make debug-rl MODEL={best_model}") + + print() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Compare experiment results") + parser.add_argument( + "experiments_dir", + type=Path, + help="Directory containing experiments", + ) + + args = parser.parse_args() + + compare_experiments(args.experiments_dir) diff --git a/rl/config.py b/rl/config.py index b9fb0b6..3459119 100644 --- a/rl/config.py +++ b/rl/config.py @@ -2,8 +2,9 @@ from typing import Any, Dict # Training hyperparameters for MaskablePPO algorithm +# Using baseline config - performed best in hyperparameter search TRAINING_CONFIG: Dict[str, Any] = { - "total_timesteps": 100000, # Total training steps + "total_timesteps": 500000, # Extended training (was 100K) "learning_rate": 3e-4, # Learning rate for optimizer "n_steps": 2048, # Steps per update "batch_size": 64, # Minibatch size @@ -15,20 +16,23 @@ "verbose": 1, # Logging verbosity } -# Reward structure - critical for agent learning +# Reward structure - optimized for self-play +# Key insight: reward scoring points, don't over-penalize turns REWARD_CONFIG: Dict[str, float] = { "win": 100.0, # Reward for winning "loss": -100.0, # Penalty for losing - "stalemate": 0.0, # No reward for draw - "progress_multiplier": 10.0, # Multiplier for score progress - "turn_penalty": -1.0, # Small penalty each turn - 
"invalid_action_penalty": -50.0, # Heavy penalty for illegal moves (safety check) + "stalemate": -50.0, # Penalty for stalemate (discourage draws) + "progress_multiplier": 2.0, # Reward for scoring points (balanced) + "turn_penalty": -0.01, # Keep small to not overwhelm learning + "invalid_action_penalty": -10.0, # Penalty for illegal moves (safety check) } # Environment configuration +from rl.action_mapping import ACTION_SPACE_SIZE + ENV_CONFIG: Dict[str, Any] = { - "max_actions": 50, # Max possible actions per turn - "observation_dim": 206, # State vector dimension (136+1+30+30+4+5) + "max_actions": ACTION_SPACE_SIZE, # Fixed action space size + "observation_dim": 610, # State vector dimension (136+1+180+180+4+5+52+52) "max_hand_size": 8, # Max cards in hand "max_field_size": 10, # Max cards on field } diff --git a/rl/cuttle_env.py b/rl/cuttle_env.py index fb5fcd4..36c146a 100644 --- a/rl/cuttle_env.py +++ b/rl/cuttle_env.py @@ -6,7 +6,9 @@ from game.card import Purpose from game.game import Game +from rl.action_mapping import build_action_map, card_index, legal_action_mask from rl.config import ENV_CONFIG, REWARD_CONFIG +from rl.game_logger import GameplayLogger class CuttleRLEnvironment(gym.Env): @@ -14,7 +16,7 @@ class CuttleRLEnvironment(gym.Env): metadata = {"render_modes": ["human"]} - def __init__(self): + def __init__(self, enable_logging: bool = False): super().__init__() # Define action and observation spaces @@ -30,7 +32,13 @@ def __init__(self): self.game: Optional[Game] = None self.current_player = 0 self.step_count = 0 - self.max_steps = 200 # Add timeout to prevent infinite loops + self.max_steps = 300 # Increased to allow games to conclude naturally + self.no_progress_steps = 0 + self.no_progress_limit = 60 # End early if no scoring progress + + # Logging + self.logger = GameplayLogger() if enable_logging else None + self.enable_logging = enable_logging def reset( self, @@ -44,6 +52,16 @@ def reset( self.game = Game(manual_selection=False, ai_player=None) self.current_player = 0 self.step_count = 0 + self.no_progress_steps = 0 + + # Reset score tracking for difference-based rewards + self._prev_score = 0 + self._prev_opponent_score = 0 + self._prev_total_score = 0 + + # Start logging if enabled + if self.logger: + self.logger.start_game(self.game) # Get initial observation observation = self._encode_state() @@ -62,14 +80,7 @@ def action_masks(self) -> np.ndarray: """ assert self.game is not None, "Must call reset() first" - # Get current legal actions - legal_actions = self.game.game_state.get_legal_actions() - - # Create mask: True for legal actions, False for illegal - mask = np.zeros(self.action_space.n, dtype=np.bool_) - mask[:len(legal_actions)] = True - - return mask + return legal_action_mask(self.game.game_state) def step( self, action: int @@ -81,9 +92,11 @@ def step( self.step_count += 1 if self.step_count > self.max_steps: print(f"⚠️ TIMEOUT: Game exceeded {self.max_steps} steps, forcing termination") + if self.logger: + self.logger.end_game(self.game, None, "timeout", self.step_count) return ( self._encode_state(), - -10.0, # Penalty for timeout + -50.0, # Strong penalty for timeout (same as stalemate) True, # done True, # truncated {"error": "timeout", "steps": self.step_count} @@ -91,11 +104,15 @@ def step( # Get current legal actions legal_actions = self.game.game_state.get_legal_actions() - + + # Decode fixed action index into a concrete legal action + action_map = build_action_map(legal_actions) + chosen_action = action_map.get(action) + # With action 
masking, invalid actions should never happen # but keep as safety check - if action >= len(legal_actions): - print(f"WARNING: Invalid action {action} attempted (max: {len(legal_actions)-1})") + if chosen_action is None: + print(f"WARNING: Invalid action {action} attempted (no matching legal action)") print("This should not happen with proper action masking!") return ( self._encode_state(), @@ -105,13 +122,33 @@ def step( {"error": "invalid_action"} ) - # Execute the chosen action - chosen_action = legal_actions[action] + # Log the action before execution + if self.logger: + self.logger.log_step( + self.step_count, + self.current_player, + chosen_action, + self.game, + 0.0, # Reward will be updated after + len(legal_actions) + ) + turn_finished, game_ended, winner = \ self.game.game_state.update_state(chosen_action) # Calculate reward reward = self._calculate_reward(game_ended, winner) + + # Track total score progress to detect stalls + total_score = ( + self.game.game_state.get_player_score(0) + + self.game.game_state.get_player_score(1) + ) + if total_score > getattr(self, "_prev_total_score", 0): + self.no_progress_steps = 0 + else: + self.no_progress_steps += 1 + self._prev_total_score = total_score # Update game state if turn finished if turn_finished: @@ -120,6 +157,27 @@ def step( # Check if episode is done done = game_ended or self.game.game_state.is_stalemate() + + # Early termination if the game is stuck with no progress + if not done and self.no_progress_steps >= self.no_progress_limit: + print( + f"⚠️ STALL: No scoring progress for " + f"{self.no_progress_limit} steps, ending episode" + ) + if self.logger: + self.logger.end_game(self.game, None, "stall", self.step_count) + return ( + self._encode_state(), + REWARD_CONFIG["stalemate"], + True, # done + True, # truncated + {"error": "stall", "steps": self.step_count} + ) + + # Log game end if done + if done and self.logger: + reason = "win" if winner is not None else "stalemate" + self.logger.end_game(self.game, winner, reason, self.step_count) # Get new observation observation = self._encode_state() @@ -148,25 +206,25 @@ def _encode_state(self) -> np.ndarray: obs[idx] = len(self.game.game_state.hands[opponent]) / 8.0 idx += 1 - # 3. Player 0 field cards (30 dims: 10 cards × 3 dims each) + # 3. Player 0 field cards (180 dims: 10 cards × 18 dims each) for i in range(ENV_CONFIG["max_field_size"]): field = self.game.game_state.get_player_field(0) if i < len(field): card = field[i] - obs[idx] = 1.0 - obs[idx + 1] = card.rank.value[1] / 13.0 - obs[idx + 2] = 1.0 if card.purpose == Purpose.POINTS else 0.0 - idx += 3 + obs[idx + card.suit.value[1]] = 1.0 + obs[idx + 4 + card.rank.value[1] - 1] = 1.0 + obs[idx + 17] = 1.0 if card.purpose == Purpose.POINTS else 0.0 + idx += 18 - # 4. Player 1 field cards (30 dims: same encoding) + # 4. Player 1 field cards (180 dims: same encoding) for i in range(ENV_CONFIG["max_field_size"]): field = self.game.game_state.get_player_field(1) if i < len(field): card = field[i] - obs[idx] = 1.0 - obs[idx + 1] = card.rank.value[1] / 13.0 - obs[idx + 2] = 1.0 if card.purpose == Purpose.POINTS else 0.0 - idx += 3 + obs[idx + card.suit.value[1]] = 1.0 + obs[idx + 4 + card.rank.value[1] - 1] = 1.0 + obs[idx + 17] = 1.0 if card.purpose == Purpose.POINTS else 0.0 + idx += 18 # 5. 
Scores and targets (4 dims) obs[idx] = self.game.game_state.get_player_score(0) / 21.0 @@ -181,11 +239,25 @@ def _encode_state(self) -> np.ndarray: obs[idx + 2] = 1.0 if self.game.game_state.resolving_three else 0.0 obs[idx + 3] = len(self.game.game_state.deck) / 52.0 obs[idx + 4] = len(self.game.game_state.discard_pile) / 52.0 + idx += 5 + + # 7. Discard pile identity (52 dims) + for card in self.game.game_state.discard_pile: + obs[idx + card_index(card)] = 1.0 + idx += 52 + + # 8. Revealed cards for seven (52 dims) + for card in self.game.game_state.pending_seven_cards: + obs[idx + card_index(card)] = 1.0 + idx += 52 return obs def _calculate_reward(self, game_ended: bool, winner: Optional[int]) -> float: - """Calculate reward for the current state.""" + """Calculate reward for the current state. + + Simple reward structure focused on scoring points and winning. + """ if game_ended: if winner == self.current_player: return REWARD_CONFIG["win"] @@ -194,16 +266,19 @@ def _calculate_reward(self, game_ended: bool, winner: Optional[int]) -> float: else: return REWARD_CONFIG["stalemate"] - # Intermediate reward: progress toward target + # Only reward our own score gains (simpler, less noisy) current_score = self.game.game_state.get_player_score(self.current_player) - target = self.game.game_state.get_player_target(self.current_player) + prev_score = getattr(self, '_prev_score', 0) - if target > 0: - progress = current_score / target - return (progress * REWARD_CONFIG["progress_multiplier"] + - REWARD_CONFIG["turn_penalty"]) - else: - return REWARD_CONFIG["turn_penalty"] + score_gain = current_score - prev_score + self._prev_score = current_score + + # Small reward for scoring points + if score_gain > 0: + return score_gain * REWARD_CONFIG["progress_multiplier"] + + # Minimal turn penalty otherwise + return REWARD_CONFIG["turn_penalty"] def _get_info(self) -> Dict[str, Any]: """Get additional information about game state.""" diff --git a/rl/debug_gameplay.py b/rl/debug_gameplay.py new file mode 100644 index 0000000..141fee2 --- /dev/null +++ b/rl/debug_gameplay.py @@ -0,0 +1,97 @@ +"""Debug script to analyze RL gameplay with detailed logging.""" +from __future__ import annotations + +import sys +from pathlib import Path + +# Add parent directory to path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from sb3_contrib import MaskablePPO +from sb3_contrib.common.wrappers import ActionMasker +from rl.cuttle_env import CuttleRLEnvironment +from rl.self_play_env import SelfPlayWrapper + + +def mask_fn(env): + """Function that returns action mask for MaskablePPO.""" + # Unwrap to get to the actual environment with action_masks method + while hasattr(env, 'env'): + if hasattr(env, 'action_masks'): + return env.action_masks() + env = env.env + return env.action_masks() + + +def run_debug_games(num_games: int = 10, model_path: str = "rl/models/best_model.zip"): + """Run games with detailed logging for debugging. 
+ + Args: + num_games: Number of games to play and log + model_path: Path to trained model (or None to use random actions) + """ + print(f"\n{'='*60}") + print("DEBUGGING RL GAMEPLAY") + print(f"{'='*60}\n") + + # Create environment with logging enabled + base_env = CuttleRLEnvironment(enable_logging=True) + env = SelfPlayWrapper(base_env) + env = ActionMasker(env, mask_fn) # Critical: wrap with ActionMasker + + # Load model if available + try: + model = MaskablePPO.load(model_path) + print(f"✅ Loaded model from: {model_path}\n") + use_model = True + except Exception as e: + print(f"⚠️ Could not load model: {e}") + print("Using random actions instead\n") + use_model = False + + # Run games + for game_num in range(num_games): + print(f"Playing game {game_num + 1}/{num_games}...", end=" ") + + obs, info = env.reset() + done = False + step_count = 0 + + while not done and step_count < 200: + action_masks = env.action_masks() + + if use_model: + action, _ = model.predict(obs, action_masks=action_masks, deterministic=False) + else: + # Random action from legal actions + import numpy as np + legal_actions = np.where(action_masks)[0] + action = np.random.choice(legal_actions) if len(legal_actions) > 0 else 0 + + obs, reward, done, truncated, info = env.step(action) + step_count += 1 + + if done or truncated: + break + + print(f"Finished in {step_count} steps") + + # Generate summary + if base_env.logger: + base_env.logger.generate_summary() + + print("\n💡 TIP: Check the JSON logs in rl/gameplay_logs/ for detailed analysis") + print(" Each file contains step-by-step actions, game state, and outcomes\n") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Debug RL gameplay with detailed logs") + parser.add_argument("--games", type=int, default=10, help="Number of games to play") + parser.add_argument("--model", type=str, default="rl/models/best_model.zip", + help="Path to model") + + args = parser.parse_args() + + run_debug_games(num_games=args.games, model_path=args.model) diff --git a/rl/evaluate.py b/rl/evaluate.py index e451919..bb05bb5 100644 --- a/rl/evaluate.py +++ b/rl/evaluate.py @@ -1,36 +1,107 @@ """Evaluate trained RL agent with action masking.""" +import json import os -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import numpy as np from sb3_contrib import MaskablePPO -from rl.config import MODEL_DIR +from rl.action_mapping import action_index_to_action +from rl.config import LOG_DIR, MODEL_DIR from rl.cuttle_env import CuttleRLEnvironment +def _snapshot_game_state(env: CuttleRLEnvironment) -> Dict[str, Any]: + """Capture a compact, readable snapshot of the current game state.""" + if not env.game: + return {} + state = env.game.game_state + return { + "turn": state.turn, + "current_action_player": state.current_action_player, + "overall_turn": state.overall_turn, + "scores": { + "player_0": state.get_player_score(0), + "player_1": state.get_player_score(1), + }, + "targets": { + "player_0": state.get_player_target(0), + "player_1": state.get_player_target(1), + }, + "hands": [ + [str(card) for card in state.hands[0]], + [str(card) for card in state.hands[1]], + ], + "fields": [ + [str(card) for card in state.get_player_field(0)], + [str(card) for card in state.get_player_field(1)], + ], + "deck_count": len(state.deck), + "discard_pile": [str(card) for card in state.discard_pile], + "resolving_one_off": state.resolving_one_off, + "resolving_three": state.resolving_three, + "resolving_seven": 
state.resolving_seven, + "pending_three_player": state.pending_three_player, + "pending_four_player": state.pending_four_player, + "pending_four_count": state.pending_four_count, + "pending_seven_requires_discard": state.pending_seven_requires_discard, + } + + def play_episode( model: MaskablePPO, env: CuttleRLEnvironment, - deterministic: bool = True -) -> Tuple[float, int, Optional[int]]: + deterministic: bool = True, + record: bool = False, +) -> Tuple[float, int, Optional[int], Optional[Dict[str, Any]]]: """Play one episode with action masking.""" obs, info = env.reset() done = False episode_reward = 0.0 steps = 0 + trace: Optional[Dict[str, Any]] = {"steps": []} if record else None while not done: # Agent's turn with action mask action_mask = env.action_masks() + obs_before = obs action, _ = model.predict( - obs, + obs, action_masks=action_mask, # Pass mask to model deterministic=deterministic ) + if env.game: + action_obj = action_index_to_action(env.game.game_state, int(action)) + legal_actions = env.game.game_state.get_legal_actions() + else: + action_obj = None + legal_actions = [] + state_before = _snapshot_game_state(env) if record else None + obs, reward, done, truncated, info = env.step(action) + state_after = _snapshot_game_state(env) if record else None episode_reward += reward steps += 1 + + if trace is not None: + trace["steps"].append( + { + "actor": "agent", + "step": steps, + "obs": obs_before.tolist(), + "next_obs": obs.tolist(), + "action_index": int(action), + "action": str(action_obj) if action_obj else None, + "legal_actions": [str(a) for a in legal_actions], + "action_mask": action_mask.astype(int).tolist(), + "reward": float(reward), + "done": bool(done), + "truncated": bool(truncated), + "info": info, + "state_before": state_before, + "state_after": state_after, + } + ) if done: break @@ -40,17 +111,59 @@ def play_episode( legal_indices = np.where(opponent_mask)[0] if len(legal_indices) > 0: opp_action = np.random.choice(legal_indices) + obs_before = obs + if env.game: + opp_action_obj = action_index_to_action(env.game.game_state, int(opp_action)) + opp_legal_actions = env.game.game_state.get_legal_actions() + else: + opp_action_obj = None + opp_legal_actions = [] + state_before = _snapshot_game_state(env) if record else None + obs, opp_reward, done, truncated, info = env.step(opp_action) + state_after = _snapshot_game_state(env) if record else None episode_reward -= opp_reward steps += 1 + + if trace is not None: + trace["steps"].append( + { + "actor": "opponent", + "step": steps, + "obs": obs_before.tolist(), + "next_obs": obs.tolist(), + "action_index": int(opp_action), + "action": str(opp_action_obj) if opp_action_obj else None, + "legal_actions": [str(a) for a in opp_legal_actions], + "action_mask": opponent_mask.astype(int).tolist(), + "reward": float(opp_reward), + "done": bool(done), + "truncated": bool(truncated), + "info": info, + "state_before": state_before, + "state_after": state_after, + } + ) # Get winner winner = env.game.game_state.winner() if env.game else None - return episode_reward, steps, winner + if trace is not None: + trace["summary"] = { + "episode_reward": float(episode_reward), + "steps": steps, + "winner": winner, + "deterministic": deterministic, + } + + return episode_reward, steps, winner, trace -def evaluate_agent(model_path: str, n_episodes: int = 100): +def evaluate_agent( + model_path: str, + n_episodes: int = 100, + record_path: Optional[str] = None, +): """Evaluate agent over multiple episodes.""" print(f"Loading MaskablePPO 
model from: {model_path}") model = MaskablePPO.load(model_path) @@ -71,7 +184,10 @@ def evaluate_agent(model_path: str, n_episodes: int = 100): if (episode + 1) % 10 == 0: print(f" Episode {episode + 1}/{n_episodes}") - episode_reward, steps, winner = play_episode(model, env, deterministic=True) + record = record_path is not None and episode == 0 + episode_reward, steps, winner, trace = play_episode( + model, env, deterministic=True, record=record + ) # Record results total_rewards.append(episode_reward) @@ -84,6 +200,12 @@ def evaluate_agent(model_path: str, n_episodes: int = 100): losses += 1 else: stalemates += 1 + + if record and trace is not None: + os.makedirs(os.path.dirname(record_path), exist_ok=True) + with open(record_path, "w", encoding="utf-8") as handle: + json.dump(trace, handle, indent=2) + print(f"Saved episode trace to: {record_path}") # Print results print("\n" + "=" * 50) @@ -101,13 +223,14 @@ def evaluate_agent(model_path: str, n_episodes: int = 100): def main(): """Main evaluation function.""" model_path = os.path.join(MODEL_DIR, "cuttle_rl_final") + record_path = os.path.join(LOG_DIR, "eval_rollout.json") if not os.path.exists(model_path + ".zip"): print(f"ERROR: Model not found at {model_path}.zip") print("Please train a model first using: make train-rl") return - evaluate_agent(model_path, n_episodes=100) + evaluate_agent(model_path, n_episodes=100, record_path=record_path) if __name__ == "__main__": diff --git a/rl/game_logger.py b/rl/game_logger.py new file mode 100644 index 0000000..834acb9 --- /dev/null +++ b/rl/game_logger.py @@ -0,0 +1,185 @@ +"""Logger for detailed RL gameplay analysis.""" +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +from game.action import Action +from game.card import Card +from game.game import Game + + +class GameplayLogger: + """Logs detailed gameplay information for debugging RL agents.""" + + def __init__(self, log_dir: str = "rl/gameplay_logs"): + """Initialize logger. 
+ + Args: + log_dir: Directory to save logs + """ + self.log_dir = Path(log_dir) + self.log_dir.mkdir(parents=True, exist_ok=True) + self.current_game: Optional[Dict[str, Any]] = None + self.games_logged = 0 + self.max_games_per_session = 10 # Only log first 10 games per training + + def start_game(self, game: Game) -> None: + """Start logging a new game.""" + if self.games_logged >= self.max_games_per_session: + return # Don't log more than max games + + self.current_game = { + "game_id": self.games_logged, + "start_time": datetime.now().isoformat(), + "steps": [], + "outcome": None, + "step_count": 0, + } + + def log_step( + self, + step_num: int, + player: int, + action: Action, + game: Game, + reward: float, + legal_action_count: int, + ) -> None: + """Log a single step of gameplay.""" + if self.current_game is None: + return + + step_info = { + "step": step_num, + "player": player, + "action": { + "type": action.action_type.name if hasattr(action.action_type, 'name') else str(action.action_type), + "card": self._card_to_dict(action.card) if action.card else None, + "target": self._card_to_dict(action.target) if action.target else None, + }, + "reward": float(reward), + "legal_actions_count": legal_action_count, + "state": self._get_game_state_snapshot(game, player), + } + + self.current_game["steps"].append(step_info) + self.current_game["step_count"] = step_num + + def end_game( + self, + game: Game, + winner: Optional[int], + reason: str, + step_count: int, + ) -> None: + """End current game and save log.""" + if self.current_game is None: + return + + self.current_game["outcome"] = { + "winner": winner, + "reason": reason, + "total_steps": step_count, + "final_scores": { + "player_0": game.game_state.get_player_score(0), + "player_1": game.game_state.get_player_score(1), + }, + "final_targets": { + "player_0": game.game_state.get_player_target(0), + "player_1": game.game_state.get_player_target(1), + }, + } + + # Save to file + filename = f"game_{self.games_logged:03d}_{reason}.json" + filepath = self.log_dir / filename + + with open(filepath, "w") as f: + json.dump(self.current_game, f, indent=2) + + print(f"📝 Saved gameplay log: {filepath}") + self.games_logged += 1 + self.current_game = None + + def _card_to_dict(self, card: Card) -> Dict[str, Any]: + """Convert card to dictionary.""" + return { + "rank": card.rank.name, + "suit": card.suit.name, + "display": str(card), + } + + def _get_game_state_snapshot(self, game: Game, current_player: int) -> Dict[str, Any]: + """Get snapshot of current game state.""" + return { + "current_player": current_player, + "scores": { + "player_0": game.game_state.get_player_score(0), + "player_1": game.game_state.get_player_score(1), + }, + "hand_sizes": { + "player_0": len(game.game_state.hands[0]), + "player_1": len(game.game_state.hands[1]), + }, + "field_cards": { + "player_0": [self._card_to_dict(c) for c in game.game_state.get_player_field(0)], + "player_1": [self._card_to_dict(c) for c in game.game_state.get_player_field(1)], + }, + "deck_size": len(game.game_state.deck), + "discard_size": len(game.game_state.discard_pile), + "resolving_one_off": game.game_state.resolving_one_off, + "resolving_three": game.game_state.resolving_three, + } + + def generate_summary(self) -> None: + """Generate a summary of all logged games.""" + if self.games_logged == 0: + print("No games logged yet.") + return + + summary = { + "total_games": self.games_logged, + "outcomes": {}, + "avg_steps": 0, + "timeout_rate": 0, + } + + total_steps = 0 + timeouts = 0 + 
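+        # Aggregation note: each saved game_XXX_<reason>.json written by
+        # end_game() is re-read below; the filename suffix is only used to
+        # locate the file, while the counted outcome comes from the "reason"
+        # field stored inside the JSON itself.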
+ for i in range(self.games_logged): + for reason in ["timeout", "win", "stalemate"]: + filepath = self.log_dir / f"game_{i:03d}_{reason}.json" + if filepath.exists(): + with open(filepath, "r") as f: + game_data = json.load(f) + reason = game_data["outcome"]["reason"] + summary["outcomes"][reason] = summary["outcomes"].get(reason, 0) + 1 + total_steps += game_data["outcome"]["total_steps"] + if reason == "timeout": + timeouts += 1 + break + + if self.games_logged > 0: + summary["avg_steps"] = total_steps / self.games_logged + summary["timeout_rate"] = timeouts / self.games_logged + + summary_path = self.log_dir / "summary.json" + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + + print(f"\n{'='*60}") + print("GAMEPLAY SUMMARY") + print(f"{'='*60}") + print(f"Total games logged: {summary['total_games']}") + print(f"Average steps per game: {summary['avg_steps']:.1f}") + print(f"Timeout rate: {summary['timeout_rate']*100:.1f}%") + print("\nOutcomes:") + for outcome, count in summary["outcomes"].items(): + print(f" {outcome}: {count} ({count/self.games_logged*100:.1f}%)") + print(f"{'='*60}\n") + print(f"📁 Logs saved to: {self.log_dir.absolute()}") + print(f"{'='*60}\n") diff --git a/rl/hyperparameter_search.py b/rl/hyperparameter_search.py new file mode 100644 index 0000000..87277d6 --- /dev/null +++ b/rl/hyperparameter_search.py @@ -0,0 +1,324 @@ +"""Hyperparameter search for RL training.""" +from __future__ import annotations + +import json +import sys +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from sb3_contrib import MaskablePPO +from sb3_contrib.common.wrappers import ActionMasker +from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback +from stable_baselines3.common.monitor import Monitor + +from rl.cuttle_env import CuttleRLEnvironment +from rl.self_play_env import SelfPlayWrapper + + +def mask_fn(env): + """Function that returns action mask for MaskablePPO.""" + # Unwrap to get to the actual environment with action_masks method + while hasattr(env, 'env'): + if hasattr(env, 'action_masks'): + return env.action_masks() + env = env.env + return env.action_masks() + + +# Define hyperparameter configurations to test +EXPERIMENT_CONFIGS: List[Dict[str, Any]] = [ + { + "name": "baseline", + "description": "Current baseline configuration", + "training": { + "total_timesteps": 200_000, + "n_steps": 2048, + "batch_size": 64, + "learning_rate": 3e-4, + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 0.1, + "turn_penalty": -0.01, + }, + }, + { + "name": "high_progress_reward", + "description": "Emphasize progress toward winning", + "training": { + "total_timesteps": 200_000, + "n_steps": 2048, + "batch_size": 64, + "learning_rate": 3e-4, + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 10.0, # 100x increase + "turn_penalty": -0.5, # Penalize longer games + }, + }, + { + "name": "fast_learning", + "description": "Higher learning rate for faster initial learning", + "training": { + "total_timesteps": 200_000, + "n_steps": 1024, # Smaller steps + "batch_size": 128, # Larger batches + "learning_rate": 1e-3, # Higher LR + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 5.0, + "turn_penalty": 
-0.2, + }, + }, + { + "name": "conservative", + "description": "Lower LR, larger batches for stable learning", + "training": { + "total_timesteps": 200_000, + "n_steps": 4096, # Larger steps + "batch_size": 32, # Smaller batches + "learning_rate": 1e-4, # Lower LR + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 3.0, + "turn_penalty": -0.1, + }, + }, + { + "name": "aggressive_scoring", + "description": "Heavy emphasis on scoring points", + "training": { + "total_timesteps": 200_000, + "n_steps": 2048, + "batch_size": 64, + "learning_rate": 3e-4, + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 20.0, # Very high + "turn_penalty": -1.0, # Strong penalty for long games + }, + }, +] + + +def run_experiment(config: Dict[str, Any], experiment_dir: Path) -> Dict[str, Any]: + """Run a single experiment with given configuration. + + Args: + config: Experiment configuration + experiment_dir: Directory to save experiment results + + Returns: + Dictionary with experiment results + """ + exp_name = config["name"] + print(f"\n{'='*70}") + print(f"EXPERIMENT: {exp_name}") + print(f"Description: {config['description']}") + print(f"{'='*70}\n") + + # Create experiment directory + exp_path = experiment_dir / exp_name + exp_path.mkdir(parents=True, exist_ok=True) + + # Save config + with open(exp_path / "config.json", "w") as f: + json.dump(config, f, indent=2) + + # Apply reward config (monkey patch for this experiment) + import rl.config as rl_config + for key, value in config["reward"].items(): + rl_config.REWARD_CONFIG[key] = value + + # Create environments with action masking + train_env = SelfPlayWrapper(CuttleRLEnvironment()) + train_env = Monitor(train_env, str(exp_path / "train")) + train_env = ActionMasker(train_env, mask_fn) # Critical: wrap with ActionMasker + + eval_env = SelfPlayWrapper(CuttleRLEnvironment()) + eval_env = Monitor(eval_env, str(exp_path / "eval")) + eval_env = ActionMasker(eval_env, mask_fn) # Critical: wrap with ActionMasker + + # Training parameters + training_config = config["training"] + + # Create model + model = MaskablePPO( + "MlpPolicy", + train_env, + n_steps=training_config["n_steps"], + batch_size=training_config["batch_size"], + learning_rate=training_config["learning_rate"], + verbose=1, + tensorboard_log=str(exp_path / "tensorboard"), + ) + + # Callbacks + checkpoint_callback = CheckpointCallback( + save_freq=10_000, + save_path=str(exp_path / "checkpoints"), + name_prefix=f"{exp_name}_model", + ) + + eval_callback = EvalCallback( + eval_env, + best_model_save_path=str(exp_path / "best_model"), + log_path=str(exp_path / "eval_logs"), + eval_freq=5_000, + deterministic=True, + render=False, + n_eval_episodes=10, + ) + + # Train + start_time = datetime.now() + print(f"Training started at {start_time.isoformat()}\n") + + model.learn( + total_timesteps=training_config["total_timesteps"], + callback=[checkpoint_callback, eval_callback], + progress_bar=True, + ) + + end_time = datetime.now() + duration = (end_time - start_time).total_seconds() + + print(f"\nTraining completed in {duration:.1f} seconds ({duration/60:.1f} minutes)") + + # Save final model + model.save(exp_path / "final_model") + + # Collect results + results = { + "name": exp_name, + "config": config, + "start_time": start_time.isoformat(), + "end_time": end_time.isoformat(), + "duration_seconds": duration, + "model_path": str(exp_path / 
"final_model.zip"), + "best_model_path": str(exp_path / "best_model" / "best_model.zip"), + } + + # Save results + with open(exp_path / "results.json", "w") as f: + json.dump(results, f, indent=2) + + # Cleanup + train_env.close() + eval_env.close() + + return results + + +def run_all_experiments( + configs: List[Dict[str, Any]], + base_dir: str = "rl/experiments", +) -> None: + """Run all experiments and save results. + + Args: + configs: List of experiment configurations + base_dir: Base directory for all experiments + """ + experiment_dir = Path(base_dir) / datetime.now().strftime("%Y%m%d_%H%M%S") + experiment_dir.mkdir(parents=True, exist_ok=True) + + print(f"\n{'='*70}") + print("HYPERPARAMETER SEARCH") + print(f"{'='*70}") + print(f"Running {len(configs)} experiments") + print(f"Results will be saved to: {experiment_dir.absolute()}") + print(f"{'='*70}\n") + + all_results = [] + + for i, config in enumerate(configs): + print(f"\nExperiment {i+1}/{len(configs)}") + try: + results = run_experiment(config, experiment_dir) + all_results.append(results) + except Exception as e: + print(f"❌ Experiment {config['name']} failed: {e}") + import traceback + traceback.print_exc() + continue + + # Save summary + summary = { + "timestamp": datetime.now().isoformat(), + "total_experiments": len(configs), + "successful_experiments": len(all_results), + "experiments": all_results, + } + + with open(experiment_dir / "summary.json", "w") as f: + json.dump(summary, f, indent=2) + + print(f"\n{'='*70}") + print("ALL EXPERIMENTS COMPLETED") + print(f"{'='*70}") + print(f"Results saved to: {experiment_dir.absolute()}") + print(f"Successful: {len(all_results)}/{len(configs)}") + print(f"{'='*70}\n") + + print("Next steps:") + print(f" 1. Compare results: python rl/compare_experiments.py {experiment_dir}") + print(f" 2. View tensorboard: tensorboard --logdir {experiment_dir}") + print(f" 3. 
Test best model: make debug-rl --model \n")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run hyperparameter search")
+    parser.add_argument(
+        "--quick",
+        action="store_true",
+        help="Run quick experiments (50K timesteps each)",
+    )
+    parser.add_argument(
+        "--configs",
+        nargs="+",
+        help="Run only specific configs by name",
+    )
+
+    args = parser.parse_args()
+
+    # Filter configs if specified
+    configs = EXPERIMENT_CONFIGS
+    if args.configs:
+        configs = [c for c in configs if c["name"] in args.configs]
+        if not configs:
+            print(f"❌ No configs found matching: {args.configs}")
+            print(f"Available: {[c['name'] for c in EXPERIMENT_CONFIGS]}")
+            sys.exit(1)
+
+    # Reduce timesteps for quick mode
+    if args.quick:
+        print("🚀 Quick mode: Using 50K timesteps per experiment\n")
+        for config in configs:
+            config["training"]["total_timesteps"] = 50_000
+
+    run_all_experiments(configs)
diff --git a/rl/models/cuttle_rl_final.zip b/rl/models/cuttle_rl_final.zip
index d45786d..1eca565 100644
Binary files a/rl/models/cuttle_rl_final.zip and b/rl/models/cuttle_rl_final.zip differ
diff --git a/rl/monitor.py b/rl/monitor.py
new file mode 100644
index 0000000..f0efe1e
--- /dev/null
+++ b/rl/monitor.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+"""Simple console monitor for RL training progress."""
+import re
+import sys
+import time
+import os
+import glob
+
+MONITOR_CSV = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logs", "monitor.csv")
+
+def find_latest_log() -> str:
+    """Find the most recent training log file."""
+    # Prefer explicit current log if present
+    current_log = "/tmp/train_current.log"
+    if os.path.exists(current_log) and os.path.getsize(current_log) > 100:
+        return current_log
+
+    # Use glob to find all training logs
+    all_logs = list(set(glob.glob("/tmp/train*.log")))
+
+    # Filter to logs with content and return most recently modified
+    valid = [
+        (path, os.path.getmtime(path))
+        for path in all_logs
+        if os.path.exists(path) and os.path.getsize(path) > 100
+    ]
+    if valid:
+        return max(valid, key=lambda entry: entry[1])[0]
+
+    # Fallback
+    return current_log
+
+
+def _read_monitor_episode_stats() -> dict:
+    """Read episode count and average length from Monitor CSV."""
+    if not os.path.exists(MONITOR_CSV):
+        return {"episodes": 0, "avg_len": 0.0}
+
+    with open(MONITOR_CSV, "r") as monitor_file:
+        lines = [line.strip() for line in monitor_file if line.strip()]
+
+    if not lines:
+        return {"episodes": 0, "avg_len": 0.0}
+
+    # Find last header line to avoid mixing runs
+    last_header_idx = 0
+    for idx, line in enumerate(lines):
+        if line.startswith("#"):
+            last_header_idx = idx
+
+    data_lines = lines[last_header_idx + 2 :]  # skip header + column line
+    lengths = []
+    for line in data_lines:
+        parts = line.split(",")
+        if len(parts) >= 2:
+            try:
+                lengths.append(float(parts[1]))
+            except ValueError:
+                continue
+
+    if not lengths:
+        return {"episodes": 0, "avg_len": 0.0}
+
+    return {"episodes": len(lengths), "avg_len": sum(lengths) / len(lengths)}
+
+def monitor(log_file: str | None = None, refresh: bool = False) -> None:
+    """Monitor training progress."""
+    if log_file is None:
+        log_file = find_latest_log()
+
+    while True:
+        if not os.path.exists(log_file):
+            print(f"Log file not found: {log_file}")
+            return
+
+        with open(log_file, 'r') as f:
+            content = f.read()
+
+        # Parse metrics
+        timesteps = re.findall(r'total_timesteps\s+\|\s+(\d+)', content)
+        ep_rew = re.findall(r'ep_rew_mean\s+\|\s+([-\d.]+)', content)
+        ep_len =
re.findall(r'ep_len_mean\s+\|\s+([\d.]+)', content) + fps_vals = re.findall(r'fps\s+\|\s+(\d+)', content) + time_elapsed = re.findall(r'time_elapsed\s+\|\s+(\d+)', content) + + total = 500000 + + # Clear screen if refreshing + if refresh: + print("\033[2J\033[H", end="") + + print("=" * 60) + print("🎮 CUTTLE RL TRAINING MONITOR") + print(f" Log: {os.path.basename(log_file)}") + print("=" * 60) + + if timesteps: + latest = int(timesteps[-1]) + pct = (latest / total) * 100 + bar = '█' * int(pct / 2) + '░' * (50 - int(pct / 2)) + + print(f"\n📊 Progress: {latest:,} / {total:,} ({pct:.1f}%)") + print(f" [{bar}]") + + if ep_rew: + current = float(ep_rew[-1]) + print(f"\n📈 Reward: {current:.2f}", end="") + if len(ep_rew) >= 5: + early = sum(float(r) for r in ep_rew[:5]) / 5 + recent = sum(float(r) for r in ep_rew[-5:]) / 5 + change = recent - early + print(f" (trend: {'+' if change > 0 else ''}{change:.2f})") + else: + print() + + if ep_len: + print(f"🎲 Episode Length: {float(ep_len[-1]):.1f} steps") + + if fps_vals and timesteps: + fps = int(fps_vals[-1]) + remaining = total - int(timesteps[-1]) + eta_min = (remaining / fps) / 60 if fps > 0 else 0 + print(f"⏱️ Speed: {fps:,}/s | ETA: {eta_min:.1f} min") + + # Calculate timeout stats + timeouts = content.count("TIMEOUT") + stalls = content.count("STALL") + invalid = content.count("Invalid action") + episode_stats = _read_monitor_episode_stats() + episodes = episode_stats["episodes"] + + if episodes > 0: + timeout_pct = (timeouts / episodes * 100) + stall_pct = (stalls / episodes * 100) + non_finish_pct = ((timeouts + stalls) / episodes * 100) + avg_len = episode_stats["avg_len"] + print(f"\n⚠️ Timeouts: {timeouts} / {episodes} games ({timeout_pct:.1f}%)") + if stalls > 0: + print(f"⚠️ Stalls: {stalls} / {episodes} games ({stall_pct:.1f}%)") + print(f"⚠️ Non-finish rate: {non_finish_pct:.1f}%") + print(f" Avg episode length: {avg_len:.1f} steps") + + # Visual indicator + if non_finish_pct > 90: + print(" Status: 🔴 Most games end without a winner") + elif non_finish_pct > 50: + print(" Status: 🟡 Many games stall/timeout") + elif non_finish_pct > 20: + print(" Status: 🟢 Good progress (agent winning more)") + else: + print(" Status: ✅ Excellent (agent wins most games)") + else: + print(f"\n⚠️ Timeouts: {timeouts}") + + if invalid > 0: + print(f"❌ Invalid actions: {invalid}") + + # Self-play info + model_probs = re.findall(r'opponent model prob = ([\d.]+)%', content) + if model_probs: + current_prob = float(model_probs[-1]) + print(f"\n🤖 Self-Play: {current_prob:.0f}% model opponent") + elif "Self-play initialized" in content: + print(f"\n🤖 Self-Play: Starting (0% model, 100% random)") + + print("=" * 60) + + if not refresh: + break + + time.sleep(5) + + +if __name__ == "__main__": + refresh = "--watch" in sys.argv or "-w" in sys.argv + monitor(refresh=refresh) diff --git a/rl/self_play_env.py b/rl/self_play_env.py index c9a8c36..94c0543 100644 --- a/rl/self_play_env.py +++ b/rl/self_play_env.py @@ -1,5 +1,5 @@ -"""Self-play wrapper with action masking support.""" -from typing import Any, Dict, Tuple +"""Self-play wrapper with action masking support and model-based opponent.""" +from typing import Any, Dict, Optional, Tuple import gymnasium as gym import numpy as np @@ -8,16 +8,61 @@ class SelfPlayWrapper(gym.Wrapper): - """Wrapper that enables self-play training with action masking.""" + """Wrapper that enables self-play training with action masking. 
- def __init__(self, env: CuttleRLEnvironment): - super().__init__(env) - self.opponent_policy = "random" # Strategy: "random" or future: "model" + Supports two opponent modes: + - "random": Opponent chooses randomly from legal actions (default for early training) + - "model": Opponent uses the trained model (true self-play) + """ + def __init__( + self, + env: CuttleRLEnvironment, + opponent_type: str = "random", + ): + super().__init__(env) + self.opponent_type = opponent_type + self._opponent_model = None + self._update_freq = 1000 # Update opponent model every N steps + self._steps_since_update = 0 + + def set_opponent_model(self, model) -> None: + """Set the model to use for opponent actions. + + Args: + model: A trained MaskablePPO model (or compatible) + """ + self._opponent_model = model + self.opponent_type = "model" + def action_masks(self) -> np.ndarray: """Forward action masks from wrapped environment.""" return self.env.action_masks() + def _get_opponent_action(self, mask: np.ndarray) -> int: + """Get opponent's action based on opponent_type.""" + legal_indices = np.where(mask)[0] + + if len(legal_indices) == 0: + return 0 # Fallback (shouldn't happen with proper masking) + + if self.opponent_type == "model" and self._opponent_model is not None: + # Use the model to predict action + obs = self.env._encode_state() + try: + action, _ = self._opponent_model.predict( + obs, + deterministic=False, # Add some exploration + action_masks=mask, + ) + return int(action) + except Exception: + # Fallback to random if prediction fails + return int(np.random.choice(legal_indices)) + else: + # Random opponent (default) + return int(np.random.choice(legal_indices)) + def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict[str, Any]]: """Execute agent's action, then opponent's action (both use masking).""" # Agent's move @@ -28,14 +73,73 @@ def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict[str, An # Opponent's turn with action masking opponent_mask = self.env.action_masks() - opponent_legal_indices = np.where(opponent_mask)[0] + opponent_action = self._get_opponent_action(opponent_mask) + + obs, opp_reward, done, truncated, info = self.env.step(opponent_action) - if len(opponent_legal_indices) > 0: - # Random opponent chooses from legal actions only - opponent_action = np.random.choice(opponent_legal_indices) - obs, opp_reward, done, truncated, info = self.env.step(opponent_action) - - # Flip reward: opponent's loss is agent's gain - reward = -opp_reward + # Flip reward: opponent's loss is agent's gain + reward = -opp_reward + + self._steps_since_update += 1 return obs, reward, done, truncated, info + + +class AdaptiveSelfPlayWrapper(SelfPlayWrapper): + """Self-play wrapper that gradually transitions from random to model opponent. + + Starts with random opponent and progressively increases model usage + based on training progress. 
+ """ + + def __init__( + self, + env: CuttleRLEnvironment, + model_prob_start: float = 0.0, + model_prob_end: float = 0.8, + transition_steps: int = 100000, + ): + super().__init__(env, opponent_type="adaptive") + self.model_prob_start = model_prob_start + self.model_prob_end = model_prob_end + self.transition_steps = transition_steps + self._total_steps = 0 + + def _get_model_probability(self) -> float: + """Get current probability of using model opponent.""" + if self._opponent_model is None: + return 0.0 + progress = min(1.0, self._total_steps / self.transition_steps) + return self.model_prob_start + progress * (self.model_prob_end - self.model_prob_start) + + def _get_opponent_action(self, mask: np.ndarray) -> int: + """Get opponent action, mixing random and model based on progress.""" + legal_indices = np.where(mask)[0] + + if len(legal_indices) == 0: + return 0 + + # Decide whether to use model or random + use_model = ( + self._opponent_model is not None + and np.random.random() < self._get_model_probability() + ) + + if use_model: + obs = self.env._encode_state() + try: + action, _ = self._opponent_model.predict( + obs, + deterministic=False, + action_masks=mask, + ) + return int(action) + except Exception: + return int(np.random.choice(legal_indices)) + else: + return int(np.random.choice(legal_indices)) + + def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict[str, Any]]: + """Execute step and track total steps for adaptive scheduling.""" + self._total_steps += 1 + return super().step(action) diff --git a/rl/train.py b/rl/train.py index e0e3fbf..ee2095e 100644 --- a/rl/train.py +++ b/rl/train.py @@ -1,26 +1,208 @@ -"""Train RL agent for Cuttle game using MaskablePPO.""" +"""Train RL agent for Cuttle game using MaskablePPO with true self-play.""" +from __future__ import annotations import os +import numpy as np +import torch + from sb3_contrib import MaskablePPO -from stable_baselines3.common.callbacks import CheckpointCallback +from sb3_contrib.common.wrappers import ActionMasker +from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback +from stable_baselines3.common.env_util import make_vec_env +from stable_baselines3.common.logger import TensorBoardOutputFormat from stable_baselines3.common.monitor import Monitor from rl.config import LOG_DIR, MODEL_DIR, TRAINING_CONFIG from rl.cuttle_env import CuttleRLEnvironment -from rl.self_play_env import SelfPlayWrapper +from rl.self_play_env import AdaptiveSelfPlayWrapper + + +class ActivationLogger: + """Capture policy activations for TensorBoard logging.""" + + def __init__(self, policy: torch.nn.Module) -> None: + self._policy = policy + self._activations: dict[str, torch.Tensor] = {} + self._handles = [] + self._register_hooks() + + def _register_hooks(self) -> None: + for name, module in self._policy.named_modules(): + if isinstance(module, torch.nn.Linear): + handle = module.register_forward_hook(self._make_hook(name)) + self._handles.append(handle) + + def _make_hook(self, name: str): + def hook(_module, _inputs, output): + self._activations[name] = output.detach().cpu() + + return hook + + def clear(self) -> None: + self._activations.clear() + + def get(self) -> dict[str, torch.Tensor]: + return self._activations + + def close(self) -> None: + for handle in self._handles: + handle.remove() + self._handles.clear() + + +class DiagnosticsCallback(BaseCallback): + """Log action stats, masks, and activations to TensorBoard.""" + + def __init__(self, log_freq: int = 1000, activation_freq: int 
= 5000) -> None: + super().__init__() + self.log_freq = log_freq + self.activation_freq = activation_freq + self._tb_writer = None + self._activation_logger: ActivationLogger | None = None + + def _on_training_start(self) -> None: + for fmt in self.logger.output_formats: + if isinstance(fmt, TensorBoardOutputFormat): + self._tb_writer = fmt.writer + break + self._activation_logger = ActivationLogger(self.model.policy) + + def _on_training_end(self) -> None: + if self._activation_logger: + self._activation_logger.close() + + def _get_action_mask(self) -> np.ndarray | None: + if not hasattr(self.training_env, "envs"): + return None + base_env = self.training_env.envs[0] + try: + return base_env.unwrapped.action_masks() + except Exception: + return None + + def _log_activations(self, obs: np.ndarray) -> None: + if not self._tb_writer or not self._activation_logger: + return + self._activation_logger.clear() + with torch.no_grad(): + obs_tensor, _ = self.model.policy.obs_to_tensor(obs) + self.model.policy(obs_tensor) + for name, activation in self._activation_logger.get().items(): + self._tb_writer.add_histogram( + f"activations/{name}", + activation, + self.num_timesteps, + ) + + def _on_step(self) -> bool: + if self.n_calls % self.log_freq != 0: + return True + + def to_numpy(data): + if torch.is_tensor(data): + return data.detach().cpu().numpy() + return np.asarray(data) + + actions = self.locals.get("actions") + if actions is not None: + actions_np = to_numpy(actions).flatten() + self.logger.record("rollout/action_mean", float(np.mean(actions_np))) + if self._tb_writer: + self._tb_writer.add_histogram( + "actions/selected", + actions_np, + self.num_timesteps, + ) + + values = self.locals.get("values") + if values is not None: + values_np = to_numpy(values) + self.logger.record("rollout/value_mean", float(np.mean(values_np))) + + mask = self._get_action_mask() + if mask is not None: + self.logger.record("rollout/legal_action_count", float(mask.sum())) + self.logger.record("rollout/legal_action_fraction", float(mask.mean())) + if self._tb_writer: + self._tb_writer.add_histogram( + "actions/mask", + mask.astype(np.int32), + self.num_timesteps, + ) + + if self.n_calls % self.activation_freq == 0: + obs = self.locals.get("new_obs") + if obs is None: + obs = self.locals.get("obs") + if obs is not None: + self._log_activations(obs) + + return True + + +class SelfPlayCallback(BaseCallback): + """Callback to update opponent model during training for true self-play.""" + + def __init__(self, self_play_env: AdaptiveSelfPlayWrapper, update_freq: int = 10000): + super().__init__() + self.self_play_env = self_play_env + self.update_freq = update_freq + self._last_update = 0 + + def _on_training_start(self) -> None: + # Set initial opponent model + self.self_play_env.set_opponent_model(self.model) + print("🎮 Self-play initialized: opponent will gradually use trained model") + + def _on_step(self) -> bool: + # Update opponent model periodically + if self.num_timesteps - self._last_update >= self.update_freq: + self.self_play_env.set_opponent_model(self.model) + self._last_update = self.num_timesteps + + # Log current model usage probability + prob = self.self_play_env._get_model_probability() + self.logger.record("self_play/model_prob", prob) + print(f"📊 Self-play update @ {self.num_timesteps}: opponent model prob = {prob:.1%}") + + return True + + +def mask_fn(env): + """Function that returns action mask for MaskablePPO.""" + # Unwrap to get to the actual environment with action_masks method + while 
hasattr(env, 'env'): + if hasattr(env, 'action_masks'): + return env.action_masks() + env = env.env + return env.action_masks() def main(): - """Main training function.""" + """Main training function with true self-play.""" + # Large action spaces can trip strict simplex validation in torch distributions. + torch.distributions.Distribution.set_default_validate_args(False) + # Create directories os.makedirs(MODEL_DIR, exist_ok=True) os.makedirs(LOG_DIR, exist_ok=True) - print("Initializing environment with action masking...") - # Create and wrap environment - env = CuttleRLEnvironment() - env = SelfPlayWrapper(env) - env = Monitor(env, LOG_DIR) + print("Initializing environment with action masking and self-play...") + + # Create base environment with adaptive self-play + # Strategy: Start random-only, gradually introduce model opponent + # This ensures agent learns to win before facing harder opponents + base_env = CuttleRLEnvironment() + self_play_env = AdaptiveSelfPlayWrapper( + base_env, + model_prob_start=0.0, # Start with 100% random opponent + model_prob_end=0.3, # End with only 30% model (mostly random for wins) + transition_steps=300000, # Very slow transition over 300K steps + ) + + # Wrap with Monitor and ActionMasker + env = Monitor(self_play_env, LOG_DIR) + env = ActionMasker(env, mask_fn) # Critical: wrap with ActionMasker print("Creating MaskablePPO model...") # Create MaskablePPO model (supports action masking) @@ -39,7 +221,7 @@ def main(): tensorboard_log=LOG_DIR, ) - # Setup checkpoint callback + # Setup callbacks checkpoint_callback = CheckpointCallback( save_freq=10000, save_path=MODEL_DIR, @@ -47,16 +229,22 @@ def main(): save_replay_buffer=False, save_vecnormalize=False, ) + diagnostics_callback = DiagnosticsCallback() + self_play_callback = SelfPlayCallback( + self_play_env, + update_freq=10000, # Update opponent model every 10K steps + ) # Train the model print(f"Starting training for {TRAINING_CONFIG['total_timesteps']} timesteps...") print("Using action masking - model will only consider legal actions!") + print("Using adaptive self-play - opponent gradually uses trained model!") print("Progress will be shown below. 
This may take 15-30 minutes.")
 
     model.learn(
         total_timesteps=TRAINING_CONFIG["total_timesteps"],
-        callback=checkpoint_callback,
-        progress_bar=True,
+        callback=[checkpoint_callback, diagnostics_callback, self_play_callback],
+        progress_bar=False,
     )
 
     # Save final model
diff --git a/rl/view_game.py b/rl/view_game.py
new file mode 100644
index 0000000..f7330fa
--- /dev/null
+++ b/rl/view_game.py
@@ -0,0 +1,126 @@
+"""Interactive viewer for logged RL games."""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict
+
+
+def format_card(card_dict: Dict[str, Any]) -> str:
+    """Format a card dict as a short rank-plus-suit-symbol string."""
+    if not card_dict:
+        return "None"
+    suit_symbols = {
+        "DIAMONDS": "♦",
+        "HEARTS": "♥",
+        "SPADES": "♠",
+        "CLUBS": "♣",
+    }
+    return f"{card_dict['rank']}{suit_symbols.get(card_dict['suit'], '?')}"
+
+
+def display_game(game_file: Path) -> None:
+    """Display a game log in a readable format."""
+    with open(game_file, "r") as f:
+        game = json.load(f)
+
+    print(f"\n{'='*80}")
+    print(f"GAME {game['game_id']}")
+    print(f"{'='*80}")
+    print(f"Start: {game['start_time']}")
+    print(f"Outcome: {game['outcome']['reason'].upper()}")
+    if game['outcome']['winner'] is not None:
+        print(f"Winner: Player {game['outcome']['winner']}")
+    print(f"Total Steps: {game['outcome']['total_steps']}")
+    print(f"Final Scores: P0={game['outcome']['final_scores']['player_0']}, "
+          f"P1={game['outcome']['final_scores']['player_1']}")
+    print(f"{'='*80}\n")
+
+    # Display step by step
+    for i, step in enumerate(game['steps'][:50]):  # Show first 50 steps
+        player = step['player']
+        action = step['action']
+        state = step['state']
+
+        print(f"Step {step['step']:3d} | P{player} | {action['type']:15s}", end="")
+
+        if action['card']:
+            print(f" | Card: {format_card(action['card'])}", end="")
+        if action['target']:
+            print(f" | Target: {format_card(action['target'])}", end="")
+
+        print(f" | Score: P0={state['scores']['player_0']:2d} P1={state['scores']['player_1']:2d}", end="")
+        print(f" | Hands: P0={state['hand_sizes']['player_0']} P1={state['hand_sizes']['player_1']}", end="")
+        print(f" | Deck: {state['deck_size']:2d}")
+
+        # Show field state every 10 steps
+        if (i + 1) % 10 == 0:
+            print(f"    {'─'*72}")
+            p0_field = [format_card(c) for c in state['field_cards']['player_0']]
+            p1_field = [format_card(c) for c in state['field_cards']['player_1']]
+            print(f"    P0 Field: {', '.join(p0_field) if p0_field else '(empty)'}")
+            print(f"    P1 Field: {', '.join(p1_field) if p1_field else '(empty)'}")
+            print(f"    {'─'*72}")
+
+    if len(game['steps']) > 50:
+        print(f"\n... ({len(game['steps']) - 50} more steps) ...\n")
+
+    # Show last 10 steps
+    print(f"{'─'*80}")
+    print("LAST 10 STEPS:")
+    print(f"{'─'*80}\n")
+    for step in game['steps'][-10:]:
+        player = step['player']
+        action = step['action']
+        state = step['state']
+
+        print(f"Step {step['step']:3d} | P{player} | {action['type']:15s}", end="")
+        if action['card']:
+            print(f" | {format_card(action['card'])}", end="")
+        print(f" | Score: P0={state['scores']['player_0']:2d} P1={state['scores']['player_1']:2d}")
+
+    print(f"\n{'='*80}\n")
+
+
+def main():
+    """Main function to view game logs."""
+    log_dir = Path("rl/gameplay_logs")
+
+    if not log_dir.exists():
+        print("❌ No logs found.
Run 'make debug-rl' first.") + return + + game_files = sorted(log_dir.glob("game_*.json")) + + if not game_files: + print("❌ No game logs found.") + return + + print(f"\nFound {len(game_files)} games:") + for i, game_file in enumerate(game_files): + print(f" {i}: {game_file.name}") + + print("\nEnter game number to view (or 'all' for all games, 'q' to quit):") + + while True: + choice = input("> ").strip().lower() + + if choice == 'q': + break + elif choice == 'all': + for game_file in game_files: + display_game(game_file) + break + else: + try: + game_num = int(choice) + if 0 <= game_num < len(game_files): + display_game(game_files[game_num]) + else: + print(f"Invalid game number. Choose 0-{len(game_files)-1}") + except ValueError: + print("Invalid input. Enter a number, 'all', or 'q'") + + +if __name__ == "__main__": + main() diff --git a/tests/test_action_mapping.py b/tests/test_action_mapping.py new file mode 100644 index 0000000..ec76444 --- /dev/null +++ b/tests/test_action_mapping.py @@ -0,0 +1,33 @@ +"""Sanity checks for fixed action mapping.""" +from game.game import Game +from rl.action_mapping import ( + ACTION_SPACE_SIZE, + action_index_to_action, + action_to_index, + build_action_map, + legal_action_mask_from_actions, +) + + +def test_action_mask_roundtrip() -> None: + """Ensure mask/index mapping round-trips for legal actions in a fresh game state.""" + game = Game(manual_selection=False, ai_player=None) + state = game.game_state + legal_actions = state.get_legal_actions() + + action_map = build_action_map(legal_actions) + mask = legal_action_mask_from_actions(legal_actions) + + assert mask.shape[0] == ACTION_SPACE_SIZE + assert int(mask.sum()) == len(action_map) + + for idx, action in action_map.items(): + assert mask[idx] + decoded = action_index_to_action(state, idx) + assert decoded is not None + assert action_to_index(decoded) == idx + + for action in legal_actions: + idx = action_to_index(action) + assert idx is not None + assert mask[idx]
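+
+
+# Hedged extra check (sketch): this assumes legal_action_mask_from_actions is a
+# pure function of the action list, i.e. rebuilding the mask for the same legal
+# actions yields an identical result.
+def test_action_mask_is_stable_for_same_actions() -> None:
+    """Rebuilding the mask for identical legal actions should not change it."""
+    game = Game(manual_selection=False, ai_player=None)
+    legal_actions = game.game_state.get_legal_actions()
+
+    first = legal_action_mask_from_actions(legal_actions)
+    second = legal_action_mask_from_actions(legal_actions)
+
+    assert (first == second).all()
+    assert all(0 <= idx < ACTION_SPACE_SIZE for idx in build_action_map(legal_actions))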