diff --git a/.gitignore b/.gitignore index 17a9f02..550d57d 100644 --- a/.gitignore +++ b/.gitignore @@ -184,6 +184,10 @@ test_outputs/ eng_plans/ +# RL experiments +rl/experiments/ +rl/gameplay_logs/ + # RL training artifacts rl/models/*.zip !rl/models/cuttle_rl_final.zip diff --git a/Makefile b/Makefile index 0533d60..7356b3e 100644 --- a/Makefile +++ b/Makefile @@ -66,3 +66,38 @@ test-rl: source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python -c \ "from rl import config; config.TRAINING_CONFIG['total_timesteps'] = 10000; \ exec(open('rl/train.py').read())" + +debug-rl: + @echo "Running RL games with detailed logging..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/debug_gameplay.py + +analyze-rl: + @echo "Analyzing RL gameplay logs..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/analyze_logs.py + +view-rl: + @echo "Viewing RL gameplay logs..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/view_game.py + +hypersearch-rl: + @echo "Running hyperparameter search (full)..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/hyperparameter_search.py + +hypersearch-quick-rl: + @echo "Running quick hyperparameter search..." + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/hyperparameter_search.py --quick + +compare-rl: + @echo "Compare experiment results..." + @echo "Usage: make compare-rl DIR=rl/experiments/20260125_120000" + @if [ -z "$(DIR)" ]; then \ + echo "Error: DIR not specified"; \ + exit 1; \ + fi + source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/compare_experiments.py $(DIR) + +monitor-rl: + @source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/monitor.py + +watch-rl: + @source $(VENV_NAME)/bin/activate && PYTHONPATH=$(CURRENT_DIR) python rl/monitor.py --watch diff --git a/game/game_state.py b/game/game_state.py index 4bfb6b6..e3672a6 100644 --- a/game/game_state.py +++ b/game/game_state.py @@ -826,6 +826,10 @@ def play_one_off( if card not in self.discard_pile: self._move_card_to_discard(card) + # One-off resolution is complete (counter accepted or effect applied). + self.resolving_one_off = False + self.one_off_card_to_counter = None + # Turn is finished after resolution return True, None diff --git a/game/rl_ai_player.py b/game/rl_ai_player.py index 74a3cf2..9df5a08 100644 --- a/game/rl_ai_player.py +++ b/game/rl_ai_player.py @@ -107,17 +107,10 @@ def _encode_game_state(self, game_state: GameState) -> np.ndarray: return self.env.env.unwrapped._encode_state() def _get_action_mask(self, legal_actions: List[Action]) -> np.ndarray: - """Get action mask for the current legal actions. - - Args: - legal_actions (List[Action]): List of legal actions. - - Returns: - np.ndarray: Boolean mask for valid actions. 
- """ - mask = np.zeros(50, dtype=bool) # Max 50 actions - mask[:len(legal_actions)] = True - return mask + """Get action mask for the current legal actions.""" + from rl.action_mapping import legal_action_mask_from_actions + + return legal_action_mask_from_actions(legal_actions) async def get_action( self, @@ -167,12 +160,14 @@ async def get_action( deterministic=True ) - # Ensure action index is valid - if action_index >= len(legal_actions): - action_index = 0 # Fallback to first legal action - - # Return the chosen action - return legal_actions[action_index] + action_index = int(action_index) + + from rl.action_mapping import build_action_map + + action_map = build_action_map(legal_actions) + if action_index not in action_map: + return legal_actions[0] + return action_map[action_index] except Exception as e: last_error = e diff --git a/requirements.txt b/requirements.txt index 2099c36..b03cf85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,33 +1,33 @@ -astroid==3.2.4 -black==24.8.0 -click==8.1.7 -dill==0.3.9 -flake8==7.1.1 -fastapi==0.115.0 -isort==5.13.2 +astroid==4.0.3 +black==26.1.0 +click==8.3.1 +dill==0.4.1 +flake8==7.3.0 +fastapi==0.128.0 +isort==7.0.0 mccabe==0.7.0 -mypy==1.13.0 -mypy-extensions==1.0.0 -packaging==24.2 -pathspec==0.12.1 -platformdirs==4.3.6 -pdoc==14.7.0 -pycodestyle==2.12.1 -pyflakes==3.2.0 -pylint==3.2.7 -pytest==8.1.1 -pytest-timeout==2.3.1 -tomli==2.2.1 -tomlkit==0.13.2 -typing-extensions==4.12.2 -uvicorn[standard]==0.30.6 -ollama==0.4.6 -pytest-asyncio==0.23.8 +mypy==1.19.1 +mypy-extensions==1.1.0 +packaging==26.0 +pathspec==1.0.3 +platformdirs==4.5.1 +pdoc==16.0.0 +pycodestyle==2.14.0 +pyflakes==3.4.0 +pylint==4.0.4 +pytest==9.0.2 +pytest-timeout==2.4.0 +tomli==2.4.0 +tomlkit==0.14.0 +typing-extensions==4.15.0 +uvicorn[standard]==0.40.0 +ollama==0.6.1 +pytest-asyncio==1.3.0 # RL Training Dependencies -gymnasium==0.29.1 -stable-baselines3==2.2.1 -sb3-contrib==2.2.1 -torch>=2.0.0 -tensorboard==2.13.0 -numpy>=1.24.0 +gymnasium==1.2.3 +stable-baselines3==2.7.1 +sb3-contrib==2.7.1 +torch==2.10.0 +tensorboard==2.20.0 +numpy==2.4.1 diff --git a/rl/README.md b/rl/README.md index 36fe21a..267761f 100644 --- a/rl/README.md +++ b/rl/README.md @@ -1,312 +1,423 @@ # RL Training for Cuttle Game -Reinforcement Learning training setup for the Cuttle card game using **MaskablePPO** (Proximal Policy Optimization with action masking) from Stable Baselines3. +Reinforcement Learning training for the Cuttle card game using **MaskablePPO** (Proximal Policy Optimization with action masking) from Stable Baselines3. ## Quick Start ### Train a Model ```bash -# Full training (100K timesteps, ~1-2 minutes) +# Full training (500K timesteps, ~2-3 hours) make train-rl -# Or directly: -source cuttle-bot-3.12/bin/activate -PYTHONPATH=. python rl/train.py +# Quick test (10K timesteps, ~2-3 minutes) +make test-rl ``` ### Evaluate a Trained Model ```bash make eval-rl - -# Or directly: -source cuttle-bot-3.12/bin/activate -PYTHONPATH=. 
python rl/evaluate.py ``` ### Monitor Training ```bash make tensorboard -# Then open http://localhost:6006 +# Open http://localhost:6006 ``` -### Quick Test +## Hyperparameter Search + +Test multiple configurations to find the best settings: ```bash -# Quick test with 10K timesteps (~2-3 minutes) -make test-rl +# Quick search (50K steps each, ~1 hour total) +make hypersearch-quick-rl + +# Full search (200K steps each, ~3-4 hours) +make hypersearch-rl + +# Compare results +make compare-rl DIR=rl/experiments/ ``` -## File Structure +## Debugging Tools + +```bash +# Generate detailed gameplay logs +make debug-rl + +# Analyze action patterns +make analyze-rl +# View individual games interactively +make view-rl ``` -rl/ -├── README.md # This file -├── config.py # Hyperparameters and configuration -├── cuttle_env.py # Gymnasium environment wrapper -├── self_play_env.py # Self-play wrapper -├── train.py # Training script -├── evaluate.py # Evaluation script -├── models/ # Saved model checkpoints (gitignored) -│ └── cuttle_rl_final.zip -└── logs/ # TensorBoard logs (gitignored) + +--- + +## Architecture + +### Files + +| File | Purpose | +| -------------------------- | ----------------------------------------- | +| `config.py` | Hyperparameters and reward settings | +| `cuttle_env.py` | Gymnasium environment with action masking | +| `self_play_env.py` | Self-play wrapper for training | +| `train.py` | Main training script | +| `evaluate.py` | Model evaluation | +| `hyperparameter_search.py` | Automated config testing | +| `compare_experiments.py` | Result analysis | +| `debug_gameplay.py` | Generate debug logs | +| `analyze_logs.py` | Pattern analysis | +| `view_game.py` | Interactive game viewer | +| `game_logger.py` | Logging implementation | + +### Environment + +- **Observation space**: 610-dimensional vector encoding game state (hand, fields, scores, flags, discard, revealed) +- **Action space**: Discrete(8478) with fixed card-identity mapping (not per-turn legal-action indices) +- **Action masking**: Only legal actions are considered by the policy + +--- + +## Action Space Mapping (Fixed Indices) + +Action indices are stable across turns and map to a specific semantic action based on **card identity** (rank + suit), not on a per-turn legal-action list. The mapping lives in `rl/action_mapping.py`. + +### Card Identity Index + +Each card maps to `0..51` using a canonical order: + +``` +card_index = (rank_value - 1) * 4 + suit_value ``` -## Key Features +- `rank_value` comes from `game/card.py` `Rank` enum (Ace=1 .. King=13). +- `suit_value` comes from `Suit` enum (Clubs=0 .. Spades=3). -- **Action Masking**: Agent only considers legal moves (no invalid action penalties) -- **State Encoding**: 206-dimensional observation vector encoding full game state -- **Self-Play**: Trains against random opponent (extensible to previous model checkpoints) -- **Checkpointing**: Auto-saves model every 10K timesteps -- **TensorBoard Logging**: Real-time training metrics visualization +### Action Groups and Offsets -## Configuration +Action indices are grouped; each group has a fixed size and offset: -All configuration is in `config.py`: +1. Draw: `1` +2. Resolve one-off: `1` +3. Play points (card identity): `52` +4. Play face card (card identity): `52` +5. Play one-off (untargeted, card identity): `52` +6. Play one-off (targeted: attacker, target): `52 * 52` +7. Counter (two) (card identity): `52` +8. Take from discard (card identity): `52` +9. Discard from hand (four) (card identity): `52` +10. 
Discard revealed (seven) (card identity): `52` +11. Scuttle (attacker, target): `52 * 52` +12. Jack (attacker, target): `52 * 52` -### Training Hyperparameters +Total size: `8478`. + +For paired actions (scuttle/jack/targeted one-off), the pair index is: -```python -TRAINING_CONFIG = { - "total_timesteps": 100000, # Total training steps - "learning_rate": 3e-4, # Learning rate - "n_steps": 2048, # Steps per update - "batch_size": 64, # Minibatch size - "n_epochs": 10, # Epochs per update - "gamma": 0.99, # Discount factor - "gae_lambda": 0.95, # GAE parameter - "clip_range": 0.2, # PPO clip range - "ent_coef": 0.01, # Entropy coefficient -} +``` +pair_index = attacker_index * 52 + target_index ``` -### Reward Structure +### Mapping to Concrete Actions -```python -REWARD_CONFIG = { - "win": 100.0, # Win reward - "loss": -100.0, # Loss penalty - "stalemate": 0.0, # Draw reward - "progress_multiplier": 10.0, # Score progress multiplier - "turn_penalty": -1.0, # Per-turn penalty - "invalid_action_penalty": -50.0, # Shouldn't occur with masking -} +The environment builds a mapping each step using `game_state.get_legal_actions()`: + +1. `build_action_map(legal_actions)` converts each `Action` into a fixed index. +2. `action_masks()` marks only those indices as legal. +3. `step(action_index)` resolves the index back to the matching `Action`. + +### Illegal Action Handling + +If a predicted index does **not** map to a legal action: + +- The environment returns `invalid_action_penalty` and ends the episode early. +- This should not occur when action masking is properly applied. + +### Index Calculation Examples + +Assume `Rank` and `Suit` enum values from `game/card.py` (Clubs=0, Diamonds=1, Hearts=2, Spades=3) and: + +``` +card_index = (rank_value - 1) * 4 + suit_value ``` -### Environment Config +Also assume the group offsets from `rl/action_mapping.py`: -```python -ENV_CONFIG = { - "max_actions": 50, # Max actions per turn - "observation_dim": 206, # State vector size - "max_hand_size": 8, # Max cards in hand - "max_field_size": 10, # Max cards on field -} ``` +draw=0 +resolve=1 +points=2 +face=54 +one_off=106 +one_off_target=158 +counter=2862 +take_from_discard=2914 +discard_from_hand=2966 +discard_revealed=3018 +scuttle=3070 +jack=5774 +``` + +Examples: + +- **Play points**: `10 of Hearts` (rank=10, suit=Hearts=2) + `card_index = (10-1)*4 + 2 = 38` + `action_index = points_offset + card_index = 2 + 38 = 40` + +- **Seven one-off** (untargeted): `7 of Clubs` (rank=7, suit=Clubs=0) + `card_index = (7-1)*4 + 0 = 24` + `action_index = one_off_offset + card_index = 106 + 24 = 130` + +- **Seven from revealed pile** (discard revealed): `7 of Clubs` + `action_index = discard_revealed_offset + card_index = 3018 + 24 = 3042` -## How It Works +--- -### Action Masking +## Key Findings (might be outdated) -The environment uses **action masking** to ensure the agent only considers legal moves: +### Best Configuration: Baseline (Minimal Reward Shaping) -1. `get_legal_actions()` returns list of valid `Action` objects -2. Action mask is boolean array: `True` for legal actions, `False` for illegal -3. Model predicts action index into legal actions list -4. Mask prevents model from selecting invalid actions +After extensive hyperparameter search, the **simplest configuration** performed best: -**Benefits**: Faster training, no wasted exploration on invalid moves. 
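
> Editorial aside: the index arithmetic in the Action Space Mapping section above can be cross-checked with a minimal standalone sketch. It mirrors `rl/action_mapping.py` but substitutes plain integers for the game's `Rank`/`Suit` enums, using the numbering given above (Ace=1 .. King=13, Clubs=0 .. Spades=3); it is an illustration, not part of the patch.

```python
NUM_CARDS = 52
PAIR = NUM_CARDS * NUM_CARDS  # 2704

# Group sizes in canonical order; offsets are the cumulative sums.
GROUPS = [
    ("draw", 1), ("resolve", 1),
    ("points", NUM_CARDS), ("face", NUM_CARDS), ("one_off", NUM_CARDS),
    ("one_off_target", PAIR),
    ("counter", NUM_CARDS), ("take_from_discard", NUM_CARDS),
    ("discard_from_hand", NUM_CARDS), ("discard_revealed", NUM_CARDS),
    ("scuttle", PAIR), ("jack", PAIR),
]
OFFSETS, total = {}, 0
for name, size in GROUPS:
    OFFSETS[name] = total
    total += size
assert total == 8478  # full action-space size

def card_index(rank: int, suit: int) -> int:
    """Canonical 0..51 card identity (rank 1..13, suit 0..3)."""
    return (rank - 1) * 4 + suit

def pair_index(attacker: int, target: int) -> int:
    return attacker * NUM_CARDS + target

# Worked examples from the section above:
assert OFFSETS["points"] + card_index(10, 2) == 40              # 10 of Hearts for points
assert OFFSETS["one_off"] + card_index(7, 0) == 130             # 7 of Clubs, untargeted one-off
assert OFFSETS["discard_revealed"] + card_index(7, 0) == 3042   # discard revealed 7 of Clubs
# A paired action, e.g. scuttling a 9 of Spades with a 10 of Hearts:
assert OFFSETS["scuttle"] + pair_index(card_index(10, 2), card_index(9, 3)) == 5081
```
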
+```python +REWARD_CONFIG = { + "win": 100.0, + "loss": -100.0, + "progress_multiplier": 0.1, # Minimal + "turn_penalty": -0.01, # Minimal +} +``` -### State Encoding +**Why it works:** +- Sparse rewards (win/loss) let the agent learn actual game strategy +- Heavy reward shaping causes overfitting to intermediate rewards +- Agent learns to play the game, not exploit reward hacking -Game state is encoded as a **206-dimensional vector**: +### Hyperparameter Search Results -- **Hand cards** (136 dims): 8 slots × 17 dims (suit + rank) -- **Opponent hand size** (1 dim): Normalized -- **Player 0 field** (30 dims): 10 slots × 3 dims -- **Player 1 field** (30 dims): 10 slots × 3 dims -- **Scores & targets** (4 dims): Normalized scores -- **Game flags** (5 dims): Current player, resolving flags, deck/discard sizes +| Config | Eval Reward | Notes | +| ------------------ | ----------- | ----------------------------------- | +| **baseline** | -4.31 | ✅ Best - won games, longer episodes | +| aggressive_scoring | -9.07 | ❌ Crashed early | +| high_progress | -9.59 | ❌ Overfitted to progress | +| fast_learning | -9.86 | ❌ Unstable | +| conservative | -9.97 | ❌ Too slow | -### Training Flow +--- -1. Environment resets to new game -2. Agent observes state (206-dim vector) -3. Agent predicts action (with masking) -4. Action executed, reward calculated -5. Opponent takes turn (random, also masked) -6. Repeat until game ends -7. Model updates using PPO algorithm +## Important: Action Masking -## Usage Examples +### The Bug We Fixed -### Custom Training Run +MaskablePPO requires proper environment wrapping to use action masks: ```python -from rl import config -from rl.train import main +# WRONG - action masking doesn't work +env = CuttleRLEnvironment() +env = Monitor(env, LOG_DIR) -# Modify config -config.TRAINING_CONFIG["total_timesteps"] = 500000 -config.TRAINING_CONFIG["learning_rate"] = 1e-4 +# CORRECT - action masking works +from sb3_contrib.common.wrappers import ActionMasker -# Train -main() +def mask_fn(env): + while hasattr(env, 'env'): + if hasattr(env, 'action_masks'): + return env.action_masks() + env = env.env + return env.action_masks() + +env = CuttleRLEnvironment() +env = SelfPlayWrapper(env) +env = Monitor(env, LOG_DIR) +env = ActionMasker(env, mask_fn) # CRITICAL ``` -### Load and Use Model +### Why Action Masking Matters -```python -from sb3_contrib import MaskablePPO -from rl.cuttle_env import CuttleRLEnvironment +Without it: +- Agent attempts invalid actions (-10 penalty each) +- Episodes crash after 1-2 steps +- No learning occurs -# Load model -model = MaskablePPO.load("rl/models/cuttle_rl_final") +With it: +- Agent only sees legal actions +- Full games play out (50-150 steps) +- Agent learns actual strategy -# Create environment -env = CuttleRLEnvironment() -obs, info = env.reset() +--- + +## Configuration Reference -# Get action with masking -action_mask = env.action_masks() -action, _ = model.predict(obs, action_masks=action_mask, deterministic=True) +### Training Parameters -# Execute action -obs, reward, done, truncated, info = env.step(action) +```python +TRAINING_CONFIG = { + "total_timesteps": 500000, # How long to train + "learning_rate": 3e-4, # Adam optimizer LR + "n_steps": 2048, # Steps before policy update + "batch_size": 64, # Minibatch size + "n_epochs": 10, # Epochs per update + "gamma": 0.99, # Discount factor + "gae_lambda": 0.95, # GAE parameter + "clip_range": 0.2, # PPO clip range + "ent_coef": 0.01, # Entropy coefficient +} ``` -### Evaluate Custom Model +### Reward 
Parameters ```python -from rl.evaluate import evaluate_agent - -# Evaluate specific model -evaluate_agent("rl/models/cuttle_rl_100000_steps", n_episodes=50) +REWARD_CONFIG = { + "win": 100.0, # Terminal reward for winning + "loss": -100.0, # Terminal penalty for losing + "stalemate": -50.0, # Penalty for draw + "progress_multiplier": 0.1, # Intermediate reward multiplier + "turn_penalty": -0.01, # Per-turn penalty + "invalid_action_penalty": -10.0, # Safety check +} ``` -## Output Files +### Parameter Guidelines -### Models +| Parameter | Too Low | Recommended | Too High | +| --------------------- | ------------- | ------------- | ----------- | +| `learning_rate` | Slow learning | 1e-4 to 3e-4 | Unstable | +| `progress_multiplier` | No guidance | 0.1 to 1.0 | Overfitting | +| `turn_penalty` | Long games | -0.01 to -0.1 | Rushed play | -- `rl/models/cuttle_rl_final.zip` - Final trained model -- `rl/models/cuttle_rl_10000_steps.zip` - Checkpoint at 10K steps -- `rl/models/cuttle_rl_20000_steps.zip` - Checkpoint at 20K steps -- etc. +--- -### Logs +## Troubleshooting -- `rl/logs/` - TensorBoard logs - - View with: `make tensorboard` - - Metrics: reward, episode length, policy loss, value loss, etc. +### High Timeout Rate -## Dependencies +**Symptoms:** Games exceed 200 steps frequently -Required packages (already in main `requirements.txt`): +**Solutions:** +1. Increase `turn_penalty` slightly +2. Check for resolve loops in gameplay logs +3. Verify action masking is working -``` -gymnasium==0.29.1 -stable-baselines3==2.2.1 -sb3-contrib==2.2.1 -torch>=2.0.0 -tensorboard==2.15.1 -numpy>=1.24.0 -tqdm>=4.67.0 -rich>=14.2.0 -``` +### Negative Eval Rewards -## Troubleshooting +**Symptoms:** Agent loses more than wins -### Model Not Found +**Solutions:** +1. Train longer (500K+ timesteps) +2. Reduce reward shaping (use baseline config) +3. Check for invalid action penalties -``` -ERROR: Model not found at rl/models/cuttle_rl_final.zip -``` +### Short Eval Episodes -**Solution**: Train a model first with `make train-rl` +**Symptoms:** Episodes only 1-5 steps -### Import Errors +**Cause:** Action masking not working, or overfitted model -``` -ImportError: You must install tqdm and rich... -``` +**Solution:** Verify `ActionMasker` wrapper is applied correctly + +### Training Instability + +**Symptoms:** Reward oscillates wildly, NaN losses + +**Solutions:** +1. Reduce `learning_rate` by 10x +2. Increase `batch_size` +3. Check reward calculations for bugs + +--- + +## Development Notes + +### Testing Changes -**Solution**: Install missing packages: ```bash -source cuttle-bot-3.12/bin/activate -pip install tqdm rich -``` +# Run quick training test +make test-rl -### Games Taking Too Long +# Generate debug logs +make debug-rl -Untrained agents may play very long games (both players just drawing cards). This is normal! The agent needs more training to learn strategic play. +# Analyze patterns +make analyze-rl +``` -**Solution**: -- Train longer (increase `total_timesteps` in `config.py`) -- Adjust reward structure to encourage strategic moves -- Add episode length limits (see `cuttle_env.py`) +### Viewing Results -### Action Masking Not Working +```bash +# TensorBoard +tensorboard --logdir rl/logs -If you see "WARNING: Invalid action attempted", action masking may not be properly passed to the model. 
+# Compare experiments +make compare-rl DIR=rl/experiments/ -**Solution**: Ensure `action_masks` is passed to `model.predict()`: -```python -action_mask = env.action_masks() -action, _ = model.predict(obs, action_masks=action_mask) +# Interactive game viewer +make view-rl ``` -## Performance +### Output Locations -- **Training Speed**: ~1,200 FPS on modern CPU -- **100K Timesteps**: ~1-2 minutes -- **Model Size**: ~1-2 MB (compressed) -- **Memory Usage**: ~200-500 MB during training +| Output | Location | +| -------------- | ------------------- | +| Trained models | `rl/models/` | +| Training logs | `rl/logs/` | +| Experiments | `rl/experiments/` | +| Gameplay logs | `rl/gameplay_logs/` | -## Key Concepts +--- -### Action Masking +## Changelog -Action masking is **critical** for efficient training. Without it, the agent would waste time exploring invalid moves. The mask tells the model which actions are legal in the current state. +### 2026-01-25: Action Masking Fix -### Self-Play +**Problem:** MaskablePPO wasn't receiving action masks correctly. -Currently uses random opponent. Future improvements: -- Train against previous model checkpoints -- Use stronger opponents as agent improves -- Implement population-based training +**Impact:** +- Agents attempted invalid actions constantly +- Episodes crashed after 1-2 steps +- All previous training was invalid -### Reward Shaping +**Solution:** Added `ActionMasker` wrapper with proper environment unwrapping. -Rewards are designed to: -- Strongly reward winning (+100) -- Strongly penalize losing (-100) -- Provide intermediate feedback for score progress -- Slightly penalize each turn to encourage efficiency +**Files changed:** +- `train.py` - Added ActionMasker +- `hyperparameter_search.py` - Added ActionMasker +- `debug_gameplay.py` - Added ActionMasker -## Next Steps +### 2026-01-25: Hyperparameter Search -1. **Train Longer**: Increase `total_timesteps` to 1M+ for better strategy -2. **Tune Rewards**: Adjust `REWARD_CONFIG` to encourage specific behaviors -3. **Better Opponents**: Implement self-play with previous checkpoints -4. **Hyperparameter Tuning**: Experiment with learning rate, batch size, etc. -5. **Evaluation Metrics**: Add detailed analysis (action distribution, game length) +**Finding:** Baseline config (minimal reward shaping) outperforms all others. -## References +**Implication:** Sparse rewards work better than dense reward shaping for this game. 
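
> Editorial aside: "sparse" here means terminal win/loss rewards plus a small difference-based bonus for newly scored points, as in `_calculate_reward` in `rl/cuttle_env.py`. A minimal standalone sketch of that shape, with the adopted values inlined; the function name and signature are illustrative, not part of the codebase.

```python
from typing import Optional

def sparse_reward(
    prev_score: int,
    score: int,
    game_ended: bool,
    won: Optional[bool],
    progress_multiplier: float = 0.1,
    turn_penalty: float = -0.01,
) -> float:
    """Terminal outcome dominates; otherwise only newly scored points earn a small bonus."""
    if game_ended:
        if won is None:              # stalemate
            return -50.0
        return 100.0 if won else -100.0
    gain = score - prev_score        # difference-based: reward only new points
    return gain * progress_multiplier if gain > 0 else turn_penalty
```
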
+ +**Config adopted:** +```python +progress_multiplier: 0.1 # Was 10.0 +turn_penalty: -0.01 # Was -1.0 +``` -- **Detailed Documentation**: See `eng_plans/rl_implementation_summary.md` -- **Stable Baselines3**: https://stable-baselines3.readthedocs.io/ -- **MaskablePPO**: https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html -- **Gymnasium**: https://gymnasium.farama.org/ +### 2026-01-25: Debug Tools -## Notes +**Added:** +- `game_logger.py` - Step-by-step game logging +- `analyze_logs.py` - Automated pattern analysis +- `view_game.py` - Interactive game viewer +- `compare_experiments.py` - Experiment comparison -- Models and logs are gitignored (see `.gitignore`) -- Training is deterministic with fixed seeds -- Environment uses action masking - invalid actions should never occur -- State encoding is fixed-size (206 dims) for neural network compatibility +**Commands:** +- `make debug-rl` +- `make analyze-rl` +- `make view-rl` +- `make compare-rl` --- -**Last Updated**: 2025-10 \ No newline at end of file +## References + +- [Stable-Baselines3 Documentation](https://stable-baselines3.readthedocs.io/) +- [SB3-Contrib MaskablePPO](https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html) +- [PPO Paper](https://arxiv.org/abs/1707.06347) (Schulman et al., 2017) diff --git a/rl/action_mapping.py b/rl/action_mapping.py new file mode 100644 index 0000000..e2bd314 --- /dev/null +++ b/rl/action_mapping.py @@ -0,0 +1,150 @@ +"""Action mapping for Cuttle RL. + +Maps fixed action indices to game Action objects using card identity (rank/suit). +This avoids per-turn reindexing based on legal action list order. +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Dict, Iterable, Optional + +import numpy as np + +from game.action import Action, ActionType +from game.card import Card + + +NUM_CARDS = 52 +PAIR_SIZE = NUM_CARDS * NUM_CARDS + + +@dataclass(frozen=True) +class ActionGroup: + name: str + size: int + + +ACTION_GROUPS = ( + ActionGroup("draw", 1), + ActionGroup("resolve", 1), + ActionGroup("points", NUM_CARDS), + ActionGroup("face", NUM_CARDS), + ActionGroup("one_off", NUM_CARDS), + ActionGroup("one_off_target", PAIR_SIZE), + ActionGroup("counter", NUM_CARDS), + ActionGroup("take_from_discard", NUM_CARDS), + ActionGroup("discard_from_hand", NUM_CARDS), + ActionGroup("discard_revealed", NUM_CARDS), + ActionGroup("scuttle", PAIR_SIZE), + ActionGroup("jack", PAIR_SIZE), +) + + +_OFFSETS: Dict[str, int] = {} +_running = 0 +for _group in ACTION_GROUPS: + _OFFSETS[_group.name] = _running + _running += _group.size + +ACTION_SPACE_SIZE = _running + + +def card_index(card: Card) -> int: + """Return canonical 0..51 index for a card based on rank/suit.""" + return (card.rank.value[1] - 1) * 4 + card.suit.value[1] + + +def _pair_index(attacker_idx: int, target_idx: int) -> int: + return attacker_idx * NUM_CARDS + target_idx + + +def action_to_index(action: Action) -> Optional[int]: + """Map a concrete Action to a fixed action index.""" + if action.action_type == ActionType.DRAW: + return _OFFSETS["draw"] + if action.action_type == ActionType.RESOLVE: + return _OFFSETS["resolve"] + if action.action_type == ActionType.POINTS: + if action.card is None: + return None + return _OFFSETS["points"] + card_index(action.card) + if action.action_type == ActionType.FACE_CARD: + if action.card is None: + return None + return _OFFSETS["face"] + card_index(action.card) + if action.action_type == ActionType.ONE_OFF: + if action.card is None: + return None + 
attacker_idx = card_index(action.card) + if action.target is None: + return _OFFSETS["one_off"] + attacker_idx + target_idx = card_index(action.target) + return _OFFSETS["one_off_target"] + _pair_index(attacker_idx, target_idx) + if action.action_type == ActionType.COUNTER: + if action.card is None: + return None + return _OFFSETS["counter"] + card_index(action.card) + if action.action_type == ActionType.TAKE_FROM_DISCARD: + if action.card is None: + return None + return _OFFSETS["take_from_discard"] + card_index(action.card) + if action.action_type == ActionType.DISCARD_FROM_HAND: + if action.card is None: + return None + return _OFFSETS["discard_from_hand"] + card_index(action.card) + if action.action_type == ActionType.DISCARD_REVEALED: + if action.card is None: + return None + return _OFFSETS["discard_revealed"] + card_index(action.card) + if action.action_type == ActionType.SCUTTLE: + if action.card is None or action.target is None: + return None + attacker_idx = card_index(action.card) + target_idx = card_index(action.target) + return _OFFSETS["scuttle"] + _pair_index(attacker_idx, target_idx) + if action.action_type == ActionType.JACK: + if action.card is None or action.target is None: + return None + attacker_idx = card_index(action.card) + target_idx = card_index(action.target) + return _OFFSETS["jack"] + _pair_index(attacker_idx, target_idx) + return None + + +def build_action_map(legal_actions: Iterable[Action]) -> Dict[int, Action]: + """Build a mapping from fixed action index to Action for the current state.""" + index_to_action: Dict[int, Action] = {} + for action in legal_actions: + idx = action_to_index(action) + if idx is None: + continue + if idx in index_to_action: + continue + index_to_action[idx] = action + return index_to_action + + +def legal_action_mask_from_actions( + legal_actions: Iterable[Action], + action_space_size: int = ACTION_SPACE_SIZE, +) -> np.ndarray: + """Return a boolean mask over the full action space for given legal actions.""" + mask = np.zeros(action_space_size, dtype=np.bool_) + for idx in build_action_map(legal_actions).keys(): + if 0 <= idx < action_space_size: + mask[idx] = True + return mask + + +def legal_action_mask(game_state) -> np.ndarray: + """Return a boolean mask over the full action space for a game state.""" + return legal_action_mask_from_actions( + game_state.get_legal_actions(), + action_space_size=ACTION_SPACE_SIZE, + ) + + +def action_index_to_action(game_state, action_index: int) -> Optional[Action]: + """Resolve a fixed action index into a concrete legal Action, if any.""" + action_map = build_action_map(game_state.get_legal_actions()) + return action_map.get(action_index) diff --git a/rl/analyze_logs.py b/rl/analyze_logs.py new file mode 100644 index 0000000..ed2221f --- /dev/null +++ b/rl/analyze_logs.py @@ -0,0 +1,140 @@ +"""Analyze RL gameplay logs to identify issues.""" +from __future__ import annotations + +import json +from collections import Counter, defaultdict +from pathlib import Path +from typing import Any, Dict, List + + +def analyze_logs(log_dir: str = "rl/gameplay_logs") -> None: + """Analyze gameplay logs to identify patterns and issues. 
+ + Args: + log_dir: Directory containing log files + """ + log_path = Path(log_dir) + + if not log_path.exists(): + print(f"❌ No logs found at {log_dir}") + print(" Run 'make debug-rl' first to generate logs") + return + + # Find all game logs + game_files = sorted(log_path.glob("game_*.json")) + + if not game_files: + print(f"❌ No game logs found in {log_dir}") + return + + print(f"\n{'='*70}") + print("RL GAMEPLAY ANALYSIS") + print(f"{'='*70}\n") + print(f"Analyzing {len(game_files)} games...\n") + + # Collect statistics + action_types = Counter() + action_patterns = defaultdict(list) + timeout_games = [] + quick_wins = [] + + for game_file in game_files: + with open(game_file, "r") as f: + game_data = json.load(f) + + game_id = game_data["game_id"] + steps = game_data["steps"] + outcome = game_data["outcome"] + + # Count action types + for step in steps: + action_type = step["action"]["type"] + action_types[action_type] += 1 + + # Detect patterns + recent_actions = [s["action"]["type"] for s in steps[-20:]] + pattern_key = " -> ".join(recent_actions[-5:]) if len(recent_actions) >= 5 else "" + action_patterns[pattern_key].append(game_id) + + # Categorize games + if outcome["reason"] == "timeout": + timeout_games.append({ + "id": game_id, + "steps": outcome["total_steps"], + "final_scores": outcome["final_scores"], + "recent_actions": recent_actions[-10:], + }) + elif outcome["total_steps"] < 50 and outcome["reason"] == "win": + quick_wins.append({ + "id": game_id, + "steps": outcome["total_steps"], + "winner": outcome["winner"], + }) + + # Print analysis + print("📊 ACTION TYPE DISTRIBUTION") + print("-" * 70) + total_actions = sum(action_types.values()) + for action_type, count in action_types.most_common(): + percentage = (count / total_actions) * 100 + bar = "█" * int(percentage / 2) + print(f" {action_type:20s} {count:5d} ({percentage:5.1f}%) {bar}") + + print(f"\n🔄 TIMEOUT GAMES: {len(timeout_games)}/{len(game_files)}") + print("-" * 70) + if timeout_games: + for game in timeout_games[:5]: # Show first 5 + print(f"\n Game {game['id']}:") + print(f" Steps: {game['steps']}") + print(f" Final scores: P0={game['final_scores']['player_0']}, " + f"P1={game['final_scores']['player_1']}") + print(f" Last 10 actions: {' -> '.join(game['recent_actions'])}") + + if len(timeout_games) > 5: + print(f"\n ... 
and {len(timeout_games) - 5} more timeout games") + + print(f"\n⚡ QUICK WINS: {len(quick_wins)}/{len(game_files)}") + print("-" * 70) + if quick_wins: + for game in quick_wins: + print(f" Game {game['id']}: Winner P{game['winner']} in {game['steps']} steps") + + # Detect stuck patterns + print(f"\n🔍 COMMON ACTION PATTERNS (last 5 moves)") + print("-" * 70) + common_patterns = [(p, len(games)) for p, games in action_patterns.items() + if len(games) > 1 and p] + common_patterns.sort(key=lambda x: x[1], reverse=True) + + for pattern, count in common_patterns[:10]: + print(f" [{count} games] {pattern}") + + # Recommendations + print(f"\n💡 RECOMMENDATIONS") + print("-" * 70) + + draw_percentage = (action_types.get("Draw", 0) / total_actions) * 100 + if draw_percentage > 40: + print(" ⚠️ HIGH DRAW RATE: Bot is drawing too often without playing cards") + print(" Consider adjusting reward to penalize excessive draws") + + if len(timeout_games) / len(game_files) > 0.5: + print(" ⚠️ HIGH TIMEOUT RATE: Games are not progressing") + print(" Bot may not understand how to play for points") + print(" Consider:") + print(" - Increase reward for playing point cards") + print(" - Add reward shaping for field control") + print(" - Reduce max_steps or add progress penalty") + + points_percentage = (action_types.get("Points", 0) / total_actions) * 100 + if points_percentage < 10: + print(" ⚠️ LOW POINTS PLAY: Bot rarely plays point cards") + print(" Increase reward for point cards significantly") + + print(f"\n{'='*70}\n") + print(f"📁 Full logs available at: {log_path.absolute()}") + print(f"{'='*70}\n") + + +if __name__ == "__main__": + analyze_logs() diff --git a/rl/compare_experiments.py b/rl/compare_experiments.py new file mode 100644 index 0000000..3177523 --- /dev/null +++ b/rl/compare_experiments.py @@ -0,0 +1,227 @@ +"""Compare results from multiple experiments.""" +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any, Dict, List + +import numpy as np + + +def load_monitor_data(monitor_file: Path) -> Dict[str, Any]: + """Load data from a stable-baselines3 monitor file. + + Args: + monitor_file: Path to monitor.csv file + + Returns: + Dictionary with episode rewards and lengths + """ + if not monitor_file.exists(): + return {"rewards": [], "lengths": [], "times": []} + + rewards = [] + lengths = [] + times = [] + + with open(monitor_file, "r") as f: + # Skip header lines + for _ in range(2): + f.readline() + + # Read data + for line in f: + if line.strip(): + parts = line.strip().split(',') + if len(parts) >= 3: + try: + rewards.append(float(parts[0])) + lengths.append(int(parts[1])) + times.append(float(parts[2])) + except ValueError: + continue + + return { + "rewards": rewards, + "lengths": lengths, + "times": times, + } + + +def analyze_experiment(exp_path: Path) -> Dict[str, Any]: + """Analyze results from a single experiment. 
+ + Args: + exp_path: Path to experiment directory + + Returns: + Dictionary with analysis results + """ + # Load config + config_file = exp_path / "config.json" + if not config_file.exists(): + return {"error": "No config.json found"} + + with open(config_file, "r") as f: + config = json.load(f) + + # Load training monitor data + train_monitor = exp_path / "train.monitor.csv" + train_data = load_monitor_data(train_monitor) + + # Load evaluation monitor data + eval_monitor = exp_path / "eval.monitor.csv" + eval_data = load_monitor_data(eval_monitor) + + # Calculate statistics + analysis = { + "name": config["name"], + "config": config, + "train": { + "total_episodes": len(train_data["rewards"]), + "mean_reward": float(np.mean(train_data["rewards"])) if train_data["rewards"] else 0.0, + "std_reward": float(np.std(train_data["rewards"])) if train_data["rewards"] else 0.0, + "mean_length": float(np.mean(train_data["lengths"])) if train_data["lengths"] else 0.0, + "final_100_mean_reward": float(np.mean(train_data["rewards"][-100:])) if len(train_data["rewards"]) >= 100 else 0.0, + }, + "eval": { + "total_episodes": len(eval_data["rewards"]), + "mean_reward": float(np.mean(eval_data["rewards"])) if eval_data["rewards"] else 0.0, + "std_reward": float(np.std(eval_data["rewards"])) if eval_data["rewards"] else 0.0, + "mean_length": float(np.mean(eval_data["lengths"])) if eval_data["lengths"] else 0.0, + "best_reward": float(max(eval_data["rewards"])) if eval_data["rewards"] else 0.0, + }, + } + + # Check for timeout issues + timeout_rate = sum(1 for length in train_data["lengths"] if length >= 200) / max(len(train_data["lengths"]), 1) + analysis["train"]["timeout_rate"] = float(timeout_rate) + + return analysis + + +def compare_experiments(experiments_dir: Path) -> None: + """Compare all experiments in a directory. + + Args: + experiments_dir: Directory containing experiment subdirectories + """ + if not experiments_dir.exists(): + print(f"❌ Directory not found: {experiments_dir}") + return + + # Find all experiment directories + exp_dirs = [d for d in experiments_dir.iterdir() if d.is_dir() and (d / "config.json").exists()] + + if not exp_dirs: + print(f"❌ No experiments found in {experiments_dir}") + return + + print(f"\n{'='*80}") + print("EXPERIMENT COMPARISON") + print(f"{'='*80}") + print(f"Found {len(exp_dirs)} experiments\n") + + # Analyze all experiments + analyses = [] + for exp_dir in sorted(exp_dirs): + analysis = analyze_experiment(exp_dir) + if "error" not in analysis: + analyses.append(analysis) + + if not analyses: + print("❌ No valid experiments to compare") + return + + # Sort by evaluation mean reward (best first) + analyses.sort(key=lambda x: x["eval"]["mean_reward"], reverse=True) + + # Print comparison table + print(f"{'Rank':<6} {'Name':<25} {'Train Reward':<15} {'Eval Reward':<15} {'Timeout %':<12} {'Avg Length':<12}") + print(f"{'-'*6} {'-'*25} {'-'*15} {'-'*15} {'-'*12} {'-'*12}") + + for i, analysis in enumerate(analyses, 1): + train_reward = analysis["train"]["final_100_mean_reward"] + eval_reward = analysis["eval"]["mean_reward"] + timeout_pct = analysis["train"]["timeout_rate"] * 100 + avg_length = analysis["train"]["mean_length"] + + print(f"{i:<6} {analysis['name']:<25} {train_reward:>14.2f} {eval_reward:>14.2f} {timeout_pct:>11.1f} {avg_length:>11.1f}") + + # Detailed analysis of top 3 + print(f"\n{'='*80}") + print("TOP 3 EXPERIMENTS (Detailed)") + print(f"{'='*80}\n") + + for i, analysis in enumerate(analyses[:3], 1): + print(f"{i}. 
{analysis['name']}") + print(f" {'-'*76}") + print(f" Description: {analysis['config']['description']}") + print(f"\n Training Config:") + for key, value in analysis['config']['training'].items(): + print(f" {key:20s}: {value}") + print(f"\n Reward Config:") + for key, value in analysis['config']['reward'].items(): + print(f" {key:20s}: {value}") + print(f"\n Training Results:") + print(f" Episodes: {analysis['train']['total_episodes']}") + print(f" Mean Reward: {analysis['train']['mean_reward']:.2f} ± {analysis['train']['std_reward']:.2f}") + print(f" Final 100 Reward: {analysis['train']['final_100_mean_reward']:.2f}") + print(f" Mean Episode Length: {analysis['train']['mean_length']:.1f}") + print(f" Timeout Rate: {analysis['train']['timeout_rate']*100:.1f}%") + print(f"\n Evaluation Results:") + print(f" Episodes: {analysis['eval']['total_episodes']}") + print(f" Mean Reward: {analysis['eval']['mean_reward']:.2f} ± {analysis['eval']['std_reward']:.2f}") + print(f" Best Reward: {analysis['eval']['best_reward']:.2f}") + print(f" Mean Episode Length: {analysis['eval']['mean_length']:.1f}") + print() + + # Recommendations + print(f"{'='*80}") + print("RECOMMENDATIONS") + print(f"{'='*80}\n") + + best = analyses[0] + + print(f"🏆 Best performing: {best['name']}") + print(f" Mean eval reward: {best['eval']['mean_reward']:.2f}") + + if best['train']['timeout_rate'] > 0.3: + print(f"\n⚠️ Warning: High timeout rate ({best['train']['timeout_rate']*100:.1f}%)") + print(" Consider:") + print(" - Increase progress_multiplier") + print(" - Increase turn_penalty") + print(" - Train for more timesteps") + + if best['eval']['mean_reward'] < 0: + print(f"\n⚠️ Warning: Negative mean reward") + print(" The agent is losing more than winning. Consider:") + print(" - Adjusting reward shaping") + print(" - Training for more timesteps") + print(" - Using a different learning rate") + + print(f"\n📊 View detailed metrics:") + print(f" tensorboard --logdir {experiments_dir.absolute()}") + + print(f"\n🎮 Test best model:") + best_model = experiments_dir / best['name'] / "best_model" / "best_model.zip" + if best_model.exists(): + print(f" make debug-rl MODEL={best_model}") + + print() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Compare experiment results") + parser.add_argument( + "experiments_dir", + type=Path, + help="Directory containing experiments", + ) + + args = parser.parse_args() + + compare_experiments(args.experiments_dir) diff --git a/rl/config.py b/rl/config.py index b9fb0b6..3459119 100644 --- a/rl/config.py +++ b/rl/config.py @@ -2,8 +2,9 @@ from typing import Any, Dict # Training hyperparameters for MaskablePPO algorithm +# Using baseline config - performed best in hyperparameter search TRAINING_CONFIG: Dict[str, Any] = { - "total_timesteps": 100000, # Total training steps + "total_timesteps": 500000, # Extended training (was 100K) "learning_rate": 3e-4, # Learning rate for optimizer "n_steps": 2048, # Steps per update "batch_size": 64, # Minibatch size @@ -15,20 +16,23 @@ "verbose": 1, # Logging verbosity } -# Reward structure - critical for agent learning +# Reward structure - optimized for self-play +# Key insight: reward scoring points, don't over-penalize turns REWARD_CONFIG: Dict[str, float] = { "win": 100.0, # Reward for winning "loss": -100.0, # Penalty for losing - "stalemate": 0.0, # No reward for draw - "progress_multiplier": 10.0, # Multiplier for score progress - "turn_penalty": -1.0, # Small penalty each turn - 
"invalid_action_penalty": -50.0, # Heavy penalty for illegal moves (safety check) + "stalemate": -50.0, # Penalty for stalemate (discourage draws) + "progress_multiplier": 2.0, # Reward for scoring points (balanced) + "turn_penalty": -0.01, # Keep small to not overwhelm learning + "invalid_action_penalty": -10.0, # Penalty for illegal moves (safety check) } # Environment configuration +from rl.action_mapping import ACTION_SPACE_SIZE + ENV_CONFIG: Dict[str, Any] = { - "max_actions": 50, # Max possible actions per turn - "observation_dim": 206, # State vector dimension (136+1+30+30+4+5) + "max_actions": ACTION_SPACE_SIZE, # Fixed action space size + "observation_dim": 610, # State vector dimension (136+1+180+180+4+5+52+52) "max_hand_size": 8, # Max cards in hand "max_field_size": 10, # Max cards on field } diff --git a/rl/cuttle_env.py b/rl/cuttle_env.py index fb5fcd4..36c146a 100644 --- a/rl/cuttle_env.py +++ b/rl/cuttle_env.py @@ -6,7 +6,9 @@ from game.card import Purpose from game.game import Game +from rl.action_mapping import build_action_map, card_index, legal_action_mask from rl.config import ENV_CONFIG, REWARD_CONFIG +from rl.game_logger import GameplayLogger class CuttleRLEnvironment(gym.Env): @@ -14,7 +16,7 @@ class CuttleRLEnvironment(gym.Env): metadata = {"render_modes": ["human"]} - def __init__(self): + def __init__(self, enable_logging: bool = False): super().__init__() # Define action and observation spaces @@ -30,7 +32,13 @@ def __init__(self): self.game: Optional[Game] = None self.current_player = 0 self.step_count = 0 - self.max_steps = 200 # Add timeout to prevent infinite loops + self.max_steps = 300 # Increased to allow games to conclude naturally + self.no_progress_steps = 0 + self.no_progress_limit = 60 # End early if no scoring progress + + # Logging + self.logger = GameplayLogger() if enable_logging else None + self.enable_logging = enable_logging def reset( self, @@ -44,6 +52,16 @@ def reset( self.game = Game(manual_selection=False, ai_player=None) self.current_player = 0 self.step_count = 0 + self.no_progress_steps = 0 + + # Reset score tracking for difference-based rewards + self._prev_score = 0 + self._prev_opponent_score = 0 + self._prev_total_score = 0 + + # Start logging if enabled + if self.logger: + self.logger.start_game(self.game) # Get initial observation observation = self._encode_state() @@ -62,14 +80,7 @@ def action_masks(self) -> np.ndarray: """ assert self.game is not None, "Must call reset() first" - # Get current legal actions - legal_actions = self.game.game_state.get_legal_actions() - - # Create mask: True for legal actions, False for illegal - mask = np.zeros(self.action_space.n, dtype=np.bool_) - mask[:len(legal_actions)] = True - - return mask + return legal_action_mask(self.game.game_state) def step( self, action: int @@ -81,9 +92,11 @@ def step( self.step_count += 1 if self.step_count > self.max_steps: print(f"⚠️ TIMEOUT: Game exceeded {self.max_steps} steps, forcing termination") + if self.logger: + self.logger.end_game(self.game, None, "timeout", self.step_count) return ( self._encode_state(), - -10.0, # Penalty for timeout + -50.0, # Strong penalty for timeout (same as stalemate) True, # done True, # truncated {"error": "timeout", "steps": self.step_count} @@ -91,11 +104,15 @@ def step( # Get current legal actions legal_actions = self.game.game_state.get_legal_actions() - + + # Decode fixed action index into a concrete legal action + action_map = build_action_map(legal_actions) + chosen_action = action_map.get(action) + # With action 
masking, invalid actions should never happen # but keep as safety check - if action >= len(legal_actions): - print(f"WARNING: Invalid action {action} attempted (max: {len(legal_actions)-1})") + if chosen_action is None: + print(f"WARNING: Invalid action {action} attempted (no matching legal action)") print("This should not happen with proper action masking!") return ( self._encode_state(), @@ -105,13 +122,33 @@ def step( {"error": "invalid_action"} ) - # Execute the chosen action - chosen_action = legal_actions[action] + # Log the action before execution + if self.logger: + self.logger.log_step( + self.step_count, + self.current_player, + chosen_action, + self.game, + 0.0, # Reward will be updated after + len(legal_actions) + ) + turn_finished, game_ended, winner = \ self.game.game_state.update_state(chosen_action) # Calculate reward reward = self._calculate_reward(game_ended, winner) + + # Track total score progress to detect stalls + total_score = ( + self.game.game_state.get_player_score(0) + + self.game.game_state.get_player_score(1) + ) + if total_score > getattr(self, "_prev_total_score", 0): + self.no_progress_steps = 0 + else: + self.no_progress_steps += 1 + self._prev_total_score = total_score # Update game state if turn finished if turn_finished: @@ -120,6 +157,27 @@ def step( # Check if episode is done done = game_ended or self.game.game_state.is_stalemate() + + # Early termination if the game is stuck with no progress + if not done and self.no_progress_steps >= self.no_progress_limit: + print( + f"⚠️ STALL: No scoring progress for " + f"{self.no_progress_limit} steps, ending episode" + ) + if self.logger: + self.logger.end_game(self.game, None, "stall", self.step_count) + return ( + self._encode_state(), + REWARD_CONFIG["stalemate"], + True, # done + True, # truncated + {"error": "stall", "steps": self.step_count} + ) + + # Log game end if done + if done and self.logger: + reason = "win" if winner is not None else "stalemate" + self.logger.end_game(self.game, winner, reason, self.step_count) # Get new observation observation = self._encode_state() @@ -148,25 +206,25 @@ def _encode_state(self) -> np.ndarray: obs[idx] = len(self.game.game_state.hands[opponent]) / 8.0 idx += 1 - # 3. Player 0 field cards (30 dims: 10 cards × 3 dims each) + # 3. Player 0 field cards (180 dims: 10 cards × 18 dims each) for i in range(ENV_CONFIG["max_field_size"]): field = self.game.game_state.get_player_field(0) if i < len(field): card = field[i] - obs[idx] = 1.0 - obs[idx + 1] = card.rank.value[1] / 13.0 - obs[idx + 2] = 1.0 if card.purpose == Purpose.POINTS else 0.0 - idx += 3 + obs[idx + card.suit.value[1]] = 1.0 + obs[idx + 4 + card.rank.value[1] - 1] = 1.0 + obs[idx + 17] = 1.0 if card.purpose == Purpose.POINTS else 0.0 + idx += 18 - # 4. Player 1 field cards (30 dims: same encoding) + # 4. Player 1 field cards (180 dims: same encoding) for i in range(ENV_CONFIG["max_field_size"]): field = self.game.game_state.get_player_field(1) if i < len(field): card = field[i] - obs[idx] = 1.0 - obs[idx + 1] = card.rank.value[1] / 13.0 - obs[idx + 2] = 1.0 if card.purpose == Purpose.POINTS else 0.0 - idx += 3 + obs[idx + card.suit.value[1]] = 1.0 + obs[idx + 4 + card.rank.value[1] - 1] = 1.0 + obs[idx + 17] = 1.0 if card.purpose == Purpose.POINTS else 0.0 + idx += 18 # 5. 
Scores and targets (4 dims) obs[idx] = self.game.game_state.get_player_score(0) / 21.0 @@ -181,11 +239,25 @@ def _encode_state(self) -> np.ndarray: obs[idx + 2] = 1.0 if self.game.game_state.resolving_three else 0.0 obs[idx + 3] = len(self.game.game_state.deck) / 52.0 obs[idx + 4] = len(self.game.game_state.discard_pile) / 52.0 + idx += 5 + + # 7. Discard pile identity (52 dims) + for card in self.game.game_state.discard_pile: + obs[idx + card_index(card)] = 1.0 + idx += 52 + + # 8. Revealed cards for seven (52 dims) + for card in self.game.game_state.pending_seven_cards: + obs[idx + card_index(card)] = 1.0 + idx += 52 return obs def _calculate_reward(self, game_ended: bool, winner: Optional[int]) -> float: - """Calculate reward for the current state.""" + """Calculate reward for the current state. + + Simple reward structure focused on scoring points and winning. + """ if game_ended: if winner == self.current_player: return REWARD_CONFIG["win"] @@ -194,16 +266,19 @@ def _calculate_reward(self, game_ended: bool, winner: Optional[int]) -> float: else: return REWARD_CONFIG["stalemate"] - # Intermediate reward: progress toward target + # Only reward our own score gains (simpler, less noisy) current_score = self.game.game_state.get_player_score(self.current_player) - target = self.game.game_state.get_player_target(self.current_player) + prev_score = getattr(self, '_prev_score', 0) - if target > 0: - progress = current_score / target - return (progress * REWARD_CONFIG["progress_multiplier"] + - REWARD_CONFIG["turn_penalty"]) - else: - return REWARD_CONFIG["turn_penalty"] + score_gain = current_score - prev_score + self._prev_score = current_score + + # Small reward for scoring points + if score_gain > 0: + return score_gain * REWARD_CONFIG["progress_multiplier"] + + # Minimal turn penalty otherwise + return REWARD_CONFIG["turn_penalty"] def _get_info(self) -> Dict[str, Any]: """Get additional information about game state.""" diff --git a/rl/debug_gameplay.py b/rl/debug_gameplay.py new file mode 100644 index 0000000..141fee2 --- /dev/null +++ b/rl/debug_gameplay.py @@ -0,0 +1,97 @@ +"""Debug script to analyze RL gameplay with detailed logging.""" +from __future__ import annotations + +import sys +from pathlib import Path + +# Add parent directory to path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from sb3_contrib import MaskablePPO +from sb3_contrib.common.wrappers import ActionMasker +from rl.cuttle_env import CuttleRLEnvironment +from rl.self_play_env import SelfPlayWrapper + + +def mask_fn(env): + """Function that returns action mask for MaskablePPO.""" + # Unwrap to get to the actual environment with action_masks method + while hasattr(env, 'env'): + if hasattr(env, 'action_masks'): + return env.action_masks() + env = env.env + return env.action_masks() + + +def run_debug_games(num_games: int = 10, model_path: str = "rl/models/best_model.zip"): + """Run games with detailed logging for debugging. 
+ + Args: + num_games: Number of games to play and log + model_path: Path to trained model (or None to use random actions) + """ + print(f"\n{'='*60}") + print("DEBUGGING RL GAMEPLAY") + print(f"{'='*60}\n") + + # Create environment with logging enabled + base_env = CuttleRLEnvironment(enable_logging=True) + env = SelfPlayWrapper(base_env) + env = ActionMasker(env, mask_fn) # Critical: wrap with ActionMasker + + # Load model if available + try: + model = MaskablePPO.load(model_path) + print(f"✅ Loaded model from: {model_path}\n") + use_model = True + except Exception as e: + print(f"⚠️ Could not load model: {e}") + print("Using random actions instead\n") + use_model = False + + # Run games + for game_num in range(num_games): + print(f"Playing game {game_num + 1}/{num_games}...", end=" ") + + obs, info = env.reset() + done = False + step_count = 0 + + while not done and step_count < 200: + action_masks = env.action_masks() + + if use_model: + action, _ = model.predict(obs, action_masks=action_masks, deterministic=False) + else: + # Random action from legal actions + import numpy as np + legal_actions = np.where(action_masks)[0] + action = np.random.choice(legal_actions) if len(legal_actions) > 0 else 0 + + obs, reward, done, truncated, info = env.step(action) + step_count += 1 + + if done or truncated: + break + + print(f"Finished in {step_count} steps") + + # Generate summary + if base_env.logger: + base_env.logger.generate_summary() + + print("\n💡 TIP: Check the JSON logs in rl/gameplay_logs/ for detailed analysis") + print(" Each file contains step-by-step actions, game state, and outcomes\n") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Debug RL gameplay with detailed logs") + parser.add_argument("--games", type=int, default=10, help="Number of games to play") + parser.add_argument("--model", type=str, default="rl/models/best_model.zip", + help="Path to model") + + args = parser.parse_args() + + run_debug_games(num_games=args.games, model_path=args.model) diff --git a/rl/evaluate.py b/rl/evaluate.py index e451919..bb05bb5 100644 --- a/rl/evaluate.py +++ b/rl/evaluate.py @@ -1,36 +1,107 @@ """Evaluate trained RL agent with action masking.""" +import json import os -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple import numpy as np from sb3_contrib import MaskablePPO -from rl.config import MODEL_DIR +from rl.action_mapping import action_index_to_action +from rl.config import LOG_DIR, MODEL_DIR from rl.cuttle_env import CuttleRLEnvironment +def _snapshot_game_state(env: CuttleRLEnvironment) -> Dict[str, Any]: + """Capture a compact, readable snapshot of the current game state.""" + if not env.game: + return {} + state = env.game.game_state + return { + "turn": state.turn, + "current_action_player": state.current_action_player, + "overall_turn": state.overall_turn, + "scores": { + "player_0": state.get_player_score(0), + "player_1": state.get_player_score(1), + }, + "targets": { + "player_0": state.get_player_target(0), + "player_1": state.get_player_target(1), + }, + "hands": [ + [str(card) for card in state.hands[0]], + [str(card) for card in state.hands[1]], + ], + "fields": [ + [str(card) for card in state.get_player_field(0)], + [str(card) for card in state.get_player_field(1)], + ], + "deck_count": len(state.deck), + "discard_pile": [str(card) for card in state.discard_pile], + "resolving_one_off": state.resolving_one_off, + "resolving_three": state.resolving_three, + "resolving_seven": 
state.resolving_seven, + "pending_three_player": state.pending_three_player, + "pending_four_player": state.pending_four_player, + "pending_four_count": state.pending_four_count, + "pending_seven_requires_discard": state.pending_seven_requires_discard, + } + + def play_episode( model: MaskablePPO, env: CuttleRLEnvironment, - deterministic: bool = True -) -> Tuple[float, int, Optional[int]]: + deterministic: bool = True, + record: bool = False, +) -> Tuple[float, int, Optional[int], Optional[Dict[str, Any]]]: """Play one episode with action masking.""" obs, info = env.reset() done = False episode_reward = 0.0 steps = 0 + trace: Optional[Dict[str, Any]] = {"steps": []} if record else None while not done: # Agent's turn with action mask action_mask = env.action_masks() + obs_before = obs action, _ = model.predict( - obs, + obs, action_masks=action_mask, # Pass mask to model deterministic=deterministic ) + if env.game: + action_obj = action_index_to_action(env.game.game_state, int(action)) + legal_actions = env.game.game_state.get_legal_actions() + else: + action_obj = None + legal_actions = [] + state_before = _snapshot_game_state(env) if record else None + obs, reward, done, truncated, info = env.step(action) + state_after = _snapshot_game_state(env) if record else None episode_reward += reward steps += 1 + + if trace is not None: + trace["steps"].append( + { + "actor": "agent", + "step": steps, + "obs": obs_before.tolist(), + "next_obs": obs.tolist(), + "action_index": int(action), + "action": str(action_obj) if action_obj else None, + "legal_actions": [str(a) for a in legal_actions], + "action_mask": action_mask.astype(int).tolist(), + "reward": float(reward), + "done": bool(done), + "truncated": bool(truncated), + "info": info, + "state_before": state_before, + "state_after": state_after, + } + ) if done: break @@ -40,17 +111,59 @@ def play_episode( legal_indices = np.where(opponent_mask)[0] if len(legal_indices) > 0: opp_action = np.random.choice(legal_indices) + obs_before = obs + if env.game: + opp_action_obj = action_index_to_action(env.game.game_state, int(opp_action)) + opp_legal_actions = env.game.game_state.get_legal_actions() + else: + opp_action_obj = None + opp_legal_actions = [] + state_before = _snapshot_game_state(env) if record else None + obs, opp_reward, done, truncated, info = env.step(opp_action) + state_after = _snapshot_game_state(env) if record else None episode_reward -= opp_reward steps += 1 + + if trace is not None: + trace["steps"].append( + { + "actor": "opponent", + "step": steps, + "obs": obs_before.tolist(), + "next_obs": obs.tolist(), + "action_index": int(opp_action), + "action": str(opp_action_obj) if opp_action_obj else None, + "legal_actions": [str(a) for a in opp_legal_actions], + "action_mask": opponent_mask.astype(int).tolist(), + "reward": float(opp_reward), + "done": bool(done), + "truncated": bool(truncated), + "info": info, + "state_before": state_before, + "state_after": state_after, + } + ) # Get winner winner = env.game.game_state.winner() if env.game else None - return episode_reward, steps, winner + if trace is not None: + trace["summary"] = { + "episode_reward": float(episode_reward), + "steps": steps, + "winner": winner, + "deterministic": deterministic, + } + + return episode_reward, steps, winner, trace -def evaluate_agent(model_path: str, n_episodes: int = 100): +def evaluate_agent( + model_path: str, + n_episodes: int = 100, + record_path: Optional[str] = None, +): """Evaluate agent over multiple episodes.""" print(f"Loading MaskablePPO 
model from: {model_path}") model = MaskablePPO.load(model_path) @@ -71,7 +184,10 @@ def evaluate_agent(model_path: str, n_episodes: int = 100): if (episode + 1) % 10 == 0: print(f" Episode {episode + 1}/{n_episodes}") - episode_reward, steps, winner = play_episode(model, env, deterministic=True) + record = record_path is not None and episode == 0 + episode_reward, steps, winner, trace = play_episode( + model, env, deterministic=True, record=record + ) # Record results total_rewards.append(episode_reward) @@ -84,6 +200,12 @@ def evaluate_agent(model_path: str, n_episodes: int = 100): losses += 1 else: stalemates += 1 + + if record and trace is not None: + os.makedirs(os.path.dirname(record_path), exist_ok=True) + with open(record_path, "w", encoding="utf-8") as handle: + json.dump(trace, handle, indent=2) + print(f"Saved episode trace to: {record_path}") # Print results print("\n" + "=" * 50) @@ -101,13 +223,14 @@ def evaluate_agent(model_path: str, n_episodes: int = 100): def main(): """Main evaluation function.""" model_path = os.path.join(MODEL_DIR, "cuttle_rl_final") + record_path = os.path.join(LOG_DIR, "eval_rollout.json") if not os.path.exists(model_path + ".zip"): print(f"ERROR: Model not found at {model_path}.zip") print("Please train a model first using: make train-rl") return - evaluate_agent(model_path, n_episodes=100) + evaluate_agent(model_path, n_episodes=100, record_path=record_path) if __name__ == "__main__": diff --git a/rl/game_logger.py b/rl/game_logger.py new file mode 100644 index 0000000..834acb9 --- /dev/null +++ b/rl/game_logger.py @@ -0,0 +1,185 @@ +"""Logger for detailed RL gameplay analysis.""" +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +from game.action import Action +from game.card import Card +from game.game import Game + + +class GameplayLogger: + """Logs detailed gameplay information for debugging RL agents.""" + + def __init__(self, log_dir: str = "rl/gameplay_logs"): + """Initialize logger. 
+ + Args: + log_dir: Directory to save logs + """ + self.log_dir = Path(log_dir) + self.log_dir.mkdir(parents=True, exist_ok=True) + self.current_game: Optional[Dict[str, Any]] = None + self.games_logged = 0 + self.max_games_per_session = 10 # Only log first 10 games per training + + def start_game(self, game: Game) -> None: + """Start logging a new game.""" + if self.games_logged >= self.max_games_per_session: + return # Don't log more than max games + + self.current_game = { + "game_id": self.games_logged, + "start_time": datetime.now().isoformat(), + "steps": [], + "outcome": None, + "step_count": 0, + } + + def log_step( + self, + step_num: int, + player: int, + action: Action, + game: Game, + reward: float, + legal_action_count: int, + ) -> None: + """Log a single step of gameplay.""" + if self.current_game is None: + return + + step_info = { + "step": step_num, + "player": player, + "action": { + "type": action.action_type.name if hasattr(action.action_type, 'name') else str(action.action_type), + "card": self._card_to_dict(action.card) if action.card else None, + "target": self._card_to_dict(action.target) if action.target else None, + }, + "reward": float(reward), + "legal_actions_count": legal_action_count, + "state": self._get_game_state_snapshot(game, player), + } + + self.current_game["steps"].append(step_info) + self.current_game["step_count"] = step_num + + def end_game( + self, + game: Game, + winner: Optional[int], + reason: str, + step_count: int, + ) -> None: + """End current game and save log.""" + if self.current_game is None: + return + + self.current_game["outcome"] = { + "winner": winner, + "reason": reason, + "total_steps": step_count, + "final_scores": { + "player_0": game.game_state.get_player_score(0), + "player_1": game.game_state.get_player_score(1), + }, + "final_targets": { + "player_0": game.game_state.get_player_target(0), + "player_1": game.game_state.get_player_target(1), + }, + } + + # Save to file + filename = f"game_{self.games_logged:03d}_{reason}.json" + filepath = self.log_dir / filename + + with open(filepath, "w") as f: + json.dump(self.current_game, f, indent=2) + + print(f"📝 Saved gameplay log: {filepath}") + self.games_logged += 1 + self.current_game = None + + def _card_to_dict(self, card: Card) -> Dict[str, Any]: + """Convert card to dictionary.""" + return { + "rank": card.rank.name, + "suit": card.suit.name, + "display": str(card), + } + + def _get_game_state_snapshot(self, game: Game, current_player: int) -> Dict[str, Any]: + """Get snapshot of current game state.""" + return { + "current_player": current_player, + "scores": { + "player_0": game.game_state.get_player_score(0), + "player_1": game.game_state.get_player_score(1), + }, + "hand_sizes": { + "player_0": len(game.game_state.hands[0]), + "player_1": len(game.game_state.hands[1]), + }, + "field_cards": { + "player_0": [self._card_to_dict(c) for c in game.game_state.get_player_field(0)], + "player_1": [self._card_to_dict(c) for c in game.game_state.get_player_field(1)], + }, + "deck_size": len(game.game_state.deck), + "discard_size": len(game.game_state.discard_pile), + "resolving_one_off": game.game_state.resolving_one_off, + "resolving_three": game.game_state.resolving_three, + } + + def generate_summary(self) -> None: + """Generate a summary of all logged games.""" + if self.games_logged == 0: + print("No games logged yet.") + return + + summary = { + "total_games": self.games_logged, + "outcomes": {}, + "avg_steps": 0, + "timeout_rate": 0, + } + + total_steps = 0 + timeouts = 0 + 
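+        # Aggregation note: each saved game_XXX_<reason>.json written by
+        # end_game() is re-read below; the filename suffix is only used to
+        # locate the file, while the counted outcome comes from the "reason"
+        # field stored inside the JSON itself.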
+ for i in range(self.games_logged): + for reason in ["timeout", "win", "stalemate"]: + filepath = self.log_dir / f"game_{i:03d}_{reason}.json" + if filepath.exists(): + with open(filepath, "r") as f: + game_data = json.load(f) + reason = game_data["outcome"]["reason"] + summary["outcomes"][reason] = summary["outcomes"].get(reason, 0) + 1 + total_steps += game_data["outcome"]["total_steps"] + if reason == "timeout": + timeouts += 1 + break + + if self.games_logged > 0: + summary["avg_steps"] = total_steps / self.games_logged + summary["timeout_rate"] = timeouts / self.games_logged + + summary_path = self.log_dir / "summary.json" + with open(summary_path, "w") as f: + json.dump(summary, f, indent=2) + + print(f"\n{'='*60}") + print("GAMEPLAY SUMMARY") + print(f"{'='*60}") + print(f"Total games logged: {summary['total_games']}") + print(f"Average steps per game: {summary['avg_steps']:.1f}") + print(f"Timeout rate: {summary['timeout_rate']*100:.1f}%") + print("\nOutcomes:") + for outcome, count in summary["outcomes"].items(): + print(f" {outcome}: {count} ({count/self.games_logged*100:.1f}%)") + print(f"{'='*60}\n") + print(f"📁 Logs saved to: {self.log_dir.absolute()}") + print(f"{'='*60}\n") diff --git a/rl/hyperparameter_search.py b/rl/hyperparameter_search.py new file mode 100644 index 0000000..87277d6 --- /dev/null +++ b/rl/hyperparameter_search.py @@ -0,0 +1,324 @@ +"""Hyperparameter search for RL training.""" +from __future__ import annotations + +import json +import sys +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from sb3_contrib import MaskablePPO +from sb3_contrib.common.wrappers import ActionMasker +from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback +from stable_baselines3.common.monitor import Monitor + +from rl.cuttle_env import CuttleRLEnvironment +from rl.self_play_env import SelfPlayWrapper + + +def mask_fn(env): + """Function that returns action mask for MaskablePPO.""" + # Unwrap to get to the actual environment with action_masks method + while hasattr(env, 'env'): + if hasattr(env, 'action_masks'): + return env.action_masks() + env = env.env + return env.action_masks() + + +# Define hyperparameter configurations to test +EXPERIMENT_CONFIGS: List[Dict[str, Any]] = [ + { + "name": "baseline", + "description": "Current baseline configuration", + "training": { + "total_timesteps": 200_000, + "n_steps": 2048, + "batch_size": 64, + "learning_rate": 3e-4, + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 0.1, + "turn_penalty": -0.01, + }, + }, + { + "name": "high_progress_reward", + "description": "Emphasize progress toward winning", + "training": { + "total_timesteps": 200_000, + "n_steps": 2048, + "batch_size": 64, + "learning_rate": 3e-4, + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 10.0, # 100x increase + "turn_penalty": -0.5, # Penalize longer games + }, + }, + { + "name": "fast_learning", + "description": "Higher learning rate for faster initial learning", + "training": { + "total_timesteps": 200_000, + "n_steps": 1024, # Smaller steps + "batch_size": 128, # Larger batches + "learning_rate": 1e-3, # Higher LR + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 5.0, + "turn_penalty": 
-0.2, + }, + }, + { + "name": "conservative", + "description": "Lower LR, larger batches for stable learning", + "training": { + "total_timesteps": 200_000, + "n_steps": 4096, # Larger steps + "batch_size": 32, # Smaller batches + "learning_rate": 1e-4, # Lower LR + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 3.0, + "turn_penalty": -0.1, + }, + }, + { + "name": "aggressive_scoring", + "description": "Heavy emphasis on scoring points", + "training": { + "total_timesteps": 200_000, + "n_steps": 2048, + "batch_size": 64, + "learning_rate": 3e-4, + }, + "reward": { + "win": 100.0, + "loss": -100.0, + "stalemate": -50.0, + "invalid_action_penalty": -10.0, + "progress_multiplier": 20.0, # Very high + "turn_penalty": -1.0, # Strong penalty for long games + }, + }, +] + + +def run_experiment(config: Dict[str, Any], experiment_dir: Path) -> Dict[str, Any]: + """Run a single experiment with given configuration. + + Args: + config: Experiment configuration + experiment_dir: Directory to save experiment results + + Returns: + Dictionary with experiment results + """ + exp_name = config["name"] + print(f"\n{'='*70}") + print(f"EXPERIMENT: {exp_name}") + print(f"Description: {config['description']}") + print(f"{'='*70}\n") + + # Create experiment directory + exp_path = experiment_dir / exp_name + exp_path.mkdir(parents=True, exist_ok=True) + + # Save config + with open(exp_path / "config.json", "w") as f: + json.dump(config, f, indent=2) + + # Apply reward config (monkey patch for this experiment) + import rl.config as rl_config + for key, value in config["reward"].items(): + rl_config.REWARD_CONFIG[key] = value + + # Create environments with action masking + train_env = SelfPlayWrapper(CuttleRLEnvironment()) + train_env = Monitor(train_env, str(exp_path / "train")) + train_env = ActionMasker(train_env, mask_fn) # Critical: wrap with ActionMasker + + eval_env = SelfPlayWrapper(CuttleRLEnvironment()) + eval_env = Monitor(eval_env, str(exp_path / "eval")) + eval_env = ActionMasker(eval_env, mask_fn) # Critical: wrap with ActionMasker + + # Training parameters + training_config = config["training"] + + # Create model + model = MaskablePPO( + "MlpPolicy", + train_env, + n_steps=training_config["n_steps"], + batch_size=training_config["batch_size"], + learning_rate=training_config["learning_rate"], + verbose=1, + tensorboard_log=str(exp_path / "tensorboard"), + ) + + # Callbacks + checkpoint_callback = CheckpointCallback( + save_freq=10_000, + save_path=str(exp_path / "checkpoints"), + name_prefix=f"{exp_name}_model", + ) + + eval_callback = EvalCallback( + eval_env, + best_model_save_path=str(exp_path / "best_model"), + log_path=str(exp_path / "eval_logs"), + eval_freq=5_000, + deterministic=True, + render=False, + n_eval_episodes=10, + ) + + # Train + start_time = datetime.now() + print(f"Training started at {start_time.isoformat()}\n") + + model.learn( + total_timesteps=training_config["total_timesteps"], + callback=[checkpoint_callback, eval_callback], + progress_bar=True, + ) + + end_time = datetime.now() + duration = (end_time - start_time).total_seconds() + + print(f"\nTraining completed in {duration:.1f} seconds ({duration/60:.1f} minutes)") + + # Save final model + model.save(exp_path / "final_model") + + # Collect results + results = { + "name": exp_name, + "config": config, + "start_time": start_time.isoformat(), + "end_time": end_time.isoformat(), + "duration_seconds": duration, + "model_path": str(exp_path / 
"final_model.zip"), + "best_model_path": str(exp_path / "best_model" / "best_model.zip"), + } + + # Save results + with open(exp_path / "results.json", "w") as f: + json.dump(results, f, indent=2) + + # Cleanup + train_env.close() + eval_env.close() + + return results + + +def run_all_experiments( + configs: List[Dict[str, Any]], + base_dir: str = "rl/experiments", +) -> None: + """Run all experiments and save results. + + Args: + configs: List of experiment configurations + base_dir: Base directory for all experiments + """ + experiment_dir = Path(base_dir) / datetime.now().strftime("%Y%m%d_%H%M%S") + experiment_dir.mkdir(parents=True, exist_ok=True) + + print(f"\n{'='*70}") + print("HYPERPARAMETER SEARCH") + print(f"{'='*70}") + print(f"Running {len(configs)} experiments") + print(f"Results will be saved to: {experiment_dir.absolute()}") + print(f"{'='*70}\n") + + all_results = [] + + for i, config in enumerate(configs): + print(f"\nExperiment {i+1}/{len(configs)}") + try: + results = run_experiment(config, experiment_dir) + all_results.append(results) + except Exception as e: + print(f"❌ Experiment {config['name']} failed: {e}") + import traceback + traceback.print_exc() + continue + + # Save summary + summary = { + "timestamp": datetime.now().isoformat(), + "total_experiments": len(configs), + "successful_experiments": len(all_results), + "experiments": all_results, + } + + with open(experiment_dir / "summary.json", "w") as f: + json.dump(summary, f, indent=2) + + print(f"\n{'='*70}") + print("ALL EXPERIMENTS COMPLETED") + print(f"{'='*70}") + print(f"Results saved to: {experiment_dir.absolute()}") + print(f"Successful: {len(all_results)}/{len(configs)}") + print(f"{'='*70}\n") + + print("Next steps:") + print(f" 1. Compare results: python rl/compare_experiments.py {experiment_dir}") + print(f" 2. View tensorboard: tensorboard --logdir {experiment_dir}") + print(f" 3. 
Test best model: make debug-rl --model \n")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Run hyperparameter search")
+    parser.add_argument(
+        "--quick",
+        action="store_true",
+        help="Run quick experiments (50K timesteps each)",
+    )
+    parser.add_argument(
+        "--configs",
+        nargs="+",
+        help="Run only specific configs by name",
+    )
+
+    args = parser.parse_args()
+
+    # Filter configs if specified
+    configs = EXPERIMENT_CONFIGS
+    if args.configs:
+        configs = [c for c in configs if c["name"] in args.configs]
+        if not configs:
+            print(f"❌ No configs found matching: {args.configs}")
+            print(f"Available: {[c['name'] for c in EXPERIMENT_CONFIGS]}")
+            sys.exit(1)
+
+    # Reduce timesteps for quick mode
+    if args.quick:
+        print("🚀 Quick mode: Using 50K timesteps per experiment\n")
+        for config in configs:
+            config["training"]["total_timesteps"] = 50_000
+
+    run_all_experiments(configs)
diff --git a/rl/models/cuttle_rl_final.zip b/rl/models/cuttle_rl_final.zip
index d45786d..1eca565 100644
Binary files a/rl/models/cuttle_rl_final.zip and b/rl/models/cuttle_rl_final.zip differ
diff --git a/rl/monitor.py b/rl/monitor.py
new file mode 100644
index 0000000..f0efe1e
--- /dev/null
+++ b/rl/monitor.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+"""Simple console monitor for RL training progress."""
+import re
+import sys
+import time
+import os
+import glob
+
+MONITOR_CSV = os.path.join(os.path.dirname(os.path.abspath(__file__)), "logs", "monitor.csv")
+
+def find_latest_log() -> str:
+    """Find the most recent training log file."""
+    # Prefer explicit current log if present
+    current_log = "/tmp/train_current.log"
+    if os.path.exists(current_log) and os.path.getsize(current_log) > 100:
+        return current_log
+
+    # Use glob to find all training logs
+    all_logs = list(set(glob.glob("/tmp/train*.log")))
+
+    # Filter to logs with content and return most recently modified
+    valid = [
+        (path, os.path.getmtime(path))
+        for path in all_logs
+        if os.path.exists(path) and os.path.getsize(path) > 100
+    ]
+    if valid:
+        return max(valid, key=lambda entry: entry[1])[0]
+
+    # Fallback
+    return current_log
+
+
+def _read_monitor_episode_stats() -> dict:
+    """Read episode count and average length from Monitor CSV."""
+    if not os.path.exists(MONITOR_CSV):
+        return {"episodes": 0, "avg_len": 0.0}
+
+    with open(MONITOR_CSV, "r") as monitor_file:
+        lines = [line.strip() for line in monitor_file if line.strip()]
+
+    if not lines:
+        return {"episodes": 0, "avg_len": 0.0}
+
+    # Find last header line to avoid mixing runs
+    last_header_idx = 0
+    for idx, line in enumerate(lines):
+        if line.startswith("#"):
+            last_header_idx = idx
+
+    data_lines = lines[last_header_idx + 2 :]  # skip header + column line
+    lengths = []
+    for line in data_lines:
+        parts = line.split(",")
+        if len(parts) >= 2:
+            try:
+                lengths.append(float(parts[1]))
+            except ValueError:
+                continue
+
+    if not lengths:
+        return {"episodes": 0, "avg_len": 0.0}
+
+    return {"episodes": len(lengths), "avg_len": sum(lengths) / len(lengths)}
+
+def monitor(log_file: str | None = None, refresh: bool = False) -> None:
+    """Monitor training progress."""
+    if log_file is None:
+        log_file = find_latest_log()
+
+    while True:
+        if not os.path.exists(log_file):
+            print(f"Log file not found: {log_file}")
+            return
+
+        with open(log_file, 'r') as f:
+            content = f.read()
+
+        # Parse metrics
+        timesteps = re.findall(r'total_timesteps\s+\|\s+(\d+)', content)
+        ep_rew = re.findall(r'ep_rew_mean\s+\|\s+([-\d.]+)', content)
+        ep_len =
re.findall(r'ep_len_mean\s+\|\s+([\d.]+)', content) + fps_vals = re.findall(r'fps\s+\|\s+(\d+)', content) + time_elapsed = re.findall(r'time_elapsed\s+\|\s+(\d+)', content) + + total = 500000 + + # Clear screen if refreshing + if refresh: + print("\033[2J\033[H", end="") + + print("=" * 60) + print("🎮 CUTTLE RL TRAINING MONITOR") + print(f" Log: {os.path.basename(log_file)}") + print("=" * 60) + + if timesteps: + latest = int(timesteps[-1]) + pct = (latest / total) * 100 + bar = '█' * int(pct / 2) + '░' * (50 - int(pct / 2)) + + print(f"\n📊 Progress: {latest:,} / {total:,} ({pct:.1f}%)") + print(f" [{bar}]") + + if ep_rew: + current = float(ep_rew[-1]) + print(f"\n📈 Reward: {current:.2f}", end="") + if len(ep_rew) >= 5: + early = sum(float(r) for r in ep_rew[:5]) / 5 + recent = sum(float(r) for r in ep_rew[-5:]) / 5 + change = recent - early + print(f" (trend: {'+' if change > 0 else ''}{change:.2f})") + else: + print() + + if ep_len: + print(f"🎲 Episode Length: {float(ep_len[-1]):.1f} steps") + + if fps_vals and timesteps: + fps = int(fps_vals[-1]) + remaining = total - int(timesteps[-1]) + eta_min = (remaining / fps) / 60 if fps > 0 else 0 + print(f"⏱️ Speed: {fps:,}/s | ETA: {eta_min:.1f} min") + + # Calculate timeout stats + timeouts = content.count("TIMEOUT") + stalls = content.count("STALL") + invalid = content.count("Invalid action") + episode_stats = _read_monitor_episode_stats() + episodes = episode_stats["episodes"] + + if episodes > 0: + timeout_pct = (timeouts / episodes * 100) + stall_pct = (stalls / episodes * 100) + non_finish_pct = ((timeouts + stalls) / episodes * 100) + avg_len = episode_stats["avg_len"] + print(f"\n⚠️ Timeouts: {timeouts} / {episodes} games ({timeout_pct:.1f}%)") + if stalls > 0: + print(f"⚠️ Stalls: {stalls} / {episodes} games ({stall_pct:.1f}%)") + print(f"⚠️ Non-finish rate: {non_finish_pct:.1f}%") + print(f" Avg episode length: {avg_len:.1f} steps") + + # Visual indicator + if non_finish_pct > 90: + print(" Status: 🔴 Most games end without a winner") + elif non_finish_pct > 50: + print(" Status: 🟡 Many games stall/timeout") + elif non_finish_pct > 20: + print(" Status: 🟢 Good progress (agent winning more)") + else: + print(" Status: ✅ Excellent (agent wins most games)") + else: + print(f"\n⚠️ Timeouts: {timeouts}") + + if invalid > 0: + print(f"❌ Invalid actions: {invalid}") + + # Self-play info + model_probs = re.findall(r'opponent model prob = ([\d.]+)%', content) + if model_probs: + current_prob = float(model_probs[-1]) + print(f"\n🤖 Self-Play: {current_prob:.0f}% model opponent") + elif "Self-play initialized" in content: + print(f"\n🤖 Self-Play: Starting (0% model, 100% random)") + + print("=" * 60) + + if not refresh: + break + + time.sleep(5) + + +if __name__ == "__main__": + refresh = "--watch" in sys.argv or "-w" in sys.argv + monitor(refresh=refresh) diff --git a/rl/self_play_env.py b/rl/self_play_env.py index c9a8c36..94c0543 100644 --- a/rl/self_play_env.py +++ b/rl/self_play_env.py @@ -1,5 +1,5 @@ -"""Self-play wrapper with action masking support.""" -from typing import Any, Dict, Tuple +"""Self-play wrapper with action masking support and model-based opponent.""" +from typing import Any, Dict, Optional, Tuple import gymnasium as gym import numpy as np @@ -8,16 +8,61 @@ class SelfPlayWrapper(gym.Wrapper): - """Wrapper that enables self-play training with action masking.""" + """Wrapper that enables self-play training with action masking. 
- def __init__(self, env: CuttleRLEnvironment): - super().__init__(env) - self.opponent_policy = "random" # Strategy: "random" or future: "model" + Supports two opponent modes: + - "random": Opponent chooses randomly from legal actions (default for early training) + - "model": Opponent uses the trained model (true self-play) + """ + def __init__( + self, + env: CuttleRLEnvironment, + opponent_type: str = "random", + ): + super().__init__(env) + self.opponent_type = opponent_type + self._opponent_model = None + self._update_freq = 1000 # Update opponent model every N steps + self._steps_since_update = 0 + + def set_opponent_model(self, model) -> None: + """Set the model to use for opponent actions. + + Args: + model: A trained MaskablePPO model (or compatible) + """ + self._opponent_model = model + self.opponent_type = "model" + def action_masks(self) -> np.ndarray: """Forward action masks from wrapped environment.""" return self.env.action_masks() + def _get_opponent_action(self, mask: np.ndarray) -> int: + """Get opponent's action based on opponent_type.""" + legal_indices = np.where(mask)[0] + + if len(legal_indices) == 0: + return 0 # Fallback (shouldn't happen with proper masking) + + if self.opponent_type == "model" and self._opponent_model is not None: + # Use the model to predict action + obs = self.env._encode_state() + try: + action, _ = self._opponent_model.predict( + obs, + deterministic=False, # Add some exploration + action_masks=mask, + ) + return int(action) + except Exception: + # Fallback to random if prediction fails + return int(np.random.choice(legal_indices)) + else: + # Random opponent (default) + return int(np.random.choice(legal_indices)) + def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict[str, Any]]: """Execute agent's action, then opponent's action (both use masking).""" # Agent's move @@ -28,14 +73,73 @@ def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict[str, An # Opponent's turn with action masking opponent_mask = self.env.action_masks() - opponent_legal_indices = np.where(opponent_mask)[0] + opponent_action = self._get_opponent_action(opponent_mask) + + obs, opp_reward, done, truncated, info = self.env.step(opponent_action) - if len(opponent_legal_indices) > 0: - # Random opponent chooses from legal actions only - opponent_action = np.random.choice(opponent_legal_indices) - obs, opp_reward, done, truncated, info = self.env.step(opponent_action) - - # Flip reward: opponent's loss is agent's gain - reward = -opp_reward + # Flip reward: opponent's loss is agent's gain + reward = -opp_reward + + self._steps_since_update += 1 return obs, reward, done, truncated, info + + +class AdaptiveSelfPlayWrapper(SelfPlayWrapper): + """Self-play wrapper that gradually transitions from random to model opponent. + + Starts with random opponent and progressively increases model usage + based on training progress. 
+ """ + + def __init__( + self, + env: CuttleRLEnvironment, + model_prob_start: float = 0.0, + model_prob_end: float = 0.8, + transition_steps: int = 100000, + ): + super().__init__(env, opponent_type="adaptive") + self.model_prob_start = model_prob_start + self.model_prob_end = model_prob_end + self.transition_steps = transition_steps + self._total_steps = 0 + + def _get_model_probability(self) -> float: + """Get current probability of using model opponent.""" + if self._opponent_model is None: + return 0.0 + progress = min(1.0, self._total_steps / self.transition_steps) + return self.model_prob_start + progress * (self.model_prob_end - self.model_prob_start) + + def _get_opponent_action(self, mask: np.ndarray) -> int: + """Get opponent action, mixing random and model based on progress.""" + legal_indices = np.where(mask)[0] + + if len(legal_indices) == 0: + return 0 + + # Decide whether to use model or random + use_model = ( + self._opponent_model is not None + and np.random.random() < self._get_model_probability() + ) + + if use_model: + obs = self.env._encode_state() + try: + action, _ = self._opponent_model.predict( + obs, + deterministic=False, + action_masks=mask, + ) + return int(action) + except Exception: + return int(np.random.choice(legal_indices)) + else: + return int(np.random.choice(legal_indices)) + + def step(self, action: int) -> Tuple[np.ndarray, float, bool, bool, Dict[str, Any]]: + """Execute step and track total steps for adaptive scheduling.""" + self._total_steps += 1 + return super().step(action) diff --git a/rl/train.py b/rl/train.py index e0e3fbf..ee2095e 100644 --- a/rl/train.py +++ b/rl/train.py @@ -1,26 +1,208 @@ -"""Train RL agent for Cuttle game using MaskablePPO.""" +"""Train RL agent for Cuttle game using MaskablePPO with true self-play.""" +from __future__ import annotations import os +import numpy as np +import torch + from sb3_contrib import MaskablePPO -from stable_baselines3.common.callbacks import CheckpointCallback +from sb3_contrib.common.wrappers import ActionMasker +from stable_baselines3.common.callbacks import BaseCallback, CheckpointCallback +from stable_baselines3.common.env_util import make_vec_env +from stable_baselines3.common.logger import TensorBoardOutputFormat from stable_baselines3.common.monitor import Monitor from rl.config import LOG_DIR, MODEL_DIR, TRAINING_CONFIG from rl.cuttle_env import CuttleRLEnvironment -from rl.self_play_env import SelfPlayWrapper +from rl.self_play_env import AdaptiveSelfPlayWrapper + + +class ActivationLogger: + """Capture policy activations for TensorBoard logging.""" + + def __init__(self, policy: torch.nn.Module) -> None: + self._policy = policy + self._activations: dict[str, torch.Tensor] = {} + self._handles = [] + self._register_hooks() + + def _register_hooks(self) -> None: + for name, module in self._policy.named_modules(): + if isinstance(module, torch.nn.Linear): + handle = module.register_forward_hook(self._make_hook(name)) + self._handles.append(handle) + + def _make_hook(self, name: str): + def hook(_module, _inputs, output): + self._activations[name] = output.detach().cpu() + + return hook + + def clear(self) -> None: + self._activations.clear() + + def get(self) -> dict[str, torch.Tensor]: + return self._activations + + def close(self) -> None: + for handle in self._handles: + handle.remove() + self._handles.clear() + + +class DiagnosticsCallback(BaseCallback): + """Log action stats, masks, and activations to TensorBoard.""" + + def __init__(self, log_freq: int = 1000, activation_freq: int 
= 5000) -> None: + super().__init__() + self.log_freq = log_freq + self.activation_freq = activation_freq + self._tb_writer = None + self._activation_logger: ActivationLogger | None = None + + def _on_training_start(self) -> None: + for fmt in self.logger.output_formats: + if isinstance(fmt, TensorBoardOutputFormat): + self._tb_writer = fmt.writer + break + self._activation_logger = ActivationLogger(self.model.policy) + + def _on_training_end(self) -> None: + if self._activation_logger: + self._activation_logger.close() + + def _get_action_mask(self) -> np.ndarray | None: + if not hasattr(self.training_env, "envs"): + return None + base_env = self.training_env.envs[0] + try: + return base_env.unwrapped.action_masks() + except Exception: + return None + + def _log_activations(self, obs: np.ndarray) -> None: + if not self._tb_writer or not self._activation_logger: + return + self._activation_logger.clear() + with torch.no_grad(): + obs_tensor, _ = self.model.policy.obs_to_tensor(obs) + self.model.policy(obs_tensor) + for name, activation in self._activation_logger.get().items(): + self._tb_writer.add_histogram( + f"activations/{name}", + activation, + self.num_timesteps, + ) + + def _on_step(self) -> bool: + if self.n_calls % self.log_freq != 0: + return True + + def to_numpy(data): + if torch.is_tensor(data): + return data.detach().cpu().numpy() + return np.asarray(data) + + actions = self.locals.get("actions") + if actions is not None: + actions_np = to_numpy(actions).flatten() + self.logger.record("rollout/action_mean", float(np.mean(actions_np))) + if self._tb_writer: + self._tb_writer.add_histogram( + "actions/selected", + actions_np, + self.num_timesteps, + ) + + values = self.locals.get("values") + if values is not None: + values_np = to_numpy(values) + self.logger.record("rollout/value_mean", float(np.mean(values_np))) + + mask = self._get_action_mask() + if mask is not None: + self.logger.record("rollout/legal_action_count", float(mask.sum())) + self.logger.record("rollout/legal_action_fraction", float(mask.mean())) + if self._tb_writer: + self._tb_writer.add_histogram( + "actions/mask", + mask.astype(np.int32), + self.num_timesteps, + ) + + if self.n_calls % self.activation_freq == 0: + obs = self.locals.get("new_obs") + if obs is None: + obs = self.locals.get("obs") + if obs is not None: + self._log_activations(obs) + + return True + + +class SelfPlayCallback(BaseCallback): + """Callback to update opponent model during training for true self-play.""" + + def __init__(self, self_play_env: AdaptiveSelfPlayWrapper, update_freq: int = 10000): + super().__init__() + self.self_play_env = self_play_env + self.update_freq = update_freq + self._last_update = 0 + + def _on_training_start(self) -> None: + # Set initial opponent model + self.self_play_env.set_opponent_model(self.model) + print("🎮 Self-play initialized: opponent will gradually use trained model") + + def _on_step(self) -> bool: + # Update opponent model periodically + if self.num_timesteps - self._last_update >= self.update_freq: + self.self_play_env.set_opponent_model(self.model) + self._last_update = self.num_timesteps + + # Log current model usage probability + prob = self.self_play_env._get_model_probability() + self.logger.record("self_play/model_prob", prob) + print(f"📊 Self-play update @ {self.num_timesteps}: opponent model prob = {prob:.1%}") + + return True + + +def mask_fn(env): + """Function that returns action mask for MaskablePPO.""" + # Unwrap to get to the actual environment with action_masks method + while 
hasattr(env, 'env'): + if hasattr(env, 'action_masks'): + return env.action_masks() + env = env.env + return env.action_masks() def main(): - """Main training function.""" + """Main training function with true self-play.""" + # Large action spaces can trip strict simplex validation in torch distributions. + torch.distributions.Distribution.set_default_validate_args(False) + # Create directories os.makedirs(MODEL_DIR, exist_ok=True) os.makedirs(LOG_DIR, exist_ok=True) - print("Initializing environment with action masking...") - # Create and wrap environment - env = CuttleRLEnvironment() - env = SelfPlayWrapper(env) - env = Monitor(env, LOG_DIR) + print("Initializing environment with action masking and self-play...") + + # Create base environment with adaptive self-play + # Strategy: Start random-only, gradually introduce model opponent + # This ensures agent learns to win before facing harder opponents + base_env = CuttleRLEnvironment() + self_play_env = AdaptiveSelfPlayWrapper( + base_env, + model_prob_start=0.0, # Start with 100% random opponent + model_prob_end=0.3, # End with only 30% model (mostly random for wins) + transition_steps=300000, # Very slow transition over 300K steps + ) + + # Wrap with Monitor and ActionMasker + env = Monitor(self_play_env, LOG_DIR) + env = ActionMasker(env, mask_fn) # Critical: wrap with ActionMasker print("Creating MaskablePPO model...") # Create MaskablePPO model (supports action masking) @@ -39,7 +221,7 @@ def main(): tensorboard_log=LOG_DIR, ) - # Setup checkpoint callback + # Setup callbacks checkpoint_callback = CheckpointCallback( save_freq=10000, save_path=MODEL_DIR, @@ -47,16 +229,22 @@ def main(): save_replay_buffer=False, save_vecnormalize=False, ) + diagnostics_callback = DiagnosticsCallback() + self_play_callback = SelfPlayCallback( + self_play_env, + update_freq=10000, # Update opponent model every 10K steps + ) # Train the model print(f"Starting training for {TRAINING_CONFIG['total_timesteps']} timesteps...") print("Using action masking - model will only consider legal actions!") + print("Using adaptive self-play - opponent gradually uses trained model!") print("Progress will be shown below. 
This may take 15-30 minutes.")
 
     model.learn(
         total_timesteps=TRAINING_CONFIG["total_timesteps"],
-        callback=checkpoint_callback,
-        progress_bar=True,
+        callback=[checkpoint_callback, diagnostics_callback, self_play_callback],
+        progress_bar=False,
     )
 
     # Save final model
diff --git a/rl/view_game.py b/rl/view_game.py
new file mode 100644
index 0000000..f7330fa
--- /dev/null
+++ b/rl/view_game.py
@@ -0,0 +1,126 @@
+"""Interactive viewer for logged RL games."""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, Dict
+
+
+def format_card(card_dict: Dict[str, Any]) -> str:
+    """Format a card dict as a short rank-plus-suit-symbol string."""
+    if not card_dict:
+        return "None"
+    suit_symbols = {
+        "DIAMONDS": "♦",
+        "HEARTS": "♥",
+        "SPADES": "♠",
+        "CLUBS": "♣",
+    }
+    return f"{card_dict['rank']}{suit_symbols.get(card_dict['suit'], '?')}"
+
+
+def display_game(game_file: Path) -> None:
+    """Display a game log in a readable format."""
+    with open(game_file, "r") as f:
+        game = json.load(f)
+
+    print(f"\n{'='*80}")
+    print(f"GAME {game['game_id']}")
+    print(f"{'='*80}")
+    print(f"Start: {game['start_time']}")
+    print(f"Outcome: {game['outcome']['reason'].upper()}")
+    if game['outcome']['winner'] is not None:
+        print(f"Winner: Player {game['outcome']['winner']}")
+    print(f"Total Steps: {game['outcome']['total_steps']}")
+    print(f"Final Scores: P0={game['outcome']['final_scores']['player_0']}, "
+          f"P1={game['outcome']['final_scores']['player_1']}")
+    print(f"{'='*80}\n")
+
+    # Display step by step
+    for i, step in enumerate(game['steps'][:50]):  # Show first 50 steps
+        player = step['player']
+        action = step['action']
+        state = step['state']
+
+        print(f"Step {step['step']:3d} | P{player} | {action['type']:15s}", end="")
+
+        if action['card']:
+            print(f" | Card: {format_card(action['card'])}", end="")
+        if action['target']:
+            print(f" | Target: {format_card(action['target'])}", end="")
+
+        print(f" | Score: P0={state['scores']['player_0']:2d} P1={state['scores']['player_1']:2d}", end="")
+        print(f" | Hands: P0={state['hand_sizes']['player_0']} P1={state['hand_sizes']['player_1']}", end="")
+        print(f" | Deck: {state['deck_size']:2d}")
+
+        # Show field state every 10 steps
+        if (i + 1) % 10 == 0:
+            print(f"    {'─'*72}")
+            p0_field = [format_card(c) for c in state['field_cards']['player_0']]
+            p1_field = [format_card(c) for c in state['field_cards']['player_1']]
+            print(f"    P0 Field: {', '.join(p0_field) if p0_field else '(empty)'}")
+            print(f"    P1 Field: {', '.join(p1_field) if p1_field else '(empty)'}")
+            print(f"    {'─'*72}")
+
+    if len(game['steps']) > 50:
+        print(f"\n... ({len(game['steps']) - 50} more steps) ...\n")
+
+    # Show last 10 steps
+    print(f"{'─'*80}")
+    print("LAST 10 STEPS:")
+    print(f"{'─'*80}\n")
+    for step in game['steps'][-10:]:
+        player = step['player']
+        action = step['action']
+        state = step['state']
+
+        print(f"Step {step['step']:3d} | P{player} | {action['type']:15s}", end="")
+        if action['card']:
+            print(f" | {format_card(action['card'])}", end="")
+        print(f" | Score: P0={state['scores']['player_0']:2d} P1={state['scores']['player_1']:2d}")
+
+    print(f"\n{'='*80}\n")
+
+
+def main():
+    """Main function to view game logs."""
+    log_dir = Path("rl/gameplay_logs")
+
+    if not log_dir.exists():
+        print("❌ No logs found.
Run 'make debug-rl' first.") + return + + game_files = sorted(log_dir.glob("game_*.json")) + + if not game_files: + print("❌ No game logs found.") + return + + print(f"\nFound {len(game_files)} games:") + for i, game_file in enumerate(game_files): + print(f" {i}: {game_file.name}") + + print("\nEnter game number to view (or 'all' for all games, 'q' to quit):") + + while True: + choice = input("> ").strip().lower() + + if choice == 'q': + break + elif choice == 'all': + for game_file in game_files: + display_game(game_file) + break + else: + try: + game_num = int(choice) + if 0 <= game_num < len(game_files): + display_game(game_files[game_num]) + else: + print(f"Invalid game number. Choose 0-{len(game_files)-1}") + except ValueError: + print("Invalid input. Enter a number, 'all', or 'q'") + + +if __name__ == "__main__": + main() diff --git a/tests/test_action_mapping.py b/tests/test_action_mapping.py new file mode 100644 index 0000000..ec76444 --- /dev/null +++ b/tests/test_action_mapping.py @@ -0,0 +1,33 @@ +"""Sanity checks for fixed action mapping.""" +from game.game import Game +from rl.action_mapping import ( + ACTION_SPACE_SIZE, + action_index_to_action, + action_to_index, + build_action_map, + legal_action_mask_from_actions, +) + + +def test_action_mask_roundtrip() -> None: + """Ensure mask/index mapping round-trips for legal actions in a fresh game state.""" + game = Game(manual_selection=False, ai_player=None) + state = game.game_state + legal_actions = state.get_legal_actions() + + action_map = build_action_map(legal_actions) + mask = legal_action_mask_from_actions(legal_actions) + + assert mask.shape[0] == ACTION_SPACE_SIZE + assert int(mask.sum()) == len(action_map) + + for idx, action in action_map.items(): + assert mask[idx] + decoded = action_index_to_action(state, idx) + assert decoded is not None + assert action_to_index(decoded) == idx + + for action in legal_actions: + idx = action_to_index(action) + assert idx is not None + assert mask[idx]
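+
+
+# Hedged extra check (sketch): this assumes legal_action_mask_from_actions is a
+# pure function of the action list, i.e. rebuilding the mask for the same legal
+# actions yields an identical result.
+def test_action_mask_is_stable_for_same_actions() -> None:
+    """Rebuilding the mask for identical legal actions should not change it."""
+    game = Game(manual_selection=False, ai_player=None)
+    legal_actions = game.game_state.get_legal_actions()
+
+    first = legal_action_mask_from_actions(legal_actions)
+    second = legal_action_mask_from_actions(legal_actions)
+
+    assert (first == second).all()
+    assert all(0 <= idx < ACTION_SPACE_SIZE for idx in build_action_map(legal_actions))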