MiniMax-AI · BLUE-coconut · Apr 2, 2026
diff --git a/skills/minimax-multimodal-toolkit/SKILL.md b/skills/minimax-multimodal-toolkit/SKILL.md
diff --git a/skills/minimax-multimodal-toolkit/references/tts-guide.md b/skills/minimax-multimodal-toolkit/references/tts-guide.md
@@ -105,7 +105,7 @@ python scripts/tts/generate_voice.py generate segments.json -o output.mp3 --cros
 - **Endpoint**: `POST /v1/t2a_v2`
 - **Base URL**: `https://api.minimaxi.com`
 - **Auth**: `Authorization: Bearer {MINIMAX_API_KEY}`
-- **Models**: speech-2.8-hd (recommended), speech-2.8-turbo, speech-2.6-hd, speech-2.6-turbo, speech-02-hd, speech-02-turbo, speech-01-hd, speech-01-turbo
+- **Models**: speech-2.8-hd (recommended), speech-2.8-turbo
 - **Text limit**: 10,000 characters per request
 - **Pause marker**: `<#x#>` where x is seconds (0.01–99.99)
 - **Interjection tags** (speech-2.8 only): `(laughs)`, `(chuckle)`, `(coughs)`, `(sighs)`, `(breath)`, etc.
diff --git a/skills/minimax-multimodal-toolkit/references/tts-voice-catalog.md b/skills/minimax-multimodal-toolkit/references/tts-voice-catalog.md
@@ -521,8 +521,6 @@ voice = VoiceSetting(
 | `disgusted` | Repulsed | All |
 | `surprised` | Astonished | All |
 | `calm` | Neutral tone | All |
-| `fluent` | Natural, lively | speech-2.6 only |
-| `whisper` | Soft, gentle | speech-2.6 only |
 
 ---
 

diff --git a/skills/minimax-multimodal-toolkit/references/video-api.md b/skills/minimax-multimodal-toolkit/references/video-api.md
@@ -20,31 +20,24 @@
 ### Text-to-Video (T2V) Models
 | Model | Resolution | Duration | Notes |
 |-------|-----------|----------|-------|
-| MiniMax-Hailuo-2.3 | 768P (default), 1080P | 6s (1080P), 6/10s (768P) | Recommended, latest |
-| MiniMax-Hailuo-2.3-Fast | 768P (default), 1080P | 6s (1080P), 6/10s (768P) | Fast variant |
-| MiniMax-Hailuo-02 | 512P, 768P (default), 1080P | 6s (1080P), 6/10s (512P/768P) | Previous gen |
-| T2V-01-Director | 720P | 6s | Director control |
-| T2V-01 | 720P | 6s | Base model |
+| MiniMax-Hailuo-2.3-Fast | 768P | 6s | Fixed combo: 6s + 768P |
+| MiniMax-Hailuo-2.3 | 768P | 6s | Fixed combo: 6s + 768P |
 
 ### Image-to-Video (I2V) Models
 | Model | Resolution | Duration | Notes |
 |-------|-----------|----------|-------|
-| MiniMax-Hailuo-2.3 | 768P, 1080P | 6s | Recommended |
-| MiniMax-Hailuo-2.3-Fast | 768P, 1080P | 6s | Fast variant |
-| MiniMax-Hailuo-02 | 512P, 768P, 1080P | 6/10s | Previous gen |
-| I2V-01-Director | 720P | 6s | Director control |
-| I2V-01-live | 720P | 6s | Live photo style |
-| I2V-01 | 720P | 6s | Base model |
+| MiniMax-Hailuo-2.3-Fast | 768P | 6s | Fixed combo: 6s + 768P |
+| MiniMax-Hailuo-2.3 | 768P | 6s | Fixed combo: 6s + 768P |
 
 ### Start-End Frame Model
 | Model | Notes |
 |-------|-------|
-| MiniMax-Hailuo-02 | Only model supporting start-end frame |
+| MiniMax-Hailuo-2.3 | Supports start-end frame mode |
 
 ### Subject Reference Model
 | Model | Notes |
 |-------|-------|
-| S2V-01 | Face consistency across video |
+| MiniMax-Hailuo-2.3 | Use a supported duration + resolution combination (6s + 768P, as listed in the model tables above) |
 
 ---
 
@@ -56,7 +49,7 @@
 | model | string | Yes | - | Model name |
 | prompt | string | Depends | - | Video description, max 2000 chars |
 | duration | int | No | 6 | Video length in seconds |
-| resolution | string | No | 768P/720P | Video resolution |
+| resolution | string | No | 768P | Video resolution |
 | prompt_optimizer | bool | No | true | Auto-optimize prompt |
 | fast_pretreatment | bool | No | false | Shorten optimizer duration |
 | callback_url | string | No | - | Webhook URL |
@@ -89,19 +82,21 @@ Each object has `type` and `image` (array of image URLs):
 
 ## Camera Instructions
 
-Supported in `[指令]` syntax for Hailuo-2.3, Hailuo-02, and Director models:
+Supported in `[command]` syntax for Hailuo-2.3 models:
 
 | Category | Instructions |
 |----------|-------------|
-| Pan | `[左移]`, `[右移]` |
-| Rotation | `[左摇]`, `[右摇]` |
-| Push/Pull | `[推进]`, `[拉远]` |
-| Elevation | `[上升]`, `[下降]` |
-| Tilt | `[上摇]`, `[下摇]` |
-| Zoom | `[变焦推近]`, `[变焦拉远]` |
-| Other | `[晃动]`, `[跟随]`, `[固定]` |
-
-Combine for simultaneous: `[左摇,上升]` (max 3). Sequential: `...[推进], then ...[拉远]`
+| Truck (lateral) | `[Truck left]`, `[Truck right]` |
+| Pan (horizontal rotation) | `[Pan left]`, `[Pan right]` |
+| Push/Pull (depth) | `[Push in]`, `[Pull out]` |
+| Pedestal (vertical) | `[Pedestal up]`, `[Pedestal down]` |
+| Tilt (vertical rotation) | `[Tilt up]`, `[Tilt down]` |
+| Zoom (focal length) | `[Zoom in]`, `[Zoom out]` |
+| Shake | `[Shake]` |
+| Tracking | `[Tracking shot]` |
+| Static | `[Static shot]` |
+
+Combine for simultaneous: `[Pan left,Pedestal up]` (max 3). Sequential: `...[Push in], then ...[Pull out]`
 
 ---
 

diff --git a/skills/minimax-multimodal-toolkit/references/video-prompt-guide.md b/skills/minimax-multimodal-toolkit/references/video-prompt-guide.md
@@ -14,9 +14,9 @@ Examples:
 **Main subject + Scene + Movement + Camera motion + Aesthetic atmosphere**
 
 Examples:
-- "A couple sits on a park bench, warm golden hour lighting, [固定] framing, intimate and romantic atmosphere"
-- "A young man in a suit eats noodles at a street stall, [拉远] revealing the busy night market, warm tones, cinematic"
-- "A dancer performs contemporary dance in an empty studio, [跟随] smooth tracking, dramatic side lighting"
+- "A couple sits on a park bench, warm golden hour lighting, [Static shot] intimate and romantic atmosphere"
+- "A young man in a suit eats noodles at a street stall, [Pull out] revealing the busy night market, warm tones, cinematic"
+- "A dancer performs contemporary dance in an empty studio, [Tracking shot] smooth tracking, dramatic side lighting"
 
 ---
 
@@ -32,13 +32,13 @@ Examples:
 ## Camera Instructions Usage
 
 ### Simultaneous Camera Movement
-Place multiple instructions in one bracket:
-- `[左摇,上升]` — pan left while rising
-- `[推进,下摇]` — push in while tilting down
+Place multiple instructions in one bracket (max 3):
+- `[Pan left,Pedestal up]` — pan left while rising
+- `[Push in,Tilt down]` — push in while tilting down
 
 ### Sequential Camera Movement
 Place instructions at different points in the prompt:
-- "The camera starts with [推进] toward the face, then [拉远] to reveal the full scene"
+- "The camera starts with [Push in] toward the face, then [Pull out] to reveal the full scene"
 
 ---
 
@@ -75,8 +75,8 @@ Place instructions at different points in the prompt:
 ## Image-to-Video Prompt Tips
 
 Focus on **movement and change** since the image establishes the visual:
-- Image of still lake → "Gentle ripples spread across the water surface, a breeze rustles the trees, [固定] fixed camera, peaceful"
-- Image of portrait → "The person slowly smiles and turns their head, natural blinking, [推进] subtle push in, warm lighting"
+- Image of still lake → "Gentle ripples spread across the water surface, a breeze rustles the trees, [Static shot] peaceful"
+- Image of portrait → "The person slowly smiles and turns their head, natural blinking, [Push in] subtle push in, warm lighting"
 
 ---
 
@@ -85,7 +85,7 @@ Focus on **movement and change** since the image establishes the visual:
 1. **Subject**: Appearance, clothing, color, expression, posture
 2. **Action**: 1-2 key temporal actions ("first...then...")
 3. **Scene**: Setting with foreground + background + atmosphere
-4. **Camera**: `[运镜指令]` for precise control
+4. **Camera**: `[Camera command]` for precise control (e.g. `[Push in]`, `[Tracking shot]`, `[Pan left]`)
 5. **Aesthetic**: Lighting, color, texture, cinematic quality
 
 ## Common Mistakes

diff --git a/skills/minimax-multimodal-toolkit/scripts/image/generate_image.sh b/skills/minimax-multimodal-toolkit/scripts/image/generate_image.sh
@@ -44,7 +44,7 @@ image_to_data_url() {
   local mime
   mime="$(file -b --mime-type "$path" 2>/dev/null)" || mime="image/jpeg"
   local b64
-  b64="$(base64 -w 0 < "$path")"
+  b64="$(base64 < "$path" | tr -d '\n')"  # GNU base64 wraps at 76 columns; strip newlines so the data URL stays on one line
   echo "data:${mime};base64,${b64}"
 }
 
@@ -57,78 +57,6 @@ resolve_image() {
   esac
 }
 
-# ============================================================================
-# Payload builder — avoids command-line length limits on Windows
-# Uses temp files for jq when the payload may contain large base64 data.
-# ============================================================================
-
-# Build JSON payload, writing large fields (base64 image data) to temp files
-# to avoid Windows cmd.exe argument-length limits (~32KB).
-build_payload() {
-  local model="$1" prompt="$2" response_format="$3" n="$4"
-  local prompt_optimizer="$5" aigc_watermark="$6"
-  local aspect_ratio="$7" width="$8" height="$9" seed="${10:-}"
-  local ref_image="${11:-}"
-
-  # Start with base payload using temp file to avoid long command lines
-  local base_tmp
-  base_tmp="$(mktemp)"
-  trap "rm -f '$base_tmp'" EXIT INT TERM HUP
-
-  jq -n \
-    --arg model "$model" \
-    --arg prompt "$prompt" \
-    --arg rf "$response_format" \
-    --argjson n "$n" \
-    --argjson po "$prompt_optimizer" \
-    --argjson aw "$aigc_watermark" \
-    '{model: $model, prompt: $prompt, response_format: $rf, n: $n, prompt_optimizer: $po, aigc_watermark: $aw}' \
-    > "$base_tmp"
-
-  # Add optional fields, each via temp file to stay within Windows arg limits
-  if [[ -n "$aspect_ratio" ]]; then
-    local tmp2; tmp2="$(mktemp)"; trap "rm -f '$base_tmp' '$tmp2'" EXIT INT TERM HUP
-    jq --arg ar "$aspect_ratio" '. + {aspect_ratio: $ar}' "$base_tmp" > "$tmp2"
-    mv "$tmp2" "$base_tmp"
-  fi
-  if [[ -n "$width" ]]; then
-    local tmp2; tmp2="$(mktemp)"; trap "rm -f '$base_tmp' '$tmp2'" EXIT INT TERM HUP
-    jq --argjson w "$width" '. + {width: $w}' "$base_tmp" > "$tmp2"
-    mv "$tmp2" "$base_tmp"
-  fi
-  if [[ -n "$height" ]]; then
-    local tmp2; tmp2="$(mktemp)"; trap "rm -f '$base_tmp' '$tmp2'" EXIT INT TERM HUP
-    jq --argjson h "$height" '. + {height: $h}' "$base_tmp" > "$tmp2"
-    mv "$tmp2" "$base_tmp"
-  fi
-  if [[ -n "$seed" ]]; then
-    local tmp2; tmp2="$(mktemp)"; trap "rm -f '$base_tmp' '$tmp2'" EXIT INT TERM HUP
-    jq --argjson s "$seed" '. + {seed: $s}' "$base_tmp" > "$tmp2"
-    mv "$tmp2" "$base_tmp"
-  fi
-
-  # Subject reference (i2i mode) — build via temp file to avoid huge command-line args
-  if [[ -n "$ref_image" ]]; then
-    local img_url
-    img_url="$(resolve_image "$ref_image")"
-    # Create temp files and set traps separately to avoid set -u issues
-    local ref_tmp; ref_tmp="$(mktemp)"
-    trap "rm -f '$base_tmp' '$ref_tmp'" EXIT INT TERM HUP
-    local url_tmp; url_tmp="$(mktemp)"; trap "rm -f '$base_tmp' '$ref_tmp' '$url_tmp'" EXIT INT TERM HUP
-    # Write URL to temp file to avoid long-argument issues, then build JSON
-    echo -n "$img_url" > "$url_tmp"
-    # Use jq -s to collect all lines (handles base64 with embedded newlines), take first element
-    jq -Rs 'split("\n")[0] | {type: "character", image_file: .}' "$url_tmp" > "$ref_tmp"
-    local tmp2; tmp2="$(mktemp)"; trap "rm -f '$base_tmp' '$ref_tmp' '$url_tmp' '$tmp2'" EXIT INT TERM HUP
-    jq --slurpfile ref "$ref_tmp" '. + {subject_reference: $ref}' "$base_tmp" > "$tmp2"
-    mv "$tmp2" "$base_tmp"
-  fi
-
-  cat "$base_tmp"
-  rm -f "$base_tmp"
-  trap - EXIT INT TERM HUP
-}
-
 # ============================================================================
 # Main
 # ============================================================================
@@ -179,7 +107,7 @@ Options:
   -n, --count N         Number of images to generate (1-9, default: 1)
   --seed N              Random seed for reproducibility
   --prompt-optimizer    Enable automatic prompt optimization
-  --aigc-watermark     Add AIGC watermark to generated images
+  --aigc-watermark      Add AIGC watermark to generated images
   --ref-image FILE      Character reference image (local file or URL, i2i mode)
   --response-format FMT Response format: url (default), base64
   --no-download         Don't download, just print URL(s)
@@ -216,13 +144,31 @@ USAGE
     echo "Error: -n must be between 1 and 9" >&2; exit 1
   fi
 
-  # Build payload using temp-file method (avoids Windows cmd.exe arg-length limit)
+  # Build payload
   local payload
-  payload=$(build_payload \
-    "$model" "$prompt" "$response_format" "$n" \
-    "$prompt_optimizer" "$aigc_watermark" \
-    "$aspect_ratio" "$width" "$height" "$seed" \
-    "$ref_image")
+  payload=$(jq -n \
+    --arg model "$model" \
+    --arg prompt "$prompt" \
+    --arg rf "$response_format" \
+    --argjson n "$n" \
+    --argjson po "$prompt_optimizer" \
+    --argjson aw "$aigc_watermark" \
+    '{model: $model, prompt: $prompt, response_format: $rf, n: $n, prompt_optimizer: $po, aigc_watermark: $aw}')
+
+  [[ -n "$aspect_ratio" ]] && payload=$(echo "$payload" | jq --arg ar "$aspect_ratio" '. + {aspect_ratio: $ar}')
+  [[ -n "$width" ]] && payload=$(echo "$payload" | jq --argjson w "$width" '. + {width: $w}')
+  [[ -n "$height" ]] && payload=$(echo "$payload" | jq --argjson h "$height" '. + {height: $h}')
+  [[ -n "$seed" ]] && payload=$(echo "$payload" | jq --argjson s "$seed" '. + {seed: $s}')
+
+  # Subject reference (i2i mode): the data URL is fed to jq on stdin, not argv, because a base64 image can exceed the per-argument size limit
+  if [[ "$mode" == "i2i" ]]; then
+    if [[ -z "$ref_image" ]]; then
+      echo "Error: --ref-image is required for i2i mode" >&2; exit 1
+    fi
+    local img_url
+    img_url="$(resolve_image "$ref_image")"
+    payload=$(printf '%s' "$img_url" | jq -Rs --argjson base "$payload" '$base + {subject_reference: [{type: "character", image_file: .}]}')
+  fi
 
   local api_host="${MINIMAX_API_HOST:-https://api.minimaxi.com}"
   local api_url="${api_host}/v1/image_generation"
@@ -231,18 +177,13 @@ USAGE
   echo "Model: $model"
   echo "Generating $n image(s)..."
 
-  # Write payload to temp file to avoid command-line length limits
-  local payload_tmp; payload_tmp="$(mktemp)"
-  trap "rm -f '$payload_tmp'" EXIT INT TERM HUP
-  echo -n "$payload" > "$payload_tmp"
-
   local raw_output http_code response
   raw_output="$(curl -s -w "\n%{http_code}" \
     -X POST "$api_url" \
     -H "Authorization: Bearer ${MINIMAX_API_KEY}" \
     -H "Content-Type: application/json" \
     --max-time 120 \
-    -d "@$payload_tmp" 2>/dev/null)" || {
+    --data-binary @- <<< "$payload" 2>/dev/null)" || {  # body on stdin: a payload with an inline base64 image can exceed the per-argument size limit
     echo "Error: curl request failed" >&2
     exit 1
   }
@@ -262,7 +203,6 @@ USAGE
     local status_msg
     status_msg="$(echo "$response" | jq -r '.base_resp.status_msg // "Unknown error"')"
     echo "Error: API error (code $status_code): $status_msg" >&2
-    echo "Full response: $response" >&2
     exit 1
   fi
 

diff --git a/skills/minimax-multimodal-toolkit/scripts/video/generate_long_video.sh b/skills/minimax-multimodal-toolkit/scripts/video/generate_long_video.sh
@@ -54,6 +54,28 @@ check_api_key() {
   fi
 }
 
+# Abort unless (model, duration, resolution) is a combination this script allows.
+# Arguments: $1 model name, $2 segment duration in seconds, $3 resolution label
+# Outputs:   one-line error on stderr, then exits 1, when the combination is rejected
+validate_model_constraints() {
+  local requested_model="$1" secs="$2" res="$3"
+
+  case "$requested_model" in
+    MiniMax-Hailuo-2.3-Fast | MiniMax-Hailuo-2.3)
+      # Both known models accept exactly one combination: 6 seconds at 768P.
+      if [[ "$secs" == "6" && "$res" == "768P" ]]; then
+        return 0
+      fi
+      echo "Error: $requested_model only supports duration=6 and resolution=768P." >&2
+      exit 1
+      ;;
+  esac
+
+  # Any other name is not in the supported list.
+  echo "Error: Unsupported model '$requested_model'. Supported models: MiniMax-Hailuo-2.3-Fast, MiniMax-Hailuo-2.3." >&2
+  exit 1
+}
+
 image_to_data_url() {
   local path="$1"
   [[ -f "$path" ]] || { echo "Error: Image not found: $path" >&2; exit 1; }
@@ -300,7 +322,7 @@ main() {
   load_env
   check_api_key
 
-  local scenes=() model="" segment_duration=10 resolution="768P"
+  local scenes=() model="" segment_duration=6 resolution="768P"
   local first_frame="" subject_reference="" crossfade=0.5
   local music_prompt="" bgm_volume=0.3 fade_in=0 fade_out=0
   local output=""
@@ -334,8 +356,8 @@ Usage:
 Options:
   --scenes TEXT...          Scene prompts (2+ required)
   --model MODEL             Model name (default: auto)
-  --segment-duration SECS   Duration per segment (default: 10)
-  --resolution RES          Resolution: 768P, 1080P (default: 768P)
+  --segment-duration SECS   Duration per segment (default: 6; only 6 is accepted)
+  --resolution RES          Resolution: 768P (default: 768P; only 768P is accepted)
   --first-frame FILE        First frame for scene 1 (local file or URL)
   --subject-reference FILE  Subject reference image
   --crossfade SECS          Crossfade duration between scenes (default: 0.5)
@@ -362,6 +384,11 @@ USAGE
     echo "Error: --output / -o is required" >&2; exit 1
   fi
 
+  if [[ -z "$model" ]]; then
+    model="MiniMax-Hailuo-2.3"
+  fi
+  validate_model_constraints "$model" "$segment_duration" "$resolution"
+
   local output_dir
   output_dir="$(dirname "$output")"
   mkdir -p "$output_dir"
@@ -389,12 +416,6 @@ USAGE
 
     # Determine model
     local seg_model="$model"
-    if [[ -z "$seg_model" ]]; then
-      case "$seg_mode" in
-        t2v|i2v) seg_model="MiniMax-Hailuo-2.3" ;;
-        ref) seg_model="S2V-01" ;;
-      esac
-    fi
 
     # Build payload
     local payload