7 changes: 4 additions & 3 deletions README.md
@@ -43,8 +43,8 @@ Works with any OpenAI-compatible API: local servers (llama.cpp, Ollama, vLLM, LM
## Features

- Animated Sorting Hat with drop animation, blinking eyes, and streaming thought bubble
- Supports text files and images including JPEG, PNG, GIF, BMP, TIFF, WebP, and SVG (via vision/multimodal models)
- Auto-detects image files by extension
- Supports text files, images (JPEG, PNG, GIF, BMP, TIFF, WebP, SVG), and audio files (MP3, WAV, FLAC, OGG, AAC, M4A)
- Auto-detects image and audio files by magic bytes and extension
- Handles reasoning/thinking tokens from models like Qwen, DeepSeek, etc.
- Quiet mode for scripting (`--quiet` / `-q`)
- Configurable reasoning: guard clause defaults to no thinking, naming uses thinking. `--nothink` disables both, `--fullthink` enables both
@@ -62,6 +62,7 @@ Works with any OpenAI-compatible API: local servers (llama.cpp, Ollama, vLLM, LM
- An OpenAI-compatible LLM API endpoint
- For image naming: a vision-capable model (e.g., GPT-4o, LLaVA, Qwen-VL)
- Optional: `Pillow` (`pip install Pillow`) for EXIF metadata extraction from images
- Optional: `ffprobe` (from `ffmpeg`) for richer audio metadata (tags, duration, bitrate, codec)
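The audio-metadata path relies on `ffprobe`'s JSON output. As a minimal sketch of the fields involved, the snippet below parses a hard-coded sample shaped like `ffprobe -print_format json -show_format` output — the sample values are invented, and only a subset of the tags the script handles is shown:

```python
import json

# Hypothetical ffprobe-style output; a real file's output will differ.
sample = json.loads("""
{
  "format": {
    "duration": "215.3",
    "bit_rate": "192000",
    "tags": {"TITLE": "Demo Track", "artist": "Some Band"}
  }
}
""")

fmt = sample.get("format", {})
# Tag key case varies between containers, so normalise to lowercase
tags = {k.lower(): v for k, v in fmt.get("tags", {}).items()}

meta = {}
for key in ("title", "artist", "album"):
    if tags.get(key):
        meta[key] = tags[key]
if fmt.get("duration"):
    secs = float(fmt["duration"])
    meta["duration"] = f"{int(secs // 60)}:{int(secs % 60):02d}"  # mm:ss
if fmt.get("bit_rate"):
    meta["bitrate_kbps"] = str(int(fmt["bit_rate"]) // 1000)

print(meta)
```

Parsing a fixed sample keeps the sketch runnable without `ffmpeg` installed; the script itself invokes `ffprobe` via `subprocess` with a timeout.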

## Installation

@@ -188,7 +189,7 @@ done

1. **Guard clause**: Asks the LLM whether the current filename is already descriptive. If yes, skips the file. If no, the check conversation becomes context for the naming request (a two-turn conversation). Use `--force` to skip the check entirely.
2. **Metadata collection**: Gathers file metadata (size, modification date, MIME type, EXIF for images) to give the LLM more context. Use `--no-metadata` to skip.
3. **File analysis**: For text files, reads the first 4KB of content. For images, base64-encodes and sends via the OpenAI multimodal format.
3. **File analysis**: For text files, reads the first 4KB of content. For images, base64-encodes and sends via the OpenAI multimodal format. For audio files, extracts tags, duration, bitrate, and codec via `ffprobe` (if available) and passes them as text context — no audio bytes are sent to the LLM.
4. **LLM query**: Sends the content, metadata, and any user context (`--context`) to your configured LLM with a prompt asking for a descriptive kebab-case filename. When the guard clause ran first, this becomes a multi-turn conversation with richer context.
5. **Streaming display**: Shows the model's reasoning tokens in a speech bubble above the animated hat (supports both `reasoning_content` field and `<think>` tags).
6. **Name sanitization**: Cleans the response into a valid filename. When preserving extensions (default), the model only generates the name stem and the original extension is appended automatically.
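The two-turn flow from steps 1 and 4 can be sketched as a Chat Completions messages array; the prompt wording below is illustrative, not the script's actual prompts, and `local-model` is a placeholder name:

```python
# Guard-clause verdict (turn 1) stays in the conversation, so the naming
# request (turn 2) sees why the old name was judged non-descriptive.
check_question = "Is the filename 'IMG_4521.jpg' already descriptive? Answer yes or no."
check_verdict = "No. It is a generic camera default name."

messages = [
    {"role": "user", "content": check_question},
    {"role": "assistant", "content": check_verdict},
    {"role": "user", "content": "Suggest a descriptive kebab-case filename stem."},
]

# Payload shape for an OpenAI-compatible /v1/chat/completions endpoint.
payload = {"model": "local-model", "messages": messages, "stream": True}

print([m["role"] for m in messages])
```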
141 changes: 141 additions & 0 deletions hat
@@ -38,6 +38,7 @@ Options:
  --context, -c     Additional context to guide naming (e.g. "Q3 marketing")
  --no-metadata     Don't include file metadata in LLM context
  --force, -f       Skip name quality check, always suggest a new name
  --preview, -p     Show a content preview before suggesting a name

Environment:
  LLM_BASE_URL      OpenAI-compatible base URL (default: http://localhost:8080)
@@ -72,6 +73,30 @@ is_binary() {
    [[ "$mime" == "binary" ]]
}

# Detect audio by magic bytes first, then fall back to extension
is_audio() {
    local file="$1"
    local magic
    magic=$(head -c 12 "$file" 2>/dev/null | od -A n -t x1 -N 12 2>/dev/null | tr -d ' \n')
    # MP3: ID3 tag (49 44 33) or MPEG sync (ff fb / ff fa / ff f3 / ff f2)
    [[ "$magic" == 494433* ]] && return 0
    [[ "$magic" == fffb* || "$magic" == fffa* || "$magic" == fff3* || "$magic" == fff2* ]] && return 0
    # FLAC: 66 4c 61 43
    [[ "$magic" == 664c6143* ]] && return 0
    # OGG: 4f 67 67 53
    [[ "$magic" == 4f676753* ]] && return 0
    # AAC (ADTS): ff f1 / ff f9
    [[ "$magic" == fff1* || "$magic" == fff9* ]] && return 0
    # WAV: RIFF (52 49 46 46) + WAVE at offset 8 (57 41 56 45)
    [[ "$magic" == 52494646* ]] && [[ "${magic:16:8}" == "57415645" ]] && return 0
    # M4A / AAC in MP4: ftyp box — check bytes 4-7 for 66 74 79 70
    [[ "${magic:8:8}" == "66747970" ]] && return 0
    # Fall back to extension
    local ext="${file##*.}"
    ext="${ext,,}"
    [[ "$ext" =~ ^(mp3|wav|flac|ogg|aac|m4a|opus|wma|aiff|ape)$ ]]
}
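The hex-prefix checks above can be sanity-tested with a short sketch that mirrors the shell logic in Python; the byte strings below are constructed by hand, not read from real files:

```python
import binascii

def looks_like_audio(first12: bytes) -> bool:
    # Mirror of the shell logic: hex-encode the first 12 bytes, then prefix-match
    magic = binascii.hexlify(first12).decode()
    if magic.startswith(("494433", "664c6143", "4f676753")):  # ID3 / fLaC / OggS
        return True
    if magic.startswith(("fffb", "fffa", "fff3", "fff2", "fff1", "fff9")):
        return True  # MPEG audio or ADTS AAC sync
    if magic.startswith("52494646") and magic[16:24] == "57415645":
        return True  # "RIFF" + "WAVE" at byte offset 8
    if magic[8:16] == "66747970":
        return True  # MP4 "ftyp" box at byte offset 4 (M4A)
    return False

print(looks_like_audio(b"ID3\x04\x00\x00\x00\x00\x00\x00\x00\x00"))  # ID3v2 header
print(looks_like_audio(b"RIFF\x24\x08\x00\x00WAVE"))                  # WAV header
print(looks_like_audio(b"hello world!"))                              # plain text
```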

# Detect image by magic bytes first, then fall back to extension
is_image() {
    local file="$1"
@@ -124,6 +149,10 @@ if mode == "image":
        {'type': 'image_url', 'image_url': {'url': f'data:{mime};base64,{b64}'}},
        {'type': 'text', 'text': prompt}
    ]
elif mode == "audio":
    # Audio files can't be passed to the LLM directly — metadata was already
    # injected into the prompt by collect_metadata(), so just forward the prompt.
    content = prompt
else:
    with open(file, 'r', errors='replace') as f:
        content = f.read(4000)
@@ -219,6 +248,77 @@ CHECKEOF
    return 0  # needs renaming
}

# Print a content preview to stderr before processing.
# Text: first 8 lines. Audio: key tags. Image: dimensions + EXIF summary.
show_preview() {
    local file="$1" mode="$2"
    local fname
    fname=$(basename "$file")

    echo " ── preview: $fname ──" >&2

    if [[ "$mode" == "text" ]]; then
        head -8 "$file" 2>/dev/null | sed 's/^/ /' >&2
        local lines
        lines=$(wc -l < "$file" 2>/dev/null || echo "?")
        echo " [${lines} lines total]" >&2

    elif [[ "$mode" == "audio" ]]; then
        python3 - "$file" <<'PREVIEWEOF' >&2
import sys, subprocess, json
file = sys.argv[1]
try:
    result = subprocess.run(
        ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', file],
        capture_output=True, text=True, timeout=5
    )
    if result.returncode == 0:
        fmt = json.loads(result.stdout).get('format', {})
        tags = {k.lower(): v for k, v in fmt.get('tags', {}).items()}
        for key in ('title', 'artist', 'album', 'date', 'genre'):
            if tags.get(key):
                print(f" {key.capitalize()}: {tags[key]}")
        if fmt.get('duration'):
            secs = float(fmt['duration'])
            print(f" Duration: {int(secs//60)}:{int(secs%60):02d}")
    else:
        # A missing ffprobe raises FileNotFoundError (caught below);
        # reaching here means ffprobe ran but reported no format info.
        print(" (no audio metadata available)")
except Exception as e:
    print(f" (preview error: {e})")
PREVIEWEOF

    elif [[ "$mode" == "image" ]]; then
        python3 - "$file" <<'PREVIEWEOF' >&2
import sys
file = sys.argv[1]
try:
    from PIL import Image
    with Image.open(file) as img:
        print(f" Size: {img.width}x{img.height} Mode: {img.mode}")
        exif = img.getexif()
        if exif:
            from PIL.ExifTags import TAGS
            shown = 0
            for tid, val in exif.items():
                tag = TAGS.get(tid, tid)
                if tag in ('Make', 'Model', 'DateTime', 'Software') and isinstance(val, str):
                    print(f" {tag}: {val}")
                    shown += 1
                if shown >= 3:
                    break
except ImportError:
    import subprocess
    result = subprocess.run(['file', '--brief', '--mime-type', file],
                            capture_output=True, text=True)
    print(f" {result.stdout.strip()}")
except Exception as e:
    print(f" (preview error: {e})")
PREVIEWEOF
    fi

    echo "" >&2
}

# Collect file metadata as JSON
collect_metadata() {
    local file="$1" mode="$2"
@@ -246,6 +346,38 @@ if mode == 'image':
        if ed: meta['exif'] = ed
    except Exception:
        pass
elif mode == 'audio':
    import subprocess
    try:
        result = subprocess.run(
            ['ffprobe', '-v', 'quiet', '-print_format', 'json',
             '-show_format', '-show_streams', file],
            capture_output=True, text=True, timeout=10
        )
        if result.returncode == 0:
            probe = json.loads(result.stdout)
            fmt = probe.get('format', {})
            tags = fmt.get('tags', {})
            # Normalise tag keys to lowercase
            tags = {k.lower(): v for k, v in tags.items()}
            for key in ('title', 'artist', 'album', 'album_artist', 'date', 'genre', 'track', 'comment'):
                if tags.get(key):
                    meta[key] = tags[key][:100]
            if fmt.get('duration'):
                secs = float(fmt['duration'])
                meta['duration'] = f'{int(secs//60)}:{int(secs%60):02d}'
            if fmt.get('bit_rate'):
                meta['bitrate_kbps'] = str(int(fmt['bit_rate']) // 1000)
            # Codec from first audio stream
            for stream in probe.get('streams', []):
                if stream.get('codec_type') == 'audio':
                    if stream.get('codec_name'):
                        meta['codec'] = stream['codec_name']
                    if stream.get('sample_rate'):
                        meta['sample_rate_hz'] = stream['sample_rate']
                    break
    except Exception:
        pass
print(json.dumps(meta, ensure_ascii=False))
" "$file" "$mode"
}
@@ -734,13 +866,20 @@ process_file() {
    local mode="text"
    if [[ "$force_image" == "true" ]] || is_image "$file"; then
        mode="image"
    elif is_audio "$file"; then
        mode="audio"
    fi

    if [[ "$mode" == "text" ]] && is_binary "$file"; then
        echo " $basename: binary file, skipping (no analyzable content)" >&2
        return 0
    fi

    # Optional content preview
    if [[ "$PREVIEW" == "true" ]]; then
        show_preview "$file" "$mode"
    fi

    # Collect metadata
    local metadata=""
    if [[ "$INCLUDE_METADATA" == "true" ]]; then
@@ -848,6 +987,7 @@ QUIET=false
CONTEXT=""
INCLUDE_METADATA=true
FORCE=false
PREVIEW=false
FILES=()

while [[ $# -gt 0 ]]; do
@@ -866,6 +1006,7 @@ while [[ $# -gt 0 ]]; do
        --context|-c) CONTEXT="$2"; shift 2 ;;
        --no-metadata) INCLUDE_METADATA=false; shift ;;
        --force|-f) FORCE=true; shift ;;
        --preview|-p) PREVIEW=true; shift ;;
        -*) echo "Unknown option: $1" >&2; exit 1 ;;
        *) FILES+=("$1"); shift ;;
    esac
109 changes: 108 additions & 1 deletion test.sh
@@ -12,8 +12,9 @@ setup_file() {
    # Source testable bash functions
    eval "$(sed -n '/^sanitize_name/,/^}/p' "$HAT")"
    eval "$(sed -n '/^is_binary/,/^}/p' "$HAT")"
    eval "$(sed -n '/^is_audio/,/^}/p' "$HAT")"
    eval "$(sed -n '/^collect_metadata/,/^}/p' "$HAT")"
    export -f sanitize_name is_binary is_audio collect_metadata

    # Start mock LLM server that handles multi-turn conversations
    export MOCK_PORT=18950
@@ -223,3 +224,109 @@ teardown_file() {
    assert_output --partial "could not reach LLM"
    refute_output --partial "Traceback"
}

# ── Audio detection ──────────────────────────────────────────────────

@test "audio: mp3 is detected as audio" {
    run is_audio "$TEST_ASSETS/sample.mp3"
    assert_success
}

@test "audio: wav is detected as audio" {
    run is_audio "$TEST_ASSETS/sample.wav"
    assert_success
}

@test "audio: flac is detected as audio" {
    run is_audio "$TEST_ASSETS/sample.flac"
    assert_success
}

@test "audio: ogg is detected as audio" {
    run is_audio "$TEST_ASSETS/sample.ogg"
    assert_success
}

@test "audio: aac is detected as audio" {
    run is_audio "$TEST_ASSETS/sample.aac"
    assert_success
}

@test "audio: m4a is detected as audio" {
    run is_audio "$TEST_ASSETS/sample.m4a"
    assert_success
}

@test "audio: text file is not audio" {
    run is_audio "$TEST_ASSETS/sample.txt"
    assert_failure
}

@test "audio: image file is not audio" {
    run is_audio "$TEST_ASSETS/sample.jpg"
    assert_failure
}

# ── Audio metadata ───────────────────────────────────────────────────

@test "audio metadata: mp3 has size and modified" {
    run collect_metadata "$TEST_ASSETS/sample.mp3" "audio"
    assert_output --partial "size_bytes"
    assert_output --partial "modified"
}

@test "audio metadata: mp3 has mime_type" {
    run collect_metadata "$TEST_ASSETS/sample.mp3" "audio"
    assert_output --partial "mime_type"
}

# ── Audio integration ────────────────────────────────────────────────

@test "audio: mp3 is processed (not skipped as binary)" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.mp3' 2>/dev/null"
    assert_output "suggested-name.mp3"
}

@test "audio: flac is processed (not skipped as binary)" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.flac' 2>/dev/null"
    assert_output "suggested-name.flac"
}

@test "audio: wav is processed (not skipped as binary)" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.wav' 2>/dev/null"
    assert_output "suggested-name.wav"
}

# ── Preview flag ─────────────────────────────────────────────────────

@test "preview: --preview shows content preview on stderr for text file" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force --preview '$TEST_ASSETS/sample.txt' 2>&1 >/dev/null"
    assert_output --partial "preview:"
    assert_output --partial "sample.txt"
}

@test "preview: --preview shows line count for text file" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force --preview '$TEST_ASSETS/sample.txt' 2>&1 >/dev/null"
    assert_output --partial "lines total"
}

@test "preview: -p shorthand works" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force -p '$TEST_ASSETS/sample.txt' 2>&1 >/dev/null"
    assert_output --partial "preview:"
}

@test "preview: without --preview no preview header on stderr" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.txt' 2>&1 >/dev/null"
    refute_output --partial "preview:"
}

@test "preview: --preview still produces correct suggestion on stdout" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force --preview '$TEST_ASSETS/sample.txt' 2>/dev/null"
    assert_output "suggested-name.txt"
}

@test "preview: --preview works with audio file" {
    run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force --preview '$TEST_ASSETS/sample.mp3' 2>&1 >/dev/null"
    assert_output --partial "preview:"
    assert_output --partial "sample.mp3"
}