From 4fdf95a15c62278031cf4a20db5d1efe430d2f9a Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sat, 28 Mar 2026 19:51:54 +0000 Subject: [PATCH 1/2] feat: add audio file support (MP3, WAV, FLAC, OGG, AAC, M4A) Audio files were previously skipped as unreadable binaries. This adds: - `is_audio()` detection via magic bytes (ID3, RIFF/WAVE, fLaC, OggS, ADTS, M4A ftyp box) with extension fallback - Audio metadata extraction via `ffprobe`: title, artist, album, genre, year, duration, bitrate, codec, sample rate - `build_user_content()` audio mode passes rich metadata as text context so the LLM can suggest a name based on embedded tags - 13 new bats tests covering detection, metadata, and end-to-end naming - README updated with audio formats and ffprobe optional dependency Closes #19 --- README.md | 7 +++--- hat | 62 +++++++++++++++++++++++++++++++++++++++++++++ test.sh | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 140 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5b9060d..58eb25b 100644 --- a/README.md +++ b/README.md @@ -43,8 +43,8 @@ Works with any OpenAI-compatible API: local servers (llama.cpp, Ollama, vLLM, LM ## Features - Animated Sorting Hat with drop animation, blinking eyes, and streaming thought bubble -- Supports text files and images including JPEG, PNG, GIF, BMP, TIFF, WebP, and SVG (via vision/multimodal models) -- Auto-detects image files by extension +- Supports text files, images (JPEG, PNG, GIF, BMP, TIFF, WebP, SVG), and audio files (MP3, WAV, FLAC, OGG, AAC, M4A) +- Auto-detects image and audio files by magic bytes and extension - Handles reasoning/thinking tokens from models like Qwen, DeepSeek, etc. - Quiet mode for scripting (`--quiet` / `-q`) - Configurable reasoning: guard clause defaults to no thinking, naming uses thinking. 
`--nothink` disables both, `--fullthink` enables both @@ -62,6 +62,7 @@ Works with any OpenAI-compatible API: local servers (llama.cpp, Ollama, vLLM, LM - An OpenAI-compatible LLM API endpoint - For image naming: a vision-capable model (e.g., GPT-4o, LLaVA, Qwen-VL) - Optional: `Pillow` (`pip install Pillow`) for EXIF metadata extraction from images +- Optional: `ffprobe` (from `ffmpeg`) for richer audio metadata (tags, duration, bitrate, codec) ## Installation @@ -188,7 +189,7 @@ done 1. **Guard clause**: Asks the LLM whether the current filename is already descriptive. If yes, skips the file. If no, the check conversation becomes context for the naming request (two-turn multi-turn). Use `--force` to skip the check entirely. 2. **Metadata collection**: Gathers file metadata (size, modification date, MIME type, EXIF for images) to give the LLM more context. Use `--no-metadata` to skip. -3. **File analysis**: For text files, reads the first 4KB of content. For images, base64-encodes and sends via the OpenAI multimodal format. +3. **File analysis**: For text files, reads the first 4KB of content. For images, base64-encodes and sends via the OpenAI multimodal format. For audio files, extracts tags, duration, bitrate, and codec via `ffprobe` (if available) and passes them as text context — no audio bytes are sent to the LLM. 4. **LLM query**: Sends the content, metadata, and any user context (`--context`) to your configured LLM with a prompt asking for a descriptive kebab-case filename. When the guard clause ran first, this becomes a multi-turn conversation with richer context. 5. **Streaming display**: Shows the model's reasoning tokens in a speech bubble above the animated hat (supports both `reasoning_content` field and `<think>` tags). 6. **Name sanitization**: Cleans the response into a valid filename. When preserving extensions (default), the model only generates the name stem and the original extension is appended automatically. 
diff --git a/hat b/hat index 874aa2e..2039b35 100755 --- a/hat +++ b/hat @@ -72,6 +72,30 @@ is_binary() { [[ "$mime" == "binary" ]] } +# Detect audio by magic bytes first, then fall back to extension +is_audio() { + local file="$1" + local magic + magic=$(head -c 12 "$file" 2>/dev/null | od -A n -t x1 -N 12 2>/dev/null | tr -d ' \n') + # MP3: ID3 tag (49 44 33) or MPEG sync (ff fb / ff fa / ff f3 / ff f2) + [[ "$magic" == 494433* ]] && return 0 + [[ "$magic" == fffb* || "$magic" == fffa* || "$magic" == fff3* || "$magic" == fff2* ]] && return 0 + # FLAC: 66 4c 61 43 + [[ "$magic" == 664c6143* ]] && return 0 + # OGG: 4f 67 67 53 + [[ "$magic" == 4f676753* ]] && return 0 + # AAC (ADTS): ff f1 / ff f9 + [[ "$magic" == fff1* || "$magic" == fff9* ]] && return 0 + # WAV: RIFF (52 49 46 46) + WAVE at offset 8 (57 41 56 45) + [[ "$magic" == 52494646* ]] && [[ "${magic:16:8}" == "57415645" ]] && return 0 + # M4A / AAC in MP4: ftyp box — check bytes 4-7 for 66 74 79 70 + [[ "${magic:8:8}" == "66747970" ]] && return 0 + # Fall back to extension + local ext="${file##*.}" + ext="${ext,,}" + [[ "$ext" =~ ^(mp3|wav|flac|ogg|aac|m4a|opus|wma|aiff|ape)$ ]] +} + # Detect image by magic bytes first, then fall back to extension is_image() { local file="$1" @@ -124,6 +148,10 @@ if mode == "image": {'type': 'image_url', 'image_url': {'url': f'data:{mime};base64,{b64}'}}, {'type': 'text', 'text': prompt} ] +elif mode == "audio": + # Audio files can't be passed to the LLM directly — metadata was already + # injected into the prompt by collect_metadata(), so just forward the prompt. 
+ content = prompt else: with open(file, 'r', errors='replace') as f: content = f.read(4000) @@ -246,6 +274,38 @@ if mode == 'image': if ed: meta['exif'] = ed except Exception: pass +elif mode == 'audio': + import subprocess + try: + result = subprocess.run( + ['ffprobe', '-v', 'quiet', '-print_format', 'json', + '-show_format', '-show_streams', file], + capture_output=True, text=True, timeout=10 + ) + if result.returncode == 0: + probe = json.loads(result.stdout) + fmt = probe.get('format', {}) + tags = fmt.get('tags', {}) + # Normalise tag keys to lowercase + tags = {k.lower(): v for k, v in tags.items()} + for key in ('title', 'artist', 'album', 'album_artist', 'date', 'genre', 'track', 'comment'): + if tags.get(key): + meta[key] = tags[key][:100] + if fmt.get('duration'): + secs = float(fmt['duration']) + meta['duration'] = f'{int(secs//60)}:{int(secs%60):02d}' + if fmt.get('bit_rate'): + meta['bitrate_kbps'] = str(int(fmt['bit_rate']) // 1000) + # Codec from first audio stream + for stream in probe.get('streams', []): + if stream.get('codec_type') == 'audio': + if stream.get('codec_name'): + meta['codec'] = stream['codec_name'] + if stream.get('sample_rate'): + meta['sample_rate_hz'] = stream['sample_rate'] + break + except Exception: + pass print(json.dumps(meta, ensure_ascii=False)) " "$file" "$mode" } @@ -734,6 +794,8 @@ process_file() { local mode="text" if [[ "$force_image" == "true" ]] || is_image "$file"; then mode="image" + elif is_audio "$file"; then + mode="audio" fi if [[ "$mode" == "text" ]] && is_binary "$file"; then diff --git a/test.sh b/test.sh index dd77c14..182627a 100755 --- a/test.sh +++ b/test.sh @@ -12,8 +12,9 @@ setup_file() { # Source testable bash functions eval "$(sed -n '/^sanitize_name/,/^}/p' "$HAT")" eval "$(sed -n '/^is_binary/,/^}/p' "$HAT")" + eval "$(sed -n '/^is_audio/,/^}/p' "$HAT")" eval "$(sed -n '/^collect_metadata/,/^}/p' "$HAT")" - export -f sanitize_name is_binary collect_metadata + export -f sanitize_name is_binary 
is_audio collect_metadata # Start mock LLM server that handles multi-turn conversations export MOCK_PORT=18950 @@ -223,3 +224,75 @@ teardown_file() { assert_output --partial "could not reach LLM" refute_output --partial "Traceback" } + +# ── Audio detection ────────────────────────────────────────────────── + +@test "audio: mp3 is detected as audio" { + run is_audio "$TEST_ASSETS/sample.mp3" + assert_success +} + +@test "audio: wav is detected as audio" { + run is_audio "$TEST_ASSETS/sample.wav" + assert_success +} + +@test "audio: flac is detected as audio" { + run is_audio "$TEST_ASSETS/sample.flac" + assert_success +} + +@test "audio: ogg is detected as audio" { + run is_audio "$TEST_ASSETS/sample.ogg" + assert_success +} + +@test "audio: aac is detected as audio" { + run is_audio "$TEST_ASSETS/sample.aac" + assert_success +} + +@test "audio: m4a is detected as audio" { + run is_audio "$TEST_ASSETS/sample.m4a" + assert_success +} + +@test "audio: text file is not audio" { + run is_audio "$TEST_ASSETS/sample.txt" + assert_failure +} + +@test "audio: image file is not audio" { + run is_audio "$TEST_ASSETS/sample.jpg" + assert_failure +} + +# ── Audio metadata ─────────────────────────────────────────────────── + +@test "audio metadata: mp3 has size and modified" { + run collect_metadata "$TEST_ASSETS/sample.mp3" "audio" + assert_output --partial "size_bytes" + assert_output --partial "modified" +} + +@test "audio metadata: mp3 has mime_type" { + run collect_metadata "$TEST_ASSETS/sample.mp3" "audio" + assert_output --partial "mime_type" +} + +# ── Audio integration ──────────────────────────────────────────────── + +@test "audio: mp3 is processed (not skipped as binary)" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.mp3' 2>/dev/null" + assert_output "suggested-name.mp3" +} + +@test "audio: flac is processed (not skipped as binary)" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash 
'$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.flac' 2>/dev/null" + assert_output "suggested-name.flac" +} + +@test "audio: wav is processed (not skipped as binary)" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.wav' 2>/dev/null" + assert_output "suggested-name.wav" +} From 92dbfbc4674e68ec33804dcc349b306bda7d19e9 Mon Sep 17 00:00:00 2001 From: marksverdhei Date: Sat, 28 Mar 2026 21:50:02 +0000 Subject: [PATCH 2/2] feat: add --preview / -p flag for content preview MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #6 Before the hat animation runs, --preview prints a content snippet to stderr so you know what the file contains without opening it: - Text files: first 8 lines + total line count - Audio files: title, artist, album, duration from ffprobe tags - Images: dimensions, colour mode, key EXIF fields (Make, Model, DateTime) The preview goes to stderr only, so stdout output (the suggested name) is unaffected for scripting. Adds 6 new bats tests (44 total). Example output: ── preview: interview-recording.mp3 ── Title: Q3 Planning Call Artist: Zoom Recording Duration: 42:17 --- hat | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ test.sh | 34 +++++++++++++++++++++++++ 2 files changed, 113 insertions(+) diff --git a/hat b/hat index 2039b35..1ce7daa 100755 --- a/hat +++ b/hat @@ -38,6 +38,7 @@ Options: --context, -c Additional context to guide naming (e.g. "Q3 marketing") --no-metadata Don't include file metadata in LLM context --force, -f Skip name quality check, always suggest a new name + --preview, -p Show a content preview before suggesting a name Environment: LLM_BASE_URL OpenAI-compatible base URL (default: http://localhost:8080) @@ -247,6 +248,77 @@ CHECKEOF return 0 # needs renaming } +# Print a content preview to stderr before processing. +# Text: first 8 lines. Audio: key tags. Image: dimensions + EXIF summary. 
+show_preview() { + local file="$1" mode="$2" + local fname + fname=$(basename "$file") + + echo " ── preview: $fname ──" >&2 + + if [[ "$mode" == "text" ]]; then + head -8 "$file" 2>/dev/null | sed 's/^/ /' >&2 + local lines + lines=$(wc -l < "$file" 2>/dev/null || echo "?") + echo " [${lines} lines total]" >&2 + + elif [[ "$mode" == "audio" ]]; then + python3 - "$file" <<'PREVIEWEOF' >&2 +import sys, subprocess, json +file = sys.argv[1] +try: + result = subprocess.run( + ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', file], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + fmt = json.loads(result.stdout).get('format', {}) + tags = {k.lower(): v for k, v in fmt.get('tags', {}).items()} + for key in ('title', 'artist', 'album', 'date', 'genre'): + if tags.get(key): + print(f" {key.capitalize()}: {tags[key]}") + if fmt.get('duration'): + secs = float(fmt['duration']) + print(f" Duration: {int(secs//60)}:{int(secs%60):02d}") + else: + print(f" (ffprobe unavailable)") +except Exception as e: + print(f" (preview error: {e})") +PREVIEWEOF + + elif [[ "$mode" == "image" ]]; then + python3 - "$file" <<'PREVIEWEOF' >&2 +import sys +file = sys.argv[1] +try: + from PIL import Image + with Image.open(file) as img: + print(f" Size: {img.width}x{img.height} Mode: {img.mode}") + exif = img.getexif() + if exif: + from PIL.ExifTags import TAGS + shown = 0 + for tid, val in exif.items(): + tag = TAGS.get(tid, tid) + if tag in ('Make', 'Model', 'DateTime', 'Software') and isinstance(val, str): + print(f" {tag}: {val}") + shown += 1 + if shown >= 3: + break +except ImportError: + import subprocess + result = subprocess.run(['file', '--brief', '--mime-type', file], + capture_output=True, text=True) + print(f" {result.stdout.strip()}") +except Exception as e: + print(f" (preview error: {e})") +PREVIEWEOF + fi + + echo "" >&2 +} + # Collect file metadata as JSON collect_metadata() { local file="$1" mode="$2" @@ -803,6 +875,11 @@ 
process_file() { return 0 fi + # Optional content preview + if [[ "$PREVIEW" == "true" ]]; then + show_preview "$file" "$mode" + fi + # Collect metadata local metadata="" if [[ "$INCLUDE_METADATA" == "true" ]]; then @@ -910,6 +987,7 @@ QUIET=false CONTEXT="" INCLUDE_METADATA=true FORCE=false +PREVIEW=false FILES=() while [[ $# -gt 0 ]]; do @@ -928,6 +1006,7 @@ while [[ $# -gt 0 ]]; do --context|-c) CONTEXT="$2"; shift 2 ;; --no-metadata) INCLUDE_METADATA=false; shift ;; --force|-f) FORCE=true; shift ;; + --preview|-p) PREVIEW=true; shift ;; -*) echo "Unknown option: $1" >&2; exit 1 ;; *) FILES+=("$1"); shift ;; esac diff --git a/test.sh b/test.sh index 182627a..87f1624 100755 --- a/test.sh +++ b/test.sh @@ -296,3 +296,37 @@ teardown_file() { run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.wav' 2>/dev/null" assert_output "suggested-name.wav" } + +# ── Preview flag ───────────────────────────────────────────────────── + +@test "preview: --preview shows content preview on stderr for text file" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force --preview '$TEST_ASSETS/sample.txt' 2>&1 >/dev/null" + assert_output --partial "preview:" + assert_output --partial "sample.txt" +} + +@test "preview: --preview shows line count for text file" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force --preview '$TEST_ASSETS/sample.txt' 2>&1 >/dev/null" + assert_output --partial "lines total" +} + +@test "preview: -p shorthand works" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force -p '$TEST_ASSETS/sample.txt' 2>&1 >/dev/null" + assert_output --partial "preview:" +} + +@test "preview: without --preview no preview header on stderr" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force '$TEST_ASSETS/sample.txt' 2>&1 >/dev/null" + refute_output 
--partial "preview:" +} + +@test "preview: --preview still produces correct suggestion on stdout" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force --preview '$TEST_ASSETS/sample.txt' 2>/dev/null" + assert_output "suggested-name.txt" +} + +@test "preview: --preview works with audio file" { + run bash -c "LLM_BASE_URL=http://127.0.0.1:$MOCK_PORT bash '$HAT' --quiet --dry-run --force --preview '$TEST_ASSETS/sample.mp3' 2>&1 >/dev/null" + assert_output --partial "preview:" + assert_output --partial "sample.mp3" +}