From 218b24a5f283755a6174e7fb132bd4f8403d0e0c Mon Sep 17 00:00:00 2001 From: kalebbroo Date: Sat, 18 Apr 2026 12:52:12 -0400 Subject: [PATCH 1/4] Convert AceStep to use CPP instead of Python. --- AudioBackends/DynamicAudioBackend.cs | 59 +- AudioLabParams.cs | 222 +--- AudioProviderTypes/AudioProviderDefinition.cs | 3 + .../AudioProviderDefinitionBuilder.cs | 7 +- AudioProviders/AceStepProvider.cs | 60 +- AudioServices/AceStepCppManager.cs | 1179 +++++++++++++++++ AudioServices/AudioServerManager.cs | 6 + python_backend/docker/Dockerfile | 5 +- python_backend/engines/music_acestep.py | 274 ---- python_backend/test_engines.py | 2 +- 10 files changed, 1287 insertions(+), 530 deletions(-) create mode 100644 AudioServices/AceStepCppManager.cs delete mode 100644 python_backend/engines/music_acestep.py diff --git a/AudioBackends/DynamicAudioBackend.cs b/AudioBackends/DynamicAudioBackend.cs index e046c4b..870c082 100644 --- a/AudioBackends/DynamicAudioBackend.cs +++ b/AudioBackends/DynamicAudioBackend.cs @@ -1097,48 +1097,47 @@ private static Dictionary BuildEngineArgs(T2IParamInput input, A break; case "acestep_music": - // Core DiT params (acestep_music_params) - args["lyrics"] = input.TryGet(AudioLabParams.Lyrics, out string ly) ? ly : "[Instrumental]"; + // Resolve DiT GGUF filename from model config + quant level + string ditModel = modelDef?.EngineConfig?.TryGetValue("dit_model", out object dmObj) == true ? dmObj?.ToString() : "acestep-v15-turbo"; + string quantLevel = input.TryGet(AudioLabParams.ACEQuantLevel, out string ql) ? ql : "Q8_0"; + args["dit_file"] = AceStepCppManager.GetDitFileName(ditModel, quantLevel); + // AceRequest-compatible params (acestep_music_params) + args["caption"] = input.Get(T2IParamTypes.Prompt, ""); + string lyrics = input.TryGet(AudioLabParams.Lyrics, out string ly) ? ly : ""; + bool instrumental = input.TryGet(AudioLabParams.Instrumental, out string aceInst) && aceInst == "true"; + args["lyrics"] = instrumental ? 
"[Instrumental]" : lyrics; args["seed"] = input.TryGet(T2IParamTypes.Seed, out long aceSeed) ? aceSeed : -1L; - args["infer_step"] = input.TryGet(AudioLabParams.InferStep, out int infStep) ? infStep : 8; - args["guidance_scale"] = input.TryGet(AudioLabParams.ACEGuidanceScale, out double aceGuide) ? aceGuide : 7.0; - args["instrumental"] = input.TryGet(AudioLabParams.Instrumental, out string aceInst) ? aceInst : "false"; - args["bpm"] = input.TryGet(AudioLabParams.BPM, out int aceBpm) ? aceBpm : 120; - if (input.TryGet(AudioLabParams.KeyScale, out string aceKey) && !string.IsNullOrEmpty(aceKey)) - args["key_scale"] = aceKey; - args["time_signature"] = input.TryGet(AudioLabParams.TimeSignature, out string aceTs) ? aceTs : "4"; - args["vocal_language"] = input.TryGet(AudioLabParams.VocalLanguage, out string aceVl) ? aceVl : "en"; - args["shift"] = input.TryGet(AudioLabParams.ACEShift, out double aceShift) ? aceShift : 3.0; - args["infer_method"] = input.TryGet(AudioLabParams.InferMethod, out string aceIm) ? aceIm : "ode"; - args["use_adg"] = input.TryGet(AudioLabParams.UseADG, out string aceAdg) ? aceAdg : "false"; - args["cfg_interval_start"] = input.TryGet(AudioLabParams.CFGIntervalStart, out double aceCfgS) ? aceCfgS : 0.0; - args["cfg_interval_end"] = input.TryGet(AudioLabParams.CFGIntervalEnd, out double aceCfgE) ? aceCfgE : 1.0; - args["enable_normalization"] = input.TryGet(AudioLabParams.EnableNormalization, out string aceNorm) ? aceNorm : "true"; - args["normalization_db"] = input.TryGet(AudioLabParams.NormalizationDB, out double aceNormDb) ? aceNormDb : -14.0; - // LM planner params (acestep_lm_params) — TODO: integrate with SwarmUI AbstractLLMBackend + args["inference_steps"] = input.TryGet(AudioLabParams.InferStep, out int infStep) ? infStep : 0; + args["guidance_scale"] = input.TryGet(AudioLabParams.ACEGuidanceScale, out double aceGuide) ? aceGuide : 0.0; + args["bpm"] = input.TryGet(T2IParamTypes.Text2AudioBPM, out long aceBpm) ? 
(int)aceBpm : 0; + args["keyscale"] = input.TryGet(T2IParamTypes.Text2AudioKeyScale, out string aceKey) ? aceKey : ""; + args["timesignature"] = input.TryGet(T2IParamTypes.Text2AudioTimeSignature, out string aceTs) ? aceTs : ""; + args["vocal_language"] = input.TryGet(T2IParamTypes.Text2AudioLanguage, out string aceVl) ? aceVl : ""; + args["shift"] = input.TryGet(AudioLabParams.ACEShift, out double aceShift) ? aceShift : 0.0; + // LM planner params (acestep_lm_params) args["lm_model"] = input.TryGet(AudioLabParams.ACELMModel, out string aceLm) ? aceLm : "none"; - args["thinking"] = input.TryGet(AudioLabParams.Thinking, out string aceThink) ? aceThink : "true"; args["lm_temperature"] = input.TryGet(AudioLabParams.LMTemperature, out double aceLmTemp) ? aceLmTemp : 0.85; args["lm_cfg_scale"] = input.TryGet(AudioLabParams.LMCFGScale, out double aceLmCfg) ? aceLmCfg : 2.0; args["lm_top_k"] = input.TryGet(AudioLabParams.LMTopK, out int aceLmTopK) ? aceLmTopK : 0; args["lm_top_p"] = input.TryGet(AudioLabParams.LMTopP, out double aceLmTopP) ? aceLmTopP : 0.9; if (input.TryGet(AudioLabParams.LMNegativePrompt, out string aceLmNeg) && !string.IsNullOrEmpty(aceLmNeg)) args["lm_negative_prompt"] = aceLmNeg; - args["use_cot_metas"] = input.TryGet(AudioLabParams.UseCotMetas, out string aceCotM) ? aceCotM : "true"; - args["use_cot_caption"] = input.TryGet(AudioLabParams.UseCotCaption, out string aceCotC) ? aceCotC : "true"; - args["use_cot_language"] = input.TryGet(AudioLabParams.UseCotLanguage, out string aceCotL) ? aceCotL : "true"; + args["use_cot_caption"] = input.TryGet(AudioLabParams.UseCotCaption, out string aceCotC) && aceCotC == "true"; // Task params (acestep_task_params) - args["task_type"] = input.TryGet(AudioLabParams.ACETaskType, out string aceTask) ? aceTask : "text2music"; + string taskType = input.TryGet(AudioLabParams.ACETaskType, out string aceTask) ? 
aceTask : "text2music"; string aceSrcAudio = GetBase64Audio(input, AudioLabParams.ACESourceAudio); if (!string.IsNullOrEmpty(aceSrcAudio)) args["src_audio"] = aceSrcAudio; - string aceRefAudio = GetBase64Audio(input, AudioLabParams.ACEReferenceAudio); - if (!string.IsNullOrEmpty(aceRefAudio)) - args["reference_audio"] = aceRefAudio; - args["repaint_start"] = input.TryGet(AudioLabParams.RepaintStart, out double aceRpS) ? aceRpS : 0.0; - args["repaint_end"] = input.TryGet(AudioLabParams.RepaintEnd, out double aceRpE) ? aceRpE : -1.0; - args["cover_strength"] = input.TryGet(AudioLabParams.CoverStrength, out double aceCovStr) ? aceCovStr : 1.0; - args["cover_noise_strength"] = input.TryGet(AudioLabParams.CoverNoiseStrength, out double aceCovNs) ? aceCovNs : 0.0; + // Map task-specific fields into AceRequest + if (taskType == "cover") + args["audio_cover_strength"] = input.TryGet(AudioLabParams.CoverStrength, out double aceCovStr) ? aceCovStr : 0.5; + if (taskType == "repaint") + { + args["repainting_start"] = input.TryGet(AudioLabParams.RepaintStart, out double aceRpS) ? aceRpS : -1.0; + args["repainting_end"] = input.TryGet(AudioLabParams.RepaintEnd, out double aceRpE) ? aceRpE : -1.0; + } + if (taskType == "lego") + args["lego"] = "vocals"; // Default track; could add a param for this later break; case "musicgen_music": diff --git a/AudioLabParams.cs b/AudioLabParams.cs index 49d7db7..a364f72 100644 --- a/AudioLabParams.cs +++ b/AudioLabParams.cs @@ -16,6 +16,8 @@ public static class AudioLabParams public static T2IParamGroup STTGroup; /// Audio generation parameter group (music + sound effects). public static T2IParamGroup AudioGenGroup; + /// LM Planner parameter group for ACE-Step language model settings. + public static T2IParamGroup LMPlannerGroup; /// Voice reference parameter group for TTS voice cloning. public static T2IParamGroup VoiceRefGroup; /// Voice conversion parameter group (RVC, OpenVoice, GPT-SoVITS). 
@@ -239,6 +241,7 @@ public static class AudioLabParams #endregion #region Music — ACE-Step Core (flag: acestep_music_params) + // BPM, Key Scale, Time Signature, Language use built-in Text2Audio params (Text To Audio group). /// Song lyrics for ACE-Step generation. Feature flag: acestep_music_params. public static T2IRegisteredParam Lyrics; @@ -248,38 +251,17 @@ public static class AudioLabParams public static T2IRegisteredParam ACEGuidanceScale; /// Instrumental-only toggle for ACE-Step. Feature flag: acestep_music_params. public static T2IRegisteredParam Instrumental; - /// Beats per minute for ACE-Step music. Feature flag: acestep_music_params. - public static T2IRegisteredParam BPM; - /// Musical key and scale for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam KeyScale; - /// Musical time signature for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam TimeSignature; - /// Vocal language for ACE-Step music. Feature flag: acestep_music_params. - public static T2IRegisteredParam VocalLanguage; /// Noise schedule shift factor for ACE-Step. Feature flag: acestep_music_params. public static T2IRegisteredParam ACEShift; - /// ODE solver method for ACE-Step diffusion. Feature flag: acestep_music_params. - public static T2IRegisteredParam InferMethod; - /// Adaptive Diffusion Guidance toggle for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam UseADG; - /// CFG application interval start for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam CFGIntervalStart; - /// CFG application interval end for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam CFGIntervalEnd; - /// Output audio normalization toggle for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam EnableNormalization; - /// Target loudness in dBFS for ACE-Step normalization. Feature flag: acestep_music_params. 
- public static T2IRegisteredParam NormalizationDB; + /// GGUF quantization level for ACE-Step model downloads. Feature flag: acestep_music_params. + public static T2IRegisteredParam ACEQuantLevel; #endregion #region Music — ACE-Step LM Planner (flag: acestep_lm_params) - /// Language Model planner selection for ACE-Step. Feature flag: acestep_lm_params. - /// TODO: Integrate with SwarmUI AbstractLLMBackend when LLMAPI.cs is complete. + /// Language Model planner selection for ACE-Step. Feature flag: acestep_lm_params. public static T2IRegisteredParam ACELMModel; - /// Chain-of-thought reasoning toggle for ACE-Step LM. Feature flag: acestep_lm_params. - public static T2IRegisteredParam Thinking; /// Sampling temperature for ACE-Step LM planner. Feature flag: acestep_lm_params. public static T2IRegisteredParam LMTemperature; /// Classifier-free guidance scale for ACE-Step LM. Feature flag: acestep_lm_params. @@ -290,12 +272,8 @@ public static class AudioLabParams public static T2IRegisteredParam LMTopP; /// Negative prompt for ACE-Step LM planner. Feature flag: acestep_lm_params. public static T2IRegisteredParam LMNegativePrompt; - /// Meta tag inclusion in ACE-Step chain-of-thought. Feature flag: acestep_lm_params. - public static T2IRegisteredParam UseCotMetas; - /// Music caption inclusion in ACE-Step chain-of-thought. Feature flag: acestep_lm_params. + /// Music caption enrichment via chain-of-thought. Feature flag: acestep_lm_params. public static T2IRegisteredParam UseCotCaption; - /// Language detection inclusion in ACE-Step chain-of-thought. Feature flag: acestep_lm_params. - public static T2IRegisteredParam UseCotLanguage; #endregion @@ -303,18 +281,14 @@ public static class AudioLabParams /// ACE-Step generation task type. Feature flag: acestep_task_params. public static T2IRegisteredParam ACETaskType; - /// Source audio for ACE-Step cover/repaint/extract/lego/complete tasks. Feature flag: acestep_task_params. 
+ /// Source audio for ACE-Step cover/repaint/lego tasks. Feature flag: acestep_task_params. public static T2IRegisteredParam ACESourceAudio; - /// Style/timbre reference audio for ACE-Step. Feature flag: acestep_task_params. - public static T2IRegisteredParam ACEReferenceAudio; /// Repaint start time in seconds for ACE-Step. Feature flag: acestep_task_params. public static T2IRegisteredParam RepaintStart; /// Repaint end time in seconds for ACE-Step. Feature flag: acestep_task_params. public static T2IRegisteredParam RepaintEnd; /// Style transfer strength for ACE-Step cover task. Feature flag: acestep_task_params. public static T2IRegisteredParam CoverStrength; - /// Noise injection strength for ACE-Step cover task. Feature flag: acestep_task_params. - public static T2IRegisteredParam CoverNoiseStrength; #endregion @@ -433,6 +407,8 @@ public static void RegisterAll() Description: "Speech-to-text parameters. Upload audio to transcribe."); AudioGenGroup = new("Audio Generation", Open: true, OrderPriority: -25, Toggles: false, Description: "Audio generation parameters for music and sound effects. Describe what you want in the Prompt box above."); + LMPlannerGroup = new("LM Planner", Open: false, OrderPriority: -24.5, Toggles: false, + Description: "Language Model planner settings for ACE-Step. Controls how the LM enriches your prompt with structured music metadata."); CloneGroup = new("Voice Conversion", Open: true, OrderPriority: -24, Toggles: false, Description: "Voice conversion parameters. 
Provide source audio to convert and target voice reference."); AudioProcGroup = new("Audio Processing", Open: true, OrderPriority: -23, Toggles: false, @@ -902,218 +878,112 @@ public static void RegisterAll() "Generate instrumental-only track without vocals.", "false", GetValues: _ => ["false///No", "true///Yes"], - OrderPriority: -5, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); + IsAdvanced: true, OrderPriority: -5, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - BPM = T2IParamTypes.Register(new("BPM", - "Beats per minute for the generated music.", - "120", - Min: 30, Max: 300, Step: 1, ViewType: ParamViewType.SLIDER, - OrderPriority: -4, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - KeyScale = T2IParamTypes.Register(new("Key / Scale", - "Musical key and scale.\nLeave empty for auto-detection.", - "", - GetValues: _ => [ - "///Auto", - "C major///C Major", "C minor///C Minor", - "C# major///C# Major", "C# minor///C# Minor", - "D major///D Major", "D minor///D Minor", - "Eb major///Eb Major", "Eb minor///Eb Minor", - "E major///E Major", "E minor///E Minor", - "F major///F Major", "F minor///F Minor", - "F# major///F# Major", "F# minor///F# Minor", - "G major///G Major", "G minor///G Minor", - "Ab major///Ab Major", "Ab minor///Ab Minor", - "A major///A Major", "A minor///A Minor", - "Bb major///Bb Major", "Bb minor///Bb Minor", - "B major///B Major", "B minor///B Minor" - ], - OrderPriority: -3, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - TimeSignature = T2IParamTypes.Register(new("Time Signature", - "Musical time signature (beats per measure).", - "4", - GetValues: _ => [ - "4///4/4 (Common Time)", "3///3/4 (Waltz)", "2///2/4 (March)", "6///6/8 (Compound)" - ], - OrderPriority: -2, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - VocalLanguage = T2IParamTypes.Register(new("Vocal Language", - "Language for vocal content in the generated music.", - "en", - GetValues: _ => [ - 
"en///English", "zh///Chinese", "es///Spanish", "fr///French", - "de///German", "ja///Japanese", "ko///Korean", "pt///Portuguese", - "ru///Russian", "it///Italian", "ar///Arabic", "tr///Turkish", - "nl///Dutch", "pl///Polish", "sv///Swedish", "da///Danish", - "fi///Finnish", "no///Norwegian", "id///Indonesian", "vi///Vietnamese", - "th///Thai", "ms///Malay", "ro///Romanian", "cs///Czech", - "el///Greek", "hu///Hungarian", "uk///Ukrainian", "bg///Bulgarian", - "hr///Croatian", "sk///Slovak", "sl///Slovenian", "sr///Serbian", - "lt///Lithuanian", "lv///Latvian", "et///Estonian", "mk///Macedonian", - "sq///Albanian", "bs///Bosnian", "gl///Galician", "ka///Georgian", - "eu///Basque", "cy///Welsh", "ga///Irish", "mt///Maltese", - "is///Icelandic", "az///Azerbaijani", "kk///Kazakh", "uz///Uzbek", - "tg///Tajik", "mn///Mongolian" - ], - OrderPriority: -1, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); + // BPM removed — use T2IParamTypes.Text2AudioBPM (Text To Audio group) + // KeyScale removed — use T2IParamTypes.Text2AudioKeyScale (Text To Audio group) + // TimeSignature removed — use T2IParamTypes.Text2AudioTimeSignature (Text To Audio group) + // VocalLanguage removed — use T2IParamTypes.Text2AudioLanguage (Text To Audio group) ACEShift = T2IParamTypes.Register(new("Shift", "Noise schedule shift factor.\nHigher values increase generation diversity.", "3.0", Min: 1.0, Max: 5.0, Step: 0.1, ViewType: ParamViewType.SLIDER, - OrderPriority: 0, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - InferMethod = T2IParamTypes.Register(new("Infer Method", - "ODE solver method for diffusion inference.\nODE = deterministic. 
SDE = stochastic (more varied).", - "ode", - GetValues: _ => ["ode///ODE (Default)", "sde///SDE (Stochastic)"], - OrderPriority: 1, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - UseADG = T2IParamTypes.Register(new("Use ADG", - "Enable Adaptive Diffusion Guidance.\nCan improve prompt adherence for some models.", - "false", - GetValues: _ => ["false///No", "true///Yes"], - OrderPriority: 2, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - CFGIntervalStart = T2IParamTypes.Register(new("CFG Interval Start", - "Start of the CFG application interval.\n0.0 = apply from beginning of denoising.", - "0.0", - Min: 0.0, Max: 1.0, Step: 0.05, ViewType: ParamViewType.SLIDER, - OrderPriority: 3, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - CFGIntervalEnd = T2IParamTypes.Register(new("CFG Interval End", - "End of the CFG application interval.\n1.0 = apply through end of denoising.", - "1.0", - Min: 0.0, Max: 1.0, Step: 0.05, ViewType: ParamViewType.SLIDER, - OrderPriority: 4, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); + IsAdvanced: true, OrderPriority: 0, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - EnableNormalization = T2IParamTypes.Register(new("Normalize Audio", - "Normalize output audio to a target loudness level.", - "true", - GetValues: _ => ["true///Yes (Recommended)", "false///No"], - OrderPriority: 5, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - NormalizationDB = T2IParamTypes.Register(new("Normalization dB", - "Target loudness in dBFS when normalization is enabled.\n-14 dB is typical for streaming.", - "-14.0", - Min: -30.0, Max: 0.0, Step: 0.5, ViewType: ParamViewType.SLIDER, - OrderPriority: 6, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); + ACEQuantLevel = T2IParamTypes.Register(new("Quant Level", + "GGUF quantization level for model downloads.\nHigher = better quality but more VRAM/disk.", + "Q8_0", + GetValues: _ => [ + "Q8_0///Q8_0 (Best 
Quality)", "Q6_K///Q6_K (High)", + "Q5_K_M///Q5_K_M (Balanced)", "Q4_K_M///Q4_K_M (Smallest)", + "BF16///BF16 (Full Precision)" + ], + IsAdvanced: true, OrderPriority: 1, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); #endregion #region Music — ACE-Step LM Planner - // TODO: Integrate with SwarmUI's AbstractLLMBackend when LLMAPI.cs is complete. - // These params are registered and wired through BuildEngineArgs but the actual - // LM inference is stubbed in music_acestep.py until SwarmUI LLM integration is ready. + // ACE LM Model stays in Audio Generation group (it's the primary toggle) ACELMModel = T2IParamTypes.Register(new("ACE LM Model", - "Language Model planner for structured music metadata generation.\nRequires SwarmUI LLM backend integration (not yet available).", + "Language Model planner for structured music metadata generation.\nDownloads GGUF model on first use. Enriches prompt with lyrics, BPM, key, etc.", "none", GetValues: _ => [ - "none///None (Disabled)", "0.6B///Qwen3 0.6B (Fast)", - "1.7B///Qwen3 1.7B (Balanced)", "4B///Qwen3 4B (Best)" + "none///None (Disabled)", "lm-0.6B///Qwen3 0.6B (Fast, ~710MB)", + "lm-1.7B///Qwen3 1.7B (Balanced, ~2GB)", "lm-4B///Qwen3 4B (Best, ~4.5GB)" ], OrderPriority: -10, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); - Thinking = T2IParamTypes.Register(new("LM Thinking", - "Enable chain-of-thought reasoning in the LM planner.", - "true", - GetValues: _ => ["true///Yes", "false///No"], - OrderPriority: -9, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); - + // LM tuning params in dedicated LM Planner group LMTemperature = T2IParamTypes.Register(new("LM Temperature", "Sampling temperature for the LM planner.\nHigher = more creative metadata generation.", "0.85", Min: 0.0, Max: 2.0, Step: 0.05, ViewType: ParamViewType.SLIDER, - OrderPriority: -8, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); + OrderPriority: -8, Group: LMPlannerGroup, FeatureFlag: "acestep_lm_params")); 
LMCFGScale = T2IParamTypes.Register(new("LM CFG Scale", - "Classifier-free guidance scale for the LM planner.", + "Classifier-free guidance scale for the LM planner.\n1.0 = disabled.", "2.0", Min: 1.0, Max: 5.0, Step: 0.1, ViewType: ParamViewType.SLIDER, - OrderPriority: -7, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); + OrderPriority: -7, Group: LMPlannerGroup, FeatureFlag: "acestep_lm_params")); LMTopK = T2IParamTypes.Register(new("LM Top K", "Top-K sampling for the LM planner.\n0 = disabled.", "0", Min: 0, Max: 500, Step: 10, ViewType: ParamViewType.SLIDER, - OrderPriority: -6, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); + OrderPriority: -6, Group: LMPlannerGroup, FeatureFlag: "acestep_lm_params")); LMTopP = T2IParamTypes.Register(new("LM Top P", "Nucleus sampling threshold for the LM planner.", "0.9", Min: 0.0, Max: 1.0, Step: 0.05, ViewType: ParamViewType.SLIDER, - OrderPriority: -5, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); + OrderPriority: -5, Group: LMPlannerGroup, FeatureFlag: "acestep_lm_params")); LMNegativePrompt = T2IParamTypes.Register(new("LM Negative Prompt", "Negative prompt for the LM planner.\nDescribes unwanted characteristics to avoid.", "", - OrderPriority: -4, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); - - UseCotMetas = T2IParamTypes.Register(new("CoT Metas", - "Include meta tags (genre, mood, instruments) in chain-of-thought.", - "true", - GetValues: _ => ["true///Yes", "false///No"], - OrderPriority: -3, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); + OrderPriority: -4, Group: LMPlannerGroup, FeatureFlag: "acestep_lm_params")); UseCotCaption = T2IParamTypes.Register(new("CoT Caption", - "Include music description caption in chain-of-thought.", + "Enrich music description caption via chain-of-thought reasoning.", "true", GetValues: _ => ["true///Yes", "false///No"], - OrderPriority: -2, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); - - UseCotLanguage = 
T2IParamTypes.Register(new("CoT Language", - "Include language detection in chain-of-thought.", - "true", - GetValues: _ => ["true///Yes", "false///No"], - OrderPriority: -1, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); + OrderPriority: -2, Group: LMPlannerGroup, FeatureFlag: "acestep_lm_params")); #endregion #region Music — ACE-Step Tasks ACETaskType = T2IParamTypes.Register(new("Task Type", - "ACE-Step generation task type.\ntext2music = generate from prompt. cover = style transfer.\nrepaint = regenerate a section. extract = extract elements.\nlego = combine elements. complete = extend/continue.", + "ACE-Step generation task type.\ntext2music = generate from prompt.\ncover = style transfer with source audio.\nrepaint = regenerate a time region.\nlego = extract/isolate a track (vocals, drums, etc.).", "text2music", GetValues: _ => [ "text2music///Text to Music", "cover///Cover (Style Transfer)", - "repaint///Repaint (Section Regen)", "extract///Extract Elements", - "lego///Lego (Combine)", "complete///Complete (Extend)" + "repaint///Repaint (Section Regen)", "lego///Lego (Track Isolation)" ], OrderPriority: -10, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); ACESourceAudio = T2IParamTypes.Register(new("ACE Source Audio", - "Source audio for cover, repaint, extract, lego, and complete tasks.\nRequired for all tasks except text2music.", + "Source audio for cover, repaint, and lego tasks.\nRequired for all tasks except text2music.", null, OrderPriority: -9, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); - ACEReferenceAudio = T2IParamTypes.Register(new("Style Reference Audio", - "Optional style/timbre reference audio.\nThe generated music will match the style of this reference.", - null, - OrderPriority: -8, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); - RepaintStart = T2IParamTypes.Register(new("Repaint Start", "Start time in seconds for repaint task.\nThe section from this point will be regenerated.", "0.0", Min: 
0.0, Max: 600.0, Step: 0.5, ViewType: ParamViewType.SLIDER, - OrderPriority: -7, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); + IsAdvanced: true, OrderPriority: -7, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); RepaintEnd = T2IParamTypes.Register(new("Repaint End", "End time in seconds for repaint task.\n-1 = auto (repaint to end of audio).", "-1.0", Min: -1.0, Max: 600.0, Step: 0.5, ViewType: ParamViewType.SLIDER, - OrderPriority: -6, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); + IsAdvanced: true, OrderPriority: -6, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); CoverStrength = T2IParamTypes.Register(new("Cover Strength", - "Style transfer strength for cover task.\n1.0 = full transfer. Lower = more of original.", - "1.0", - Min: 0.0, Max: 1.0, Step: 0.05, ViewType: ParamViewType.SLIDER, - OrderPriority: -5, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); - - CoverNoiseStrength = T2IParamTypes.Register(new("Cover Noise", - "Noise injection strength for cover task.\nAdds variation to the style transfer.", - "0.0", + "Style transfer strength for cover task.\nFraction of DiT steps using source audio. 1.0 = full transfer.", + "0.5", Min: 0.0, Max: 1.0, Step: 0.05, ViewType: ParamViewType.SLIDER, - OrderPriority: -4, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); + IsAdvanced: true, OrderPriority: -5, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); #endregion diff --git a/AudioProviderTypes/AudioProviderDefinition.cs b/AudioProviderTypes/AudioProviderDefinition.cs index 483738b..2952fa9 100644 --- a/AudioProviderTypes/AudioProviderDefinition.cs +++ b/AudioProviderTypes/AudioProviderDefinition.cs @@ -47,6 +47,9 @@ public sealed class AudioProviderDefinition /// Whether this provider is API-based (no local models, requires API key). public bool IsApiProvider { get; init; } = false; + /// Whether this provider uses a native binary instead of a Python engine. 
+ public bool IsNativeBinary { get; init; } = false; + /// The key name used to look up the API key in user settings (e.g. "elevenlabs_api"). /// Used with user.GetGenericData(ApiKeySettingsId, "key"). public string ApiKeySettingsId { get; init; } = ""; diff --git a/AudioProviderTypes/AudioProviderDefinitionBuilder.cs b/AudioProviderTypes/AudioProviderDefinitionBuilder.cs index 0d0e2ac..14104f8 100644 --- a/AudioProviderTypes/AudioProviderDefinitionBuilder.cs +++ b/AudioProviderTypes/AudioProviderDefinitionBuilder.cs @@ -21,6 +21,7 @@ public sealed class AudioProviderDefinitionBuilder private string _engineGroup = "default"; private bool _requiresDocker = false; private bool _isApiProvider = false; + private bool _isNativeBinary = false; private string _apiKeySettingsId = ""; #endregion @@ -76,6 +77,9 @@ public AudioProviderDefinitionBuilder WithModelClass(string id, string name) /// Marks this provider as requiring Docker to run. public AudioProviderDefinitionBuilder WithRequiresDocker() { _requiresDocker = true; return this; } + /// Marks this provider as using a native binary instead of a Python engine. + public AudioProviderDefinitionBuilder WithNativeBinary() { _isNativeBinary = true; return this; } + /// Marks this provider as API-based (no local models, requires an API key). /// The key name for user settings lookup (e.g. "elevenlabs_api"). 
public AudioProviderDefinitionBuilder WithApiProvider(string apiKeySettingsId) @@ -94,7 +98,7 @@ public AudioProviderDefinition Build() { if (string.IsNullOrEmpty(_id)) throw new InvalidOperationException("Provider ID is required"); if (string.IsNullOrEmpty(_name)) throw new InvalidOperationException("Provider name is required"); - if (!_isApiProvider) + if (!_isApiProvider && !_isNativeBinary) { if (string.IsNullOrEmpty(_pythonModule)) throw new InvalidOperationException("Python module is required"); if (string.IsNullOrEmpty(_pythonEngineClass)) throw new InvalidOperationException("Python engine class is required"); @@ -117,6 +121,7 @@ public AudioProviderDefinition Build() EngineGroup = _engineGroup, RequiresDocker = _requiresDocker, IsApiProvider = _isApiProvider, + IsNativeBinary = _isNativeBinary, ApiKeySettingsId = _apiKeySettingsId }; } diff --git a/AudioProviders/AceStepProvider.cs b/AudioProviders/AceStepProvider.cs index e54ac58..3425070 100644 --- a/AudioProviders/AceStepProvider.cs +++ b/AudioProviders/AceStepProvider.cs @@ -3,7 +3,8 @@ namespace Hartsy.Extensions.AudioLab.AudioProviders; -/// ACE-Step 1.5 provider — SOTA music generation with lyrics alignment, 6 DiT variants, and optional LM planner. +/// ACE-Step 1.5 provider — SOTA music generation via acestep.cpp native binary with GGUF quantized models. +/// Supports 2B standard and 4B XL DiT variants, optional LM planner, and cover/repaint/lego tasks. public sealed class AceStepProvider : IAudioProviderSource { /// Singleton instance of the ACE-Step provider. 
@@ -14,63 +15,32 @@ public AudioProviderDefinition GetProvider() => AudioProviderDefinitionBuilder.C .WithId("acestep_music") .WithName("ACE-Step Music") .WithCategory(AudioCategory.AudioGeneration) - .WithPythonEngine("music_acestep", "AceStepEngine") + .WithNativeBinary() .WithModelPrefix("AceStep") .WithModelClass("acestep_music", "ACE-Step Music") .AddFeatureFlag("audiolab_audiogen") .AddFeatureFlag("acestep_music_params") .AddFeatureFlag("acestep_lm_params") .AddFeatureFlag("acestep_task_params") - .AddDependencies(Dependencies) + .AddFeatureFlag("text2audio") .AddModels(Models) - .WithEngineGroup("music") .Build(); - #region Dependencies - - private static PackageDefinition[] Dependencies => - [ - new() { Name = "numpy>=1.26.0", InstallName = "numpy>=1.26.0", ImportName = "numpy", Category = "core" }, - new() { Name = "torch==2.7.1+cu128", InstallName = "torch==2.7.1+cu128", ImportName = "torch", Category = "pytorch", EstimatedInstallTimeMinutes = 12, CustomInstallArgs = "--extra-index-url https://download.pytorch.org/whl/cu128" }, - new() { Name = "torchaudio==2.7.1+cu128", InstallName = "torchaudio==2.7.1+cu128", ImportName = "torchaudio", Category = "pytorch", EstimatedInstallTimeMinutes = 10, CustomInstallArgs = "--extra-index-url https://download.pytorch.org/whl/cu128" }, - new() { Name = "torchvision==0.22.1+cu128", InstallName = "torchvision==0.22.1+cu128", ImportName = "torchvision", Category = "pytorch", EstimatedInstallTimeMinutes = 8, CustomInstallArgs = "--extra-index-url https://download.pytorch.org/whl/cu128" }, - new() { Name = "ace-step", InstallName = "git+https://github.com/ace-step/ACE-Step.git", ImportName = "acestep", Category = "music", IsGitPackage = true, EstimatedInstallTimeMinutes = 15, CustomInstallArgs = "--no-deps" }, - new() { Name = "transformers>=4.51.0,<4.58.0", InstallName = "transformers>=4.51.0,<4.58.0", ImportName = "transformers", Category = "music" }, - new() { Name = "diffusers", InstallName = "diffusers", ImportName = 
"diffusers", Category = "music" }, - new() { Name = "einops>=0.8.1", InstallName = "einops>=0.8.1", ImportName = "einops", Category = "music" }, - new() { Name = "accelerate>=1.12.0", InstallName = "accelerate>=1.12.0", ImportName = "accelerate", Category = "music" }, - new() { Name = "peft>=0.18.0", InstallName = "peft>=0.18.0", ImportName = "peft", Category = "music" }, - new() { Name = "numba>=0.63.1", InstallName = "numba>=0.63.1", ImportName = "numba", Category = "music" }, - new() { Name = "torchcodec>=0.9.1", InstallName = "torchcodec>=0.9.1", ImportName = "torchcodec", Category = "music" }, - new() { Name = "vector-quantize-pytorch>=1.27.15", InstallName = "vector-quantize-pytorch>=1.27.15", ImportName = "vector_quantize_pytorch", Category = "music" }, - new() { Name = "safetensors==0.7.0", InstallName = "safetensors==0.7.0", ImportName = "safetensors", Category = "music" }, - new() { Name = "scipy>=1.10.1", InstallName = "scipy>=1.10.1", ImportName = "scipy", Category = "music" }, - new() { Name = "soundfile>=0.13.1", InstallName = "soundfile>=0.13.1", ImportName = "soundfile", Category = "core" }, - new() { Name = "loguru>=0.7.3", InstallName = "loguru>=0.7.3", ImportName = "loguru", Category = "music" }, - new() { Name = "pypinyin", InstallName = "pypinyin", ImportName = "pypinyin", Category = "music" }, - new() { Name = "hangul-romanize", InstallName = "hangul-romanize", ImportName = "hangul_romanize", Category = "music" }, - new() { Name = "num2words", InstallName = "num2words", ImportName = "num2words", Category = "music" }, - new() { Name = "cutlet", InstallName = "cutlet", ImportName = "cutlet", Category = "music" }, - new() { Name = "fugashi", InstallName = "fugashi[unidic-lite]", ImportName = "fugashi", Category = "music" }, - new() { Name = "spacy", InstallName = "spacy", ImportName = "spacy", Category = "music" }, - new() { Name = "librosa", InstallName = "librosa", ImportName = "librosa", Category = "music" }, - new() { Name = "py3langid", 
InstallName = "py3langid", ImportName = "py3langid", Category = "music" }, - new() { Name = "tqdm", InstallName = "tqdm", ImportName = "tqdm", Category = "core" }, - new() { Name = "opencc-python-reimplemented", InstallName = "opencc-python-reimplemented", ImportName = "opencc", Category = "music" } - ]; - - #endregion - #region Models private static AudioModelDefinition[] Models => [ - new() { Id = "turbo", Name = "ACE-Step 1.5 Turbo", Description = "Fast turbo model, 8 steps. Supports text2music, cover, repaint.", SourceUrl = "https://github.com/ace-step/ACE-Step-1.5", License = "MIT", EstimatedSize = "~4GB", EstimatedVram = "~8GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-turbo" } }, - new() { Id = "turbo-shift1", Name = "ACE-Step 1.5 Turbo Shift1", Description = "Turbo with shift=1 for enhanced diversity, 8 steps.", SourceUrl = "https://github.com/ace-step/ACE-Step-1.5", License = "MIT", EstimatedSize = "~4GB", EstimatedVram = "~8GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-turbo-shift1" } }, - new() { Id = "turbo-shift3", Name = "ACE-Step 1.5 Turbo Shift3", Description = "Turbo with shift=3 for high diversity, 8 steps.", SourceUrl = "https://github.com/ace-step/ACE-Step-1.5", License = "MIT", EstimatedSize = "~4GB", EstimatedVram = "~8GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-turbo-shift3" } }, - new() { Id = "turbo-continuous", Name = "ACE-Step 1.5 Turbo Continuous", Description = "Turbo with continuous noise schedule, 8 steps.", SourceUrl = "https://github.com/ace-step/ACE-Step-1.5", License = "MIT", EstimatedSize = "~4GB", EstimatedVram = "~8GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-turbo-continuous" } }, - new() { Id = "sft", Name = "ACE-Step 1.5 SFT", Description = "SFT model with CFG support, 50 steps. 
Supports text2music, cover, repaint, extract.", SourceUrl = "https://github.com/ace-step/ACE-Step-1.5", License = "MIT", EstimatedSize = "~4GB", EstimatedVram = "~8GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-sft" } }, - new() { Id = "base", Name = "ACE-Step 1.5 Base", Description = "Full base model with CFG, 50 steps. Supports all 6 task types.", SourceUrl = "https://github.com/ace-step/ACE-Step-1.5", License = "MIT", EstimatedSize = "~4GB", EstimatedVram = "~10GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-base" } } + // 2B Standard DiT variants + new() { Id = "turbo", Name = "ACE-Step 1.5 Turbo", Description = "Fast turbo model, 8 steps. Best for quick generation.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~2.5GB (Q8_0)", EstimatedVram = "~6GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-turbo" } }, + new() { Id = "turbo-shift1", Name = "ACE-Step 1.5 Turbo Shift1", Description = "Turbo with shift=1 for enhanced diversity, 8 steps.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~2.5GB (Q8_0)", EstimatedVram = "~6GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-turbo-shift1" } }, + new() { Id = "turbo-shift3", Name = "ACE-Step 1.5 Turbo Shift3", Description = "Turbo with shift=3 for high diversity, 8 steps.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~2.5GB (Q8_0)", EstimatedVram = "~6GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-turbo-shift3" } }, + new() { Id = "turbo-continuous", Name = "ACE-Step 1.5 Turbo Continuous", Description = "Turbo with continuous noise schedule, 8 steps.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~2.5GB (Q8_0)", EstimatedVram = "~6GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-turbo-continuous" } }, + new() { Id = "sft", Name = "ACE-Step 
1.5 SFT", Description = "SFT model with CFG support, 50 steps. Higher quality.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~2.5GB (Q8_0)", EstimatedVram = "~6GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-sft" } }, + new() { Id = "base", Name = "ACE-Step 1.5 Base", Description = "Full base model with CFG, 50 steps. Supports all task types.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~2.5GB (Q8_0)", EstimatedVram = "~6GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-base" } }, + // 4B XL DiT variants + new() { Id = "xl-turbo", Name = "ACE-Step 1.5 XL Turbo", Description = "4B XL turbo model, best quality + speed. Requires ~12GB VRAM.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~5.3GB (Q8_0)", EstimatedVram = "~12GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-xl-turbo" } }, + new() { Id = "xl-sft", Name = "ACE-Step 1.5 XL SFT", Description = "4B XL SFT model, highest quality. Requires ~12GB VRAM.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~5.3GB (Q8_0)", EstimatedVram = "~12GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-xl-sft" } }, + new() { Id = "xl-base", Name = "ACE-Step 1.5 XL Base", Description = "4B XL base model, highest quality + all tasks. 
Requires ~12GB VRAM.", SourceUrl = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF", License = "MIT", EstimatedSize = "~5.3GB (Q8_0)", EstimatedVram = "~12GB", EngineConfig = new() { ["dit_model"] = "acestep-v15-xl-base" } } ]; #endregion diff --git a/AudioServices/AceStepCppManager.cs b/AudioServices/AceStepCppManager.cs new file mode 100644 index 0000000..925a6e5 --- /dev/null +++ b/AudioServices/AceStepCppManager.cs @@ -0,0 +1,1179 @@ +using System.Diagnostics; +using System.IO; +using System.IO.Compression; +using System.Linq; +using System.Net.Http; +using System.Runtime.InteropServices; +using System.Text; +using Newtonsoft.Json.Linq; +using SwarmUI.Backends; +using SwarmUI.Utils; + +namespace Hartsy.Extensions.AudioLab.AudioServices; + +/// Manages the acestep.cpp native binary: download, model management, server lifecycle, and job-based API. +public sealed class AceStepCppManager : IDisposable +{ + #region Constants + + public const string GITHUB_VST3_API_URL = "https://api.github.com/repos/ace-step/acestep.vst3/releases/latest"; + public const string GITHUB_CPP_API_URL = "https://api.github.com/repos/ace-step/acestep.cpp/releases/latest"; + public const string FALLBACK_RELEASE_TAG = "v0.1.0"; + public const string FALLBACK_WIN_URL = "https://github.com/ace-step/acestep.vst3/releases/download/v0.1.0/acestep-windows-x64.zip"; + public const string FALLBACK_LINUX_URL = "https://github.com/ace-step/acestep.vst3/releases/download/v0.1.0/acestep-linux-x64.tar.gz"; + public const string FALLBACK_MAC_URL = "https://github.com/ace-step/acestep.vst3/releases/download/v0.1.0/acestep-macos-arm64-metal.tar.gz"; + /// Community-hosted Windows build (updated more frequently than GitHub releases). 
+    public const string SERVEURPERSO_WIN_BASE_URL = "https://www.serveurperso.com/temp/acestep.cpp-win64/build/Release/";
+    /// <summary>File names fetched one by one from <see cref="SERVEURPERSO_WIN_BASE_URL"/>.</summary>
+    private static readonly string[] ServeurpersoWinFiles =
+    [
+        "ace-server.exe", "ace-lm.exe", "ace-synth.exe", "ace-understand.exe",
+        "mp3-codec.exe", "neural-codec.exe", "quantize.exe",
+        "ggml.dll", "ggml-base.dll", "ggml-cuda.dll", "ggml-vulkan.dll",
+        "ggml-cpu-haswell.dll", "ggml-cpu-alderlake.dll", "ggml-cpu-cannonlake.dll",
+        "ggml-cpu-cascadelake.dll", "ggml-cpu-icelake.dll", "ggml-cpu-sandybridge.dll",
+        "ggml-cpu-skylakex.dll", "ggml-cpu-sse42.dll", "ggml-cpu-x64.dll"
+    ];
+    /// <summary>Base URL that GGUF model file names are appended to.</summary>
+    public const string HF_MODEL_BASE_URL = "https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF/resolve/main/";
+    /// <summary>File name of the version record stored next to the binary.</summary>
+    public const string VERSION_FILE = "acestep_version.json";
+    private const int JOB_POLL_INTERVAL_MS = 500;
+    private const int JOB_TIMEOUT_MS = 600_000; // 10 minutes max for generation
+    private const int HEALTH_CHECK_TIMEOUT_MS = 30_000;
+    private const int RETRY_503_MAX = 3;
+
+    #endregion
+
+    #region Singleton & State
+
+    private static readonly Lazy<AceStepCppManager> InstanceLazy = new(() => new AceStepCppManager());
+    /// <summary>Process-wide manager instance, created on first access.</summary>
+    public static AceStepCppManager Instance => InstanceLazy.Value;
+
+    private readonly HttpClient _httpClient;
+    /// <summary>Serializes server start-up so only one caller downloads and launches at a time.</summary>
+    private readonly SemaphoreSlim _startLock = new(1, 1);
+    private Process _serverProcess;
+    private int _port;
+    private bool _lmLoaded;
+    /// <summary>Cached server props (available models, etc) from /props endpoint.</summary>
+    private JObject _serverProps;
+
+    /// <summary>Ring buffer of recent stderr lines for crash diagnostics.</summary>
+    private readonly Queue<string> _recentStderr = new();
+    private readonly object _stderrLock = new();
+    private const int STDERR_BUFFER_SIZE = 50;
+    /// <summary>Last critical error detected in stderr (GGML assertion, CUDA failure, etc).</summary>
+    private volatile string _lastCriticalError;
+
+    private AceStepCppManager()
+    {
+        _httpClient = NetworkBackendUtils.MakeHttpClient();
+        _httpClient.Timeout = TimeSpan.FromMinutes(15);
+    }
+
+    #endregion
+
+    #region Paths
+
+    /// <summary>Root directory for the ace-server binary.</summary>
+    public static string BinaryRoot => Path.GetFullPath(Path.Combine("dlbackend", "acestep-cpp"));
+
+    /// <summary>Root directory for GGUF models.</summary>
+    public static string ModelRoot => Path.GetFullPath(Path.Combine("Models", "audio", "music", "acestep-gguf"));
+
+    /// <summary>Path to the ace-server executable.</summary>
+    public static string ServerExecutable => Path.Combine(BinaryRoot,
+        RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "ace-server.exe" : "ace-server");
+
+    /// <summary>Path to the version tracking file.</summary>
+    public static string VersionFilePath => Path.Combine(BinaryRoot, VERSION_FILE);
+
+    #endregion
+
+    #region Binary Management
+
+    /// <summary>Ensures the ace-server binary is downloaded and available. Also checks for updates.</summary>
+    /// <returns>Path to the server executable, or null when no binary could be obtained.</returns>
+    public async Task<string> EnsureBinaryAsync()
+    {
+        if (File.Exists(ServerExecutable))
+        {
+            // Binary exists — check for updates in the background (don't block startup)
+            _ = Task.Run(async () =>
+            {
+                try { await CheckForUpdateAsync(); }
+                catch (Exception ex) { Logs.Debug($"[AceStep] Update check failed: {ex.Message}"); }
+            });
+            return ServerExecutable;
+        }
+        Directory.CreateDirectory(BinaryRoot);
+        Logs.Info("[AceStep] Downloading ace-server binary...");
+        DownloadInfo info = await GetBinaryDownloadInfo();
+        // info is null when files were downloaded directly (serveurperso fallback)
+        // or when the platform has no pre-built binary; the file check tells them apart.
+        if (info is null)
+        {
+            if (File.Exists(ServerExecutable))
+            {
+                return ServerExecutable;
+            }
+            Logs.Error("[AceStep] No pre-built binary available for this platform.");
+            return null;
+        }
+        string executable = await DownloadAndExtractBinary(info);
+        if (executable is not null)
+        {
+            SaveVersionInfo(new VersionInfo
+            {
+                TagName = info.TagName,
+                // Recorded the same way ApplyUpdateAsync records archive installs.
+                Source = "github",
+                ExecutablePath = executable,
+                InstalledDate = DateTime.UtcNow,
+                LastUpdateCheck = DateTime.UtcNow
+            });
+            Logs.Info($"[AceStep] Installed ace-server {info.TagName}");
+        }
+        return executable;
+    }
+
+    /// <summary>Gets download info from GitHub releases, checking both repos before falling back.
+    /// Order: acestep.vst3 latest → acestep.cpp latest → serveurperso (Windows) → hardcoded v0.1.0.</summary>
+    /// <returns>Archive download info, or null when the community build was already placed in
+    /// <see cref="BinaryRoot"/> or the platform is unsupported.</returns>
+    private async Task<DownloadInfo> GetBinaryDownloadInfo()
+    {
+        string pattern = GetAssetPattern();
+        if (pattern is null)
+        {
+            return GetFallbackDownloadInfo();
+        }
+        // Try acestep.vst3 releases first, then acestep.cpp releases
+        string[] repoUrls = [GITHUB_VST3_API_URL, GITHUB_CPP_API_URL];
+        foreach (string repoUrl in repoUrls)
+        {
+            try
+            {
+                DownloadInfo info = await TryGetReleaseAssetAsync(repoUrl, pattern);
+                if (info is not null)
+                {
+                    return info;
+                }
+            }
+            catch (Exception ex)
+            {
+                Logs.Debug($"[AceStep] GitHub API check failed for {repoUrl}: {ex.Message}");
+            }
+        }
+        // On Windows, try the community-hosted build before falling back to v0.1.0
+        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+        {
+            Logs.Info("[AceStep] No Windows binary in GitHub releases, trying community build...");
+            bool downloaded = await DownloadServeurpersoFilesAsync();
+            if (downloaded)
+            {
+                return null; // Signal that binary is already in place (no archive to extract)
+            }
+        }
+        Logs.Info("[AceStep] Using v0.1.0 fallback");
+        return GetFallbackDownloadInfo();
+    }
+
+    /// <summary>Checks a single GitHub releases/latest endpoint for a matching platform asset.</summary>
+    /// <param name="apiUrl">GitHub "releases/latest" API URL to query.</param>
+    /// <param name="pattern">Substring the asset file name must contain (compared case-insensitively).</param>
+    /// <returns>Download info for the first matching asset that has a download URL, or null when the
+    /// request fails, the release lists no assets, or nothing matches.</returns>
+    private static async Task<DownloadInfo> TryGetReleaseAssetAsync(string apiUrl, string pattern)
+    {
+        using HttpRequestMessage request = new(HttpMethod.Get, apiUrl);
+        request.Headers.Add("User-Agent", "SwarmUI-AudioLab");
+        using HttpResponseMessage response = await Utilities.UtilWebClient.SendAsync(request);
+        if (!response.IsSuccessStatusCode)
+        {
+            return null;
+        }
+        JObject release = JObject.Parse(await response.Content.ReadAsStringAsync());
+        string tag = release["tag_name"]?.ToString();
+        if (release["assets"] is not JArray assets || assets.Count == 0)
+        {
+            return null;
+        }
+        foreach (JToken asset in assets)
+        {
+            string name = asset["name"]?.ToString();
+            if (name is null || !name.Contains(pattern, StringComparison.OrdinalIgnoreCase))
+            {
+                continue;
+            }
+            string downloadUrl = asset["browser_download_url"]?.ToString();
+            if (string.IsNullOrEmpty(downloadUrl))
+            {
+                // A name match without a URL cannot be downloaded; keep looking instead of
+                // handing the caller a DownloadInfo it would fail on later.
+                Logs.Debug($"[AceStep] Asset {name} ({tag}) has no download URL, skipping.");
+                continue;
+            }
+            Logs.Info($"[AceStep] Found binary {name} ({tag}) from {apiUrl}");
+            return new DownloadInfo
+            {
+                FileName = name,
+                DownloadUrl = downloadUrl,
+                // NOTE(review): assumes DownloadInfo.Size is a long — confirm against its declaration.
+                Size = asset["size"]?.ToObject<long>() ?? 0,
+                TagName = tag
+            };
+        }
+        return null;
+    }
+
+    /// <summary>Returns the asset filename pattern for the current OS.</summary>
+    /// <returns>"windows-x64", "linux-x64" or "macos-arm64"; null on any other platform.</returns>
+    private static string GetAssetPattern()
+    {
+        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) return "windows-x64";
+        if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) return "linux-x64";
+        if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX)) return "macos-arm64";
+        return null;
+    }
+
+    /// <summary>Returns hardcoded fallback download info for v0.1.0.</summary>
+    /// <returns>Download info for the current OS, or null (after logging an error) on an unsupported platform.</returns>
+    private static DownloadInfo GetFallbackDownloadInfo()
+    {
+        string url;
+        string fileName;
+        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+        {
+            url = FALLBACK_WIN_URL;
+            fileName = "acestep-windows-x64.zip";
+        }
+        else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+        {
+            url = FALLBACK_LINUX_URL;
+            fileName = "acestep-linux-x64.tar.gz";
+        }
+        else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+        {
+            url = FALLBACK_MAC_URL;
+            fileName = "acestep-macos-arm64-metal.tar.gz";
+        }
+        else
+        {
+            Logs.Error("[AceStep] Unsupported platform for ace-server binary.");
+            return null;
+        }
+        return new DownloadInfo { FileName = fileName, DownloadUrl = url, Size = 0, TagName = FALLBACK_RELEASE_TAG };
+    }
+
+    /// <summary>Downloads and extracts the binary archive.</summary>
+    /// <param name="info">Archive to fetch; its file name decides between zip and tar.gz extraction.</param>
+    /// <returns>Path to the server executable, or null when the download was empty or the
+    /// executable was not present after extraction.</returns>
+    private async Task<string> DownloadAndExtractBinary(DownloadInfo info)
+    {
+        string tempDir = Path.Combine(Path.GetTempPath(), "SwarmUI-AceStep");
+        Directory.CreateDirectory(tempDir);
+        string archivePath = Path.Combine(tempDir, info.FileName);
+        string extractDir = Path.Combine(tempDir, "extracted");
+        try
+        {
+            string sizeStr = info.Size > 0 ? $" ({info.Size / 1024.0 / 1024.0:F1} MB)" : "";
+            Logs.Info($"[AceStep] Downloading {info.FileName}{sizeStr}...");
+            await Utilities.DownloadFile(info.DownloadUrl, archivePath, (_, __, ___) => { });
+            if (!File.Exists(archivePath) || new FileInfo(archivePath).Length == 0)
+            {
+                Logs.Error("[AceStep] Download failed — file is empty or missing.");
+                return null;
+            }
+            Logs.Info("[AceStep] Extracting binary...");
+            // Start from a clean directory so files from an earlier attempt are not copied.
+            if (Directory.Exists(extractDir)) Directory.Delete(extractDir, true);
+            if (archivePath.EndsWith(".zip", StringComparison.OrdinalIgnoreCase))
+            {
+                ZipFile.ExtractToDirectory(archivePath, extractDir);
+            }
+            else
+            {
+                // tar.gz for Linux/macOS
+                Directory.CreateDirectory(extractDir);
+                await Utilities.QuickRunProcess("tar", ["-xzf", archivePath, "-C", extractDir]);
+            }
+            CopyExtractedFiles(extractDir, BinaryRoot);
+            if (!File.Exists(ServerExecutable))
+            {
+                Logs.Error($"[AceStep] Expected executable not found after extraction: {ServerExecutable}");
+                return null;
+            }
+            if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+            {
+                // Best effort: a failed chmod is logged, not fatal.
+                try
+                {
+                    using Process chmod = Process.Start("chmod", $"+x \"{ServerExecutable}\"");
+                    chmod?.WaitForExit();
+                }
+                catch (Exception ex)
+                {
+                    Logs.Debug($"[AceStep] chmod +x failed for {ServerExecutable}: {ex.Message}");
+                }
+            }
+            return ServerExecutable;
+        }
+        finally
+        {
+            // Temp cleanup is best effort; leftovers are removed or overwritten on the next run.
+            try { File.Delete(archivePath); } catch { }
+            try { Directory.Delete(extractDir, true); } catch { }
+        }
+    }
+
+    /// <summary>Downloads individual binary files from the community-hosted build server.</summary>
+    /// <returns>True when ace-server.exe is present afterwards; false when it is missing or the
+    /// download failed as a whole.</returns>
+    private async Task<bool> DownloadServeurpersoFilesAsync()
+    {
+        // Removes a zero-length or partial file so a later run does not mistake it for a finished download.
+        static void RemoveIncomplete(string path)
+        {
+            try
+            {
+                if (File.Exists(path)) File.Delete(path);
+            }
+            catch (Exception ex)
+            {
+                Logs.Debug($"[AceStep] Could not remove incomplete file {path}: {ex.Message}");
+            }
+        }
+
+        try
+        {
+            Directory.CreateDirectory(BinaryRoot);
+            int downloaded = 0;
+            foreach (string fileName in ServeurpersoWinFiles)
+            {
+                string localPath = Path.Combine(BinaryRoot, fileName);
+                if (File.Exists(localPath))
+                {
+                    if (new FileInfo(localPath).Length > 0)
+                    {
+                        downloaded++;
+                        continue;
+                    }
+                    // Empty leftover from an earlier failed attempt: fetch it again.
+                    RemoveIncomplete(localPath);
+                }
+                string url = SERVEURPERSO_WIN_BASE_URL + fileName;
+                Logs.Info($"[AceStep] Downloading {fileName}...");
+                try
+                {
+                    await Utilities.DownloadFile(url, localPath, (_, __, ___) => { });
+                    if (File.Exists(localPath) && new FileInfo(localPath).Length > 0)
+                    {
+                        downloaded++;
+                    }
+                    else
+                    {
+                        Logs.Warning($"[AceStep] Failed to download {fileName} (empty or missing)");
+                        RemoveIncomplete(localPath);
+                    }
+                }
+                catch (Exception ex)
+                {
+                    Logs.Warning($"[AceStep] Failed to download {fileName}: {ex.Message}");
+                    RemoveIncomplete(localPath);
+                }
+            }
+            if (File.Exists(ServerExecutable))
+            {
+                // Get Last-Modified from the server exe for future update checks
+                string lastModified = null;
+                try
+                {
+                    using HttpRequestMessage headReq = new(HttpMethod.Head, SERVEURPERSO_WIN_BASE_URL + "ace-server.exe");
+                    headReq.Headers.Add("User-Agent", "SwarmUI-AudioLab");
+                    using HttpResponseMessage headResp = await Utilities.UtilWebClient.SendAsync(headReq);
+                    lastModified = headResp.Content.Headers.LastModified?.ToString("R");
+                }
+                catch (Exception ex)
+                {
+                    // Without the header the install is still usable; the value stays null.
+                    Logs.Debug($"[AceStep] Could not read Last-Modified for ace-server.exe: {ex.Message}");
+                }
+                SaveVersionInfo(new VersionInfo
+                {
+                    TagName = "serveurperso-latest",
+                    Source = "serveurperso",
+                    ExecutablePath = ServerExecutable,
+                    InstalledDate = DateTime.UtcNow,
+                    LastUpdateCheck = DateTime.UtcNow,
+                    LastModified = lastModified
+                });
+                Logs.Info($"[AceStep] Installed ace-server from community build ({downloaded}/{ServeurpersoWinFiles.Length} files)");
+                return true;
+            }
+            Logs.Warning("[AceStep] Community build download incomplete — ace-server.exe not found");
+            return false;
+        }
+        catch (Exception ex)
+        {
+            Logs.Warning($"[AceStep] Community build download failed: {ex.Message}");
+            return false;
+        }
+    }
+
+    /// <summary>Recursively copies extracted files to the target directory.</summary>
+    /// <param name="source">Directory to copy from.</param>
+    /// <param name="target">Directory to copy into; created when missing, existing files are overwritten.</param>
+    private static void CopyExtractedFiles(string source, string target)
+    {
+        Directory.CreateDirectory(target);
+        foreach (string file in Directory.GetFiles(source))
+        {
+            File.Copy(file, Path.Combine(target, Path.GetFileName(file)), true);
+        }
+        foreach (string dir in Directory.GetDirectories(source))
+        {
+            CopyExtractedFiles(dir, Path.Combine(target, Path.GetFileName(dir)));
+        }
+    }
+
+    /// <summary>Loads saved version info from disk, or null if not available.</summary>
+    private static VersionInfo LoadVersionInfo()
+    {
+        try
+        {
+            if (!File.Exists(VersionFilePath)) return null;
+            string json = File.ReadAllText(VersionFilePath);
+            return JObject.Parse(json).ToObject<VersionInfo>();
+        }
+        catch (Exception ex)
+        {
+            // An unreadable or corrupt record is treated the same as no record.
+            Logs.Debug($"[AceStep] Could not read {VersionFilePath}: {ex.Message}");
+            return null;
+        }
+    }
+
+    /// <summary>Checks for a newer ace-server binary and downloads it if the server is not running.
+    /// Skips if checked within the last 24 hours.</summary>
+    private async Task CheckForUpdateAsync()
+    {
+        VersionInfo current = LoadVersionInfo();
+        if (current is not null && (DateTime.UtcNow - current.LastUpdateCheck).TotalHours < 24)
+        {
+            return; // Checked recently
+        }
+        Logs.Debug("[AceStep] Checking for ace-server updates...");
+        bool updateAvailable = false;
+        // Check serveurperso (Windows) — compare Last-Modified header
+        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+        {
+            updateAvailable = await CheckServeurpersoUpdateAsync(current);
+        }
+        // Check GitHub releases if serveurperso didn't find an update
+        if (!updateAvailable)
+        {
+            updateAvailable = await CheckGitHubUpdateAsync(current);
+        }
+        if (updateAvailable && !IsServerRunning())
+        {
+            Logs.Info("[AceStep] Newer binary available, downloading update...");
+            await ApplyUpdateAsync();
+            return;
+        }
+        if (updateAvailable)
+        {
+            Logs.Info("[AceStep] Newer ace-server binary available. Will update on next startup.");
+        }
+        // Whether or not an update is pending, record that the check ran.
+        if (current is not null)
+        {
+            current.LastUpdateCheck = DateTime.UtcNow;
+            SaveVersionInfo(current);
+        }
+    }
+
+    /// <summary>Checks serveurperso for a newer build by comparing Last-Modified header.</summary>
+    /// <param name="current">Stored version record, or null when none exists.</param>
+    /// <returns>True when the remote header differs from the stored one or nothing is stored;
+    /// false when they match, the header is absent, or the request fails.</returns>
+    private async Task<bool> CheckServeurpersoUpdateAsync(VersionInfo current)
+    {
+        try
+        {
+            using HttpRequestMessage req = new(HttpMethod.Head, SERVEURPERSO_WIN_BASE_URL + "ace-server.exe");
+            req.Headers.Add("User-Agent", "SwarmUI-AudioLab");
+            using HttpResponseMessage resp = await Utilities.UtilWebClient.SendAsync(req);
+            if (!resp.IsSuccessStatusCode) return false;
+            string remoteLastModified = resp.Content.Headers.LastModified?.ToString("R");
+            if (remoteLastModified is null) return false;
+            if (current?.LastModified is null)
+            {
+                Logs.Debug("[AceStep] No stored Last-Modified, assuming update available.");
+                return true;
+            }
+            if (remoteLastModified != current.LastModified)
+            {
+                Logs.Info($"[AceStep] Serveurperso binary changed: {current.LastModified} → {remoteLastModified}");
+                return true;
+            }
+            return false;
+        }
+        catch (Exception ex)
+        {
+            Logs.Debug($"[AceStep] Serveurperso update check failed: {ex.Message}");
+            return false;
+        }
+    }
+
+    /// <summary>Checks GitHub releases for a newer version tag.</summary>
+    /// <param name="current">Stored version record, or null when none exists.</param>
+    /// <returns>True when either repository offers a platform asset whose tag differs from the stored tag.</returns>
+    private static async Task<bool> CheckGitHubUpdateAsync(VersionInfo current)
+    {
+        try
+        {
+            string pattern = GetAssetPattern();
+            if (pattern is null) return false;
+            string[] repoUrls = [GITHUB_VST3_API_URL, GITHUB_CPP_API_URL];
+            foreach (string repoUrl in repoUrls)
+            {
+                DownloadInfo info = await TryGetReleaseAssetAsync(repoUrl, pattern);
+                if (info is not null && info.TagName != current?.TagName)
+                {
+                    Logs.Info($"[AceStep] GitHub release {info.TagName} available (current: {current?.TagName ?? "unknown"})");
+                    return true;
+                }
+            }
+            return false;
+        }
+        catch (Exception ex)
+        {
+            Logs.Debug($"[AceStep] GitHub update check failed: {ex.Message}");
+            return false;
+        }
+    }
+
+    /// <summary>Downloads the latest binary, replacing the current one.</summary>
+    private async Task ApplyUpdateAsync()
+    {
+        try
+        {
+            if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+            {
+                // Delete existing files so serveurperso download replaces them
+                foreach (string fileName in ServeurpersoWinFiles)
+                {
+                    string path = Path.Combine(BinaryRoot, fileName);
+                    try { if (File.Exists(path)) File.Delete(path); } catch { }
+                }
+                bool downloaded = await DownloadServeurpersoFilesAsync();
+                if (downloaded)
+                {
+                    Logs.Info("[AceStep] ace-server updated successfully from community build.");
+                    return;
+                }
+            }
+            // Fallback: try GitHub release
+            DownloadInfo info = await GetBinaryDownloadInfo();
+            if (info is null)
+            {
+                return;
+            }
+            string executable = await DownloadAndExtractBinary(info);
+            if (executable is null)
+            {
+                // Do not record a version that was never installed.
+                Logs.Warning($"[AceStep] Update to {info.TagName} failed: binary was not installed.");
+                return;
+            }
+            SaveVersionInfo(new VersionInfo
+            {
+                TagName = info.TagName,
+                Source = "github",
+                ExecutablePath = ServerExecutable,
+                InstalledDate = DateTime.UtcNow,
+                LastUpdateCheck = DateTime.UtcNow
+            });
+            Logs.Info($"[AceStep] ace-server updated to {info.TagName}");
+        }
+        catch (Exception ex)
+        {
+            Logs.Warning($"[AceStep] Failed to apply update: {ex.Message}");
+        }
+    }
+
+    #endregion
+
+    #region Model Management
+
+    /// <summary>Ensures required GGUF models are downloaded for the given DiT variant.</summary>
+    /// <param name="ditFileName">GGUF file name of the DiT model to make available.</param>
+    /// <param name="lmModel">LM selection key understood by <see cref="GetLmFileName"/>; "none" or empty skips the LM download.</param>
+    public async Task EnsureModelsAsync(string ditFileName, string lmModel = "none")
+    {
+        Directory.CreateDirectory(ModelRoot);
+        // Always-required models
+        await EnsureSingleModelAsync("vae-BF16.gguf");
+        await EnsureSingleModelAsync("Qwen3-Embedding-0.6B-Q8_0.gguf");
+        // DiT model
+        await EnsureSingleModelAsync(ditFileName);
+        // Optional LM model
+        if (lmModel != "none" && !string.IsNullOrEmpty(lmModel))
+        {
+            string lmFileName = GetLmFileName(lmModel);
+            if (lmFileName is not null)
+            {
+                await EnsureSingleModelAsync(lmFileName);
+            }
+        }
+    }
+
+    /// <summary>Downloads a single GGUF model file if not already present.</summary>
+    /// <param name="fileName">File name appended to <see cref="HF_MODEL_BASE_URL"/> and stored under <see cref="ModelRoot"/>.</param>
+    private async Task EnsureSingleModelAsync(string fileName)
+    {
+        string localPath = Path.Combine(ModelRoot, fileName);
+        if (File.Exists(localPath))
+        {
+            return;
+        }
+        string url = HF_MODEL_BASE_URL + fileName;
+        // Download next to the final path and move into place only when complete, so an
+        // interrupted transfer never leaves a truncated file that passes the check above.
+        string partialPath = localPath + ".part";
+        Logs.Info($"[AceStep] Downloading model {fileName}...");
+        try
+        {
+            if (File.Exists(partialPath)) File.Delete(partialPath);
+            await Utilities.DownloadFile(url, partialPath, (current, total, bps) =>
+            {
+                if (total > 0)
+                {
+                    double pct = current * 100.0 / total;
+                    double mbps = bps / 1024.0 / 1024.0;
+                    Logs.Verbose($"[AceStep] {fileName}: {pct:F1}% ({mbps:F1} MB/s)");
+                }
+            });
+            if (!File.Exists(partialPath) || new FileInfo(partialPath).Length == 0)
+            {
+                throw new IOException($"Download of {fileName} produced no data.");
+            }
+            File.Move(partialPath, localPath, true);
+        }
+        catch
+        {
+            try { if (File.Exists(partialPath)) File.Delete(partialPath); } catch (IOException) { }
+            throw;
+        }
+        Logs.Info($"[AceStep] Downloaded {fileName}");
+    }
+
+    /// <summary>Maps LM model selection to GGUF filename.</summary>
+    /// <param name="lmModel">One of "lm-0.6B", "lm-1.7B" or "lm-4B".</param>
+    /// <returns>The matching GGUF file name, or null for any other value.</returns>
+    public static string GetLmFileName(string lmModel)
+    {
+        return lmModel switch
+        {
+            "lm-0.6B" => "acestep-5Hz-lm-0.6B-Q8_0.gguf",
+            "lm-1.7B" => "acestep-5Hz-lm-1.7B-Q8_0.gguf",
+            "lm-4B" => "acestep-5Hz-lm-4B-Q8_0.gguf",
+            _ => null
+        };
+    }
+
+    /// <summary>Resolves a DiT model ID + quant level to a GGUF filename.</summary>
+    /// <param name="ditModel">DiT model identifier, e.g. "acestep-v15-turbo".</param>
+    /// <param name="quantLevel">Quantization suffix; null or blank falls back to "Q8_0".</param>
+    /// <returns>File name in the form "{ditModel}-{quantLevel}.gguf".</returns>
+    public static string GetDitFileName(string ditModel, string quantLevel = "Q8_0")
+    {
+        // A blank quant level would otherwise yield "model-.gguf", which cannot be downloaded.
+        string quant = string.IsNullOrWhiteSpace(quantLevel) ? "Q8_0" : quantLevel;
+        return $"{ditModel}-{quant}.gguf";
+    }
+
+    #endregion
+
+    #region Server Lifecycle
+
+    /// <summary>Ensures the ace-server is running. The new binary auto-discovers models in the --models directory
+    /// and supports hot-swapping without restart, so we only need to ensure the process is alive.</summary>
+    /// <param name="ditFileName">DiT GGUF file to download before the first start.</param>
+    /// <param name="lmModel">LM selection to download before the first start; "none" skips it.</param>
+    /// <returns>True when the server process is alive; false when the binary is unavailable or start-up failed.</returns>
+    public async Task<bool> EnsureServerRunningAsync(string ditFileName, string lmModel = "none")
+    {
+        if (IsServerRunning()) return true;
+        await _startLock.WaitAsync();
+        try
+        {
+            // Re-check under the lock: another caller may have started the server while we waited.
+            if (IsServerRunning()) return true;
+            string exe = await EnsureBinaryAsync();
+            if (exe is null) return false;
+            await EnsureModelsAsync(ditFileName, lmModel);
+            return await StartServerProcessAsync();
+        }
+        finally
+        {
+            _startLock.Release();
+        }
+    }
+
+    /// <summary>Starts the ace-server process with --models pointing to the GGUF directory.
+    /// The new binary auto-discovers all models and supports hot-swapping without restart.</summary>
+    /// <returns>True once the health endpoint answered; false when the process did not start or never became healthy.</returns>
+    private async Task<bool> StartServerProcessAsync()
+    {
+        _port = NetworkBackendUtils.GetNextPort();
+        // Check if any LM model exists in the models directory
+        _lmLoaded = Directory.GetFiles(ModelRoot, "acestep-5Hz-lm-*.gguf").Length > 0;
+        string args = $"--host 127.0.0.1 --port {_port} --models \"{ModelRoot}\"";
+        Logs.Info($"[AceStep] Starting ace-server on port {_port} with models dir: {ModelRoot}");
+        // Clear stderr buffer and critical error from previous session
+        lock (_stderrLock) { _recentStderr.Clear(); }
+        ClearCriticalError();
+        ProcessStartInfo psi = new(ServerExecutable, args)
+        {
+            UseShellExecute = false,
+            RedirectStandardOutput = true,
+            RedirectStandardError = true,
+            CreateNoWindow = true,
+            WorkingDirectory = BinaryRoot
+        };
+        try
+        {
+            // Keep a local reference: the background readers below must not read the
+            // field, which KillServer may null out before they get to run.
+            Process process = Process.Start(psi);
+            _serverProcess = process;
+            if (process is null || process.HasExited)
+            {
+                Logs.Error("[AceStep] Failed to start ace-server process.");
+                return false;
+            }
+            // Log stdout/stderr on background threads
+            _ = Task.Run(() => LogStream(process.StandardOutput, "stdout"));
+            _ = Task.Run(() => LogStream(process.StandardError, "stderr"));
+            // Wait for health check
+            bool healthy = await WaitForHealthAsync();
+            if (!healthy)
+            {
+                Logs.Error("[AceStep] ace-server failed health check.");
+                KillServer();
+                return false;
+            }
+            // Query available models from the server
+            await QueryPropsAsync();
+            Logs.Info($"[AceStep] ace-server is ready on port {_port} (LM available: {_lmLoaded})");
+            return true;
+        }
+        catch (Exception ex)
+        {
+            Logs.Error($"[AceStep] Failed to start ace-server: {ex.Message}");
+            KillServer();
+            return false;
+        }
+    }
+
+    /// <summary>Queries the /props endpoint to discover available models and cache server configuration.</summary>
+    private async Task QueryPropsAsync()
+    {
+        try
+        {
+            using HttpResponseMessage resp = await _httpClient.GetAsync($"http://127.0.0.1:{_port}/props");
+            if (resp.IsSuccessStatusCode)
+            {
+                string body = await resp.Content.ReadAsStringAsync();
+                _serverProps = JObject.Parse(body);
+                Logs.Debug($"[AceStep] Server props: {body}");
+            }
+        }
+        catch (Exception ex)
+        {
+            Logs.Debug($"[AceStep] Failed to query /props: {ex.Message}");
+            _serverProps = null;
+        }
+    }
+
+    /// <summary>Logs output from the ace-server process, buffering stderr and detecting critical errors.</summary>
+    /// <param name="reader">Redirected stream of the server process.</param>
+    /// <param name="label">"stdout" or "stderr"; only "stderr" lines are buffered and scanned.</param>
+    private async Task LogStream(StreamReader reader, string label)
+    {
+        try
+        {
+            while (await reader.ReadLineAsync() is { } line)
+            {
+                if (string.IsNullOrWhiteSpace(line)) continue;
+                Logs.Debug($"[AceStep:{label}] {line}");
+                if (label != "stderr") continue;
+                // Buffer recent stderr for crash diagnostics
+                lock (_stderrLock)
+                {
+                    _recentStderr.Enqueue(line);
+                    while (_recentStderr.Count > STDERR_BUFFER_SIZE)
+                    {
+                        _recentStderr.Dequeue();
+                    }
+                }
+                // Detect critical errors and log at appropriate level
+                if (line.Contains("GGML_ASSERT", StringComparison.Ordinal))
+                {
+                    _lastCriticalError = $"GGML assertion failed: {line}";
+                    Logs.Error($"[AceStep] FATAL: {line}");
+                }
+                else if (line.Contains("failed to initialize CUDA", StringComparison.OrdinalIgnoreCase))
+                {
+                    string message = "CUDA failed to initialize — running on CPU only, which may cause errors with some models.";
+                    _lastCriticalError = message;
+                    Logs.Warning($"[AceStep] {message}");
+                }
+                else if (line.Contains("out of memory", StringComparison.OrdinalIgnoreCase))
+                {
+                    string message = $"Out of memory: {line}";
+                    _lastCriticalError = message;
+                    Logs.Error($"[AceStep] {message}");
+                }
+            }
+        }
+        catch (Exception ex)
+        {
+            // Reading ends this way when the process is killed; nothing to recover.
+            Logs.Debug($"[AceStep] {label} reader stopped: {ex.Message}");
+        }
+    }
+
+    /// <summary>Gets the last N stderr lines for diagnostics. Returns empty string if none.</summary>
+    /// <param name="maxLines">Maximum number of trailing lines to return.</param>
+    private string GetRecentStderr(int maxLines = 10)
+    {
+        lock (_stderrLock)
+        {
+            if (_recentStderr.Count == 0) return "";
+            return string.Join("\n", _recentStderr.TakeLast(maxLines));
+        }
+    }
+
+    /// <summary>Clears the critical error flag (call before each new generation).</summary>
+    private void ClearCriticalError() => _lastCriticalError = null;
+
+    /// <summary>Waits for the ace-server /health endpoint to return ok.</summary>
+    /// <returns>True when the endpoint answered with a body containing "ok" before the timeout;
+    /// false on timeout or when the process exits first.</returns>
+    private async Task<bool> WaitForHealthAsync()
+    {
+        DateTime deadline = DateTime.UtcNow.AddMilliseconds(HEALTH_CHECK_TIMEOUT_MS);
+        while (DateTime.UtcNow < deadline)
+        {
+            // A process that already exited will never answer; stop instead of polling to the deadline.
+            Process process = _serverProcess;
+            if (process is null || process.HasExited)
+            {
+                Logs.Debug("[AceStep] ace-server exited before becoming healthy.");
+                return false;
+            }
+            try
+            {
+                using HttpResponseMessage resp = await _httpClient.GetAsync($"http://127.0.0.1:{_port}/health");
+                if (resp.IsSuccessStatusCode)
+                {
+                    string body = await resp.Content.ReadAsStringAsync();
+                    if (body.Contains("ok", StringComparison.OrdinalIgnoreCase))
+                    {
+                        return true;
+                    }
+                }
+            }
+            catch
+            {
+                // Connection refused is expected until the server starts listening; retry.
+            }
+            await Task.Delay(500);
+        }
+        return false;
+    }
+
+    /// <summary>Checks if the server process is still running.</summary>
+    public bool IsServerRunning()
+    {
+        return _serverProcess is not null && !_serverProcess.HasExited;
+    }
+
+    /// <summary>Gracefully shuts down the ace-server.</summary>
+    public async Task ShutdownAsync()
+    {
+        if (_serverProcess is null) return;
+        try
+        {
+            using CancellationTokenSource cts = new(5000);
+            using StringContent payload = new("{}", Encoding.UTF8, "application/json");
+            using HttpResponseMessage resp = await _httpClient.PostAsync($"http://127.0.0.1:{_port}/shutdown", payload, cts.Token);
+        }
+        catch
+        {
+            // The request is a courtesy; the process is force-killed below either way.
+        }
+        await Task.Delay(2000);
+        KillServer();
+    }
+
+    /// <summary>Force-kills the server process if still running.</summary>
+    private void KillServer()
+    {
+        try
+        {
+            if (_serverProcess is not null && !_serverProcess.HasExited)
+            {
+                Utilities.KillProcess(_serverProcess, 3);
+            }
+        }
+        catch (Exception ex)
+        {
+            // Killing is best-effort (the process may already be gone), but leave a trace for diagnostics.
+            Logs.Debug($"[AceStep] Ignoring error while killing ace-server: {ex.Message}");
+        }
+        // Always drop the cached process handle and the cached /props snapshot, even if the kill failed.
+        _serverProcess = null;
+        _serverProps = null;
+    }
+
+    #endregion
+
+    #region Request Processing
+
+    /// <summary>
+    /// Full processing pipeline: ensure server running, optionally run LM, then synth, return result.
+    /// </summary>
+    /// <param name="args">
+    /// Engine arguments. "dit_file" and "lm_model" are read here; the remaining keys are turned into the
+    /// request JSON by BuildAceRequest, and "src_audio" is read by the synth phase.
+    /// </param>
+    /// <param name="cancelToken">Cancels the LM/synth HTTP calls and any job polling.</param>
+    /// <returns>
+    /// On success: "success" = true, base64 "audio_data", "output_format", "sample_rate" and "metadata".
+    /// On failure: "success" = false and an "error" message.
+    /// </returns>
+    public async Task<JObject> ProcessAsync(Dictionary<string, object> args, CancellationToken cancelToken = default)
+    {
+        // A key that is present but holds null falls back to the default instead of throwing on ToString().
+        string ditFileName = args.TryGetValue("dit_file", out object ditObj) && ditObj is not null ? ditObj.ToString() : "acestep-v15-turbo-Q8_0.gguf";
+        string lmModel = args.TryGetValue("lm_model", out object lmObj) && lmObj is not null ? lmObj.ToString() : "none";
+        // Ensure models are downloaded before starting server
+        bool started = await EnsureServerRunningAsync(ditFileName, lmModel);
+        if (!started)
+        {
+            return CreateErrorResponse("Failed to start ace-server. Check logs for details.");
+        }
+        // If the server crashed on a previous request, clean up so we can restart
+        if (_serverProcess is not null && _serverProcess.HasExited)
+        {
+            Logs.Warning("[AceStep] Server process is dead from a previous crash, will restart.");
+            _serverProcess = null;
+            _serverProps = null;
+            started = await EnsureServerRunningAsync(ditFileName, lmModel);
+            if (!started)
+            {
+                return CreateErrorResponse("Failed to restart ace-server after crash. Check logs for details.");
+            }
+        }
+        ClearCriticalError();
+        // MP3 data starts with an "ID3" tag or an MPEG frame sync (0xFF followed by a byte whose top
+        // three bits are set); a lone leading 0xFF is not enough. Anything else is reported as WAV.
+        static string DetectFormat(byte[] data)
+        {
+            if (data.Length >= 3 && data[0] == 'I' && data[1] == 'D' && data[2] == '3') return "mp3";
+            if (data.Length >= 2 && data[0] == 0xFF && (data[1] & 0xE0) == 0xE0) return "mp3";
+            return "wav";
+        }
+        try
+        {
+            // Build AceRequest from args — includes dit_model for server model selection
+            JObject aceRequest = BuildAceRequest(args);
+            aceRequest["dit_model"] = ditFileName;
+            // Phase 1: LM enrichment (optional — check if any LM model is available)
+            if (_lmLoaded && lmModel != "none" && !string.IsNullOrEmpty(lmModel))
+            {
+                JObject enriched = await RunLmPhaseAsync(aceRequest, cancelToken);
+                if (enriched is not null)
+                {
+                    aceRequest = enriched;
+                }
+            }
+            // Phase 2: Synth
+            byte[] audioBytes = await RunSynthPhaseAsync(aceRequest, args, cancelToken);
+            // An empty payload is as useless as a missing one, so both are reported as a synthesis failure.
+            if (audioBytes is null || audioBytes.Length == 0)
+            {
+                string synthError = BuildCrashDiagnostic("Synthesis returned no audio data.");
+                return CreateErrorResponse(synthError);
+            }
+            return new JObject
+            {
+                ["success"] = true,
+                ["audio_data"] = Convert.ToBase64String(audioBytes),
+                ["output_format"] = DetectFormat(audioBytes),
+                ["sample_rate"] = 48000,
+                ["metadata"] = new JObject
+                {
+                    ["engine"] = "acestep.cpp",
+                    ["dit_model"] = ditFileName,
+                    ["lm_model"] = lmModel
+                }
+            };
+        }
+        // OperationCanceledException is the base of TaskCanceledException, so this also covers
+        // ThrowIfCancellationRequested() in the polling helpers. The filter keeps HttpClient timeouts
+        // (also surfaced as TaskCanceledException) out of the "cancelled by user" path.
+        catch (OperationCanceledException) when (cancelToken.IsCancellationRequested)
+        {
+            return CreateErrorResponse("Generation cancelled by user.");
+        }
+        catch (Exception ex)
+        {
+            string errorMsg = BuildCrashDiagnostic(ex.Message);
+            Logs.Error($"[AceStep] Processing error: {errorMsg}");
+            return CreateErrorResponse(errorMsg);
+        }
+    }
+
+    /// Builds an AceRequest JSON from the args dictionary.
+    /// <param name="args">Engine arguments; only the keys listed below are copied into the request.</param>
+    /// <returns>A new JObject holding every listed key, taken from <paramref name="args"/> or from its default.</returns>
+    private static JObject BuildAceRequest(Dictionary<string, object> args)
+    {
+        JObject req = new();
+        // Copies args[key] into the request, falling back to defaultVal when the key is missing or its
+        // value is null (JToken.FromObject rejects null). A null default leaves the key out entirely.
+        void Set(string key, object defaultVal)
+        {
+            object source = args.TryGetValue(key, out object val) && val is not null ? val : defaultVal;
+            if (source is not null)
+            {
+                req[key] = JToken.FromObject(source);
+            }
+        }
+        // NOTE(review): 0 / "" / -1 presumably mean "let ace-server choose" — confirm against the server's request schema.
+        Set("caption", "");
+        Set("lyrics", "");
+        Set("bpm", 0);
+        Set("duration", 0);
+        Set("keyscale", "");
+        Set("timesignature", "");
+        Set("vocal_language", "");
+        Set("seed", -1L);
+        Set("inference_steps", 0);
+        Set("guidance_scale", 0.0);
+        Set("shift", 0.0);
+        Set("audio_cover_strength", 0.5);
+        Set("repainting_start", -1.0);
+        Set("repainting_end", -1.0);
+        Set("lego", "");
+        // LM params
+        Set("lm_temperature", 0.85);
+        Set("lm_cfg_scale", 2.0);
+        Set("lm_top_k", 0);
+        Set("lm_top_p", 0.9);
+        Set("lm_negative_prompt", "");
+        Set("use_cot_caption", true);
+        return req;
+    }
+
+    /// Runs the LM enrichment phase: POST to /lm, return enriched request.
+    /// Handles both sync (direct JSON result) and async (job-based polling) server versions.
+    /// <param name="aceRequest">Request JSON to enrich; posted as-is to /lm.</param>
+    /// <param name="cancelToken">Cancels the HTTP call and job polling; caller cancellation propagates as an exception.</param>
+    /// <returns>The enriched request, or null when the LM phase was skipped, failed, or returned nothing usable.</returns>
+    private async Task<JObject> RunLmPhaseAsync(JObject aceRequest, CancellationToken cancelToken)
+    {
+        try
+        {
+            using StringContent content = new(aceRequest.ToString(), Encoding.UTF8, "application/json");
+            using HttpResponseMessage resp = await _httpClient.PostAsync($"http://127.0.0.1:{_port}/lm", content, cancelToken);
+            if (resp.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable)
+            {
+                Logs.Warning("[AceStep] LM phase skipped (GPU busy).");
+                return null;
+            }
+            resp.EnsureSuccessStatusCode();
+            string body = await resp.Content.ReadAsStringAsync(cancelToken);
+            JToken parsed = JToken.Parse(body);
+            // Async mode: {"id":"..."} — poll for result
+            if (parsed is JObject idObj && idObj["id"] is not null && idObj.Count <= 2)
+            {
+                string jobId = idObj["id"].ToString();
+                JToken result = await PollJobAsync(jobId, cancelToken);
+                if (result is null) return null;
+                parsed = result;
+            }
+            // Extract enriched request from result (array or object)
+            if (parsed is JArray arr && arr.Count > 0)
+            {
+                parsed = arr[0];
+            }
+            if (parsed is not JObject enriched || enriched["id"] is not null)
+            {
+                return null;
+            }
+            // An error payload must not be mistaken for an enriched request and forwarded to /synth.
+            if (enriched["error"] is not null)
+            {
+                Logs.Warning($"[AceStep] LM phase failed (continuing without): {enriched["error"]}");
+                return null;
+            }
+            return enriched;
+        }
+        // The LM phase is optional, so ordinary failures (including timeouts) only log a warning.
+        // When the caller's token is cancelled the exception propagates instead of being swallowed.
+        catch (Exception ex) when (!cancelToken.IsCancellationRequested)
+        {
+            Logs.Warning($"[AceStep] LM phase failed (continuing without): {ex.Message}");
+            return null;
+        }
+    }
+
+    /// Runs the synth phase: POST to /synth, return raw audio bytes.
+    /// Handles both sync (direct audio response) and async (job-based polling) server versions.
+    /// <param name="aceRequest">Request JSON posted to /synth.</param>
+    /// <param name="args">Engine arguments; an optional base64 "src_audio" entry is uploaded as multipart audio.</param>
+    /// <param name="cancelToken">Cancels the HTTP calls, retry delays and job polling.</param>
+    /// <returns>The audio bytes, or null on a server-reported error, an unexpected JSON reply, or when still busy after all retries.</returns>
+    private async Task<byte[]> RunSynthPhaseAsync(JObject aceRequest, Dictionary<string, object> args, CancellationToken cancelToken)
+    {
+        // Decode the optional source audio and serialize the request once, not on every retry.
+        byte[] sourceAudio = args.TryGetValue("src_audio", out object srcAudioObj) && srcAudioObj is string srcAudio && !string.IsNullOrEmpty(srcAudio)
+            ? Convert.FromBase64String(srcAudio)
+            : null;
+        string requestJson = aceRequest.ToString();
+        string synthUrl = $"http://127.0.0.1:{_port}/synth";
+        // Posts either plain JSON or, when source audio is present, a multipart form with "request" + "audio" parts.
+        async Task<HttpResponseMessage> PostSynthAsync()
+        {
+            if (sourceAudio is null)
+            {
+                using StringContent content = new(requestJson, Encoding.UTF8, "application/json");
+                return await _httpClient.PostAsync(synthUrl, content, cancelToken);
+            }
+            using MultipartFormDataContent form = new();
+            form.Add(new StringContent(requestJson, Encoding.UTF8, "application/json"), "request");
+            form.Add(new ByteArrayContent(sourceAudio), "audio", "source.wav");
+            return await _httpClient.PostAsync(synthUrl, form, cancelToken);
+        }
+        for (int attempt = 0; attempt <= RETRY_503_MAX; attempt++)
+        {
+            using HttpResponseMessage resp = await PostSynthAsync();
+            if (resp.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable)
+            {
+                // No point sleeping after the final attempt.
+                if (attempt == RETRY_503_MAX) break;
+                int delayMs = 2000;
+                Logs.Info($"[AceStep] GPU busy (503), retrying in {delayMs}ms...");
+                await Task.Delay(delayMs, cancelToken);
+                continue;
+            }
+            resp.EnsureSuccessStatusCode();
+            string contentType = resp.Content.Headers.ContentType?.MediaType ?? "";
+            // If response is JSON, it's an async job ID — poll for result
+            if (contentType.Contains("json", StringComparison.OrdinalIgnoreCase))
+            {
+                string body = await resp.Content.ReadAsStringAsync(cancelToken);
+                JObject result = JObject.Parse(body);
+                if (result["error"] is not null)
+                {
+                    Logs.Error($"[AceStep] Server error: {result["error"]}");
+                    return null;
+                }
+                string jobId = result["id"]?.ToString();
+                if (jobId is not null)
+                {
+                    return await PollJobForBytesAsync(jobId, cancelToken);
+                }
+                Logs.Error($"[AceStep] Unexpected JSON response: {body}");
+                return null;
+            }
+            // Otherwise it's direct audio bytes (sync server)
+            return await resp.Content.ReadAsByteArrayAsync(cancelToken);
+        }
+        Logs.Error("[AceStep] GPU busy after max retries.");
+        return null;
+    }
+
+    /// <summary>Submits a JSON job to the specified endpoint. Returns the job ID.</summary>
+    /// <param name="endpoint">Path appended to the server base URL (the code concatenates it directly after the port).</param>
+    /// <param name="payload">JSON body to post.</param>
+    /// <param name="cancelToken">Cancels the HTTP call and the retry delays.</param>
+    /// <returns>The "id" field of the reply, or null on a server-reported error or when still busy after all retries.</returns>
+    private async Task<string> SubmitJobAsync(string endpoint, JObject payload, CancellationToken cancelToken)
+    {
+        string json = payload.ToString();
+        for (int attempt = 0; attempt <= RETRY_503_MAX; attempt++)
+        {
+            using StringContent content = new(json, Encoding.UTF8, "application/json");
+            using HttpResponseMessage resp = await _httpClient.PostAsync($"http://127.0.0.1:{_port}{endpoint}", content, cancelToken);
+            if (resp.StatusCode == System.Net.HttpStatusCode.ServiceUnavailable)
+            {
+                // No point sleeping after the final attempt.
+                if (attempt == RETRY_503_MAX) break;
+                // GPU busy — honour the Retry-After delta (at least 500ms); 2000ms when the header is absent.
+                double? retryAfterMs = resp.Headers.RetryAfter?.Delta?.TotalMilliseconds;
+                int delayMs = retryAfterMs is double ms ? (int)Math.Clamp(ms, 500d, int.MaxValue) : 2000;
+                Logs.Info($"[AceStep] GPU busy (503), retrying in {delayMs}ms...");
+                await Task.Delay(delayMs, cancelToken);
+                continue;
+            }
+            resp.EnsureSuccessStatusCode();
+            string body = await resp.Content.ReadAsStringAsync(cancelToken);
+            JObject result = JObject.Parse(body);
+            if (result["error"] is not null)
+            {
+                Logs.Error($"[AceStep] Server error: {result["error"]}");
+                return null;
+            }
+            return result["id"]?.ToString();
+        }
+        Logs.Error("[AceStep] GPU busy after max retries.");
+        return null;
+    }
+
+    /// <summary>Polls a job until completion and returns the result as a JToken (for LM phase JSON responses).</summary>
+    /// <param name="jobId">Job identifier returned by the server.</param>
+    /// <param name="cancelToken">Checked on every iteration; cancellation throws rather than returning null.</param>
+    /// <returns>The parsed job result, or null when the job failed, was cancelled server-side, or timed out.</returns>
+    private async Task<JToken> PollJobAsync(string jobId, CancellationToken cancelToken)
+    {
+        // Escape the id so an unexpected character cannot alter the query string.
+        string jobUrl = $"http://127.0.0.1:{_port}/job?id={Uri.EscapeDataString(jobId)}";
+        DateTime deadline = DateTime.UtcNow.AddMilliseconds(JOB_TIMEOUT_MS);
+        while (DateTime.UtcNow < deadline)
+        {
+            cancelToken.ThrowIfCancellationRequested();
+            using HttpResponseMessage statusResp = await _httpClient.GetAsync(jobUrl, cancelToken);
+            string statusBody = await statusResp.Content.ReadAsStringAsync(cancelToken);
+            JObject statusObj = JObject.Parse(statusBody);
+            string status = statusObj["status"]?.ToString();
+            if (status == "done")
+            {
+                // Fetch result; a non-success reply must not be parsed as if it were the job result.
+                using HttpResponseMessage resultResp = await _httpClient.GetAsync($"{jobUrl}&result=1", cancelToken);
+                resultResp.EnsureSuccessStatusCode();
+                string resultBody = await resultResp.Content.ReadAsStringAsync(cancelToken);
+                return JToken.Parse(resultBody);
+            }
+            if (status == "failed" || status == "cancelled")
+            {
+                string error = statusObj["error"]?.ToString() ?? status;
+                Logs.Error($"[AceStep] Job {jobId} {status}: {error}");
+                return null;
+            }
+            await Task.Delay(JOB_POLL_INTERVAL_MS, cancelToken);
+        }
+        Logs.Error($"[AceStep] Job {jobId} timed out.");
+        return null;
+    }
+
+    /// Polls a job until completion and returns the result as raw bytes (for synth phase audio).
+ private async Task PollJobForBytesAsync(string jobId, CancellationToken cancelToken) + { + DateTime deadline = DateTime.UtcNow.AddMilliseconds(JOB_TIMEOUT_MS); + while (DateTime.UtcNow < deadline) + { + cancelToken.ThrowIfCancellationRequested(); + HttpResponseMessage statusResp = await _httpClient.GetAsync($"http://127.0.0.1:{_port}/job?id={jobId}", cancelToken); + string statusBody = await statusResp.Content.ReadAsStringAsync(cancelToken); + JObject statusObj = JObject.Parse(statusBody); + string status = statusObj["status"]?.ToString(); + if (status == "done") + { + // Fetch audio result as WAV + HttpResponseMessage resultResp = await _httpClient.GetAsync($"http://127.0.0.1:{_port}/job?id={jobId}&result=1&wav=1", cancelToken); + return await resultResp.Content.ReadAsByteArrayAsync(cancelToken); + } + if (status == "failed" || status == "cancelled") + { + string error = statusObj["error"]?.ToString() ?? status; + Logs.Error($"[AceStep] Job {jobId} {status}: {error}"); + return null; + } + await Task.Delay(JOB_POLL_INTERVAL_MS, cancelToken); + } + Logs.Error($"[AceStep] Job {jobId} timed out."); + return null; + } + + #endregion + + #region Helpers + + private static JObject CreateErrorResponse(string message) => new() + { + ["success"] = false, + ["error"] = message + }; + + /// Builds a diagnostic error message by checking if the server crashed and including stderr context. + private string BuildCrashDiagnostic(string baseMessage) + { + bool crashed = _serverProcess is not null && _serverProcess.HasExited; + string critical = _lastCriticalError; + if (crashed && critical is not null) + { + // Server crashed with a known critical error — give a clear message + Logs.Error($"[AceStep] Server process crashed. Last critical error: {critical}"); + string stderr = GetRecentStderr(5); + return $"ace-server crashed: {critical}" + (string.IsNullOrEmpty(stderr) ? 
"" : $"\nRecent stderr:\n{stderr}"); + } + if (crashed) + { + // Server crashed but we don't know why — include recent stderr + Logs.Error("[AceStep] Server process crashed unexpectedly."); + string stderr = GetRecentStderr(10); + return "ace-server crashed unexpectedly." + (string.IsNullOrEmpty(stderr) ? "" : $"\nRecent stderr:\n{stderr}"); + } + if (critical is not null) + { + // Server still running but a critical error was detected + return $"{baseMessage} ({critical})"; + } + return baseMessage; + } + + private void SaveVersionInfo(VersionInfo info) + { + try { File.WriteAllText(VersionFilePath, JObject.FromObject(info).ToString(Newtonsoft.Json.Formatting.Indented)); } + catch (Exception ex) { Logs.Warning($"[AceStep] Failed to save version info: {ex.Message}"); } + } + + public void Dispose() + { + ShutdownAsync().Wait(5000); + _httpClient?.Dispose(); + _startLock?.Dispose(); + } + + #endregion + + #region Data Classes + + public class VersionInfo + { + public string TagName { get; set; } + public string Source { get; set; } + public string ExecutablePath { get; set; } + public DateTime InstalledDate { get; set; } + public DateTime LastUpdateCheck { get; set; } + /// Last-Modified header from the download source, used for update checking. + public string LastModified { get; set; } + } + + public class DownloadInfo + { + public string FileName { get; set; } + public string DownloadUrl { get; set; } + public long Size { get; set; } + public string TagName { get; set; } + } + + #endregion +} diff --git a/AudioServices/AudioServerManager.cs b/AudioServices/AudioServerManager.cs index ddd4c9a..40d0282 100644 --- a/AudioServices/AudioServerManager.cs +++ b/AudioServices/AudioServerManager.cs @@ -286,6 +286,12 @@ public async Task ProcessAsync(AudioProviderDefinition provider, Dictio return await ProcessViaDockerAsync(provider, args, cancelToken); } + // Native binary providers use their own server manager (e.g. 
acestep.cpp) + if (provider.IsNativeBinary) + { + return await AceStepCppManager.Instance.ProcessAsync(args, cancelToken); + } + // API providers are handled entirely in C# — no Python server needed if (provider.IsApiProvider) { diff --git a/python_backend/docker/Dockerfile b/python_backend/docker/Dockerfile index 2209529..510a3ce 100644 --- a/python_backend/docker/Dockerfile +++ b/python_backend/docker/Dockerfile @@ -1,5 +1,5 @@ # AudioLab Docker image for Linux-only engines -# Engines: ACE-Step, RVC, GPT-SoVITS, Resemble-Enhance, CosyVoice, RealtimeSTT +# Engines: RVC, GPT-SoVITS, Resemble-Enhance, CosyVoice, RealtimeSTT # # Build: docker build -t audiolab-linux -f python_backend/docker/Dockerfile . # Run: docker compose -f python_backend/docker/docker-compose.yml up -d @@ -37,8 +37,7 @@ RUN pip install --no-cache-dir \ accelerate scipy librosa # Install Linux-only engine dependencies -# ACE-Step 1.5 -RUN pip install --no-cache-dir ace-step || true +# ACE-Step now uses native C++ binary (acestep.cpp) — no longer in Docker # Resemble-Enhance (requires deepspeed which only builds on Linux) RUN pip install --no-cache-dir deepspeed && \ diff --git a/python_backend/engines/music_acestep.py b/python_backend/engines/music_acestep.py deleted file mode 100644 index 1c20039..0000000 --- a/python_backend/engines/music_acestep.py +++ /dev/null @@ -1,274 +0,0 @@ -#!/usr/bin/env python3 -"""ACE-Step engine — full-song music generation with lyrics alignment. - -Uses ACEStepPipeline from ace-step v0.2.0+. Supports multiple DiT model -variants, task types (text2music, cover, repaint, edit, extend, retake), -and configurable generation parameters. 
- -Requires: ace-step (git+https://github.com/ace-step/ACE-Step.git) -""" - -import base64 -import logging -import os -import shutil -import tempfile - -import numpy as np - -from .base_engine import BaseAudioEngine - -logger = logging.getLogger("Music.ACEStep") - -# Map SwarmUI model config names to HuggingFace repo IDs -_DIT_REPO_MAP = { - "acestep-v15-turbo": "ACE-Step/ACE-Step-v1-3.5B", - "acestep-v15-turbo-shift1": "ACE-Step/ACE-Step-v1-3.5B", - "acestep-v15-turbo-shift3": "ACE-Step/ACE-Step-v1-3.5B", - "acestep-v15-turbo-continuous": "ACE-Step/ACE-Step-v1-3.5B", - "acestep-v15-sft": "ACE-Step/ACE-Step-v1-3.5B", - "acestep-v15-base": "ACE-Step/ACE-Step-v1-3.5B", -} - -# Map scheduler types to ACE-Step scheduler names -_SCHEDULER_MAP = { - "ode": "euler", - "euler": "euler", - "heun": "heun", - "pingpong": "pingpong", -} - - -class AceStepEngine(BaseAudioEngine): - """ACE-Step music generation engine with lazy model loading.""" - - name = "acestep" - category = "audiogeneration" - - def __init__(self): - self.pipeline = None - self.current_dit_model = None - self.sample_rate = 48000 - - def initialize(self) -> bool: - try: - from acestep.pipeline_ace_step import ACEStepPipeline # noqa: F401 - logger.info("ACE-Step ready (model loaded on first request)") - return True - except ImportError as e: - logger.error("ace-step package not found: %s", e) - return False - - def _ensure_loaded(self, dit_model: str): - """Load or swap the pipeline if needed.""" - if self.pipeline is not None and self.current_dit_model == dit_model: - return - - if self.pipeline is not None: - logger.info("Switching DiT model: %s -> %s", self.current_dit_model, dit_model) - self.cleanup() - - import torch - from acestep.pipeline_ace_step import ACEStepPipeline - - device_id = 0 if torch.cuda.is_available() else -1 - self.pipeline = ACEStepPipeline(device_id=device_id) - self.pipeline.load_checkpoint() - self.current_dit_model = dit_model - logger.info("ACE-Step loaded: %s on %s", dit_model, 
- "cuda" if torch.cuda.is_available() else "cpu") - - def _decode_audio_to_tempfile(self, audio_b64: str, prefix: str = "ace_") -> str: - """Decode base64 audio data and write to a temporary WAV file.""" - audio_bytes = base64.b64decode(audio_b64) - fd, path = tempfile.mkstemp(prefix=prefix, suffix=".wav") - try: - os.write(fd, audio_bytes) - finally: - os.close(fd) - return path - - def process(self, **kwargs) -> dict: - prompt = kwargs.get("prompt", "") - lyrics = kwargs.get("lyrics", "[Instrumental]") - duration = float(kwargs.get("duration", 30)) - seed = int(kwargs.get("seed", -1)) - dit_model = kwargs.get("dit_model", "acestep-v15-turbo") - - infer_step = int(kwargs.get("infer_step", 8)) - guidance_scale = float(kwargs.get("guidance_scale", 7.0)) - instrumental = kwargs.get("instrumental", "false") == "true" - bpm = int(kwargs.get("bpm", 120)) - key_scale = kwargs.get("key_scale", "") - time_signature = kwargs.get("time_signature", "4") - vocal_language = kwargs.get("vocal_language", "en") - shift = float(kwargs.get("shift", 3.0)) - infer_method = kwargs.get("infer_method", "ode") - use_adg = kwargs.get("use_adg", "false") == "true" - cfg_interval_start = float(kwargs.get("cfg_interval_start", 0.0)) - cfg_interval_end = float(kwargs.get("cfg_interval_end", 1.0)) - enable_normalization = kwargs.get("enable_normalization", "true") == "true" - normalization_db = float(kwargs.get("normalization_db", -14.0)) - task_type = kwargs.get("task_type", "text2music") - - if not prompt.strip(): - return {"success": False, "error": "No prompt provided"} - - valid_tasks = {"text2music", "cover", "repaint", "edit", "extend", "retake"} - if task_type not in valid_tasks: - return {"success": False, "error": f"Invalid task_type '{task_type}'. 
Must be one of: {', '.join(sorted(valid_tasks))}"} - - temp_files = [] - save_dir = None - try: - import torchaudio - - self._ensure_loaded(dit_model) - - # Build lyrics with metadata tags - if instrumental: - lyrics = "[Instrumental]" - - full_prompt = prompt - tags = [] - if bpm > 0: - tags.append(f"bpm: {bpm}") - if time_signature: - tags.append(f"time_signature: {time_signature}/4") - if key_scale: - tags.append(f"key: {key_scale}") - if tags: - full_prompt = f"{prompt} [{', '.join(tags)}]" - - # Map scheduler - scheduler_type = _SCHEDULER_MAP.get(infer_method, "euler") - - # Build __call__ kwargs - gen_kwargs = { - "prompt": full_prompt, - "lyrics": lyrics, - "task": task_type, - "audio_duration": duration, - "infer_step": infer_step, - "guidance_scale": guidance_scale, - "scheduler_type": scheduler_type, - "cfg_type": "apg" if use_adg else "cfg", - "guidance_interval": cfg_interval_end - cfg_interval_start, - "batch_size": 1, - } - - if seed >= 0: - gen_kwargs["manual_seeds"] = [seed] - - # Source audio for cover/repaint/edit/extend tasks - src_audio_b64 = kwargs.get("src_audio", "") - if src_audio_b64: - src_path = self._decode_audio_to_tempfile(src_audio_b64, "ace_src_") - temp_files.append(src_path) - gen_kwargs["src_audio_path"] = src_path - - # Reference audio for audio2audio - ref_audio_b64 = kwargs.get("reference_audio", "") - if ref_audio_b64: - ref_path = self._decode_audio_to_tempfile(ref_audio_b64, "ace_ref_") - temp_files.append(ref_path) - gen_kwargs["ref_audio_input"] = ref_path - gen_kwargs["audio2audio_enable"] = True - gen_kwargs["ref_audio_strength"] = float(kwargs.get("cover_strength", 0.5)) - - # Task-specific params - if task_type == "repaint": - gen_kwargs["repaint_start"] = int(float(kwargs.get("repaint_start", 0.0))) - repaint_end = float(kwargs.get("repaint_end", -1.0)) - if repaint_end >= 0: - gen_kwargs["repaint_end"] = int(repaint_end) - - if task_type == "cover": - gen_kwargs["ref_audio_strength"] = 
float(kwargs.get("cover_strength", 1.0)) - - # Output directory - save_dir = tempfile.mkdtemp(prefix="acestep_") - gen_kwargs["save_path"] = save_dir - - # Generate - result = self.pipeline(**gen_kwargs) - - # Extract audio — result is [output_path, ..., params_json_dict] - audio_path = None - if isinstance(result, (list, tuple)): - for item in result: - if isinstance(item, str) and os.path.isfile(item): - audio_path = item - break - - # Fallback: scan save_dir - if audio_path is None and save_dir: - for f in sorted(os.listdir(save_dir)): - if f.endswith((".wav", ".mp3", ".flac")): - audio_path = os.path.join(save_dir, f) - break - - if audio_path is None: - return {"success": False, "error": "ACE-Step produced no output audio"} - - waveform, sr = torchaudio.load(audio_path) - audio_numpy = waveform.cpu().numpy().astype(np.float32) - self.sample_rate = sr - - # Preserve stereo: interleave [channels, samples] → [samples * channels] - num_channels = 1 - if len(audio_numpy.shape) > 1 and audio_numpy.shape[0] >= 2: - num_channels = audio_numpy.shape[0] - audio_numpy = audio_numpy.T.flatten() # [C, N] → [N, C] → [N*C] - elif len(audio_numpy.shape) > 1: - audio_numpy = audio_numpy.squeeze(0) - - output_format = kwargs.get("output_format", "wav_16") - output_quality = kwargs.get("output_quality", "high") - audio_b64, fmt = self.encode_audio(audio_numpy, self.sample_rate, num_channels=num_channels, output_format=output_format, quality=output_quality) - actual_duration = len(audio_numpy) / (self.sample_rate * num_channels) - - return { - "success": True, - "audio_data": audio_b64, - "output_format": fmt, - "duration": actual_duration, - "metadata": { - "engine": "acestep", - "dit_model": dit_model, - "task_type": task_type, - "sample_rate": self.sample_rate, - "prompt": prompt, - "has_lyrics": not instrumental and lyrics != "[Instrumental]", - "seed": seed, - }, - } - except Exception as e: - logger.error("ACE-Step process failed: %s", e, exc_info=True) - return {"success": 
False, "error": str(e)} - finally: - for f in temp_files: - try: - if os.path.exists(f): - os.remove(f) - except OSError: - pass - if save_dir and os.path.exists(save_dir): - try: - shutil.rmtree(save_dir) - except OSError: - pass - - def cleanup(self): - if self.pipeline is not None: - try: - import torch - self.pipeline.cleanup_memory() - del self.pipeline - self.pipeline = None - self.current_dit_model = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() - except Exception: - self.pipeline = None - self.current_dit_model = None diff --git a/python_backend/test_engines.py b/python_backend/test_engines.py index ca42764..2b93319 100644 --- a/python_backend/test_engines.py +++ b/python_backend/test_engines.py @@ -35,7 +35,7 @@ "tts_chatterbox": ("ChatterboxEngine", "tts", {"text": "Hello, this is a test of Chatterbox.", "volume": 0.8}), "tts_bark": ("BarkEngine", "tts", {"text": "Hello, this is a test of Bark.", "volume": 0.8}), "music_musicgen": ("MusicGenEngine", "music", {"prompt": "upbeat electronic music", "duration": 5}), - "music_acestep": ("AceStepEngine", "music", {"prompt": "calm piano music", "duration": 5}), + # music_acestep removed — ACE-Step now uses native C++ binary (acestep.cpp) "sfx_audiogen": ("AudioGenEngine", "sfx", {"prompt": "thunder and rain storm", "duration": 5}), "clone_openvoice": ("OpenVoiceEngine", "clone", None), # Needs audio input "clone_rvc": ("RVCEngine", "clone", None), # Needs audio input From 7dd9626093e2c91418ea08a45f265d545c690a6b Mon Sep 17 00:00:00 2001 From: kalebbroo Date: Sat, 18 Apr 2026 12:54:37 -0400 Subject: [PATCH 2/4] fix core params --- Assets/audio-integration.js | 6 ++++-- AudioLabParams.cs | 18 +----------------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/Assets/audio-integration.js b/Assets/audio-integration.js index a03eeb8..e9efb8e 100644 --- a/Assets/audio-integration.js +++ b/Assets/audio-integration.js @@ -31,7 +31,7 @@ const AudioLabConfig = { moonshine_stt: { category: 
'audiolab_stt', providerFlag: 'moonshine_stt_params' }, realtimestt_stt: { category: 'audiolab_stt', providerFlag: 'realtimestt_params' }, musicgen_music: { category: 'audiolab_audiogen', providerFlag: 'musicgen_music_params', extraFlags: ['audiocraft_sampling'] }, - acestep_music: { category: 'audiolab_audiogen', providerFlag: 'acestep_music_params', extraFlags: ['acestep_lm_params', 'acestep_task_params'] }, + acestep_music: { category: 'audiolab_audiogen', providerFlag: 'acestep_music_params', extraFlags: ['acestep_lm_params', 'acestep_task_params'], keepCoreParams: ['steps', 'cfgscale'] }, openvoice_clone: { category: 'audiolab_clone', providerFlag: 'openvoice_clone_params' }, rvc_clone: { category: 'audiolab_clone', providerFlag: 'rvc_clone_params' }, gptsovits_clone: { category: 'audiolab_clone', providerFlag: 'gptsovits_clone_params' }, @@ -141,10 +141,12 @@ featureSetChangers.push(() => { const curArch = currentModelHelper.curArch; const isAudioModel = AudioLabConfig.isAudioModel(curArch); + const archConfig = isAudioModel ? AudioLabConfig.archToCategory[curArch] : null; + const keepCoreParams = archConfig?.keepCoreParams || []; console.log('[audiolab] featureSetChanger: curArch =', JSON.stringify(curArch), 'isAudioModel =', isAudioModel); for (const param of gen_param_types) { - if (AudioLabConfig.coreParamsToHide.includes(param.id)) { + if (AudioLabConfig.coreParamsToHide.includes(param.id) && !keepCoreParams.includes(param.id)) { if (isAudioModel) { if (!param.hasOwnProperty('original_feature_flag_audiolab')) { param.original_feature_flag_audiolab = param.feature_flag; diff --git a/AudioLabParams.cs b/AudioLabParams.cs index a364f72..0a4921d 100644 --- a/AudioLabParams.cs +++ b/AudioLabParams.cs @@ -241,14 +241,11 @@ public static class AudioLabParams #endregion #region Music — ACE-Step Core (flag: acestep_music_params) + // Steps, CFG Scale use built-in Core params (kept visible via keepCoreParams in audio-integration.js). 
// BPM, Key Scale, Time Signature, Language use built-in Text2Audio params (Text To Audio group). /// Song lyrics for ACE-Step generation. Feature flag: acestep_music_params. public static T2IRegisteredParam Lyrics; - /// Diffusion inference step count for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam InferStep; - /// Classifier-free guidance strength for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam ACEGuidanceScale; /// Instrumental-only toggle for ACE-Step. Feature flag: acestep_music_params. public static T2IRegisteredParam Instrumental; /// Noise schedule shift factor for ACE-Step. Feature flag: acestep_music_params. @@ -861,19 +858,6 @@ public static void RegisterAll() ViewType: ParamViewType.PROMPT, OrderPriority: -9, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - InferStep = T2IParamTypes.Register(new("Infer Steps", - "Number of diffusion inference steps.\nTurbo models: 8. SFT/Base models: 50.", - "8", - Min: 1, Max: 200, Step: 1, ViewType: ParamViewType.SLIDER, - OrderPriority: -7, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - ACEGuidanceScale = T2IParamTypes.Register(new("ACE Guidance", - "Classifier-free guidance strength.\nOnly effective with SFT/Base models that support CFG.", - "7.0", - Min: 1.0, Max: 30.0, Step: 0.5, ViewType: ParamViewType.SLIDER, - OrderPriority: -6, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - Instrumental = T2IParamTypes.Register(new("Instrumental", "Generate instrumental-only track without vocals.", "false", From 0faff06dbf0f3c766ed29e254f0c3ab14dad5619 Mon Sep 17 00:00:00 2001 From: kalebbroo Date: Sat, 18 Apr 2026 13:02:35 -0400 Subject: [PATCH 3/4] setps and cfg should use core params --- AudioBackends/DynamicAudioBackend.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/AudioBackends/DynamicAudioBackend.cs b/AudioBackends/DynamicAudioBackend.cs index 870c082..496a23a 100644 --- 
a/AudioBackends/DynamicAudioBackend.cs +++ b/AudioBackends/DynamicAudioBackend.cs @@ -1107,8 +1107,8 @@ private static Dictionary BuildEngineArgs(T2IParamInput input, A bool instrumental = input.TryGet(AudioLabParams.Instrumental, out string aceInst) && aceInst == "true"; args["lyrics"] = instrumental ? "[Instrumental]" : lyrics; args["seed"] = input.TryGet(T2IParamTypes.Seed, out long aceSeed) ? aceSeed : -1L; - args["inference_steps"] = input.TryGet(AudioLabParams.InferStep, out int infStep) ? infStep : 0; - args["guidance_scale"] = input.TryGet(AudioLabParams.ACEGuidanceScale, out double aceGuide) ? aceGuide : 0.0; + args["inference_steps"] = input.TryGet(T2IParamTypes.Steps, out int infStep) ? infStep : 0; + args["guidance_scale"] = input.TryGet(T2IParamTypes.CFGScale, out double aceGuide) ? aceGuide : 0.0; args["bpm"] = input.TryGet(T2IParamTypes.Text2AudioBPM, out long aceBpm) ? (int)aceBpm : 0; args["keyscale"] = input.TryGet(T2IParamTypes.Text2AudioKeyScale, out string aceKey) ? aceKey : ""; args["timesignature"] = input.TryGet(T2IParamTypes.Text2AudioTimeSignature, out string aceTs) ? 
aceTs : ""; From 1baddb1ac8bd35e6e959328d2435789eb3a5f10d Mon Sep 17 00:00:00 2001 From: kalebbroo Date: Sat, 18 Apr 2026 22:09:17 -0400 Subject: [PATCH 4/4] fix download location and model locations --- AudioBackends/DynamicAudioBackend.cs | 35 +++++----- AudioLabParams.cs | 95 +++++++++++++--------------- AudioServices/AceStepCppManager.cs | 22 ++----- 3 files changed, 67 insertions(+), 85 deletions(-) diff --git a/AudioBackends/DynamicAudioBackend.cs b/AudioBackends/DynamicAudioBackend.cs index 496a23a..19d3419 100644 --- a/AudioBackends/DynamicAudioBackend.cs +++ b/AudioBackends/DynamicAudioBackend.cs @@ -1097,16 +1097,14 @@ private static Dictionary BuildEngineArgs(T2IParamInput input, A break; case "acestep_music": - // Resolve DiT GGUF filename from model config + quant level + // Resolve DiT GGUF filename from model config string ditModel = modelDef?.EngineConfig?.TryGetValue("dit_model", out object dmObj) == true ? dmObj?.ToString() : "acestep-v15-turbo"; - string quantLevel = input.TryGet(AudioLabParams.ACEQuantLevel, out string ql) ? ql : "Q8_0"; - args["dit_file"] = AceStepCppManager.GetDitFileName(ditModel, quantLevel); - // AceRequest-compatible params (acestep_music_params) + args["dit_file"] = AceStepCppManager.GetDitFileName(ditModel); + // AceRequest-compatible params args["caption"] = input.Get(T2IParamTypes.Prompt, ""); - string lyrics = input.TryGet(AudioLabParams.Lyrics, out string ly) ? ly : ""; - bool instrumental = input.TryGet(AudioLabParams.Instrumental, out string aceInst) && aceInst == "true"; - args["lyrics"] = instrumental ? "[Instrumental]" : lyrics; + args["lyrics"] = input.TryGet(AudioLabParams.Lyrics, out string ly) ? ly : "[Instrumental]"; args["seed"] = input.TryGet(T2IParamTypes.Seed, out long aceSeed) ? aceSeed : -1L; + args["duration"] = input.TryGet(T2IParamTypes.Text2AudioDuration, out double aceDur) ? aceDur : 30.0; args["inference_steps"] = input.TryGet(T2IParamTypes.Steps, out int infStep) ? 
infStep : 0; args["guidance_scale"] = input.TryGet(T2IParamTypes.CFGScale, out double aceGuide) ? aceGuide : 0.0; args["bpm"] = input.TryGet(T2IParamTypes.Text2AudioBPM, out long aceBpm) ? (int)aceBpm : 0; @@ -1123,21 +1121,22 @@ private static Dictionary BuildEngineArgs(T2IParamInput input, A if (input.TryGet(AudioLabParams.LMNegativePrompt, out string aceLmNeg) && !string.IsNullOrEmpty(aceLmNeg)) args["lm_negative_prompt"] = aceLmNeg; args["use_cot_caption"] = input.TryGet(AudioLabParams.UseCotCaption, out string aceCotC) && aceCotC == "true"; - // Task params (acestep_task_params) - string taskType = input.TryGet(AudioLabParams.ACETaskType, out string aceTask) ? aceTask : "text2music"; + // Task params — source audio determines task mode (acestep_task_params) string aceSrcAudio = GetBase64Audio(input, AudioLabParams.ACESourceAudio); if (!string.IsNullOrEmpty(aceSrcAudio)) - args["src_audio"] = aceSrcAudio; - // Map task-specific fields into AceRequest - if (taskType == "cover") - args["audio_cover_strength"] = input.TryGet(AudioLabParams.CoverStrength, out double aceCovStr) ? aceCovStr : 0.5; - if (taskType == "repaint") { - args["repainting_start"] = input.TryGet(AudioLabParams.RepaintStart, out double aceRpS) ? aceRpS : -1.0; - args["repainting_end"] = input.TryGet(AudioLabParams.RepaintEnd, out double aceRpE) ? aceRpE : -1.0; + args["src_audio"] = aceSrcAudio; + string taskType = input.TryGet(AudioLabParams.ACETaskType, out string aceTask) ? aceTask : "cover"; + if (taskType == "cover") + args["audio_cover_strength"] = input.TryGet(AudioLabParams.CoverStrength, out double aceCovStr) ? aceCovStr : 0.5; + if (taskType == "repaint") + { + args["repainting_start"] = input.TryGet(AudioLabParams.RepaintStart, out double aceRpS) ? aceRpS : -1.0; + args["repainting_end"] = input.TryGet(AudioLabParams.RepaintEnd, out double aceRpE) ? 
aceRpE : -1.0; + } + if (taskType == "lego") + args["lego"] = "vocals"; } - if (taskType == "lego") - args["lego"] = "vocals"; // Default track; could add a param for this later break; case "musicgen_music": diff --git a/AudioLabParams.cs b/AudioLabParams.cs index 0a4921d..0d591d8 100644 --- a/AudioLabParams.cs +++ b/AudioLabParams.cs @@ -1,3 +1,5 @@ +using System.IO; +using Hartsy.Extensions.AudioLab.AudioServices; using SwarmUI.Media; using SwarmUI.Text2Image; @@ -246,12 +248,8 @@ public static class AudioLabParams /// Song lyrics for ACE-Step generation. Feature flag: acestep_music_params. public static T2IRegisteredParam Lyrics; - /// Instrumental-only toggle for ACE-Step. Feature flag: acestep_music_params. - public static T2IRegisteredParam Instrumental; /// Noise schedule shift factor for ACE-Step. Feature flag: acestep_music_params. public static T2IRegisteredParam ACEShift; - /// GGUF quantization level for ACE-Step model downloads. Feature flag: acestep_music_params. - public static T2IRegisteredParam ACEQuantLevel; #endregion @@ -858,47 +856,28 @@ public static void RegisterAll() ViewType: ParamViewType.PROMPT, OrderPriority: -9, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - Instrumental = T2IParamTypes.Register(new("Instrumental", - "Generate instrumental-only track without vocals.", - "false", - GetValues: _ => ["false///No", "true///Yes"], - IsAdvanced: true, OrderPriority: -5, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - - // BPM removed — use T2IParamTypes.Text2AudioBPM (Text To Audio group) - // KeyScale removed — use T2IParamTypes.Text2AudioKeyScale (Text To Audio group) - // TimeSignature removed — use T2IParamTypes.Text2AudioTimeSignature (Text To Audio group) - // VocalLanguage removed — use T2IParamTypes.Text2AudioLanguage (Text To Audio group) - ACEShift = T2IParamTypes.Register(new("Shift", "Noise schedule shift factor.\nHigher values increase generation diversity.", "3.0", Min: 1.0, Max: 5.0, Step: 0.1, 
ViewType: ParamViewType.SLIDER, IsAdvanced: true, OrderPriority: 0, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - ACEQuantLevel = T2IParamTypes.Register(new("Quant Level", - "GGUF quantization level for model downloads.\nHigher = better quality but more VRAM/disk.", - "Q8_0", - GetValues: _ => [ - "Q8_0///Q8_0 (Best Quality)", "Q6_K///Q6_K (High)", - "Q5_K_M///Q5_K_M (Balanced)", "Q4_K_M///Q4_K_M (Smallest)", - "BF16///BF16 (Full Precision)" - ], - IsAdvanced: true, OrderPriority: 1, Group: AudioGenGroup, FeatureFlag: "acestep_music_params")); - #endregion #region Music — ACE-Step LM Planner - // ACE LM Model stays in Audio Generation group (it's the primary toggle) ACELMModel = T2IParamTypes.Register(new("ACE LM Model", - "Language Model planner for structured music metadata generation.\nDownloads GGUF model on first use. Enriches prompt with lyrics, BPM, key, etc.", + "Language Model planner for structured music metadata.\n" + + "Place acestep-5Hz-lm-*.gguf files in Models/audio/music/acestep-gguf/ to enable.\n" + + "Download from: https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF\n" + + "Larger models produce better metadata. 4B recommended for quality, 0.6B for speed.", "none", - GetValues: _ => [ - "none///None (Disabled)", "lm-0.6B///Qwen3 0.6B (Fast, ~710MB)", - "lm-1.7B///Qwen3 1.7B (Balanced, ~2GB)", "lm-4B///Qwen3 4B (Best, ~4.5GB)" - ], - OrderPriority: -10, Group: AudioGenGroup, FeatureFlag: "acestep_lm_params")); - - // LM tuning params in dedicated LM Planner group + GetValues: _ => ["none///None (Disabled)", + .. (Directory.Exists(AceStepCppManager.ModelRoot) + ? 
Directory.GetFiles(AceStepCppManager.ModelRoot, "acestep-5Hz-lm-*.gguf") + .Select(Path.GetFileName).OrderBy(f => f) + .Select(f => $"{f}///{FormatLmDisplayName(f)}") + : [])], + OrderPriority: -10, Group: LMPlannerGroup, FeatureFlag: "acestep_lm_params")); LMTemperature = T2IParamTypes.Register(new("LM Temperature", "Sampling temperature for the LM planner.\nHigher = more creative metadata generation.", "0.85", @@ -937,37 +916,41 @@ public static void RegisterAll() #endregion #region Music — ACE-Step Tasks + ACESourceAudio = T2IParamTypes.Register(new("ACE Source Audio", + "Source audio for cover, repaint, and lego tasks.\nUpload audio to enable task selection below.", + null, + OrderPriority: -10, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); + ACETaskType = T2IParamTypes.Register(new("Task Type", - "ACE-Step generation task type.\ntext2music = generate from prompt.\ncover = style transfer with source audio.\nrepaint = regenerate a time region.\nlego = extract/isolate a track (vocals, drums, etc.).", - "text2music", + "What to do with the source audio.\ncover = style transfer.\nrepaint = regenerate a time region.\nlego = extract/isolate a track (vocals, drums, etc.).", + "cover", GetValues: _ => [ - "text2music///Text to Music", "cover///Cover (Style Transfer)", + "cover///Cover (Style Transfer)", "repaint///Repaint (Section Regen)", "lego///Lego (Track Isolation)" ], - OrderPriority: -10, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); + OrderPriority: -9, Group: AudioGenGroup, FeatureFlag: "acestep_task_params", + DependNonDefault: ACESourceAudio.Type.ID)); - ACESourceAudio = T2IParamTypes.Register(new("ACE Source Audio", - "Source audio for cover, repaint, and lego tasks.\nRequired for all tasks except text2music.", - null, - OrderPriority: -9, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); + CoverStrength = T2IParamTypes.Register(new("Cover Strength", + "Style transfer strength for cover task.\nFraction of DiT steps 
using source audio. 1.0 = full transfer.", + "0.5", + Min: 0.0, Max: 1.0, Step: 0.05, ViewType: ParamViewType.SLIDER, + OrderPriority: -8, Group: AudioGenGroup, FeatureFlag: "acestep_task_params", + DependNonDefault: ACESourceAudio.Type.ID)); RepaintStart = T2IParamTypes.Register(new("Repaint Start", "Start time in seconds for repaint task.\nThe section from this point will be regenerated.", "0.0", Min: 0.0, Max: 600.0, Step: 0.5, ViewType: ParamViewType.SLIDER, - IsAdvanced: true, OrderPriority: -7, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); + OrderPriority: -7, Group: AudioGenGroup, FeatureFlag: "acestep_task_params", + DependNonDefault: ACESourceAudio.Type.ID)); RepaintEnd = T2IParamTypes.Register(new("Repaint End", "End time in seconds for repaint task.\n-1 = auto (repaint to end of audio).", "-1.0", Min: -1.0, Max: 600.0, Step: 0.5, ViewType: ParamViewType.SLIDER, - IsAdvanced: true, OrderPriority: -6, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); - - CoverStrength = T2IParamTypes.Register(new("Cover Strength", - "Style transfer strength for cover task.\nFraction of DiT steps using source audio. 1.0 = full transfer.", - "0.5", - Min: 0.0, Max: 1.0, Step: 0.05, ViewType: ParamViewType.SLIDER, - IsAdvanced: true, OrderPriority: -5, Group: AudioGenGroup, FeatureFlag: "acestep_task_params")); + OrderPriority: -6, Group: AudioGenGroup, FeatureFlag: "acestep_task_params", + DependNonDefault: ACESourceAudio.Type.ID)); #endregion @@ -1190,4 +1173,16 @@ public static void RegisterAll() #endregion } + + /// Formats an LM GGUF filename for display. + /// "acestep-5Hz-lm-4B-Q8_0.gguf" → "LM 4B (Q8_0)" + private static string FormatLmDisplayName(string filename) + { + string stem = Path.GetFileNameWithoutExtension(filename); + string[] parts = stem.Split('-'); + // Pattern: acestep-5Hz-lm-{size}-{quant} + string size = parts.Length >= 4 ? parts[3] : stem; + string quant = parts.Length >= 5 ? 
parts[4] : ""; + return string.IsNullOrEmpty(quant) ? $"LM {size}" : $"LM {size} ({quant})"; + } } diff --git a/AudioServices/AceStepCppManager.cs b/AudioServices/AceStepCppManager.cs index 925a6e5..06a30c2 100644 --- a/AudioServices/AceStepCppManager.cs +++ b/AudioServices/AceStepCppManager.cs @@ -73,7 +73,7 @@ private AceStepCppManager() #region Paths /// Root directory for the ace-server binary. - public static string BinaryRoot => Path.GetFullPath(Path.Combine("dlbackend", "acestep-cpp")); + public static string BinaryRoot => Path.GetFullPath(Path.Combine("dlbackend", "audiolab", "acestep")); /// Root directory for GGUF models. public static string ModelRoot => Path.GetFullPath(Path.Combine("Models", "audio", "music", "acestep-gguf")); @@ -541,13 +541,13 @@ public async Task EnsureModelsAsync(string ditFileName, string lmModel = "none") await EnsureSingleModelAsync("Qwen3-Embedding-0.6B-Q8_0.gguf"); // DiT model await EnsureSingleModelAsync(ditFileName); - // Optional LM model + // Optional LM model (filename provided directly from filesystem-scanned param) if (lmModel != "none" && !string.IsNullOrEmpty(lmModel)) { - string lmFileName = GetLmFileName(lmModel); - if (lmFileName is not null) + string lmPath = Path.Combine(ModelRoot, lmModel); + if (!File.Exists(lmPath)) { - await EnsureSingleModelAsync(lmFileName); + Logs.Warning($"[AceStep] LM model not found: {lmModel}"); } } } @@ -574,18 +574,6 @@ await Utilities.DownloadFile(url, localPath, (current, total, bps) => Logs.Info($"[AceStep] Downloaded {fileName}"); } - /// Maps LM model selection to GGUF filename. - public static string GetLmFileName(string lmModel) - { - return lmModel switch - { - "lm-0.6B" => "acestep-5Hz-lm-0.6B-Q8_0.gguf", - "lm-1.7B" => "acestep-5Hz-lm-1.7B-Q8_0.gguf", - "lm-4B" => "acestep-5Hz-lm-4B-Q8_0.gguf", - _ => null - }; - } - /// Resolves a DiT model ID + quant level to a GGUF filename. public static string GetDitFileName(string ditModel, string quantLevel = "Q8_0") {