From 3e70beff7f5e4f2bbf4036f36ff2d95a8ce8289e Mon Sep 17 00:00:00 2001
From: Maika
Date: Fri, 26 Aug 2022 23:14:32 -0700
Subject: [PATCH] Format code, rewrite README

Format code to be easier to understand, completely rewrite README to be more professional and user friendly, fixed typos etc.
---
 README.md                        | 101 ++++++++++---------
 optimizedSD/img2img_gradio.py    | 168 ++++++++++++++++++++-----------
 optimizedSD/optimized_img2img.py | 146 ++++++++++++++-------------
 optimizedSD/optimized_txt2img.py | 120 +++++++++++-----------
 optimizedSD/txt2img_gradio.py    | 148 +++++++++++++++++----------
 5 files changed, 384 insertions(+), 299 deletions(-)

diff --git a/README.md b/README.md
index eb2bedef4..4fb2a94c4 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,25 @@
-# Update v0.6: Added support for weighted prompts (based on the code from @lstein's [repo](https://github.com/lstein/stable-diffusion))
+

Optimized Stable Diffusion

+


-- You can now use weighted prompts to put relative emphasis on certain words.
- eg. `--prompt tabby cat:0.25 white duck:0.75 hybrid`.
-- The number followed by the colon represents the weight given to the words before the colon.
- The weights can be fractions or integers.
-
-# Optimized Stable Diffusion (Sort of)
+This repo is a modified version of the Stable Diffusion repo, optimized to use less VRAM than the original by sacrificing inference speed.
-This repo is a modified version of the Stable Diffusion repo, optimized to use lesser VRAM than the original by sacrificing on inference speed.
+To achieve this, the Stable Diffusion model is split into four parts, which are sent to the GPU only when needed. After a part finishes its calculation, it is moved back to the CPU. This allows us to run a bigger model while requiring less VRAM.
-img2img to generate new image based on a prior image and prompt
+
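The offloading itself happens inside the modified model classes, but the general pattern looks roughly like the sketch below. This is a simplified illustration, not the repo's actual code; the real scripts instead poll `torch.cuda.memory_allocated()` to confirm a part has actually left the GPU before continuing, as visible in the diffs further down.

```python
import torch

def run_offloaded(module, *inputs, device="cuda"):
    # Illustrative only: move one part of the model to the GPU, run it,
    # then push it back to the CPU so the next part can use the freed VRAM.
    module.to(device)
    out = module(*[x.to(device) for x in inputs])
    module.to("cpu")
    if device != "cpu":
        torch.cuda.empty_cache()  # drop cached allocations before loading the next part
    return out
```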

Installation

-- `optimized_img2img.py` Generate images using CLI
-- `img2img_gradio.py` Generate images using gradio GUI
+All the modified files are in the [optimizedSD](optimizedSD) folder, so if you have already cloned the original repository, you can just download and copy this folder into it instead of cloning this entire repo. You can also clone this repo and follow the same installation steps as the original (mainly creating the conda environment and placing the weights at the specified location).
-txt2img to generate an image based only on a prompt
+The only drawback is higher inference time, which is still orders of magnitude faster than inference on the CPU.
-- `optimized_txt2img.py` Generate images using CLI
-- `txt2img_gradio.py` Generate images using gradio GUI
+
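For example, assuming the original CompVis repository is already cloned and its conda environment set up (the exact paths here are placeholders), copying the folder is enough: `cp -r optimizedSD /path/to/stable-diffusion/`, then run the scripts from the root of that repository with the weights placed at `models/ldm/stable-diffusion-v1/model.ckpt`, which is the path the scripts in this patch load from.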

Usage

-### img2img
+## img2img
-- It can generate _512x512 images from a prior image and prompt on a 4GB VRAM GPU in under 20 seconds per image_ (RTX 2060 in my case).
+- `img2img` can generate _512x512 images from a prior image and prompt on a 4GB VRAM GPU in under 20 seconds per image_ on an RTX 2060.
- The maximum size that can fit on 6GB GPU (RTX 2060) is around 576x768.
@@ -29,80 +27,85 @@ txt2img to generate an image based only on a prompt
`python optimizedSD/optimized_img2img.py --prompt "Austrian alps" --init-img ~/sketch-mountains-input.jpg --strength 0.8 --n_iter 2 --n_samples 5 --H 576 --W 768`
-### txt2img
+## txt2img
-- It can generate _512x512 images from a prompt on a 4GB VRAM GPU in under 25 seconds per image_ (RTX 2060 in my case).
+- `txt2img` can generate _512x512 images from a prompt on a 4GB VRAM GPU in under 25 seconds per image_ on an RTX 2060.
- For example, the following command will generate 20 512x512 images:
`python optimizedSD/optimized_txt2img.py --prompt "Cyberpunk style image of a Telsa car reflection in rain" --H 512 --W 512 --seed 27 --n_iter 2 --n_samples 10 --ddim_steps 50`
----
-## Optional arguments
-### `--turbo` (Faster inference)
-- If generating small batch of images (~ 1 to 4), use this argument to generate a single 512x512 image in under 25 seconds (on RTX 2060, excluding the time to load the model) using using only around 1GB of extra VRAM (4.3 GB instead of 3.3GB)
+

Arguments

+## `--turbo`
+
+**Increases inference speed at the cost of extra VRAM usage.**
+
+- If generating a small batch of images (~1 to 4), use this argument to generate a single 512x512 image in under 25 seconds (measured on an RTX 2060, excluding the time to load the model) using around 1GB of extra VRAM (3.3 GB -> 4.3 GB usage on average).
+
+## `--seed`
-### `--seed` (Seed)
+
+**Seed for image generation.** Can be used to reproduce previously generated images; defaults to a random seed if unspecified.
-- The code will give the seed number along with each generated image. To generate the same image again, just specify the seed using `--seed` argument. Also, images will be saved with its seed number as its name.
+- The code will give the seed number along with each generated image. To generate the same image again, just specify the seed using the `--seed` argument. Images are saved with their seed number in the file name by default.
-- eg. If the seed number for an image is `1234` and it's the 55th image in the folder, the image name will be named `seed_1234_00055.png`. If no seed is given as an argument, a random initial seed will be choosen.
+- For example, if the seed for an image is `1234` and it is the 55th image in the folder, it will be named `seed_1234_00055.png`.
-### `--n_samples` (batch size)
+## `--n_samples`
+
+**Batch size: the number of images to generate at once.**
- To get the lowest inference time per image, use the maximum batch size `--n_samples` that can fit on the GPU. Inference time per image will reduce on increasing the batch size, but the required VRAM will also increase.
- If you get a CUDA out of memory error, try reducing the batch size `--n_samples`. If it doesn't work, the other option is to reduce the image width `--W` or height `--H` or both.
-### `--n_iter`
+## `--n_iter`
-- Equivalent to running the script n_iter number of times. Only difference is that the model is loaded only once per n_iter iterations. Unlike `n_samples`, reducing it doesn't have an effect on VRAM required or inference time.
+**Run the generation *x* times.**
-### `--H` & `--W` (Height & Width)
+- Functions similarly to `--n_samples`, except that the model is loaded only once for all `n_iter` iterations; unlike `--n_samples`, changing it has no effect on the VRAM required or the inference time per image.
-- Both height and width should be a multiple of 64
+## `--H` & `--W`
-### `--precision autocast` or `--precision full` (Full or Mixed Precision)
+**Height & width of the generated image.**
-- Mixed Precision is enabled by default. If you don't have a GPU with tensor cores (RTX 10xx series), you may not be able use mixed precision. Use `--precision full` argument to disable it.
+- Both height and width should be a multiple of 64.
-### `--unet_bs`
+## `--precision autocast` or `--precision full`
-- Batch size for the unet model. Takes up a lot of extra RAM for very little improvement in inference time. unet_bs > 1 is not recommended!
+**Whether to use `full` or mixed (`autocast`) precision.**
-- Should be a multiple of 2x(n_samples)
+- Mixed precision is enabled by default. If your GPU does not have tensor cores (e.g. any GTX 10 series card), you may not be able to use mixed precision. Use the `--precision full` argument to disable it.
-### Gradio for Graphical User Interface
+## `--unet_bs`
-- You can also use gradio interface for img2img & txt2img instead of the CLI. Just activate the conda env and install the latest version of gradio using `pip install gradio` .
+**Batch size for the unet model**
-- Run img2img using `python optimizedSD/img2img_gradio.py` and txt2img using `python optimizedSD/img2img_gradio.py`.
+
+- Takes up a lot of extra RAM for **very little improvement** in inference time. `unet_bs` > 1 is not recommended!
-
-- img2img_gradio.py has the feature to crop images. Look for the pen symbol in the image box after selecting the image.
+- Should generally be a multiple of 2x(`n_samples`).
-### Weighted Prompts
+
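To make the `--precision` flag concrete: the scripts in this patch toggle between `torch.autocast` and a no-op context manager. The sketch below is a trimmed illustration of that pattern as it appears in `optimized_txt2img.py` / `optimized_img2img.py`, not a standalone tool.

```python
import torch
from contextlib import nullcontext

precision = "autocast"  # what --precision selects: "autocast" (mixed) or "full"
device = "cuda"

# Mirrors the scripts in this patch: mixed precision only on a CUDA device,
# otherwise fall back to a context manager that does nothing.
precision_scope = torch.autocast if (precision == "autocast" and device != "cpu") else nullcontext

with precision_scope("cuda"):
    pass  # sampling and decoding run inside this context
```

Putting several of the arguments together, a command like `python optimizedSD/optimized_txt2img.py --prompt "a red fox, oil painting" --H 512 --W 512 --n_samples 4 --n_iter 2 --seed 1234 --turbo` should, VRAM permitting, produce 4 x 2 = 8 images saved under their seed numbers, and re-running it with the same `--seed` reproduces them (the prompt and values here are arbitrary examples, not from the repo).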

Using the Gradio GUI

-- The prompts can also be weighted to put relative emphasis on certain words.
- eg. `--prompt tabby cat:0.25 white duck:0.75 hybrid`.
+- You can also use the built-in gradio interface for `img2img` & `txt2img` instead of the command line interface. Activate the conda environment and install the latest version of gradio using `pip install gradio`.
-- The number followed by the colon represents the weight given to the words before the colon.The weights can be both fractions or integers.
+- Run img2img using `python3 optimizedSD/img2img_gradio.py` and txt2img using `python3 optimizedSD/txt2img_gradio.py`.
-### Installation
+- img2img_gradio.py has the feature to crop images. Look for the pen symbol in the image box after selecting the image.
-- All the modified files are in the [optimizedSD](optimizedSD) folder, so if you have already cloned the original repo, you can just download and copy this folder into the orignal repo instead of cloning the entire repo. You can also clone this repo and follow the same installation steps as the original repo(mainly creating the conda env and placing the weights at the specified location).
+
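Both GUI scripts simply wrap their `generate(...)` function in `gr.Interface(fn=generate, inputs=[...], outputs=["image", "text"])` and call `demo.launch()`, as the diffs below show. If you need to reach the interface from another machine, gradio's usual options apply (for example, `demo.launch(share=True)` creates a temporary public link); that is standard gradio behaviour rather than something this patch configures.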

Weighted Prompts

---- +- Prompts can also be weighted to put relative emphasis on certain words. + eg. `--prompt tabby cat:0.25 white duck:0.75 hybrid`. -- To achieve this, the stable diffusion model is fragmented into four parts which are sent to the GPU only when needed. After the calculation is done, they are moved back to the CPU. This allows us to run a bigger model on a lower VRAM. +- The number followed by the colon represents the weight given to the words before the colon.The weights can be both fractions or integers. -- The only drawback is higher inference time which is still an order of magnitude faster than inference on CPU. ## Changelog -- v0.6: Added support for using weighted prompts. (based on @lstein [repo](https://github.com/lstein/stable-diffusion)) +- v0.6: Added support for using weighted prompts. (based on @lstein's [repo](https://github.com/lstein/stable-diffusion)) - v0.5: Added support for using gradio interface. - v0.4: Added support for specifying image seed. - v0.3: Added support for using mixed precision. diff --git a/optimizedSD/img2img_gradio.py b/optimizedSD/img2img_gradio.py index db0f41bc1..fcbdc3152 100644 --- a/optimizedSD/img2img_gradio.py +++ b/optimizedSD/img2img_gradio.py @@ -20,11 +20,14 @@ from contextlib import nullcontext from ldm.util import instantiate_from_config from transformers import logging + logging.set_verbosity_error() from split_subprompts import split_weighted_subprompts import mimetypes + mimetypes.init() -mimetypes.add_type('application/javascript', '.js') +mimetypes.add_type("application/javascript", ".js") + def chunk(it, size): it = iter(it) @@ -39,22 +42,23 @@ def load_model_from_config(ckpt, verbose=False): sd = pl_sd["state_dict"] return sd + def load_img(image, h0, w0): - + image = image.convert("RGB") w, h = image.size - print(f"loaded input image of size ({w}, {h})") - if(h0 is not None and w0 is not None): + print(f"loaded input image of size ({w}, {h})") + if h0 is not None and w0 is not None: h, w = h0, w0 - + w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 32 print(f"New image size ({w}, {h})") - image = image.resize((w, h), resample = Image.LANCZOS) + image = image.resize((w, h), resample=Image.LANCZOS) image = np.array(image).astype(np.float32) / 255.0 image = image[None].transpose(0, 3, 1, 2) image = torch.from_numpy(image) - return 2.*image - 1. + return 2.0 * image - 1.0 config = "optimizedSD/v1-inference.yaml" @@ -62,46 +66,64 @@ def load_img(image, h0, w0): sd = load_model_from_config(f"{ckpt}") li, lo = [], [] for key, v_ in sd.items(): - sp = key.split('.') - if(sp[0]) == 'model': - if('input_blocks' in sp): + sp = key.split(".") + if (sp[0]) == "model": + if "input_blocks" in sp: li.append(key) - elif('middle_block' in sp): + elif "middle_block" in sp: li.append(key) - elif('time_embed' in sp): + elif "time_embed" in sp: li.append(key) else: lo.append(key) for key in li: - sd['model1.' + key[6:]] = sd.pop(key) + sd["model1." + key[6:]] = sd.pop(key) for key in lo: - sd['model2.' + key[6:]] = sd.pop(key) + sd["model2." 
+ key[6:]] = sd.pop(key) config = OmegaConf.load(f"{config}") model = instantiate_from_config(config.modelUNet) _, _ = model.load_state_dict(sd, strict=False) model.eval() - + modelCS = instantiate_from_config(config.modelCondStage) _, _ = modelCS.load_state_dict(sd, strict=False) modelCS.eval() - + modelFS = instantiate_from_config(config.modelFirstStage) _, _ = modelFS.load_state_dict(sd, strict=False) modelFS.eval() del sd -def generate(image, prompt,strength,ddim_steps,n_iter, batch_size, Height, Width, scale,ddim_eta, unet_bs,device,seed,outdir, turbo, full_precision): - seeds = '' +def generate( + image, + prompt, + strength, + ddim_steps, + n_iter, + batch_size, + Height, + Width, + scale, + ddim_eta, + unet_bs, + device, + seed, + outdir, + turbo, + full_precision, +): + + seeds = "" init_image = load_img(image, Height, Width).to(device) model.unet_bs = unet_bs model.turbo = turbo model.cdevice = device modelCS.cond_stage_model.device = device - if device != 'cpu' and full_precision == False: + if device != "cpu" and full_precision == False: model.half() modelCS.half() modelFS.half() @@ -110,11 +132,11 @@ def generate(image, prompt,strength,ddim_steps,n_iter, batch_size, Height, Width tic = time.time() os.makedirs(outdir, exist_ok=True) outpath = outdir - sample_path = os.path.join(outpath, '_'.join(re.split(':| ',prompt)))[:150] + sample_path = os.path.join(outpath, "_".join(re.split(":| ", prompt)))[:150] os.makedirs(sample_path, exist_ok=True) base_count = len(os.listdir(sample_path)) - - if seed == '': + + if seed == "": seed = randint(0, 1000000) seed = int(seed) seed_everything(seed) @@ -125,21 +147,20 @@ def generate(image, prompt,strength,ddim_steps,n_iter, batch_size, Height, Width modelFS.to(device) - init_image = repeat(init_image, '1 ... -> b ...', b=batch_size) + init_image = repeat(init_image, "1 ... -> b ...", b=batch_size) init_latent = modelFS.get_first_stage_encoding(modelFS.encode_first_stage(init_image)) # move to latent space - if(device != 'cpu'): - mem = torch.cuda.memory_allocated()/1e6 + if device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelFS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) - - assert 0. 
<= strength <= 1., 'can only work with strength in [0.0, 1.0]' - t_enc = int(strength *ddim_steps) + assert 0.0 <= strength <= 1.0, "can only work with strength in [0.0, 1.0]" + t_enc = int(strength * ddim_steps) print(f"target t_enc is {t_enc} steps") - if full_precision== False and device != "cpu": + if full_precision == False and device != "cpu": precision_scope = autocast else: precision_scope = nullcontext @@ -157,7 +178,7 @@ def generate(image, prompt,strength,ddim_steps,n_iter, batch_size, Height, Width if isinstance(prompts, tuple): prompts = list(prompts) - subprompts,weights = split_weighted_subprompts(prompts[0]) + subprompts, weights = split_weighted_subprompts(prompts[0]) if len(subprompts) > 1: c = torch.zeros_like(uc) totalWeight = sum(weights) @@ -166,67 +187,94 @@ def generate(image, prompt,strength,ddim_steps,n_iter, batch_size, Height, Width weight = weights[i] # if not skip_normalize: weight = weight / totalWeight - c = torch.add(c,modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) + c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) else: c = modelCS.get_learned_conditioning(prompts) - + c = modelCS.get_learned_conditioning(prompts) - if(device != 'cpu'): - mem = torch.cuda.memory_allocated()/1e6 + if device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelCS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) # encode (scaled latent) - z_enc = model.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(device), seed,ddim_eta,ddim_steps) + z_enc = model.stochastic_encode( + init_latent, torch.tensor([t_enc] * batch_size).to(device), seed, ddim_eta, ddim_steps + ) # decode it - samples_ddim = model.decode(z_enc, c, t_enc, unconditional_guidance_scale=scale, - unconditional_conditioning=uc,) + samples_ddim = model.decode( + z_enc, + c, + t_enc, + unconditional_guidance_scale=scale, + unconditional_conditioning=uc, + ) modelFS.to(device) print("saving images") for i in range(batch_size): - + x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0)) x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) all_samples.append(x_sample.to("cpu")) - x_sample = 255. * rearrange(x_sample[0].cpu().numpy(), 'c h w -> h w c') + x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") Image.fromarray(x_sample.astype(np.uint8)).save( - os.path.join(sample_path, "seed_" + str(seed) + "_" + f"{base_count:05}.png")) - seeds+= str(seed) + ',' - seed+=1 + os.path.join(sample_path, "seed_" + str(seed) + "_" + f"{base_count:05}.png") + ) + seeds += str(seed) + "," + seed += 1 base_count += 1 - - if(device != 'cpu'): - mem = torch.cuda.memory_allocated()/1e6 + if device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelFS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) del samples_ddim del x_sample del x_samples_ddim - print("memory_final = ", torch.cuda.memory_allocated()/1e6) + print("memory_final = ", torch.cuda.memory_allocated() / 1e6) toc = time.time() - time_taken = (toc-tic)/60.0 + time_taken = (toc - tic) / 60.0 grid = torch.cat(all_samples, 0) grid = make_grid(grid, nrow=n_iter) - grid = 255. 
* rearrange(grid, 'c h w -> h w c').cpu().numpy() - - txt = "Your samples are ready in " + str(round(time_taken, 3)) + " minutes and waiting for you here \n" + sample_path + "\nSeeds used = " + seeds[:-1] + grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy() + + txt = ( + "Samples finished in " + + str(round(time_taken, 3)) + + " minutes and exported to \n" + + sample_path + + "\nSeeds used = " + + seeds[:-1] + ) return Image.fromarray(grid.astype(np.uint8)), txt + demo = gr.Interface( fn=generate, - inputs=[gr.Image(tool="editor", type="pil"),"text",gr.Slider(0, 1,value=0.75), - gr.Slider(1, 1000,value=50),gr.Slider(1, 100, step=1), gr.Slider(1, 100,step=1), - gr.Slider(64,4096,value = 512,step=64), gr.Slider(64,4096,value = 512,step=64), - gr.Slider(0,50,value=7.5,step=0.1),gr.Slider(0,1,step=0.01), - gr.Slider(1,2,value = 1,step=1),gr.Text(value = "cuda"), "text", - gr.Text(value = "outputs/img2img-samples"),"checkbox", "checkbox",], + inputs=[ + gr.Image(tool="editor", type="pil"), + "text", + gr.Slider(0, 1, value=0.75), + gr.Slider(1, 1000, value=50), + gr.Slider(1, 100, step=1), + gr.Slider(1, 100, step=1), + gr.Slider(64, 4096, value=512, step=64), + gr.Slider(64, 4096, value=512, step=64), + gr.Slider(0, 50, value=7.5, step=0.1), + gr.Slider(0, 1, step=0.01), + gr.Slider(1, 2, value=1, step=1), + gr.Text(value="cuda"), + "text", + gr.Text(value="outputs/img2img-samples"), + "checkbox", + "checkbox", + ], outputs=["image", "text"], ) -demo.launch() \ No newline at end of file +demo.launch() diff --git a/optimizedSD/optimized_img2img.py b/optimizedSD/optimized_img2img.py index 0fb06390b..8bf8635d7 100644 --- a/optimizedSD/optimized_img2img.py +++ b/optimizedSD/optimized_img2img.py @@ -16,8 +16,10 @@ from ldm.util import instantiate_from_config from split_subprompts import split_weighted_subprompts from transformers import logging + logging.set_verbosity_error() + def chunk(it, size): it = iter(it) return iter(lambda: tuple(islice(it, size)), ()) @@ -31,23 +33,25 @@ def load_model_from_config(ckpt, verbose=False): sd = pl_sd["state_dict"] return sd + def load_img(path, h0, w0): - + image = Image.open(path).convert("RGB") w, h = image.size - print(f"loaded input image of size ({w}, {h}) from {path}") - if(h0 is not None and w0 is not None): + print(f"loaded input image of size ({w}, {h}) from {path}") + if h0 is not None and w0 is not None: h, w = h0, w0 - + w, h = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 32 print(f"New image size ({w}, {h})") - image = image.resize((w, h), resample = Image.LANCZOS) + image = image.resize((w, h), resample=Image.LANCZOS) image = np.array(image).astype(np.float32) / 255.0 image = image[None].transpose(0, 3, 1, 2) image = torch.from_numpy(image) - return 2.*image - 1. 
+ return 2.0 * image - 1.0 + config = "optimizedSD/v1-inference.yaml" ckpt = "models/ldm/stable-diffusion-v1/model.ckpt" @@ -55,34 +59,19 @@ def load_img(path, h0, w0): parser = argparse.ArgumentParser() parser.add_argument( - "--prompt", - type=str, - nargs="?", - default="a painting of a virus monster playing guitar", - help="the prompt to render" -) -parser.add_argument( - "--outdir", - type=str, - nargs="?", - help="dir to write results to", - default="outputs/img2img-samples" -) -parser.add_argument( - "--init-img", - type=str, - nargs="?", - help="path to the input image" + "--prompt", type=str, nargs="?", default="a painting of a virus monster playing guitar", help="the prompt to render" ) +parser.add_argument("--outdir", type=str, nargs="?", help="dir to write results to", default="outputs/img2img-samples") +parser.add_argument("--init-img", type=str, nargs="?", help="path to the input image") parser.add_argument( "--skip_grid", - action='store_true', + action="store_true", help="do not save a grid, only individual samples. Helpful when evaluating lots of samples", ) parser.add_argument( "--skip_save", - action='store_true', + action="store_true", help="do not save individual samples. For speed measurements.", ) parser.add_argument( @@ -165,15 +154,11 @@ def load_img(path, h0, w0): ) parser.add_argument( "--turbo", - action='store_true', + action="store_true", help="Reduces inference time on the expense of 1GB VRAM", ) parser.add_argument( - "--precision", - type=str, - help="evaluate at this precision", - choices=["full", "autocast"], - default="autocast" + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast" ) opt = parser.parse_args() @@ -181,7 +166,7 @@ def load_img(path, h0, w0): os.makedirs(opt.outdir, exist_ok=True) outpath = opt.outdir -sample_path = os.path.join(outpath, '_'.join(re.split(':| ',opt.prompt)))[:150] +sample_path = os.path.join(outpath, "_".join(re.split(":| ", opt.prompt)))[:150] os.makedirs(sample_path, exist_ok=True) base_count = len(os.listdir(sample_path)) grid_count = len(os.listdir(outpath)) - 1 @@ -194,20 +179,20 @@ def load_img(path, h0, w0): li = [] lo = [] for key, value in sd.items(): - sp = key.split('.') - if(sp[0]) == 'model': - if('input_blocks' in sp): + sp = key.split(".") + if (sp[0]) == "model": + if "input_blocks" in sp: li.append(key) - elif('middle_block' in sp): + elif "middle_block" in sp: li.append(key) - elif('time_embed' in sp): + elif "time_embed" in sp: li.append(key) else: lo.append(key) for key in li: - sd['model1.' + key[6:]] = sd.pop(key) + sd["model1." + key[6:]] = sd.pop(key) for key in lo: - sd['model2.' + key[6:]] = sd.pop(key) + sd["model2." + key[6:]] = sd.pop(key) config = OmegaConf.load(f"{config}") @@ -220,17 +205,17 @@ def load_img(path, h0, w0): model.cdevice = opt.device model.unet_bs = opt.unet_bs model.turbo = opt.turbo - + modelCS = instantiate_from_config(config.modelCondStage) _, _ = modelCS.load_state_dict(sd, strict=False) modelCS.eval() modelCS.cond_stage_model.device = opt.device - + modelFS = instantiate_from_config(config.modelFirstStage) _, _ = modelFS.load_state_dict(sd, strict=False) modelFS.eval() del sd -if opt.device != 'cpu' and opt.precision == "autocast": +if opt.device != "cpu" and opt.precision == "autocast": model.half() modelCS.half() modelFS.half() @@ -252,33 +237,33 @@ def load_img(path, h0, w0): modelFS.to(opt.device) -init_image = repeat(init_image, '1 ... -> b ...', b=batch_size) +init_image = repeat(init_image, "1 ... 
-> b ...", b=batch_size) init_latent = modelFS.get_first_stage_encoding(modelFS.encode_first_stage(init_image)) # move to latent space -if(opt.device != 'cpu'): - mem = torch.cuda.memory_allocated()/1e6 +if opt.device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelFS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) -assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]' +assert 0.0 <= opt.strength <= 1.0, "can only work with strength in [0.0, 1.0]" t_enc = int(opt.strength * opt.ddim_steps) print(f"target t_enc is {t_enc} steps") -if opt.precision=="autocast" and opt.device != "cpu": +if opt.precision == "autocast" and opt.device != "cpu": precision_scope = autocast else: precision_scope = nullcontext -seeds = '' +seeds = "" with torch.no_grad(): all_samples = list() for n in trange(opt.n_iter, desc="Sampling"): for prompts in tqdm(data, desc="data"): - with precision_scope("cuda"): + with precision_scope("cuda"): modelCS.to(opt.device) uc = None if opt.scale != 1.0: @@ -286,7 +271,7 @@ def load_img(path, h0, w0): if isinstance(prompts, tuple): prompts = list(prompts) - subprompts,weights = split_weighted_subprompts(prompts[0]) + subprompts, weights = split_weighted_subprompts(prompts[0]) if len(subprompts) > 1: c = torch.zeros_like(uc) totalWeight = sum(weights) @@ -295,48 +280,65 @@ def load_img(path, h0, w0): weight = weights[i] # if not skip_normalize: weight = weight / totalWeight - c = torch.add(c,modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) + c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) else: c = modelCS.get_learned_conditioning(prompts) - if(opt.device != 'cpu'): - mem = torch.cuda.memory_allocated()/1e6 + if opt.device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelCS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) # encode (scaled latent) - z_enc = model.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(opt.device), opt.seed,opt.ddim_eta, opt.ddim_steps) + z_enc = model.stochastic_encode( + init_latent, + torch.tensor([t_enc] * batch_size).to(opt.device), + opt.seed, + opt.ddim_eta, + opt.ddim_steps, + ) # decode it - samples_ddim = model.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale, - unconditional_conditioning=uc,) - + samples_ddim = model.decode( + z_enc, + c, + t_enc, + unconditional_guidance_scale=opt.scale, + unconditional_conditioning=uc, + ) modelFS.to(opt.device) print("saving images") for i in range(batch_size): - + x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0)) x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) - x_sample = 255. 
* rearrange(x_sample[0].cpu().numpy(), 'c h w -> h w c') + x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") Image.fromarray(x_sample.astype(np.uint8)).save( - os.path.join(sample_path, "seed_" + str(opt.seed) + "_" + f"{base_count:05}.png")) - seeds+= str(opt.seed) + ',' - opt.seed+=1 + os.path.join(sample_path, "seed_" + str(opt.seed) + "_" + f"{base_count:05}.png") + ) + seeds += str(opt.seed) + "," + opt.seed += 1 base_count += 1 - - if(opt.device != 'cpu'): - mem = torch.cuda.memory_allocated()/1e6 + if opt.device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelFS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) del samples_ddim - print("memory_final = ", torch.cuda.memory_allocated()/1e6) + print("memory_final = ", torch.cuda.memory_allocated() / 1e6) toc = time.time() -time_taken = (toc-tic)/60.0 +time_taken = (toc - tic) / 60.0 -print(("Your samples are ready in {0:.2f} minutes and waiting for you here " + sample_path + "\n Seeds used = " + seeds[:-1]).format(time_taken)) \ No newline at end of file +print( + ( + "Samples finished in {0:.2f} minutes and exported to " + + sample_path + + "\n Seeds used = " + + seeds[:-1] + ).format(time_taken) +) diff --git a/optimizedSD/optimized_txt2img.py b/optimizedSD/optimized_txt2img.py index a69a9cd08..ef4b814c3 100644 --- a/optimizedSD/optimized_txt2img.py +++ b/optimizedSD/optimized_txt2img.py @@ -15,6 +15,7 @@ from ldm.util import instantiate_from_config from split_subprompts import split_weighted_subprompts from transformers import logging + logging.set_verbosity_error() @@ -38,27 +39,17 @@ def load_model_from_config(ckpt, verbose=False): parser = argparse.ArgumentParser() parser.add_argument( - "--prompt", - type=str, - nargs="?", - default="a painting of a virus monster playing guitar", - help="the prompt to render" -) -parser.add_argument( - "--outdir", - type=str, - nargs="?", - help="dir to write results to", - default="outputs/txt2img-samples" + "--prompt", type=str, nargs="?", default="a painting of a virus monster playing guitar", help="the prompt to render" ) +parser.add_argument("--outdir", type=str, nargs="?", help="dir to write results to", default="outputs/txt2img-samples") parser.add_argument( "--skip_grid", - action='store_true', + action="store_true", help="do not save a grid, only individual samples. Helpful when evaluating lots of samples", ) parser.add_argument( "--skip_save", - action='store_true', + action="store_true", help="do not save individual samples. 
For speed measurements.", ) parser.add_argument( @@ -70,7 +61,7 @@ def load_model_from_config(ckpt, verbose=False): parser.add_argument( "--fixed_code", - action='store_true', + action="store_true", help="if enabled, uses the same starting code across samples ", ) parser.add_argument( @@ -152,15 +143,11 @@ def load_model_from_config(ckpt, verbose=False): ) parser.add_argument( "--turbo", - action='store_true', + action="store_true", help="Reduces inference time on the expense of 1GB VRAM", ) parser.add_argument( - "--precision", - type=str, - help="evaluate at this precision", - choices=["full", "autocast"], - default="autocast" + "--precision", type=str, help="evaluate at this precision", choices=["full", "autocast"], default="autocast" ) opt = parser.parse_args() @@ -168,7 +155,7 @@ def load_model_from_config(ckpt, verbose=False): os.makedirs(opt.outdir, exist_ok=True) outpath = opt.outdir -sample_path = os.path.join(outpath, '_'.join(re.split(':| ',opt.prompt)))[:150] +sample_path = os.path.join(outpath, "_".join(re.split(":| ", opt.prompt)))[:150] os.makedirs(sample_path, exist_ok=True) base_count = len(os.listdir(sample_path)) grid_count = len(os.listdir(outpath)) - 1 @@ -180,20 +167,20 @@ def load_model_from_config(ckpt, verbose=False): sd = load_model_from_config(f"{ckpt}") li, lo = [], [] for key, value in sd.items(): - sp = key.split('.') - if(sp[0]) == 'model': - if('input_blocks' in sp): + sp = key.split(".") + if (sp[0]) == "model": + if "input_blocks" in sp: li.append(key) - elif('middle_block' in sp): + elif "middle_block" in sp: li.append(key) - elif('time_embed' in sp): + elif "time_embed" in sp: li.append(key) else: lo.append(key) for key in li: - sd['model1.' + key[6:]] = sd.pop(key) + sd["model1." + key[6:]] = sd.pop(key) for key in lo: - sd['model2.' + key[6:]] = sd.pop(key) + sd["model2." 
+ key[6:]] = sd.pop(key) config = OmegaConf.load(f"{config}") @@ -203,12 +190,12 @@ def load_model_from_config(ckpt, verbose=False): model.unet_bs = opt.unet_bs model.cdevice = opt.device model.turbo = opt.turbo - + modelCS = instantiate_from_config(config.modelCondStage) _, _ = modelCS.load_state_dict(sd, strict=False) modelCS.eval() modelCS.cond_stage_model.device = opt.device - + modelFS = instantiate_from_config(config.modelFirstStage) _, _ = modelFS.load_state_dict(sd, strict=False) modelFS.eval() @@ -238,27 +225,26 @@ def load_model_from_config(ckpt, verbose=False): data = list(chunk(data, batch_size)) -if opt.precision=="autocast" and opt.device != "cpu": +if opt.precision == "autocast" and opt.device != "cpu": precision_scope = autocast else: precision_scope = nullcontext -seeds = '' +seeds = "" with torch.no_grad(): all_samples = list() for n in trange(opt.n_iter, desc="Sampling"): for prompts in tqdm(data, desc="data"): - with precision_scope("cuda"): + with precision_scope("cuda"): modelCS.to(opt.device) uc = None if opt.scale != 1.0: uc = modelCS.get_learned_conditioning(batch_size * [""]) if isinstance(prompts, tuple): prompts = list(prompts) - - subprompts,weights = split_weighted_subprompts(prompts[0]) + subprompts, weights = split_weighted_subprompts(prompts[0]) if len(subprompts) > 1: c = torch.zeros_like(uc) totalWeight = sum(weights) @@ -267,56 +253,64 @@ def load_model_from_config(ckpt, verbose=False): weight = weights[i] # if not skip_normalize: weight = weight / totalWeight - c = torch.add(c,modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) + c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) else: c = modelCS.get_learned_conditioning(prompts) - shape = [opt.C, opt.H // opt.f, opt.W // opt.f] - if(opt.device != 'cpu'): - mem = torch.cuda.memory_allocated()/1e6 + if opt.device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelCS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) - - samples_ddim = model.sample(S=opt.ddim_steps, - conditioning=c, - batch_size=opt.n_samples, - seed = opt.seed, - shape=shape, - verbose=False, - unconditional_guidance_scale=opt.scale, - unconditional_conditioning=uc, - eta=opt.ddim_eta, - x_T=start_code) + samples_ddim = model.sample( + S=opt.ddim_steps, + conditioning=c, + batch_size=opt.n_samples, + seed=opt.seed, + shape=shape, + verbose=False, + unconditional_guidance_scale=opt.scale, + unconditional_conditioning=uc, + eta=opt.ddim_eta, + x_T=start_code, + ) modelFS.to(opt.device) print(samples_ddim.shape) print("saving images") for i in range(batch_size): - + x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0)) x_sample = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) - x_sample = 255. 
* rearrange(x_sample[0].cpu().numpy(), 'c h w -> h w c') + x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") Image.fromarray(x_sample.astype(np.uint8)).save( - os.path.join(sample_path, "seed_" + str(opt.seed) + "_" + f"{base_count:05}.png")) - seeds+= str(opt.seed) + ',' - opt.seed+=1 + os.path.join(sample_path, "seed_" + str(opt.seed) + "_" + f"{base_count:05}.png") + ) + seeds += str(opt.seed) + "," + opt.seed += 1 base_count += 1 - if(opt.device != 'cpu'): - mem = torch.cuda.memory_allocated()/1e6 + if opt.device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelFS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) del samples_ddim - print("memory_final = ", torch.cuda.memory_allocated()/1e6) + print("memory_final = ", torch.cuda.memory_allocated() / 1e6) toc = time.time() -time_taken = (toc-tic)/60.0 +time_taken = (toc - tic) / 60.0 -print(("Your samples are ready in {0:.2f} minutes and waiting for you here " + sample_path + "\n Seeds used = " + seeds[:-1]).format(time_taken)) \ No newline at end of file +print( + ( + "Samples finished in {0:.2f} minutes and exported to " + + sample_path + + "\n Seeds used = " + + seeds[:-1] + ).format(time_taken) +) diff --git a/optimizedSD/txt2img_gradio.py b/optimizedSD/txt2img_gradio.py index 2ff3dbedf..ac7ff8983 100644 --- a/optimizedSD/txt2img_gradio.py +++ b/optimizedSD/txt2img_gradio.py @@ -21,10 +21,12 @@ from ldm.util import instantiate_from_config from split_subprompts import split_weighted_subprompts from transformers import logging + logging.set_verbosity_error() import mimetypes + mimetypes.init() -mimetypes.add_type('application/javascript', '.js') +mimetypes.add_type("application/javascript", ".js") def chunk(it, size): @@ -46,39 +48,55 @@ def load_model_from_config(ckpt, verbose=False): sd = load_model_from_config(f"{ckpt}") li, lo = [], [] for key, v_ in sd.items(): - sp = key.split('.') - if(sp[0]) == 'model': - if('input_blocks' in sp): + sp = key.split(".") + if (sp[0]) == "model": + if "input_blocks" in sp: li.append(key) - elif('middle_block' in sp): + elif "middle_block" in sp: li.append(key) - elif('time_embed' in sp): + elif "time_embed" in sp: li.append(key) else: lo.append(key) for key in li: - sd['model1.' + key[6:]] = sd.pop(key) + sd["model1." + key[6:]] = sd.pop(key) for key in lo: - sd['model2.' + key[6:]] = sd.pop(key) + sd["model2." 
+ key[6:]] = sd.pop(key) config = OmegaConf.load(f"{config}") model = instantiate_from_config(config.modelUNet) _, _ = model.load_state_dict(sd, strict=False) model.eval() - + modelCS = instantiate_from_config(config.modelCondStage) _, _ = modelCS.load_state_dict(sd, strict=False) modelCS.eval() - + modelFS = instantiate_from_config(config.modelFirstStage) _, _ = modelFS.load_state_dict(sd, strict=False) modelFS.eval() del sd -def generate(prompt,ddim_steps,n_iter, batch_size, Height, Width, scale, ddim_eta,unet_bs, device,seed, outdir,turbo,full_precision,): - - seeds = '' + +def generate( + prompt, + ddim_steps, + n_iter, + batch_size, + Height, + Width, + scale, + ddim_eta, + unet_bs, + device, + seed, + outdir, + turbo, + full_precision, +): + + seeds = "" C = 4 f = 8 start_code = None @@ -87,18 +105,18 @@ def generate(prompt,ddim_steps,n_iter, batch_size, Height, Width, scale, ddim_et model.cdevice = device modelCS.cond_stage_model.device = device - if device != 'cpu' and full_precision == False: + if device != "cpu" and full_precision == False: model.half() modelCS.half() tic = time.time() os.makedirs(outdir, exist_ok=True) outpath = outdir - sample_path = os.path.join(outpath, '_'.join(re.split(':| ',prompt)))[:150] + sample_path = os.path.join(outpath, "_".join(re.split(":| ", prompt)))[:150] os.makedirs(sample_path, exist_ok=True) base_count = len(os.listdir(sample_path)) - - if seed == '': + + if seed == "": seed = randint(0, 1000000) seed = int(seed) seed_everything(seed) @@ -107,7 +125,7 @@ def generate(prompt,ddim_steps,n_iter, batch_size, Height, Width, scale, ddim_et assert prompt is not None data = [batch_size * [prompt]] - if full_precision== False and device != "cpu": + if full_precision == False and device != "cpu": precision_scope = autocast else: precision_scope = nullcontext @@ -126,7 +144,7 @@ def generate(prompt,ddim_steps,n_iter, batch_size, Height, Width, scale, ddim_et if isinstance(prompts, tuple): prompts = list(prompts) - subprompts,weights = split_weighted_subprompts(prompts[0]) + subprompts, weights = split_weighted_subprompts(prompts[0]) if len(subprompts) > 1: c = torch.zeros_like(uc) totalWeight = sum(weights) @@ -135,73 +153,93 @@ def generate(prompt,ddim_steps,n_iter, batch_size, Height, Width, scale, ddim_et weight = weights[i] # if not skip_normalize: weight = weight / totalWeight - c = torch.add(c,modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) + c = torch.add(c, modelCS.get_learned_conditioning(subprompts[i]), alpha=weight) else: c = modelCS.get_learned_conditioning(prompts) shape = [C, Height // f, Width // f] - if device != 'cpu': - mem = torch.cuda.memory_allocated()/1e6 + if device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelCS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) - - samples_ddim = model.sample(S=ddim_steps, - conditioning=c, - batch_size=batch_size, - seed = seed, - shape=shape, - verbose=False, - unconditional_guidance_scale=scale, - unconditional_conditioning=uc, - eta=ddim_eta, - x_T=start_code) + samples_ddim = model.sample( + S=ddim_steps, + conditioning=c, + batch_size=batch_size, + seed=seed, + shape=shape, + verbose=False, + unconditional_guidance_scale=scale, + unconditional_conditioning=uc, + eta=ddim_eta, + x_T=start_code, + ) modelFS.to(device) print("saving images") for i in range(batch_size): - + x_samples_ddim = modelFS.decode_first_stage(samples_ddim[i].unsqueeze(0)) x_sample = torch.clamp((x_samples_ddim + 
1.0) / 2.0, min=0.0, max=1.0) all_samples.append(x_sample.to("cpu")) - x_sample = 255. * rearrange(x_sample[0].cpu().numpy(), 'c h w -> h w c') + x_sample = 255.0 * rearrange(x_sample[0].cpu().numpy(), "c h w -> h w c") Image.fromarray(x_sample.astype(np.uint8)).save( - os.path.join(sample_path, "seed_" + str(seed) + "_" + f"{base_count:05}.png")) - seeds+= str(seed) + ',' - seed+=1 + os.path.join(sample_path, "seed_" + str(seed) + "_" + f"{base_count:05}.png") + ) + seeds += str(seed) + "," + seed += 1 base_count += 1 - if device != 'cpu': - mem = torch.cuda.memory_allocated()/1e6 + if device != "cpu": + mem = torch.cuda.memory_allocated() / 1e6 modelFS.to("cpu") - while(torch.cuda.memory_allocated()/1e6 >= mem): + while torch.cuda.memory_allocated() / 1e6 >= mem: time.sleep(1) - + del samples_ddim del x_sample del x_samples_ddim - print("memory_final = ", torch.cuda.memory_allocated()/1e6) + print("memory_final = ", torch.cuda.memory_allocated() / 1e6) toc = time.time() - time_taken = (toc-tic)/60.0 + time_taken = (toc - tic) / 60.0 grid = torch.cat(all_samples, 0) grid = make_grid(grid, nrow=n_iter) - grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy() - - txt = "Your samples are ready in " + str(round(time_taken, 3)) + " minutes and waiting for you here " + sample_path + "\nSeeds used = " + seeds[:-1] + grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy() + + txt = ( + "Samples finished in " + + str(round(time_taken, 3)) + + " minutes and exported to " + + sample_path + + "\nSeeds used = " + + seeds[:-1] + ) return Image.fromarray(grid.astype(np.uint8)), txt + demo = gr.Interface( fn=generate, - inputs=["text",gr.Slider(1, 1000,value=50),gr.Slider(1, 100, step=1), - gr.Slider(1, 100,step=1), gr.Slider(64,4096,value = 512,step=64), - gr.Slider(64,4096,value = 512,step=64),gr.Slider(0,50,value=7.5,step=0.1), - gr.Slider(0,1,step=0.01),gr.Slider(1,2,value = 1,step=1), - gr.Text(value = "cuda"),"text",gr.Text(value = "outputs/txt2img-samples"), - "checkbox", "checkbox",], + inputs=[ + "text", + gr.Slider(1, 1000, value=50), + gr.Slider(1, 100, step=1), + gr.Slider(1, 100, step=1), + gr.Slider(64, 4096, value=512, step=64), + gr.Slider(64, 4096, value=512, step=64), + gr.Slider(0, 50, value=7.5, step=0.1), + gr.Slider(0, 1, step=0.01), + gr.Slider(1, 2, value=1, step=1), + gr.Text(value="cuda"), + "text", + gr.Text(value="outputs/txt2img-samples"), + "checkbox", + "checkbox", + ], outputs=["image", "text"], ) -demo.launch() \ No newline at end of file +demo.launch()
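As a footnote to the Weighted Prompts section of the README above: the scripts call `split_weighted_subprompts()` and then blend the text conditionings with normalized weights (the `torch.add(..., alpha=weight)` loop visible in the diffs). A minimal, hypothetical parser for the documented `text:weight` syntax could look like the sketch below; it only illustrates the format and is not the repo's actual implementation.

```python
import re

def parse_weighted_prompt(text, default_weight=1.0):
    """Split e.g. 'tabby cat:0.25 white duck:0.75 hybrid' into
    (['tabby cat', 'white duck', 'hybrid'], [0.25, 0.75, 1.0])."""
    subprompts, weights = [], []
    pos = 0
    for m in re.finditer(r"(.*?):\s*([-+]?\d*\.?\d+)\s*", text):
        subprompts.append(m.group(1).strip())
        weights.append(float(m.group(2)))
        pos = m.end()
    rest = text[pos:].strip()
    if rest:  # trailing words without an explicit weight get the default
        subprompts.append(rest)
        weights.append(default_weight)
    return subprompts, weights

subs, ws = parse_weighted_prompt("tabby cat:0.25 white duck:0.75 hybrid")
total = sum(ws)
print(subs, [w / total for w in ws])  # the scripts normalize weights before blending
```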