stark-global
diff --git a/configs/models.yaml b/configs/models.yaml
Lines changed: 7 additions & 0 deletions
diff --git a/configs/stable-diffusion/v1-inpainting-inference.yaml b/configs/stable-diffusion/v1-inpainting-inference.yaml
Lines changed: 79 additions & 0 deletions
diff --git a/ldm/generate.py b/ldm/generate.py
Lines changed: 15 additions & 1 deletion
diff --git a/ldm/invoke/args.py b/ldm/invoke/args.py
Lines changed: 3 additions & 1 deletion
diff --git a/ldm/invoke/conditioning.py b/ldm/invoke/conditioning.py
Lines changed: 22 additions & 1 deletion
diff --git a/ldm/invoke/generator/base.py b/ldm/invoke/generator/base.py
Lines changed: 12 additions & 7 deletions
diff --git a/ldm/invoke/generator/img2img.py b/ldm/invoke/generator/img2img.py
Lines changed: 5 additions & 2 deletions
@@ -13,6 +13,13 @@ stable-diffusion-1.4:
   width: 512
   height: 512
   default: true
+# Model entry for the inpainting checkpoint. Unlike the stable-diffusion
+# entries, `config` points at the dedicated v1-inpainting-inference.yaml.
+inpainting-1.5:
+  description: runwayML tuned inpainting model v1.5
+  weights: models/ldm/stable-diffusion-v1/sd-v1-5-inpainting.ckpt
+  config: configs/stable-diffusion/v1-inpainting-inference.yaml
+# Disabled by default: uncomment the next line to name a separate VAE
+# checkpoint for this model (NOTE(review): confirm the loader reads `vae`).
+#  vae: models/ldm/stable-diffusion-v1/vae-ft-mse-840000-ema-pruned.ckpt
+  width: 512
+  height: 512
 stable-diffusion-1.5:
   config: configs/stable-diffusion/v1-inference.yaml
   weights: models/ldm/stable-diffusion-v1/v1-5-pruned-emaonly.ckpt
 
@@ -0,0 +1,79 @@
+# Configuration for the inpainting variant of the v1 model: the model class
+# is LatentInpaintDiffusion, conditioning is 'hybrid', and the UNet takes
+# 9 input channels (see unet_config below).
+model:
+  base_learning_rate: 7.5e-05
+  target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    # important: ldm/generate.py switches to the omnibus generator when the
+    # sampler reports a conditioning key of 'hybrid' or 'concat'
+    # (NOTE(review): presumably that key is read from this setting - confirm)
+    conditioning_key: hybrid
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    finetune_keys: null
+
+    scheduler_config: # warmup length is set by warm_up_steps below (2500 here; 10000 from scratch)
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 2500 ] # NOTE for resuming. use 10000 if starting from scratch
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    personalization_config:
+      target: ldm.modules.embedding_manager.EmbeddingManager
+      params:
+        placeholder_strings: ["*"]
+        initializer_words: ['face', 'man', 'photo', 'africanmale']
+        per_image_tokens: false
+        num_vectors_per_token: 1
+        progressive_words: False
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 9  # 4 data + 4 downscaled image + 1 mask
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
@@ -421,7 +421,10 @@ def process_image(image,seed):
             )
 
             # TODO: Hacky selection of operation to perform. Needs to be refactored.
-            if (init_image is not None) and (mask_image is not None):
+            if self.sampler.conditioning_key() in ('hybrid','concat'):
+                print(f'** Inpainting model detected. Will try it! **')
+                generator = self._make_omnibus()
+            elif (init_image is not None) and (mask_image is not None):
                 generator = self._make_inpaint()
             elif (embiggen != None or embiggen_tiles != None):
                 generator = self._make_embiggen()
@@ -677,6 +680,7 @@ def _make_images(
 
         return init_image,init_mask
 
+    # lots o' repeated code here! Turn into a make_func()
     def _make_base(self):
         if not self.generators.get('base'):
             from ldm.invoke.generator import Generator
@@ -687,6 +691,7 @@ def _make_img2img(self):
         if not self.generators.get('img2img'):
             from ldm.invoke.generator.img2img import Img2Img
             self.generators['img2img'] = Img2Img(self.model, self.precision)
+            self.generators['img2img'].free_gpu_mem = self.free_gpu_mem
         return self.generators['img2img']
 
     def _make_embiggen(self):
@@ -715,6 +720,15 @@ def _make_inpaint(self):
             self.generators['inpaint'] = Inpaint(self.model, self.precision)
         return self.generators['inpaint']
 
+    # "omnibus" supports the runwayML custom inpainting model, which does
+    # txt2img, img2img and inpainting using slight variations on the same code
+    def _make_omnibus(self):
+        '''
+        Return the cached Omnibus generator, creating it on first use.
+
+        A new instance is built from self.model and self.precision, stored
+        in self.generators['omnibus'], and given this object's free_gpu_mem
+        setting. Later calls return the stored instance unchanged.
+        '''
+        # Built lazily: the import and the construction only run while no
+        # (truthy) 'omnibus' entry is cached.
+        if not self.generators.get('omnibus'):
+            from ldm.invoke.generator.omnibus import Omnibus
+            self.generators['omnibus'] = Omnibus(self.model, self.precision)
+            self.generators['omnibus'].free_gpu_mem = self.free_gpu_mem
+        return self.generators['omnibus']
+
     def load_model(self):
         '''
         preload model identified in self.model_name
 
@@ -181,7 +181,9 @@ def parse_cmd(self,cmd_string):
         switches_started = False
 
         for element in elements:
-            if element[0] == '-' and not switches_started:
+            if len(element) == 0:  # empty prompt
+                pass
+            elif element[0] == '-' and not switches_started:
                 switches_started = True
             if switches_started:
                 switches.append(element)
 
@@ -123,8 +123,8 @@ def get_uc_and_c_and_ec(prompt_string_uncleaned, model, log_tokens=False, skip_n
         else:
             conditioning, _ = build_embeddings_and_tokens_for_flattened_prompt(model, flattened_prompt, log_tokens=log_tokens)
 
-
     unconditioning, _ = build_embeddings_and_tokens_for_flattened_prompt(model, parsed_negative_prompt, log_tokens=log_tokens)
+    conditioning = flatten_hybrid_conditioning(unconditioning, conditioning)
     return (
         unconditioning, conditioning, InvokeAIDiffuserComponent.ExtraConditioningInfo(
             cross_attention_control_args=cac_args
@@ -166,4 +166,25 @@ def get_tokens_length(model, fragments: list[Fragment]):
     tokens = model.cond_stage_model.get_tokens(fragment_texts, include_start_and_end_markers=False)
     return sum([len(x) for x in tokens])
 
+def flatten_hybrid_conditioning(uncond, cond):
+    '''
+    Merge unconditioned and conditioned conditioning for 'hybrid' models.
+
+    This handles the choice between a conditional conditioning
+    that is a tensor (used by cross attention) vs one that is a dict with
+    additional entries as well, as used by 'hybrid'.
+
+    :param uncond: unconditioned conditioning; must be a dict whenever
+                   cond is a dict, and is otherwise ignored
+    :param cond:   conditioned conditioning, either a dict or any other object
+    :return: when cond is a dict, a new dict with cond's keys in which each
+             value is torch.cat([uncond value, cond value]) (applied element
+             by element when the value is a list); otherwise cond itself,
+             returned unchanged
+    '''
+    if isinstance(cond, dict):
+        # NOTE(review): assert statements are skipped under `python -O`;
+        # uncond is expected to mirror cond's keys and list lengths.
+        assert isinstance(uncond, dict)
+        cond_in = dict()
+        for k in cond:
+            if isinstance(cond[k], list):
+                # list entry: pair the items by position, uncond first
+                cond_in[k] = [
+                    torch.cat([uncond[k][i], cond[k][i]])
+                    for i in range(len(cond[k]))
+                ]
+            else:
+                # single entry: uncond first, then cond
+                cond_in[k] = torch.cat([uncond[k], cond[k]])
+        return cond_in
+    else:
+        # non-dict conditioning passes through untouched
+        return cond
 
+            
@@ -6,6 +6,7 @@
 import numpy as  np
 import random
 import os
+import traceback
 from tqdm import tqdm, trange
 from PIL import Image, ImageFilter
 from einops import rearrange, repeat
@@ -43,14 +44,15 @@ def set_variation(self, seed, variation_amount, with_variations):
         self.variation_amount = variation_amount
         self.with_variations  = with_variations
 
-    def generate(self,prompt,init_image,width,height,iterations=1,seed=None,
+    def generate(self,prompt,init_image,width,height,sampler, iterations=1,seed=None,
                  image_callback=None, step_callback=None, threshold=0.0, perlin=0.0,
                  safety_checker:dict=None,
                  **kwargs):
         scope = choose_autocast(self.precision)
         self.safety_checker = safety_checker
         make_image          = self.get_make_image(
             prompt,
+            sampler = sampler,
             init_image    = init_image,
             width         = width,
             height        = height,
@@ -59,12 +61,14 @@ def generate(self,prompt,init_image,width,height,iterations=1,seed=None,
             perlin        = perlin,
             **kwargs
         )
-
         results             = []
         seed                = seed if seed is not None else self.new_seed()
         first_seed          = seed
         seed, initial_noise = self.generate_initial_noise(seed, width, height)
-        with scope(self.model.device.type), self.model.ema_scope():
+
+        # There used to be an additional self.model.ema_scope() here, but it breaks
+        # the inpaint-1.5 model. Not sure what it did.... ?
+        with scope(self.model.device.type):
             for n in trange(iterations, desc='Generating'):
                 x_T = None
                 if self.variation_amount > 0:
@@ -79,7 +83,8 @@ def generate(self,prompt,init_image,width,height,iterations=1,seed=None,
                     try:
                         x_T = self.get_noise(width,height)
                     except:
-                        pass
+                        print('** An error occurred while getting initial noise **')
+                        print(traceback.format_exc())
 
                 image = make_image(x_T)
 
@@ -95,10 +100,10 @@ def generate(self,prompt,init_image,width,height,iterations=1,seed=None,
 
         return results
 
-    def sample_to_image(self,samples):
+    def sample_to_image(self,samples)->Image.Image:
         """
-        Returns a function returning an image derived from the prompt and the initial image
-        Return value depends on the seed at the time you call it
+        Given samples returned from a sampler, converts
+        it into a PIL Image
         """
         x_samples = self.model.decode_first_stage(samples)
         x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
 
@@ -15,7 +15,7 @@
 class Img2Img(Generator):
     def __init__(self, model, precision):
         super().__init__(model, precision)
-        self.init_latent         = None    # by get_noise()
+        self.init_latent = None    # by get_noise()
 
     def get_make_image(self,prompt,sampler,steps,cfg_scale,ddim_eta,
                        conditioning,init_image,strength,step_callback=None,threshold=0.0,perlin=0.0,**kwargs):
@@ -80,7 +80,10 @@ def get_noise(self,width,height):
 
     def _image_to_tensor(self, image:Image, normalize:bool=True)->Tensor:
         image = np.array(image).astype(np.float32) / 255.0
-        image = image[None].transpose(0, 3, 1, 2)
+        if len(image.shape) == 2:  # 'L' image, as in a mask
+            image = image[None,None]
+        else:                      # 'RGB' image
+            image = image[None].transpose(0, 3, 1, 2)
         image = torch.from_numpy(image)
         if normalize:
             image = 2.0 * image - 1.0