hitomodev
diff --git a/invokeai/app/invocations/compel.py b/invokeai/app/invocations/compel.py
Lines changed: 4 additions & 2 deletions
diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py
Lines changed: 2 additions & 1 deletion
diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py
Lines changed: 2 additions & 0 deletions
diff --git a/invokeai/backend/model_management/lora.py b/invokeai/backend/model_management/lora.py
Lines changed: 47 additions & 29 deletions
diff --git a/invokeai/backend/model_management/memory_snapshot.py b/invokeai/backend/model_management/memory_snapshot.py
Lines changed: 4 additions & 1 deletion
diff --git a/invokeai/backend/model_management/model_cache.py b/invokeai/backend/model_management/model_cache.py
Lines changed: 43 additions & 9 deletions
diff --git a/invokeai/backend/model_management/model_load_optimizations.py b/invokeai/backend/model_management/model_load_optimizations.py
Lines changed: 1 addition & 1 deletion
diff --git a/invokeai/backend/model_management/model_manager.py b/invokeai/backend/model_management/model_manager.py
Lines changed: 1 addition & 0 deletions
@@ -108,13 +108,14 @@ def _lora_loader():
                 print(f'Warn: trigger: "{trigger}" not found')
 
         with (
-            ModelPatcher.apply_lora_text_encoder(text_encoder_info.context.model, _lora_loader()),
             ModelPatcher.apply_ti(tokenizer_info.context.model, text_encoder_info.context.model, ti_list) as (
                 tokenizer,
                 ti_manager,
             ),
             ModelPatcher.apply_clip_skip(text_encoder_info.context.model, self.clip.skipped_layers),
             text_encoder_info as text_encoder,
+            # Apply the LoRA after text_encoder has been moved to its target device for faster patching.
+            ModelPatcher.apply_lora_text_encoder(text_encoder, _lora_loader()),
         ):
             compel = Compel(
                 tokenizer=tokenizer,
@@ -229,13 +230,14 @@ def _lora_loader():
                 print(f'Warn: trigger: "{trigger}" not found')
 
         with (
-            ModelPatcher.apply_lora(text_encoder_info.context.model, _lora_loader(), lora_prefix),
             ModelPatcher.apply_ti(tokenizer_info.context.model, text_encoder_info.context.model, ti_list) as (
                 tokenizer,
                 ti_manager,
             ),
             ModelPatcher.apply_clip_skip(text_encoder_info.context.model, clip_field.skipped_layers),
             text_encoder_info as text_encoder,
+            # Apply the LoRA after text_encoder has been moved to its target device for faster patching.
+            ModelPatcher.apply_lora(text_encoder, _lora_loader(), lora_prefix),
         ):
             compel = Compel(
                 tokenizer=tokenizer,
 
@@ -710,9 +710,10 @@ def _lora_loader():
             )
             with (
                 ExitStack() as exit_stack,
-                ModelPatcher.apply_lora_unet(unet_info.context.model, _lora_loader()),
                 set_seamless(unet_info.context.model, self.unet.seamless_axes),
                 unet_info as unet,
+                # Apply the LoRA after unet has been moved to its target device for faster patching.
+                ModelPatcher.apply_lora_unet(unet, _lora_loader()),
             ):
                 latents = latents.to(device=unet.device, dtype=unet.dtype)
                 if noise is not None:
 
@@ -45,6 +45,7 @@
     ram: 13.5
     vram: 0.25
     lazy_offload: true
+    log_memory_usage: false
   Device:
     device: auto
     precision: auto
@@ -261,6 +262,7 @@ class InvokeAIAppConfig(InvokeAISettings):
     ram                 : float = Field(default=7.5, gt=0, description="Maximum memory amount used by model cache for rapid switching (floating point number, GB)", json_schema_extra=Categories.ModelCache, )
     vram                : float = Field(default=0.25, ge=0, description="Amount of VRAM reserved for model storage (floating point number, GB)", json_schema_extra=Categories.ModelCache, )
     lazy_offload        : bool = Field(default=True, description="Keep models in VRAM until their space is needed", json_schema_extra=Categories.ModelCache, )
+    log_memory_usage    : bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.", json_schema_extra=Categories.ModelCache)
 
     # DEVICE
     device              : Literal["auto", "cpu", "cuda", "cuda:1", "mps"] = Field(default="auto", description="Generation device", json_schema_extra=Categories.Device)
 
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-import copy
+import pickle
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -54,24 +54,6 @@ def _resolve_lora_key(model: torch.nn.Module, lora_key: str, prefix: str) -> Tup
 
         return (module_key, module)
 
-    @staticmethod
-    def _lora_forward_hook(
-        applied_loras: List[Tuple[LoRAModel, float]],
-        layer_name: str,
-    ):
-        def lora_forward(module, input_h, output):
-            if len(applied_loras) == 0:
-                return output
-
-            for lora, weight in applied_loras:
-                layer = lora.layers.get(layer_name, None)
-                if layer is None:
-                    continue
-                output += layer.forward(module, input_h, weight)
-            return output
-
-        return lora_forward
-
     @classmethod
     @contextmanager
     def apply_lora_unet(
@@ -129,21 +111,40 @@ def apply_lora(
                         if not layer_key.startswith(prefix):
                             continue
 
+                        # TODO(ryand): A non-negligible amount of time is currently spent resolving LoRA keys. This
+                        # should be improved in the following ways:
+                        # 1. The key mapping could be more-efficiently pre-computed. This would save time every time a
+                        #    LoRA model is applied.
+                        # 2. From an API perspective, there's no reason that the `ModelPatcher` should be aware of the
+                        #    intricacies of Stable Diffusion key resolution. It should just expect the input LoRA
+                        #    weights to have valid keys.
                         module_key, module = cls._resolve_lora_key(model, layer_key, prefix)
+
+                        # All of the LoRA weight calculations will be done on the same device as the module weight.
+                        # (Performance will be best if this is a CUDA device.)
+                        device = module.weight.device
+                        dtype = module.weight.dtype
+
                         if module_key not in original_weights:
                             original_weights[module_key] = module.weight.detach().to(device="cpu", copy=True)
 
-                        # enable autocast to calc fp16 loras on cpu
-                        # with torch.autocast(device_type="cpu"):
-                        layer.to(dtype=torch.float32)
                         layer_scale = layer.alpha / layer.rank if (layer.alpha and layer.rank) else 1.0
-                        layer_weight = layer.get_weight(original_weights[module_key]) * lora_weight * layer_scale
+
+                        # We intentionally move to the target device first, then cast. Experimentally, this was found to
+                        # be significantly faster for 16-bit CPU tensors being moved to a CUDA device than doing the
+                        # same thing in a single call to '.to(...)'.
+                        layer.to(device=device)
+                        layer.to(dtype=torch.float32)
+                        # TODO(ryand): Using torch.autocast(...) over explicit casting may offer a speed benefit on CUDA
+                        # devices here. Experimentally, it was found to be very slow on CPU. More investigation needed.
+                        layer_weight = layer.get_weight(module.weight) * (lora_weight * layer_scale)
+                        layer.to(device="cpu")
 
                         if module.weight.shape != layer_weight.shape:
                             # TODO: debug on lycoris
                             layer_weight = layer_weight.reshape(module.weight.shape)
 
-                        module.weight += layer_weight.to(device=module.weight.device, dtype=module.weight.dtype)
+                        module.weight += layer_weight.to(dtype=dtype)
 
             yield  # wait for context manager exit
 
@@ -164,7 +165,13 @@ def apply_ti(
         new_tokens_added = None
 
         try:
-            ti_tokenizer = copy.deepcopy(tokenizer)
+            # HACK: The CLIPTokenizer API does not include a way to remove tokens after calling add_tokens(...). As a
+            # workaround, we create a full copy of `tokenizer` so that its original behavior can be restored after
+            # exiting this `apply_ti(...)` context manager.
+            #
+            # In a previous implementation, the deep copy was obtained with `ti_tokenizer = copy.deepcopy(tokenizer)`,
+            # but a pickle roundtrip was found to be much faster (1 sec vs. 0.05 secs).
+            ti_tokenizer = pickle.loads(pickle.dumps(tokenizer))
             ti_manager = TextualInversionManager(ti_tokenizer)
             init_tokens_count = text_encoder.resize_token_embeddings(None).num_embeddings
 
@@ -196,7 +203,9 @@ def _get_trigger(ti_name, index):
 
                     if model_embeddings.weight.data[token_id].shape != embedding.shape:
                         raise ValueError(
-                            f"Cannot load embedding for {trigger}. It was trained on a model with token dimension {embedding.shape[0]}, but the current model has token dimension {model_embeddings.weight.data[token_id].shape[0]}."
+                            f"Cannot load embedding for {trigger}. It was trained on a model with token dimension"
+                            f" {embedding.shape[0]}, but the current model has token dimension"
+                            f" {model_embeddings.weight.data[token_id].shape[0]}."
                         )
 
                     model_embeddings.weight.data[token_id] = embedding.to(
@@ -257,7 +266,8 @@ def from_checkpoint(
         if "string_to_param" in state_dict:
             if len(state_dict["string_to_param"]) > 1:
                 print(
-                    f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first token will be used.'
+                    f'Warn: Embedding "{file_path.name}" contains multiple tokens, which is not supported. The first'
+                    " token will be used."
                 )
 
             result.embedding = next(iter(state_dict["string_to_param"].values()))
@@ -435,7 +445,13 @@ def apply_ti(
         orig_embeddings = None
 
         try:
-            ti_tokenizer = copy.deepcopy(tokenizer)
+            # HACK: The CLIPTokenizer API does not include a way to remove tokens after calling add_tokens(...). As a
+            # workaround, we create a full copy of `tokenizer` so that its original behavior can be restored after
+            # exiting this `apply_ti(...)` context manager.
+            #
+            # In a previous implementation, the deep copy was obtained with `ti_tokenizer = copy.deepcopy(tokenizer)`,
+            # but a pickle roundtrip was found to be much faster (1 sec vs. 0.05 secs).
+            ti_tokenizer = pickle.loads(pickle.dumps(tokenizer))
             ti_manager = TextualInversionManager(ti_tokenizer)
 
             def _get_trigger(ti_name, index):
@@ -470,7 +486,9 @@ def _get_trigger(ti_name, index):
 
                     if embeddings[token_id].shape != embedding.shape:
                         raise ValueError(
-                            f"Cannot load embedding for {trigger}. It was trained on a model with token dimension {embedding.shape[0]}, but the current model has token dimension {embeddings[token_id].shape[0]}."
+                            f"Cannot load embedding for {trigger}. It was trained on a model with token dimension"
+                            f" {embedding.shape[0]}, but the current model has token dimension"
+                            f" {embeddings[token_id].shape[0]}."
                         )
 
                     embeddings[token_id] = embedding
 
@@ -64,7 +64,7 @@ def capture(cls, run_garbage_collector: bool = True):
         return cls(process_ram, vram, malloc_info)
 
 
-def get_pretty_snapshot_diff(snapshot_1: MemorySnapshot, snapshot_2: MemorySnapshot) -> str:
+def get_pretty_snapshot_diff(snapshot_1: Optional[MemorySnapshot], snapshot_2: Optional[MemorySnapshot]) -> str:
     """Get a pretty string describing the difference between two `MemorySnapshot`s."""
 
     def get_msg_line(prefix: str, val1: int, val2: int):
@@ -73,6 +73,9 @@ def get_msg_line(prefix: str, val1: int, val2: int):
 
     msg = ""
 
+    if snapshot_1 is None or snapshot_2 is None:
+        return msg
+
     msg += get_msg_line("Process RAM", snapshot_1.process_ram, snapshot_2.process_ram)
 
     if snapshot_1.malloc_info is not None and snapshot_2.malloc_info is not None:
 
@@ -117,6 +117,7 @@ def __init__(
         lazy_offloading: bool = True,
         sha_chunksize: int = 16777216,
         logger: types.ModuleType = logger,
+        log_memory_usage: bool = False,
     ):
         """
         :param max_cache_size: Maximum size of the RAM cache [6.0 GB]
@@ -126,6 +127,10 @@ def __init__(
         :param lazy_offloading: Keep model in VRAM until another model needs to be loaded
         :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
         :param sha_chunksize: Chunksize to use when calculating sha256 model hash
+        :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache
+            operation, and the result will be logged (at debug level). There is a time cost to capturing the memory
+            snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's
+            behaviour.
         """
         self.model_infos: Dict[str, ModelBase] = dict()
         # allow lazy offloading only when vram cache enabled
@@ -137,13 +142,19 @@ def __init__(
         self.storage_device: torch.device = storage_device
         self.sha_chunksize = sha_chunksize
         self.logger = logger
+        self._log_memory_usage = log_memory_usage
 
         # used for stats collection
         self.stats = None
 
         self._cached_models = dict()
         self._cache_stack = list()
 
+    def _capture_memory_snapshot(self) -> Optional[MemorySnapshot]:
+        if self._log_memory_usage:
+            return MemorySnapshot.capture()
+        return None
+
     def get_key(
         self,
         model_path: str,
@@ -223,10 +234,10 @@ def get_model(
 
             # Load the model from disk and capture a memory snapshot before/after.
             start_load_time = time.time()
-            snapshot_before = MemorySnapshot.capture()
+            snapshot_before = self._capture_memory_snapshot()
             with skip_torch_weight_init():
                 model = model_info.get_model(child_type=submodel, torch_dtype=self.precision)
-            snapshot_after = MemorySnapshot.capture()
+            snapshot_after = self._capture_memory_snapshot()
             end_load_time = time.time()
 
             self_reported_model_size_after_load = model_info.get_size(submodel)
@@ -275,9 +286,9 @@ def _move_model_to_device(self, key: str, target_device: torch.device):
             return
 
         start_model_to_time = time.time()
-        snapshot_before = MemorySnapshot.capture()
+        snapshot_before = self._capture_memory_snapshot()
         cache_entry.model.to(target_device)
-        snapshot_after = MemorySnapshot.capture()
+        snapshot_after = self._capture_memory_snapshot()
         end_model_to_time = time.time()
         self.logger.debug(
             f"Moved model '{key}' from {source_device} to"
@@ -286,7 +297,12 @@ def _move_model_to_device(self, key: str, target_device: torch.device):
             f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
         )
 
-        if snapshot_before.vram is not None and snapshot_after.vram is not None:
+        if (
+            snapshot_before is not None
+            and snapshot_after is not None
+            and snapshot_before.vram is not None
+            and snapshot_after.vram is not None
+        ):
             vram_change = abs(snapshot_before.vram - snapshot_after.vram)
 
             # If the estimated model size does not match the change in VRAM, log a warning.
@@ -422,12 +438,17 @@ def _make_cache_room(self, model_size):
         self.logger.debug(f"Before unloading: cached_models={len(self._cached_models)}")
 
         pos = 0
+        models_cleared = 0
         while current_size + bytes_needed > maximum_size and pos < len(self._cache_stack):
             model_key = self._cache_stack[pos]
             cache_entry = self._cached_models[model_key]
 
             refs = sys.getrefcount(cache_entry.model)
 
+            # HACK: This is a workaround for a memory-management issue that we haven't tracked down yet. We are directly
+            # going against the advice in the Python docs by using `gc.get_referrers(...)` in this way:
+            # https://docs.python.org/3/library/gc.html#gc.get_referrers
+
             # Manually clear local variable references left over from function calls that have just finished.
             # For some reason Python does not collect them immediately, even after calling gc.collect().
             if refs > 2:
@@ -453,15 +474,16 @@ def _make_cache_room(self, model_size):
                 f" refs: {refs}"
             )
 
-            # 2 refs:
+            # Expected refs:
             # 1 from cache_entry
             # 1 from getrefcount function
             # 1 from onnx runtime object (onnx models only)
-            if not cache_entry.locked and refs <= 3 if "onnx" in model_key else 2:
+            if not cache_entry.locked and refs <= (3 if "onnx" in model_key else 2):
                 self.logger.debug(
                     f"Unloading model {model_key} to free {(model_size/GIG):.2f} GB (-{(cache_entry.size/GIG):.2f} GB)"
                 )
                 current_size -= cache_entry.size
+                models_cleared += 1
                 if self.stats:
                     self.stats.cleared += 1
                 del self._cache_stack[pos]
@@ -471,7 +493,20 @@ def _make_cache_room(self, model_size):
             else:
                 pos += 1
 
-        gc.collect()
+        if models_cleared > 0:
+            # There would likely be some 'garbage' to be collected regardless of whether a model was cleared or not, but
+            # there is a significant time cost to calling `gc.collect()`, so we want to use it sparingly. (The time cost
+            # is high even if no garbage gets collected.)
+            #
+            # Calling gc.collect(...) when a model is cleared seems like a good middle-ground:
+            # - If models had to be cleared, it's a signal that we are close to our memory limit.
+            # - If models were cleared, there's a good chance that there's a significant amount of garbage to be
+            #   collected.
+            #
+            # Keep in mind that gc is only responsible for handling reference cycles. Most objects should be cleaned up
+            # immediately when their reference count hits 0.
+            gc.collect()
+
         torch.cuda.empty_cache()
         if choose_torch_device() == torch.device("mps"):
             mps.empty_cache()
@@ -491,7 +526,6 @@ def _offload_unlocked_models(self, size_needed: int = 0):
                 vram_in_use = torch.cuda.memory_allocated()
                 self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM used for models; max allowed={(reserved/GIG):.2f}GB")
 
-        gc.collect()
         torch.cuda.empty_cache()
         if choose_torch_device() == torch.device("mps"):
             mps.empty_cache()
 
@@ -17,7 +17,7 @@ def skip_torch_weight_init():
     completely unnecessary if the intent is to load checkpoint weights from disk for the layer. This context manager
     monkey-patches common torch layers to skip the weight initialization step.
     """
-    torch_modules = [torch.nn.Linear, torch.nn.modules.conv._ConvNd]
+    torch_modules = [torch.nn.Linear, torch.nn.modules.conv._ConvNd, torch.nn.Embedding]
     saved_functions = [m.reset_parameters for m in torch_modules]
 
     try:
 
@@ -351,6 +351,7 @@ def __init__(
             precision=precision,
             sequential_offload=sequential_offload,
             logger=logger,
+            log_memory_usage=self.app_config.log_memory_usage,
         )
 
         self._read_models(config)
Original file line number	Diff line number	Diff line change
`@@ -351,6 +351,7 @@ def __init__(`
`351`	`351`	`precision=precision,`
`352`	`352`	`sequential_offload=sequential_offload,`
`353`	`353`	`logger=logger,`
	`354`	`+ log_memory_usage=self.app_config.log_memory_usage,`
`354`	`355`	`)`
`355`	`356`
`356`	`357`	`self._read_models(config)`