# Make T2I Adapters work with any resolution supported by the models (#7215)

## Summary

This change mimics the UNet's padding strategy to align the T2I-Adapter feature maps with the latents during denoising. It also slightly adjusts the crop-and-scale logic so that the control matches the input image without shifting when padding is needed.
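
For intuition, here is a minimal sketch of the misalignment being addressed, assuming the standard 8x VAE scale factor and two 2x UNet downsamples at the relevant level (the helper below is illustrative, not part of the codebase):

```python
import math

LATENT_SCALE_FACTOR = 8


def unet_feature_width(image_width: int, downsamples: int) -> int:
    """Width of the UNet feature map after `downsamples` 2x reductions.

    The UNet pads odd-sized feature maps before each downsample,
    hence the ceil-division.
    """
    width = image_width // LATENT_SCALE_FACTOR  # latent width
    for _ in range(downsamples):
        width = math.ceil(width / 2)
    return width


# 1032 px -> latent width 129 (odd), so the UNet pads internally:
print(unet_feature_width(1032, 2))  # 33, via 129 -> 65 -> 33
# An adapter feature map sized by floor-division would be 32 at the same
# level, one element short; this PR zero-pads the adapter output to match.
```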

## Related Issues / Discussions


## QA Instructions

An image generated at 1032x1024:

![image](https://github.com/user-attachments/assets/7ea579e4-61dc-4b6b-aa84-33d676d160c6)

An image generated at 1080x1040, demonstrating feature alignment:

![image](https://github.com/user-attachments/assets/ee6e5b6a-d0d5-474d-9fc4-f65c104964bd)

Edge artifacts along the bottom and right are a result of SDXL's UNet padding; T2I-Adapter influence is cut off in those regions.

## Merge Plan

Contingent on #7205.

Currently, the Canvas UI prevents users from generating at resolutions that are not multiples of 64 while T2I Adapter layers are active. This will remain a draft until that is fixed.

## Checklist

- [x] _The PR has a short but descriptive title, suitable for a
changelog_
- [ ] _Tests added / updated (if applicable)_
- [ ] _Documentation added / updated (if applicable)_
psychedelicious authored Nov 1, 2024
2 parents 26f95d6 + 6fbc019 commit 016a6f1
Showing 3 changed files with 35 additions and 49 deletions.
28 changes: 19 additions & 9 deletions invokeai/app/invocations/denoise_latents.py
```diff
@@ -622,7 +622,7 @@ def run_t2i_adapters(
         for t2i_adapter_field in t2i_adapter:
             t2i_adapter_model_config = context.models.get_config(t2i_adapter_field.t2i_adapter_model.key)
             t2i_adapter_loaded_model = context.models.load(t2i_adapter_field.t2i_adapter_model)
-            image = context.images.get_pil(t2i_adapter_field.image.image_name)
+            image = context.images.get_pil(t2i_adapter_field.image.image_name, mode="RGB")

             # The max_unet_downscale is the maximum amount that the UNet model downscales the latent image internally.
             if t2i_adapter_model_config.base == BaseModelType.StableDiffusion1:
```
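
Loading the control image with `mode="RGB"` presumably normalizes palette or RGBA inputs before the channel count matters downstream. A standalone PIL illustration of what that conversion does (plain PIL, not InvokeAI's context API):

```python
from PIL import Image

# An RGBA control image carries 4 channels; converting to RGB up front keeps
# the channel count consistent with what the adapter expects.
rgba = Image.new("RGBA", (64, 64), (255, 0, 0, 128))
print(rgba.convert("RGB").mode)  # RGB
```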
```diff
@@ -640,29 +640,39 @@ def run_t2i_adapters(
             with t2i_adapter_loaded_model as t2i_adapter_model:
                 total_downscale_factor = t2i_adapter_model.total_downscale_factor

-                # Resize the T2I-Adapter input image.
-                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
-                # result will match the latent image's dimensions after max_unet_downscale is applied.
-                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
-                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
-
                 # Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare
                 # a single image. If CFG is enabled, we will duplicate the resultant tensor after applying the
                 # T2I-Adapter model.
                 #
                 # Note: We re-use the `prepare_control_image(...)` from ControlNet for T2I-Adapter, because it has many
                 # of the same requirements (e.g. preserving binary masks during resize).
+
+                # Assuming fixed dimensional scaling of LATENT_SCALE_FACTOR.
+                _, _, latent_height, latent_width = latents_shape
+                control_height_resize = latent_height * LATENT_SCALE_FACTOR
+                control_width_resize = latent_width * LATENT_SCALE_FACTOR
                 t2i_image = prepare_control_image(
                     image=image,
                     do_classifier_free_guidance=False,
-                    width=t2i_input_width,
-                    height=t2i_input_height,
+                    width=control_width_resize,
+                    height=control_height_resize,
                     num_channels=t2i_adapter_model.config["in_channels"],  # mypy treats this as a FrozenDict
                     device=t2i_adapter_model.device,
                     dtype=t2i_adapter_model.dtype,
                     resize_mode=t2i_adapter_field.resize_mode,
                 )

+                # Resize the T2I-Adapter input image.
+                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
+                # result will match the latent image's dimensions after max_unet_downscale is applied.
+                # We crop the image to this size so that the positions match the input image on non-standard resolutions.
+                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
+                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
+                if t2i_image.shape[2] > t2i_input_height or t2i_image.shape[3] > t2i_input_width:
+                    t2i_image = t2i_image[
+                        :, :, : min(t2i_image.shape[2], t2i_input_height), : min(t2i_image.shape[3], t2i_input_width)
+                    ]
+
                 adapter_state = t2i_adapter_model(t2i_image)

                 if do_classifier_free_guidance:
```
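
In effect, the control image is now resized to the latents' full pixel-space size and only then cropped to the adapter's expected input size, so any mismatch is absorbed at the bottom/right instead of shifting the control. A rough standalone sketch of that arithmetic, using the 1080x1040 QA example (the SDXL constants `max_unet_downscale = 4` and `total_downscale_factor = 32` are assumptions for illustration):

```python
import torch

LATENT_SCALE_FACTOR = 8
max_unet_downscale = 4       # assumed SDXL value
total_downscale_factor = 32  # assumed SDXL adapter value

latents_shape = (1, 4, 1040 // 8, 1080 // 8)  # (1, 4, 130, 135)
_, _, latent_height, latent_width = latents_shape

# Step 1: resize the control image to the latents' full pixel-space size,
# so its content lines up with the input image with no shift.
control_height_resize = latent_height * LATENT_SCALE_FACTOR  # 1040
control_width_resize = latent_width * LATENT_SCALE_FACTOR    # 1080
t2i_image = torch.zeros(1, 3, control_height_resize, control_width_resize)

# Step 2: crop to the largest size that maps cleanly onto the UNet's
# floor-divided downscaled latents.
t2i_input_height = latent_height // max_unet_downscale * total_downscale_factor  # 32 * 32 = 1024
t2i_input_width = latent_width // max_unet_downscale * total_downscale_factor    # 33 * 32 = 1056
t2i_image = t2i_image[:, :, :t2i_input_height, :t2i_input_width]

print(t2i_image.shape)  # torch.Size([1, 3, 1024, 1056])
```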
16 changes: 16 additions & 0 deletions invokeai/backend/stable_diffusion/diffusers_pipeline.py
```diff
@@ -499,6 +499,22 @@ def step(
                 for idx, value in enumerate(single_t2i_adapter_data.adapter_state):
                     accum_adapter_state[idx] += value * t2i_adapter_weight

+            # Hack: force compatibility with irregular resolutions by padding the feature map with zeros.
+            for idx, tensor in enumerate(accum_adapter_state):
+                # The tensor size is supposed to be some integer downscale factor of the latents size.
+                # Internally, the unet will pad the latents before downscaling between levels when they are no longer divisible by its downscale factor.
+                # If the latent size does not scale down evenly, we need to pad the tensor so that it matches the downscaled, padded latents later on.
+                scale_factor = latents.size()[-1] // tensor.size()[-1]
+                required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]
+                required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]
+                tensor = torch.nn.functional.pad(
+                    tensor,
+                    (0, required_padding_width, 0, required_padding_height, 0, 0, 0, 0),
+                    mode="constant",
+                    value=0,
+                )
+                accum_adapter_state[idx] = tensor
+
             down_intrablock_additional_residuals = accum_adapter_state

         # Handle inpainting models.
```
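
Continuing the 1080x1040 example, this padding brings each accumulated feature map up to the ceil-divided size the UNet actually produces at that level. A standalone sketch of the same math (dummy tensors and an assumed SDXL deepest level, not the pipeline's real state):

```python
import math

import torch

# Latents are 130x135; at the assumed deepest SDXL level (4x), the UNet's
# padded feature map is ceil(130/4) x ceil(135/4) = 33x34, while the adapter
# produced a floor-divided 32x33 map.
latents = torch.zeros(2, 4, 130, 135)
tensor = torch.zeros(2, 1280, 32, 33)  # one accumulated adapter feature map

scale_factor = latents.size()[-1] // tensor.size()[-1]  # 135 // 33 = 4
required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]   # 34 - 33 = 1
required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]  # 33 - 32 = 1

# Zero-pad on the right/bottom only, matching where the UNet pads the latents.
tensor = torch.nn.functional.pad(tensor, (0, required_padding_width, 0, required_padding_height), value=0)
print(tensor.shape)  # torch.Size([2, 1280, 33, 34])
```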
40 changes: 0 additions & 40 deletions invokeai/frontend/web/src/common/hooks/useIsReadyToEnqueue.ts
```diff
@@ -202,46 +202,6 @@ const createSelector = (
         if (controlLayer.controlAdapter.model?.base !== model?.base) {
           problems.push(i18n.t('parameters.invoke.layer.controlAdapterIncompatibleBaseModel'));
         }
-        // T2I Adapters require images have dimensions that are multiples of 64 (SD1.5) or 32 (SDXL)
-        if (controlLayer.controlAdapter.type === 't2i_adapter') {
-          const multiple = model?.base === 'sdxl' ? 32 : 64;
-          if (bbox.scaleMethod === 'none') {
-            if (bbox.rect.width % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxWidth', {
-                  multiple,
-                  width: bbox.rect.width,
-                }),
-              });
-            }
-            if (bbox.rect.height % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxHeight', {
-                  multiple,
-                  height: bbox.rect.height,
-                }),
-              });
-            }
-          } else {
-            if (bbox.scaledSize.width % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxWidth', {
-                  multiple,
-                  width: bbox.scaledSize.width,
-                }),
-              });
-            }
-            if (bbox.scaledSize.height % 16 !== 0) {
-              reasons.push({
-                content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxHeight', {
-                  multiple,
-                  height: bbox.scaledSize.height,
-                }),
-              });
-            }
-          }
-        }
-
         if (problems.length) {
           const content = upperFirst(problems.join(', '));
           reasons.push({ prefix, content });
```
