DeepFloyd IF
概述
DeepFloyd IF 是一种新型的最先进的开源文本到图像模型,具有高度的逼真度和语言理解能力。 该模型由一个冻结的文本编码器和三个级联的像素扩散模块组成:
- 第一阶段:基于文本提示生成 64x64 像素图像的基础模型,
- 第二阶段:64x64 像素 => 256x256 像素的超分辨率模型,
- 第三阶段:256x256 像素 => 1024x1024 像素的超分辨率模型 第一阶段和第二阶段使用基于 T5 变压器的冻结文本编码器来提取文本嵌入,然后将这些嵌入输入到增强的 UNet 架构中,该架构集成了交叉注意力和注意力池化。 第三阶段是 Stability AI 的 x4 放大模型。 结果是一个高度高效的模型,其性能超过了当前的最先进模型,在 COCO 数据集上实现了零样本 FID 分数 6.66。 我们的工作强调了在级联扩散模型的第一阶段使用更大 UNet 架构的潜力,并描绘了文本到图像合成的光明未来。
使用
在使用 IF 之前,你需要接受其使用条件。具体步骤如下:
- 确保你有一个 Hugging Face 账户 并已登录。
- 在 DeepFloyd/IF-I-XL-v1.0 的模型卡上接受许可。接受第一阶段模型卡上的许可将自动接受其他 IF 模型的许可。
- 确保本地登录。安装
huggingface_hub
:
pip install huggingface_hub --upgrade
在 Python 解释器中运行登录函数:
from huggingface_hub import login
login()
并输入你的 Hugging Face Hub 访问令牌。
接下来我们安装 diffusers
及其依赖项:
pip install -q diffusers accelerate transformers
以下部分提供了如何使用 IF 的更详细示例。具体包括:
可用检查点
第一阶段
第二阶段
第三阶段
文本到图像生成
默认情况下,diffusers 使用 模型 CPU 卸载 来运行整个 IF 管道,最低只需 14 GB 的 VRAM。
from diffusers import DiffusionPipeline
from diffusers.utils import pt_to_pil, make_image_grid
import torch
# stage 1
stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
stage_1.enable_model_cpu_offload()
# stage 2
stage_2 = DiffusionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
)
stage_2.enable_model_cpu_offload()
# stage 3
safety_modules = {
"feature_extractor": stage_1.feature_extractor,
"safety_checker": stage_1.safety_checker,
"watermarker": stage_1.watermarker,
}
stage_3 = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16
)
stage_3.enable_model_cpu_offload()
prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
generator = torch.manual_seed(1)
# text embeds
prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt)
# stage 1
stage_1_output = stage_1(
prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, generator=generator, output_type="pt"
).images
#pt_to_pil(stage_1_output)[0].save("./if_stage_I.png")
# stage 2
stage_2_output = stage_2(
image=stage_1_output,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
generator=generator,
output_type="pt",
).images
#pt_to_pil(stage_2_output)[0].save("./if_stage_II.png")
# stage 3
stage_3_output = stage_3(prompt=prompt, image=stage_2_output, noise_level=100, generator=generator).images
#stage_3_output[0].save("./if_stage_III.png")
make_image_grid([pt_to_pil(stage_1_output)[0], pt_to_pil(stage_2_output)[0], stage_3_output[0]], rows=1, rows=3)
文本引导的图像到图像生成
相同的 IF 模型权重可以用于文本引导的图像到图像转换或图像变体生成。 在这种情况下,只需确保使用 [IFImg2ImgPipeline
] 和 [IFImg2ImgSuperResolutionPipeline
] 管道加载权重。
注意:你也可以直接将文本到图像管道的权重移动到图像到图像管道,而无需加载两次,方法是使用 [~DiffusionPipeline.components
] 参数,具体说明请参见此处。
from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline
from diffusers.utils import pt_to_pil, load_image, make_image_grid
import torch
# download image
url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
original_image = load_image(url)
original_image = original_image.resize((768, 512))
# stage 1
stage_1 = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
stage_1.enable_model_cpu_offload()
# stage 2
stage_2 = IFImg2ImgSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
)
stage_2.enable_model_cpu_offload()
# stage 3
safety_modules = {
"feature_extractor": stage_1.feature_extractor,
"safety_checker": stage_1.safety_checker,
"watermarker": stage_1.watermarker,
}
stage_3 = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16
)
stage_3.enable_model_cpu_offload()
prompt = "A fantasy landscape in style minecraft"
generator = torch.manual_seed(1)
# text embeds
prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt)
# stage 1
stage_1_output = stage_1(
image=original_image,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
generator=generator,
output_type="pt",
).images
#pt_to_pil(stage_1_output)[0].save("./if_stage_I.png")
# stage 2
stage_2_output = stage_2(
image=stage_1_output,
original_image=original_image,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
generator=generator,
output_type="pt",
).images
#pt_to_pil(stage_2_output)[0].save("./if_stage_II.png")
# stage 3
stage_3_output = stage_3(prompt=prompt, image=stage_2_output, generator=generator, noise_level=100).images
#stage_3_output[0].save("./if_stage_III.png")
make_image_grid([original_image, pt_to_pil(stage_1_output)[0], pt_to_pil(stage_2_output)[0], stage_3_output[0]], rows=1, rows=4)
文本引导的图像修复生成
相同的 IF 模型权重可以用于文本引导的图像到图像转换或图像变体生成。 在这种情况下,只需确保使用 [IFInpaintingPipeline
] 和 [IFInpaintingSuperResolutionPipeline
] 管道加载权重。
注意:你也可以直接将文本到图像管道的权重移动到图像到图像管道,而无需再次加载,方法是使用 [~DiffusionPipeline.components()
] 函数,具体操作请参见此处。
from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline
from diffusers.utils import pt_to_pil, load_image, make_image_grid
import torch
# download image
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png"
original_image = load_image(url)
# download mask
url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png"
mask_image = load_image(url)
# stage 1
stage_1 = IFInpaintingPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
stage_1.enable_model_cpu_offload()
# stage 2
stage_2 = IFInpaintingSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16
)
stage_2.enable_model_cpu_offload()
# stage 3
safety_modules = {
"feature_extractor": stage_1.feature_extractor,
"safety_checker": stage_1.safety_checker,
"watermarker": stage_1.watermarker,
}
stage_3 = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16
)
stage_3.enable_model_cpu_offload()
prompt = "blue sunglasses"
generator = torch.manual_seed(1)
# text embeds
prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt)
# stage 1
stage_1_output = stage_1(
image=original_image,
mask_image=mask_image,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
generator=generator,
output_type="pt",
).images
#pt_to_pil(stage_1_output)[0].save("./if_stage_I.png")
# stage 2
stage_2_output = stage_2(
image=stage_1_output,
original_image=original_image,
mask_image=mask_image,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
generator=generator,
output_type="pt",
).images
#pt_to_pil(stage_1_output)[0].save("./if_stage_II.png")
# stage 3
stage_3_output = stage_3(prompt=prompt, image=stage_2_output, generator=generator, noise_level=100).images
#stage_3_output[0].save("./if_stage_III.png")
make_image_grid([original_image, mask_image, pt_to_pil(stage_1_output)[0], pt_to_pil(stage_2_output)[0], stage_3_output[0]], rows=1, rows=5)
在不同管道之间转换
除了使用 from_pretrained
加载之外,管道也可以直接从一个管道加载到另一个管道。
from diffusers import IFPipeline, IFSuperResolutionPipeline
pipe_1 = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0")
pipe_2 = IFSuperResolutionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0")
from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline
pipe_1 = IFImg2ImgPipeline(**pipe_1.components)
pipe_2 = IFImg2ImgSuperResolutionPipeline(**pipe_2.components)
from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline
pipe_1 = IFInpaintingPipeline(**pipe_1.components)
pipe_2 = IFInpaintingSuperResolutionPipeline(**pipe_2.components)
优化速度
最简单的优化方法是将所有模型组件移至 GPU。
pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.to("cuda")
你也可以让扩散过程运行更少的时间步。
这可以通过 num_inference_steps
参数来实现:
pipe("<prompt>", num_inference_steps=30)
或者使用 timesteps
参数:
from diffusers.pipelines.deepfloyd_if import fast27_timesteps
pipe("<prompt>", timesteps=fast27_timesteps)
在进行图像变化或修复时,你也可以通过 strength
参数减少时间步数。strength
参数是添加到输入图像的噪声量,同时也决定了去噪过程中的步数。 较小的数值会使图像变化较小,但运行速度更快。
pipe = IFImg2ImgPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.to("cuda")
image = pipe(image=image, prompt="<prompt>", strength=0.3).images
你也可以使用 torch.compile
。请注意,我们尚未对 torch.compile
与 IF 进行全面测试,因此它可能无法给出预期的结果。
from diffusers import DiffusionPipeline
import torch
pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.to("cuda")
pipe.text_encoder = torch.compile(pipe.text_encoder, mode="reduce-overhead", fullgraph=True)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
优化内存
在优化 GPU 内存时,我们可以使用标准的 diffusers CPU 卸载 API。
无论是基于模型的 CPU 卸载,
pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()
或者更激进的基于层的 CPU 卸载。
pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16)
pipe.enable_sequential_cpu_offload()
此外,T5 可以以 8 位精度加载
from transformers import T5EncoderModel
text_encoder = T5EncoderModel.from_pretrained(
"DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
)
from diffusers import DiffusionPipeline
pipe = DiffusionPipeline.from_pretrained(
"DeepFloyd/IF-I-XL-v1.0",
text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder
unet=None,
device_map="auto",
)
prompt_embeds, negative_embeds = pipe.encode_prompt("<prompt>")
对于像 Google Colab 免费版这样 CPU 内存受限的机器,我们无法一次性将所有模型组件加载到 CPU 上,因此我们可以手动仅在需要时加载文本编码器或 UNet 等相应的模型组件。
from diffusers import IFPipeline, IFSuperResolutionPipeline
import torch
import gc
from transformers import T5EncoderModel
from diffusers.utils import pt_to_pil, make_image_grid
text_encoder = T5EncoderModel.from_pretrained(
"DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", device_map="auto", load_in_8bit=True, variant="8bit"
)
# text to image
pipe = DiffusionPipeline.from_pretrained(
"DeepFloyd/IF-I-XL-v1.0",
text_encoder=text_encoder, # pass the previously instantiated 8bit text encoder
unet=None,
device_map="auto",
)
prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"'
prompt_embeds, negative_embeds = pipe.encode_prompt(prompt)
# Remove the pipeline so we can re-load the pipeline with the unet
del text_encoder
del pipe
gc.collect()
torch.cuda.empty_cache()
pipe = IFPipeline.from_pretrained(
"DeepFloyd/IF-I-XL-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto"
)
generator = torch.Generator().manual_seed(0)
stage_1_output = pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
output_type="pt",
generator=generator,
).images
#pt_to_pil(stage_1_output)[0].save("./if_stage_I.png")
# Remove the pipeline so we can load the super-resolution pipeline
del pipe
gc.collect()
torch.cuda.empty_cache()
# First super resolution
pipe = IFSuperResolutionPipeline.from_pretrained(
"DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16, device_map="auto"
)
generator = torch.Generator().manual_seed(0)
stage_2_output = pipe(
image=stage_1_output,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
output_type="pt",
generator=generator,
).images
#pt_to_pil(stage_2_output)[0].save("./if_stage_II.png")
make_image_grid([pt_to_pil(stage_1_output)[0], pt_to_pil(stage_2_output)[0]], rows=1, rows=2)
可用管道:
| 管道 | 任务 | Colab |
|---|---|:---😐 | pipeline_if.py | Text-to-Image Generation | - | | pipeline_if_superresolution.py | Text-to-Image Generation | - | | pipeline_if_img2img.py | Image-to-Image Generation | - | | pipeline_if_img2img_superresolution.py | Image-to-Image Generation | - | | pipeline_if_inpainting.py | Image-to-Image Generation | - | | pipeline_if_inpainting_superresolution.py | Image-to-Image Generation | - |
IFPipeline
[[autodoc]] IFPipeline - all - call
IFSuperResolutionPipeline
[[autodoc]] IFSuperResolutionPipeline - all - call
IFImg2ImgPipeline
[[autodoc]] IFImg2ImgPipeline - all - call
IFImg2ImgSuperResolutionPipeline
[[autodoc]] IFImg2ImgSuperResolutionPipeline - all - call
IFInpaintingPipeline
[[autodoc]] IFInpaintingPipeline - all - call
IFInpaintingSuperResolutionPipeline
[[autodoc]] IFInpaintingSuperResolutionPipeline - all - call