Typical diffusion models are trained to accept a particular form of conditioning, most commonly text, and cannot be conditioned on other modalities without retraining. In this work, we propose a universal guidance algorithm that enables diffusion models to be controlled by arbitrary guidance modalities without the need to retrain any use-specific components. We show that our algorithm successfully generates quality images with guidance functions including segmentation, face recognition, object detection, and classifier signals. Code is available at https://github.com/arpitbansal297/Universal-Guided-Diffusion.
@comment{arXiv preprint with no journal venue: the arXiv identifier is recorded in eprint/archiveprefix (plus doi and url) instead of a journal field.}
@misc{arxiv.2302.07121,
  author        = {Bansal, Arpit and Chu, Hong-Min and Schwarzschild, Avi and Sengupta, Soumyadip and Goldblum, Micah and Geiping, Jonas and Goldstein, Tom},
  title         = {Universal Guidance for Diffusion Models},
  year          = {2023},
  eprint        = {2302.07121},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.2302.07121},
  url           = {https://arxiv.org/abs/2302.07121},
}