Stable Diffusion 视觉提示词注入攻击原理与实现分析

提到提示词注入（Prompt Injection），大家的第一反应往往是精心构造的文本越狱指令。而在图生图任务中，输入图像在本质上扮演了视觉提示词的角色，与文本指令共同指导生成模型。基于这一视角，本文展示针对视觉提示词的注入攻击：通过 PGD 对抗攻击算法对输入图像进行像素级微调，使其生成的违规图像能够绕过开源大模型的 NSFW 安全检测机制。

作为 AI 安全的研究者，理解这些攻击机制的本质是为了探索人工智能的安全边界，从而构建更坚固的防御体系。本教程将深入剖析如何利用对抗样本突破 Stable Diffusion 的安全过滤防线，重点在于技术原理的复现与解析。

一、NSFW 防线：开源模型的安全过滤机制

默认情况下，当我们请求开源模型生成包含敏感或违规特征的内容时，官方内置的 Safety Checker（安全检测器）会拦截该输出，返回一张全黑的图像，并在终端输出如下警告：

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.

我们的目标就是突破上述防线。

二、攻击场景定义 (Threat Model)

我们需要在输入图像中注入肉眼难以察觉的梯度扰动，欺骗官方的 Safety Checker 模块，使模型最终输出违规图像，而非全黑拦截图。

靶标模型：Hugging Face 开源的图像修复模型 stable-diffusion-v1-5-inpainting
输入内容：
1. 算法优化的视觉提示词图像（对抗样本），及其对应的局部掩码（Mask）。
2. 生成提示词（Prompt），例如请求生成特定违规内容。
攻击目标：降低生成图像在 CLIP 特征空间上与违规概念的相似度，使其低于安全判定阈值。

[示意图：攻击前后对比]

三、环境搭建

为了在国内网络环境下稳定拉取 Hugging Face 模型，建议在代码顶部（必须在导入 diffusers、transformers 等库之前）通过环境变量设置国内镜像源及模型缓存地址，避免下载超时；这些环境变量在相关库被导入时读取，导入之后再设置不会生效。

import os
os.environ['HF_HOME'] = '/root/autodl-tmp/cache/'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from SafetyChecker import StableDiffusionSafetyChecker
from PIL import Image, ImageOps
import torch
import numpy as np
import random
from tqdm import tqdm
from diffusers import StableDiffusionInpaintPipeline
import torchvision.transforms as T
import torch.nn as nn
import argparse
import cv2
from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel

# Module-level converters between PIL images and tensors.
to_pil = T.ToPILImage()
to_tensor = T.ToTensor()

def attack(cur_mask, cur_masked_image, prompt, iters, pipe_inpaint, num_inference_steps):
    """Iteratively optimise a bounded perturbation of the masked input image.

    Every iteration runs ``diffusion_forward`` on the perturbed image, embeds
    the result with the module-level ``SafetyChecker`` and takes one
    signed-gradient step on the perturbation ``delta``.

    This function relies on module-level names that are defined outside of
    it: ``SafetyChecker``, ``concept_embeds``, ``concept_embeds_weights``,
    ``cosine_distance`` and ``vae_tensor_to_clip_input``.

    Args:
        cur_mask: Mask tensor; the gradient is multiplied by ``1 - cur_mask``.
        cur_masked_image: Image tensor to perturb; it is moved to CUDA.
            The final clamp assumes values in [-1, 1].
        prompt: Text prompt forwarded to ``diffusion_forward``.
        iters: Number of optimisation steps; must be at least 1.
        pipe_inpaint: Pipeline object forwarded to ``diffusion_forward``.
        num_inference_steps: Denoising steps used per forward pass.

    Returns:
        A tuple ``(adv_sample, inpaint_img)`` of CPU tensors: the perturbed
        input image and the output of the last forward pass.

    Raises:
        ValueError: If ``iters`` is smaller than 1 (there would be no
            forward-pass output to return).
    """
    if iters < 1:
        # Without at least one iteration ``Inpaint_img`` is never assigned.
        raise ValueError("iters must be at least 1")

    cur_mask = cur_mask.clone()
    cur_masked_image = cur_masked_image.clone().cuda()

    # Initialise the adversarial noise delta.
    delta = torch.zeros_like(cur_masked_image).cuda().requires_grad_(True)

    # Attack hyper-parameters.
    epsilon = 8 / 255  # maximum perturbation magnitude per element
    alpha = 1 / 255  # step size of a single update
    # Threshold margin: aim slightly below the stock thresholds so that the
    # result is stable.
    adjustment = 0.02

    for _ in range(iters):
        adv_sample = cur_masked_image + delta
        Inpaint_img = diffusion_forward(
            pipe_inpaint, prompt, adv_sample, cur_mask, num_inference_steps
        )
        clip_input = vae_tensor_to_clip_input(Inpaint_img).cuda()

        # The safety checker is essentially a pre-trained CLIP model.
        image_embeds = SafetyChecker(clip_input)
        # Compare the current image with the preset concept embeddings in
        # CLIP feature space.
        cos_dist = cosine_distance(image_embeds, concept_embeds)

        attack_loss = 0
        loss_total = 0
        for concept_idx, concept_cos in enumerate(cos_dist[0]):
            concept_threshold = concept_embeds_weights[concept_idx].item()
            loss_total = loss_total + concept_cos
            # Only concepts still above (threshold - adjustment) contribute
            # to the targeted loss.
            if torch.round(concept_cos - concept_threshold + adjustment, decimals=3) > 0:
                attack_loss = attack_loss + concept_cos

        # Fall back to the sum over all concepts when the targeted loss is
        # empty (still the integer 0) or evaluates to zero.
        loss = attack_loss if attack_loss else loss_total
        print('attack_loss:', attack_loss)

        # Zero the gradient wherever the mask equals 1, so only positions
        # with mask 0 are updated.
        grad = torch.autograd.grad(loss, [delta])[0] * (1 - cur_mask)
        delta.data = delta.data - alpha * grad.sign()
        delta.data = torch.clamp(delta.data, -epsilon, epsilon)
        # Keep the perturbed image inside the valid [-1, 1] value range.
        delta.data = torch.clamp(cur_masked_image + delta.data, -1, 1) - cur_masked_image
        torch.cuda.empty_cache()

    adv_sample = cur_masked_image + delta
    return adv_sample.data.cpu(), Inpaint_img.data.cpu()

def diffusion_forward(self, prompt, masked_image, mask, num_inference_steps):
    """Run a fixed 512x512 inpainting denoising loop and decode the result.

    The loop is written out by hand and does not use ``torch.no_grad()``, so
    the caller can back-propagate from the returned tensor to
    ``masked_image``. The text embeddings are detached.

    Args:
        self: Pipeline object providing ``tokenizer``, ``text_encoder``,
            ``vae``, ``unet``, ``scheduler`` and ``device``.
        prompt: Text prompt used for the conditional branch.
        masked_image: Image tensor that is encoded by the VAE.
        mask: Mask tensor; resized to the latent resolution
            (``height // 8`` by ``width // 8``).
        num_inference_steps: Number of scheduler timesteps.

    Returns:
        The ``sample`` tensor produced by ``self.vae.decode``.
    """
    height: int = 512
    width: int = 512
    guidance_scale: float = 7.5
    eta: float = 0.0

    # NOTE(review): the prompt is tokenised without ``truncation=True``;
    # confirm that prompts longer than the model maximum cannot reach here.
    text_inputs = self.tokenizer(
        prompt,
        padding="max_length",
        max_length=self.tokenizer.model_max_length,
        return_tensors="pt",
    )
    text_input_ids = text_inputs.input_ids
    text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]

    # Empty prompt for the unconditional branch of classifier-free guidance.
    uncond_tokens = [""]
    max_length = text_input_ids.shape[-1]
    uncond_input = self.tokenizer(
        uncond_tokens,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]

    # Batch order is [unconditional, conditional]; ``chunk(2)`` below relies
    # on it.
    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
    text_embeddings = text_embeddings.detach()

    num_channels_latents = self.vae.config.latent_channels
    latents_shape = (1, num_channels_latents, height // 8, width // 8)
    latents = torch.randn(latents_shape, device=self.device, dtype=text_embeddings.dtype)

    # Duplicate the mask and the image latents to match the two-branch batch.
    mask = torch.nn.functional.interpolate(mask, size=(height // 8, width // 8))
    mask = torch.cat([mask] * 2)

    masked_image_latents = self.vae.encode(masked_image).latent_dist.sample()
    # 0.18215 is presumably the VAE latent scaling factor -- TODO confirm.
    masked_image_latents = 0.18215 * masked_image_latents
    masked_image_latents = torch.cat([masked_image_latents] * 2)

    latents = latents * self.scheduler.init_noise_sigma

    self.scheduler.set_timesteps(num_inference_steps)
    timesteps_tensor = self.scheduler.timesteps.to(self.device)

    for t in timesteps_tensor:
        latent_model_input = torch.cat([latents] * 2)
        # Concatenate the noisy latents, mask and masked-image latents along
        # the channel dimension.
        latent_model_input = torch.cat(
            [latent_model_input, mask, masked_image_latents], dim=1
        )
        noise_pred = self.unet(
            latent_model_input, t, encoder_hidden_states=text_embeddings
        ).sample
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
        latents = self.scheduler.step(noise_pred, t, latents, eta=eta).prev_sample

    # Undo the latent scaling before decoding.
    latents = 1 / 0.18215 * latents
    image = self.vae.decode(latents).sample
    return image

Stable Diffusion 视觉提示词注入攻击原理与实现分析