生成式AI开发入门：Python实现GAN与Diffusion模型

# 生成式AI开发入门：Python实现GAN与Diffusion模型

qq_34419312

1197人浏览 · 2025-11-23 21:49:27

qq_34419312 · 2025-11-23 21:49:27 发布

概述

生成式人工智能正在重塑内容创作的边界，从图像生成到文本创作，再到音乐和视频合成。本文将深入探讨两种主流的生成式模型：生成对抗网络（GAN）和扩散模型（Diffusion Models），通过Python实战演示它们的原理和实现。
在这里插入图片描述

生成式AI技术演进

模型发展时间线

年份	技术突破	代表模型	生成质量	训练稳定性	应用影响
2014	GAN诞生	原始GAN	基础	较差	开启生成模型新方向
2015	条件生成	CGAN	可控制	中等	实现定向生成
2016	图像改进	DCGAN	清晰	较好	推动高质量图像生成
2017	循环一致性	CycleGAN	风格转换	稳定	无配对图像转换
2018	渐进训练	ProGAN	高分辨率	复杂	生成高清图像
2019	风格控制	StyleGAN	逼真	需要调优	人脸生成达到新高度
2020	扩散模型	DDPM	优秀	稳定	新的技术路径
2022	文生图	Stable Diffusion	惊人	较好	推动AI艺术大众化
2024	视频生成	Sora	电影级	资源密集	突破视频生成边界

技术对比分析

# Qualitative comparison of generative model families.
# Every family is described along the same six axes, so the axis names are
# listed once and each row only supplies its values in that order.
_COMPARISON_AXES = ('训练稳定性', '生成速度', '多样性', '可控性', '资源需求', '适合场景')

_COMPARISON_ROWS = (
    ('GAN', (
        '中等，需要精细调参',
        '快速，单次前向传播',
        '可能发生模式崩溃',
        '通过条件生成实现',
        '中等',
        '实时生成、数据增强',
    )),
    ('扩散模型', (
        '高，训练相对简单',
        '较慢，需要多步采样',
        '优秀，覆盖完整分布',
        '通过引导实现精细控制',
        '较高，特别是推理时',
        '高质量图像生成、创意艺术',
    )),
    ('VAE', (
        '高',
        '快速',
        '中等，可能模糊',
        '通过潜空间操作',
        '低',
        '数据压缩、插值生成',
    )),
)

# Mapping: model family -> {axis name -> description}.
generative_models_comparison = {
    family: dict(zip(_COMPARISON_AXES, traits))
    for family, traits in _COMPARISON_ROWS
}

GAN基础理论与实现

GAN核心原理

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np

class Generator(nn.Module):
    """Transposed-convolution generator mapping a latent vector to an image.

    The input is expected as ``(batch, latent_dim, 1, 1)``. Four
    ConvTranspose/BatchNorm/ReLU stages followed by a final ConvTranspose and
    ``Tanh`` grow the spatial size 1 -> 4 -> 8 -> 16 -> 32 -> 64.

    Args:
        latent_dim: Number of channels of the latent input.
        img_channels: Number of channels of the generated image.
        feature_map_size: Base channel width; stages use 8x, 4x, 2x and 1x of it.
    """

    def __init__(self, latent_dim=100, img_channels=1, feature_map_size=64):
        super().__init__()

        layers = []
        in_ch = latent_dim
        for stage, multiplier in enumerate((8, 4, 2, 1)):
            out_ch = feature_map_size * multiplier
            # The first stage expands 1x1 -> 4x4 (stride 1, no padding);
            # every later stage doubles the spatial resolution.
            stride, padding = (1, 0) if stage == 0 else (2, 1)
            layers.extend([
                nn.ConvTranspose2d(in_ch, out_ch, 4, stride, padding, bias=False),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(True),
            ])
            in_ch = out_ch

        # Output stage: 32x32 -> 64x64, squashed into [-1, 1].
        layers.append(nn.ConvTranspose2d(in_ch, img_channels, 4, 2, 1, bias=False))
        layers.append(nn.Tanh())

        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Run the latent tensor through the layer stack and return the image."""
        return self.main(input)

class Discriminator(nn.Module):
    """Convolutional discriminator producing one probability per image.

    Four stride-2 convolutions halve a 64x64 input down to 4x4; a final 4x4
    convolution followed by ``Sigmoid`` reduces each image to a single score.

    Args:
        img_channels: Number of channels of the input image.
        feature_map_size: Base channel width; stages use 1x, 2x, 4x and 8x of it.
    """

    def __init__(self, img_channels=1, feature_map_size=64):
        super().__init__()

        # First stage has no batch normalisation.
        layers = [
            nn.Conv2d(img_channels, feature_map_size, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
        ]

        in_ch = feature_map_size
        for multiplier in (2, 4, 8):
            out_ch = feature_map_size * multiplier
            layers.extend([
                nn.Conv2d(in_ch, out_ch, 4, 2, 1, bias=False),
                nn.BatchNorm2d(out_ch),
                nn.LeakyReLU(0.2, inplace=True),
            ])
            in_ch = out_ch

        # Collapse the 4x4 feature map into a single sigmoid score.
        layers.extend([
            nn.Conv2d(in_ch, 1, 4, 1, 0, bias=False),
            nn.Sigmoid(),
        ])

        self.main = nn.Sequential(*layers)

    def forward(self, input):
        """Return a 1-D tensor with one score per input image."""
        scores = self.main(input)
        return scores.view(-1)

GAN训练策略

class GANTrainer:
    """Training helper for a generator/discriminator pair using BCE loss.

    Args:
        generator: Module mapping ``(batch, latent_dim, 1, 1)`` noise to images.
        discriminator: Module mapping images to one probability per image.
        latent_dim: Channel size of the noise fed to the generator.
        lr: Learning rate shared by both Adam optimizers.
        beta1: First Adam beta shared by both optimizers.
    """

    def __init__(self, generator, discriminator, latent_dim=100, lr=0.0002, beta1=0.5):
        self.generator = generator
        self.discriminator = discriminator
        self.latent_dim = latent_dim

        self.criterion = nn.BCELoss()

        adam_betas = (beta1, 0.999)
        self.optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=adam_betas)
        self.optimizer_D = optim.Adam(discriminator.parameters(), lr=lr, betas=adam_betas)

        # Loss history, sampled every 50th batch by train_epoch.
        self.g_losses = []
        self.d_losses = []

    def train_epoch(self, dataloader, device):
        """Run one pass over ``dataloader``, updating D then G on every batch."""
        for step, (real_imgs, _) in enumerate(dataloader):
            n = real_imgs.size(0)
            real_imgs = real_imgs.to(device)

            target_real = torch.ones(n, device=device)
            target_fake = torch.zeros(n, device=device)

            # --- discriminator update ---
            self.optimizer_D.zero_grad()
            loss_real = self.criterion(self.discriminator(real_imgs), target_real)

            noise = torch.randn(n, self.latent_dim, 1, 1, device=device)
            fake_imgs = self.generator(noise)

            # detach(): the discriminator step must not push gradients into G.
            loss_fake = self.criterion(self.discriminator(fake_imgs.detach()), target_fake)

            loss_D = (loss_real + loss_fake) / 2
            loss_D.backward()
            self.optimizer_D.step()

            # --- generator update ---
            # The same fake batch is re-scored by the updated discriminator and
            # compared against the "real" target.
            self.optimizer_G.zero_grad()
            loss_G = self.criterion(self.discriminator(fake_imgs), target_real)
            loss_G.backward()
            self.optimizer_G.step()

            if step % 50 != 0:
                continue
            self.g_losses.append(loss_G.item())
            self.d_losses.append(loss_D.item())

    def generate_samples(self, num_samples, device):
        """Sample ``num_samples`` images from the generator without tracking gradients."""
        with torch.no_grad():
            noise = torch.randn(num_samples, self.latent_dim, 1, 1, device=device)
            return self.generator(noise)

    def plot_training_progress(self):
        """Plot the recorded generator and discriminator losses."""
        plt.figure(figsize=(10, 5))
        for series, label in (
            (self.g_losses, 'Generator Loss'),
            (self.d_losses, 'Discriminator Loss'),
        ):
            plt.plot(series, label=label)
        plt.xlabel('Iterations')
        plt.ylabel('Loss')
        plt.title('GAN Training Progress')
        plt.legend()
        plt.show()

扩散模型原理与实现

扩散过程数学基础

import torch
import torch.nn as nn
import torch.nn.functional as F

class DiffusionProcess:
    """Forward (noising) and reverse (sampling) steps of a diffusion model.

    Uses a linear beta schedule and a noise-predicting model ``model(x, t)``.
    All schedule-derived tables are computed once in ``__init__``.

    Args:
        timesteps: Number of diffusion steps.
        beta_start: First value of the linear beta schedule.
        beta_end: Last value of the linear beta schedule.
    """

    def __init__(self, timesteps=1000, beta_start=0.0001, beta_end=0.02):
        self.timesteps = timesteps

        # Linear variance schedule.
        self.betas = torch.linspace(beta_start, beta_end, timesteps)

        # Cumulative products of alpha = 1 - beta, and the same shifted by one
        # step (padded with 1.0 at t = 0).
        self.alphas = 1. - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.alphas_cumprod_prev = F.pad(self.alphas_cumprod[:-1], (1, 0), value=1.0)

        # Lookup tables used by q_sample / p_sample. They only depend on the
        # schedule, so compute them once instead of on every call.
        self.sqrt_alphas_cumprod = self.alphas_cumprod.sqrt()
        self.sqrt_one_minus_alphas_cumprod = (1. - self.alphas_cumprod).sqrt()
        self.sqrt_recip_alphas = (1.0 / self.alphas).sqrt()

        # Variance of the posterior q(x_{t-1} | x_t, x_0).
        self.posterior_variance = (
            self.betas * (1. - self.alphas_cumprod_prev) / (1. - self.alphas_cumprod)
        )

    def q_sample(self, x_start, t, noise=None):
        """Return ``x_start`` noised to timestep ``t``.

        Args:
            x_start: Clean batch.
            t: 1-D integer tensor with one timestep per batch element.
            noise: Optional noise tensor shaped like ``x_start``; standard
                normal noise is drawn when omitted.
        """
        if noise is None:
            noise = torch.randn_like(x_start)

        signal_scale = self.extract(self.sqrt_alphas_cumprod, t, x_start.shape)
        noise_scale = self.extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)

        return signal_scale * x_start + noise_scale * noise

    def p_losses(self, denoise_model, x_start, t, noise=None, loss_type="l1"):
        """Return the loss between the true noise and the model's prediction.

        Args:
            denoise_model: Callable ``(x_noisy, t) -> predicted_noise``.
            x_start: Clean batch.
            t: 1-D integer tensor of timesteps.
            noise: Optional noise to add; drawn from a standard normal if omitted.
            loss_type: One of ``"l1"``, ``"l2"`` or ``"huber"``.

        Raises:
            NotImplementedError: If ``loss_type`` is not one of the supported names.
        """
        if noise is None:
            noise = torch.randn_like(x_start)

        x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
        predicted_noise = denoise_model(x_noisy, t)

        if loss_type == 'l1':
            loss = F.l1_loss(noise, predicted_noise)
        elif loss_type == 'l2':
            loss = F.mse_loss(noise, predicted_noise)
        elif loss_type == "huber":
            loss = F.smooth_l1_loss(noise, predicted_noise)
        else:
            raise NotImplementedError(
                f"unsupported loss_type {loss_type!r}; expected 'l1', 'l2' or 'huber'"
            )

        return loss

    @torch.no_grad()
    def p_sample(self, model, x, t, t_index):
        """Perform one reverse step from ``x`` at timestep ``t``.

        Args:
            model: Noise-predicting callable ``(x, t) -> noise``.
            x: Current noisy batch.
            t: 1-D integer tensor of timesteps (one per batch element).
            t_index: Python int step index; no noise is added when it is 0.
        """
        betas_t = self.extract(self.betas, t, x.shape)
        sqrt_one_minus_alphas_cumprod_t = self.extract(
            self.sqrt_one_minus_alphas_cumprod, t, x.shape
        )
        sqrt_recip_alphas_t = self.extract(self.sqrt_recip_alphas, t, x.shape)

        # Mean of the reverse step computed from the predicted noise.
        model_mean = sqrt_recip_alphas_t * (
            x - betas_t * model(x, t) / sqrt_one_minus_alphas_cumprod_t
        )

        if t_index == 0:
            return model_mean

        posterior_variance_t = self.extract(self.posterior_variance, t, x.shape)
        noise = torch.randn_like(x)
        return model_mean + torch.sqrt(posterior_variance_t) * noise

    @torch.no_grad()
    def p_sample_loop(self, model, shape):
        """Sample from pure noise, returning the CPU copy of every intermediate step.

        The returned list has ``timesteps`` entries; the last one is the final
        sample. ``model`` must expose ``parameters()`` so its device can be read.
        """
        device = next(model.parameters()).device

        # Start from pure noise.
        img = torch.randn(shape, device=device)
        imgs = []

        for i in reversed(range(0, self.timesteps)):
            t = torch.full((shape[0],), i, device=device, dtype=torch.long)
            img = self.p_sample(model, img, t, i)
            imgs.append(img.cpu())

        return imgs

    def extract(self, a, t, x_shape):
        """Gather ``a[t]`` and reshape it to broadcast against a tensor of ``x_shape``.

        The index is moved to the device of ``a`` before gathering, so the
        lookup works wherever the schedule tensor lives; the result is returned
        on the device of ``t``.
        """
        batch_size = t.shape[0]
        out = a.gather(-1, t.to(a.device))
        return out.reshape(batch_size, *((1,) * (len(x_shape) - 1))).to(t.device)

UNet去噪网络

class ResidualBlock(nn.Module):
    """Residual block of two GroupNorm/SiLU/Conv stages with a time-embedding bias.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels of the output feature map.
        time_emb_dim: Size of the time-embedding vector passed to ``forward``.
    """

    def __init__(self, in_channels, out_channels, time_emb_dim):
        super().__init__()
        self.time_mlp = nn.Linear(time_emb_dim, out_channels)

        self.block1 = nn.Sequential(
            nn.GroupNorm(self._num_groups(in_channels), in_channels),
            nn.SiLU(),
            nn.Conv2d(in_channels, out_channels, 3, padding=1)
        )

        self.block2 = nn.Sequential(
            nn.GroupNorm(self._num_groups(out_channels), out_channels),
            nn.SiLU(),
            nn.Conv2d(out_channels, out_channels, 3, padding=1)
        )

        # A 1x1 convolution matches the channel count of the skip path when
        # the block changes the number of channels.
        if in_channels != out_channels:
            self.residual_conv = nn.Conv2d(in_channels, out_channels, 1)
        else:
            self.residual_conv = nn.Identity()

    @staticmethod
    def _num_groups(channels, max_groups=8):
        """Return the largest group count <= ``max_groups`` that divides ``channels``.

        ``nn.GroupNorm`` requires the channel count to be divisible by the
        number of groups; a fixed 8 fails for e.g. single-channel inputs.
        Channel counts divisible by 8 still get 8 groups.
        """
        for groups in range(min(max_groups, channels), 0, -1):
            if channels % groups == 0:
                return groups
        return 1

    def forward(self, x, t):
        """Apply the block to feature map ``x`` with time embedding ``t``.

        ``t`` is projected to ``out_channels`` values per sample and added to
        every spatial position between the two convolution stages.
        """
        h = self.block1(x)

        # Broadcast the per-channel time bias over height and width.
        time_emb = self.time_mlp(t)
        h = h + time_emb.unsqueeze(-1).unsqueeze(-1)

        h = self.block2(h)
        return h + self.residual_conv(x)

class UNet(nn.Module):
    """Small U-Net that predicts noise from a noisy image and its timestep.

    Args:
        in_channels: Channels of the input image.
        out_channels: Channels of the predicted output.
        time_emb_dim: Size of the timestep embedding.
    """

    def __init__(self, in_channels=1, out_channels=1, time_emb_dim=256):
        super().__init__()
        self.time_emb_dim = time_emb_dim

        # MLP applied on top of the sinusoidal timestep encoding.
        self.time_mlp = nn.Sequential(
            nn.Linear(time_emb_dim, time_emb_dim),
            nn.SiLU(),
            nn.Linear(time_emb_dim, time_emb_dim)
        )

        # Lift the image to 64 channels first so every residual block sees a
        # channel count compatible with its group normalisation.
        self.init_conv = nn.Conv2d(in_channels, 64, 3, padding=1)

        # Down path.
        self.down1 = ResidualBlock(64, 64, time_emb_dim)
        self.down2 = ResidualBlock(64, 128, time_emb_dim)
        self.down3 = ResidualBlock(128, 256, time_emb_dim)

        # Bottleneck.
        self.mid = ResidualBlock(256, 256, time_emb_dim)

        # Up path; input channels are the upsampled features plus the skip.
        self.up1 = ResidualBlock(512, 128, time_emb_dim)  # 512 = 256 + 256
        self.up2 = ResidualBlock(256, 64, time_emb_dim)   # 256 = 128 + 128
        self.up3 = ResidualBlock(128, 64, time_emb_dim)   # 128 = 64 + 64

        self.final_conv = nn.Conv2d(64, out_channels, 1)

        self.downsample = nn.MaxPool2d(2)

    def _embed_timesteps(self, timesteps):
        """Turn timesteps into a ``(batch, time_emb_dim)`` float encoding.

        A 1-D tensor of step indices is encoded with sine/cosine features.
        A tensor that already has a feature dimension is passed through
        unchanged, so callers supplying their own embedding keep working.
        """
        if timesteps.dim() >= 2:
            return timesteps

        half = self.time_emb_dim // 2
        exponent = torch.arange(half, device=timesteps.device, dtype=torch.float32)
        freqs = 10000.0 ** (-exponent / max(half, 1))
        angles = timesteps.float().unsqueeze(-1) * freqs
        emb = torch.cat([angles.sin(), angles.cos()], dim=-1)
        if emb.shape[-1] < self.time_emb_dim:
            # Odd embedding size: pad the missing column with zeros.
            emb = F.pad(emb, (0, self.time_emb_dim - emb.shape[-1]))
        return emb

    @staticmethod
    def _upsample_like(x, reference):
        """Bilinearly resize ``x`` to the spatial size of ``reference``.

        Resizing to the skip tensor's size (instead of a fixed factor of 2)
        keeps the concatenation valid when pooling rounded an odd size down,
        e.g. 7 -> 3 for 28x28 inputs.
        """
        return F.interpolate(
            x, size=reference.shape[-2:], mode='bilinear', align_corners=True
        )

    def forward(self, x, timesteps):
        """Predict the output for image batch ``x`` at the given ``timesteps``."""
        t = self.time_mlp(self._embed_timesteps(timesteps))

        # Down path.
        x1 = self.down1(self.init_conv(x), t)
        x2 = self.down2(self.downsample(x1), t)
        x3 = self.down3(self.downsample(x2), t)

        # Bottleneck.
        x_mid = self.mid(self.downsample(x3), t)

        # Up path with skip connections.
        x = self.up1(torch.cat([self._upsample_like(x_mid, x3), x3], dim=1), t)
        x = self.up2(torch.cat([self._upsample_like(x, x2), x2], dim=1), t)
        x = self.up3(torch.cat([self._upsample_like(x, x1), x1], dim=1), t)

        return self.final_conv(x)

实战项目：手写数字生成

数据准备与训练

def train_gan_mnist():
    """Train the GAN on MNIST resized to 64x64 and return the trained networks.

    Every 10th epoch a 4x4 grid of generated digits is shown; the loss curves
    are plotted once training finishes.

    Returns:
        Tuple ``(generator, discriminator)``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"使用设备: {device}")

    # MNIST is resized to 64x64 and normalised to [-1, 1].
    preprocessing = transforms.Compose([
        transforms.Resize(64),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    mnist = torchvision.datasets.MNIST(
        root='./data', train=True, download=True, transform=preprocessing
    )
    dataloader = DataLoader(mnist, batch_size=128, shuffle=True)

    generator = Generator(latent_dim=100, img_channels=1).to(device)
    discriminator = Discriminator(img_channels=1).to(device)
    trainer = GANTrainer(generator, discriminator)

    def show_generated(epoch):
        # Draw 16 samples and display them as a 4x4 grid.
        with torch.no_grad():
            fake_images = trainer.generate_samples(16, device)
            grid = torchvision.utils.make_grid(fake_images, nrow=4, normalize=True)
            plt.figure(figsize=(8, 8))
            plt.imshow(grid.permute(1, 2, 0).cpu())
            plt.title(f"Generated Images - Epoch {epoch}")
            plt.axis('off')
            plt.show()

    num_epochs = 50
    for epoch in range(num_epochs):
        trainer.train_epoch(dataloader, device)

        if epoch % 10 != 0:
            continue
        print(f"Epoch [{epoch}/{num_epochs}]")
        show_generated(epoch)

    trainer.plot_training_progress()

    return generator, discriminator

def train_diffusion_mnist():
    """Train the diffusion model on MNIST and return the model and its process.

    Every 20th epoch 16 digits are sampled and shown as a 4x4 grid.

    Returns:
        Tuple ``(model, diffusion)``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # MNIST at its native 28x28 resolution, normalised to [-1, 1].
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    mnist = torchvision.datasets.MNIST(
        root='./data', train=True, download=True, transform=preprocessing
    )
    dataloader = DataLoader(mnist, batch_size=128, shuffle=True)

    diffusion = DiffusionProcess(timesteps=1000)
    model = UNet(in_channels=1, out_channels=1).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    def show_samples(epoch):
        # Run the full reverse process and display the final images.
        with torch.no_grad():
            trajectory = diffusion.p_sample_loop(model, (16, 1, 28, 28))
            final_images = trajectory[-1]
            grid = torchvision.utils.make_grid(final_images, nrow=4, normalize=True)
            plt.figure(figsize=(8, 8))
            plt.imshow(grid.permute(1, 2, 0).cpu())
            plt.title(f"Diffusion Generated - Epoch {epoch}")
            plt.axis('off')
            plt.show()

    num_epochs = 100
    for epoch in range(num_epochs):
        for batch_idx, (images, _) in enumerate(dataloader):
            images = images.to(device)

            # One random timestep per image in the batch.
            t = torch.randint(
                0, diffusion.timesteps, (images.shape[0],), device=device
            ).long()

            loss = diffusion.p_losses(model, images, t, loss_type="huber")

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0:
                print(f"Epoch [{epoch}/{num_epochs}] Batch [{batch_idx}/{len(dataloader)}] Loss: {loss.item():.4f}")

        if epoch % 20 == 0:
            show_samples(epoch)

    return model, diffusion

高级技巧与优化

GAN训练稳定性改进

class WGAN_GP_Trainer:
    """Trainer implementing the WGAN loss with a gradient penalty.

    Args:
        generator: Module mapping ``(batch, latent_dim, 1, 1)`` noise to images.
        discriminator: Critic module returning one score per image.
        lambda_gp: Weight of the gradient-penalty term in the critic loss.
        latent_dim: Channel size of the noise fed to the generator
            (defaults to 100, the value previously hard-coded).
    """

    def __init__(self, generator, discriminator, lambda_gp=10, latent_dim=100):
        self.generator = generator
        self.discriminator = discriminator
        self.lambda_gp = lambda_gp
        self.latent_dim = latent_dim

        self.optimizer_G = optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.9))
        self.optimizer_D = optim.Adam(discriminator.parameters(), lr=0.0001, betas=(0.5, 0.9))

    def compute_gradient_penalty(self, real_samples, fake_samples):
        """Return the mean of ``(||grad D(x_hat)||_2 - 1)^2`` over the batch.

        ``x_hat`` is a per-sample random interpolation between real and fake
        images. Inputs are detached, so the penalty only yields gradients for
        the critic's parameters.
        """
        real_samples = real_samples.detach()
        fake_samples = fake_samples.detach()

        batch_size = real_samples.size(0)
        alpha = torch.rand(batch_size, 1, 1, 1, device=real_samples.device)

        # Random points on the segments between real and fake samples.
        interpolates = (alpha * real_samples + (1 - alpha) * fake_samples).requires_grad_(True)
        d_interpolates = self.discriminator(interpolates)

        # create_graph=True keeps the gradient differentiable so the penalty
        # itself can be backpropagated into the critic.
        gradients = torch.autograd.grad(
            outputs=d_interpolates,
            inputs=interpolates,
            grad_outputs=torch.ones_like(d_interpolates),
            create_graph=True,
            retain_graph=True,
            only_inputs=True,
        )[0]

        gradients = gradients.view(gradients.size(0), -1)
        gradient_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean()

        return gradient_penalty

    def _sample_noise(self, batch_size, device):
        """Draw a batch of latent noise for the generator."""
        return torch.randn(batch_size, self.latent_dim, 1, 1, device=device)

    def train_epoch(self, dataloader, device, n_critic=5):
        """Run one pass over ``dataloader``.

        For every batch the critic is updated ``n_critic`` times (on the same
        real batch with fresh fake samples), then the generator once.
        """
        for i, (real_imgs, _) in enumerate(dataloader):
            real_imgs = real_imgs.to(device)
            batch_size = real_imgs.size(0)

            # Critic updates.
            for _ in range(n_critic):
                self.optimizer_D.zero_grad()

                real_validity = self.discriminator(real_imgs)

                # The critic step needs no generator gradients, so the fake
                # batch is produced without building the generator's graph.
                with torch.no_grad():
                    fake_imgs = self.generator(self._sample_noise(batch_size, device))
                fake_validity = self.discriminator(fake_imgs)

                gradient_penalty = self.compute_gradient_penalty(real_imgs, fake_imgs)

                # Wasserstein critic loss plus the weighted penalty.
                d_loss = (
                    -torch.mean(real_validity)
                    + torch.mean(fake_validity)
                    + self.lambda_gp * gradient_penalty
                )

                d_loss.backward()
                self.optimizer_D.step()

            # Generator update.
            self.optimizer_G.zero_grad()

            gen_imgs = self.generator(self._sample_noise(batch_size, device))
            g_loss = -torch.mean(self.discriminator(gen_imgs))

            g_loss.backward()
            self.optimizer_G.step()

class ConditionalGAN:
    """Helper that attaches class-label conditioning to image-shaped tensors.

    Args:
        num_classes: Number of distinct labels.
    """

    def __init__(self, num_classes=10):
        self.num_classes = num_classes

    def add_condition(self, x, labels):
        """Append one-hot label planes to ``x`` along the channel dimension.

        Each label becomes ``num_classes`` constant planes with the spatial
        size of ``x``, so the result has ``C + num_classes`` channels.
        """
        n, _, height, width = x.size(0), x.size(1), x.size(2), x.size(3)

        one_hot = F.one_hot(labels, self.num_classes).float()
        # Shape (n, num_classes, 1, 1), then broadcast over height and width.
        planes = one_hot.view(n, self.num_classes, 1, 1).expand(-1, -1, height, width)

        return torch.cat([x, planes], dim=1)

扩散模型加速采样

class AcceleratedDiffusion(DiffusionProcess):
    """Diffusion process with a shortened DDIM-style sampling loop.

    Args:
        timesteps: Number of diffusion steps of the underlying process.
        sampling_timesteps: Number of model evaluations used by ``ddim_sample``.

    Raises:
        ValueError: If ``sampling_timesteps`` is smaller than 1.
    """

    def __init__(self, timesteps=1000, sampling_timesteps=100):
        super().__init__(timesteps)
        if sampling_timesteps < 1:
            raise ValueError("sampling_timesteps must be at least 1")

        # Kept as a plain int: ddim_sample uses it as a step count. The tensor
        # of evenly spaced step indices lives under a separate name so it no
        # longer overwrites the count.
        self.sampling_timesteps = int(sampling_timesteps)
        self.sampling_schedule = torch.linspace(
            0, timesteps - 1, self.sampling_timesteps, dtype=torch.long
        )

    @torch.no_grad()
    def ddim_sample(self, model, shape, eta=0.0):
        """Sample a batch of the given ``shape`` in ``sampling_timesteps`` steps.

        Args:
            model: Noise-predicting module ``(x, t) -> noise``; must expose
                ``parameters()`` so its device can be read.
            shape: Shape of the batch to generate.
            eta: Stochasticity factor. ``0.0`` makes every step deterministic;
                larger values add noise scaled by the per-step sigma.
        """
        device = next(model.parameters()).device
        batch_size = shape[0]

        # Start from pure noise.
        x = torch.randn(shape, device=device)

        # Descending step indices ending in -1, which stands for "fully
        # denoised" (cumulative alpha treated as 1).
        times = torch.linspace(-1, self.timesteps - 1, steps=self.sampling_timesteps + 1)
        times = list(reversed(times.int().tolist()))

        for time, time_next in zip(times[:-1], times[1:]):
            alpha = self.alphas_cumprod[time]
            if time_next >= 0:
                alpha_next = self.alphas_cumprod[time_next]
            else:
                alpha_next = torch.ones_like(alpha)

            t = torch.full((batch_size,), time, device=device, dtype=torch.long)
            pred_noise = model(x, t)

            # Estimate of the clean sample implied by the predicted noise.
            x0_pred = (x - torch.sqrt(1 - alpha) * pred_noise) / torch.sqrt(alpha)

            # Per-step noise level. It is zero for eta == 0 and for the final
            # step (alpha_next == 1), and it keeps the square root below
            # non-negative, unlike subtracting eta**2 directly.
            sigma = eta * torch.sqrt(
                (1 - alpha_next) / (1 - alpha) * (1 - alpha / alpha_next)
            )

            # Component pointing back towards x_t.
            dir_xt = torch.sqrt((1 - alpha_next - sigma ** 2).clamp(min=0)) * pred_noise

            x = torch.sqrt(alpha_next) * x0_pred + dir_xt
            if eta > 0 and time_next >= 0:
                x = x + sigma * torch.randn_like(x)

        return x

应用场景与评估

生成质量评估指标

class GenerativeMetrics:
    """Static helpers for scoring generated samples."""

    @staticmethod
    def calculate_fid(real_features, fake_features):
        """Return the Fréchet distance between two sets of feature vectors.

        Args:
            real_features: Tensor of shape ``(n_real, dim)``.
            fake_features: Tensor of shape ``(n_fake, dim)``.

        Returns:
            ``||mu1 - mu2||^2 + Tr(S1 + S2 - 2 (S1 S2)^(1/2))`` as a float.
        """
        mu1 = real_features.mean(0)
        mu2 = fake_features.mean(0)
        # atleast_2d keeps the covariance a matrix for 1-D features.
        sigma1 = torch.atleast_2d(torch.cov(real_features.T))
        sigma2 = torch.atleast_2d(torch.cov(fake_features.T))

        diff = mu1 - mu2

        # The formula needs the trace of the *matrix* square root of S1 @ S2,
        # which equals the sum of the square roots of its eigenvalues. An
        # element-wise sqrt of the product gives a different (often NaN) value.
        # Small negative/imaginary parts are numerical noise and are dropped.
        eigenvalues = torch.linalg.eigvals(sigma1 @ sigma2)
        trace_covmean = eigenvalues.real.clamp(min=0).sqrt().sum()

        fid = diff @ diff + torch.trace(sigma1) + torch.trace(sigma2) - 2 * trace_covmean
        return fid.item()

    @staticmethod
    def calculate_inception_score(fake_images, classifier, num_splits=10):
        """Return ``exp(mean_x KL(p(y|x) || p(y)))`` for the given images.

        Args:
            fake_images: Batch passed to ``classifier``.
            classifier: Callable returning class logits of shape ``(n, classes)``.
            num_splits: Accepted for interface compatibility; the score is
                computed over the whole batch as a single split.
        """
        logits = classifier(fake_images)

        # log_softmax keeps log p(y|x) finite even when p(y|x) underflows to
        # 0, so the 0 * log(0) terms evaluate to 0 instead of NaN.
        log_p_yx = F.log_softmax(logits, dim=1)
        p_yx = log_p_yx.exp()

        p_y = p_yx.mean(dim=0, keepdim=True)
        log_p_y = torch.log(p_y.clamp_min(torch.finfo(p_y.dtype).tiny))

        kl_d = (p_yx * (log_p_yx - log_p_y)).sum(dim=1)

        inception_score = torch.exp(kl_d.mean())
        return inception_score.item()

    @staticmethod
    def diversity_score(fake_images):
        """Return the mean pairwise Euclidean distance between flattened samples.

        Returns ``0.0`` when fewer than two samples are given, since there
        are no pairs to compare.
        """
        num_samples = fake_images.size(0)
        if num_samples < 2:
            return 0.0

        features = fake_images.reshape(num_samples, -1)
        distances = torch.cdist(features, features)

        # Exclude the zero self-distances on the diagonal; the mask is created
        # on the same device as the data.
        off_diagonal = ~torch.eye(num_samples, dtype=torch.bool, device=features.device)
        avg_distance = distances[off_diagonal].mean()

        return avg_distance.item()

应用场景分析

应用领域	推荐模型	理由	实施要点
艺术创作	扩散模型	生成质量高，多样性好	需要大量计算资源，适合云端部署
数据增强	GAN	生成速度快，实时性好	注意模式崩溃问题，需要稳定性措施
图像编辑	条件GAN	可控性强，编辑精确	需要精确的条件输入设计
视频生成	扩散模型+时序扩展	时序一致性要求高	需要3D卷积或时序注意力机制
医学影像	条件扩散模型	需要高精度和可控性	数据隐私和安全考虑

部署与优化

模型压缩与加速

def optimize_generation(model, example_input):
    """Return a quantized, traced and inference-optimized copy of ``model``.

    Args:
        model: Module to optimize. It is not modified; quantization works on
            a copy.
        example_input: Example input (tensor or tuple of tensors) used for
            tracing.

    Returns:
        The TorchScript module produced by ``torch.jit.optimize_for_inference``.
    """
    # Dynamic int8 quantization of the listed layer types (returns a copy).
    quantized_model = torch.quantization.quantize_dynamic(
        model, {nn.Linear, nn.Conv2d}, dtype=torch.qint8
    )
    # Trace in eval mode so the recorded graph reflects inference behaviour.
    quantized_model.eval()

    # Trace the quantized copy so the quantization is part of the result
    # rather than being computed and then thrown away.
    with torch.no_grad():
        traced_model = torch.jit.trace(quantized_model, example_input)

    # Graph-level optimizations for inference.
    optimized_model = torch.jit.optimize_for_inference(traced_model)

    return optimized_model

class ProgressiveGeneration:
    """Applies a model repeatedly and records every intermediate result.

    Args:
        model: Callable invoked as ``model(current, step)``.
        steps: Number of times the model is applied.
    """

    def __init__(self, model, steps=10):
        self.steps = steps
        self.model = model

    def progressive_generate(self, initial_noise):
        """Return the CPU copies of the state after each of the ``steps`` calls.

        This is a simplified loop: the step index is passed to the model as a
        plain int, and the real update rule depends on the model type.
        """
        snapshots = []
        state = initial_noise
        step = 0

        while step < self.steps:
            with torch.no_grad():
                state = self.model(state, step)
            snapshots.append(state.cpu())
            step += 1

        return snapshots

总结与展望

技术对比总结

技术维度	GAN优势	扩散模型优势	发展建议
生成速度	⭐⭐⭐⭐⭐	⭐⭐	GAN适合实时应用
生成质量	⭐⭐⭐	⭐⭐⭐⭐⭐	扩散模型质量领先
训练稳定性	⭐⭐	⭐⭐⭐⭐	扩散模型更易训练
多样性	⭐⭐⭐	⭐⭐⭐⭐⭐	扩散模型覆盖更好
可控性	⭐⭐⭐⭐	⭐⭐⭐	两者都在改进

2025年发展趋势

多模态融合：文本、图像、音频的联合生成
效率优化：更快的采样算法和轻量级架构
可控生成：精细的内容控制和编辑能力
3D生成：从2D向3D内容的扩展
伦理安全：生成内容的检测和溯源技术

生成式AI正在快速发展，掌握GAN和扩散模型的核心原理和实践技能，将为在AI创意领域的深入探索奠定坚实基础。无论选择哪种技术路径，理解数据特性、掌握模型原理、持续实践优化都是成功的关键。

2048 AI社区

有“AI”的1024 = 2048，欢迎大家加入2048 AI社区

更多推荐

【愚公系列】《腾讯元宝从入门到精通》031-元宝在学术论文写作领域的应用（生成文献综述）

2048 AI社区

CANN × 智慧农业：端侧病虫识别系统设想

2048 AI社区

python代码if __name__ == ‘__main__‘ 的含义及作用

Python模块中的if __name__ == '__main__'机制解析摘要：本文详细剖析了Python中if __name__ == '__main__'的核心原理与应用场景。该机制通过内置变量__name__实现模块的双重功能：当直接执行时__name__为'__main__'，触发主程序逻辑；被导入时则保持模块名，避免执行测试代码。这种设计实现了模块的"脚本-库"双重身