10.7 BNN

10.7.1 原理

在传统神经网络中，模型参数（权重\(w\)）被视为固定值，通过最小化损失函数（如负对数似然，等价于最大化似然）获得最优点估计：

\[ \hat{w} = \arg\max_w p(D|w) \]

其中\(D=\{(x_1,y_1),(x_2,y_2),…,(x_N,y_N)\}\)表示数据集。

然而，在现实问题中，数据噪声与异质性会导致模型存在显著不确定性。贝叶斯神经网络（Bayesian Neural Network,BNN）的核心思想是将模型参数\(w\)看作随机变量，并通过贝叶斯推断来量化模型不确定性。

BNN 的关键思想源于贝叶斯定理：

\[ p(w|D) = \frac{p(D|w)p(w)}{p(D)} \]

其中：

\(p(w)\)：参数的先验分布；
\(p(D|w)\)：数据在给定参数下的似然函数；
\(p(w|D)\)：参数的后验分布；
\(p(D)\)：边际似然（证据）。

因此，BNN不再求单点参数\(\hat{w}\)，而是学习整个参数分布\(p(w|D)\)。

给定新输入\(x^*\)，预测输出\(y^*\)的分布为：

\[ p(y^*|x^*, D) = \int p(y^*|x^*, w) \, p(w|D) \, dw \]

由于该积分难以解析计算，通常采用近似推断方法求解，如：

变分推断
马尔科夫链蒙特卡洛
Monte Carlo Dropout
深度集成（Deep Ensembles）

在这里仅介绍变分推断和MC Dropout方法。

变分推断法就是用一个可学习分布\(q(w|\theta)\)来近似后验分布\(p(w|D)\)，通过最小化两者的KL散度实现优化。经过一系列推导可知，最小化KL散度等价于最大化ELBO：

（详细推导可参见“什么是变分推断”一文。）

\[ ELBO = \mathbb{E}_{q(w|\theta)}[\log p(D|w)] - KL(q(w|\theta)\;||\;p(w)) \]

若令先验分布\(p(w)\)为标准正态分布，近似后验分布为正态分布，则：

\[ KL(\mathcal{N}(\mu,\sigma^2)\;||\;\mathcal{N}(0,1)) = \frac{1}{2}(\sigma^2 + \mu^2 - 1 - \log\sigma^2) \]

训练时可用重参数化技巧\(w = \mu + \sigma \epsilon,\ \epsilon \sim \mathcal{N}(0,1)\)来对\(w\)进行MC抽样，从而使ELBO中的期望项可以对\(\mu\)、\(\sigma\)求梯度。

MC Dropout是一种近似贝叶斯推断方法：

在训练和预测阶段都启用 Dropout；
每次前向传播都会随机丢弃部分神经元；
多次采样预测结果，计算均值与方差。

\[ \mathbb{E}[y|x, D] \approx \frac{1}{M} \sum_{i=1}^{M} f(x; \hat{w}_i) \]

每个\(\hat{w}_i\)对应一次随机Dropout后的网络参数。

10.7.2 示例

变分推断法：

# ===============================================
# 贝叶斯神经网络 (Bayesian Neural Network) 示例
# 使用变分近似 + 多次采样预测不确定性
# ===============================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt


# ------------------------------------------------
# 1. Generate synthetic data: y = sin(x) + noise
# ------------------------------------------------
torch.manual_seed(42)  # seed the global RNG so the noise draw below is repeatable

N = 100
x = torch.linspace(-3, 3, N).unsqueeze(1)  # N evenly spaced inputs on [-3, 3], shaped (N, 1)
y_true = torch.sin(x)  # noise-free targets
y = y_true + 0.2 * torch.randn_like(y_true)  # add standard-normal noise scaled by 0.2

# Plot the noisy samples together with the underlying sine curve.
plt.figure(figsize=(7, 4))
plt.scatter(x, y, label="Noisy observations", s=15)
plt.plot(x, y_true, color='orange', label="True function")
plt.legend()
plt.title("Training Data (sin function + noise)")
plt.show()


# ------------------------------------------------
# 2. Bayesian linear layer
# ------------------------------------------------
class BayesianLinear(nn.Module):
    """Linear layer whose weights and biases follow independent Gaussians.

    Each weight and bias entry has a learnable mean (``*_mu``) and a learnable
    log-variance (``*_logvar``).  Means are initialised from a normal with
    mean 0 and std 0.1, log-variances from a normal with mean -3 and std 0.1.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        weight_shape = (out_features, in_features)
        bias_shape = (out_features,)
        # Creation order (w_mu, w_logvar, b_mu, b_logvar) is kept so that a
        # fixed RNG seed produces the same initial values.
        self.w_mu = nn.Parameter(torch.empty(weight_shape).normal_(0, 0.1))
        self.w_logvar = nn.Parameter(torch.empty(weight_shape).normal_(-3, 0.1))
        self.b_mu = nn.Parameter(torch.empty(bias_shape).normal_(0, 0.1))
        self.b_logvar = nn.Parameter(torch.empty(bias_shape).normal_(-3, 0.1))

    def forward(self, x, sample=True):
        """Apply the layer to ``x``.

        With ``sample=True`` one weight/bias draw is taken via the
        reparameterisation ``mu + std * eps``; with ``sample=False`` the
        means are used directly.
        """
        if not sample:
            return F.linear(x, self.w_mu, self.b_mu)

        # std = exp(0.5 * logvar); weight noise is drawn before bias noise.
        weight_std = torch.exp(0.5 * self.w_logvar)
        w = self.w_mu + weight_std * torch.randn_like(self.w_mu)
        bias_std = torch.exp(0.5 * self.b_logvar)
        b = self.b_mu + bias_std * torch.randn_like(self.b_mu)
        return F.linear(x, w, b)

    @staticmethod
    def _kl_to_standard_normal(mu, logvar):
        """Sum of KL(N(mu, exp(logvar)) || N(0, 1)) over all entries."""
        return 0.5 * torch.sum(torch.exp(logvar) + mu**2 - 1.0 - logvar)

    def kl_loss(self):
        """Return the KL divergence of weights plus biases to the N(0, 1) prior."""
        weight_kl = self._kl_to_standard_normal(self.w_mu, self.w_logvar)
        bias_kl = self._kl_to_standard_normal(self.b_mu, self.b_logvar)
        return weight_kl + bias_kl


# ------------------------------------------------
# 3. Bayesian neural network model
# ------------------------------------------------
class BayesianNN(nn.Module):
    """1 -> 20 -> 20 -> 1 network of BayesianLinear layers with ReLU between them."""

    def __init__(self):
        super().__init__()
        self.fc1 = BayesianLinear(1, 20)
        self.fc2 = BayesianLinear(20, 20)
        self.fc3 = BayesianLinear(20, 1)

    def forward(self, x, sample=True):
        """Run the network; ``sample`` is forwarded to every Bayesian layer."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden, sample))
        # The output layer has no activation.
        return self.fc3(hidden, sample)

    def kl_loss(self):
        """Return the summed KL term of the three layers."""
        return sum(layer.kl_loss() for layer in (self.fc1, self.fc2, self.fc3))


# ------------------------------------------------
# 4. Train the model
# ------------------------------------------------
model = BayesianNN()
optimizer = optim.Adam(model.parameters(), lr=0.01)
epochs = 2000

for epoch in range(epochs):
    optimizer.zero_grad()
    # Full-batch forward pass with sample=True, i.e. one stochastic weight draw per step.
    y_pred = model(x, sample=True)
    
    # Data-fit term: squared error summed over all training points
    # (reduction='sum', so this is a sum rather than a mean).
    likelihood = F.mse_loss(y_pred, y, reduction='sum')
    
    # KL divergence term collected from all Bayesian layers
    kl = model.kl_loss()
    
    # Total loss = data-fit term + 1e-3 * KL term (the KL term is down-weighted by 1e-3)
    loss = likelihood + 1e-3 * kl
    loss.backward()
    optimizer.step()

    # Report progress every 200 epochs.
    if (epoch + 1) % 200 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")


# ------------------------------------------------
# 5. Sample predictions repeatedly to estimate uncertainty
# ------------------------------------------------
model.eval()  # NOTE: weight sampling below is governed by the `sample` flag, not by train/eval mode
x_test = torch.linspace(-3, 3, 200).unsqueeze(1)  # 200 test inputs on [-3, 3], shaped (200, 1)
pred_samples = []

with torch.no_grad():
    for _ in range(100):  # 100 stochastic forward passes
        pred = model(x_test, sample=True)
        pred_samples.append(pred)

pred_stack = torch.stack(pred_samples)   # (100, 200, 1)
y_mean = pred_stack.mean(0).squeeze()  # per-input mean across the 100 passes
y_std = pred_stack.std(0).squeeze()  # per-input standard deviation across the 100 passes


# ------------------------------------------------
# 6. Visualize the predictive mean and the +/- 2 std band
# ------------------------------------------------
plt.figure(figsize=(8,5))
plt.plot(x_test, torch.sin(x_test), 'orange', label='True function')
plt.scatter(x, y, color='gray', s=15, label='Training data')
plt.plot(x_test, y_mean, 'b', label='Predicted mean')
# Shade mean +/- 2 * std as the uncertainty band.
plt.fill_between(
    x_test.squeeze().numpy(),
    (y_mean - 2*y_std).numpy(),
    (y_mean + 2*y_std).numpy(),
    color='lightblue', alpha=0.4, label='±2 std (uncertainty)'
)
plt.legend()
plt.title("Bayesian Neural Network Prediction with Uncertainty")
plt.show()

MC Dropout法：

# ===========================================================
# Monte Carlo Dropout 版 贝叶斯神经网络 (Bayesian NN)
# ===========================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt

# -------------------------------------------------
# 1. Generate synthetic data: y = sin(x) + noise
# -------------------------------------------------
torch.manual_seed(42)  # seed the global RNG so the noise draw below is repeatable

N = 100
x = torch.linspace(-3, 3, N).unsqueeze(1)  # N evenly spaced inputs on [-3, 3], shaped (N, 1)
y_true = torch.sin(x)  # noise-free targets
y = y_true + 0.2 * torch.randn_like(y_true)  # add standard-normal noise scaled by 0.2

# Plot the noisy samples together with the underlying sine curve.
plt.figure(figsize=(7,4))
plt.scatter(x, y, label="Noisy observations", s=15)
plt.plot(x, y_true, color='orange', label="True function")
plt.legend()
plt.title("Training Data (sin function + noise)")
plt.show()


# -------------------------------------------------
# 2. Neural network with Dropout
# -------------------------------------------------
class MCDropoutNN(nn.Module):
    """Three-layer MLP with ReLU activations and one shared Dropout module.

    Dropout is applied after each of the two hidden activations; the output
    layer is a plain linear map.
    """

    def __init__(self, input_dim=1, hidden_dim=64, output_dim=1, dropout_p=0.2):
        super().__init__()
        # Layers are built in fc1, fc2, fc3 order so seeded initialisation is unchanged.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, x):
        """Return the network output for ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            # linear -> ReLU -> dropout for each hidden layer
            hidden = self.dropout(torch.relu(layer(hidden)))
        return self.fc3(hidden)


# -------------------------------------------------
# 3. Train the model
# -------------------------------------------------
model = MCDropoutNN(input_dim=1, hidden_dim=64, dropout_p=0.2)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

epochs = 2000
for epoch in range(epochs):
    model.train()  # training mode, so the Dropout layers are active
    optimizer.zero_grad()
    y_pred = model(x)  # full-batch forward pass
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()

    # Report progress every 200 epochs.
    if (epoch + 1) % 200 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")


# -------------------------------------------------
# 4. Test time: keep Dropout enabled and sample predictions repeatedly
# -------------------------------------------------
def mc_dropout_predict(model, x_test, n_samples=100):
    """Draw Monte Carlo Dropout predictions for ``x_test``.

    The model is put into training mode for the duration of the call so its
    Dropout layers stay stochastic, ``n_samples`` forward passes are run
    without gradient tracking, and every module's previous train/eval flag
    is restored afterwards (also when a forward pass raises).

    Args:
        model: ``nn.Module`` to sample from.
        x_test: Input passed unchanged to ``model`` on every pass.
        n_samples: Number of stochastic forward passes.

    Returns:
        The ``n_samples`` outputs stacked along a new leading dimension.
    """
    # Remember each module's mode so the caller's train/eval setup survives.
    previous_modes = [(module, module.training) for module in model.modules()]
    # Key step: Dropout only drops units in training mode.
    # NOTE: train() also switches any other mode-dependent layers for the
    # duration of the call.
    model.train()
    try:
        with torch.no_grad():
            preds = torch.stack([model(x_test) for _ in range(n_samples)])
    finally:
        for module, was_training in previous_modes:
            module.training = was_training
    return preds

x_test = torch.linspace(-3, 3, 200).unsqueeze(1)  # 200 test inputs on [-3, 3], shaped (200, 1)
preds = mc_dropout_predict(model, x_test, n_samples=100)
y_mean = preds.mean(0).squeeze()  # per-input mean across the sampled passes
y_std = preds.std(0).squeeze()  # per-input standard deviation across the sampled passes


# -------------------------------------------------
# 5. Visualize the results
# -------------------------------------------------
plt.figure(figsize=(8,5))
plt.plot(x_test, torch.sin(x_test), 'orange', label='True function')
plt.scatter(x, y, color='gray', s=15, label='Training data')
plt.plot(x_test, y_mean, 'b', label='Predicted mean')
# Shade mean +/- 2 * std as the uncertainty band.
plt.fill_between(
    x_test.squeeze().numpy(),
    (y_mean - 2*y_std).numpy(),
    (y_mean + 2*y_std).numpy(),
    color='lightblue', alpha=0.4, label='±2 std (uncertainty)'
)
plt.legend()
plt.title("MC Dropout Bayesian Neural Network (Uncertainty Estimation)")
plt.show()