MLA(Multi-head Latent Attention,多头潜在注意力):利用低秩压缩技术,将 Key/Value 联合压缩为低维潜在向量,大幅减小推理时的 KV 缓存
MOE 混合专家机制,ff 前馈连接层整体分为 N 个专家主导,利用 linear 层加权求和的方式算出专家的得分 logits,利用 softmax 算出专家的权重比例,topk 选择排名靠前的专家进行激活计算,抓重点,专业的事交给专业的部分搞定,大幅节省算力 (激活率 5.5%)
FP8:采用 8 位浮点(FP8)精度,相比 16 位(FP16/BF16)存储占用压缩约 50%
RLAIF(Reinforcement Learning from AI Feedback,基于 AI 反馈的强化学习):用 AI 代替人工做偏好标注,节省人工标注成本,让模型更懂人性,听人话
AI 标注>>有监督微调>>排序实现奖励模型>>PPO 强化学习>>反向促进模型成长
Deepseek Sparse Attention 稀疏注意力机制,抓重点部分做计算,节省半数算力
DSA 是通过 linear 层不断更新参数,不断学习如何筛选更优质的注意力部分
import torch
import torch.nn as nn
import torch.nn.functional as F
class Expert(nn.Module):
    """Two-layer feed-forward expert: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dim: Width of the hidden layer.
        output_dim: Size of the last dimension of the output tensor.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # The attribute names end up in the state_dict keys, so they are
        # kept as ``fc1`` / ``fc2``.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``; only the last dimension of ``x`` changes."""
        return self.fc2(F.relu(self.fc1(x)))
class RoutingGate(nn.Module):
    """Top-k softmax router over a pool of routed experts.

    A linear layer scores every expert for each input vector. The ``k``
    highest-scoring experts are kept, their scores are softmax-normalised
    among themselves, and every other expert receives weight 0.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        num_routed_experts: Number of experts to score.
        k: Number of experts kept per input vector.

    Raises:
        ValueError: If ``k`` is negative or larger than ``num_routed_experts``.
    """

    def __init__(self, input_dim: int, num_routed_experts: int, k: int = 2) -> None:
        super().__init__()
        # Fail fast with a clear message: an out-of-range k would otherwise
        # only surface as an error from torch.topk on the first forward pass.
        if not 0 <= k <= num_routed_experts:
            raise ValueError(
                f"k must be between 0 and num_routed_experts "
                f"({num_routed_experts}), got {k}"
            )
        self.fc = nn.Linear(input_dim, num_routed_experts)
        self.k = k

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-expert mixing weights with the same shape as the gate logits.

        Along the last dimension exactly ``k`` entries are non-zero softmax
        weights; the remaining entries are 0.
        """
        logits = self.fc(x)
        topk_val, topk_idx = torch.topk(logits, self.k, dim=-1)
        # Softmax runs over the selected scores only, so the k kept weights
        # sum to 1 regardless of the scores that were dropped.
        weights = F.softmax(topk_val, dim=-1)
        # NOTE: debug output, kept so the module's stdout stays unchanged.
        print("weights:", weights)
        # Write the k weights back to their expert positions; everything
        # else stays at zero.
        routed_weights = torch.zeros_like(logits)
        routed_weights.scatter_(-1, topk_idx, weights)
        print("Routed weights:", routed_weights)
        return routed_weights
class MoEWithRouting(nn.Module):
    """Mixture-of-experts layer: top-k routed experts plus always-on shared experts.

    The output is the gate-weighted sum of the routed experts' outputs plus
    a softmax-weighted sum of the shared experts' outputs.

    Every routed expert is evaluated for every input; experts that the gate
    did not select are simply multiplied by a zero weight, so this
    implementation does not skip any computation.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dim: Hidden width of each expert.
        output_dim: Size of the last dimension of the output tensor.
        num_routed_experts: Number of experts scored by the routing gate.
        num_shared_experts: Number of experts applied to every input.
        k: Number of routed experts kept per input vector.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
        num_routed_experts: int,
        num_shared_experts: int,
        k: int = 2,
    ) -> None:
        super().__init__()
        self.routed_experts = nn.ModuleList(
            [Expert(input_dim, hidden_dim, output_dim) for _ in range(num_routed_experts)]
        )
        self.shared_experts = nn.ModuleList(
            [Expert(input_dim, hidden_dim, output_dim) for _ in range(num_shared_experts)]
        )
        self.routing_gate = RoutingGate(input_dim, num_routed_experts, k)
        # Learnable mixing logits for the shared experts. forward() passes
        # them through softmax, so the uniform initial value yields equal
        # weights at the start.
        self.shared_weights = nn.Parameter(
            torch.ones(num_shared_experts) / num_shared_experts, requires_grad=True
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the layer to ``x`` of shape ``(..., input_dim)``.

        Returns a tensor of shape ``(..., output_dim)``. All axes are indexed
        from the end, so any number of leading dimensions is accepted.
        """
        routed_weights = self.routing_gate(x)  # (..., E)
        # NOTE: debug output, kept so the module's stdout stays unchanged.
        print("routed_weights:", routed_weights)
        # The expert axis goes last: (..., output_dim, E).
        routed_outputs = torch.stack(
            [expert(x) for expert in self.routed_experts], dim=-1
        )
        print("routed_outputs:", routed_outputs)
        # (..., 1, E) broadcasts over output_dim; summing removes the expert axis.
        routed_result = torch.sum(routed_weights.unsqueeze(-2) * routed_outputs, dim=-1)
        print("routed_result:", routed_result)

        # torch.stack rejects an empty list, so with no shared experts the
        # routed mixture is the whole output.
        if len(self.shared_experts) == 0:
            return routed_result

        shared_outputs = torch.stack(
            [expert(x) for expert in self.shared_experts], dim=-1
        )  # (..., output_dim, S)
        shared_weights = F.softmax(self.shared_weights, dim=0)  # (S,)
        # (S,) broadcasts against the trailing expert axis.
        shared_result = torch.sum(shared_weights * shared_outputs, dim=-1)
        return routed_result + shared_result
# Demo: build a small MoE layer and push one random input through it.
input_dim, hidden_dim, output_dim = 10, 20, 5
num_routed_experts, num_shared_experts = 4, 2
k = 2
seq_len = 8

# The model is built before the input is sampled, so the random number
# generator is consumed in the same order as before.
model = MoEWithRouting(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    num_routed_experts=num_routed_experts,
    num_shared_experts=num_shared_experts,
    k=k,
)
x = torch.randn(seq_len, input_dim)  # (seq_len, input_dim)
output = model(x)
print("Output shape:", output.shape, output)
"""
═══════════════════════════════════════════════════════════════
输入数据 x 形状:[8, 10] (batch=8, dim=10)
═══════════════════════════════════════════════════════════════
                              │
          ┌───────────────────┼───────────────────┐
          │                   │                   │
          ▼                   ▼                   ▼
     ┌────────┐          ┌────────┐          ┌────────┐
     │门控网络│          │路由专家│          │共享专家│
     │        │          │ (4 个) │          │ (2 个) │
     └────────┘          └────────┘          └────────┘
          │                   │                   │
          ▼                   ▼                   │
     ┌────────┐   ┌────────────────────────────┐  │
     │ [8,4]  │   │ 每个专家内部:              │  │
     │ (权重) │   │ ┌────────────────────────┐ │  │
     └────────┘   │ │ 输入:[8,10]            │ │  │
          │       │ │   ↓                    │ │  │
          │       │ │ fc1: Linear(10→20)     │ │  │
          │       │ │   ↓                    │ │  │
          │       │ │ ReLU                   │ │  │
          │       │ │   ↓                    │ │  │
          │       │ │ fc2: Linear(20→5)      │ │  │
          │       │ │   ↓                    │ │  │
          │       │ │ 输出:[8,5]             │ │  │
          │       │ └────────────────────────┘ │  │
          │       │                            │  │
          │       │ 4 个专家各自输出:          │  │
          │       │   E0: [8,5]                │  │
          │       │   E1: [8,5]                │  │
          │       │   E2: [8,5]                │  │
          │       │   E3: [8,5]                │  │
          │       │     ↓                      │  │
          │       │   stack(dim=2)             │  │
          │       │     ↓                      │  │
          │       │   [8,5,4]                  │  │
          │       └────────────────────────────┘  │
          │                   │                   │
          ▼                   ▼                   │
   ┌──────────────────────────────────────────┐   │
   │ 路由加权求和:[8,1,4] × [8,5,4] = [8,5,4] │   │
   │ sum(dim=2) → [8,5]                       │   │
   └──────────────────────────────────────────┘   │
                         │                        │
                         ▼                        │
               routed_result: [8,5]               │
                         │                        │
                         │                        ▼
                         │         ┌────────────────────────────┐
                         │         │ 共享专家 (2 个):           │
                         │         │ 每个专家内部:              │
                         │         │   输入:[8,10]              │
                         │         │     ↓                      │
                         │         │   fc1: Linear(10→20)       │
                         │         │     ↓                      │
                         │         │   ReLU                     │
                         │         │     ↓                      │
                         │         │   fc2: Linear(20→5)        │
                         │         │     ↓                      │
                         │         │   输出:[8,5]               │
                         │         │                            │
                         │         │   S0: [8,5]                │
                         │         │   S1: [8,5]                │
                         │         │     ↓                      │
                         │         │   stack(dim=2)             │
                         │         │     ↓                      │
                         │         │   [8,5,2]                  │
                         │         └────────────────────────────┘
                         │                        │
                         │                        ▼
                         │               shared_weights: [2]
                         │                        │
                         │                        ▼
                         │         ┌─────────────────────────────┐
                         │         │ 共享加权求和:               │
                         │         │ [1,1,2] × [8,5,2] = [8,5,2] │
                         │         │ sum(dim=2) → [8,5]          │
                         │         └─────────────────────────────┘
                         │                        │
                         │                        ▼
                         │              shared_result: [8,5]
                         │                        │
                         └───────────┬────────────┘
                                     ▼
                       ┌────────────────────────────┐
                       │ 最终输出 = routed + shared │
                       │ [8,5] + [8,5] = [8,5]      │
                       └────────────────────────────┘
                                     │
                                     ▼
                       ════════════════════════════
                       最终输出:[8, 5]
                       ════════════════════════════
"""