Introduction
The best way to understand the core architecture of large language models is to implement one yourself. This article walks you through building a simplified LLaMA-style Mixture-of-Experts (MoE) model from scratch using Python and PyTorch. We will dig into the practical implementation of key components such as the MoE layer, RoPE positional encoding, and RMSNorm normalization.
Architecture Overview
The core idea of MoE is similar to assembling a team of specialists: instead of having one massive network handle every task, several smaller "expert" networks each handle what they are good at, while a router decides which experts process each input.
Take the sentence "The cat sat" as an example:
- Tokenization: split the text into tokens.
- Routing: the router analyzes each token's features and decides which experts to invoke (a noun, for instance, might be handled by an expert that is strong at semantics).
- Weighted combination: the selected experts produce their outputs, which are summed using the routing weights to form the final output.
This design preserves model capacity while significantly reducing the compute cost at inference time. Next, we break down the implementation step by step.
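The three routing steps above can be sketched in a few lines of PyTorch. This is a minimal illustration with hypothetical sizes (`d_model=8`, `num_experts=4`, `top_k=2`) and plain linear layers as "experts", not the article's final MoE layer:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleMoE(nn.Module):
    """Toy top-k MoE layer: route each token to k experts, blend by weight."""
    def __init__(self, d_model=8, num_experts=4, top_k=2):
        super().__init__()
        self.top_k = top_k
        self.router = nn.Linear(d_model, num_experts)  # routing logits per token
        self.experts = nn.ModuleList(
            nn.Linear(d_model, d_model) for _ in range(num_experts)
        )

    def forward(self, x):                               # x: (num_tokens, d_model)
        logits = self.router(x)                         # (num_tokens, num_experts)
        weights, idx = torch.topk(logits, self.top_k, dim=-1)
        weights = F.softmax(weights, dim=-1)            # renormalize over chosen experts
        out = torch.zeros_like(x)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, k] == e                   # tokens sent to expert e in slot k
                if mask.any():
                    out[mask] += weights[mask, k:k + 1] * expert(x[mask])
        return out

x = torch.randn(5, 8)   # 5 tokens, each an 8-dim vector
moe = SimpleMoE()
y = moe(x)
print(y.shape)          # torch.Size([5, 8])
```

Note that each token activates only `top_k` of the experts, which is exactly where the inference savings come from.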
Environment Setup and Data Loading
First, import the required libraries and configure the device. For ease of demonstration we use character-level tokenization, with a corpus taken from a passage of Alice's Adventures in Wonderland.
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import os
# Device configuration
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
# Define the training corpus
corpus_raw = """ Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?' So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her. """
Tokenization and Encoding
Computers cannot work with raw characters directly, so we need a mapping from characters to integers. Here we adopt a simple character-level tokenization strategy.
# Collect all unique characters
chars = sorted(list(set(corpus_raw)))
vocab_size = len(chars)
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}
print(f"Vocabulary size: {vocab_size}")
encoded_corpus = [char_to_int[ch] for ch in corpus_raw]
full_data_sequence = torch.tensor(encoded_corpus, dtype=torch.long, device=device)
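Before moving on, it is worth verifying that the encoding is lossless: decoding the integers should recover the original text exactly. A small self-contained sanity check (restating the mappings above on a shortened corpus):

```python
# Rebuild the character-level mappings on a short sample corpus
corpus_raw = "Alice was beginning to get very tired"
chars = sorted(set(corpus_raw))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}

# Round trip: text -> integers -> text
sample = "Alice"
encoded = [char_to_int[ch] for ch in sample]
decoded = ''.join(int_to_char[i] for i in encoded)
print(encoded)
print(decoded)   # prints "Alice"
```

Character-level tokenization keeps the vocabulary tiny (a few dozen symbols), which is ideal for a demo, though real LLMs use subword tokenizers such as BPE for efficiency.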


