LLM核心原理与LoRA微调实战
LLM现在几乎成了AI的代名词,但它的原理并不神秘。这篇文章从最常用的decoder-only架构开始,把大语言模型的训练流程理一遍,最后用LoRA在LLaMA-2上跑一个中文指令微调的例子——这是普通人也能复现的玩法。
为什么是Decoder-only?
现在主流的LLM,无论是GPT、LLaMA还是ChatGLM,基本都采用了Transformer的decoder-only架构。简单说,就是只保留解码器堆叠,通过自回归方式一个token一个token地生成。相比原始的Transformer,它省去了编码器(以及解码器里与编码器交互的cross-attention),靠masked self-attention来保证因果性:每个位置只能看到自己和之前的token。
一个简化版的decoder-only模型实现起来大概是这样(代码去掉了dropout之类的细节,保留核心逻辑):
import torch
import torch.nn as nn
import torch.nn.functional as F
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Queries, keys and values are all projected from the same input, split
    into ``num_heads`` heads of width ``d_model // num_heads``, attended
    independently, then concatenated and projected back to ``d_model``.

    Args:
        d_model: Feature width of the input and of the output.
        num_heads: Number of attention heads. Must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            # Otherwise the per-head reshape in split_heads would silently
            # drop features or fail later with an opaque shape error.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) into (batch, num_heads, seq, d_k)."""
        x = x.view(batch_size, -1, self.num_heads, self.d_k)
        return x.transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq, d_model).
            mask: Optional tensor broadcastable to (batch, num_heads, seq, seq).
                Entries equal to 0 mark key positions a query may not attend to.

        Returns:
            Tensor of shape (batch, seq, d_model).
        """
        batch_size = x.size(0)
        q = self.split_heads(self.wq(x), batch_size)
        k = self.split_heads(self.wk(x), batch_size)
        v = self.split_heads(self.wv(x), batch_size)

        # Scale by sqrt(d_k) so the logits do not grow with the head width.
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            # Use the most negative finite value of the score dtype: it maps to
            # a softmax weight of zero and stays representable in half precision.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
        attn_weights = F.softmax(scores, dim=-1)

        attn_output = torch.matmul(attn_weights, v)
        # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model).
        attn_output = (
            attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        )
        return self.w_o(attn_output)
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    Args:
        d_model: Feature width of the input and of the output.
        d_ff: Width of the hidden layer between the two linear maps.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand ``x`` to ``d_ff``, apply ReLU, and project back to ``d_model``."""
        return self.linear2(self.relu(self.linear1(x)))
class DecoderLayer(nn.Module):
    """One decoder block: self-attention then feed-forward, each post-normed.

    Both sub-layers are wrapped in a residual connection followed by
    LayerNorm. There is no cross-attention sub-layer.

    Args:
        d_model: Feature width of the input and of the output.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
    """

    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.layernorm1 = nn.LayerNorm(d_model)
        self.layernorm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        Args:
            x: Input tensor; its last dimension is ``d_model``.
            mask: Optional attention mask, passed through to the
                self-attention sub-layer unchanged.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attn_output = self.self_attn(x, mask)
        x = self.layernorm1(x + attn_output)
        ff_output = self.feed_forward(x)
        x = self.layernorm2(x + ff_output)
        return x
class DecoderOnlyLLM(nn.Module):
    """Minimal decoder-only language model.

    Token embeddings plus learned position embeddings are passed through a
    stack of ``DecoderLayer`` blocks under a causal mask, then projected to
    vocabulary logits.

    Args:
        vocab_size: Number of token ids; also the width of the output logits.
        d_model: Embedding and hidden width.
        num_heads: Attention heads per decoder layer.
        num_layers: Number of stacked decoder layers.
        d_ff: Hidden width of each feed-forward sub-layer.
        max_seq_len: Number of rows in the learned position table, i.e. the
            longest sequence ``forward`` accepts.
            NOTE(review): the original literal was lost from the source text;
            1024 is a placeholder default -- confirm the intended value.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff,
                 max_seq_len=1024):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = nn.Embedding(max_seq_len, d_model)
        self.decoder_layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff)
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, vocab_size)

    def generate_look_ahead_mask(self, seq_len):
        """Return a (seq_len, seq_len) lower-triangular matrix of ones.

        Row ``i`` has ones in columns ``0..i`` and zeros after, so position
        ``i`` may attend only to itself and earlier positions.
        """
        mask = torch.tril(torch.ones((seq_len, seq_len)))
        return mask

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: Integer token ids of shape (batch, seq).

        Returns:
            Tensor of shape (batch, seq, vocab_size).

        Raises:
            ValueError: If ``seq`` exceeds the size of the position table.
        """
        batch_size, seq_len = x.size()
        max_len = self.pos_encoding.num_embeddings
        if seq_len > max_len:
            # Fail with a readable message instead of an out-of-range
            # embedding lookup.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_len}"
            )
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(x.device)
        x = self.embedding(x) + self.pos_encoding(positions)
        mask = self.generate_look_ahead_mask(seq_len).to(x.device)
        for layer in self.decoder_layers:
            x = layer(x, mask)
        logits = self.fc(x)
        return logits
# Illustrative hyperparameters for a small demo model.
# NOTE(review): the original numeric values were lost from the source text;
# the numbers below are placeholders -- confirm against the article.
vocab_size = 10000
d_model = 512
num_heads = 8  # must divide d_model
num_layers = 6
d_ff = 2048
model = DecoderOnlyLLM(vocab_size, d_model, num_heads, num_layers, d_ff)
# Printing an nn.Module shows its nested layer structure.
print(model)


