基于 LLAMA 的大模型推理流程解析

基于 LLAMA 的大模型推理流程解析 | 极客日志

class LlamaTokenizer(PreTrainedTokenizer):  
    """  
    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.  
...

input_ids  
tensor([[   0,  376, 1366,  338,  263, 3017,  775, 6160]], device='cuda:0')  
input_ids.shape  
torch.Size([1, 8])

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=31999)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=32000, bias=False)
)

{
    "architectures": [
        "LLaMAForCausalLM"
    ],
    "bos_token_id": 0,
    "eos_token_id": 1,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "intermediate_size": 11008,
    "initializer_range": 0.02,
    "max_sequence_length": 2048,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "pad_token_id": 0,
    "rms_norm_eps": 1e-06,
    "torch_dtype": "float16",
    "transformers_version": "4.27.0.dev0",
    "use_cache": true,
    "vocab_size": 32000
}

class LlamaModel(LlamaPreTrainedModel):
    """Token embedding, a stack of `config.num_hidden_layers` decoder layers, and a final RMS norm."""

    def __init__(self, config: LlamaConfig):
        """Create the sub-modules described by `config`, then call `post_init()`."""
        super().__init__(config)

        self.padding_idx = pad_id = config.pad_token_id
        self.vocab_size = config.vocab_size
        width = config.hidden_size

        # Sub-modules are created in the order embedding -> layers -> norm.
        self.embed_tokens = nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=width,
            padding_idx=pad_id,
        )
        decoder_layers = []
        for _ in range(config.num_hidden_layers):
            decoder_layers.append(LlamaDecoderLayer(config))
        self.layers = nn.ModuleList(decoder_layers)
        self.norm = LlamaRMSNorm(width, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Weight initialisation and final processing are delegated to the base class.
        self.post_init()
    ...

# Excerpt from a method body (`self`, `inputs`, `generation_config` and
# `model_kwargs` come from the enclosing scope). Unpacks the three values
# returned by `_prepare_model_inputs`, which receives the raw inputs, the
# BOS token id and the current keyword arguments.
inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
            inputs, generation_config.bos_token_id, model_kwargs
        )

# Excerpt from a method body. Stores under the "attention_mask" key the value
# returned by `_prepare_attention_mask_for_generation`, which receives the
# input tensor plus the pad and EOS token ids from the generation config.
model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation(
                inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id
            )

# Excerpt from a method body. Rebinds `attention_mask` to the value returned
# by `_prepare_decoder_attention_mask`, which receives the incoming mask, the
# `(batch_size, seq_length)` pair, the input embeddings and
# `past_key_values_length`.
attention_mask = self._prepare_decoder_attention_mask(
            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
        )

# Excerpt from a method body: feed the hidden states through every decoder
# layer in order. The accumulators (`all_hidden_states`, `next_decoder_cache`,
# `all_self_attns`) and the flags are defined by the enclosing scope.
for idx, decoder_layer in enumerate(self.layers):
    if output_hidden_states:
        # Record the hidden states that are about to enter this layer.
        all_hidden_states += (hidden_states,)
    # If past_key_values is provided, pass this layer's entry along as well
    past_key_value = past_key_values[idx] if past_key_values is not None else None
    layer_outputs = decoder_layer(
        hidden_states,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=output_attentions,
        use_cache=use_cache
    )

    # Element 0 of the layer output becomes the input of the next layer.
    hidden_states = layer_outputs[0]
    if use_cache:
        # The cache entry is at index 2 when attention weights are also returned, else at index 1.
        next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
    if output_attentions:
        all_self_attns += (layer_outputs[1],)

# Excerpt from a layer's forward body: two residual sub-blocks (self-attention,
# then MLP), each normalising its input before the sub-module and adding the
# un-normalised input back afterwards.
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
    hidden_states=hidden_states,
    attention_mask=attention_mask,
    position_ids=position_ids,
    past_key_value=past_key_value,
    output_attentions=output_attentions,
    use_cache=use_cache,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
# The result tuple always starts with the hidden states; the attention weights
# and the key/value cache entry are appended only when requested, in that order.
outputs = (hidden_states,)
if output_attentions:
    outputs += (self_attn_weights,)
if use_cache:
    outputs += (present_key_value,)
return outputs

class LlamaAttention(nn.Module):
    """Attention module holding the q/k/v/o projections and a rotary embedding."""

    def __init__(self, config: LlamaConfig):
        """Derive the head geometry from `config` and create the projection layers.

        Raises:
            ValueError: If `hidden_size` is not an exact multiple of the number of heads.
        """
        super().__init__()
        self.config = config

        self.hidden_size = embed_dim = config.hidden_size
        # Number of attention heads (32 in this walkthrough).
        self.num_heads = n_heads = config.num_attention_heads
        # Width of a single head (128 in this walkthrough).
        self.head_dim = head_width = embed_dim // n_heads
        self.max_position_embeddings = config.max_position_embeddings

        if head_width * n_heads != embed_dim:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        # q_proj, k_proj and v_proj map the incoming hidden_states to num_heads * head_dim features,
        # producing the query, key and value tensors respectively; o_proj maps back to hidden_size.
        proj_width = n_heads * head_width
        self.q_proj = nn.Linear(embed_dim, proj_width, bias=False)
        self.k_proj = nn.Linear(embed_dim, proj_width, bias=False)
        self.v_proj = nn.Linear(embed_dim, proj_width, bias=False)
        self.o_proj = nn.Linear(proj_width, embed_dim, bias=False)
        self.rotary_emb = LlamaRotaryEmbedding(
            head_width,
            max_position_embeddings=self.max_position_embeddings,
        )
    ...

def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Compute multi-head self-attention with rotary position embeddings.

    Args:
        hidden_states: Three-dimensional input whose first two axes are `(bsz, q_len)`.
        attention_mask: Optional additive mask of shape `(bsz, 1, q_len, kv_seq_len)`;
            it is added to the raw attention scores.
        position_ids: Forwarded unchanged to `apply_rotary_pos_emb`.
        past_key_value: Optional `(key, value)` pair from earlier steps. Both tensors are
            concatenated in front of the new keys/values along axis 2.
        output_attentions: When false, the returned attention weights are `None`.
        use_cache: When true, the concatenated `(key, value)` pair is returned for reuse.

    Returns:
        `(attn_output, attn_weights, past_key_value)`; `attn_output` has shape
        `(bsz, q_len, hidden_size)`.

    Raises:
        ValueError: If the attention scores, the mask, or the attention output do not
            have the expected shape.
    """
    bsz, q_len, _ = hidden_states.size()

    # Project, then split the feature axis into heads: (bsz, q_len, nh * hd) -> (bsz, nh, q_len, hd)
    query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

    # Total key/value length = new tokens + cached tokens (if any)
    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
    # [bsz, nh, t, hd]
    if past_key_value is not None:
        # reuse k, v, self_attention: cached entries go first along the sequence axis
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)
    past_key_value = (key_states, value_states) if use_cache else None

    # Scaled dot-product scores: (bsz, nh, q_len, kv_seq_len)
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
    if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
        # The reported expectation is the same 4-D shape that the check above compares against.
        raise ValueError(
            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
            f" {attn_weights.size()}"
        )
    if attention_mask is not None:
        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
            )
        attn_weights = attn_weights + attention_mask
        # Clamp from below at the dtype's most negative finite value; the bound is created on
        # the same device as the scores so no cross-device operand is involved.
        attn_weights = torch.max(
            attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
        )
    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
    attn_output = torch.matmul(attn_weights, value_states)

    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )
    # Merge the heads back: (bsz, nh, q_len, hd) -> (bsz, q_len, hidden_size)
    attn_output = attn_output.transpose(1, 2)
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
    attn_output = self.o_proj(attn_output)
    if not output_attentions:
        attn_weights = None
    return attn_output, attn_weights, past_key_value

# Excerpt from a sampling loop: keep only the logits at the last sequence position.
next_token_logits = outputs.logits[:, -1, :]  
# pre-process distribution: the logits processors run first, then the warpers  
next_token_scores = logits_processor(input_ids, next_token_logits)  
next_token_scores = logits_warper(input_ids, next_token_scores)  
...  
# Convert the processed scores to probabilities and draw one sample per row;
# squeeze(1) drops the length-1 sample axis.
probs = nn.functional.softmax(next_token_scores, dim=-1)  
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)  

# logits_processor 中执行的操作  

class MinLengthLogitsProcessor(LogitsProcessor):
    """
    [`LogitsProcessor`] enforcing a min-length by setting EOS probability to 0.

    Args:
        min_length: Sequence length below which the EOS scores are set to `-inf`.
        eos_token_id: A single end-of-sequence token id or a list of them.
    """

    def __init__(self, min_length: int, eos_token_id: Union[int, List[int]]):
        """Validate and store the arguments.

        Raises:
            ValueError: If `min_length` is not a non-negative integer.
        """
        if not isinstance(min_length, int) or min_length < 0:
            raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}")

        # Normalise a single id to a list so `__call__` can always iterate.
        if isinstance(eos_token_id, int):
            eos_token_id = [eos_token_id]
        # Invalid ids are only reported through the logger; construction still succeeds.
        if not all(isinstance(i, int) for i in eos_token_id) or any(i < 0 for i in eos_token_id):
            logger.warning(f"`eos_token_id` has to be a list of positive integers, but is {eos_token_id}")

        self.min_length = min_length
        self.eos_token_id = eos_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return `scores`, with every EOS column set to `-inf` while the sequence is too short.

        The current length is the size of the last axis of `input_ids`. `scores` is modified
        in place and the same tensor is returned.
        """
        cur_len = input_ids.shape[-1]
        if cur_len < self.min_length:
            for i in self.eos_token_id:
                scores[:, i] = -float("inf")
        return scores
...
# logits_warper 调用的三个函数  
class TemperatureLogitsWarper(LogitsWarper):
    """Logits warper that divides the scores by a fixed temperature."""

    def __init__(self, temperature: float):
        """Store `temperature`.

        Raises:
            ValueError: If `temperature` is not a `float` instance or is not greater than zero.
        """
        is_positive_float = isinstance(temperature, float) and temperature > 0
        if not is_positive_float:
            raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}")
        self.temperature = temperature

    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor) -> torch.FloatTensor:
        """Return `scores / temperature`; `input_ids` is not used."""
        return scores / self.temperature

class TopKLogitsWarper(LogitsWarper):
    """
    [`LogitsWarper`] that keeps the `top_k` highest scores and masks everything below them.

    Args:
        top_k: Number of highest-scoring tokens to keep; must be a strictly positive integer.
        filter_value: Value written over every masked score.
        min_tokens_to_keep: Lower bound applied to `top_k`.
    """

    def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
        """Validate and store the arguments.

        Raises:
            ValueError: If `top_k` is not a strictly positive integer.
        """
        if not isinstance(top_k, int) or top_k <= 0:
            raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")

        # Read the validated argument: the `top_k` attribute does not exist on the instance yet.
        self.top_k = max(top_k, min_tokens_to_keep)
        self.filter_value = filter_value

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return a copy of `scores` with everything below the k-th largest score masked.

        Scores that tie with the k-th largest value are kept. `input_ids` is not used.
        """
        top_k = min(self.top_k, scores.size(-1))  # Safety check
        # Remove all tokens with a score lower than the last (smallest) of the top-k scores
        indices_to_remove = scores < torch.topk(scores, top_k)[0][..., -1, None]
        scores = scores.masked_fill(indices_to_remove, self.filter_value)
        return scores

class TopPLogitsWarper(LogitsWarper):
    """Logits warper that masks the low-probability tail whose cumulative mass is at most `1 - top_p`."""

    def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
        """Convert `top_p` to float, validate it, and store the arguments.

        Raises:
            ValueError: If `top_p` is below 0 or above 1.
        """
        top_p = float(top_p)
        if top_p > 1.0 or top_p < 0:
            raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")

        self.top_p = top_p
        self.filter_value = filter_value
        self.min_tokens_to_keep = min_tokens_to_keep

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return a copy of `scores` with the masked tail replaced by `filter_value`.

        `input_ids` is not used.
        """
        # Ascending sort: the least likely tokens come first, so the running sum
        # measures the probability mass of the tail seen so far.
        ascending_logits, ascending_order = torch.sort(scores, descending=False)
        tail_mass = ascending_logits.softmax(dim=-1).cumsum(dim=-1)

        # A position is dropped while the accumulated tail mass stays within 1 - top_p.
        drop_sorted = tail_mass <= (1 - self.top_p)
        if self.min_tokens_to_keep > 1:
            # The last `min_tokens_to_keep` sorted positions (the highest scores) are never dropped.
            drop_sorted[..., -self.min_tokens_to_keep :] = 0

        # Map the mask from sorted order back to the original token positions.
        drop_mask = drop_sorted.scatter(1, ascending_order, drop_sorted)
        return scores.masked_fill(drop_mask, self.filter_value)

将 logits 传递给 logits_processor 和 logits_warper，在这两个方法中进行一些预处理，例如添加惩罚项或对概率分布进行修改，使得生成的结果更符合期望。其中 logits_warper 具体调用了上面列出的三个类：TemperatureLogitsWarper、TopKLogitsWarper 和 TopPLogitsWarper。TemperatureLogitsWarper 用到了 temperature 参数，其作用是调节模型生成的随机性：temperature 通常被用于控制 softmax 函数的形状，从而影响生成序列的多样性。当 temperature 值接近 0 时，模型趋向于输出最可能的单个结果，也就是模型的输出趋向于确定性；这种情况下，所有的概率质量都集中在概率最大的那个输出上，其他输出的概率几乎为 0。当 temperature 值比较大（大于 1）时，模型趋向于输出更多样化的结果，也就是增加了模型输出的随机性；在这种情况下，不同输出之间的概率差异减小，使得即使概率较小的输出也有可能被选中。
TopKLogitsWarper 类是一个用于处理模型输出分数（scores）的工具，主要用于进行所谓的'Top-K 截断'。在自然语言生成的过程中，Top-K 截断是一种常见的技巧，它的目标是在每个生成步骤中，只保留 K 个最可能的输出选项，而忽略其他的选项。这种方法可以降低生成过程的复杂性，并且可以减少不太可能的输出的干扰。
TopPLogitsWarper 类实现了被称为'Top-p（或 nucleus）抽样'的策略。该策略用于限制模型在每个生成步骤中所考虑的可能输出的范围。在 Top-p 抽样中，我们不再固定考虑概率最高的 K 个输出，而是根据概率分布的累积分布函数（CDF）来选择可能的输出。我们设置一个阈值 P，然后选择输出，直到它们的累积概率大于等于 P。由于这个方法根据概率分布动态地调整输出的数量，所以它可以更好地处理不同的分布情况，从而在某些情况下可以生成更自然的文本。
最后，使用 softmax 函数将经过预处理的 logits 转换为概率分布，并利用 multinomial 方法从中采样得到下一个 token。随后将该 token 添加到原始输入序列中，并进行下一次迭代，生成新的文本内容。如果需要记录中间变量，则将它们存储在相应的变量中，以便之后访问。得到下一个 token_id 之后，根据这个新 token 更新 input_ids：

# Excerpt from a generation loop body: append the sampled token ids as a new
# last column of `input_ids`.
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)  
if streamer is not None:  
    # Hand the new tokens to the streamer after moving them to the CPU.
    streamer.put(next_tokens.cpu())  
model_kwargs = self._update_model_kwargs_for_generation(  
    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder  
)  

# if eos_token was found in one sentence, set sentence to finished  
if eos_token_id_tensor is not None:  
    # The product over the EOS axis is 1 only where the new token differs from
    # every EOS id; multiplying it in zeroes the flag of rows that just hit EOS.
    unfinished_sequences = unfinished_sequences.mul(  
        next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)  
    )  

# stop when each sentence is finished, or if we exceed the maximum length  
if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):  
    if not synced_gpus:  
        break  
    else:  
        # With synced_gpus the loop is not left here; only the flag is set.
        this_peer_finished = True

# Decode the generated ids back to text (special tokens skipped, tokenization
# spaces left untouched) and print the first decoded sequence.
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])

# Prompts of different lengths, encoded together as one padded batch.
input_sentences = [
    "DeepSpeed is a machine learning framework",
    "He is working on",
    "He has a",
    "He got all",
    "Everyone is happy and I can",
    "The new movie that got Oscar this year",
    "In the far far distance from our galaxy,",
    "Peace is the only way",
]
# Pad with token id 0. The id goes through `pad_token_id`; `pad_token` expects
# the token's string form, not an integer id.
tokenizer.pad_token_id = 0
# padding=True pads every prompt to the longest one so the batch is rectangular.
input_tokens = tokenizer.batch_encode_plus(input_sentences, return_tensors="pt", padding=True)

基于 LLAMA 的大模型推理流程解析

什么是 LLM

llama 相关的知识点

分词器、token、embedding

更多推荐文章

相关免费在线工具

自注意力 Self-Attention

位置编码

多头注意力 (Multi-head Attention)

批标准化 (Batch Norm) & 层标准化 (Layer Norm)

残差网络 (ResNet)

LLAMA 的模型结构

运行 pipeline

第一步分词

第二步配置

第三步 sample

LlamaDecoderLayer

LlamaAttention

落地相关

padding

bad_words 和 stop_words

kv-cache

总结

一些概念

unconditional generation

context len

更多推荐文章

相关免费在线工具

基于 LLAMA 的大模型推理流程解析

什么是 LLM

llama 相关的知识点

分词器、token、embedding

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

自注意力 Self-Attention

位置编码

多头注意力 (Multi-head Attention)

批标准化 (Batch Norm) & 层标准化 (Layer Norm)

残差网络 (ResNet)

LLAMA 的模型结构

运行 pipeline

第一步 分词

第二步 配置

第三步 sample

LlamaDecoderLayer

LlamaAttention

落地相关

padding

bad_words 和 stop_words

kv-cache

总结

一些概念

unconditional generation

context len

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

第一步分词

第二步配置