探索 Ovis 多模态大模型量化实战指南

探索 Ovis 多模态大模型量化实战指南 | 极客日志

from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging

# Checkpoint to quantize, and the directory that receives the 4-bit result.
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit"

# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Calibration set: a single tokenized sentence.
examples = [
    tokenizer(
        "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
    ),
]

# Quantization settings:
#   bits=4          -> quantize the model to 4-bit
#   group_size=128  -> 128 is the recommended value
#   desc_act=False  -> can significantly speed up inference, at the cost of
#                      slightly worse perplexity
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

# Load the model that is going to be quantized.
model = AutoGPTQForCausalLM.from_pretrained(
    pretrained_model_dir,
    quantize_config,
)

# Quantize using the calibration examples, then write the result to disk.
model.quantize(examples)
model.save_quantized(quantized_model_dir, use_safetensors=True)

# Reload the quantized model for inference.
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device="cuda:0")

# Prompt used for the inference demo.
sample = "auto_gptq is"

# Tokenize the prompt, move it to the model's device, generate, and print
# the decoded first sequence.
encoded_prompt = tokenizer(sample, return_tensors="pt").to(model.device)
generated_ids = model.generate(**encoded_prompt)
print(tokenizer.decode(generated_ids[0]))

# Excerpt from a method body: `self`, `move_to_device` and `cur_layer_device`
# are defined outside the lines shown here.
# Run every calibration example through the model once.
for example in examples:
    # Normalize each entry of the example in place before the forward pass.
    for k, v in example.items():
        if len(v.shape) == 1:
            # A 1-D value gets a new leading dimension
            # (presumably the batch axis — TODO confirm).
            v = v.unsqueeze(0)
        # Store the (possibly reshaped) value back after passing it through
        # move_to_device together with cur_layer_device.
        example[k] = move_to_device(v, cur_layer_device)
    try:
        self.model(**example)
    except ValueError:
        # NOTE(review): a ValueError from the forward call is ignored on purpose
        # here; presumably it is raised elsewhere to stop the pass early —
        # confirm against the surrounding method, which is not visible.
        pass

def generate(self, **kwargs):
    """Forward all keyword arguments to ``self.model.generate``.

    The call runs with ``torch.inference_mode()`` active and under
    ``torch.amp.autocast`` configured for the device type of ``self.device``.

    Returns:
        Whatever ``self.model.generate(**kwargs)`` returns.
    """
    with torch.inference_mode():
        with torch.amp.autocast(device_type=self.device.type):
            generated = self.model.generate(**kwargs)
    return generated

from auto_gptq.modeling import BaseGPTQForCausalLM

class OPTGPTQForCausalLM(BaseGPTQForCausalLM):
    """Describe the module layout of OPT models for GPTQ quantization.

    The class only declares attribute paths; all behavior comes from
    ``BaseGPTQForCausalLM``.
    """

    # Chained attribute name of the transformer layer block.
    layers_block_name = "model.decoder.layers"

    # Chained attribute names of the other nn modules that live at the same
    # level as the transformer layer block.
    outside_layer_modules = [
        "model.decoder.embed_tokens",
        "model.decoder.embed_positions",
        "model.decoder.project_out",
        "model.decoder.project_in",
        "model.decoder.final_layer_norm",
    ]

    # Chained attribute names of the linear layers inside one transformer
    # layer module. Each sub-list groups modules that can be seen as a single
    # operation, and the sub-lists follow the order in which they are actually
    # executed. Normally there are four of them; here (as in most cases) they
    # are: attention q/k/v projection, attention output projection, MLP input
    # projection, MLP output projection.
    inside_layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.out_proj"],
        ["fc1"],
        ["fc2"],
    ]

# After this, you can use OPTGPTQForCausalLM.from_pretrained and the other methods exactly as shown in the basic usage example above.

from logging import getLogger
from ._base import BaseGPTQForCausalLM

logger = getLogger(__name__)

class Gemma2GPTQForCausalLM(BaseGPTQForCausalLM):
    """Describe the module layout of Gemma2 models for GPTQ quantization.

    The class only declares names and attribute paths; all behavior comes
    from ``BaseGPTQForCausalLM``.
    """

    # Name of the decoder layer type
    # (presumably matched against the layer's class name — TODO confirm).
    layer_type = "Gemma2DecoderLayer"

    # Attribute path of the transformer layer block.
    layers_block_name = "model.layers"

    # Attribute paths of the modules outside the layer block.
    outside_layer_modules = [
        "model.embed_tokens",
        "model.norm",
    ]

    # Attribute paths of the linear layers inside one decoder layer, grouped
    # into four sub-lists: attention k/v/q projections, attention output
    # projection, MLP up/gate projections, MLP down projection.
    inside_layer_modules = [
        [
            "self_attn.k_proj",
            "self_attn.v_proj",
            "self_attn.q_proj",
        ],
        [
            "self_attn.o_proj",
        ],
        [
            "mlp.up_proj",
            "mlp.gate_proj",
        ],
        [
            "mlp.down_proj",
        ],
    ]

__all__ = ["Gemma2GPTQForCausalLM"]

from auto_gptq.modeling import OvisGemma2GPTQForCausalLM

# Outline only: every `...` below is a placeholder for arguments or code
# that this snippet leaves out.

# Prepare the calibration data samples
examples = ...

# Set the quantization parameters
...

# Load the model to be quantized
model = OvisGemma2GPTQForCausalLM.from_pretrained(...)

# Start quantization
model.quantize(examples=examples, ...)

# Save
model.save_quantized(...)

# BE SURE TO RUN UNDER CUDA 12.1
# 1. Get a basic environment
# Create and activate a Python 3.10 conda environment; replace <your_env_name>
# with a name of your choice.
conda create -n <your_env_name> python=3.10
conda activate <your_env_name>
# Install the pinned PyTorch packages from the cu121 wheel index, then the
# remaining dependencies.
pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
pip install numpy==1.24.3 transformers==4.44.2 pillow==10.3.0 gekko pandas

# 2. Build AutoGPTQ: We customized AutoGPTQ to support Ovis model quantization. You need to build from source to install the customized version.
git clone https://github.com/AIDC-AI/AutoGPTQ.git
cd AutoGPTQ
# Verbose, editable install from the cloned source tree with pip's build
# isolation turned off.
pip install -vvv --no-build-isolation -e .

from transformers import GenerationConfig
from auto_gptq.modeling import OvisGemma2GPTQForCausalLM

# Load the quantized checkpoint onto the chosen device.
load_device = "cuda:0" # customize load device
model = OvisGemma2GPTQForCausalLM.from_quantized(
    "AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4",
    device=load_device,
    trust_remote_code=True
)
# Attach the generation config loaded from the same checkpoint name to the
# wrapped model.
model.model.generation_config = GenerationConfig.from_pretrained("AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4")
# Fetch the text tokenizer and the visual tokenizer from the loaded model.
text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()

# THE REST IS THE SAME AS UNQUANTIZED VERSIONS OF OVIS
# (`...` below is a placeholder for that omitted code.)
...

from auto_gptq.modeling import OvisGemma2GPTQForCausalLM

# NOTE: this snippet uses `model_path`, `quantize_config` and `torch` without
# defining or importing them; they must be provided before running it.

# Load Model
model = OvisGemma2GPTQForCausalLM.from_pretrained(
    model_path,
    quantize_config,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True
).cuda()
# Turn off `use_cache` on the config of the wrapped LLM.
model.model.llm.model.config.use_cache = False

# Prepare your own calibration samples here and format them as follows
# (each sample pairs an image path with a human/gpt conversation; the trailing
# `...` stands for further samples of the same form).
data_list = [
    {
        "image": "path/to/image/of/this/sample",
        "conversations": [
            {
                "from": "human",
                "value": "<image>\n[Your sample prompt]"
            },
            {
                "from": "gpt",
                "value": "[Anything]"
            }
        ]
    },
    ...
]

# See the Hugging Face Model Cards for details on dataloader formation
# (`...` is a placeholder for the dataloader you build).
train_loader = ...

# Start quantizing
# NOTE(review): cache_examples_on_gpu=False presumably keeps the calibration
# examples off the GPU — confirm against the AutoGPTQ documentation.
model.quantize(examples=train_loader, cache_examples_on_gpu=False)

# Save quantized model
# (`...` is a placeholder for the output directory.)
quantize_save_path = ...
model.save_quantized(quantize_save_path, use_safetensors=True)

探索 Ovis 多模态大模型量化实战指南

探索 Ovis 多模态大模型量化实战指南

量化方案的选取

更多推荐文章

相关免费在线工具

定制 AutoGPTQ 库

代码分析

修改代码：新增 ovis.py

校准数据的变量格式

完整的量化方案

总结

更多推荐文章

相关免费在线工具

探索 Ovis 多模态大模型量化实战指南

探索 Ovis 多模态大模型量化实战指南

量化方案的选取

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

定制 AutoGPTQ 库

代码分析

修改代码：新增 ovis.py

校准数据的变量格式

完整的量化方案

总结

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具