from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import logging
# Model to quantize and the directory the quantized weights are saved to
pretrained_model_dir = "facebook/opt-125m"
quantized_model_dir = "opt-125m-4bit"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Prepare the calibration samples (a single tokenized sentence here)
calibration_text = "auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."
examples = [tokenizer(calibration_text)]

# Set the quantization parameters
quantize_config = BaseQuantizeConfig(
    # quantize the model to 4-bit
    bits=4,
    # 128 is the recommended group size
    group_size=128,
    # False can significantly speed up inference, at the cost of slightly worse perplexity
    desc_act=False,
)

# Load the model that is going to be quantized
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

# Run quantization on the calibration samples
model.quantize(examples)

# Save the quantized model
model.save_quantized(quantized_model_dir, use_safetensors=True)
# Excerpt of the calibration pass. It is meant to run inside a method (it uses
# ``self``); ``move_to_device`` and ``cur_layer_device`` are defined outside
# this excerpt.
for example in examples:
    for k, v in example.items():
        # Values with a single dimension get a leading dimension added
        if len(v.shape) == 1:
            v = v.unsqueeze(0)
        # Re-assigning an existing key is safe while iterating over items()
        example[k] = move_to_device(v, cur_layer_device)
    try:
        # Every entry of the sample is passed to the model as a keyword argument
        self.model(**example)
    except ValueError:
        # NOTE(review): the ValueError is swallowed on purpose here; presumably
        # the forward pass is interrupted deliberately once the needed inputs
        # have been captured -- confirm against the surrounding method.
        pass
from auto_gptq.modeling import BaseGPTQForCausalLM
class OPTGPTQForCausalLM(BaseGPTQForCausalLM):
    """Describe the module layout of OPT to ``BaseGPTQForCausalLM``.

    The class only declares attribute paths (as dotted strings); all behavior
    comes from the base class.
    """

    # chained attribute name of the transformer layer block
    layers_block_name = "model.decoder.layers"

    # chained attribute names of the other nn modules that sit at the same
    # level as the transformer layer block
    outside_layer_modules = [
        "model.decoder.embed_tokens",
        "model.decoder.embed_positions",
        "model.decoder.project_out",
        "model.decoder.project_in",
        "model.decoder.final_layer_norm",
    ]

    # chained attribute names of the linear layers inside one transformer layer.
    # Normally there are four sub-lists; the modules within one sub-list can be
    # seen as a single operation, and the sub-lists are ordered the way they are
    # actually executed. Here (as in most cases) they are: attention q_k_v
    # projection, attention output projection, MLP project input, MLP project
    # output.
    inside_layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.out_proj"],
        ["fc1"],
        ["fc2"],
    ]
# After this, you can use OPTGPTQForCausalLM.from_pretrained and other methods as shown in Basic.
我们开源了经以上方式修改后的 AutoGPTQ 库,并在量化版 Ovis 模型的 Hugging Face Model Card 内给出了详细的使用指南,便于社区用户部署 Ovis 量化模型或量化经自己微调后的 Ovis。完整的方案如下。
安装运行环境:
# BE SURE TO RUN UNDER CUDA 12.1
# 1. Get a basic environment
conda create -n <your_env_name> python=3.10
conda activate <your_env_name>
pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
pip install numpy==1.24.3 transformers==4.44.2 pillow==10.3.0 gekko pandas
# 2. Build AutoGPTQ: We customized AutoGPTQ to support Ovis model quantization. You need to build from source to install the customized version.
git clone https://github.com/AIDC-AI/AutoGPTQ.git
cd AutoGPTQ
pip install -vvv --no-build-isolation -e .
调用 Ovis 量化模型执行推理:代码大纲
from transformers import GenerationConfig
from auto_gptq.modeling import OvisGemma2GPTQForCausalLM
# Load the quantized model
# Device the model is loaded onto; customize as needed
load_device = "cuda:0"
# The same repository id is used for the weights and the generation config
quantized_model_id = "AIDC-AI/Ovis1.6-Gemma2-9B-GPTQ-Int4"

model = OvisGemma2GPTQForCausalLM.from_quantized(
    quantized_model_id,
    device=load_device,
    trust_remote_code=True,
)
model.model.generation_config = GenerationConfig.from_pretrained(quantized_model_id)

text_tokenizer = model.get_text_tokenizer()
visual_tokenizer = model.get_visual_tokenizer()

# THE REST IS THE SAME AS UNQUANTIZED VERSIONS OF OVIS
...
微调原版 Ovis 模型后,自行量化:代码大纲
# torch is needed for the ``torch.bfloat16`` dtype passed to from_pretrained below
import torch

from auto_gptq.modeling import OvisGemma2GPTQForCausalLM

# Load the model.
# NOTE: ``model_path`` is not defined in this outline -- set it to your
# fine-tuned Ovis checkpoint first. ``quantize_config`` must also exist
# (e.g. a ``BaseQuantizeConfig`` instance).
model = OvisGemma2GPTQForCausalLM.from_pretrained(
    model_path,
    quantize_config,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=8192,
    trust_remote_code=True,
).cuda()
# Turn off ``use_cache`` on the wrapped LLM's config before quantizing
model.model.llm.model.config.use_cache = False

# Prepare your own calibration samples here and format them as follows
data_list = [
    {
        "image": "path/to/image/of/this/sample",
        "conversations": [
            {
                "from": "human",
                "value": "<image>\n[Your sample prompt]"
            },
            {
                "from": "gpt",
                "value": "[Anything]"
            }
        ]
    },
    ...
]

# See the Hugging Face Model Cards for details on dataloader formation
train_loader = ...

# Start quantizing
model.quantize(examples=train_loader, cache_examples_on_gpu=False)

# Save the quantized model (placeholder: replace ``...`` with the output path)
quantize_save_path = ...
model.save_quantized(quantize_save_path, use_safetensors=True)