1. 环境配置
conda create -n Qwen3-vl python=3.10
conda activate Qwen3-vl
pip install transformers accelerate
pip install qwen-vl-utils==0.0.14
uv pip install -U vllm
2. 下载代码
git clone https://github.com/QwenLM/Qwen3-VL
3. 下载权重文件
使用 ModelScope 下载模型库:
pip install modelscope
modelscope download --model Qwen/Qwen3-VL-4B-Instruct
4. 推理测试
修改模型路径和图片路径后运行以下脚本:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image
def load_qwen3_vl_4b_model():
    """Load the Qwen3-VL-4B-Instruct checkpoint and its matching processor.

    Returns:
        tuple: ``(model, processor)`` — a ``Qwen3VLForConditionalGeneration``
        instance ready for generation, and the ``AutoProcessor`` that prepares
        its multimodal inputs.
    """
    model_id = "Qwen/Qwen3-VL-4B-Instruct"
    # bfloat16 + FlashAttention-2 keep memory usage down; device_map="auto"
    # lets accelerate place the weights across available GPUs/CPU.
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="flash_attention_2",
    )
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor
def process_multimodal_query(model, processor, image_path, text_query):
    """Answer a single text question about a single image with Qwen3-VL.

    Args:
        model: Loaded ``Qwen3VLForConditionalGeneration`` instance.
        processor: The matching ``AutoProcessor``.
        image_path: Filesystem path to the input image.
        text_query: The user's question about the image.

    Returns:
        str: The decoded model answer with the prompt tokens stripped;
        an empty string if decoding yields nothing.
    """
    # Force RGB so RGBA/palette/grayscale images don't break preprocessing.
    image = Image.open(image_path).convert('RGB')
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": text_query}
            ]
        }
    ]
    inputs = processor.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True,
        return_dict=True, return_tensors="pt"
    ).to(model.device)  # with device_map="auto" the inputs must follow the model
    generated_ids = model.generate(
        **inputs, max_new_tokens=512, do_sample=True,
        temperature=0.7, top_p=0.8
    )
    # generate() echoes the prompt; keep only the newly generated tail.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0] if output_text else ""
if __name__ == "__main__":
    # Demo driver: load the model once, then answer one query.
    model, processor = load_qwen3_vl_4b_model()
    image_path = "test.jpg"  # TODO: replace with your image path
    query = "Describe this image."  # TODO: replace with your question
    result = process_multimodal_query(model, processor, image_path, query)
    print("Answer:", result)


