Qwen3-VL 结合 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

Qwen3-VL 是阿里云 Qwen 团队开发的多模态大语言模型系列，在空间感知和 OCR 能力上有了显著提升。它支持 2D/3D grounding，能判断物体方位、视角变化及遮挡关系；OCR 支持语言扩展至 32 种，对生僻字、古籍字及复杂场景的识别准确率更高。

环境配置

首先创建独立的 Conda 环境并安装依赖：

conda create -n Qwen3-vl python=3.10
conda activate Qwen3-vl
pip install accelerate
pip install qwen-vl-utils==0.0.14
# Plain pip is used because uv is not installed in a fresh Conda environment;
# the lower bound enforces the recommended minimum vLLM version (0.11.0).
pip install -U "vllm>=0.11.0"

注意：vLLM 版本建议保持在 >=0.11.0。

下载模型权重

由于模型体积较大，推荐使用国内镜像加速下载。安装 ModelScope 后执行：

# Install the ModelScope CLI, then download the 2B Instruct checkpoint.
pip install modelscope
modelscope download --model Qwen/Qwen3-VL-2B-Instruct

注意：下文的推理示例加载的是 Qwen/Qwen3-VL-4B-Instruct。若需使用 4B 版本，请将下载命令中的模型名称相应改为 Qwen/Qwen3-VL-4B-Instruct。

推理测试

在微调前，先验证模型加载与推理是否正常。以下代码展示了如何加载模型并处理多模态查询（图像 + 文本）：

from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image

def load_qwen3_vl_4b_model(model_id="Qwen/Qwen3-VL-4B-Instruct"):
    """Load a Qwen3-VL checkpoint together with its processor.

    Args:
        model_id: Hub identifier or local directory of the checkpoint.
            Defaults to the 4B Instruct model, so existing zero-argument
            calls behave as before.

    Returns:
        A ``(model, processor)`` tuple. The model is loaded in bfloat16 with
        ``device_map="auto"``.
    """
    import importlib.util

    load_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
    }
    # FlashAttention-2 is only an optional speed-up. Request it only when the
    # flash_attn package is importable; otherwise let transformers pick its
    # default attention backend instead of failing at load time.
    if importlib.util.find_spec("flash_attn") is not None:
        load_kwargs["attn_implementation"] = "flash_attention_2"

    model = Qwen3VLForConditionalGeneration.from_pretrained(model_id, **load_kwargs)
    processor = AutoProcessor.from_pretrained(model_id)
    return model, processor

def process_multimodal_query(model, processor, image_path, text_query):
    """Answer a single multimodal query (one image plus one text prompt).

    Args:
        model: Loaded Qwen3-VL generation model.
        processor: Processor matching ``model``.
        image_path: Path of the image file to send with the prompt.
        text_query: Text of the user prompt.

    Returns:
        The decoded answer for the query, or an empty string when the
        processor returns no decoded text.
    """
    # Use a context manager so the underlying file handle is closed once the
    # RGB copy has been made.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": text_query},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True,
        return_dict=True, return_tensors="pt",
    )
    # The model is loaded with device_map="auto", so the input tensors must be
    # moved to the model's device before generation.
    inputs = inputs.to(model.device)

    # NOTE(review): the original sampling values were lost from this snippet;
    # the numbers below are placeholders - confirm or tune them.
    generated_ids = model.generate(
        **inputs, max_new_tokens=512, do_sample=True,
        temperature=0.7, top_p=0.9,
    )
    # generate() returns prompt + completion; drop the prompt tokens so only
    # the newly generated answer is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0] if output_text else ""

 __name__ == :
    model, processor = load_qwen3_vl_4b_model()
    image_path =   
    query =   
    result = process_multimodal_query(model, processor, image_path, query)
    (, result)

import os
import json

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws the progress bar; the conversion itself works without it.
    def tqdm(iterable, **kwargs):
        return iterable

# ================== Configuration ==================
IMAGE_DIR = "images"  # directory containing the images
LABEL_DIR = "labels"  # directory containing the YOLO label files
OUTPUT_JSON = "qwen3_vl_grounding_mllm.json"  # output file
CLASS_ID2NAME = {  # optional mapping from class ID to class name
    0: "house",
    # 1: "car",
}
USER_PROMPT = (
    "<image>\n"
    "Locate all objects in this image and output the bbox coordinates "
    "in JSON format using relative coordinates in the range [0, 1000]."
)
# ===================================================


def yolo_to_xyxy_relative(xc, yc, w, h):
    """Convert a normalized YOLO box to ``[x_min, y_min, x_max, y_max]``.

    Args:
        xc, yc: Normalized box centre.
        w, h: Normalized box width and height.

    Returns:
        The four corner coordinates, each clamped to ``[0.0, 1.0]``.
    """
    x_min = xc - w / 2
    y_min = yc - h / 2
    x_max = xc + w / 2
    y_max = yc + h / 2
    return [max(0.0, min(1.0, v)) for v in (x_min, y_min, x_max, y_max)]


def scale_to_qwen_coords(xyxy_rel, scale=1000):
    """Scale ``[0, 1]`` coordinates to ``[0, scale]`` and round to integers."""
    x_min, y_min, x_max, y_max = xyxy_rel
    return [int(round(v * scale)) for v in (x_min, y_min, x_max, y_max)]


def collect_image_files(image_dir):
    """Return the sorted names of image files found directly in ``image_dir``."""
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
    return sorted(
        fname
        for fname in os.listdir(image_dir)
        if os.path.splitext(fname)[1].lower() in exts
    )


def _parse_yolo_line(line):
    """Parse one label line into ``(cls_id, qwen_box)``; ``None`` if unusable."""
    parts = line.split()
    if len(parts) < 5:
        return None
    try:
        cls_id = int(parts[0])
        xc, yc, w, h = (float(v) for v in parts[1:5])
    except ValueError:
        # A non-numeric field must not abort the whole conversion run.
        return None
    box = scale_to_qwen_coords(yolo_to_xyxy_relative(xc, yc, w, h), scale=1000)
    return cls_id, box


def _read_label_file(label_path):
    """Read one YOLO label file; return ``(objects, skipped_line_count)``."""
    objects = []
    skipped = 0
    with open(label_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parsed = _parse_yolo_line(line)
            if parsed is None:
                skipped += 1
                continue
            cls_id, box = parsed
            obj = {"cls_id": cls_id, "bbox_2d": box}
            if cls_id in CLASS_ID2NAME:
                obj["cls_name"] = CLASS_ID2NAME[cls_id]
            objects.append(obj)
    return objects, skipped


def main(image_dir=None, label_dir=None, output_json=None):
    """Convert a YOLO-labelled image folder into a conversation-style JSON file.

    Each image that has a label file with at least one usable box becomes one
    sample: a human turn holding ``USER_PROMPT``, a gpt turn holding the boxes
    as a JSON string, and the absolute image path.

    Args:
        image_dir: Image directory; defaults to ``IMAGE_DIR``.
        label_dir: Label directory; defaults to ``LABEL_DIR``.
        output_json: Output path; defaults to ``OUTPUT_JSON``.
    """
    # Resolve defaults at call time so edits to the module constants are seen.
    image_dir = IMAGE_DIR if image_dir is None else image_dir
    label_dir = LABEL_DIR if label_dir is None else label_dir
    output_json = OUTPUT_JSON if output_json is None else output_json

    image_files = collect_image_files(image_dir)
    if not image_files:
        print(f"No images found in {image_dir}")
        return

    dataset = []
    skipped_lines = 0
    for img_name in tqdm(image_files, desc="Converting"):
        base, _ = os.path.splitext(img_name)
        label_path = os.path.join(label_dir, base + ".txt")
        if not os.path.exists(label_path):
            continue

        objects, skipped = _read_label_file(label_path)
        skipped_lines += skipped
        if not objects:
            continue

        answer_str = json.dumps({"objects": objects}, ensure_ascii=False)
        img_abs_path = os.path.abspath(os.path.join(image_dir, img_name))
        dataset.append(
            {
                "conversations": [
                    {"from": "human", "value": USER_PROMPT},
                    {"from": "gpt", "value": answer_str},
                ],
                "images": [img_abs_path],
            }
        )

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)
    print(f"Done. Wrote {len(dataset)} samples to {output_json}")
    if skipped_lines:
        print(f"Skipped {skipped_lines} malformed label line(s)")


if __name__ == "__main__":
    main()

Qwen3-VL 结合 LLaMA-Factory 实现 Grounding 任务 LoRA 微调