Qwen3-VL 结合 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

Qwen3-VL 特性概览

Qwen3-VL 在空间感知与 OCR 能力上有了显著提升。2D grounding 从绝对坐标转向相对坐标，支持判断物体方位、视角变化及遮挡关系，甚至能实现 3D grounding，为复杂场景下的空间推理打下基础。OCR 方面，支持语言从 10 种扩展至 32 种，在复杂光线、模糊等场景下表现更稳定。

其技术核心主要体现在三点：

一是采用 MRoPE-Interleave 位置编码。原始 MRoPE 将特征维度按时间、高度和宽度的顺序分块划分，而 Qwen3-VL 改为让三者交错分布，实现了对全频率的覆盖，增强了模型对长视频的理解鲁棒性。

二是引入 DeepStack 技术。通过融合 ViT 多层次特征，将视觉 tokens 注入到大型语言模型的多层而非单层，实现了更精细化的视觉理解，有效保留了从底层到高层的丰富信息。

三是升级视频时序建模机制。将原有的 T-RoPE 升级为文本时间戳对齐机制，支持'秒数'与'时：分：秒'两种输出格式，显著提升了动作定位与事件语义感知的精度。

环境配置

首先准备 Python 环境。建议使用 conda 创建独立虚拟环境，避免依赖冲突。

# Create and activate an isolated environment for the Qwen3-VL experiments.
conda create -n qwen3-vl python=3.10
conda activate qwen3-vl
pip install accelerate
pip install qwen-vl-utils==0.0.14
# A fresh conda environment does not ship uv; install it before using `uv pip`.
pip install uv
uv pip install -U vllm

注意安装 vLLM 时需确保版本兼容，建议查阅官方文档确认最新稳定版。

模型下载与推理

权重文件较大，推荐使用 ModelScope 进行下载。

# Install the ModelScope CLI and fetch the model weights.
pip install modelscope
# NOTE(review): this downloads the 2B checkpoint, while the inference script
# below loads "Qwen/Qwen3-VL-4B-Instruct" - confirm which size is intended.
modelscope download --model Qwen/Qwen3-VL-2B-Instruct

模型下载完成后，我们可以编写一个简单的推理脚本。这里需要注意 device_map 的设置以及注意力机制的选择：使用 flash_attention_2 可以加速推理过程，但需要预先安装 flash-attn 包；若未安装，可去掉 attn_implementation 参数，使用默认的注意力实现。

from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image

def load_qwen3_vl_model(model_path="Qwen/Qwen3-VL-4B-Instruct"):
    """Load a Qwen3-VL checkpoint together with its processor.

    Args:
        model_path: Model id or local checkpoint directory handed to
            ``from_pretrained`` for both the model and the processor, so the
            two always come from the same checkpoint. Defaults to the id the
            original script hard-coded.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # Needs the optional flash-attn package to be installed.
        attn_implementation="flash_attention_2",
    )
    processor = AutoProcessor.from_pretrained(model_path)
    return model, processor

def process_query(model, processor, image_path, text_query):
    """Answer a single image + text query and return the decoded reply.

    Args:
        model: Model object exposing ``generate`` and ``device``.
        processor: Processor exposing ``apply_chat_template`` and
            ``batch_decode``.
        image_path: Path of the image file to open.
        text_query: Question or instruction sent alongside the image.

    Returns:
        The decoded text of the first (and only) generated sequence, with the
        prompt tokens removed.
    """
    image = Image.open(image_path).convert('RGB')
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": text_query},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True,
        return_dict=True, return_tensors="pt"
    )
    # The model is placed by device_map="auto"; the inputs must follow it.
    inputs = inputs.to(model.device)
    # NOTE(review): the original sampling values were lost when the article
    # was extracted; the numbers below are placeholders - tune as needed.
    generated_ids = model.generate(
        **inputs, max_new_tokens=512, do_sample=True,
        temperature=0.7, top_p=0.8
    )
    # generate() returns prompt + completion; keep only the completion part.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text[0]


if __name__ == "__main__":
    model, processor = load_qwen3_vl_model()
    # NOTE(review): the original path, prompt and label strings were lost in
    # extraction; these are placeholders - replace them with real values.
    image_path = "demo.jpg"
    query = "Locate all objects in this image and output the bbox coordinates in JSON format."
    result = process_query(model, processor, image_path, query)
    print("Result:", result)

import os
import json

IMAGE_DIR = "images"
LABEL_DIR = "labels"
OUTPUT_JSON = "qwen3_vl_grounding_mllm.json"
# File suffixes (compared lower-cased) that are picked up from IMAGE_DIR.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

USER_PROMPT = (
    "<image>\n"
    "Locate all objects in this image and output the bbox coordinates "
    "in JSON format using relative coordinates in the range [0, 1000]."
)


def yolo_to_xyxy_relative(xc, yc, w, h):
    """Convert a center/size box into corner form, clamped to [0, 1].

    Args:
        xc: Box center x.
        yc: Box center y.
        w: Box width.
        h: Box height.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` with every value clamped to the
        closed interval [0.0, 1.0].
    """
    x_min = xc - w / 2
    y_min = yc - h / 2
    x_max = xc + w / 2
    y_max = yc + h / 2
    return [
        max(0.0, min(1.0, x_min)),
        max(0.0, min(1.0, y_min)),
        max(0.0, min(1.0, x_max)),
        max(0.0, min(1.0, y_max)),
    ]


def scale_to_qwen_coords(xyxy_rel, scale=1000):
    """Multiply four relative coordinates by *scale* and round to integers.

    Args:
        xyxy_rel: Sequence ``(x_min, y_min, x_max, y_max)``.
        scale: Multiplier applied to each coordinate before rounding.

    Returns:
        A list of four ints in the same order as the input.
    """
    x_min, y_min, x_max, y_max = xyxy_rel
    return [
        int(round(x_min * scale)),
        int(round(y_min * scale)),
        int(round(x_max * scale)),
        int(round(y_max * scale)),
    ]


def _read_yolo_objects(label_path):
    """Parse one label file into a list of ``{"cls_id", "bbox_2d"}`` dicts.

    Lines with fewer than five whitespace-separated fields are skipped.
    """
    objects = []
    with open(label_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) < 5:
                continue
            cls_id = int(parts[0])
            xc, yc, w, h = (float(value) for value in parts[1:5])
            box = scale_to_qwen_coords(yolo_to_xyxy_relative(xc, yc, w, h))
            objects.append({"cls_id": cls_id, "bbox_2d": box})
    return objects


def main():
    """Build OUTPUT_JSON from the images in IMAGE_DIR and labels in LABEL_DIR.

    Images without a label file, or whose label file yields no boxes, are
    left out of the dataset.
    """
    # Imported here so the conversion helpers above stay importable in
    # environments that do not have tqdm installed.
    from tqdm import tqdm

    files = sorted(
        name for name in os.listdir(IMAGE_DIR)
        if name.lower().endswith(IMAGE_EXTENSIONS)
    )
    dataset = []
    for img_name in tqdm(files):
        base, _ = os.path.splitext(img_name)
        label_path = os.path.join(LABEL_DIR, base + ".txt")
        if not os.path.exists(label_path):
            continue
        objects = _read_yolo_objects(label_path)
        if not objects:
            continue
        answer_str = json.dumps({"objects": objects}, ensure_ascii=False)
        dataset.append({
            "conversations": [
                {"from": "human", "value": USER_PROMPT},
                {"from": "gpt", "value": answer_str},
            ],
            "images": [os.path.join(IMAGE_DIR, img_name)],
        })
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)
    print(f"Done. Samples: {len(dataset)}")


if __name__ == "__main__":
    main()

Qwen3-VL 结合 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

Qwen3-VL 特性概览

环境配置

模型下载与推理

更多推荐文章

相关免费在线工具

微调实战

数据准备

启动训练

更多推荐文章

相关免费在线工具

Qwen3-VL 结合 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

Qwen3-VL 特性概览

环境配置

模型下载与推理

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

微调实战

数据准备

启动训练

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具