Qwen3-VL 与 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

介绍基于 Qwen3-VL 模型与 LLaMA-Factory 框架进行 Grounding 任务 LoRA 微调的完整流程。涵盖环境搭建、模型权重下载、推理代码示例、数据集格式转换（YOLO 至 Qwen 格式）及可视化训练配置。详细说明了相对坐标归一化处理、DeepStack 技术背景及多模态数据适配方法，并提供模型测试与导出步骤。

ApiHolic 发布于 2026/4/6 · 更新于 2026/7/10 · 48 浏览

0. 官方资源

GitHub - QwenLM/Qwen3-VL

空间感知能力大幅提升：2D grounding 从绝对坐标变为相对坐标，支持判断物体方位、视角变化、遮挡关系，能实现 3D grounding，为复杂场景下的空间推理和具身场景打下基础。

OCR 支持更多语言及复杂场景：除中英文外，支持的语言从 10 种扩展到 32 种，覆盖更多国家和地区；在复杂光线、模糊、倾斜等实拍挑战性场景下表现更稳定；对生僻字、古籍字、专业术语的识别准确率也显著提升；超长文档理解和精细结构还原能力进一步提升。

一是采用 MRoPE-Interleave：原始 MRoPE 将特征维度按照时间（t）、高度（h）和宽度（w）的顺序分块划分，使得时间信息全部分布在高频维度上。在 Qwen3-VL 中采取了 t、h、w 交错分布的形式，实现对时间、高度和宽度的全频率覆盖，这样更加鲁棒的位置编码能够保证模型在图片理解能力相当的情况下，提升对长视频的理解能力。

二是引入 DeepStack 技术：融合 ViT 多层次特征，提升视觉细节捕捉能力和图文对齐精度。沿用 DeepStack 的核心思想，将以往多模态大模型（LMM）单层输入视觉 tokens 的范式，改为在大型语言模型 (LLM) 的多层中进行注入。这种多层注入方式旨在实现更精细化的视觉理解。在此基础上，进一步优化了视觉特征 token 化的策略。具体而言，将来自 ViT 不同层的视觉特征进行 token 化，并以此作为视觉输入。实验结果表明，该方法在多种视觉理解任务上均展现出显著的性能提升。

三是将原有的视频时序建模机制 T-RoPE 升级为文本时间戳对齐机制：该机制采用'时间戳 - 视频帧'交错的输入形式，实现帧级别的时间信息与视觉内容的细粒度对齐。同时，模型原生支持'秒数'与'时：分：秒'（HMS）两种时间输出格式。这一改进显著提升了模型对视频中动作、事件的语义感知与时间定位精度，使其在复杂时序推理任务——如事件定位、动作边界检测、跨模态时间问答等——中表现更稳健、响应更精准。

1. 配置环境

conda create -n Qwen3-vl python=3.10
conda activate Qwen3-vl
pip install accelerate
pip install qwen-vl-utils==0.0.14
uv pip install -U vllm

2. 下载代码

使用 git clone 克隆项目：

git clone https://github.com/QwenLM/Qwen3-VL

或从 GitHub 页面下载压缩包。

3. 下载权重文件

权重文件较大，建议使用 ModelScope 下载：

# 安装 ModelScope
pip install modelscope
# 下载完整模型库
modelscope download --model Qwen/Qwen3-VL-2B-Instruct

4. 推理代码

将下面代码中的 Qwen/Qwen3-VL-4B-Instruct 替换为你下载的权重所在路径（注意：上一步示例下载的是 Qwen3-VL-2B-Instruct），并把图片路径改为你自己的图片。

from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image

def load_qwen3_vl_4b_model():
    """Load the Qwen3-VL-4B-Instruct model and its processor.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # The model id comes from the surrounding article text; replace it with
    # the local directory of the weights you downloaded.
    model_path = "Qwen/Qwen3-VL-4B-Instruct"
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        # NOTE(review): the original values of the next two arguments were
        # lost when the snippet was extracted. "auto" placement and PyTorch's
        # built-in SDPA attention are safe choices that need no extra
        # packages -- confirm against the original article.
        device_map="auto",
        attn_implementation="sdpa",
    )
    processor = AutoProcessor.from_pretrained(model_path)
    return model, processor

 ():
    
    
    image = Image.(image_path).convert()
    
    messages = [
        {
            : ,
            : [
                {: , : image},
                {: , : text_query}
            ]
        }
    ]
    
    inputs = processor.apply_chat_template(
        messages, tokenize=, add_generation_prompt=, return_dict=, return_tensors=
    )
    
    generated_ids = model.generate(
        **inputs, max_new_tokens=, do_sample=, temperature=, top_p=
    )
    
    generated_ids_trimmed = [out_ids[(in_ids):]  in_ids, out_ids  (inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=, clean_up_tokenization_spaces=)
     output_text[]  output_text  

 __name__ == :
    model, processor = load_qwen3_vl_4b_model()
    image_path =  
    query =  
    result = process_multimodal_query(model, processor, image_path, query)
    (, result)

0. 官方资源

GitHub - QwenLM/Qwen3-VL

1. 配置环境

conda create -n Qwen3-vl python=3.10
conda activate Qwen3-vl
pip install accelerate
pip install qwen-vl-utils==0.0.14
uv pip install -U vllm

2. 下载代码

使用 git clone 克隆项目：

git clone https://github.com/QwenLM/Qwen3-VL

或从 GitHub 页面下载压缩包。

3. 下载权重文件

权重文件较大，建议使用 ModelScope 下载：

# 安装 ModelScope
pip install modelscope
# 下载完整模型库
modelscope download --model Qwen/Qwen3-VL-2B-Instruct

4. 推理代码

将下面代码中的 Qwen/Qwen3-VL-4B-Instruct 替换为你下载的权重所在路径（注意：上一步示例下载的是 Qwen3-VL-2B-Instruct），并把图片路径改为你自己的图片。

from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch
from PIL import Image

def load_qwen3_vl_4b_model():
    """Load the Qwen3-VL-4B-Instruct model and its processor.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # The model id comes from the surrounding article text; replace it with
    # the local directory of the weights you downloaded.
    model_path = "Qwen/Qwen3-VL-4B-Instruct"
    model = Qwen3VLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        # NOTE(review): the original values of the next two arguments were
        # lost when the snippet was extracted. "auto" placement and PyTorch's
        # built-in SDPA attention are safe choices that need no extra
        # packages -- confirm against the original article.
        device_map="auto",
        attn_implementation="sdpa",
    )
    processor = AutoProcessor.from_pretrained(model_path)
    return model, processor

 ():
    
    
    image = Image.(image_path).convert()
    
    messages = [
        {
            : ,
            : [
                {: , : image},
                {: , : text_query}
            ]
        }
    ]
    
    inputs = processor.apply_chat_template(
        messages, tokenize=, add_generation_prompt=, return_dict=, return_tensors=
    )
    
    generated_ids = model.generate(
        **inputs, max_new_tokens=, do_sample=, temperature=, top_p=
    )
    
    generated_ids_trimmed = [out_ids[(in_ids):]  in_ids, out_ids  (inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=, clean_up_tokenization_spaces=)
     output_text[]  output_text  

 __name__ == :
    model, processor = load_qwen3_vl_4b_model()
    image_path =  
    query =  
    result = process_multimodal_query(model, processor, image_path, query)
    (, result)

import json
import os

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws a progress bar; the conversion itself does not need it.
    def tqdm(iterable, **_kwargs):
        """Stand-in for tqdm when it is not installed: iterate silently."""
        return iterable


# ================== Paths you need to edit ==================
IMAGE_DIR = "images"  # e.g. qwen3_vl_grounding_train/images
LABEL_DIR = "labels"  # e.g. qwen3_vl_grounding_train/labels
OUTPUT_JSON = "qwen3_vl_grounding_mllm.json"

# Maps a YOLO class id to a human-readable name written as "cls_name".
CLASS_ID2NAME = {
    0: "house",
    # 1: "car",
    # ...
}

# None keeps every class. To keep only houses: USE_ONLY_CLASS_IDS = {0}
USE_ONLY_CLASS_IDS = None

USER_PROMPT = (
    "<image>\n"
    "Locate all objects in this image and output the bbox coordinates "
    "in JSON format using relative coordinates in the range [0, 1000]."
)


# ================== Coordinate helpers ==================
def yolo_to_xyxy_relative(xc, yc, w, h):
    """Convert a normalized YOLO box to corner form.

    Args:
        xc, yc: Box centre, normalized to [0, 1].
        w, h: Box width and height, normalized to [0, 1].

    Returns:
        ``[x_min, y_min, x_max, y_max]``, each value clamped to [0, 1].
    """
    corners = (xc - w / 2, yc - h / 2, xc + w / 2, yc + h / 2)
    # Boxes touching the image border can overshoot slightly; clamp them.
    return [max(0.0, min(1.0, v)) for v in corners]


def scale_to_qwen_coords(xyxy_rel, scale=1000):
    """Scale [0, 1] corner coordinates to integer [0, scale] coordinates.

    Args:
        xyxy_rel: ``[x_min, y_min, x_max, y_max]`` in [0, 1].
        scale: Upper bound of the target range (1000 by default, the range
            requested in ``USER_PROMPT``).

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as rounded integers.
    """
    x_min, y_min, x_max, y_max = xyxy_rel
    return [int(round(v * scale)) for v in (x_min, y_min, x_max, y_max)]


def collect_image_files(image_dir):
    """Return the sorted image file names found directly in *image_dir*.

    A file counts as an image when its lower-cased extension is one of
    .jpg, .jpeg, .png, .bmp or .webp.
    """
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
    return sorted(
        fname
        for fname in os.listdir(image_dir)
        if os.path.splitext(fname)[1].lower() in exts
    )


# ================== Main logic ==================
def _parse_label_file(label_path):
    """Read one YOLO label file and return its ``(cls_id, bbox)`` pairs.

    Lines that are too short or contain non-numeric fields are reported on
    stdout and skipped, so one bad line does not abort the whole conversion.
    Classes excluded by ``USE_ONLY_CLASS_IDS`` are skipped silently.
    """
    entries = []
    with open(label_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) < 5:
                print(f"Label format error in {label_path}: {line}")
                continue
            try:
                cls_id = int(parts[0])
                if USE_ONLY_CLASS_IDS is not None and cls_id not in USE_ONLY_CLASS_IDS:
                    continue
                xc, yc, w, h = (float(value) for value in parts[1:5])
            except ValueError:
                # Non-numeric field: same report as a too-short line.
                print(f"Label format error in {label_path}: {line}")
                continue
            xyxy_rel = yolo_to_xyxy_relative(xc, yc, w, h)
            entries.append((cls_id, scale_to_qwen_coords(xyxy_rel, scale=1000)))
    return entries


def main():
    """Convert the YOLO labels in LABEL_DIR to one JSON dataset file.

    For every image in IMAGE_DIR that has a same-named ``.txt`` label file
    with at least one usable box, one sample is written: a human turn with
    ``USER_PROMPT`` and a gpt turn holding the boxes as a JSON string.
    Images are referenced by absolute path. The result goes to OUTPUT_JSON.
    """
    image_files = collect_image_files(IMAGE_DIR)
    if not image_files:
        print(f"No images found in {IMAGE_DIR}")
        return

    dataset = []
    for img_name in tqdm(image_files, desc="Converting"):
        img_abs_path = os.path.abspath(os.path.join(IMAGE_DIR, img_name))
        base, _ = os.path.splitext(img_name)
        label_path = os.path.join(LABEL_DIR, base + ".txt")
        if not os.path.exists(label_path):
            continue

        entries = _parse_label_file(label_path)
        if not entries:
            continue

        objects = []
        for cid, box in entries:
            obj = {"cls_id": cid, "bbox_2d": box}
            if cid in CLASS_ID2NAME:
                obj["cls_name"] = CLASS_ID2NAME[cid]
            objects.append(obj)

        answer_str = json.dumps({"objects": objects}, ensure_ascii=False)
        dataset.append(
            {
                "conversations": [
                    {"from": "human", "value": USER_PROMPT},
                    {"from": "gpt", "value": answer_str},
                ],
                "images": [img_abs_path],
            }
        )

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)
    print(f"Done. Wrote {len(dataset)} samples to {OUTPUT_JSON}")


if __name__ == "__main__":
    main()

Qwen3-VL 与 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

0. 官方资源

1. 配置环境

2. 下载代码

3. 下载权重文件

4. 推理代码

Qwen3-VL 与 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

0. 官方资源

1. 配置环境

2. 下载代码

3. 下载权重文件

4. 推理代码

更多推荐文章

相关免费在线工具

5. 微调部分

5.1 使用 LLaMA-Factory 项目进行微调

5.1.1 下载项目

5.1.2 创建虚拟环境

5.2 准备微调数据集

5.2.1 所需数据集格式

5.2.2 YOLO 格式转换为 qwen3-vl-grounding 格式

5.3 使用 LLaMA-Factory 可视化界面进行微调

5.3.1 启动可视化界面

5.3.2 修改训练参数

5.3.3 对话测试模型

更多推荐文章

相关免费在线工具

Qwen3-VL 与 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

0. 官方资源

1. 配置环境

2. 下载代码

3. 下载权重文件

4. 推理代码

Qwen3-VL 与 LLaMA-Factory 实现 Grounding 任务 LoRA 微调

0. 官方资源

1. 配置环境

2. 下载代码

3. 下载权重文件

4. 推理代码

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

5. 微调部分

5.1 使用 LLaMA-Factory 项目进行微调

5.1.1 下载项目

5.1.2 创建虚拟环境

5.2 准备微调数据集

5.2.1 所需数据集格式

5.2.2 YOLO 格式转换为 qwen3-vl-grounding 格式

5.3 使用 LLaMA-Factory 可视化界面进行微调

5.3.1 启动可视化界面

5.3.2 修改训练参数

5.3.3 对话测试模型

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具