环境配置
- Ubuntu 24
- NVIDIA 3090 (24G)
- CUDA 12.9
一、数据集制作
1. Label-studio 制作数据集
这是从零开始制作数据集的方法。安装完 label-studio 后,输入指令启动:
label-studio start
进入浏览器界面创建项目(Create Project),引入图片后选择图像描述数据集制作(Image Captioning)。
2. 利用 Qwen2.5-VL 半自动制作数据集
利用 Qwen 的图像描述能力进行预生成,再人工复核修改,可减少人力成本。脚本示例如下:
import torch
from modelscope import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import time
import os
from pathlib import Path
import json
def process_single_image(model, processor, image_path, prompt):
    """Generate a caption for a single image with a Qwen2.5-VL model.

    Args:
        model: loaded ``Qwen2_5_VLForConditionalGeneration`` instance; the
            preprocessed inputs are moved to CUDA, so the model is expected
            to live on a CUDA device.
        processor: matching ``AutoProcessor`` (chat template + vision
            preprocessing).
        image_path: path to the image file, as a string.
        prompt: text instruction sent alongside the image.

    Returns:
        str: the decoded caption for this one image.

    NOTE(review): f-string contents, boolean kwarg values and the trailing
    return on this block were lost in extraction; they are reconstructed
    from the standard Qwen2.5-VL inference recipe — confirm against the
    original script if available.
    """
    # Single-turn chat message carrying both the image and the text prompt.
    messages = [{"role": "user", "content": [{"type": "image", "image": image_path}, {"type": "text", "text": prompt}]}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
    inputs = inputs.to("cuda")
    time_start = time.time()
    # do_sample=False -> greedy decoding, so captions are reproducible.
    generated_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
    time_end = time.time()
    print(f"Inference time for {image_path}: {time_end - time_start:.2f}s")
    # Drop the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return output_text[0]
def process_images_in_folder(model, processor, image_folder, prompt, output_file=None):
    """Caption every image in *image_folder* and optionally dump results as JSONL.

    Args:
        model: loaded ``Qwen2_5_VLForConditionalGeneration`` instance.
        processor: matching ``AutoProcessor``.
        image_folder: directory to scan (non-recursive) for image files.
        prompt: text instruction passed to :func:`process_single_image`.
        output_file: optional path; when truthy, one JSON object per image
            is written there, one per line (JSONL).

    Returns:
        list[dict]: one record per image. Successful records carry
        ``image_name``/``image_path``/``caption``; failed records
        additionally carry an ``error`` message and an empty caption.

    NOTE(review): the function header, keywords and every string literal in
    this block were lost in extraction; dictionary keys, the extension set
    and the printed messages are reconstructed to fit the surviving
    structure — confirm against the original script.
    """
    image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}
    image_files = []
    for file in Path(image_folder).iterdir():
        if file.suffix.lower() in image_extensions:
            image_files.append(file)
    # Sort so runs are deterministic regardless of filesystem order.
    image_files.sort()
    if not image_files:
        print(f"No images found in {image_folder}")
        return []
    results = []
    for image_file in image_files:
        print(f"Processing {image_file.name}")
        try:
            result = process_single_image(model, processor, str(image_file), prompt)
            print(result)
            results.append({"image_name": image_file.name, "image_path": str(image_file), "caption": result})
        except Exception as e:
            # Best-effort batch: record the failure and keep going so one
            # bad image does not abort the whole folder.
            print(f"Failed on {image_file.name}: {e}")
            results.append({"image_name": image_file.name, "image_path": str(image_file), "caption": "", "error": str(e)})
    if output_file:
        with open(output_file, "w", encoding="utf-8") as f:
            for item in results:
                # Keep the JSONL slim: just the file name and its caption.
                json_line = {"image": item["image_name"], "caption": item["caption"]}
                # ensure_ascii=False keeps Chinese captions human-readable.
                f.write(json.dumps(json_line, ensure_ascii=False) + "\n")
        print(f"Saved {len(results)} results to {output_file}")
    return results
if __name__ == "__main__":
    # NOTE(review): the model id, dtype/device kwargs, pixel budgets and all
    # path/prompt strings below were lost in extraction; the values here are
    # reconstructed placeholders — adjust them before running.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2.5-VL-7B-Instruct",
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    # Bound the visual token count: each visual token covers a 28x28 patch,
    # so these limits translate directly into a min/max token budget.
    min_pixels = 256 * 28 * 28
    max_pixels = 1280 * 28 * 28
    processor = AutoProcessor.from_pretrained(
        "Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
    )
    image_folder = "./images"
    prompt = "Describe this image in detail."
    output_file = "captions.jsonl"
    results = process_images_in_folder(model, processor, image_folder, prompt, output_file)
    print(f"Done: processed {len(results)} images.")

