人工智能大模型的多模态融合与跨领域应用实战

人工智能大模型的多模态融合与跨领域应用实战 | 极客日志

from transformers import BertTokenizer, BertModel
import torch

# Initialize the text encoder: load the "bert-base-chinese" tokenizer and
# model, and move the model to the CUDA device.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
text_encoder = BertModel.from_pretrained("bert-base-chinese").to("cuda")

def preprocess_text(text):
    """Encode ``text`` with the module-level BERT encoder and return its [CLS] feature.

    The text is tokenized to a fixed length of 64 tokens (truncated or
    padded), run through ``text_encoder`` without gradient tracking, and the
    hidden state at sequence position 0 is returned with all size-1
    dimensions squeezed out.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "truncation": True,
        "max_length": 64,
        "padding": "max_length",
    }
    encoded = tokenizer(text, **tokenizer_options).to("cuda")

    with torch.no_grad():
        encoder_output = text_encoder(**encoded)

    # Sequence position 0 holds the [CLS] token.
    cls_feature = encoder_output.last_hidden_state[:, 0, :]
    return cls_feature.squeeze()

# Demo: encode one sentence and print the shape of the resulting feature.
text = "一只黑色的猫坐在沙发上"
text_feat = preprocess_text(text)
print(f"文本特征维度：{text_feat.shape}")  # expected output: torch.Size([768])

from transformers import ViTImageProcessor, ViTModel
from PIL import Image
import torch

# Initialize the image encoder: load the "google/vit-base-patch16-224"
# image processor and ViT model, and move the model to the CUDA device.
image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
image_encoder = ViTModel.from_pretrained("google/vit-base-patch16-224").to("cuda")

def preprocess_image(image_path):
    """Return the ViT feature at sequence position 0 for the image at ``image_path``.

    The image is opened, converted to RGB, pre-processed by the module-level
    ``image_processor`` (bilinear resampling), and encoded by
    ``image_encoder`` without gradient tracking. All size-1 dimensions of
    the selected hidden state are squeezed out before it is returned.
    """
    rgb_image = Image.open(image_path).convert("RGB")

    # Resizing / normalization is delegated to the processor.
    pixel_inputs = image_processor(
        rgb_image, return_tensors="pt", resample=Image.BILINEAR
    )
    pixel_inputs = pixel_inputs.to("cuda")

    with torch.no_grad():
        encoder_output = image_encoder(**pixel_inputs)

    global_feature = encoder_output.last_hidden_state[:, 0, :]
    return global_feature.squeeze()

# Demo: extract the feature of "cat.jpg" and print its shape.
image_feat = preprocess_image("cat.jpg")
print(f"图像特征维度：{image_feat.shape}")  # expected output: torch.Size([768])

from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf
import torch

# Initialize the speech encoder: load the "facebook/wav2vec2-base-960h"
# processor and Wav2Vec2 model, and move the model to the CUDA device.
# NOTE(review): the name `processor` is rebound by later sections of this
# file; preprocess_audio reads whichever object is bound when it is called.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
audio_encoder = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")

def preprocess_audio(audio_path):
    """Return a mean-pooled Wav2Vec2 feature vector for a 16 kHz audio file.

    Args:
        audio_path: Path of an audio file readable by ``soundfile``.

    Returns:
        The encoder's last hidden state averaged over the time axis, with
        all size-1 dimensions squeezed out.

    Raises:
        AssertionError: If the file's sampling rate is not 16000 Hz.
    """
    audio, sr = sf.read(audio_path)
    # Raised explicitly (instead of via `assert`) so the check still runs
    # under `python -O`; the exception type is kept for existing callers.
    if sr != 16000:
        raise AssertionError("音频采样率必须为 16kHz")
    # soundfile returns a (frames, channels) array for multi-channel files;
    # down-mix to a mono 1-D waveform before feature extraction.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # Pad / truncate to 32000 samples and state the sampling rate explicitly
    # so the processor can validate it.
    inputs = processor(
        audio,
        sampling_rate=sr,
        return_tensors="pt",
        padding="max_length",
        max_length=32000,
        truncation=True
    ).to("cuda")
    with torch.no_grad():
        outputs = audio_encoder(**inputs)
    # Average over the time axis (dim=1) to get one vector per clip.
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Demo: extract the feature of "speech.wav" and print its shape.
audio_feat = preprocess_audio("speech.wav")
print(f"语音特征维度：{audio_feat.shape}")  # expected output: torch.Size([768])

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load the CLIP model (moved to CUDA) and its processor.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Prepare three captions and three images to compare against each other.
texts = ["一只黑色的猫", "红色的汽车", "绿色的草地"]
images = [
    Image.open("cat.jpg"),
    Image.open("car.jpg"),
    Image.open("grass.jpg")
]

# Pre-process text and images together into one padded, truncated batch.
inputs = processor(
    text=texts,
    images=images,
    return_tensors="pt",
    padding=True,
    truncation=True
).to("cuda")

# Feature extraction without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
    text_embeds = outputs.text_embeds  # text features; the original note gives shape (3, 512)
    image_embeds = outputs.image_embeds  # image features; the original note gives shape (3, 512)

# Pairwise cosine similarity: the two unsqueeze calls broadcast the text
# embeddings against the image embeddings, so entry [i, j] compares
# texts[i] with images[j].
similarity = torch.nn.functional.cosine_similarity(
    text_embeds.unsqueeze(1),
    image_embeds.unsqueeze(0),
    dim=-1
)
print("图文相似度矩阵：")
print(similarity.cpu().numpy())

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf
import torch

# Load the speech-recognition (CTC) model and its processor.
# NOTE(review): confirm that the "facebook/wav2vec2-base-chinese" checkpoint
# exists on the Hub; it could not be verified from this file.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-chinese")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-chinese").to("cuda")

# Load the audio. Per the original note, the spoken content is
# "人工智能多模态融合技术".
audio, sr = sf.read("speech.wav")

# Pre-process. The file's real sampling rate is passed (instead of a
# hard-coded 16000) so a mismatching file is reported by the processor
# rather than silently mis-transcribed.
inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True).to("cuda")
input_values = inputs.input_values

# Inference: CTC logits; the original note gives shape (1, time steps, vocabulary size).
with torch.no_grad():
    logits = model(input_values).logits

# Greedy CTC decoding. `output_char_offsets=True` makes the tokenizer also
# return, for every decoded character, the first and one-past-last logit
# frame it was emitted on. This replaces the previous call to
# `model.ctc_loss`, an attribute Wav2Vec2ForCTC does not have.
pred_ids = torch.argmax(logits, dim=-1)
decoded = processor.decode(
    pred_ids[0], skip_special_tokens=True, output_char_offsets=True
)
transcription = decoded.text
print(f"语音识别结果：{transcription}")

# Alignment information: one {"char", "start_offset", "end_offset"} entry
# per decoded character, offsets expressed in logit frames.
alignment = decoded.char_offsets

# Convert frame offsets to seconds using the clip duration per logit frame.
audio_duration = len(audio) / sr
time_per_step = audio_duration / logits.shape[1]
char_times = [
    (
        item["char"],
        item["start_offset"] * time_per_step,
        item["end_offset"] * time_per_step,
    )
    for item in alignment
]

print("语音 - 文本时序对齐结果：")
for char, start, end in char_times:
    print(f"字符 '{char}': {start:.2f}s - {end:.2f}s")

pip install transformers pillow torch soundfile
# Core dependencies (the command above)
pip install faiss-cpu
# Used for vector retrieval (optional; speeds up retrieval)

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os
import faiss
import numpy as np
import torch

# 1. Initialize the CLIP model (moved to CUDA) and its processor.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to("cuda")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 2. 构建图像库（批量提取图像特征）
def build_image_database(image_dir):
    """Embed every .jpg/.png image in ``image_dir`` and index the embeddings.

    Args:
        image_dir: Directory scanned (non-recursively) for image files.
            Extensions are matched case-insensitively.

    Returns:
        A tuple ``(index, image_paths)``: a ``faiss.IndexFlatL2`` holding one
        L2-normalized CLIP image embedding per file, and the file paths in
        the same order as the index rows.

    Raises:
        ValueError: If the directory contains no matching image.
    """
    # Sort so that row i of the index maps to the same file on every run;
    # os.listdir order is arbitrary.
    image_paths = sorted(
        os.path.join(image_dir, name)
        for name in os.listdir(image_dir)
        if name.lower().endswith((".jpg", ".png"))
    )
    if not image_paths:
        raise ValueError(f"no .jpg/.png images found in {image_dir!r}")

    image_embeds = []
    for img_path in image_paths:
        image = Image.open(img_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to("cuda")
        with torch.no_grad():
            embed = model.get_image_features(**inputs).cpu().numpy()
        # Unit-normalize so that L2 distance is a monotone function of
        # cosine similarity.
        embed = embed / np.linalg.norm(embed, axis=-1, keepdims=True)
        image_embeds.append(embed)

    # FAISS expects a contiguous float32 matrix.
    image_embeds = np.ascontiguousarray(np.vstack(image_embeds), dtype=np.float32)
    index = faiss.IndexFlatL2(image_embeds.shape[1])
    index.add(image_embeds)
    return index, image_paths

# 3. 文本检索图像
def text_to_image_search(query_text, index, image_paths, top_k=3):
    """Return the ``top_k`` images whose CLIP embedding is closest to ``query_text``.

    Args:
        query_text: Text query to embed with CLIP.
        index: FAISS L2 index over unit-normalized image embeddings, as
            returned by ``build_image_database``.
        image_paths: Paths aligned with the rows of ``index``.
        top_k: Maximum number of results.

    Returns:
        A list of ``{"image_path": str, "similarity": float}`` dicts ordered
        from most to least similar. ``similarity`` is the cosine similarity
        recovered from the index distance. Fewer than ``top_k`` entries are
        returned when the index holds fewer vectors.
    """
    # Truncate so that queries longer than CLIP's text context do not fail.
    inputs = processor(
        text=query_text, return_tensors="pt", padding=True, truncation=True
    ).to("cuda")
    with torch.no_grad():
        text_embed = model.get_text_features(**inputs).cpu().numpy()
    text_embed = text_embed / np.linalg.norm(text_embed, axis=-1, keepdims=True)
    text_embed = np.ascontiguousarray(text_embed, dtype=np.float32)

    distances, indices = index.search(text_embed, top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS pads missing neighbours with index -1; without this guard
        # image_paths[-1] would be returned as a bogus hit.
        if idx < 0:
            continue
        results.append({
            "image_path": image_paths[idx],
            # IndexFlatL2 reports *squared* L2 distance. For unit vectors
            # ||a - b||^2 = 2 - 2*cos(a, b), hence cos = 1 - dist / 2.
            "similarity": float(1.0 - dist / 2.0)
        })
    return results

# Demo entry point: index a directory of images and run one text query.
if __name__ == "__main__":
    image_dir = "./image_database"  # directory holding the image library
    query = "一只在雪地里玩耍的狗"
    # Build the image index.
    index, image_paths = build_image_database(image_dir)
    # Retrieve the three best matches.
    results = text_to_image_search(query, index, image_paths, top_k=3)
    # Print the query followed by one line per hit.
    print(f"查询文本：{query}")
    for i, res in enumerate(results, 1):
        print(f"Top-{i}：{res['image_path']}，相似度：{res['similarity']:.4f}")

def zero_shot_image_classification(image_path, class_labels):
    """Pick the label in ``class_labels`` whose CLIP text embedding best matches the image.

    Returns a ``(label, score)`` tuple where ``score`` is the highest cosine
    similarity between the image embedding and the label embeddings.
    """
    rgb_image = Image.open(image_path).convert("RGB")

    # One batch containing the single image and every candidate label.
    batch = processor(
        text=class_labels,
        images=rgb_image,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    batch = batch.to("cuda")

    with torch.no_grad():
        clip_output = model(**batch)

    # The image embedding broadcasts against the per-label text embeddings.
    scores = torch.nn.functional.cosine_similarity(
        clip_output.image_embeds, clip_output.text_embeds, dim=-1
    )
    best_index = torch.argmax(scores).item()
    best_score = torch.max(scores).item()
    return class_labels[best_index], best_score

# Demo: classify one image against five candidate labels.
image_path = "dog_snow.jpg"
class_labels = ["猫", "狗", "兔子", "狐狸", "熊"]  # textual class descriptions
pred_label, pred_score = zero_shot_image_classification(image_path, class_labels)
print(f"图像分类结果：{pred_label}，置信度：{pred_score:.4f}")

from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image

# 1. Initialize the BLIP-2 model and processor (Flan-T5 language model
#    variant), loading weights in float16 with automatic device placement.
# NOTE(review): trust_remote_code=True allows code shipped with the
# checkpoint repository to run; confirm it is actually needed here.
model_name = "Salesforce/blip2-flan-t5-xl"
processor = Blip2Processor.from_pretrained(model_name)
model = Blip2ForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# 2. 图像描述生成函数
def generate_image_caption(image_path, prompt="请描述这张图片的内容："):
    """Generate a caption for the image at ``image_path`` with the module-level BLIP-2 model.

    ``prompt`` is passed to the processor together with the image. Decoding
    uses sampling (temperature 0.7, top-p 0.9) and at most 100 new tokens.
    Returns the decoded text with special tokens removed.
    """
    rgb_image = Image.open(image_path).convert("RGB")

    model_inputs = processor(images=rgb_image, text=prompt, return_tensors="pt")
    model_inputs = model_inputs.to("cuda", torch.float16)

    # Sampling settings bound the length and randomness of the caption.
    sampling_options = {
        "max_new_tokens": 100,
        "temperature": 0.7,
        "top_p": 0.9,
        "do_sample": True,
    }
    generated = model.generate(**model_inputs, **sampling_options)

    return processor.decode(generated[0], skip_special_tokens=True)

# Demo: caption one image and print the result.
image_path = "mountain_lake.jpg"
caption = generate_image_caption(image_path)
print(f"图像描述：{caption}")

def multimodal_chat(image_path, chat_history):
    """Generate the assistant's reply to the latest user turn about an image.

    Args:
        image_path: Path of the image the conversation refers to.
        chat_history: Non-empty sequence of ``{"user": ..., "assistant": ...}``
            dicts. The last entry is the pending turn: its ``"user"`` text is
            the current query and its ``"assistant"`` value is ignored.

    Returns:
        The decoded model output with special tokens removed.

    Raises:
        IndexError: If ``chat_history`` is empty.
    """
    if not chat_history:
        raise IndexError("chat_history must contain at least one turn")
    # Only completed turns belong in the context. Rendering the pending turn
    # here as well (as the earlier version did) repeats the current query
    # after an empty assistant answer.
    *previous_turns, current_turn = chat_history

    image = Image.open(image_path).convert("RGB")

    parts = [
        f"用户：{turn['user']}\n助手：{turn['assistant']}\n"
        for turn in previous_turns
    ]
    # The prompt ends with an open assistant slot for the model to complete.
    parts.append(f"用户：{current_turn['user']}\n助手：")
    conversation = "".join(parts)

    inputs = processor(
        images=image,
        text=conversation,
        return_tensors="pt"
    ).to("cuda", torch.float16)
    outputs = model.generate(
        **inputs,
        max_new_tokens=150,
        temperature=0.6,
        top_p=0.8,
        do_sample=True
    )
    response = processor.decode(outputs[0], skip_special_tokens=True)
    return response

# Demo: a two-turn conversation about one image.
image_path = "city_traffic.jpg"
chat_history = [{"user": "这张图片展示了什么场景？", "assistant": ""},]
# First turn: ask, then store the reply in the history entry.
response1 = multimodal_chat(image_path, chat_history)
chat_history[0]["assistant"] = response1
print(f"用户：{chat_history[0]['user']}")
print(f"助手：{response1}\n")
# Second turn: append a follow-up question that builds on the first answer.
chat_history.append({"user": "图片中的车辆多吗？是否存在交通拥堵？", "assistant": ""})
response2 = multimodal_chat(image_path, chat_history)
chat_history[-1]["assistant"] = response2
print(f"用户：{chat_history[-1]['user']}")
print(f"助手：{response2}")

[
    {
        "image": "medical_images/ct_001.jpg",
        "conversations": [
            {"from": "human", "value": "请分析这张 CT 影像的异常情况？"},
            {"from": "gpt", "value": "这张胸部 CT 影像显示，右肺上叶存在一个直径约 1.2cm 的结节状高密度影，边界欠清晰，内部密度不均匀，建议进一步进行增强 CT 检查以明确性质。"}
        ]
    },
    {
        "image": "medical_images/ct_002.jpg",
        "conversations": [
            {"from": "human", "value": "这张 CT 影像是否存在胸腔积液？"},
            {"from": "gpt", "value": "影像显示双侧胸腔内未见明显液性低密度影，胸膜无增厚，暂不考虑胸腔积液情况。"}
        ]
    }
]

# Core dependencies
pip install torch transformers datasets accelerate peft bitsandbytes
# Image-processing dependencies
pip install pillow opencv-python
# Logging and visualization dependency
pip install wandb

import torch
from datasets import load_dataset
from transformers import (
    LlavaProcessor,
    LlavaForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model
import wandb

# 1. Start a Weights & Biases run for experiment tracking.
wandb.init(project="llava-medical-finetune", name="llava-7b-medical")

# 2. Load the JSON dataset and take its "train" split.
dataset = load_dataset("json", data_files="medical_chat_dataset.json")
train_dataset = dataset["train"]

# 3. Load the processor and the model (float16, automatic device placement).
# NOTE(review): confirm that "liuhaotian/LLaVA-7B-v1.5" is a checkpoint id
# loadable by LlavaForConditionalGeneration; this could not be verified here.
model_name = "liuhaotian/LLaVA-7B-v1.5"
processor = LlavaProcessor.from_pretrained(model_name)
model = LlavaForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# 4. LoRA configuration: adapters of rank 8 on the q_proj / v_proj modules,
#    so only a small subset of parameters is trained.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # prints the trainable-parameter ratio (about 0.1% per the original note)

# 5. 数据预处理函数
def preprocess_function(examples):
    """Turn a batch of image/conversation records into model-ready tensors.

    Args:
        examples: Batched dataset columns; ``examples["image"]`` is a list of
            image file paths and ``examples["conversations"]`` a list of
            conversations, each a list of ``{"from", "value"}`` dicts.

    Returns:
        A dict with stacked ``input_ids``, ``attention_mask``,
        ``pixel_values`` and ``labels`` tensors. In ``labels``, padding
        positions and the spans running from a user marker up to the next
        assistant marker are set to -100 so they do not contribute to the
        loss.
    """
    # Local import keeps this block self-contained.
    from PIL import Image

    # The marker token ids do not depend on the example; look them up once.
    tok = processor.tokenizer
    user_marker_id = tok.encode("用户：", add_special_tokens=False)[0]
    assistant_marker_id = tok.encode("助手：", add_special_tokens=False)[0]

    batch = {"input_ids": [], "attention_mask": [], "pixel_values": [], "labels": []}
    for img_path, convs in zip(examples["image"], examples["conversations"]):
        # Load the actual image; the processor needs pixels, not a path
        # string, and must see the raw image exactly once.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")

        # Render the conversation as alternating user / assistant lines.
        conversation_text = "".join(
            f"用户：{conv['value']}\n" if conv["from"] == "human"
            else f"助手：{conv['value']}\n"
            for conv in convs
        )

        # NOTE(review): some LlavaProcessor versions require an image
        # placeholder token in the text; confirm against the installed
        # transformers version.
        encoding = processor(
            text=conversation_text,
            images=image,
            return_tensors="pt",
            truncation=True,
            max_length=1024,
            padding="max_length"
        )

        input_ids = encoding["input_ids"].flatten()
        attention_mask = encoding["attention_mask"].flatten()
        label_ids = input_ids.clone()

        # Padding must never contribute to the loss.
        label_ids[attention_mask == 0] = -100

        # Mask each user span: from the user marker up to (not including)
        # the next assistant marker. -100 is ignored by the loss.
        for idx in torch.where(input_ids == user_marker_id)[0]:
            assistant_hits = torch.where(input_ids[idx:] == assistant_marker_id)[0]
            if len(assistant_hits) > 0:
                assistant_start = idx + assistant_hits[0]
                label_ids[idx:assistant_start] = -100

        batch["input_ids"].append(input_ids)
        batch["attention_mask"].append(attention_mask)
        # Drop only the leading batch axis; the remaining image layout is
        # what the vision tower consumes, so it must not be flattened.
        batch["pixel_values"].append(encoding["pixel_values"][0])
        batch["labels"].append(label_ids)

    return {key: torch.stack(values) for key, values in batch.items()}

# 6. Pre-process the dataset in batches of 4 and drop the raw columns so
#    only the outputs of preprocess_function remain.
tokenized_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=4,
    remove_columns=train_dataset.column_names
)

# 7. Data collator.
# DataCollatorForSeq2Seq receives the tokenizer through its `tokenizer`
# argument and has no `processor` or `truncation` parameter; passing those
# raises TypeError. Truncation is already applied during pre-processing.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=processor.tokenizer,
    model=model,
    padding=True,
    max_length=1024
)

# 8. Training configuration: per-device batch 4 with 4 accumulation steps,
#    learning rate 2e-4, 3 epochs, fp16, logging to W&B.
training_args = TrainingArguments(
    output_dir="./llava-medical-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=50,
    fp16=True,
    push_to_hub=False,
    report_to="wandb",
    gradient_checkpointing=True  # saves GPU memory
)

# 9. Build the Trainer and run training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)
trainer.train()

# 10. Save the fine-tuned model and the processor to the same directory,
#     then close the W&B run.
model.save_pretrained("./llava-medical-lora")
processor.save_pretrained("./llava-medical-lora")
wandb.finish()

# 加载微调后的模型
from peft import PeftModel
from transformers import LlavaForConditionalGeneration
from PIL import Image
import torch

# Reload the base model (reuses the `model_name` defined earlier in the
# file), attach the saved LoRA adapter, and fold it into the base weights.
base_model = LlavaForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
finetuned_model = PeftModel.from_pretrained(base_model, "./llava-medical-lora")
finetuned_model = finetuned_model.merge_and_unload()  # merge the LoRA weights

# 验证函数
def test_medical_chat(image_path, query):
    """Ask the fine-tuned model ``query`` about the image at ``image_path``.

    The prompt is the query wrapped in the user / assistant template.
    Generation samples (temperature 0.6, top-p 0.8) up to 200 new tokens.
    Any occurrence of the prompt is removed from the decoded text before it
    is returned.
    """
    prompt_text = f"用户：{query}\n助手："
    rgb_image = Image.open(image_path).convert("RGB")

    batch = processor(text=prompt_text, images=rgb_image, return_tensors="pt")
    batch = batch.to("cuda", torch.float16)

    generated = finetuned_model.generate(
        **batch,
        max_new_tokens=200,
        temperature=0.6,
        top_p=0.8,
        do_sample=True,
    )
    decoded = processor.decode(generated[0], skip_special_tokens=True)

    # Strip the echoed prompt so only the answer remains.
    return decoded.replace(prompt_text, "")

# Demo: ask one question about a test CT image and print the exchange.
image_path = "medical_images/ct_test.jpg"
query = "这张 CT 影像的肺部是否存在异常？请详细说明。"
response = test_medical_chat(image_path, query)
print(f"用户：{query}")
print(f"助手：{response}")

# transformers has no BioBERT* or MedicalViT* classes, so importing them
# fails. The Auto classes resolve the concrete architecture from each
# checkpoint's own configuration instead.
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
import torch
import torch.nn as nn
from PIL import Image

# 1. Initialize the per-modality encoders.
# Medical text encoder (BioBERT checkpoint).
text_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")
text_encoder = AutoModel.from_pretrained("dmis-lab/biobert-v1.1").to("cuda")

# Medical image encoder.
# NOTE(review): confirm that the "microsoft/medical-vit-base" checkpoint
# exists; it could not be verified from this file.
image_processor = AutoImageProcessor.from_pretrained("microsoft/medical-vit-base")
image_encoder = AutoModel.from_pretrained("microsoft/medical-vit-base").to("cuda")

# 2. 多模态融合模型
class MedicalFusionModel(nn.Module):
    """Fuse a text feature and an image feature and classify the result.

    The two features are concatenated, passed as a length-1 sequence through
    a multi-head self-attention layer, and fed to a two-layer classifier
    that outputs ``num_classes`` logits.
    """

    def __init__(self, text_dim=768, image_dim=768, num_classes=10):
        super().__init__()
        fused_dim = text_dim + image_dim
        # Attention over the concatenated feature.
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=fused_dim, num_heads=8, batch_first=True
        )
        # Classification head.
        head_layers = [
            nn.Linear(fused_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, text_feat, image_feat):
        """Return classification logits for the given text / image features."""
        joint = torch.cat((text_feat, image_feat), dim=-1)
        # Insert a sequence axis of length 1: (batch, 1, dim).
        tokens = joint.unsqueeze(1)
        attended = self.cross_attention(tokens, tokens, tokens)[0]
        return self.classifier(attended.squeeze(1))

# 3. Build the fusion model on CUDA, load its trained weights from disk and
#    switch it to evaluation mode.
fusion_model = MedicalFusionModel(num_classes=10).to("cuda")
fusion_model.load_state_dict(torch.load("./medical_fusion_model.pth"))
fusion_model.eval()

# 4. 诊断函数
def medical_diagnosis(image_path, medical_record):
    """Return the disease labels predicted from a CT image plus a medical record.

    The record text and the image are encoded by the module-level encoders
    (hidden state at sequence position 0 of each), fused by ``fusion_model``,
    and every label whose sigmoid score exceeds 0.5 is returned (multi-label
    classification).
    """
    # Example mapping from output position to disease name.
    disease_labels = ["肺炎", "肺结核", "肺癌", "胸腔积液", "气胸", "心肌炎", "心力衰竭", "肝硬化", "肾炎", "糖尿病"]

    # Electronic medical record: fixed length of 128 tokens.
    record_tokens = text_tokenizer(
        medical_record,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length",
    ).to("cuda")

    # CT image.
    ct_image = Image.open(image_path).convert("RGB")
    ct_pixels = image_processor(ct_image, return_tensors="pt").to("cuda")

    with torch.no_grad():
        text_feat = text_encoder(**record_tokens).last_hidden_state[:, 0, :]
        image_feat = image_encoder(**ct_pixels).last_hidden_state[:, 0, :]
        logits = fusion_model(text_feat, image_feat)

    positive_flags = (torch.sigmoid(logits) > 0.5)[0].tolist()
    return [
        disease_labels[position]
        for position, is_positive in enumerate(positive_flags)
        if is_positive
    ]

# Demo: run the fused diagnosis on one CT image and one record.
image_path = "ct_lung_inflammation.jpg"
medical_record = "患者男性，56 岁，咳嗽、咳痰 1 周，伴发热（体温 38.5℃），既往有吸烟史 20 年，无其他慢性病史。"
diagnosis = medical_diagnosis(image_path, medical_record)
print(f"融合诊断结果：{diagnosis}")

def generate_medical_report(image_path, medical_record):
    """Generate a structured diagnostic report from a CT image and a medical record.

    Args:
        image_path: Path of the CT image.
        medical_record: Free-text electronic medical record inserted into the prompt.

    Returns:
        The decoded model output with special tokens removed.
    """
    # Load the fine-tuned medical BLIP-2 model and its processor.
    # NOTE: this reloads the model on every call; callers that generate many
    # reports may want to load it once outside this function.
    model = Blip2ForConditionalGeneration.from_pretrained("./blip2-medical-finetuned").to("cuda")
    processor = Blip2Processor.from_pretrained("./blip2-medical-finetuned")

    image = Image.open(image_path).convert("RGB")
    prompt = f"电子病历：{medical_record}\n请结合 CT 影像，生成结构化诊断报告，包含影像表现、诊断结论、治疗建议三部分："

    # Cast floating-point inputs to the precision the model was loaded in,
    # rather than assuming float16.
    inputs = processor(
        images=image,
        text=prompt,
        return_tensors="pt"
    ).to("cuda", model.dtype)

    # Deterministic decoding. `temperature` / `top_p` only apply when
    # sampling, so they are not passed alongside do_sample=False.
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        do_sample=False
    )
    report = processor.decode(outputs[0], skip_special_tokens=True)
    return report

# Demo: generate and print a report, reusing the `image_path` and
# `medical_record` values bound earlier in the file.
report = generate_medical_report(image_path, medical_record)
print("结构化诊断报告：")
print(report)

from ultralytics import YOLO
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import torch.nn as nn
from PIL import Image

# 1. Initialize the models.
# Defect-detection model (YOLOv8 weights file), moved to CUDA.
yolo_model = YOLO("yolov8n-industrial-defect.pt").to("cuda")

# Text encoder for production logs ("bert-base-chinese"), moved to CUDA.
text_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
text_encoder = BertModel.from_pretrained("bert-base-chinese").to("cuda")

# 融合分类器（判断缺陷是否为致命性）
class DefectSeverityClassifier(nn.Module):
    """Classify defect severity from a concatenated image + text feature."""

    def __init__(self, image_dim=1024, text_dim=768, num_classes=2):
        super().__init__()
        fused_dim = image_dim + text_dim
        layers = [
            nn.Linear(fused_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, image_feat, text_feat):
        """Return logits for the image feature concatenated with the text feature."""
        joint = torch.cat((image_feat, text_feat), dim=-1)
        return self.fc(joint)

# Build the severity classifier on CUDA, load its trained weights from disk
# and switch it to evaluation mode.
severity_model = DefectSeverityClassifier().to("cuda")
severity_model.load_state_dict(torch.load("./defect_severity_classifier.pth"))
severity_model.eval()

# 2. 缺陷检测与严重程度判断
def industrial_defect_detection(image_path, production_log):
    """Detect defects in an image and rate the severity of the first one.

    Args:
        image_path: Path of the product image passed to the YOLO model.
        production_log: Production-log text used as context for the rating.

    Returns:
        A list of dicts with keys ``"type"``, ``"position"`` (x1, y1, x2, y2)
        and ``"confidence"``, one per detected box. When a severity could be
        computed, the first dict additionally carries ``"severity"``.
    """
    import warnings

    # Image defect detection.
    results = yolo_model(image_path)
    defects = []
    for r in results:
        # Each row is (x1, y1, x2, y2, conf, cls).
        for x1, y1, x2, y2, conf, cls in r.boxes.data.cpu().numpy():
            defects.append({
                "type": yolo_model.names[int(cls)],
                "position": (x1, y1, x2, y2),
                "confidence": conf
            })

    # Nothing to rate: skip the text encoder entirely.
    if not defects:
        return defects

    # Simplified image feature: the result's class-probability vector. It
    # is absent (None) when the model produces no classification
    # probabilities; the severity is then left unset instead of crashing on
    # `None.data`.
    probs = results[0].probs
    if probs is None:
        warnings.warn(
            "YOLO result has no class probabilities; defect severity was not evaluated",
            RuntimeWarning,
        )
        return defects

    # Production-log text feature (hidden state at sequence position 0).
    text_inputs = text_tokenizer(
        production_log,
        return_tensors="pt",
        truncation=True,
        max_length=64,
        padding="max_length"
    ).to("cuda")
    with torch.no_grad():
        text_feat = text_encoder(**text_inputs).last_hidden_state[:, 0, :]

    # Severity of the first defect only. Class index 1 means "fatal".
    image_feat = torch.tensor(probs.data.cpu().numpy()).unsqueeze(0).to("cuda")
    with torch.no_grad():
        logits = severity_model(image_feat.float(), text_feat)
    severity = "致命缺陷" if torch.argmax(logits).item() == 1 else "非致命缺陷"
    defects[0]["severity"] = severity
    return defects

# Demo: run detection on one part image and print every defect found.
image_path = "part_surface.jpg"
production_log = "生产批次：20240510，原材料：铝合金，加工工艺：冲压 + 喷涂，设备运行正常，无异常报警。"
defects = industrial_defect_detection(image_path, production_log)
print("产品缺陷检测结果：")
for defect in defects:
    print(f"缺陷类型：{defect['type']}，位置：{defect['position']}，置信度：{defect['confidence']:.4f}，严重程度：{defect.get('severity','未评估')}")

import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
from PIL import Image
import torch

# 1. 传感器数据预处理
def preprocess_sensor_data(csv_path, window_size=10):
    """Load sensor readings from a CSV and cut them into sliding windows.

    Args:
        csv_path: CSV file with "temperature", "vibration" and "voltage" columns.
        window_size: Number of consecutive rows per window (stride 1).

    Returns:
        A float32 CUDA tensor of shape
        ``(num_windows, 1, window_size, 3)`` where
        ``num_windows = rows - window_size + 1``, so the final window ends
        on the most recent reading.

    Raises:
        ValueError: If the CSV has fewer than ``window_size`` rows.
    """
    # Time-ordered readings: temperature, vibration, voltage.
    df = pd.read_csv(csv_path)
    sensor_data = df[["temperature", "vibration", "voltage"]].values
    if len(sensor_data) < window_size:
        raise ValueError(
            f"need at least {window_size} sensor rows, got {len(sensor_data)}"
        )

    # Standardize each column.
    # NOTE(review): the scaler is fitted on the data being predicted on;
    # confirm this matches how the predictor was trained.
    scaler = StandardScaler()
    sensor_data = scaler.fit_transform(sensor_data)

    series = torch.as_tensor(sensor_data, dtype=torch.float32)
    # unfold yields (num_windows, 3, window_size); swap the last two axes to
    # get (num_windows, window_size, 3).
    windows = series.unfold(0, window_size, 1).transpose(1, 2).contiguous()
    return windows.unsqueeze(1).to("cuda")

# 2. 设备图像特征提取
def extract_device_image_feat(image_path):
    """Return the image encoder's hidden state at sequence position 0 for a device photo.

    Uses the module-level ``image_processor`` and ``image_encoder``. The
    batch axis is kept.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    pixel_inputs = image_processor(rgb_image, return_tensors="pt")
    pixel_inputs = pixel_inputs.to("cuda")
    with torch.no_grad():
        hidden_states = image_encoder(**pixel_inputs).last_hidden_state
    return hidden_states[:, 0, :]

# 3. 故障预测模型
class EquipmentFaultPredictor(nn.Module):
    """Predict a fault class from sensor windows plus an image feature.

    A 1-D convolution over the time axis extracts 64 channels from each
    sensor window; these are average-pooled over time, concatenated with the
    image feature and classified by a two-layer head.
    """

    def __init__(self, sensor_dim=3, window_size=10, image_dim=768, num_classes=3):
        super().__init__()
        # Temporal feature extractor over the sensor channels.
        # `window_size` is accepted for interface compatibility; the
        # adaptive pooling below makes the layers independent of it.
        self.tcn = nn.Conv1d(sensor_dim, 64, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(1)
        # Fusion and classification head.
        self.fusion = nn.Sequential(
            nn.Linear(64 + image_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, sensor_data, image_feat):
        """Return fault logits of shape ``(batch, num_classes)``.

        Args:
            sensor_data: ``(batch, 1, window, sensor_dim)`` or
                ``(batch, window, sensor_dim)`` tensor.
            image_feat: ``(1, image_dim)`` tensor shared by all windows, or
                ``(batch, image_dim)`` with one row per window.
        """
        # Conv1d takes (batch, channels, length). Drop the singleton axis
        # *before* the convolution; feeding the 4-D tensor raises.
        if sensor_data.dim() == 4:
            sensor_data = sensor_data.squeeze(1)
        sensor_feat = self.tcn(sensor_data.transpose(1, 2))  # (batch, 64, window)
        sensor_feat = self.pool(sensor_feat).squeeze(-1)  # (batch, 64)

        # Share a single image feature across all windows; leave an
        # already-batched feature untouched.
        if image_feat.shape[0] != sensor_feat.shape[0]:
            image_feat = image_feat.expand(sensor_feat.shape[0], -1)

        fusion_feat = torch.cat([sensor_feat, image_feat], dim=-1)
        logits = self.fusion(fusion_feat)
        return logits

# 4. 预测函数
def predict_equipment_fault(image_path, sensor_csv_path):
    """Predict the fault type and a rough remaining life for one device.

    Args:
        image_path: Path of the device photo.
        sensor_csv_path: Path of the CSV with the device's sensor readings.

    Returns:
        A tuple ``(fault_type, remaining_life)``: the fault name and the
        simplified remaining-life estimate ``100 - 30 * class_index``.
    """
    # Pre-process both modalities.
    sensor_data = preprocess_sensor_data(sensor_csv_path)
    image_feat = extract_device_image_feat(image_path)

    # Load the predictor weights.
    predictor = EquipmentFaultPredictor().to("cuda")
    predictor.load_state_dict(torch.load("./equipment_fault_predictor.pth"))
    predictor.eval()

    with torch.no_grad():
        logits = predictor(sensor_data, image_feat)
    # The predictor returns one row of logits per sensor window, so calling
    # `.item()` on the argmax over all rows fails for more than one window.
    # The last row (the final window produced by pre-processing) is used as
    # the prediction.
    pred = torch.argmax(logits[-1]).item()

    # Fault-type mapping.
    # NOTE(review): the list has four entries while EquipmentFaultPredictor
    # defaults to num_classes=3, so the last entry cannot be predicted with
    # the default configuration.
    fault_types = ["正常", "轴承磨损", "电机过热", "电路故障"]
    fault_type = fault_types[pred]
    # Simplified remaining-life estimate: a higher class index gives a
    # shorter remaining life.
    remaining_life = 100 - (pred * 30)
    return fault_type, remaining_life

# Demo: predict the fault for one motor from its photo and sensor CSV.
image_path = "equipment_motor.jpg"
sensor_csv_path = "motor_sensor_data.csv"
fault_type, remaining_life = predict_equipment_fault(image_path, sensor_csv_path)
print(f"设备故障预测：{fault_type}，预计剩余使用寿命：{remaining_life}小时")

from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch
from PIL import Image

# Load the fine-tuned education BLIP-2 model (moved to CUDA) and its
# processor from a local directory.
# NOTE(review): no torch_dtype is given here, while math_qa casts its inputs
# to float16; confirm the intended precision.
model = Blip2ForConditionalGeneration.from_pretrained("./blip2-education-finetuned").to("cuda")
processor = Blip2Processor.from_pretrained("./blip2-education-finetuned")

def math_qa(image_path, question):
    """Answer a question about the math formula shown in an image.

    Args:
        image_path: Path of the formula image (handwritten or printed).
        question: Question text inserted into the prompt.

    Returns:
        The decoded model output with special tokens removed.
    """
    image = Image.open(image_path).convert("RGB")
    prompt = f"请结合以下数学公式图像，解答问题：{question}\n要求：步骤清晰，给出最终答案。"

    # Cast floating-point inputs to the precision the module-level model was
    # loaded in, rather than assuming float16.
    inputs = processor(
        images=image,
        text=prompt,
        return_tensors="pt"
    ).to("cuda", model.dtype)

    # Deterministic decoding. `temperature` / `top_p` only apply when
    # sampling, so they are not passed alongside do_sample=False.
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        do_sample=False
    )
    answer = processor.decode(outputs[0], skip_special_tokens=True)
    return answer

# Demo: ask one question about a formula image and print the answer.
image_path = "math_formula.jpg"  # image content per the original note: a system of two linear equations
question = "求解该方程组的解？"
answer = math_qa(image_path, question)
print(f"问题：{question}")
print(f"解答：{answer}")

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, BertTokenizer, BertForSequenceClassification
import torch
import soundfile as sf

# 1. Speech-to-text (ASR) processor and CTC model, model moved to CUDA.
# NOTE(review): confirm that "facebook/wav2vec2-base-english" exists on the
# Hub; an earlier section of this file uses "facebook/wav2vec2-base-960h".
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-english")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-english").to("cuda")

# 2. Spoken-language evaluation model (pronunciation accuracy, fluency),
#    loaded from a local directory and set to evaluation mode.
# NOTE(review): confirm that "bert-base-english" is a valid checkpoint id.
eval_tokenizer = BertTokenizer.from_pretrained("bert-base-english")
eval_model = BertForSequenceClassification.from_pretrained("./speech_eval_model").to("cuda")
eval_model.eval()

def speech_evaluation(audio_path, reference_text):
    """Transcribe a recording and score it against a reference text.

    Args:
        audio_path: Path of the audio file to evaluate.
        reference_text: The text the speaker was expected to read.

    Returns:
        A dict with ``"transcribed_text"``, ``"pronunciation_score"`` and
        ``"fluency_score"``; both scores are sigmoid outputs scaled to 0-100.
    """
    # Step 1: speech to text.
    audio, sr = sf.read(audio_path)
    # soundfile returns a (frames, channels) array for multi-channel files;
    # down-mix to a mono 1-D waveform.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # Pass the file's real sampling rate so a mismatch is reported by the
    # processor instead of being silently mis-transcribed.
    inputs = asr_processor(audio, sampling_rate=sr, return_tensors="pt").to("cuda")
    with torch.no_grad():
        logits = asr_model(**inputs).logits
        pred_ids = torch.argmax(logits, dim=-1)
        transcribed_text = asr_processor.decode(pred_ids[0], skip_special_tokens=True)

    # Step 2: evaluation. The classifier sees the reference text and the
    # transcription in a single input.
    eval_input = f"参考文本：{reference_text}\n转录文本：{transcribed_text}"
    eval_inputs = eval_tokenizer(
        eval_input,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    ).to("cuda")
    with torch.no_grad():
        outputs = eval_model(**eval_inputs)
        # Index 0: pronunciation accuracy, index 1: fluency.
        scores = torch.sigmoid(outputs.logits)[0]

    return {
        "transcribed_text": transcribed_text,
        "pronunciation_score": scores[0].item() * 100,  # 0-100 points
        "fluency_score": scores[1].item() * 100
    }

# Demo: evaluate one student recording against its reference sentence.
audio_path = "student_speech.wav"  # the student reads: "The quick brown fox jumps over the lazy dog"
reference_text = "The quick brown fox jumps over the lazy dog"
evaluation = speech_evaluation(audio_path, reference_text)
print(f"转录文本：{evaluation['transcribed_text']}")
print(f"发音准确性得分：{evaluation['pronunciation_score']:.1f}分")
print(f"流畅度得分：{evaluation['fluency_score']:.1f}分")

from diffusers import StableDiffusionPipeline
import torch

# Load the Stable Diffusion pipeline in float16 and move it to CUDA.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16
).to("cuda")

def news_image_generation(news_text, style="photorealistic"):
    """Generate an illustration for ``news_text`` and save it as ./news_image.png.

    The prompt combines the requested style, fixed photography keywords and
    the news text. Returns the path of the saved image.
    """
    output_path = "./news_image.png"
    prompt = (
        "{} style, news photography, {}, high resolution, detailed, realistic lighting"
    ).format(style, news_text)

    generation_options = {
        "num_inference_steps": 50,
        "guidance_scale": 7.5,
        "width": 512,
        "height": 384,
    }
    generated_images = pipe(prompt, **generation_options).images

    # Keep the first generated image only.
    generated_images[0].save(output_path)
    return output_path

# Demo: generate an illustration for one news sentence.
news_text = "2024 年夏季奥运会开幕式在巴黎举行，运动员们入场，现场观众欢呼雀跃"
image_path = news_image_generation(news_text, style="photorealistic")
print(f"新闻配图生成完成，路径：{image_path}")

from transformers import ViTForImageClassification, ViTImageProcessor, BertForSequenceClassification, BertTokenizer
import torch
from PIL import Image

# 1. Image-violation detection model.
# The processor uses the same ViT checkpoint id as the image encoder earlier
# in this file ("google/vit-base-patch16-224"); the previous
# "facebook/..." id was inconsistent with it.
image_audit_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
image_audit_model = ViTForImageClassification.from_pretrained("./image_audit_model").to("cuda")

# 2. Text-violation detection model.
text_audit_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
text_audit_model = BertForSequenceClassification.from_pretrained("./text_audit_model").to("cuda")

def multimodal_content_audit(image_path, text):
    """Audit an image and its accompanying text for policy violations.

    Each modality is classified separately; class index 0 means compliant,
    anything else a violation. The combined verdict is compliant only when
    both modalities are. Returns a dict with the keys ``"image_audit"``,
    ``"text_audit"`` and ``"final_result"``.
    """
    compliant, violating = "合规", "违规"

    # Image check.
    rgb_image = Image.open(image_path).convert("RGB")
    image_batch = image_audit_processor(rgb_image, return_tensors="pt").to("cuda")
    with torch.no_grad():
        image_logits = image_audit_model(**image_batch).logits
        image_class = torch.argmax(image_logits, dim=-1).item()
    image_verdict = compliant if image_class == 0 else violating

    # Text check.
    text_batch = text_audit_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=64,
        padding="max_length",
    ).to("cuda")
    with torch.no_grad():
        text_logits = text_audit_model(**text_batch).logits
        text_class = torch.argmax(text_logits, dim=-1).item()
    text_verdict = compliant if text_class == 0 else violating

    # A single violating modality fails the whole item.
    if violating in (image_verdict, text_verdict):
        overall = violating
    else:
        overall = compliant

    return {
        "image_audit": image_verdict,
        "text_audit": text_verdict,
        "final_result": overall,
    }

# Demo: audit one advertisement image together with its text.
image_path = "advertisement_image.jpg"
text = "这款保健品能治愈癌症，无效退款，立即购买享 5 折优惠！"
audit_result = multimodal_content_audit(image_path, text)
print(f"内容审核结果：{audit_result['final_result']}")
print(f"图像审核：{audit_result['image_audit']}")
print(f"文本审核：{audit_result['text_audit']}")

import onnxruntime as ort
from transformers import CLIPProcessor

# Load the ONNX text-encoder model, requesting the CUDA execution provider.
session = ort.InferenceSession("clip-text-encoder.onnx", providers=["CUDAExecutionProvider"])

# 推理函数
def onnx_clip_text_encode(text, processor):
    """Encode ``text`` with the module-level ONNX session and return its first output.

    ``processor`` tokenizes the text into NumPy arrays; the ``input_ids``
    and ``attention_mask`` arrays are fed to the session under those names.
    """
    encoded = processor(text=text, return_tensors="np")
    feed = {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }
    # Passing None as the output list requests all model outputs.
    return session.run(None, feed)[0]

# Base image (includes CUDA 11.7)
FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04

# Set the working directory
WORKDIR /app

# Install system packages; the apt lists are removed in the same layer
RUN apt-get update && apt-get install -y \
    python3-pip \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*

# Install the Python dependencies
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the service code and the model files
COPY app.py .
COPY ./model /app/model

# Expose the service port
EXPOSE 8000

# Start command: uvicorn serving app:app on port 8000 with 4 workers
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]

from fastapi import FastAPI, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
from PIL import Image
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Initialise the FastAPI application
app = FastAPI(title="多模态图文对话服务")

# Configure CORS: every origin, method and header is accepted
# NOTE(review): a wildcard origin combined with allow_credentials=True lets any
# site issue credentialed requests; if the service ever relies on cookies or
# auth headers, list the allowed origins explicitly - confirm intended.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the processor and the model from ./model once, at import time.
# The weights are loaded in float16 and placed by device_map="auto".
processor = Blip2Processor.from_pretrained("./model")
model = Blip2ForConditionalGeneration.from_pretrained(
    "./model",
    torch_dtype=torch.float16,
    device_map="auto"
)

# Image + text chat endpoint
@app.post("/multimodal_chat")
async def multimodal_chat(
    image: UploadFile = File(...),
    question: str = Form(...)
):
    """Answer ``question`` about the uploaded ``image`` with the BLIP-2 model.

    Args:
        image: Uploaded image file (multipart form field).
        question: Question text (form field).

    Returns:
        A dict with the original ``question``, the decoded model ``response``
        and ``status`` set to ``"success"``.

    Raises:
        HTTPException: 400 when the upload cannot be decoded as an image.
    """
    # Imported locally so this block does not depend on the file-level import list
    from fastapi import HTTPException

    # Decode the upload; a separate name avoids shadowing the UploadFile parameter
    try:
        pil_image = Image.open(image.file).convert("RGB")
    except OSError as exc:  # includes PIL.UnidentifiedImageError
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a valid image"
        ) from exc
    # Preprocess; follow the device the model was actually placed on
    # (device_map="auto") instead of hard-coding "cuda"
    inputs = processor(
        images=pil_image,
        text=question,
        return_tensors="pt"
    ).to(model.device, torch.float16)
    # Generate; temperature/top_p only take effect when sampling is enabled
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.6,
        top_p=0.8
    )
    # Decode the first generated sequence
    response = processor.decode(outputs[0], skip_special_tokens=True)
    return {
        "question": question,
        "response": response,
        "status": "success"
    }


# Health endpoint: the Kubernetes liveness/readiness probes in the deployment
# manifest request GET /health on port 8000
@app.get("/health")
async def health():
    """Return a constant payload so HTTP probes can verify the process is serving."""
    return {"status": "ok"}

# docker-compose definition for the multimodal chat service
version: "3"
services:
  multimodal-chat-service:
    # Build the image from the Dockerfile in the current directory
    build: .
    ports:
      # host port 8000 -> container port 8000
      - "8000:8000"
    deploy:
      resources:
        reservations:
          # Reserve one NVIDIA GPU for the container
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    volumes:
      # Bind-mount the host model and log directories into the container
      # NOTE(review): /app/model is also populated by COPY in the Dockerfile;
      # this mount presumably hides the copied files at runtime - confirm which source is intended.
      - ./model:/app/model
      - ./logs:/app/logs
    environment:
      # Restrict CUDA to device index 0
      - CUDA_VISIBLE_DEVICES=0

# 1. Build the image
docker-compose build
# 2. Start the service in the background
docker-compose up -d
# 3. Follow the logs
docker-compose logs -f

# Deployment: runs the multimodal chat service pods in the ai-service namespace
apiVersion: apps/v1
kind: Deployment
metadata:
  name: multimodal-chat-deployment
  namespace: ai-service
spec:
  # Replica count declared by this manifest
  replicas: 3
  selector:
    matchLabels:
      app: multimodal-chat
  template:
    metadata:
      labels:
        app: multimodal-chat
    spec:
      containers:
        - name: multimodal-chat-container
          image: my-harbor.com/ai/multimodal-chat:v1.0
          # One GPU per pod; CPU/memory requests are 8 / 32Gi with limits of 16 / 64Gi
          resources:
            limits:
              nvidia.com/gpu: 1
              cpu: "16"
              memory: "64Gi"
            requests:
              nvidia.com/gpu: 1
              cpu: "8"
              memory: "32Gi"
          ports:
            - containerPort: 8000
          # Both probes issue GET /health on port 8000.
          # NOTE(review): the application must serve a /health route, otherwise the
          # liveness probe fails and the container is restarted - verify the route exists.
          livenessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 60
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 30
            periodSeconds: 5
          # Model files and logs are mounted from persistent volume claims
          volumeMounts:
            - name: model-storage
              mountPath: /app/model
            - name: log-storage
              mountPath: /app/logs
      volumes:
        - name: model-storage
          persistentVolumeClaim:
            claimName: multimodal-model-pvc
        - name: log-storage
          persistentVolumeClaim:
            claimName: multimodal-log-pvc
---
# Service: exposes the pods labelled app=multimodal-chat through a LoadBalancer
apiVersion: v1
kind: Service
metadata:
  name: multimodal-chat-service
  namespace: ai-service
spec:
  type: LoadBalancer
  ports:
    # Service port 80 forwards to container port 8000
    - port: 80
      targetPort: 8000
  selector:
    app: multimodal-chat
---
# HorizontalPodAutoscaler: scales multimodal-chat-deployment between 2 and 10 replicas
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: multimodal-chat-hpa
  namespace: ai-service
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: multimodal-chat-deployment
  # NOTE(review): the Deployment manifest declares replicas: 3 while the minimum here is 2 - confirm intended.
  minReplicas: 2
  maxReplicas: 10
  # Scaling signals: average CPU utilization target 70%, average memory utilization target 80%
  # NOTE(review): no GPU metric is used although the workload requests a GPU - confirm CPU/memory track load.
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

# Create the namespace
kubectl create namespace ai-service
# Deploy the PVCs
kubectl apply -f multimodal-pvc.yaml
# Deploy the application
kubectl apply -f multimodal-deployment.yaml
# Check the rollout: pods and services in the namespace
kubectl get pods -n ai-service
kubectl get svc -n ai-service

from ultralytics import YOLO
import cv2
import numpy as np

# Load the fine-tuned industrial defect detection model from a local checkpoint
model = YOLO("./yolov8-industrial-defect.pt")

def image_anomaly_detection(image_path, infrared_image_path=None):
    """Detect surface defects in a visible-light image and, optionally, hot spots
    in an infrared image.

    Args:
        image_path: Path of the visible-light image.
        infrared_image_path: Optional path of a grayscale infrared image.

    Returns:
        A tuple ``(defects, temperature_anomaly, image, infrared_image)``:
        ``defects`` is a list of dicts with ``type``, ``position``
        ``(x1, y1, x2, y2)`` and ``confidence``; ``temperature_anomaly`` is a
        list holding at most one summary dict; ``image`` is the BGR frame with
        the defect boxes drawn on it; ``infrared_image`` is the annotated
        infrared frame, or ``None`` when no infrared path was given.

    Raises:
        OSError: If either image cannot be read.
    """
    # Load the visible-light image (cv2 returns None instead of raising)
    image = cv2.imread(image_path)
    if image is None:
        raise OSError(f"Failed to read image: {image_path}")
    # Defect detection. Ultralytics treats numpy input as BGR (the cv2 layout),
    # so the frame is passed as loaded rather than converted to RGB.
    results = model(image, conf=0.7)
    defects = []
    for r in results:
        boxes = r.boxes.data.cpu().numpy()  # rows: (x1, y1, x2, y2, ..., conf, cls)
        for box in boxes:
            # Only the coordinates are truncated to int; casting the whole row
            # would also truncate the confidence to 0.
            x1, y1, x2, y2 = (int(v) for v in box[:4])
            conf = float(box[-2])
            cls = int(box[-1])
            defect_type = model.names[cls]
            # Draw the defect box and its label
            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 0, 255), 2)
            cv2.putText(
                image,
                f"{defect_type}{conf:.2f}",
                (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.6,
                (0, 0, 255),
                2
            )
            defects.append({
                "type": defect_type,
                "position": (x1, y1, x2, y2),
                "confidence": conf
            })
    # Infrared temperature anomaly detection (optional)
    temperature_anomaly = []
    # Bound up front so the return statement is valid without an infrared image
    infrared_image = None
    if infrared_image_path:
        infrared_image = cv2.imread(infrared_image_path, cv2.IMREAD_GRAYSCALE)
        if infrared_image is None:
            raise OSError(f"Failed to read image: {infrared_image_path}")
        # Temperature calibration, assuming a linear mapping T = a * pixel + b
        a = 0.5  # calibration coefficient (must be calibrated for the real sensor)
        b = -50
        # NOTE(review): with these placeholder constants an 8-bit pixel maps to
        # at most 0.5 * 255 - 50 = 77.5, so the 80 degree threshold below cannot
        # trigger until real calibration values are supplied.
        temperature_map = a * infrared_image + b
        # Temperature threshold (normal operating temperature <= 80 degrees C)
        hot_rows, hot_cols = np.where(temperature_map > 80)
        if hot_rows.size > 0:
            temperature_anomaly.append({
                "type": "温度异常",
                "max_temperature": float(temperature_map[hot_rows, hot_cols].max()),
                "region_count": len(hot_rows)
            })
        # Mark every hot pixel; plain ints keep cv2's argument parsing happy
        for y, x in zip(hot_rows, hot_cols):
            cv2.circle(infrared_image, (int(x), int(y)), 2, (255), -1)
    return defects, temperature_anomaly, image, infrared_image

import torch
import torch.nn as nn
import soundfile as sf
from sklearn.preprocessing import StandardScaler
import numpy as np
from python_speech_features import mfcc

# Audio anomaly classifier: Conv1d stack -> bidirectional LSTM -> linear head
class AudioAnomalyDetector(nn.Module):
    """Classify a sequence of per-frame audio features into ``num_classes`` classes.

    Args:
        input_dim: Number of features per frame (channels of the first Conv1d).
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=40, hidden_dim=128, num_classes=5):
        super().__init__()
        # Two conv/ReLU/pool stages; each MaxPool1d(2) halves the time axis
        conv_stack = [
            nn.Conv1d(input_dim, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
        ]
        self.cnn = nn.Sequential(*conv_stack)
        self.lstm = nn.LSTM(128, hidden_dim, batch_first=True, bidirectional=True)
        # Both LSTM directions are concatenated, hence hidden_dim * 2 inputs
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_dim) to logits of shape (batch, num_classes)."""
        # Conv1d wants channels first: (batch, input_dim, seq_len)
        features = self.cnn(x.permute(0, 2, 1))  # (batch, 128, seq_len/4)
        # Back to (batch, seq_len/4, 128) for the batch_first LSTM
        sequence, _ = self.lstm(features.permute(0, 2, 1))
        # Classify from the output at the last time step
        return self.fc(sequence[:, -1])

# Instantiate the detector on the GPU, load weights from
# ./audio_anomaly_detector.pth and switch to evaluation mode
# NOTE(review): this rebinds the module-level name `model`, which an earlier
# snippet in this file binds to the YOLO detector - confirm the snippets are
# meant to live in separate modules.
model = AudioAnomalyDetector().to("cuda")
model.load_state_dict(torch.load("./audio_anomaly_detector.pth"))
model.eval()

# Audio preprocessing and anomaly detection
def audio_anomaly_detection(audio_path):
    """Classify the sound in ``audio_path`` with the module-level ``model``.

    Args:
        audio_path: Path of an audio file readable by ``soundfile``.

    Returns:
        A dict with ``type`` (label of the predicted class), ``is_anomaly``
        (True for every class except index 0) and ``confidence`` (softmax
        probability of the predicted class).
    """
    # Load the audio
    audio, sr = sf.read(audio_path)
    # sf.read returns (frames, channels) for multi-channel files; the MFCC
    # front end expects a 1-D signal, so mix down to mono
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # The FFT must cover the 25 ms analysis window (the library default);
    # 512 is enough up to about 20 kHz, so grow it for higher sample rates
    nfft = 512
    while nfft < int(round(0.025 * sr)):
        nfft *= 2
    # Extract MFCC features. The keyword is `samplerate` (not `sr`), and nfilt
    # must be >= numcep, otherwise only 26 coefficients come back and the
    # feature width no longer matches the model's 40-dimensional input.
    mfcc_feat = mfcc(audio, samplerate=sr, numcep=40, nfilt=40, nfft=nfft)
    # Standardise each coefficient over the frames of this clip
    scaler = StandardScaler()
    mfcc_feat = scaler.fit_transform(mfcc_feat)
    # Add the batch axis: (batch, seq_len, input_dim)
    mfcc_feat = torch.tensor(mfcc_feat, dtype=torch.float32).unsqueeze(0).to("cuda")
    # Inference
    with torch.no_grad():
        logits = model(mfcc_feat)
        pred = torch.argmax(logits, dim=-1).item()
        confidence = torch.softmax(logits, dim=-1)[0][pred].item()
    # Class index -> sound type
    audio_types = ["正常运行声", "轴承异响", "电机啸叫", "管道泄漏声", "报警声"]
    audio_result = {
        "type": audio_types[pred],
        "is_anomaly": pred != 0,
        "confidence": confidence
    }
    return audio_result

def multimodal_fusion_detection(defects, temperature_anomaly, audio_result, sensor_data):
    """Fuse the per-modality findings into one weighted anomaly decision.

    Args:
        defects: Image defects; any non-empty value counts as an anomaly.
        temperature_anomaly: Infrared findings; non-empty counts as an anomaly.
        audio_result: Dict whose ``"is_anomaly"`` entry flags an audio anomaly.
        sensor_data: Dict with ``"temperature"`` and ``"vibration"`` readings.

    Returns:
        A dict with ``fusion_score`` (weighted sum of the binary modality
        scores), ``is_anomaly`` (score >= 0.3) and ``details`` echoing the
        inputs plus the derived sensor flag.
    """
    # Weight of each modality in the fused score
    weight_of = {
        "image_defect": 0.4,
        "temperature_anomaly": 0.3,
        "audio_anomaly": 0.2,
        "sensor_anomaly": 0.1
    }
    # Binary flag per modality
    flag_of = {
        "image_defect": bool(defects),
        "temperature_anomaly": bool(temperature_anomaly),
        "audio_anomaly": bool(audio_result["is_anomaly"]),
    }
    # A sensor reading is anomalous when either limit is exceeded
    sensor_anomaly = bool(sensor_data["temperature"] > 80 or sensor_data["vibration"] > 5)
    flag_of["sensor_anomaly"] = sensor_anomaly

    # Accumulate in a fixed order so the floating-point result is reproducible
    fusion_score = 0.0
    for modality in ("image_defect", "temperature_anomaly", "audio_anomaly", "sensor_anomaly"):
        fusion_score = fusion_score + float(flag_of[modality]) * weight_of[modality]

    # Decision threshold: a fused score of at least 0.3 is an anomaly
    return {
        "fusion_score": fusion_score,
        "is_anomaly": fusion_score >= 0.3,
        "details": {
            "image_defects": defects,
            "temperature_anomaly": temperature_anomaly,
            "audio_anomaly": audio_result,
            "sensor_anomaly": sensor_anomaly
        }
    }

人工智能大模型的多模态融合与跨领域应用实战

多模态融合与跨领域应用实战

一、章节学习目标与重点

1.1 学习目标

1.2 学习重点

二、多模态大模型核心基础：概念与技术架构

2.1 核心概念解析

2.1.1 多模态与跨模态

2.1.2 多模态融合的核心目标

2.2 多模态大模型的技术架构

2.2.1 核心架构流程

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

2.2.2 典型多模态模型架构解析

① CLIP（Contrastive Language-Image Pre-training）

② BLIP-2（Bootstrapping Language-Image Pre-training 2）

③ GPT-4V（GPT-4 Vision）

④ FLAVA（A Foundational Language And Vision Alignment Model）

2.3 多模态数据预处理与模态对齐技术

2.3.1 多模态数据预处理方法

2.3.2 模态对齐技术实操

① 文本 - 图像语义对齐（基于 CLIP）

② 语音 - 文本时序对齐（基于 CTC 对齐）

三、主流多模态模型实操：从 API 调用到微调

3.1 CLIP：图文对齐与跨模态检索实操

3.1.1 基础功能：图文检索（文本搜图像）

① 环境准备

② 实操代码

③ 结果解读

3.1.2 进阶功能：零样本图像分类

3.2 BLIP-2：多模态生成任务实操（图像描述 + 图文对话）

3.2.1 基础功能：图像描述生成

① 实操代码

② 输出示例

3.2.2 进阶功能：图文对话（多轮交互）

③ 输出示例

3.3 LLaVA：开源多模态对话模型微调实操

3.3.1 微调准备：数据集与环境配置

① 数据集格式（自定义医疗图文对话数据集）

② 环境安装

3.3.2 微调代码（基于 LoRA 高效微调）

3.3.3 微调效果验证

输出示例

四、多模态大模型跨领域应用场景实战

4.1 医疗领域：多模态融合诊断系统

4.1.1 场景需求

4.1.2 方案设计

① 核心架构

② 核心功能与实操

功能 1：医疗影像 + 文本病历融合诊断

功能 2：诊断报告自动生成

③ 效果验证

4.2 工业领域：多模态质量检测与设备运维系统

4.2.1 场景需求

4.2.2 方案设计

① 核心架构

② 核心功能与实操

功能 1：产品表面缺陷检测（图像 + 生产日志融合）

功能 2：设备故障预测（图像 + 传感器数据融合）

③ 效果验证

4.3 教育领域：多模态智能教学助手

4.3.1 场景需求

4.3.2 核心功能与实操

功能 1：图文结合答疑（数学公式 + 文本问题）

功能 2：语音口语测评（语音 + 文本对照）

4.4 传媒领域：多模态内容生成与审核系统

4.4.1 场景需求

4.4.2 核心功能与实操

功能 1：文本生成图像（新闻配图）

功能 2：多模态内容审核（图像 + 文本）

五、多模态系统的性能优化与工程化部署

5.1 性能优化策略

5.1.1 模型层面优化

5.1.2 数据层面优化

5.1.3 推理层面优化

5.2 工程化部署方案

5.2.1 部署架构设计

5.2.2 Docker 容器化部署实操

① Dockerfile