PythonAI算法

基于 YOLO12 的无人机航拍视角目标检测系统

（综述由 AI 生成）本文介绍基于 YOLO12 和 YOLO11 的无人机航拍视角目标检测系统。内容涵盖 VisDrone 数据集配置、环境搭建、本地及云端训练流程、图形化界面（PySide6/Gradio）封装及模型架构解析。详细说明了 YOLO12 的区域注意力机制（Area Attention）及 R-ELAN 模块，对比了 YOLO11 的 C2PSA 和 C3k2 结构。实验部分展示了训练指标分析、mAP 结果及可视化效果，并提供了 GhostConv 和 CBAM 等模型改进方案。系统适用于智慧城市、农业及安防领域的实时目标检测任务。

随缘发布于 2026/4/5更新于 2026/5/2338 浏览

基于 YOLO12 的无人机（航拍）视角的目标检测系统

本次教程主要介绍基于无人机视角下的目标检测，对常规的行人、车辆等目标进行检测，并说明 YOLO12 的新模块。教程包含标注好的数据集、训练好的 YOLOv5、YOLOv8、YOLO11 以及 YOLO12 模型，还有一个配套的图形化界面。

本次的数据集包含的类别如下：

0: pedestrian 行人 1: people 人 2: bicycle 自行车 3: car 汽车 4: van 货车 5: truck 卡车 6: tricycle 三轮车 7: awning-tricycle 遮阳篷三轮车 8: bus 公交车 9: motor 摩托车

以下是部分数据示例。

train_batch0

下面是部分实现效果，支持视频和图像检测。

项目实战

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import gradio as gr
import PIL.Image as Image
from ultralytics import ASSETS, YOLO

# Load the detection weights once at import time; predict_image() reuses this
# module-level model for every request.
# NOTE(review): the path points at a "yolo11s" run while TITLE below mentions
# YOLOv12 -- confirm which weights are intended.
model = YOLO("runs/yolo11s/weights/best.pt") # TODO: change this path to your own model weights
# Title string passed to gr.Interface(title=...).
TITLE = "欢迎使用基于 YOLOv12 的无人机视角目标检测"

def predict_image(img, conf_threshold, iou_threshold):
    """Run the module-level YOLO model on one image and return the annotated picture.

    Args:
        img: Image to analyse. The Gradio input is declared with ``type="pil"``;
            it is ``None`` when the user submits without uploading anything.
        conf_threshold: Value forwarded to ``model.predict`` as ``conf``.
        iou_threshold: Value forwarded to ``model.predict`` as ``iou``.

    Returns:
        A PIL image with the predictions drawn on it; ``img`` unchanged when the
        model returns no results; ``None`` when no image was supplied.
    """
    if img is None:
        # Nothing was uploaded: do not hand an empty source to the model.
        return None

    results = model.predict(
        source=img,
        conf=conf_threshold,
        iou=iou_threshold,
        show_labels=True,
        show_conf=True,
        imgsz=640,
    )
    if not results:
        # Previously this path raised UnboundLocalError because the loop body
        # that assigned the return value never ran.
        return img

    # Only the last result was ever returned, so render just that one instead
    # of plotting every result and discarding all but the last.
    annotated = results[-1].plot()
    # Reverse the last axis before building the PIL image
    # (presumably BGR -> RGB channel order -- confirm against Results.plot()).
    return Image.fromarray(annotated[..., ::-1])

# Gradio UI definition: one image input plus two sliders, wired to
# predict_image(img, conf_threshold, iou_threshold) in that order.
iface = gr.Interface(
    fn=predict_image,
    inputs=[
        # Delivered to predict_image as a PIL image (type="pil").
        gr.Image(type="pil", label="Upload Image"),
        # Slider values are forwarded as the `conf` and `iou` arguments of model.predict.
        gr.Slider(minimum=0, maximum=1, value=0.25, label="Confidence threshold"),
        gr.Slider(minimum=0, maximum=1, value=0.45, label="IoU threshold"),
    ],
    outputs=gr.Image(type="pil", label="Result"),
    title=TITLE,
    description="Upload images for inference.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()

class A2C2f(nn.Module):
    """Area-Attention C2f block (the R-ELAN layer used by YOLO12).

    A 1x1 convolution projects the input to a hidden width, ``n`` stages are
    applied one after another, and the projection plus every stage output are
    concatenated and fused by a final 1x1 convolution. Each stage is either a
    pair of ``ABlock`` modules (``a2=True``) or a ``C3k`` module (``a2=False``).

    Attributes:
        cv1 (Conv): 1x1 convolution mapping ``c1`` input channels to the hidden width.
        cv2 (Conv): 1x1 convolution fusing the ``(1 + n)`` concatenated feature maps into ``c2`` channels.
        gamma (nn.Parameter | None): Per-channel residual scale, created only when both
            ``a2`` and ``residual`` are true; otherwise ``None``.
        m (nn.ModuleList): The ``n`` stages described above.

    Examples:
        >>> m = A2C2f(512, 512, n=1, a2=True, area=1)
        >>> x = torch.randn(1, 512, 32, 32)
        >>> output = m(x)
        >>> print(output.shape)
        torch.Size([1, 512, 32, 32])
    """

    def __init__(self, c1, c2, n=1, a2=True, area=1, residual=False, mlp_ratio=2.0, e=0.5, g=1, shortcut=True):
        """Build the projection, fusion and stage modules.

        Args:
            c1 (int): Number of input channels.
            c2 (int): Number of output channels.
            n (int): Number of stages to stack.
            a2 (bool): Use ``ABlock`` pairs when true, ``C3k`` modules otherwise.
            area (int): Forwarded to ``ABlock`` as its area argument.
            residual (bool): With ``a2``, enables the learnable ``gamma`` residual scaling.
            mlp_ratio (float): Forwarded to ``ABlock`` as its MLP ratio.
            e (float): Hidden width as a fraction of ``c2``.
            g (int): Forwarded to ``C3k`` as its group count.
            shortcut (bool): Forwarded to ``C3k`` as its shortcut flag.
        """
        super().__init__()
        hidden = int(c2 * e)
        # ABlock is given hidden // 32 as its second argument, so the width must divide evenly.
        assert hidden % 32 == 0, "Dimension of ABlock be a multiple of 32."

        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv((1 + n) * hidden, c2, 1)

        if a2 and residual:
            self.gamma = nn.Parameter(0.01 * torch.ones(c2), requires_grad=True)
        else:
            self.gamma = None

        stages = []
        for _ in range(n):
            if a2:
                pair = [ABlock(hidden, hidden // 32, mlp_ratio, area) for _ in range(2)]
                stages.append(nn.Sequential(*pair))
            else:
                stages.append(C3k(hidden, hidden, 2, shortcut, g))
        self.m = nn.ModuleList(stages)

    def forward(self, x):
        """Apply the stages in sequence, fuse all intermediate maps, and optionally add the scaled residual."""
        feats = [self.cv1(x)]
        for stage in self.m:
            # Each stage consumes the output of the previous one.
            feats.append(stage(feats[-1]))
        fused = self.cv2(torch.cat(feats, 1))
        if self.gamma is None:
            return fused
        # Reshape gamma to (1, C, 1, 1) so it scales the fused map per channel.
        return x + self.gamma.view(-1, len(self.gamma), 1, 1) * fused

# Ultralytics YOLO 🚀, AGPL-3.0 license
"""Model head modules."""
import copy
import math
import torch
import torch.nn as nn
from torch.nn.init import constant_, xavier_uniform_
from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
from .conv import Conv, DWConv
from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
from .utils import bias_init_with_prob, linear_init
__all__ = "Detect","Segment","Pose","Classify","OBB","RTDETRDecoder","v10Detect"

class Concat(nn.Module):
    """Join a list of tensors along one fixed dimension."""

    def __init__(self, dimension=1):
        """Remember the dimension along which inputs will be concatenated.

        Args:
            dimension (int): Axis passed to ``torch.cat``.
        """
        super().__init__()
        self.d = dimension

    def forward(self, x):
        """Return the tensors in ``x`` concatenated along ``self.d``."""
        return torch.cat(x, dim=self.d)

torch.nn.functional.interpolate(input, size=None, scale_factor=None, mode='nearest', align_corners=None)

class C3k(C3):
    """C3 variant whose inner bottlenecks use a configurable square kernel size."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
        """Initialise the C3 base, then replace ``self.m`` with ``n`` bottlenecks using ``(k, k)`` kernels.

        Args:
            c1, c2, n, shortcut, g, e: Forwarded unchanged to ``C3.__init__``.
            k (int): Kernel size used for both convolutions of every bottleneck.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        hidden = int(c2 * e)  # hidden channels
        kernel = (k, k)
        blocks = [Bottleneck(hidden, hidden, shortcut, g, k=kernel, e=1.0) for _ in range(n)]
        self.m = nn.Sequential(*blocks)

class C3k2(C2f):
    """C2f variant whose inner blocks are either ``C3k`` modules or plain ``Bottleneck`` modules."""

    def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
        """Initialise the C2f base, then replace ``self.m`` with ``n`` blocks of the selected type.

        Args:
            c1, c2, n, shortcut, g, e: Forwarded to ``C2f.__init__``.
            c3k (bool): Build ``C3k`` blocks when true, ``Bottleneck`` blocks otherwise.
        """
        super().__init__(c1, c2, n, shortcut, g, e)
        # self.c is provided by the C2f base class (presumably its hidden width -- confirm in C2f).
        blocks = []
        for _ in range(n):
            if c3k:
                blocks.append(C3k(self.c, self.c, 2, shortcut, g))
            else:
                blocks.append(Bottleneck(self.c, self.c, shortcut, g))
        self.m = nn.ModuleList(blocks)

class Conv(nn.Module):
    """Conv2d + BatchNorm2d + activation; args are (ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)."""

    default_act = nn.SiLU()  # activation used when act is True

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        """Create the bias-free convolution, its batch norm and the activation.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k: Kernel size.
            s: Stride.
            p: Padding; handed to ``autopad`` together with ``k`` and ``d``.
            g (int): Convolution groups.
            d: Dilation.
            act: ``True`` selects ``default_act``; an ``nn.Module`` is used as given;
                anything else disables the activation (``nn.Identity``).
        """
        super().__init__()
        pad = autopad(k, p, d)
        self.conv = nn.Conv2d(c1, c2, k, s, pad, groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        if act is True:
            self.act = self.default_act
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply convolution, then batch normalisation, then the activation."""
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)

    def forward_fuse(self, x):
        """Apply convolution and activation only, skipping ``self.bn``."""
        y = self.conv(x)
        return self.act(y)

class C2PSA(nn.Module):
    """Split-transform-merge block that runs a stack of ``PSABlock`` modules on half of the features.

    The input is projected to ``2 * c`` channels and split into two halves of
    ``c`` channels each. The second half goes through the ``PSABlock`` stack,
    the first half is passed through untouched, and both are concatenated and
    projected back to ``c1`` channels.

    Attributes:
        c (int): Hidden channel count, ``int(c1 * e)``.
        cv1 (Conv): 1x1 convolution mapping ``c1`` channels to ``2 * c``.
        cv2 (Conv): 1x1 convolution mapping ``2 * c`` channels back to ``c1``.
        m (nn.Sequential): ``n`` stacked ``PSABlock`` modules.

    Examples:
        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
        >>> input_tensor = torch.randn(1, 256, 64, 64)
        >>> output_tensor = c2psa(input_tensor)
    """

    def __init__(self, c1, c2, n=1, e=0.5):
        """Build the two projections and the ``PSABlock`` stack.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels; must equal ``c1``.
            n (int): Number of ``PSABlock`` modules.
            e (float): Hidden width as a fraction of ``c1``.
        """
        super().__init__()
        assert c1 == c2
        self.c = int(c1 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv(2 * self.c, c1, 1)

        heads = self.c // 64
        blocks = [PSABlock(self.c, attn_ratio=0.5, num_heads=heads) for _ in range(n)]
        self.m = nn.Sequential(*blocks)

    def forward(self, x):
        """Project, split in two, run the PSA stack on the second half, then merge and project back."""
        passthrough, attended = torch.split(self.cv1(x), [self.c, self.c], dim=1)
        attended = self.m(attended)
        merged = torch.cat((passthrough, attended), 1)
        return self.cv2(merged)

path: H:/raspi/0000-38-visdrone-detect-yolo12/visdrone # dataset root dir
train: VisDrone2019-DET-train/images # train images (relative to 'path') 6471 images
val: VisDrone2019-DET-val/images # val images (relative to 'path') 548 images
test: VisDrone2019-DET-test-dev/images # test images (optional) 1610 images
# Classes
names:
  0: pedestrian
  1: people
  2: bicycle
  3: car
  4: van
  5: truck
  6: tricycle
  7: awning-tricycle
  8: bus
  9: motor

[1] Zhang Y , Li H , Bu R ,et al.Fuzzy Multi-objective Requirements for NRP Based on Particle Swarm Optimization[C]//2020.DOI:10.1007/978-3-030-57881-7_13. [2] Zhao N , Cao M , Song C ,et al.Trusted Component Decomposition Based on OR-Transition Colored Petri Net[C]//International Conference on Artificial Intelligence and Security.Springer, Cham, 2019.DOI:10.1007/978-3-030-24268-8_41. DOI: 10.1109/ACCESS.2020.2973568 [3] Song C, Chang H. RST R-CNN: a triplet matching few-shot remote sensing object detection framework[C]//Fourth International Conference on Computer Vision, Application, and Algorithm (CVAA 2024). SPIE, 2025, 13486: 553-568. [4] Zhou Q , Yu C . Point RCNN: An Angle-Free Framework for Rotated Object Detection[J]. Remote Sensing, 2022, 14. [5] Zhang, Y., Li, H., Bu, R., Song, C., Li, T., Kang, Y., & Chen, T. (2020). Fuzzy Multi-objective Requirements for NRP Based on Particle Swarm Optimization. International Conference on Adaptive and Intelligent Systems. [6] Li X , Deng J , Fang Y . Few-Shot Object Detection on Remote Sensing Images[J]. IEEE Transactions on Geoscience and Remote Sensing, 2021(99). [7] Su W, Zhu X, Tao C, et al. Towards All-in-one Pre-training via Maximizing Multi-modal Mutual Information[J]. arXiv preprint arXiv:2211.09807, 2022. [8] Chen Q, Wang J, Han C, et al. Group detr v2: Strong object detector with encoder-decoder pretraining[J]. arXiv preprint arXiv:2211.03594, 2022. [9] Liu, Shilong, et al. 'Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection.' arXiv preprint arXiv:2303.05499 (2023). [10] Redmon J, Divvala S, Girshick R, et al. You only look once: Unified, real-time object detection[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 779-788. [11] Redmon J, Farhadi A. YOLO9000: better, faster, stronger[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2017: 7263-7271. [12] Redmon J, Farhadi A. 
Yolov3: An incremental improvement[J]. arXiv preprint arXiv:1804.02767, 2018. [13] Tian Z, Shen C, Chen H, et al. Fcos: Fully convolutional one-stage object detection[C]//Proceedings of the IEEE/CVF international conference on computer vision. 2019: 9627-9636. [14] Chen L C, Zhu Y, Papandreou G, et al. Encoder-decoder with atrous separable convolution for semantic image segmentation[C]//Proceedings of the European conference on computer vision (ECCV). 2018: 801-818. [15] Liu W, Anguelov D, Erhan D, et al. Ssd: Single shot multibox detector[C]//Computer Vision–ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11–14, 2016, Proceedings, Part I 14. Springer International Publishing, 2016: 21-37. [16] Lin T Y, Dollár P, Girshick R, et al. Feature pyramid networks for object detection[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2017: 2117-2125. [17] Cai Z, Vasconcelos N. Cascade r-cnn: Delving into high quality object detection[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2018: 6154-6162. [18] Ren S, He K, Girshick R, et al. Faster r-cnn: Towards real-time object detection with region proposal networks[J]. Advances in neural information processing systems, 2015, 28. [19] Wang R, Shivanna R, Cheng D, et al. Dcn v2: Improved deep & cross network and practical lessons for web-scale learning to rank systems[C]//Proceedings of the web conference 2021. 2021: 1785-1797. [20] Chen L C, Papandreou G, Schroff F, et al. Rethinking atrous convolution for semantic image segmentation[J]. arXiv preprint arXiv:1706.05587, 2017.

import torch
import torch.nn as nn

class GhostConv(nn.Module):
    """Ghost convolution: a regular convolution produces "primary" features and a
    cheap depthwise convolution derives extra "ghost" features from them; both are
    concatenated to reach ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, ratio=2, dw_kernel_size=3):
        """Build the primary and ghost branches.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Kernel size of the primary convolution.
            stride (int): Stride of the primary convolution.
            padding (int): Padding of the primary convolution.
            ratio (int): ``out_channels // ratio`` channels come from the primary
                convolution; the remainder are ghost channels.
            dw_kernel_size (int): Kernel size of the depthwise (ghost) convolution.

        Raises:
            ValueError: If ``out_channels // ratio`` is smaller than 1, i.e. there
                would be no primary channel to derive ghost features from.
        """
        super(GhostConv, self).__init__()
        self.out_channels = out_channels
        self.primary_channels = out_channels // ratio  # channels from the regular convolution
        if self.primary_channels < 1:
            raise ValueError(
                f"out_channels ({out_channels}) must be at least ratio ({ratio}) "
                "so that at least one primary channel exists"
            )
        self.ghost_channels = out_channels - self.primary_channels  # channels from the cheap branch

        # A grouped convolution needs an output width that is a multiple of its
        # group count. Round the ghost branch up to the next multiple of
        # primary_channels; forward() trims the surplus. When ghost_channels is
        # already a multiple (e.g. even out_channels with ratio=2) nothing changes.
        per_group = (self.ghost_channels + self.primary_channels - 1) // self.primary_channels
        ghost_conv_channels = self.primary_channels * per_group

        # Regular convolution for the primary feature maps.
        self.primary_conv = nn.Conv2d(in_channels, self.primary_channels, kernel_size, stride, padding, bias=False)
        self.bn1 = nn.BatchNorm2d(self.primary_channels)
        # Depthwise convolution generating ghost feature maps from the primary ones.
        self.ghost_conv = nn.Conv2d(
            self.primary_channels,
            ghost_conv_channels,
            dw_kernel_size,
            stride=1,
            padding=dw_kernel_size // 2,
            groups=self.primary_channels,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(self.ghost_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ReLU(concat(primary, ghost)) with ``out_channels`` channels."""
        # Primary feature maps.
        primary_features = self.bn1(self.primary_conv(x))
        # Ghost feature maps; drop any surplus channels created by the rounding above.
        ghost_features = self.ghost_conv(primary_features)[:, : self.ghost_channels]
        ghost_features = self.bn2(ghost_features)
        # Merge both branches along the channel axis.
        output = torch.cat([primary_features, ghost_features], dim=1)
        return self.relu(output)

import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    """Channel attention: gates each channel with a weight computed from its
    globally average-pooled and max-pooled descriptors.
    """

    def __init__(self, in_channels, reduction=16):
        """Build the pooling layers and the shared two-layer bottleneck.

        Args:
            in_channels (int): Number of input channels.
            reduction (int): Reduction factor for the bottleneck width.
        """
        super(ChannelAttention, self).__init__()
        # Keep at least one hidden unit: with in_channels < reduction the integer
        # division yields 0, which creates an empty bottleneck whose output is
        # always zero and turns the gate into a constant 0.5.
        hidden_channels = max(1, in_channels // reduction)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)  # global average pooling
        self.max_pool = nn.AdaptiveMaxPool2d(1)  # global max pooling
        self.fc = nn.Sequential(
            nn.Linear(in_channels, hidden_channels, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_channels, in_channels, bias=False),
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled per channel by the attention weights.

        Args:
            x (torch.Tensor): 4-D input; the first two dimensions are used as
                batch and channel counts.
        """
        batch, channels, _, _ = x.size()
        # Descriptor from global average pooling, through the shared bottleneck.
        avg_out = self.fc(self.avg_pool(x).view(batch, channels))
        # Descriptor from global max pooling, through the same bottleneck.
        max_out = self.fc(self.max_pool(x).view(batch, channels))
        # Sum both descriptors, squash to (0, 1), and reshape for broadcasting.
        out = avg_out + max_out
        out = self.sigmoid(out).view(batch, channels, 1, 1)
        return x * out

class SpatialAttention(nn.Module):
    """Spatial attention: gates every spatial position with a weight computed
    from the per-position channel mean and channel maximum.
    """

    def __init__(self, kernel_size=7):
        """Create the 2-to-1 channel convolution that produces the attention map.

        Args:
            kernel_size (int): Kernel size of the convolution; padding is ``kernel_size // 2``.
        """
        super(SpatialAttention, self).__init__()
        half = kernel_size // 2
        self.conv = nn.Conv2d(2, 1, kernel_size=kernel_size, padding=half, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` multiplied by the sigmoid attention map."""
        # Collapse the channel axis twice: once by mean, once by max.
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = torch.max(x, dim=1, keepdim=True).values
        stacked = torch.cat([channel_mean, channel_max], dim=1)
        # One convolution turns the two maps into a single-channel weight map.
        weights = self.sigmoid(self.conv(stacked))
        return x * weights

class CBAM(nn.Module):
    """CBAM block: channel attention followed by spatial attention."""

    def __init__(self, in_channels, reduction=16, kernel_size=7):
        """Create the two attention sub-modules.

        Args:
            in_channels (int): Number of input channels, passed to ``ChannelAttention``.
            reduction (int): Reduction factor, passed to ``ChannelAttention``.
            kernel_size (int): Convolution kernel size, passed to ``SpatialAttention``.
        """
        super(CBAM, self).__init__()
        self.channel_attention = ChannelAttention(in_channels, reduction)
        self.spatial_attention = SpatialAttention(kernel_size)

    def forward(self, x):
        """Apply channel attention first, then spatial attention to its output."""
        channel_refined = self.channel_attention(x)
        return self.spatial_attention(channel_refined)

基于 YOLO12 的无人机航拍视角目标检测系统

基于 YOLO12 的无人机（航拍）视角的目标检测系统

项目实战

基于 YOLO12 的无人机航拍视角目标检测系统

基于 YOLO12 的无人机（航拍）视角的目标检测系统

项目实战

更多推荐文章

相关免费在线工具

环境配置

本地模型训练

GPU 服务器训练（可选）

模型测试

图形化界面封装

文档

背景与意义

相关文献综述

本文算法介绍

YOLO12 算法介绍

YOLO11 算法介绍

实验结果分析

数据集介绍

指标结果分析

结论

参考文献

模型改进的基本流程（选看）

模型改进（选看）

更多推荐文章

相关免费在线工具

基于 YOLO12 的无人机航拍视角目标检测系统

基于 YOLO12 的无人机（航拍）视角的目标检测系统

项目实战

基于 YOLO12 的无人机航拍视角目标检测系统

基于 YOLO12 的无人机（航拍）视角的目标检测系统

项目实战

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

环境配置

本地模型训练

GPU 服务器训练（可选）

模型测试

图形化界面封装

文档

背景与意义

相关文献综述

本文算法介绍

YOLO12 算法介绍

YOLO11 算法介绍

实验结果分析

数据集介绍

指标结果分析

结论

参考文献

模型改进的基本流程（选看）

模型改进（选看）

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具