项目背景
本项目基于 ONNX Runtime 和 OpenCV,实现了一个轻量、高效、可扩展的 YOLOv8 C++ 推理模块。它不依赖 PyTorch,可直接加载 .onnx 模型进行推理,适用于 Windows/Linux 平台,支持 CPU 与 CUDA 加速。
项目有三个文件:inference.h,inference.cpp 和 main.cpp,核心文件为 inference.cpp。

代码讲解
1. inference.cpp 注释版代码:
// Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
#define _CRT_SECURE_NO_WARNINGS 1 // 关闭 MSVC 下部分 C 运行库的'安全'警告(如 strcpy 等函数)
#include "inference.h"
#include <regex>
#define benchmark // 打开后会进行简单的时间统计(前处理/推理/后处理耗时打印)
#define min(a,b) (((a) < (b)) ? (a) : (b)) // 自定义 min 宏(注意:可能与 std::min 冲突,项目里保持原样)
// Default constructor: members rely on their in-class default values.
YOLO_V8::YOLO_V8() { }
// Destructor: releases the ONNX Runtime session.
// Fix: in the original listing the closing brace was swallowed by the trailing
// '//' comment, so the definition never closed and the file did not compile.
YOLO_V8::~YOLO_V8() {
    delete session;
    // NOTE(review): the char* buffers new'd into inputNodeNames/outputNodeNames
    // in CreateSession are never freed here -- pre-existing memory leak.
}
#ifdef USE_CUDA
namespace Ort {
// When building with CUDA and feeding half (fp16) input, tell ONNX Runtime
// that this C++ type maps to the ONNX FLOAT16 tensor element type.
template<>
struct TypeToTensorType<half> {
static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
};
}
#endif
{
channels = iImg.();
imgHeight = iImg.rows;
imgWidth = iImg.cols;
( c = ; c < channels; c++) {
( h = ; h < imgHeight; h++) {
( w = ; w < imgWidth; w++) {
iBlob[c * imgWidth * imgHeight + h * imgWidth + w] = std::remove_pointer<T>::((iImg.<cv::Vec3b>(h, w)[c]) / );
}
}
}
RET_OK;
}
{
(iImg.() == ) {
oImg = iImg.();
cv::(oImg, oImg, cv::COLOR_BGR2RGB);
} {
cv::(iImg, oImg, cv::COLOR_GRAY2RGB);
}
(modelType) {
YOLO_DETECT_V8:
YOLO_POSE:
YOLO_DETECT_V8_HALF:
YOLO_POSE_V8_HALF: {
(iImg.cols >= iImg.rows) {
resizeScales = iImg.cols / ()iImgSize.();
cv::(oImg, oImg, cv::(iImgSize.(), (iImg.rows / resizeScales)));
} {
resizeScales = iImg.rows / ()iImgSize.();
cv::(oImg, oImg, cv::((iImg.cols / resizeScales), iImgSize.()));
}
cv::Mat tempImg = cv::Mat::(iImgSize.(), iImgSize.(), CV_8UC3);
oImg.((cv::(, , oImg.cols, oImg.rows)));
oImg = tempImg;
;
}
YOLO_CLS: {
h = iImg.rows;
w = iImg.cols;
m = (h, w);
top = (h - m) / ;
left = (w - m) / ;
cv::((cv::(left, top, m, m)), oImg, cv::(iImgSize.(), iImgSize.()));
;
}
}
RET_OK;
}
{
* Ret = RET_OK;
;
result = std::(iParams.modelPath, pattern);
(result) {
Ret = (*);
std::cout << Ret << std::endl;
Ret;
}
{
rectConfidenceThreshold = iParams.rectConfidenceThreshold;
iouThreshold = iParams.iouThreshold;
imgSize = iParams.imgSize;
modelType = iParams.modelType;
cudaEnable = iParams.cudaEnable;
env = Ort::(ORT_LOGGING_LEVEL_WARNING, );
Ort::SessionOptions sessionOption;
(iParams.cudaEnable) {
OrtCUDAProviderOptions cudaOption;
cudaOption.device_id = ;
sessionOption.(cudaOption);
}
sessionOption.(GraphOptimizationLevel::ORT_ENABLE_ALL);
sessionOption.(iParams.intraOpNumThreads);
sessionOption.(iParams.logSeverityLevel);
ModelPathSize = (CP_UTF8, , iParams.modelPath.(), <>(iParams.modelPath.()), , );
* wide_cstr = [ModelPathSize + ];
(CP_UTF8, , iParams.modelPath.(), <>(iParams.modelPath.()), wide_cstr, ModelPathSize);
wide_cstr[ModelPathSize] = ;
* modelPath = wide_cstr;
* modelPath = iParams.modelPath.();
session = Ort::(env, modelPath, sessionOption);
Ort::AllocatorWithDefaultOptions allocator;
inputNodesNum = session->();
( i = ; i < inputNodesNum; i++) {
Ort::AllocatedStringPtr input_node_name = session->(i, allocator);
* temp_buf = [];
(temp_buf, input_node_name.());
inputNodeNames.(temp_buf);
}
OutputNodesNum = session->();
( i = ; i < OutputNodesNum; i++) {
Ort::AllocatedStringPtr output_node_name = session->(i, allocator);
* temp_buf = [];
(temp_buf, output_node_name.());
outputNodeNames.(temp_buf);
}
options = Ort::RunOptions{ };
();
RET_OK;
} ( std::exception& e) {
* str1 = ;
* str2 = e.();
std::string result = std::(str1) + std::(str2);
* merged = [result.() + ];
(merged, result.());
std::cout << merged << std::endl;
[] merged;
(*);
}
}
{
starttime_1 = ();
* Ret = RET_OK;
cv::Mat processedImg;
(iImg, imgSize, processedImg);
(modelType < ) {
* blob = [processedImg.() * ];
(processedImg, blob);
std::vector<> inputNodeDims = { , , imgSize.(), imgSize.() };
(starttime_1, iImg, blob, inputNodeDims, oResult);
} {
half* blob = half[processedImg.() * ];
(processedImg, blob);
std::vector<> inputNodeDims = { ,,imgSize.(),imgSize.() };
(starttime_1, iImg, blob, inputNodeDims, oResult);
}
Ret;
}
{
Ort::Value inputTensor = Ort::Value::CreateTensor< std::remove_pointer<N>::type>(
Ort::MemoryInfo::(OrtDeviceAllocator, OrtMemTypeCPU), blob, * imgSize.() * imgSize.(), inputNodeDims.(), inputNodeDims.());
starttime_2 = ();
outputTensor = session->(options, inputNodeNames.(), &inputTensor, , outputNodeNames.(), outputNodeNames.());
starttime_3 = ();
Ort::TypeInfo typeInfo = outputTensor.().();
tensor_info = typeInfo.();
std::vector<> outputNodeDims = tensor_info.();
output = outputTensor.().GetTensorMutableData< std::remove_pointer<N>::type>();
[] blob;
(modelType) {
YOLO_DETECT_V8:
YOLO_DETECT_V8_HALF: {
signalResultNum = outputNodeDims[];
strideNum = outputNodeDims[];
std::vector<> class_ids;
std::vector<> confidences;
std::vector<cv::Rect> boxes;
cv::Mat rawData;
(modelType == YOLO_DETECT_V8) {
rawData = cv::(signalResultNum, strideNum, CV_32F, output);
} {
rawData = cv::(signalResultNum, strideNum, CV_16F, output);
rawData.(rawData, CV_32F);
}
rawData = rawData.();
* data = (*)rawData.data;
( i = ; i < strideNum; ++i) {
* classesScores = data + ;
;
cv::Point class_id;
maxClassScore;
cv::(scores, , &maxClassScore, , &class_id);
(maxClassScore > rectConfidenceThreshold) {
confidences.(maxClassScore);
class_ids.(class_id.x);
x = data[];
y = data[];
w = data[];
h = data[];
left = ((x - * w) * resizeScales);
top = ((y - * h) * resizeScales);
width = (w * resizeScales);
height = (h * resizeScales);
boxes.(cv::(left, top, width, height));
}
data += signalResultNum;
}
std::vector<> nmsResult;
cv::dnn::(boxes, confidences, rectConfidenceThreshold, iouThreshold, nmsResult);
( i = ; i < nmsResult.(); ++i) {
idx = nmsResult[i];
DL_RESULT result;
result.classId = class_ids[idx];
result.confidence = confidences[idx];
result.box = boxes[idx];
oResult.(result);
}
starttime_4 = ();
pre_process_time = ()(starttime_2 - starttime_1) / CLOCKS_PER_SEC * ;
process_time = ()(starttime_3 - starttime_2) / CLOCKS_PER_SEC * ;
post_process_time = ()(starttime_4 - starttime_3) / CLOCKS_PER_SEC * ;
(cudaEnable) {
std::cout << << pre_process_time << << process_time << << post_process_time << << std::endl;
} {
std::cout << << pre_process_time << << process_time << << post_process_time << << std::endl;
}
;
}
YOLO_CLS:
YOLO_CLS_HALF: {
cv::Mat rawData;
(modelType == YOLO_CLS) {
rawData = cv::(, ->classes.(), CV_32F, output);
} {
rawData = cv::(, ->classes.(), CV_16F, output);
rawData.(rawData, CV_32F);
}
* data = (*)rawData.data;
DL_RESULT result;
( i = ; i < ->classes.(); i++) {
result.classId = i;
result.confidence = data[i];
oResult.(result);
}
;
}
:
std::cout << << << std::endl;
}
RET_OK;
}
{
starttime_1 = ();
cv::Mat iImg = cv::(cv::(imgSize.(), imgSize.()), CV_8UC3);
cv::Mat processedImg;
(iImg, imgSize, processedImg);
(modelType < ) {
* blob = [iImg.() * ];
(processedImg, blob);
std::vector<> YOLO_input_node_dims = { , , imgSize.(), imgSize.() };
Ort::Value input_tensor = Ort::Value::<>(
Ort::MemoryInfo::(OrtDeviceAllocator, OrtMemTypeCPU), blob, * imgSize.() * imgSize.(), YOLO_input_node_dims.(), YOLO_input_node_dims.());
output_tensors = session->(options, inputNodeNames.(), &input_tensor, , outputNodeNames.(), outputNodeNames.());
[] blob;
starttime_4 = ();
post_process_time = ()(starttime_4 - starttime_1) / CLOCKS_PER_SEC * ;
(cudaEnable) {
std::cout << << << post_process_time << << std::endl;
}
} {
half* blob = half[iImg.() * ];
(processedImg, blob);
std::vector<> YOLO_input_node_dims = { ,,imgSize.(),imgSize.() };
Ort::Value input_tensor = Ort::Value::<half>(Ort::MemoryInfo::(OrtDeviceAllocator, OrtMemTypeCPU), blob, * imgSize.() * imgSize.(), YOLO_input_node_dims.(), YOLO_input_node_dims.());
output_tensors = session->(options, inputNodeNames.(), &input_tensor, , outputNodeNames.(), outputNodeNames.());
[] blob;
starttime_4 = ();
post_process_time = ()(starttime_4 - starttime_1) / CLOCKS_PER_SEC * ;
(cudaEnable) {
std::cout << << << post_process_time << << std::endl;
}
}
RET_OK;
}







