C++ 性能优化实战:内存、CPU 与 I/O 效率提升指南
深入探讨 C++ 性能优化的核心知识,帮助你掌握提升代码执行效率的关键技巧。通过学习,你将能够理解性能分析的方法,学会优化内存管理以减少泄漏和碎片,掌握 CPU 优化技巧提高执行速度,并优化 I/O 操作提升读写效率。
性能优化的基本原则
优化前务必遵循以下准则:
- 先测量后优化:必须先用工具找出瓶颈,而非盲目猜测
- 聚焦关键路径:只优化对整体性能影响最大的部分
- 保持可维护性:优化后的代码仍需易于理解和维护
- 验证结果:优化后必须测试正确性及性能提升效果
常用分析工具包括 GProf、Valgrind、Perf 以及 Visual Studio Profiler,根据平台选择合适的工具进行定位。
内存管理优化
智能指针与内存泄漏
手动管理内存容易引发泄漏,推荐使用智能指针自动释放资源。
#include <iostream>
#include <memory>
// Demo type that logs its construction, destruction and one unit of work to
// stdout, so the moment a smart pointer releases it is visible in the output.
class MyClass {
public:
    // Logs that an instance has been constructed.
    MyClass() { announce("MyClass 构造函数"); }

    // Logs that the instance is being destroyed.
    ~MyClass() { announce("MyClass 析构函数"); }

    // Logs a line standing in for real work.
    void doSomething() { announce("MyClass 正在做某事"); }

private:
    // Writes one message to stdout followed by a newline and a flush.
    static void announce(const char* message) {
        std::cout << message << std::endl;
    }
};
void useSmartPointer() {
std::shared_ptr<MyClass> ptr = std::make_shared<MyClass>();
ptr->doSomething();
}
// Entry point of the smart-pointer demo.
int main() {
    useSmartPointer();
    // Falling off the end of main is equivalent to `return 0;`.
}
预分配减少内存碎片
对于已知大小的容器,提前 reserve 可以避免多次重新分配导致的碎片和拷贝开销。
#include <iostream>
#include <vector>
#include <chrono>
// Appends the integers 0..9999 to a local vector after reserving the full
// capacity up front; the vector is discarded on return.
void preallocateMemory() {
    constexpr int kCount = 10000;
    std::vector<int> values;
    values.reserve(kCount);  // single allocation before any element is added
    int next = 0;
    while (next < kCount) {
        values.push_back(next);
        ++next;
    }
}
// Appends the integers 0..9999 to a local vector without reserving capacity,
// leaving growth to the vector itself; the vector is discarded on return.
void notPreallocateMemory() {
    constexpr int kCount = 10000;
    std::vector<int> values;
    for (int next = 0; next != kCount; ++next) {
        values.push_back(next);
    }
}
int main() {
auto start = std::chrono::high_resolution_clock::now();
preallocateMemory();
auto end = std::chrono::high_resolution_clock::now();
std::cout << "预分配耗时:"
<< std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
<< "微秒" << std::endl;
return 0;
}
CPU 优化技巧
循环合并与局部性
减少遍历次数能显著降低缓存未命中。尽量在一个循环内完成相关操作。
#include <iostream>
#include <vector>
#include <algorithm>
#include <chrono>
// Builds two 10000-element vectors (all 1s and all 2s) and writes their
// element-wise sum into a third vector in a single pass. All three vectors
// are local and discarded on return.
void optimizedLoop() {
    constexpr int kCount = 10000;
    const std::vector<int> lhs(kCount, 1);
    const std::vector<int> rhs(kCount, 2);
    std::vector<int> sums(kCount, 0);
    std::transform(lhs.begin(), lhs.end(), rhs.begin(), sums.begin(),
                   [](int a, int b) { return a + b; });
}
// Computes the same element-wise sum as the single-pass variant, but in two
// separate passes over the output vector: first copy, then accumulate.
void unoptimizedLoop() {
    constexpr int kCount = 10000;
    const std::vector<int> lhs(kCount, 1);
    const std::vector<int> rhs(kCount, 2);
    std::vector<int> sums(kCount, 0);

    using Index = std::vector<int>::size_type;
    const Index count = sums.size();

    // Pass 1: copy the first operand.
    for (Index idx = 0; idx < count; ++idx) {
        sums[idx] = lhs[idx];
    }
    // Pass 2: add the second operand.
    for (Index idx = 0; idx < count; ++idx) {
        sums[idx] += rhs[idx];
    }
}
int main() {
auto start = std::chrono::high_resolution_clock::now();
optimizedLoop();
auto end = std::chrono::high_resolution_clock::now();
std::cout << "优化循环耗时:"
<< std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
<< "微秒" << std::endl;
return 0;
}
内联函数减少调用开销
短小且频繁调用的函数适合内联,以省去函数调用与栈帧建立的开销。需要注意:inline 关键字只是对编译器的建议(其主要语义是允许函数在多个翻译单元中重复定义),是否真正内联由优化器决定;开启优化后,编译器往往也会自动内联未标记 inline 的小函数。
#include <iostream>
#include <chrono>
// Returns a + b. Declared `inline`, which permits (but does not force) the
// compiler to expand the call in place.
inline int add(int a, int b) {
    const int sum = a + b;
    return sum;
}
// Returns a + b; the counterpart of add() without the `inline` specifier.
int addNotInline(int a, int b) {
    int total = a;
    total += b;
    return total;
}
void testFunctionCallOverhead() {
const int size = 1000000;
int result = 0;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < size; ++i) {
result += add(i, i);
}
auto end = std::chrono::high_resolution_clock::now();
std::cout << "内联函数耗时:"
<< std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
<< "微秒" << std::endl;
}
// Entry point of the call-overhead demo.
int main() {
    testFunctionCallOverhead();
    // Falling off the end of main is equivalent to `return 0;`.
}
I/O 操作优化
文件 I/O 缓冲控制
缓冲策略对文件 I/O 性能影响很大:缓冲区能把多次小写入合并为少量系统调用,而每次 std::endl 都会强制刷新缓冲区,应根据场景调整缓冲策略。注意示例中禁用缓冲只是为了演示对比——无缓冲写入通常比默认缓冲更慢,生产环境通常保留缓冲,并优先用 '\n' 代替 std::endl。
#include <iostream>
#include <fstream>
#include <chrono>
// Writes the integers 0..9999, one per line, to "test.txt" with stream
// buffering disabled.
//
// Disabling buffering is for demonstration only; real code normally keeps it.
//
// Fix over the original: the unbuffered request is now made BEFORE the file
// is opened. The effect of setbuf on an already-open filebuf is
// implementation-defined, and libstdc++ ignores it entirely, so the original
// ordering silently left buffering enabled there.
void optimizedFileIO() {
    const std::string filename = "test.txt";
    const int size = 10000;

    std::ofstream file;
    file.rdbuf()->pubsetbuf(nullptr, 0);  // must precede open() to take effect
    file.open(filename);

    for (int i = 0; i < size; ++i) {
        file << i << std::endl;
    }
    file.close();
}
// Writes the integers 0..9999, one per line, to "test.txt" using the stream's
// default buffering. Each line ends with std::endl, which also flushes.
void unoptimizedFileIO() {
    const std::string path = "test.txt";
    constexpr int kLineCount = 10000;

    std::ofstream out(path);
    int line = 0;
    while (line < kLineCount) {
        out << line << std::endl;
        ++line;
    }
    out.close();
}
int main() {
auto start = std::chrono::high_resolution_clock::now();
optimizedFileIO();
auto end = std::chrono::high_resolution_clock::now();
std::cout << "优化文件 I/O 耗时:"
<< std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
<< "微秒" << std::endl;
return 0;
}
网络 I/O 异步模型
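高并发场景下,异步 I/O 能大幅提升吞吐量,避免线程阻塞等待。注意:为保持简短,下面的示例使用的是 Boost.Asio 的同步接口(connect / write / read_until),仅演示基本的请求流程;真正的异步模型应改用 async_connect / async_write / async_read_until,并通过 io_context::run() 驱动事件循环。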
#include <iostream>
#include <boost/asio.hpp>
#include <chrono>
using boost::asio::ip::tcp;
using namespace std;
void optimizedNetworkIO() {
try {
boost::asio::io_service io_service;
tcp::resolver resolver(io_service);
tcp::resolver::query query("example.com", "http");
tcp::resolver::iterator endpoint_iterator = resolver.resolve(query);
tcp::socket socket(io_service);
boost::asio::connect(socket, endpoint_iterator);
std::string request = "GET / HTTP/1.1\r\nHost: example.com\r\nConnection: close\r\n\r\n";
boost::asio::write(socket, boost::asio::buffer(request));
boost::asio::streambuf response;
boost::asio::read_until(socket, response, "\r\n");
} catch (const std::exception& e) {
cerr << "错误:" << e.what() << endl;
}
}
// Runs the network demo once and prints the elapsed wall-clock time in
// milliseconds.
int main() {
    using Clock = std::chrono::high_resolution_clock;

    const auto begin = Clock::now();
    optimizedNetworkIO();
    const auto finish = Clock::now();

    const auto elapsedMs =
        std::chrono::duration_cast<std::chrono::milliseconds>(finish - begin).count();
    std::cout << "网络 I/O 耗时:" << elapsedMs << "毫秒" << std::endl;
    return 0;
}
综合案例:矩阵乘法优化
项目结构
text
MatrixMultiplicationOptimization/
├── include/
│ └── Matrix.h
├── src/
│ ├── Matrix.cpp
│ └── main.cpp
└── build/
核心实现
// include/Matrix.h
#ifndef MATRIX_H
#define MATRIX_H
#include <vector>
#include <chrono>
#include <stdexcept>
using namespace std;
using namespace chrono;
// Dense integer matrix stored as a vector of row vectors.
//
// Fix over the original: getTransposed() is defined in Matrix.cpp and called
// by multiplyOptimized(), but was never declared here, so the project did not
// compile. It is now declared.
class Matrix {
public:
    // Creates a rows x cols matrix with every element set to 0.
    Matrix(int rows, int cols);
    // Creates a matrix holding a copy of `data`, one inner vector per row.
    Matrix(const std::vector<std::vector<int>>& data);

    // Number of rows.
    int getRows() const;
    // Number of columns.
    int getCols() const;

    // Unchecked element access; returns a reference to element (row, col).
    int& operator()(int row, int col);
    const int& operator()(int row, int col) const;

    // Both return the product (*this) * other and throw std::invalid_argument
    // when this matrix's column count differs from other's row count.
    // multiplyOptimized() multiplies against a transposed copy of `other`.
    Matrix multiplyNaive(const Matrix& other) const;
    Matrix multiplyOptimized(const Matrix& other) const;

    // Returns the elements of the transpose as a cols x rows nested vector.
    std::vector<std::vector<int>> getTransposed() const;

    // Writes the matrix to stdout, one row per line.
    void print() const;

private:
    int rows_;
    int cols_;
    std::vector<std::vector<int>> data_;
};
#endif
// src/Matrix.cpp
#include "Matrix.h"
#include <iostream>
// Builds a rows x cols matrix whose elements are all zero.
Matrix::Matrix(int rows, int cols)
    : rows_(rows),
      cols_(cols),
      // Value-initialised ints are 0, so no explicit fill value is needed.
      data_(rows, std::vector<int>(cols)) {}
// Builds a matrix from a copy of `data`, one inner vector per row.
//
// Fixes over the original:
//  * Empty `data` no longer evaluates data[0] (undefined behaviour); it
//    yields a 0 x 0 matrix.
//  * Rows of differing length are rejected with std::invalid_argument rather
//    than producing a matrix whose later element accesses run out of bounds.
//  * The size_t -> int conversions are explicit.
Matrix::Matrix(const std::vector<std::vector<int>>& data)
    : rows_(static_cast<int>(data.size())),
      cols_(data.empty() ? 0 : static_cast<int>(data[0].size())),
      data_(data) {
    for (const auto& row : data_) {
        if (row.size() != data_[0].size()) {
            throw std::invalid_argument("各行长度不一致");
        }
    }
}
// Returns the stored row count.
int Matrix::getRows() const {
    return rows_;
}

// Returns the stored column count.
int Matrix::getCols() const {
    return cols_;
}
// Mutable element access. No bounds checking is performed.
int& Matrix::operator()(int row, int col) {
    auto& selectedRow = data_[row];
    return selectedRow[col];
}

// Read-only element access. No bounds checking is performed.
const int& Matrix::operator()(int row, int col) const {
    const auto& selectedRow = data_[row];
    return selectedRow[col];
}
// Textbook triple-loop product (*this) * other.
// Throws std::invalid_argument when this->cols_ != other.rows_.
// The innermost loop walks down a column of `other`, reading a different row
// vector on every step.
Matrix Matrix::multiplyNaive(const Matrix& other) const {
    if (cols_ != other.rows_) {
        throw std::invalid_argument("尺寸不兼容");
    }

    Matrix product(rows_, other.cols_);
    for (int r = 0; r < rows_; ++r) {
        for (int c = 0; c < other.cols_; ++c) {
            for (int inner = 0; inner < cols_; ++inner) {
                product(r, c) += data_[r][inner] * other.data_[inner][c];
            }
        }
    }
    return product;
}
// Product (*this) * other, computed against a transposed copy of `other` so
// the innermost loop reads both operands along a single row vector.
// Throws std::invalid_argument when this->cols_ != other.rows_.
Matrix Matrix::multiplyOptimized(const Matrix& other) const {
    if (cols_ != other.rows_) {
        throw std::invalid_argument("尺寸不兼容");
    }

    Matrix product(rows_, other.cols_);
    const std::vector<std::vector<int>> rhsColumns = other.getTransposed();

    for (int r = 0; r < rows_; ++r) {
        const std::vector<int>& lhsRow = data_[r];
        for (int c = 0; c < other.cols_; ++c) {
            const std::vector<int>& rhsColumn = rhsColumns[c];
            int dot = 0;
            for (int inner = 0; inner < cols_; ++inner) {
                dot += lhsRow[inner] * rhsColumn[inner];
            }
            product(r, c) = dot;
        }
    }
    return product;
}
vector<vector<int>> Matrix::getTransposed() const {
vector<vector<int>> transposed(cols_, vector<int>(rows_));
for (int i = 0; i < rows_; ++i) {
for (int j = 0; j < cols_; ++j) {
transposed[j][i] = data_[i][j];
}
}
return transposed;
}
// Prints every row on its own line; each value is followed by one space.
void Matrix::print() const {
    using RowIndex = std::vector<std::vector<int>>::size_type;
    using ColIndex = std::vector<int>::size_type;

    for (RowIndex r = 0; r < data_.size(); ++r) {
        const std::vector<int>& currentRow = data_[r];
        for (ColIndex c = 0; c < currentRow.size(); ++c) {
            std::cout << currentRow[c] << " ";
        }
        std::cout << std::endl;
    }
}
// src/main.cpp
#include <iostream>
#include <vector>
#include <chrono>
#include "Matrix.h"
using namespace std;
using namespace chrono;
int main() {
const int size = 100;
Matrix matrix1(size, size);
Matrix matrix2(size, size);
for (int i = 0; i < size; ++i) {
for (int j = 0; j < size; ++j) {
matrix1(i, j) = i + j;
matrix2(i, j) = i * j;
}
}
auto start = high_resolution_clock::now();
Matrix resultNaive = matrix1.multiplyNaive(matrix2);
auto end = high_resolution_clock::now();
cout << "朴素算法耗时:" << duration_cast<milliseconds>(end - start).count() << "毫秒" << endl;
start = high_resolution_clock::now();
Matrix resultOptimized = matrix1.multiplyOptimized(matrix2);
end = high_resolution_clock::now();
cout << "优化算法耗时:" << duration_cast<milliseconds>(end - start).count() << "毫秒" << endl;
return 0;
}
构建与运行
mkdir -p build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
cmake --build . --config Release
./MatrixMultiplicationOptimization
总结与实践
性能优化不是魔法,而是基于数据的科学决策。记住“先测量后优化”,善用工具定位瓶颈。内存上优先使用智能指针和预分配,CPU 上关注循环局部性和内联,I/O 上选择合适的缓冲策略与异步模型。通过矩阵乘法案例可以看到,在不改变算法复杂度的前提下,仅改善内存访问模式(先转置、再按行连续访问)就能带来可观的提升;这类数据布局与算法层面的改进,往往比零散的微优化更有价值。后续可以尝试 SIMD 指令集或并发编程进一步挖掘潜力。


