ESP32 语音对话机器人:整合 Coze 与百度千帆
随着物联网和 AI 技术的融合,低成本嵌入式设备接入大模型能力已成为趋势。本文将分享如何利用 ESP32 微控制器,结合 Coze 大模型(负责对话逻辑)与百度千帆平台(提供 ASR 语音识别与 TTS 语音合成),构建一个完整的语音交互系统:音频的采集与播放在设备本地完成,语音识别、对话生成与语音合成则全部交由云端处理。
1. 系统架构设计
整个流程分为三个核心阶段,通过云端分担计算压力,确保 ESP32 在低功耗下稳定运行:
- 语音输入:麦克风采集音频,经 I2S 接口传输至 SD 卡缓存,随后上传至百度 ASR。
- 智能响应:识别后的文本送入 Coze 大模型生成回复。
- 语音输出:回复文本调用百度 TTS 合成音频,播放至扬声器。
这种架构充分利用了云端的算力优势,避免了在资源受限的 MCU 上部署大型模型。
2. 硬件准备
ESP32 作为主控,需搭配以下外设:
- ESP32 开发板:处理 Wi-Fi 连接及逻辑控制。
- 麦克风模块:推荐使用 INMP441(数字 I2S 输出),支持高保真录音,可直接配合下文的 I2S 录音代码;MAX9814(模拟放大)输出的是模拟信号,需要改用 ESP32 的 ADC 采样,不能直接套用下文的 I2S 录音代码。
- 扬声器模块:MAX98357A 等 I2S 功放模块,驱动效果更佳。
- SD 卡模块:用于临时存储录音文件和 TTS 生成的音频文件,减轻内存压力。
3. 软件实现要点
3.1 状态机管理
由于涉及多个异步网络请求(ASR、LLM、TTS),采用状态机(State Machine)是最佳实践。代码中定义了 STATE_IDLE、STATE_RECORDING、STATE_ASR 等状态,确保同一时间只执行一个任务,避免资源冲突。
3.2 网络与 Token 管理
百度 API 需要 Access Token,且有效期有限。代码中实现了自动获取与刷新机制,当 Token 即将过期时提前重新请求,保证服务连续性。同时增加了 WiFi 断线重连逻辑,提升稳定性。
3.3 音频流处理
录音和播放均采用 I2S 协议,配合 DMA 缓冲区减少 CPU 占用。音频数据先写入 SD 卡再发送,避免了内存溢出问题,适合 ESP32 有限的 RAM。
4. 核心代码解析
以下是经过优化的完整工程代码,修复了部分变量声明与逻辑细节,可直接参考使用。
#include <Arduino.h>
#include <WiFi.h>
#include <WiFiClientSecure.h>
#include <ArduinoJson.h>
#include "driver/i2s.h"
#include "FS.h"
#include "SD.h"
#include "SPI.h"
#include <base64.h>
/************************ Core configuration (must be edited!) ************************/
// NOTE(review): every credential below is compiled into the firmware as plain text.
// Replace the placeholders locally and never commit real keys to version control.
// WiFi credentials
const char* WIFI_SSID = "你的 WiFi 名称";
const char* WIFI_PASS = "你的 WiFi 密码";
// Coze API settings: bearer token, target bot, caller-chosen user id, and HTTPS endpoint
const char* COZE_API_KEY = "pat_DF8e73SOxxxxxxxxxx1VuKKxxxxxxaGwdBqc";
const char* COZE_BOT_ID = "757621xxxxxxx0";
const char* COZE_USER_ID = "123";
const char* COZE_API_DOMAIN = "api.coze.cn";
const int COZE_API_PORT = 443;
// Baidu ASR/TTS settings: key pair used to obtain an access token, plus the two endpoints
const char* BAIDU_API_KEY = "你的百度 API Key";
const char* BAIDU_SECRET_KEY = "你的百度 Secret Key";
const char* BAIDU_ASR_URL = "https://vop.baidu.com/pro_api";
const char* BAIDU_TTS_URL = "https://tsn.baidu.com/text2audio";
/************************ Hardware pin definitions ************************/
// INMP441 microphone: I2S pins used for recording
#define I2S_REC_BCLK 26
#define I2S_REC_LRC 25
#define I2S_REC_DIN 34
// MAX98357A amplifier: I2S pins used for playback
#define I2S_PLAY_BCLK 13
#define I2S_PLAY_LRC 12
#define I2S_PLAY_DOUT 14
// SD card: SPI pins
#define SD_CS 5
#define SD_SCK 18
#define SD_MISO 19
#define SD_MOSI 23
/************************ Global settings ************************/
// Recording format: sample rate in Hz and sample width (bytes per sample is derived from it)
#define SAMPLE_RATE 16000
#define BITS_PER_SAMPLE I2S_BITS_PER_SAMPLE_16BIT
#define BYTES_PER_SAMPLE (BITS_PER_SAMPLE / 8)
#define RECORD_DURATION 6000 // recording length in milliseconds (6 s)
// SD card paths for the captured audio and for the downloaded TTS response
#define RECORD_FILE_PATH "/recording.raw"
#define TTS_FILE_PATH "/tts.mp3"
// State machine: one value per pipeline stage. loop() dispatches on currentState,
// and each stage function sets the next state when it finishes.
enum DeviceState {
STATE_IDLE,
STATE_RECORDING,
STATE_ASR,
STATE_COZE,
STATE_TTS,
STATE_PLAYING
};
// Current pipeline stage; starts idle.
DeviceState currentState = STATE_IDLE;
// Global variables
WiFiClientSecure client;            // TLS client shared by all HTTPS requests
String accessToken;                 // Baidu access token (empty until fetched)
unsigned long tokenExpireTime = 0;  // token expiry expressed on the millis() clock
String asrText;                     // recognized (or serial-typed) user text sent to Coze
String cozeReply;                   // Coze answer text handed to TTS
// Scratch buffers available to the network functions (raw HTTP responses / Base64 audio).
String response;
String speechBase64;
String retrieveResp;
String msgResp;
/************************ 工具函数 ************************/
// Writes one log line to the serial console in the form "[<millis>] <msg>".
void logPrintln(String msg) {
  const unsigned long stamp = millis();
  const char* text = msg.c_str();
  Serial.printf("[%lu] %s\n", stamp, text);
}
// Ensures the station is associated before a network operation.
// Returns true when WiFi is connected (immediately, or after a successful reconnect)
// and false when the link is still down after ~5 s of retries. The original returned
// true in every case, so callers' `if (!checkWiFi())` guards could never trigger.
bool checkWiFi() {
  if (WiFi.status() == WL_CONNECTED) {
    return true;
  }
  logPrintln("WiFi 断线,正在重连...");
  WiFi.reconnect();
  // Poll up to 10 times, 500 ms apart.
  for (int retry = 0; retry < 10 && WiFi.status() != WL_CONNECTED; ++retry) {
    delay(500);
  }
  if (WiFi.status() != WL_CONNECTED) {
    logPrintln("❌ WiFi 重连失败");
    return false;
  }
  logPrintln("WiFi 重连成功!IP:" + WiFi.localIP().toString());
  return true;
}
// toHex() is defined after this function in the file; declare it here so the code
// also compiles as a plain .cpp translation unit (no Arduino prototype generation).
char toHex(int n);

// Percent-encodes `str` for use in a URL query string.
// ASCII letters and digits pass through, a space becomes '+', and every other byte
// (including each byte of a multi-byte UTF-8 character) becomes "%XX" in uppercase hex.
String urlEncode(String str) {
  String encodedString = "";
  encodedString.reserve(str.length() * 3);  // worst case: every byte expands to 3 chars
  for (unsigned int i = 0; i < str.length(); i++) {
    // Work on the unsigned byte value: passing a negative char (any UTF-8 byte >= 0x80)
    // to isalnum() is undefined behaviour.
    const unsigned char c = static_cast<unsigned char>(str.charAt(i));
    if (c == ' ') {
      encodedString += '+';
    } else if (isalnum(c)) {
      encodedString += static_cast<char>(c);
    } else {
      encodedString += '%';
      encodedString += toHex((c & 0xf0) >> 4);
      encodedString += toHex(c & 0x0f);
    }
  }
  return encodedString;
}
// Maps a nibble value 0..15 to its uppercase hexadecimal digit ('0'-'9', 'A'-'F').
char toHex(int n) {
  if (n < 10) {
    return static_cast<char>('0' + n);
  }
  return static_cast<char>('A' + (n - 10));
}
// Returns the size in bytes of `filePath` on the SD card, or 0 when the file does not exist.
uint64_t getFileSize(String filePath) {
  uint64_t byteCount = 0;
  if (SD.exists(filePath)) {
    File handle = SD.open(filePath, FILE_READ);
    byteCount = handle.size();
    handle.close();
  }
  return byteCount;
}
/************************ SD 卡初始化 ************************/
// Starts the SPI bus on the configured pins and mounts the SD card.
// Logs the outcome and returns true on a successful mount, false otherwise.
bool initSDCard() {
  SPI.begin(SD_SCK, SD_MISO, SD_MOSI, SD_CS);
  const bool mounted = SD.begin(SD_CS);
  if (mounted) {
    logPrintln("✅ SD 卡初始化完成");
  } else {
    logPrintln("❌ SD 卡挂载失败!");
  }
  return mounted;
}
/************************ I2S 录音初始化 ************************/
void initI2SRecord() {
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
.sample_rate = SAMPLE_RATE,
.bits_per_sample = BITS_PER_SAMPLE,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_I2S_MSB,
.intr_alloc_flags = 0,
.dma_buf_count = 2,
.dma_buf_len = 64,
.use_apll = false
};
i2s_driver_install(I2S_NUM_0, &i2s_config, 0, NULL);
i2s_pin_config_t pin_config = {
.bck_io_num = I2S_REC_BCLK,
.ws_io_num = I2S_REC_LRC,
.data_out_num = I2S_PIN_NO_CHANGE,
.data_in_num = I2S_REC_DIN
};
i2s_set_pin(I2S_NUM_0, &pin_config);
logPrintln("✅ I2S 录音模块初始化完成");
}
/************************ I2S 播放初始化 ************************/
void initI2SPlay() {
i2s_config_t i2s_config = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
.sample_rate = 16000,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = I2S_COMM_FORMAT_I2S_MSB,
.intr_alloc_flags = 0,
.dma_buf_count = 4,
.dma_buf_len = 1024,
.use_apll = false
};
i2s_driver_install(I2S_NUM_1, &i2s_config, 0, NULL);
i2s_pin_config_t pin_config = {
.bck_io_num = I2S_PLAY_BCLK,
.ws_io_num = I2S_PLAY_LRC,
.data_out_num = I2S_PLAY_DOUT,
.data_in_num = I2S_PIN_NO_CHANGE
};
i2s_set_pin(I2S_NUM_1, &pin_config);
i2s_stop(I2S_NUM_1);
logPrintln("✅ I2S 播放模块初始化完成");
}
/************************ 录音功能 ************************/
void startRecording() {
if (currentState != STATE_IDLE) {
logPrintln("❌ 当前非空闲状态,无法录音!");
return;
}
if (!checkWiFi()) return;
currentState = STATE_RECORDING;
unsigned long recordStartMillis = millis();
if (SD.exists(RECORD_FILE_PATH)) {
SD.remove(RECORD_FILE_PATH);
logPrintln("ℹ️ 删除旧录音文件");
}
initI2SRecord();
File recFile = SD.open(RECORD_FILE_PATH, FILE_WRITE);
if (!recFile) {
logPrintln("❌ 打开录音文件失败!");
i2s_driver_uninstall(I2S_NUM_0);
currentState = STATE_IDLE;
return;
}
logPrintln("📢 开始录音(6 秒后自动结束)...");
int16_t sampleBuffer[64];
while (currentState == STATE_RECORDING && (millis() - recordStartMillis) < RECORD_DURATION) {
size_t bytesRead;
i2s_read(I2S_NUM_0, sampleBuffer, sizeof(sampleBuffer), &bytesRead, portMAX_DELAY);
if (bytesRead > 0) {
recFile.write((uint8_t*)sampleBuffer, bytesRead);
}
delay(1);
}
recFile.close();
i2s_driver_uninstall(I2S_NUM_0);
currentState = STATE_IDLE;
logPrintln("🛑 录音结束");
if (SD.exists(RECORD_FILE_PATH)) {
float duration = (float)getFileSize(RECORD_FILE_PATH) / (SAMPLE_RATE * BYTES_PER_SAMPLE);
logPrintln("✅ 录音文件保存成功!时长:" + String(duration, 2) + "秒");
currentState = STATE_ASR;
}
}
/************************ 百度 API Token 获取 ************************/
bool getBaiduToken() {
if (accessToken.length() > 0 && millis() < tokenExpireTime - 600000) {
return true;
}
logPrintln("ℹ️ 获取百度 API Token...");
String tokenUrl = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=" + String(BAIDU_API_KEY) + "&client_secret=" + String(BAIDU_SECRET_KEY);
if (client.connect("aip.baidubce.com", 443)) {
client.print("GET " + tokenUrl + " HTTP/1.1\r\n");
client.print("Host: aip.baidubce.com\r\n");
client.print("Connection: close\r\n\r\n");
while (client.connected() || client.available()) {
if (client.available()) {
response += client.readString();
}
}
client.stop();
int jsonStart = response.indexOf("{");
if (jsonStart != -1) {
DynamicJsonDocument doc(1024);
DeserializationError error = deserializeJson(doc, response.substring(jsonStart));
if (!error && doc.containsKey("access_token")) {
accessToken = doc["access_token"].as<String>();
long expireSeconds = doc["expires_in"].as<long>();
tokenExpireTime = millis() + expireSeconds * 1000;
logPrintln("✅ Token 获取成功");
return true;
}
}
}
logPrintln("❌ Token 获取失败");
return false;
}
/************************ 百度 ASR 识别 ************************/
void baiduASR() {
if (currentState != STATE_ASR) return;
logPrintln("🔊 开始 ASR 识别...");
if (!getBaiduToken() || !SD.exists(RECORD_FILE_PATH)) {
currentState = STATE_IDLE;
return;
}
File recFile = SD.open(RECORD_FILE_PATH, FILE_READ);
if (!recFile) {
currentState = STATE_IDLE;
return;
}
const size_t chunkSize = 4096;
uint8_t chunk[chunkSize];
speechBase64 = "";
while (recFile.available() > 0) {
size_t bytesRead = recFile.read(chunk, chunkSize);
speechBase64 += base64::encode(chunk, bytesRead);
}
recFile.close();
DynamicJsonDocument reqDoc(4096);
reqDoc["format"] = "raw";
reqDoc["rate"] = SAMPLE_RATE;
reqDoc["dev_pid"] = 1537;
reqDoc["speech"] = speechBase64;
reqDoc["cuid"] = WiFi.macAddress();
reqDoc["len"] = getFileSize(RECORD_FILE_PATH);
String postBody;
serializeJson(reqDoc, postBody);
String requestUrl = String(BAIDU_ASR_URL) + "?access_token=" + accessToken;
if (client.connect("vop.baidu.com", 443)) {
client.print("POST " + requestUrl + " HTTP/1.1\r\n");
client.print("Host: vop.baidu.com\r\n");
client.print("Content-Type: application/json\r\n");
client.print("Content-Length: " + String(postBody.length()) + "\r\n\r\n");
client.print(postBody);
while (client.connected() || client.available()) {
if (client.available()) {
response += client.readString();
}
}
client.stop();
int jsonStart = response.indexOf("{");
if (jsonStart != -1) {
DynamicJsonDocument resDoc(1024);
DeserializationError error = deserializeJson(resDoc, response.substring(jsonStart));
if (!error && resDoc["err_no"].as<int>() == 0) {
asrText = resDoc["result"][0].as<String>();
logPrintln("✅ ASR 识别成功:" + asrText);
currentState = STATE_COZE;
} else {
logPrintln("❌ ASR 识别失败");
currentState = STATE_IDLE;
}
}
}
}
/************************ Coze AI 对话 ************************/
// Extracts the bot's reply from a parsed Coze message-list response.
// Returns an error string built from "msg" when "code" is non-zero, the "content" of
// the first entry in "data" whose "type" is "answer", or "无回复" when none exists.
String processCozeAnswer(DynamicJsonDocument& resDoc) {
  const int code = resDoc["code"].as<int>();
  if (code != 0) {
    String failure = "❌ Coze 错误:";
    failure += resDoc["msg"].as<String>();
    return failure;
  }
  for (JsonVariant entry : resDoc["data"].as<JsonArray>()) {
    const String kind = entry["type"].as<String>();
    if (kind == "answer") {
      return entry["content"].as<String>();
    }
  }
  return "无回复";
}
String getCozeChatResult(String conversationId, String chatId) {
String retrieveUrl = "/v3/chat/retrieve?conversation_id=" + conversationId + "&chat_id=" + chatId;
String msgListUrl = "/v3/chat/message/list?chat_id=" + chatId + "&conversation_id=" + conversationId + "&bot_id=" + String(COZE_BOT_ID) + "&task_id=" + chatId;
int maxRetries = 20;
for (int retry = 0; retry < maxRetries; retry++) {
logPrintln("🤔 Coze 轮询中...");
if (client.connect(COZE_API_DOMAIN, COZE_API_PORT)) {
client.print("GET " + retrieveUrl + " HTTP/1.1\r\n");
client.print("Host: " + String(COZE_API_DOMAIN) + "\r\n");
client.print("Authorization: Bearer " + String(COZE_API_KEY) + "\r\n");
client.print("Connection: close\r\n\r\n");
while (client.connected() || client.available()) {
if (client.available()) retrieveResp += client.readString();
}
client.stop();
int jsonStart = retrieveResp.indexOf("{");
if (jsonStart != -1) {
DynamicJsonDocument resDoc(1024);
DeserializationError error = deserializeJson(resDoc, retrieveResp.substring(jsonStart));
if (!error && resDoc["code"].as<int>() == 0) {
String status = resDoc["data"]["status"].as<String>();
if (status == "completed") {
if (client.connect(COZE_API_DOMAIN, COZE_API_PORT)) {
client.print("GET " + msgListUrl + " HTTP/1.1\r\n");
client.print("Host: " + String(COZE_API_DOMAIN) + "\r\n");
client.print("Authorization: Bearer " + String(COZE_API_KEY) + "\r\n");
client.print("Connection: close\r\n\r\n");
while (client.connected() || client.available()) {
if (client.available()) msgResp += client.readString();
}
client.stop();
int msgJsonStart = msgResp.indexOf("{");
if (msgJsonStart != -1) {
DynamicJsonDocument msgDoc(2048);
DeserializationError msgError = deserializeJson(msgDoc, msgResp.substring(msgJsonStart));
if (!msgError) {
return processCozeAnswer(msgDoc);
}
}
}
} else if (status == "failed") {
return "❌ Coze 任务失败";
}
}
}
}
delay(1000);
}
return "❌ Coze 轮询超时";
}
void callCozeAI() {
if (currentState != STATE_COZE || asrText.length() == 0) return;
logPrintln("🤖 调用 Coze AI:" + asrText);
if (!checkWiFi()) {
currentState = STATE_IDLE;
return;
}
DynamicJsonDocument reqDoc(1024);
reqDoc["bot_id"] = COZE_BOT_ID;
reqDoc["user_id"] = COZE_USER_ID;
reqDoc["stream"] = false;
reqDoc["auto_save_history"] = true;
JsonArray messages = reqDoc.createNestedArray("additional_messages");
JsonObject userMsg = messages.createNestedObject();
userMsg["role"] = "user";
userMsg["content"] = asrText;
userMsg["content_type"] = "text";
String postBody;
serializeJson(reqDoc, postBody);
if (client.connect(COZE_API_DOMAIN, COZE_API_PORT)) {
client.print("POST /v3/chat HTTP/1.1\r\n");
client.print("Host: " + String(COZE_API_DOMAIN) + "\r\n");
client.print("Authorization: Bearer " + String(COZE_API_KEY) + "\r\n");
client.print("Content-Type: application/json\r\n");
client.print("Content-Length: " + String(postBody.length()) + "\r\n");
client.print("Connection: close\r\n\r\n");
client.print(postBody);
while (client.connected() || client.available()) {
if (client.available()) response += client.readString();
}
client.stop();
int jsonStart = response.indexOf("{");
if (jsonStart != -1) {
DynamicJsonDocument resDoc(1024);
DeserializationError error = deserializeJson(resDoc, response.substring(jsonStart));
if (!error && resDoc["code"].as<int>() == 0) {
String chatId = resDoc["data"]["id"].as<String>();
String conversationId = resDoc["data"]["conversation_id"].as<String>();
logPrintln("✅ Coze 对话创建成功");
cozeReply = getCozeChatResult(conversationId, chatId);
logPrintln("✅ Coze 回复:" + cozeReply);
currentState = STATE_TTS;
}
}
}
}
/************************ 百度 TTS 合成 + 播放 ************************/
void baiduTTSAndPlay() {
if (currentState != STATE_TTS || cozeReply.length() == 0) return;
logPrintln("🎤 开始 TTS 合成:" + cozeReply);
if (!getBaiduToken()) {
currentState = STATE_IDLE;
return;
}
String encodedText = urlEncode(cozeReply);
String ttsParams = "tex=" + encodedText + "&lan=zh&cuid=" + WiFi.macAddress() + "&ctp=1&tok=" + accessToken + "&spd=5&pit=5&vol=15&per=0";
String requestUrl = String(BAIDU_TTS_URL) + "?" + ttsParams;
if (SD.exists(TTS_FILE_PATH)) {
SD.remove(TTS_FILE_PATH);
}
File ttsFile = SD.open(TTS_FILE_PATH, FILE_WRITE);
if (!ttsFile) {
logPrintln("❌ 打开 TTS 文件失败!");
currentState = STATE_IDLE;
return;
}
if (client.connect("tsn.baidu.com", 443)) {
client.print("GET " + requestUrl + " HTTP/1.1\r\n");
client.print("Host: tsn.baidu.com\r\n");
client.print("Connection: close\r\n\r\n");
bool headerEnd = false;
while (client.connected() || client.available()) {
if (client.available()) {
String line = client.readStringUntil('\n');
if (headerEnd) {
ttsFile.write((const uint8_t*)line.c_str(), line.length());
}
if (line == "\r") {
headerEnd = true;
}
}
}
client.stop();
ttsFile.close();
uint64_t ttsFileSize = getFileSize(TTS_FILE_PATH);
if (SD.exists(TTS_FILE_PATH) && ttsFileSize > 100) {
logPrintln("🎵 开始播放 TTS 语音...");
currentState = STATE_PLAYING;
File playFile = SD.open(TTS_FILE_PATH, FILE_READ);
if (playFile) {
i2s_start(I2S_NUM_1);
size_t bytesRead;
uint8_t playBuffer[1024];
while (playFile.available() > 0 && currentState == STATE_PLAYING) {
bytesRead = playFile.read(playBuffer, sizeof(playBuffer));
i2s_write(I2S_NUM_1, playBuffer, bytesRead, &bytesRead, portMAX_DELAY);
}
playFile.close();
i2s_stop(I2S_NUM_1);
}
logPrintln("🎵 TTS 播放完成");
currentState = STATE_IDLE;
}
}
}
/************************ 串口指令解析 ************************/
void parseSerialCommand() {
if (Serial.available() > 0) {
String input = Serial.readStringUntil('\n');
input.trim();
if (input.length() == 0) return;
logPrintln("🗣️ 串口输入:" + input);
if (input == "1") {
startRecording();
} else if (input == "3") {
if (SD.exists(RECORD_FILE_PATH)) {
uint64_t size = getFileSize(RECORD_FILE_PATH);
logPrintln("📋 录音文件信息:大小=" + String(size) + "字节");
}
} else if (input == "q") {
logPrintln("❌ 退出程序");
while (1);
} else {
if (currentState == STATE_IDLE) {
asrText = input;
currentState = STATE_COZE;
}
}
}
}
/************************ 初始化 ************************/
// One-time startup: serial console and banner, SD card (halts with a repeating error
// if it cannot be mounted), playback I2S, WiFi (waits until connected), TLS client,
// and an initial Baidu token fetch. Leaves the state machine idle.
void setup() {
  Serial.begin(115200);
  delay(1000);

  const char* bannerLines[] = {
    "=====================================",
    " ESP32 语音 AI 对话机器人 ",
    "=====================================",
    "📋 支持指令:1-开始语音,3-查文件,q-退出",
    "=====================================\n"
  };
  for (const char* bannerLine : bannerLines) {
    logPrintln(bannerLine);
  }

  if (!initSDCard()) {
    for (;;) {
      logPrintln("❌ SD 卡初始化失败");
      delay(1000);
    }
  }
  initI2SPlay();

  WiFi.begin(WIFI_SSID, WIFI_PASS);
  logPrintln("连接 WiFi...");
  for (; WiFi.status() != WL_CONNECTED; Serial.print(".")) {
    delay(500);
  }
  logPrintln("\n✅ WiFi 连接成功!");

  // setInsecure() turns off server certificate verification for every HTTPS request.
  client.setInsecure();
  getBaiduToken();
  currentState = STATE_IDLE;
  logPrintln("✅ 系统初始化完成,等待指令...");
}
/************************ 主循环 ************************/
// Main loop: handle one pending serial command, run the stage that matches the
// current state (ASR, Coze chat, or TTS + playback), then pause 100 ms.
void loop() {
  parseSerialCommand();
  if (currentState == STATE_ASR) {
    baiduASR();
  } else if (currentState == STATE_COZE) {
    callCozeAI();
  } else if (currentState == STATE_TTS) {
    baiduTTSAndPlay();
  }
  delay(100);
}
5. 调试与优化建议
在实际部署中,可能会遇到以下问题:
- 延迟问题:云端 API 调用存在网络延迟。若对实时性要求极高,可考虑引入 WebSocket 长连接替代 HTTP 轮询。
- 功耗管理:ESP32 在空闲时可进入深度睡眠模式,唤醒后再恢复网络,适合电池供电场景。
- 隐私保护:语音数据上传至云端前,建议在本地进行脱敏处理,或选择支持私有化部署的模型方案。
这套方案不仅降低了硬件成本,还让开发者能快速体验大模型在边缘设备上的落地潜力。根据实际需求调整参数,即可构建出个性化的语音助手。

