搭建支持情感控制的二次封装 TTS 服务

第一阶段：环境准备与模型部署

1. 创建项目并安装核心依赖

打开终端，执行以下命令：

# 1. Create the project directory and enter it
mkdir MyEmotionalTTS && cd MyEmotionalTTS

# 2. Create a Python virtual environment (recommended)
python -m venv venv

# Activate it so that the `pip install` commands below install into the venv
# rather than into the system interpreter (Linux/macOS syntax shown).
# On Windows run this instead: venv\Scripts\activate
source venv/bin/activate

# 3. Install PyTorch (pick the index URL matching your CUDA version; CUDA 12.1 shown)
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121

# 4. Install ChatTTS and the other dependencies.
#    numpy, scipy and flask are imported by the scripts later in this guide,
#    so they are installed explicitly instead of relying on transitive installs.
pip install ChatTTS transformers soundfile ipython numpy scipy flask

2. 下载并初始化 ChatTTS 模型

创建一个名为 init_model.py 的脚本，写入以下代码：

import ChatTTS
import torch
import warnings

# Silence third-party warnings so the load log stays readable.
warnings.filterwarnings("ignore")

# Create the ChatTTS engine object.
chat = ChatTTS.Chat()

# Load the model (weights are downloaded automatically, about 2 GB).
# `compile=False` avoids errors seen in some environments.
# NOTE(review): later ChatTTS releases expose `load()` where earlier ones had
# `load_models()`; use whichever the installed version provides — confirm
# against the installed package.
load_fn = getattr(chat, "load", None) or chat.load_models
load_fn(compile=False)

print("模型加载成功！")
# `device` may not exist on every ChatTTS release, so do not let a purely
# informational print crash the verification script.
print(f"设备：{getattr(chat, 'device', '未知')}")

# Switch to inference mode only when the engine actually offers `eval()`.
# NOTE(review): `ChatTTS.Chat` does not appear to define `eval()` in the
# published releases; an unguarded call would raise AttributeError — confirm.
if hasattr(chat, "eval"):
    chat.eval()

# The loaded engine is not persisted here: the wrapper class introduced in the
# next stage creates and owns its own `ChatTTS.Chat` instance.

运行它来下载和验证模型：

python init_model.py

第二阶段：核心封装与情感控制接口

创建核心封装模块 EmotionalTTS.py（其中定义 EmotionalTTS 类），这是二次封装的精髓。

 ChatTTS
 torch
 numpy  np
 soundfile  sf
 typing  , , 
 warnings

warnings.filterwarnings()

 :
    
     ():
        
        .chat = ChatTTS.Chat()
        
         device:
            .chat.load_models(=, device=device)
        :
            .chat.load_models(=)
        
        
        .chat.()
        
        
        .emotion_params_map = {
            : {: , : },      
            : {: , : },        
            : {: , : },      
            : {: , : },    
            : {: , : }    
        }
        ()

     () -> :
        
        
        texts = [text]
        
        
        params = .emotion_params_map.get(emotion, .emotion_params_map[])
        
        
        rand_spk = np.random.randint(, )  speaker_embedding    

        
         torch.no_grad():
            wavs, _ = .chat.infer(
                texts,
                params_refine_text={: },  
                params_infer_code={
                    : speaker_embedding,
                    : rand_spk,
                    : params[],
                },
                do_text_normalization=,
                return_duration=
            )
        
        audio_data = wavs.squeeze()  
        
        
         speed != :
             scipy  signal
            new_length = ((audio_data) / speed)
            audio_data = signal.resample(audio_data, new_length)
        
        
         save_path:
              save_path.endswith():
                save_path += 
            sf.write(save_path, audio_data, samplerate=sample_rate)
            ()
        
         audio_data, sample_rate

     () -> []:
        
         os
        os.makedirs(save_dir, exist_ok=)
        
         emotions  :
            emotions = [] * (texts)
            
        file_paths = []
         i, (text, emotion)  ((texts, emotions)):
            ()
            save_path = os.path.join(save_dir, )
            .synthesize(text, emotion=emotion, save_path=save_path)
            file_paths.append(save_path)
         file_paths

     () -> []:
        
         (.emotion_params_map.keys())

    
     () -> np.ndarray:
        
        
        
        
        ()
         np.random.randn(, ).astype(np.float32)

import os

import numpy as np
import soundfile as sf
import torch

from EmotionalTTS import EmotionalTTS

try:
    # Optional dependency used only for direct playback
    # (install with: pip install simpleaudio).
    import simpleaudio as sa
except ImportError:
    # Playback is a convenience; the demo still writes every WAV file.
    sa = None


def _play(audio_data, sample_rate):
    """Play a float waveform through simpleaudio as 16-bit mono PCM.

    The call below declares 2 bytes per sample, so the float samples are
    clipped to [-1, 1] and scaled to int16 first; handing the float32 buffer
    over directly would be played back as noise.
    """
    pcm = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    play_obj = sa.play_buffer(np.ascontiguousarray(pcm), 1, 2, sample_rate)
    play_obj.wait_done()


def main():
    """Run the single-sentence, batch and custom-emotion demos."""
    # 1. Initialise the engine; use the GPU when one is available, else CPU.
    print("=" * 50)
    print("初始化情感 TTS 引擎...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tts_engine = EmotionalTTS(device=device)

    # 2. Show the supported emotions.
    print("支持的情感:", tts_engine.get_available_emotions())
    print("=" * 50)

    # 3. Single-sentence example: the same text with different emotions.
    test_text = "你好，世界！这是一个测试，看看情感语音合成效果怎么样。"

    for emo in ['neutral', 'happy', 'sad', 'angry']:
        print(f"\n>>> 正在用「{emo}」情感合成...")
        audio_data, sr = tts_engine.synthesize(
            text=test_text,
            emotion=emo,
            speed=1.0 if emo != 'sad' else 0.9,  # slow down for sadness
            save_path=f"./output/demo_{emo}.wav",
        )

        # Try to play the result; playback is best-effort because it depends
        # on an optional package and on the machine having an audio device.
        played = False
        if sa is not None:
            try:
                _play(audio_data, sr)
                played = True
            except Exception:
                played = False
        if not played:
            print(f"音频已保存，如需播放请查看文件：demo_{emo}.wav")

    # 4. Batch example.
    print("\n" + "=" * 50)
    print("开始批量合成示例...")
    batch_texts = [
        "早上好，今天天气真不错。",
        "我对此感到非常失望。",
        "太棒了！我们终于成功了！",
        "请立即离开这个地方。",
    ]
    batch_emotions = ['friendly', 'sad', 'happy', 'angry']

    saved_files = tts_engine.batch_synthesize(
        texts=batch_texts,
        emotions=batch_emotions,
        save_dir="./output/batch",
    )
    print(f"批量合成完成，共生成 {len(saved_files)} 个文件。")

    # 5. Advanced: register a custom emotion by editing the map directly.
    print("\n" + "=" * 50)
    print("高级：自定义情感参数...")
    tts_engine.emotion_params_map['whisper'] = {'temperature': 0.2, 'spk_emb': None}
    audio_custom, _ = tts_engine.synthesize(
        "这是一个秘密，我只告诉你一个人。",
        emotion='whisper',
        save_path="./output/whisper_secret.wav",
    )
    print("自定义情感「whisper」合成完成。")


if __name__ == "__main__":
    # Make sure the output directories exist before anything is written.
    os.makedirs("./output", exist_ok=True)
    os.makedirs("./output/batch", exist_ok=True)
    main()
    print("\n所有测试完成！请检查 './output' 目录下的音频文件。")

import io
import math
import os
import uuid

import numpy as np
import soundfile as sf
from flask import Flask, request, jsonify, send_file

from EmotionalTTS import EmotionalTTS

app = Flask(__name__)
tts_engine = None


def init_engine():
    """Create the module-level TTS engine used by every request."""
    global tts_engine
    print("正在加载 TTS 模型...")
    tts_engine = EmotionalTTS(device='cpu')  # the API service runs on CPU
    print("模型加载完毕，API 服务就绪。")


# Load the model once at import time so request handlers can use it directly.
init_engine()


@app.route('/synthesize', methods=['POST'])
def synthesize():
    """API endpoint: convert text to speech and return a WAV attachment.

    Expects a JSON object with ``text`` (required), ``emotion`` (optional,
    default ``neutral``) and ``speed`` (optional, default 1.0). Responds with
    400 for invalid input and 500 when synthesis fails.
    """
    # `silent=True` yields None instead of raising when the body is missing
    # or is not valid JSON, so bad requests get a clean 400 rather than a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': '请求体必须是 JSON 对象'}), 400

    text = data.get('text', '')
    if not isinstance(text, str) or not text:
        return jsonify({'error': '文本内容不能为空'}), 400

    try:
        speed = float(data.get('speed', 1.0))
    except (TypeError, ValueError):
        return jsonify({'error': 'speed 必须是大于 0 的数字'}), 400
    if not math.isfinite(speed) or speed <= 0:
        return jsonify({'error': 'speed 必须是大于 0 的数字'}), 400

    # Only registered emotion names reach the engine and the download file
    # name; anything else is treated as 'neutral', so client-supplied text is
    # never interpolated into the response header.
    emotion = data.get('emotion', 'neutral')
    if emotion not in tts_engine.get_available_emotions():
        emotion = 'neutral'

    try:
        audio_data, sr = tts_engine.synthesize(
            text=text,
            emotion=emotion,
            speed=speed,
        )

        # Serialise the waveform into an in-memory WAV stream.
        audio_bytes = io.BytesIO()
        sf.write(audio_bytes, audio_data, samplerate=sr, format='WAV')
        audio_bytes.seek(0)

        # Alternative for production: write the audio to a file named with
        # uuid.uuid4() under ./audio_cache and return its URL as JSON instead
        # of streaming the bytes.

        return send_file(
            audio_bytes,
            mimetype='audio/wav',
            as_attachment=True,
            download_name=f'speech_{emotion}.wav',
        )
    except Exception as e:
        return jsonify({'error': f'合成失败：{str(e)}'}), 500


@app.route('/emotions', methods=['GET'])
def list_emotions():
    """Return the list of supported emotions."""
    return jsonify({'emotions': tts_engine.get_available_emotions()})


if __name__ == '__main__':
    os.makedirs('./audio_cache', exist_ok=True)
    # In production serve the app with waitress or gunicorn.
    # Debug mode is opt-in (FLASK_DEBUG=1): this server listens on all
    # interfaces, and the interactive debugger must never be reachable from
    # other machines.
    debug_enabled = os.environ.get('FLASK_DEBUG') == '1'
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled, use_reloader=False)

搭建支持情感控制的二次封装 TTS 服务