from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
import whisper
import tempfile
import os
import numpy as np
from scipy import signal
import librosa
import uvicorn
import soundfile as sf
# ASGI application object; every route below is registered on it.
app = FastAPI(
    version="1.0.0",
    title="Whisper 音频转录 API",
    description="基于 OpenAI Whisper 的高级音频转录服务",
)
# Module-level cache for the loaded Whisper model; stays None until first use.
model = None


def load_whisper_model():
    """Return the shared Whisper model, loading it on the first call.

    The first call loads the "large" checkpoint through
    ``whisper.load_model`` and stores it in the module-level ``model``.
    Later calls return that stored object without loading again.
    """
    global model
    if model is not None:
        return model
    model = whisper.load_model("large")
    return model
def preprocess_audio(audio_path):
    """Resample, high-pass filter and peak-normalise an audio file.

    The file is loaded at 16 kHz, run through a 4th-order 100 Hz Butterworth
    high-pass filter (applied forwards and backwards with ``filtfilt``) and
    scaled so that its largest absolute sample is 1. The result is written to
    a new temporary WAV file.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        Path of the newly written temporary ``.wav`` file, which the caller
        is expected to delete, or ``audio_path`` unchanged if any step fails.
    """
    temp_path = None
    try:
        y, sr = librosa.load(audio_path, sr=16000)
        b, a = signal.butter(4, 100, 'highpass', fs=sr)
        y = signal.filtfilt(b, a, y)
        peak = np.max(np.abs(y))
        # A silent clip has a zero peak; dividing by it would turn every
        # sample into NaN, so leave such a signal unscaled.
        if peak > 0:
            y = y / peak
        # mkstemp creates the file atomically, avoiding the name race of the
        # deprecated tempfile.mktemp.
        fd, temp_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        sf.write(temp_path, y, sr)
        return temp_path
    except Exception as e:
        # Preprocessing is best-effort: report the problem and let the caller
        # continue with the untouched input file.
        print(f"音频预处理失败:{str(e)}")
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass  # nothing more can be done about a stale temp file
        return audio_path
@app.on_event("startup")
async def startup_event():
    """Load the Whisper model when the application starts.

    Prints a progress message before and after calling
    ``load_whisper_model``.
    """
    print("正在加载 Whisper 模型...")
    load_whisper_model()
    print("Whisper 模型加载完成!")
@app.post("/transcribe", summary="音频转录", description="上传音频文件并返回转录文本")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file to text.

    The upload is saved to a temporary file, run through
    ``preprocess_audio`` and passed to the Whisper model with
    ``language="zh"``.

    Args:
        file: Uploaded audio file. Its extension must be one of .mp3, .wav,
            .ogg, .m4a, .flac, .aac or .m4b.

    Returns:
        A ``JSONResponse`` with the keys ``status``, ``text``, ``language``
        and ``file_name``.

    Raises:
        HTTPException: 400 when the extension is missing or unsupported;
            500 when saving, preprocessing or transcription fails.
    """
    valid_extensions = {'.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac', '.m4b'}
    # An upload may arrive without a filename; treat that as "no extension"
    # so the client gets the 400 below instead of a TypeError from splitext.
    file_extension = os.path.splitext(file.filename or "")[1].lower()
    if file_extension not in valid_extensions:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型:{file_extension}。请上传音频文件:{valid_extensions}"
        )

    temp_path = None
    processed_audio = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
            # Record the path before writing so the finally block can remove
            # the file even if reading or writing the upload fails.
            temp_path = temp_file.name
            # Copy in 1 MiB chunks rather than holding the whole upload in memory.
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                temp_file.write(chunk)

        # The temp file is closed (and flushed) here, before it is read back.
        processed_audio = preprocess_audio(temp_path)
        whisper_model = load_whisper_model()
        # NOTE(review): transcribe() is a synchronous call made directly from
        # this coroutine, so the event loop is busy until it returns.
        result = whisper_model.transcribe(
            processed_audio,
            language="zh",
            task="transcribe",
            beam_size=5,
            best_of=5,
            temperature=0.0,
            patience=1.0,
            suppress_tokens=[-1]
        )
        return JSONResponse(content={
            "status": "success",
            "text": result["text"],
            "language": result.get("language", "zh"),
            "file_name": file.filename
        })
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"转录过程中出现错误:{str(e)}") from e
    finally:
        # preprocess_audio returns temp_path itself when it falls back; the
        # exists() check keeps that path from being unlinked twice.
        for path in (temp_path, processed_audio):
            if path and os.path.exists(path):
                os.unlink(path)
@app.get("/health", summary="健康检查", description="检查服务是否正常运行")
async def health_check():
    """Report that the service is up and whether the model is loaded."""
    is_loaded = model is not None
    payload = {"status": "healthy", "model_loaded": is_loaded}
    return JSONResponse(content=payload)
@app.get("/", summary="根端点", description="API 基本信息")
async def root():
    """Return basic information about the API: name, version and routes."""
    endpoints = {
        "transcribe": "/transcribe (POST)",
        "health": "/health (GET)",
        "docs": "/docs (GET)",
        "redoc": "/redoc (GET)",
    }
    info = {
        "message": "Whisper 音频转录 API 服务",
        "version": "1.0.0",
        "endpoints": endpoints,
        "model": "large",
        "supported_languages": "近百种语言,支持中文转录",
    }
    return info
if __name__ == "__main__":
    # Run as a script: serve the app on all interfaces, port 7862.
    uvicorn.run(app, port=7862, host="0.0.0.0")
# Container image for the Whisper FastAPI transcription service.
# Base: NVIDIA CUDA 12.4.1 with cuDNN (devel variant) on Ubuntu 22.04.
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
# Set the working directory
WORKDIR /app
# Point apt at the Aliyun mirror in place of the default archive and security hosts
RUN sed -i 's/archive.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list && \
sed -i 's/security.ubuntu.com/mirrors.aliyun.com/g' /etc/apt/sources.list
# Install system dependencies (ffmpeg, Python 3, pip, venv), drop the apt
# package lists afterwards, and make `python` resolve to python3
RUN apt-get update && apt-get install -y \
ffmpeg \
python3 \
python3-pip \
python3-venv \
&& rm -rf /var/lib/apt/lists/* \
&& ln -sf /usr/bin/python3 /usr/bin/python
# Copy the dependency list on its own, before the application code
COPY requirements.txt .
# Install Python dependencies from the Aliyun PyPI mirror, without a pip cache
RUN pip install --no-cache-dir -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
# Copy the application code (the FastAPI module)
COPY whisper_fastapi.py .
# Expose the FastAPI port
EXPOSE 7862
# Environment variable: model cache path
# NOTE(review): nothing in the visible application code reads
# WHISPER_MODEL_CACHE; confirm that something actually consumes it.
ENV WHISPER_MODEL_CACHE=/root/.cache/whisper
# Start the application with uvicorn on all interfaces, port 7862
CMD ["uvicorn", "whisper_fastapi:app", "--host", "0.0.0.0", "--port", "7862"]
import gradio as gr
import whisper
import tempfile
import os
import numpy as np
from scipy import signal
import librosa
# Module-level cache for the loaded Whisper model; stays None until first use.
model = None


def load_whisper_model():
    """Return the shared Whisper model, loading it on the first call.

    The first call loads the "medium" checkpoint through
    ``whisper.load_model`` and stores it in the module-level ``model``.
    Later calls return that stored object without loading again.
    """
    global model
    if model is not None:
        return model
    model = whisper.load_model("medium")
    return model
def preprocess_audio(audio_path):
    """Resample, high-pass filter and peak-normalise an audio file.

    The file is loaded at 16 kHz, run through a 4th-order 100 Hz Butterworth
    high-pass filter (applied forwards and backwards with ``filtfilt``) and
    scaled so that its largest absolute sample is 1. The result is written to
    a new temporary WAV file.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        Path of the newly written temporary ``.wav`` file, which the caller
        is expected to delete, or ``audio_path`` unchanged if any step fails.
    """
    # Local import: the WAV writer is only needed by this function.
    from scipy.io import wavfile

    temp_path = None
    try:
        y, sr = librosa.load(audio_path, sr=16000)
        b, a = signal.butter(4, 100, 'highpass', fs=sr)
        y = signal.filtfilt(b, a, y)
        peak = np.max(np.abs(y))
        # A silent clip has a zero peak; dividing by it would turn every
        # sample into NaN, so leave such a signal unscaled.
        if peak > 0:
            y = y / peak
        # mkstemp creates the file atomically, avoiding the name race of the
        # deprecated tempfile.mktemp.
        fd, temp_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        # librosa.output.write_wav was removed in librosa 0.8, so the old call
        # always raised and preprocessing was silently skipped. Write the WAV
        # with SciPy instead, as 32-bit float samples.
        wavfile.write(temp_path, sr, y.astype(np.float32))
        return temp_path
    except Exception as e:
        # Preprocessing is best-effort: report the problem and let the caller
        # continue with the untouched input file.
        print(f"音频预处理失败:{str(e)}")
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                pass  # nothing more can be done about a stale temp file
        return audio_path
def transcribe_audio(audio_file):
    """Transcribe an audio file to text for the Gradio UI.

    Args:
        audio_file: Path of the uploaded audio file, or None when nothing
            was uploaded.

    Returns:
        The transcribed text on success; otherwise a user-facing error
        message string (for a missing upload or a failed transcription).
    """
    # Check the input first so an empty submit does not trigger a model load.
    if audio_file is None:
        return "错误:请上传一个音频文件。"

    whisper_model = load_whisper_model()
    processed_audio = None
    try:
        processed_audio = preprocess_audio(audio_file)
        result = whisper_model.transcribe(
            processed_audio,
            language="zh",
            task="transcribe",
            beam_size=5,
            best_of=5,
            temperature=0.0,
            patience=1.0,
            suppress_tokens=[-1]
        )
        return result["text"]
    except Exception as e:
        return f"转录过程中出现错误:{str(e)}"
    finally:
        # Remove the preprocessed copy whether or not transcription succeeded.
        # preprocess_audio returns the input path itself on failure, and that
        # file must not be deleted here.
        if processed_audio is not None and processed_audio != audio_file:
            try:
                os.unlink(processed_audio)
            except OSError:
                pass  # cleanup is best-effort
# Gradio UI: an upload widget and a button on the left, the transcript on the
# right, followed by usage notes.
with gr.Blocks(title="Whisper 音频转录") as demo:
    gr.Markdown("# 🎤 Whisper 音频转录")
    gr.Markdown("上传 MP3、WAV、OGG 等音频文件,使用优化的参数将其转换为文本")
    with gr.Row():
        with gr.Column():
            # type="filepath" hands transcribe_audio a path on disk.
            audio_input = gr.Audio(
                sources=["upload"],
                type="filepath",
                label="上传音频文件",
                interactive=True
            )
            submit_btn = gr.Button("开始转录", variant="primary")
        with gr.Column():
            text_output = gr.Textbox(
                label="转录结果",
                placeholder="转录文本将显示在这里...",
                lines=10,
                max_lines=15
            )
    # Clicking the button runs transcribe_audio on the uploaded file and puts
    # the returned string into the text box.
    submit_btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=text_output
    )
    # The note names the Medium model because load_whisper_model loads
    # "medium"; the previous text said "Large".
    gr.Markdown("""
### 使用说明
1. 点击'上传音频文件'或拖放文件到上传区域
2. 支持格式:MP3, WAV, OGG, M4A, FLAC 等
3. 点击'开始转录'按钮
4. 等待转录结果出现在右侧文本框中
**注意**:首次使用需要下载 Whisper Medium 模型,请耐心等待。转录过程可能需要较长时间。
""")
if __name__ == "__main__":
    # Run as a script: serve the UI on all interfaces, port 7862, with no
    # public share link.
    demo.launch(share=False, server_port=7862, server_name="0.0.0.0")