一、环境准备
在开始之前,请先安装所需依赖包:
pip install openai-whisper transformers pydub librosa tqdm torch ffmpeg-python modelscope
⚠️ 需要提前安装 FFmpeg(Windows 用户请到 ffmpeg.org 下载并配置环境变量)
二、项目功能概述
本项目实现的流程如下:
- 提取视频音频(使用 FFmpeg)
- 验证音频文件是否可用(使用 pydub)
- 使用 Whisper 模型进行语音识别
- 自动检测音频语言
- 使用 Transformers 翻译模型进行中英文互译
- 生成双语字幕文件 .srt
三、国内下载模型方法
from modelscope import snapshot_download

# Fetch both translation models (en->zh and zh->en) into the current directory.
for model_id in ('Helsinki-NLP/opus-mt-en-zh', 'Helsinki-NLP/opus-mt-zh-en'):
    model_dir = snapshot_download(model_id, cache_dir='./')
    print(f"✅ 模型已下载到当前目录:{model_dir}")
四、完整代码(含中文注释)
import whisper
import warnings
from datetime import timedelta
from tqdm import tqdm
import librosa
import time
from transformers import pipeline
import torch
from pydub import AudioSegment
import os
import ffmpeg
# Silence UserWarning noise emitted by the imported libraries.
warnings.filterwarnings("ignore", category=UserWarning)

# NOTE(review): the literals on these lines were lost when the article was
# extracted. The file names and the mono / 16 kHz settings below are
# reconstructed assumptions -- replace them with your own values. TODO confirm.
input_video = "input.mp4"
output_audio = "output.wav"

# Extract the audio track from the video with FFmpeg.
# ac = number of audio channels, ar = sample rate in Hz.
ffmpeg.input(input_video).output(output_audio, ac=1, ar=16000).run()
def format_timestamp(seconds):
    """Convert a time offset in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Non-negative offset in seconds (int or float).

    Returns:
        The offset formatted as ``HH:MM:SS,mmm``. Anything finer than one
        millisecond is rounded down.
    """
    td = timedelta(seconds=seconds)
    hours, remainder = divmod(td.seconds, 3600)
    # td.seconds wraps at 24 h, so fold whole days back into the hour field.
    hours += td.days * 24
    minutes, secs = divmod(remainder, 60)
    milliseconds = td.microseconds // 1000
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
def validate_audio(audio_file):
    """Check that ``audio_file`` can be decoded and print its basic properties.

    Args:
        audio_file: Path of the audio file to probe.

    Returns:
        A ``(is_valid, duration)`` tuple. ``duration`` is the length in
        seconds when the file loads, and ``0`` when it does not.
    """
    # NOTE(review): the message texts and the duration returned on failure
    # were lost in extraction; the values used here are reconstructions.
    # TODO confirm against the original article.
    try:
        audio = AudioSegment.from_file(audio_file)
    except Exception as e:  # best-effort probe: any decode failure means "invalid"
        print(f"❌ 音频文件验证失败:{e}")
        return False, 0

    duration = len(audio) / 1000  # len() of an AudioSegment is in milliseconds
    sample_rate = audio.frame_rate
    channels = audio.channels
    print(f"✅ 音频文件有效:时长 {duration:.2f} 秒,采样率 {sample_rate} Hz,声道数 {channels}")
    return True, duration
# NOTE(review): every string/number literal in this section was lost in
# extraction. The model paths assume the two models downloaded into the
# current directory earlier in this article; the Whisper model sizes, the
# progress-bar settings and all message texts are reconstructed assumptions.
# TODO confirm against the original article.

# Load one translation pipeline per direction.
print("🔄 正在加载翻译模型...")
translator_en_to_zh = pipeline("translation", model="./Helsinki-NLP/opus-mt-en-zh")
translator_zh_to_en = pipeline("translation", model="./Helsinki-NLP/opus-mt-zh-en")
print("✅ 翻译模型加载完成")

# Make sure the extracted audio is readable before spending time on it.
audio_file = output_audio
is_valid, duration = validate_audio(audio_file)
if not is_valid:
    raise ValueError(f"音频文件无效:{audio_file}")

# Prefer librosa's duration; keep the pydub value if librosa cannot read the file.
try:
    duration = librosa.get_duration(path=audio_file)
except Exception as e:
    print(f"⚠️ librosa 获取时长失败,沿用 pydub 的结果:{e}")
print(f"🎵 音频时长:{duration:.2f} 秒")

# Load the speech-recognition model, falling back to a smaller one on failure.
try:
    model = whisper.load_model("large-v3")
except Exception as e:
    print(f"⚠️ 加载 large-v3 模型失败,改用 base 模型:{e}")
    model = whisper.load_model("base")

# Module-level progress bar, completed and closed by transcribe_with_progress().
progress_bar = tqdm(total=100, desc="转录进度", unit="%")
def transcribe_with_progress(model, audio_file, language=None):
    """Transcribe ``audio_file`` with Whisper and report the language used.

    Completes and closes the module-level ``progress_bar`` when done.

    Args:
        model: A loaded Whisper model.
        audio_file: Path of the audio file to transcribe.
        language: Whisper language code to force (e.g. ``"zh"``). When
            ``None`` the language is detected from the audio.

    Returns:
        A ``(result, detected_language)`` tuple, where ``result`` is the dict
        returned by ``model.transcribe``.

    Raises:
        Exception: Whatever ``model.transcribe`` raised, after it is logged.
    """
    # NOTE(review): message texts were lost in extraction and are
    # reconstructed here. TODO confirm against the original article.
    start_time = time.time()

    if language is None:
        try:
            audio = whisper.load_audio(audio_file)
            # detect_language needs a fixed 30 s window; without this step the
            # call fails for any other length and detection never succeeds.
            audio = whisper.pad_or_trim(audio)
            # Take the mel-bin count from the model: it differs between sizes.
            mel = whisper.log_mel_spectrogram(
                audio, n_mels=model.dims.n_mels
            ).to(model.device)
            _, probs = model.detect_language(mel)
            detected_language = max(probs, key=probs.get)
            print(f"🌐 检测到语言:{detected_language}")
        except Exception as e:
            # Leave the language unset so transcribe() runs its own detection
            # instead of forcing a guessed language code.
            print(f"⚠️ 语言检测失败,将由 Whisper 自动判断:{e}")
            detected_language = None
    else:
        detected_language = language
        print(f"🌐 使用指定语言:{detected_language}")

    try:
        result = model.transcribe(audio_file, language=detected_language)
    except Exception as e:
        print(f"❌ 转录失败:{e}")
        progress_bar.close()
        # Re-raise: there is no result to return, and continuing would only
        # fail later with an unrelated NameError.
        raise

    # Report the language Whisper actually used when it was not known before.
    detected_language = result.get("language") or detected_language

    # Jump the bar to its end regardless of how much was reported so far.
    progress_bar.update(progress_bar.total - progress_bar.n)
    progress_bar.close()

    elapsed_time = time.time() - start_time
    print(f"✅ 转录完成,耗时 {elapsed_time:.2f} 秒")
    return result, detected_language
# NOTE(review): the literals in this section were lost in extraction. The
# output file name, the `language=None` argument and the message text are
# reconstructed assumptions. TODO confirm against the original article.
result, detected_language = transcribe_with_progress(model, audio_file, language=None)

srt_path = "output.srt"
with open(srt_path, "w", encoding="utf-8") as f:
    # SRT cues are numbered from 1.
    for i, segment in enumerate(result["segments"], start=1):
        start_time = format_timestamp(segment["start"])
        end_time = format_timestamp(segment["end"])
        text = segment["text"].strip()

        # Keep the recognised text as-is and translate it into the other language.
        if detected_language == "zh":
            zh_text = text
            en_text = translator_zh_to_en(text)[0]["translation_text"]
        else:
            zh_text = translator_en_to_zh(text)[0]["translation_text"]
            en_text = text

        # One cue: index, time range, Chinese line, English line, blank line.
        f.write(f"{i}\n")
        f.write(f"{start_time} --> {end_time}\n")
        f.write(f"{zh_text}\n{en_text}\n\n")

print(f"✅ 双语字幕已生成:{srt_path}")


