Files
Cyrene/backend/voice-service/internal/service/stt_service.go
T
AskaEth 6ef9e082a6 feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST)
- Gateway: VoiceStreamManager代理WS流式STT到voice-service
- Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码
- 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端)
- 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并
- 文档: 完善gateway-api.md和voice-service.md语音API文档
- 工具: scripts/voice/ 语音转换脚本集

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-06 11:50:40 +08:00

235 lines
6.4 KiB
Go

package service
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"git.yeij.top/AskaEth/Cyrene/voice-service/internal/config"
)
// SupportedLanguages is the list of language codes the STT service supports.
// Transcribe rejects any language that is not in this list.
var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"}
// STTService is the speech-to-text service.
// Offline transcription tries the DashScope REST API first and falls back to
// the local Whisper engine when that fails.
// Streaming transcription uses the DashScope Realtime WebSocket engine.
type STTService struct {
// Path of the Whisper executable; checked with os.Stat and run via exec.Command.
whisperBinary string
// Path of the Whisper model file; passed to the binary with -m.
whisperModel string
// Default language used when a caller passes an empty language.
language string
dashscope *DashScopeSTT // realtime ASR (default model: qwen3-asr-flash-realtime)
dashscopeREST *DashScopeRESTSTT // offline ASR (default model: qwen3-asr-flash-2026-02-10)
}
// NewSTTService builds an STTService from cfg.
//
// Empty model names in cfg are replaced by built-in defaults:
// "qwen3-asr-flash-realtime" for the realtime engine and
// "qwen3-asr-flash-2026-02-10" for the offline REST engine.
// Both DashScope clients are created with cfg.DashScopeAPIKey.
func NewSTTService(cfg *config.Config) *STTService {
	const (
		defaultRealtimeModel = "qwen3-asr-flash-realtime"
		defaultOfflineModel  = "qwen3-asr-flash-2026-02-10"
	)

	streamModel := cfg.DashScopeSTTRealtime
	if streamModel == "" {
		streamModel = defaultRealtimeModel
	}

	restModel := cfg.DashScopeModel
	if restModel == "" {
		restModel = defaultOfflineModel
	}

	svc := &STTService{
		whisperBinary: cfg.WhisperBinary,
		whisperModel:  cfg.WhisperModel,
		language:      cfg.WhisperLanguage,
	}
	// Realtime client first, then REST client — same construction order as before.
	svc.dashscope = NewDashScopeSTT(cfg.DashScopeAPIKey, streamModel)
	svc.dashscopeREST = NewDashScopeRESTSTT(cfg.DashScopeAPIKey, restModel)
	return svc
}
// IsAvailable reports whether at least one STT engine can be used.
// Engines are probed in order: DashScope REST, DashScope realtime, then the
// local Whisper engine; probing stops at the first one that is available.
func (s *STTService) IsAvailable() bool {
	return s.dashscopeREST.IsAvailable() ||
		s.dashscope.IsAvailable() ||
		s.whisperAvailable()
}
// whisperAvailable reports whether the local Whisper engine is actually usable:
// the configured binary and model file must both exist, and ffmpeg must be
// resolvable through PATH.
func (s *STTService) whisperAvailable() bool {
	// Binary first, then model — both must be stat-able.
	for _, required := range []string{s.whisperBinary, s.whisperModel} {
		if _, statErr := os.Stat(required); statErr != nil {
			return false
		}
	}
	_, lookErr := exec.LookPath("ffmpeg")
	return lookErr == nil
}
// Transcribe converts audio data to text.
//
// The DashScope REST offline model is tried first when it is available; if it
// is unavailable or the request fails, the local Whisper engine is used.
//
// Parameters:
//   - audioData: raw bytes of the audio clip.
//   - format: audio format name (e.g. "wav", "mp3"), forwarded to the engines.
//   - language: language code; an empty value selects the service default.
//     It must be one of SupportedLanguages.
//
// It returns the transcribed text, or an error when the language is not
// supported or every attempted engine failed. When DashScope REST and Whisper
// both fail, the returned error reports both causes and wraps the Whisper
// error, so the DashScope failure is not lost.
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
	if language == "" {
		language = s.language
	}
	if !isSupportedLanguage(language) {
		return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", language, strings.Join(SupportedLanguages, ", "))
	}

	// Prefer the DashScope REST offline model (low latency, no session negotiation).
	var restErr error
	if s.dashscopeREST.IsAvailable() {
		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		text, err := s.dashscopeREST.Transcribe(ctx, audioData, format, language)
		// Release the timeout context right away instead of keeping it alive
		// for the whole duration of a possible Whisper fallback.
		cancel()
		if err == nil {
			return text, nil
		}
		restErr = err
		fmt.Printf("[stt] DashScope REST 失败,回退 Whisper: %v\n", err)
	}

	// Fall back to the local Whisper engine.
	text, err := s.transcribeWhisper(audioData, format, language)
	if err != nil && restErr != nil {
		// Both engines failed: surface the DashScope cause too, otherwise the
		// caller only sees the Whisper-side message.
		return "", fmt.Errorf("DashScope REST 转录失败: %v; Whisper 回退失败: %w", restErr, err)
	}
	return text, err
}
// StartStreaming opens a persistent streaming speech-recognition session on
// the DashScope realtime engine.
//
// An empty language selects the service default. Unlike Transcribe, the
// language is not validated against SupportedLanguages here. An error is
// returned when the DashScope realtime engine is not available.
//
// NOTE(review): the 15s context is cancelled as soon as this method returns.
// That is fine if the engine uses it only for connection setup, but it would
// tear the session down if the engine keeps the context for the session's
// lifetime — confirm against DashScopeSTT.StartStreaming.
func (s *STTService) StartStreaming(format, language string) (*StreamingSession, error) {
	if ready := s.dashscope.IsAvailable(); !ready {
		return nil, fmt.Errorf("流式识别需要 DashScope,请配置 DASHSCOPE_API_KEY")
	}

	lang := language
	if lang == "" {
		lang = s.language
	}

	setupCtx, release := context.WithTimeout(context.Background(), 15*time.Second)
	defer release()

	session, err := s.dashscope.StartStreaming(setupCtx, format, lang)
	return session, err
}
// transcribeWhisper transcribes audio with the local Whisper binary.
//
// The audio is written to a temporary file. Any format other than "wav" (or
// empty) is first converted to WAV with convertToWav; if that conversion
// fails, the failure is logged and the original file is handed to Whisper
// unchanged (best effort). Whisper is run with -otxt, and the text file it
// produces is read back, trimmed of surrounding whitespace and returned.
//
// Every temporary file (input, converted WAV, Whisper output) is removed
// before the function returns, on both success and failure paths.
//
// It returns an error when the Whisper binary is missing, a temporary file
// cannot be created or written, the Whisper process fails, or its output
// cannot be read.
func (s *STTService) transcribeWhisper(audioData []byte, format string, language string) (string, error) {
	if _, err := os.Stat(s.whisperBinary); err != nil {
		return "", fmt.Errorf("STT 引擎不可用: DashScope API Key 未配置且 Whisper 未安装")
	}

	// An empty dir makes CreateTemp use os.TempDir(), which honours TMPDIR and
	// works on platforms without a /tmp directory.
	tmpFile, err := os.CreateTemp("", "cyrene-stt-*"+normalizeExt(format))
	if err != nil {
		return "", fmt.Errorf("创建临时文件失败: %w", err)
	}
	tmpPath := tmpFile.Name()
	defer os.Remove(tmpPath)

	// Always close, then report the first failure. A Close error can mean the
	// data never reached the disk, so it is treated like a write error.
	_, writeErr := tmpFile.Write(audioData)
	closeErr := tmpFile.Close()
	if writeErr != nil {
		return "", fmt.Errorf("写入临时文件失败: %w", writeErr)
	}
	if closeErr != nil {
		return "", fmt.Errorf("写入临时文件失败: %w", closeErr)
	}

	inputPath := tmpPath
	if format != "wav" && format != "" {
		convertedPath := tmpPath + ".wav"
		// Registered before the conversion so that a partial file left behind
		// by a failed ffmpeg run is cleaned up as well.
		defer os.Remove(convertedPath)
		if convErr := convertToWav(tmpPath, convertedPath); convErr != nil {
			// Best effort: keep going with the original file, but do not hide
			// the reason the conversion was skipped.
			fmt.Printf("[stt] ffmpeg 转码失败,使用原始音频: %v\n", convErr)
		} else {
			inputPath = convertedPath
		}
	}

	// Whisper appends ".txt" to the -of prefix.
	outputPrefix := strings.TrimSuffix(inputPath, filepath.Ext(inputPath))
	outputTxt := outputPrefix + ".txt"
	defer os.Remove(outputTxt)

	cmd := exec.Command(s.whisperBinary,
		"-m", s.whisperModel,
		"-l", language,
		"-f", inputPath,
		"-otxt",
		"-of", outputPrefix,
	)
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("whisper 转录失败: %w", err)
	}

	txtData, err := os.ReadFile(outputTxt)
	if err != nil {
		return "", fmt.Errorf("读取转录结果失败: %w", err)
	}
	return strings.TrimSpace(string(txtData)), nil
}
// GetStatus returns a snapshot of the service state: overall availability,
// the status reported by each DashScope engine, a breakdown of the local
// Whisper prerequisites (binary, model file, ffmpeg), the default language
// and the supported language list.
func (s *STTService) GetStatus() map[string]interface{} {
	// Probe the individual Whisper prerequisites so each can be reported separately.
	_, binaryErr := os.Stat(s.whisperBinary)
	_, modelErr := os.Stat(s.whisperModel)
	_, ffmpegErr := exec.LookPath("ffmpeg")

	status := map[string]interface{}{
		"available":      s.IsAvailable(),
		"primary":        "dashscope_rest",
		"dashscope_rest": s.dashscopeREST.GetStatus(),
		"dashscope_ws":   s.dashscope.GetStatus(),
		"whisper": map[string]interface{}{
			"available":        s.whisperAvailable(),
			"binary_available": binaryErr == nil,
			"model_loaded":     modelErr == nil,
			"ffmpeg_available": ffmpegErr == nil,
			"model_name":       filepath.Base(s.whisperModel),
		},
		"default_language":    s.language,
		"supported_languages": SupportedLanguages,
	}
	return status
}
// normalizeExt maps an audio format name to a canonical file extension
// (including the leading dot). Matching is case-insensitive; unknown or empty
// formats map to ".wav".
func normalizeExt(format string) string {
	ext := ".wav" // default, also covers "wav" itself
	switch name := strings.ToLower(format); name {
	case "mp3", "mpeg":
		ext = ".mp3"
	case "ogg", "opus":
		ext = ".ogg"
	case "flac":
		ext = ".flac"
	case "m4a", "mp4", "aac":
		ext = ".m4a"
	}
	return ext
}
// isSupportedLanguage reports whether lang exactly matches (case-sensitively)
// one of the entries in SupportedLanguages.
func isSupportedLanguage(lang string) bool {
	found := false
	for _, candidate := range SupportedLanguages {
		if candidate == lang {
			found = true
			break
		}
	}
	return found
}
// convertToWav runs ffmpeg to convert inputPath into a 16 kHz, mono,
// 16-bit PCM WAV file at outputPath, overwriting an existing file (-y).
// ffmpeg's output streams are left unset on the command, so they are discarded.
// It returns the error from running ffmpeg, if any.
func convertToWav(inputPath, outputPath string) error {
	ffmpegArgs := []string{
		"-i", inputPath,
		"-ar", "16000", // sample rate
		"-ac", "1", // channel count
		"-c:a", "pcm_s16le",
		outputPath,
		"-y",
	}
	return exec.Command("ffmpeg", ffmpegArgs...).Run()
}