package service import ( "context" "fmt" "os" "os/exec" "path/filepath" "strings" "time" "git.yeij.top/AskaEth/Cyrene/voice-service/internal/config" ) // SupportedLanguages STT 支持的语言列表 var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"} // STTService 语音转文字服务。 // 离线转录优先使用 DashScope REST API,失败回退 Whisper。 // 流式转录使用 DashScope Realtime WS。 type STTService struct { whisperBinary string whisperModel string language string dashscope *DashScopeSTT // 实时 ASR (qwen3-asr-flash-realtime) dashscopeREST *DashScopeRESTSTT // 离线 ASR (qwen3-asr-flash-2026-02-10) } // NewSTTService 创建 STT 服务。 func NewSTTService(cfg *config.Config) *STTService { realtimeModel := cfg.DashScopeSTTRealtime if realtimeModel == "" { realtimeModel = "qwen3-asr-flash-realtime" } offlineModel := cfg.DashScopeModel if offlineModel == "" { offlineModel = "qwen3-asr-flash-2026-02-10" } return &STTService{ whisperBinary: cfg.WhisperBinary, whisperModel: cfg.WhisperModel, language: cfg.WhisperLanguage, dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, realtimeModel), dashscopeREST: NewDashScopeRESTSTT(cfg.DashScopeAPIKey, offlineModel), } } // IsAvailable 检查是否有任一 STT 引擎可用。 func (s *STTService) IsAvailable() bool { if s.dashscopeREST.IsAvailable() || s.dashscope.IsAvailable() { return true } return s.whisperAvailable() } // whisperAvailable 检查本地 Whisper 引擎是否真正可用。 func (s *STTService) whisperAvailable() bool { if _, err := os.Stat(s.whisperBinary); err != nil { return false } if _, err := os.Stat(s.whisperModel); err != nil { return false } if _, err := exec.LookPath("ffmpeg"); err != nil { return false } return true } // Transcribe 将音频数据转录为文字。 // 优先使用 DashScope REST 离线模型,失败回退到本地 Whisper。 func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) { if language == "" { language = s.language } if !isSupportedLanguage(language) { return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", language, strings.Join(SupportedLanguages, ", ")) } // 优先 DashScope REST 离线模型(低延迟,无需 session 协商) if s.dashscopeREST.IsAvailable() { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() text, err := s.dashscopeREST.Transcribe(ctx, audioData, format, language) if err == nil { return text, nil } fmt.Printf("[stt] DashScope REST 失败,回退 Whisper: %v\n", err) } // 回退到本地 Whisper return s.transcribeWhisper(audioData, format, language) } // StartStreaming 创建持久的流式语音识别会话。 func (s *STTService) StartStreaming(format, language string) (*StreamingSession, error) { if !s.dashscope.IsAvailable() { return nil, fmt.Errorf("流式识别需要 DashScope,请配置 DASHSCOPE_API_KEY") } if language == "" { language = s.language } ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) defer cancel() return s.dashscope.StartStreaming(ctx, format, language) } // transcribeWhisper 使用本地 Whisper 引擎转录。 func (s *STTService) transcribeWhisper(audioData []byte, format string, language string) (string, error) { if _, err := os.Stat(s.whisperBinary); err != nil { return "", fmt.Errorf("STT 引擎不可用: DashScope API Key 未配置且 Whisper 未安装") } ext := normalizeExt(format) tmpFile, err := os.CreateTemp("/tmp", "cyrene-stt-*"+ext) if err != nil { return "", fmt.Errorf("创建临时文件失败: %w", err) } tmpPath := tmpFile.Name() defer os.Remove(tmpPath) if _, err := tmpFile.Write(audioData); err != nil { tmpFile.Close() return "", fmt.Errorf("写入临时文件失败: %w", err) } tmpFile.Close() inputPath := tmpPath if format != "wav" && format != "" { convertedPath := tmpPath + ".wav" if err := convertToWav(tmpPath, convertedPath); err == nil { defer os.Remove(convertedPath) inputPath = convertedPath } } outputPrefix := strings.TrimSuffix(inputPath, filepath.Ext(inputPath)) outputTxt := outputPrefix + ".txt" cmd := exec.Command(s.whisperBinary, "-m", s.whisperModel, "-l", language, "-f", inputPath, "-otxt", "-of", outputPrefix, ) cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { os.Remove(outputTxt) return "", fmt.Errorf("whisper 转录失败: %w", err) } defer os.Remove(outputTxt) txtData, err := os.ReadFile(outputTxt) if err != nil { return "", fmt.Errorf("读取转录结果失败: %w", err) } return strings.TrimSpace(string(txtData)), nil } // GetStatus 返回服务状态。 func (s *STTService) GetStatus() map[string]interface{} { binaryAvailable := false if _, err := os.Stat(s.whisperBinary); err == nil { binaryAvailable = true } modelExists := false if _, err := os.Stat(s.whisperModel); err == nil { modelExists = true } ffmpegAvailable := false if _, err := exec.LookPath("ffmpeg"); err == nil { ffmpegAvailable = true } return map[string]interface{}{ "available": s.IsAvailable(), "primary": "dashscope_rest", "dashscope_rest": s.dashscopeREST.GetStatus(), "dashscope_ws": s.dashscope.GetStatus(), "whisper": map[string]interface{}{ "available": s.whisperAvailable(), "binary_available": binaryAvailable, "model_loaded": modelExists, "ffmpeg_available": ffmpegAvailable, "model_name": filepath.Base(s.whisperModel), }, "default_language": s.language, "supported_languages": SupportedLanguages, } } // normalizeExt 规范化文件扩展名。 func normalizeExt(format string) string { switch strings.ToLower(format) { case "wav": return ".wav" case "mp3", "mpeg": return ".mp3" case "ogg", "opus": return ".ogg" case "flac": return ".flac" case "m4a", "mp4", "aac": return ".m4a" default: return ".wav" } } // isSupportedLanguage 检查语言是否支持。 func isSupportedLanguage(lang string) bool { for _, l := range SupportedLanguages { if l == lang { return true } } return false } // convertToWav 使用 ffmpeg 将音频转换为 WAV 格式。 func convertToWav(inputPath, outputPath string) error { cmd := exec.Command("ffmpeg", "-i", inputPath, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", outputPath, "-y", ) cmd.Stderr = nil return cmd.Run() }