Files
Cyrene/backend/voice-service/internal/service/stt_service.go
T
AskaEth 7eb5e984c2 refactor: 认证系统重构 + DevTools CLI 重写 + 文档全面更新
- auth: Login 简化为管理员始终通过 .env 验证,GetProfile 修正 admin DB 查询
- devtools: .sh/.bat 同步重写为完整 CLI (start/stop/status/logs/build/db:*)
- docs: 新增 devtools.md,重写 Deploy.md (三种方式+Windows说明),更新 README/gateway-api
- voice-service: DashScope 实时流式 STT 支持
- gateway: Phase 6 多模型配置 + 多端客户端管理 + WebSocket 增强

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 14:55:47 +08:00

212 lines
5.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/yourname/cyrene-ai/voice-service/internal/config"
)
// SupportedLanguages lists the language identifiers that STT requests may use.
// Transcribe rejects any language that is not in this list.
var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"}
// STTService is the speech-to-text service.
// It prefers the DashScope API and falls back to a local Whisper binary when
// DashScope is not available.
type STTService struct {
// whisperBinary is the path of the local Whisper executable.
whisperBinary string
// whisperModel is the model path handed to Whisper via its -m flag.
whisperModel string
// language is the default language used when a request does not specify one.
language string
dashscope *DashScopeSTT // real-time ASR (qwen3-asr-flash-realtime)
}
// NewSTTService builds an STTService from the application configuration.
//
// The DashScope client is created with cfg.DashScopeSTTRealtime as its model,
// or with cfg.DashScopeModel when the realtime model is not configured. The
// realtime model serves every WebSocket ASR request (one-shot and streaming);
// the offline model (qwen3-asr-flash-2026-02-10) is an HTTP REST API and is
// not implemented yet.
func NewSTTService(cfg *config.Config) *STTService {
	realtimeModel := cfg.DashScopeSTTRealtime
	if realtimeModel == "" {
		// No dedicated realtime model configured: use the general one.
		realtimeModel = cfg.DashScopeModel
	}

	svc := &STTService{
		whisperBinary: cfg.WhisperBinary,
		whisperModel:  cfg.WhisperModel,
		language:      cfg.WhisperLanguage,
	}
	svc.dashscope = NewDashScopeSTT(cfg.DashScopeAPIKey, realtimeModel)
	return svc
}
// IsAvailable reports whether at least one STT engine can serve requests.
//
// DashScope counts as usable when its client reports itself available. The
// local Whisper fallback counts as usable only when both the Whisper binary
// and the model file exist on disk. This is the same rule GetStatus applies
// to its "whisper.available" field; checking the binary alone would report
// the service as available while the model passed to Whisper via -m is
// missing.
func (s *STTService) IsAvailable() bool {
	if s.dashscope.IsAvailable() {
		return true
	}
	if _, err := os.Stat(s.whisperBinary); err != nil {
		return false
	}
	_, err := os.Stat(s.whisperModel)
	return err == nil
}
// Transcribe converts audioData to text.
//
// An empty language selects the service default. A language that is not in
// SupportedLanguages is rejected with an error before any engine is called.
//
// When DashScope is available it is the only engine used: the request runs
// under a 30-second timeout and a DashScope failure is returned to the caller
// (wrapped) instead of being retried on Whisper. The local Whisper engine is
// used only when DashScope is not available at all.
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
	lang := language
	if lang == "" {
		lang = s.language
	}
	if !isSupportedLanguage(lang) {
		return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", lang, strings.Join(SupportedLanguages, ", "))
	}

	// Fallback path: DashScope is not available, use local Whisper.
	if !s.dashscope.IsAvailable() {
		return s.transcribeWhisper(audioData, format, lang)
	}

	// Primary path: DashScope, bounded by a fixed timeout.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	text, err := s.dashscope.Transcribe(ctx, audioData, format, lang)
	if err != nil {
		// Deliberately no Whisper fallback here: surface the concrete error.
		return "", fmt.Errorf("语音识别失败: %w", err)
	}
	return text, nil
}
// StartStreaming opens a persistent streaming speech-recognition session.
//
// Streaming is only offered through DashScope; when DashScope is not
// available an error is returned. An empty language selects the service
// default.
//
// NOTE(review): unlike Transcribe, the language is not checked against
// SupportedLanguages here — confirm whether that is intentional.
func (s *STTService) StartStreaming(format, language string) (*StreamingSession, error) {
	if !s.dashscope.IsAvailable() {
		return nil, fmt.Errorf("流式识别需要 DashScope,请配置 DASHSCOPE_API_KEY")
	}

	lang := language
	if lang == "" {
		lang = s.language
	}

	// NOTE(review): this context is cancelled as soon as the function returns,
	// so it presumably only bounds session setup (15 s) — confirm that
	// DashScopeSTT.StartStreaming does not keep it for the session lifetime.
	setupCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()

	return s.dashscope.StartStreaming(setupCtx, format, lang)
}
// transcribeWhisper transcribes audioData with the local Whisper binary.
//
// Steps:
//  1. Fail early when the Whisper binary does not exist.
//  2. Write the audio to a temporary file whose extension is derived from
//     format via normalizeExt.
//  3. When format is neither "wav" nor empty, try to convert the file to WAV
//     with convertToWav. Conversion is best-effort: on failure the original
//     file is passed to Whisper unchanged.
//  4. Run Whisper with -otxt and read the resulting "<prefix>.txt" file.
//
// All temporary files (input, converted WAV, text output) are removed before
// the function returns, on success and on failure alike. The returned text
// has surrounding whitespace trimmed.
func (s *STTService) transcribeWhisper(audioData []byte, format string, language string) (string, error) {
	if _, err := os.Stat(s.whisperBinary); err != nil {
		return "", fmt.Errorf("STT 引擎不可用: DashScope API Key 未配置且 Whisper 未安装")
	}

	// An empty dir makes CreateTemp use os.TempDir(), so this also works on
	// platforms that have no /tmp.
	tmpFile, err := os.CreateTemp("", "cyrene-stt-*"+normalizeExt(format))
	if err != nil {
		return "", fmt.Errorf("创建临时文件失败: %w", err)
	}
	tmpPath := tmpFile.Name()
	defer os.Remove(tmpPath)

	if _, err := tmpFile.Write(audioData); err != nil {
		tmpFile.Close()
		return "", fmt.Errorf("写入临时文件失败: %w", err)
	}
	// A failed Close can mean the data never reached disk; do not hand a
	// possibly truncated file to Whisper.
	if err := tmpFile.Close(); err != nil {
		return "", fmt.Errorf("写入临时文件失败: %w", err)
	}

	inputPath := tmpPath
	if format != "wav" && format != "" {
		convertedPath := tmpPath + ".wav"
		// Registered before the conversion so that a partial output left by a
		// failed ffmpeg run is cleaned up as well.
		defer os.Remove(convertedPath)
		if err := convertToWav(tmpPath, convertedPath); err == nil {
			inputPath = convertedPath
		}
		// On conversion failure, fall through with the original file.
	}

	// Whisper writes its text output to "<outputPrefix>.txt".
	outputPrefix := strings.TrimSuffix(inputPath, filepath.Ext(inputPath))
	outputTxt := outputPrefix + ".txt"
	defer os.Remove(outputTxt)

	cmd := exec.Command(s.whisperBinary,
		"-m", s.whisperModel,
		"-l", language,
		"-f", inputPath,
		"-otxt",
		"-of", outputPrefix,
	)
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		return "", fmt.Errorf("whisper 转录失败: %w", err)
	}

	txtData, err := os.ReadFile(outputTxt)
	if err != nil {
		return "", fmt.Errorf("读取转录结果失败: %w", err)
	}
	return strings.TrimSpace(string(txtData)), nil
}
// GetStatus returns a snapshot of the service state as a generic map.
//
// The map contains the overall availability, the name of the primary engine,
// the DashScope client's own status, a "whisper" sub-map (binary present,
// model file present, both present, model file name), the default language
// and the list of supported languages.
func (s *STTService) GetStatus() map[string]interface{} {
	_, binaryErr := os.Stat(s.whisperBinary)
	_, modelErr := os.Stat(s.whisperModel)
	hasBinary := binaryErr == nil
	hasModel := modelErr == nil

	whisperStatus := map[string]interface{}{
		"available":        hasBinary && hasModel,
		"binary_available": hasBinary,
		"model_loaded":     hasModel,
		"model_name":       filepath.Base(s.whisperModel),
	}

	return map[string]interface{}{
		"available":           s.IsAvailable(),
		"primary":             "dashscope",
		"dashscope":           s.dashscope.GetStatus(),
		"whisper":             whisperStatus,
		"default_language":    s.language,
		"supported_languages": SupportedLanguages,
	}
}
// normalizeExt maps an audio format name to a file extension, including the
// leading dot. Matching is case-insensitive. Aliases share one extension
// ("mpeg" -> ".mp3", "opus" -> ".ogg", "mp4"/"aac" -> ".m4a"); "wav" and any
// unrecognised or empty format yield ".wav".
func normalizeExt(format string) string {
	ext := ".wav" // covers "wav" as well as every unknown format
	switch strings.ToLower(format) {
	case "mp3", "mpeg":
		ext = ".mp3"
	case "ogg", "opus":
		ext = ".ogg"
	case "flac":
		ext = ".flac"
	case "m4a", "mp4", "aac":
		ext = ".m4a"
	}
	return ext
}
// isSupportedLanguage reports whether lang is one of the entries of
// SupportedLanguages. The comparison is exact (case-sensitive).
func isSupportedLanguage(lang string) bool {
	for i := 0; i < len(SupportedLanguages); i++ {
		if SupportedLanguages[i] == lang {
			return true
		}
	}
	return false
}
// convertToWav runs ffmpeg to convert the audio file at inputPath into a WAV
// file at outputPath, requesting a 16000 Hz sample rate, one channel and the
// pcm_s16le codec. The -y flag is passed to ffmpeg as well.
//
// It returns the error from running the command; ffmpeg's stderr is not
// captured (the command's Stderr is left nil).
func convertToWav(inputPath, outputPath string) error {
	args := []string{
		"-i", inputPath,
		"-ar", "16000",
		"-ac", "1",
		"-c:a", "pcm_s16le",
		outputPath,
		"-y",
	}
	return exec.Command("ffmpeg", args...).Run()
}