feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST)
- Gateway: VoiceStreamManager代理WS流式STT到voice-service
- Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码
- 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端)
- 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并
- 文档: 完善gateway-api.md和voice-service.md语音API文档
- 工具: scripts/voice/ 语音转换脚本集

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -16,41 +16,59 @@ import (
|
||||
// SupportedLanguages lists the language codes accepted by the STT service.
// Transcribe reports an error that enumerates this list when it is given a
// language it does not support.
var SupportedLanguages = []string{"zh", "en", "ja", "ko", "auto"}
|
||||
|
||||
// STTService 语音转文字服务。
|
||||
// 优先使用 DashScope API,不可用时回退到本地 Whisper。
|
||||
// 离线转录优先使用 DashScope REST API,失败回退 Whisper。
|
||||
// 流式转录使用 DashScope Realtime WS。
|
||||
type STTService struct {
|
||||
whisperBinary string
|
||||
whisperModel string
|
||||
language string
|
||||
dashscope *DashScopeSTT // 实时 ASR (qwen3-asr-flash-realtime)
|
||||
whisperBinary string
|
||||
whisperModel string
|
||||
language string
|
||||
dashscope *DashScopeSTT // 实时 ASR (qwen3-asr-flash-realtime)
|
||||
dashscopeREST *DashScopeRESTSTT // 离线 ASR (qwen3-asr-flash-2026-02-10)
|
||||
}
|
||||
|
||||
// NewSTTService 创建 STT 服务。
|
||||
func NewSTTService(cfg *config.Config) *STTService {
|
||||
// 实时模型用于所有 WebSocket ASR 请求(支持 one-shot 和 streaming)
|
||||
// 离线模型 (qwen3-asr-flash-2026-02-10) 是 HTTP REST API,暂未实现
|
||||
model := cfg.DashScopeSTTRealtime
|
||||
if model == "" {
|
||||
model = cfg.DashScopeModel
|
||||
realtimeModel := cfg.DashScopeSTTRealtime
|
||||
if realtimeModel == "" {
|
||||
realtimeModel = "qwen3-asr-flash-realtime"
|
||||
}
|
||||
offlineModel := cfg.DashScopeModel
|
||||
if offlineModel == "" {
|
||||
offlineModel = "qwen3-asr-flash-2026-02-10"
|
||||
}
|
||||
return &STTService{
|
||||
whisperBinary: cfg.WhisperBinary,
|
||||
whisperModel: cfg.WhisperModel,
|
||||
language: cfg.WhisperLanguage,
|
||||
dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, model),
|
||||
dashscope: NewDashScopeSTT(cfg.DashScopeAPIKey, realtimeModel),
|
||||
dashscopeREST: NewDashScopeRESTSTT(cfg.DashScopeAPIKey, offlineModel),
|
||||
}
|
||||
}
|
||||
|
||||
// IsAvailable 检查是否有任一 STT 引擎可用。
|
||||
func (s *STTService) IsAvailable() bool {
|
||||
if s.dashscope.IsAvailable() {
|
||||
if s.dashscopeREST.IsAvailable() || s.dashscope.IsAvailable() {
|
||||
return true
|
||||
}
|
||||
_, err := os.Stat(s.whisperBinary)
|
||||
return err == nil
|
||||
return s.whisperAvailable()
|
||||
}
|
||||
|
||||
// whisperAvailable 检查本地 Whisper 引擎是否真正可用。
|
||||
func (s *STTService) whisperAvailable() bool {
|
||||
if _, err := os.Stat(s.whisperBinary); err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := os.Stat(s.whisperModel); err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := exec.LookPath("ffmpeg"); err != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Transcribe 将音频数据转录为文字。
|
||||
// 优先使用 DashScope,不可用时回退到本地 Whisper。
|
||||
// 优先使用 DashScope REST 离线模型,失败回退到本地 Whisper。
|
||||
func (s *STTService) Transcribe(audioData []byte, format string, language string) (string, error) {
|
||||
if language == "" {
|
||||
language = s.language
|
||||
@@ -59,16 +77,15 @@ func (s *STTService) Transcribe(audioData []byte, format string, language string
|
||||
return "", fmt.Errorf("不支持的语言: %s,支持的语言: %s", language, strings.Join(SupportedLanguages, ", "))
|
||||
}
|
||||
|
||||
// 优先 DashScope
|
||||
if s.dashscope.IsAvailable() {
|
||||
// 优先 DashScope REST 离线模型(低延迟,无需 session 协商)
|
||||
if s.dashscopeREST.IsAvailable() {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||
defer cancel()
|
||||
text, err := s.dashscope.Transcribe(ctx, audioData, format, language)
|
||||
text, err := s.dashscopeREST.Transcribe(ctx, audioData, format, language)
|
||||
if err == nil {
|
||||
return text, nil
|
||||
}
|
||||
// DashScope 失败,返回具体错误而不是回退到 Whisper
|
||||
return "", fmt.Errorf("语音识别失败: %w", err)
|
||||
fmt.Printf("[stt] DashScope REST 失败,回退 Whisper: %v\n", err)
|
||||
}
|
||||
|
||||
// 回退到本地 Whisper
|
||||
@@ -152,15 +169,21 @@ func (s *STTService) GetStatus() map[string]interface{} {
|
||||
if _, err := os.Stat(s.whisperModel); err == nil {
|
||||
modelExists = true
|
||||
}
|
||||
ffmpegAvailable := false
|
||||
if _, err := exec.LookPath("ffmpeg"); err == nil {
|
||||
ffmpegAvailable = true
|
||||
}
|
||||
|
||||
return map[string]interface{}{
|
||||
"available": s.IsAvailable(),
|
||||
"primary": "dashscope",
|
||||
"dashscope": s.dashscope.GetStatus(),
|
||||
"available": s.IsAvailable(),
|
||||
"primary": "dashscope_rest",
|
||||
"dashscope_rest": s.dashscopeREST.GetStatus(),
|
||||
"dashscope_ws": s.dashscope.GetStatus(),
|
||||
"whisper": map[string]interface{}{
|
||||
"available": binaryAvailable && modelExists,
|
||||
"available": s.whisperAvailable(),
|
||||
"binary_available": binaryAvailable,
|
||||
"model_loaded": modelExists,
|
||||
"ffmpeg_available": ffmpegAvailable,
|
||||
"model_name": filepath.Base(s.whisperModel),
|
||||
},
|
||||
"default_language": s.language,
|
||||
|
||||
Reference in New Issue
Block a user