feat: Phase 5 STT — DashScope Gummy 实时语音识别 + 本地 Whisper 回退

- DashScope WebSocket STT 客户端 (gummy-chat-v1)
- 双引擎架构: DashScope 优先, Whisper 本地回退
- 实时流式 STT WebSocket 端点
- DevTools 模型搜索框焦点修复

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-23 22:15:43 +08:00
parent 0717928496
commit b1e89c606e
9 changed files with 545 additions and 84 deletions
@@ -0,0 +1,273 @@
package service
import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/gorilla/websocket"
)
// DashScopeSTT is a speech-recognition client for the Alibaba Cloud Bailian
// (DashScope) Gummy model.
// WebSocket API: wss://dashscope.aliyuncs.com/api-ws/v1/inference
type DashScopeSTT struct {
// apiKey is sent as the Bearer token in the Authorization header when dialing;
// an empty value makes IsAvailable report false.
apiKey string
// model is the model name placed in the run-task payload.
model string
// timeout is added to the current time to form the read deadline that
// Transcribe sets once on the connection.
timeout time.Duration
}
// NewDashScopeSTT builds a DashScope STT client for the given API key.
//
// An empty model selects the default "gummy-chat-v1". The read timeout is
// fixed at 30 seconds.
func NewDashScopeSTT(apiKey, model string) *DashScopeSTT {
	client := &DashScopeSTT{
		apiKey:  apiKey,
		model:   "gummy-chat-v1",
		timeout: 30 * time.Second,
	}
	// Only override the default when the caller actually named a model.
	if model != "" {
		client.model = model
	}
	return client
}
// IsAvailable reports whether an API key has been configured.
func (d *DashScopeSTT) IsAvailable() bool {
	hasKey := len(d.apiKey) > 0
	return hasKey
}
// sttMessage is the client-to-server message envelope of the STT WebSocket
// protocol. Transcribe sends it as JSON for both the run-task and the
// finish-task messages.
type sttMessage struct {
// Header identifies the task and the requested action.
Header sttHeader `json:"header"`
// Payload carries the model selection and recognition parameters.
Payload sttPayload `json:"payload"`
}
// sttHeader is the header section of a client-to-server message.
type sttHeader struct {
// Streaming is the streaming mode; Transcribe always sends "duplex".
Streaming string `json:"streaming"`
// TaskID is the client-generated task identifier, reused for every message of one task.
TaskID string `json:"task_id"`
// Action is the requested operation; Transcribe sends "run-task" and "finish-task".
Action string `json:"action"`
}
// sttPayload is the payload section of a client-to-server message.
//
// Only Input and Output are tagged omitempty. Every other field, including
// Parameters, is always serialized, so the finish-task message (which leaves
// the payload unset) still emits them with zero values.
type sttPayload struct {
// Model is the model name; Transcribe fills it from DashScopeSTT.model.
Model string `json:"model"`
// TaskGroup is sent as "audio" in the run-task message.
TaskGroup string `json:"task_group"`
// Task is sent as "asr" in the run-task message.
Task string `json:"task"`
// Function is sent as "recognition" in the run-task message.
Function string `json:"function"`
// Input is never set by Transcribe and is therefore omitted from the JSON.
Input map[string]interface{} `json:"input,omitempty"`
// Parameters holds the audio and recognition settings.
Parameters sttParameters `json:"parameters"`
// Output is never set by Transcribe and is therefore omitted from the JSON.
Output map[string]interface{} `json:"output,omitempty"`
}
// sttParameters holds the recognition settings sent with run-task.
type sttParameters struct {
// SampleRate is the audio sample rate; Transcribe always sends 16000
// (presumably Hz — confirm against the API reference).
SampleRate int `json:"sample_rate"`
// Format is the audio format name produced by normalizeSTTFormat.
Format string `json:"format"`
// TranscriptionEnabled is sent as true by Transcribe.
TranscriptionEnabled bool `json:"transcription_enabled"`
// TranslationEnabled is sent as false by Transcribe.
TranslationEnabled bool `json:"translation_enabled"`
// SourceLanguage is the language of the audio; omitted from the JSON when empty.
SourceLanguage string `json:"source_language,omitempty"`
// MaxEndSilence is not set by Transcribe and is omitted when zero.
// NOTE(review): unit and meaning are defined by the DashScope API — confirm before using.
MaxEndSilence int `json:"max_end_silence,omitempty"`
}
// sttServerMsg is the envelope of a message returned by the server.
// Transcribe decodes each incoming JSON message into this type.
type sttServerMsg struct {
// Header carries the task identifier and the event name.
Header sttServerHeader `json:"header"`
// Payload carries the recognition output or the error details.
Payload sttServerPayload `json:"payload"`
}
// sttServerHeader is the header section of a server message.
type sttServerHeader struct {
// TaskID is the task identifier reported by the server; Transcribe does not read it.
TaskID string `json:"task_id"`
// Event is the event name. Transcribe handles "task-started",
// "result-generated", "task-finished" and "task-failed"; other values are ignored.
Event string `json:"event"`
}
// sttServerPayload is the payload section of a server message.
type sttServerPayload struct {
// Output holds the recognition result. Transcribe reads the string at
// Output["transcription"]["text"] on "result-generated" events.
Output map[string]interface{} `json:"output,omitempty"`
// Usage is decoded but not read by Transcribe.
Usage map[string]interface{} `json:"usage,omitempty"`
// Error holds the failure details read on "task-failed" events. This type is
// only decoded here, and omitempty has no effect on a struct-typed field.
Error sttError `json:"error,omitempty"`
}
// sttError is the error detail attached to a server message. Transcribe
// includes both fields in the error it returns for a "task-failed" event.
type sttError struct {
// Code is the server-supplied error code.
Code string `json:"code"`
// Message is the server-supplied description; Transcribe substitutes a
// placeholder when it is empty.
Message string `json:"message"`
}
// Transcribe uploads audioData to DashScope over a WebSocket and returns the
// recognized text.
//
// Protocol as implemented here: send a "run-task" message, wait for the
// "task-started" event, stream the audio as binary frames, send
// "finish-task", then wait until the reader goroutine sees "task-finished".
//
// Parameters:
//   - ctx: cancels the dial, the audio upload, and the waits for server events.
//   - audioData: the complete audio payload; it is sent in 10240-byte chunks.
//   - format: audio format name, mapped through normalizeSTTFormat.
//   - language: source language; "" and "auto" are replaced with "zh".
//
// An error is returned when no API key is configured, the connection or a
// write fails, the server reports "task-failed", the connection ends before
// "task-finished", ctx is cancelled, or no transcription text was received.
// A failure reported by the reader always takes priority over any text
// collected so far, so partial results are never returned as success.
func (d *DashScopeSTT) Transcribe(ctx context.Context, audioData []byte, format string, language string) (string, error) {
	if !d.IsAvailable() {
		return "", fmt.Errorf("DashScope API Key 未配置")
	}

	dialer := websocket.Dialer{
		HandshakeTimeout: 10 * time.Second,
	}
	header := make(map[string][]string)
	header["Authorization"] = []string{"Bearer " + d.apiKey}
	conn, _, err := dialer.DialContext(ctx, "wss://dashscope.aliyuncs.com/api-ws/v1/inference", header)
	if err != nil {
		return "", fmt.Errorf("连接 DashScope STT 失败: %w", err)
	}
	// Closing the connection on return also unblocks the reader goroutine
	// below, so it never outlives this call for long.
	defer conn.Close()
	// A single read deadline covers the whole exchange.
	conn.SetReadDeadline(time.Now().Add(d.timeout))

	taskID := fmt.Sprintf("cyrene-stt-%d", time.Now().UnixNano())

	// Normalize the audio format name.
	normFormat := normalizeSTTFormat(format)
	if language == "" || language == "auto" {
		language = "zh"
	}

	// Send run-task.
	startMsg := sttMessage{
		Header: sttHeader{
			Streaming: "duplex",
			TaskID:    taskID,
			Action:    "run-task",
		},
		Payload: sttPayload{
			Model:     d.model,
			TaskGroup: "audio",
			Task:      "asr",
			Function:  "recognition",
			Parameters: sttParameters{
				SampleRate:           16000,
				Format:               normFormat,
				TranscriptionEnabled: true,
				TranslationEnabled:   false,
				SourceLanguage:       language,
			},
		},
	}
	if err := conn.WriteJSON(startMsg); err != nil {
		return "", fmt.Errorf("发送 run-task 失败: %w", err)
	}

	var textResult string
	var mu sync.Mutex
	started := make(chan struct{})
	errc := make(chan error, 1)
	done := make(chan struct{})

	// Reader goroutine: consumes server events until the task ends or the
	// connection fails. It is the only reader of conn.
	go func() {
		defer close(done)
		startedClosed := false
		// Every caller of fail returns right after it, so at most one error
		// is ever sent and the buffered send cannot block.
		fail := func(err error) {
			select {
			case errc <- err:
			default:
			}
		}
		for {
			var msg sttServerMsg
			if err := conn.ReadJSON(&msg); err != nil {
				fail(fmt.Errorf("读取响应失败: %w", err))
				return
			}
			switch msg.Header.Event {
			case "task-started":
				if !startedClosed {
					close(started)
					startedClosed = true
				}
			case "result-generated":
				// Each result replaces the previous one; only the latest
				// transcription text is kept.
				if out, ok := msg.Payload.Output["transcription"]; ok {
					if transMap, ok := out.(map[string]interface{}); ok {
						if text, ok := transMap["text"].(string); ok {
							mu.Lock()
							textResult = text
							mu.Unlock()
						}
					}
				}
			case "task-finished":
				// Finishing without ever starting would otherwise leave the
				// caller waiting on started with nothing to wake it up.
				if !startedClosed {
					fail(fmt.Errorf("DashScope 任务在启动前已结束"))
				}
				return
			case "task-failed":
				errMsg := msg.Payload.Error.Message
				if errMsg == "" {
					errMsg = "未知错误"
				}
				fail(fmt.Errorf("DashScope 识别失败: %s (code=%s)", errMsg, msg.Payload.Error.Code))
				return
			}
		}
	}()

	// Wait for task-started or an error. The reader always leaves an error in
	// errc when it exits before the task has started.
	select {
	case <-started:
	case err := <-errc:
		return "", err
	case <-ctx.Done():
		return "", ctx.Err()
	}

	// Send the audio data in chunks of about 10KB.
	const chunkSize = 10240
	for i := 0; i < len(audioData); i += chunkSize {
		// Stop uploading as soon as the task failed or the caller gave up.
		select {
		case err := <-errc:
			return "", err
		case <-ctx.Done():
			return "", ctx.Err()
		default:
		}
		end := i + chunkSize
		if end > len(audioData) {
			end = len(audioData)
		}
		conn.SetWriteDeadline(time.Now().Add(10 * time.Second))
		if err := conn.WriteMessage(websocket.BinaryMessage, audioData[i:end]); err != nil {
			return "", fmt.Errorf("发送音频数据失败: %w", err)
		}
	}

	// Send finish-task.
	finishMsg := sttMessage{
		Header: sttHeader{
			Streaming: "duplex",
			TaskID:    taskID,
			Action:    "finish-task",
		},
	}
	if err := conn.WriteJSON(finishMsg); err != nil {
		return "", fmt.Errorf("发送 finish-task 失败: %w", err)
	}

	// Wait for the reader to exit. Any error it reports is sent before done
	// is closed, so waiting on done alone is enough to observe it.
	select {
	case <-done:
	case <-ctx.Done():
		return "", ctx.Err()
	}

	// Check for a reported failure before looking at the text. Selecting on
	// done and errc together would pick one at random when both are ready.
	select {
	case err := <-errc:
		return "", err
	default:
	}

	mu.Lock()
	text := textResult
	mu.Unlock()
	if text == "" {
		return "", fmt.Errorf("未收到识别结果")
	}
	return text, nil
}
// normalizeSTTFormat maps an audio format name to the format name sent to
// DashScope in the run-task parameters.
//
// Matching ignores letter case, surrounding whitespace, and one leading dot,
// so "WAV", " mp3 " and ".ogg" resolve like "wav", "mp3" and "ogg". Any
// unrecognized value, including the empty string, falls back to "pcm".
func normalizeSTTFormat(format string) string {
	// Callers may pass a file extension or a MIME subtype verbatim; fold the
	// harmless variations so a container file is not mislabelled as raw PCM.
	name := strings.ToLower(strings.TrimPrefix(strings.TrimSpace(format), "."))
	switch name {
	case "wav":
		return "wav"
	case "mp3", "mpeg":
		return "mp3"
	case "ogg", "opus":
		return "ogg"
	case "flac":
		return "flac"
	case "m4a", "aac", "mp4":
		return "aac"
	default:
		return "pcm"
	}
}
// GetStatus reports the DashScope STT state as a map with the keys
// "available", "model" and "provider".
func (d *DashScopeSTT) GetStatus() map[string]interface{} {
	status := make(map[string]interface{}, 3)
	status["provider"] = "dashscope"
	status["model"] = d.model
	status["available"] = d.IsAvailable()
	return status
}