feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理

- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST)
- Gateway: VoiceStreamManager代理WS流式STT到voice-service
- Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码
- 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端)
- 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并
- 文档: 完善gateway-api.md和voice-service.md语音API文档
- 工具: scripts/voice/ 语音转换脚本集

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-06-06 11:50:40 +08:00
parent 258cf81b25
commit 6ef9e082a6
91 changed files with 4091 additions and 3929 deletions
@@ -38,6 +38,7 @@ type ChatHandler struct {
hub *ws.Hub
sessionStore *store.SessionStore
fileStore *store.FileStore
voiceStream *VoiceStreamManager
upgrader websocket.Upgrader
pending map[string][]queuedMsg // per-session message queue
pendingMu sync.Mutex
@@ -50,6 +51,7 @@ func NewChatHandler(cfg *config.Config, hub *ws.Hub, sessionStore *store.Session
hub: hub,
sessionStore: sessionStore,
fileStore: fileStore,
voiceStream: NewVoiceStreamManager(cfg.VoiceServiceURL),
pending: make(map[string][]queuedMsg),
upgrader: websocket.Upgrader{
ReadBufferSize: 1024,
@@ -131,6 +133,12 @@ func (h *ChatHandler) handleMessage(client *ws.Client, msg ws.ClientMessage) {
h.handleChatMessage(client, msg)
case "voice_input":
h.handleVoiceInput(client, msg)
case "voice_stream_start":
h.handleVoiceStreamStart(client, msg)
case "voice_stream_chunk":
h.handleVoiceStreamChunk(client, msg)
case "voice_stream_end":
h.handleVoiceStreamEnd(client, msg)
case "history":
h.handleHistoryRequest(client, msg)
default:
@@ -436,11 +444,13 @@ func (h *ChatHandler) streamResponse(client *ws.Client, mode string, reqBody []b
// 处理审查后的结构化消息 (review)
if len(chunk.ReviewMessages) > 0 {
for i, rm := range chunk.ReviewMessages {
msgType := rm.Type
if msgType == "" {
msgType = "chat"
}
role := "assistant"
msgType := "chat"
if rm.Type == "action" {
if msgType == "action" {
role = "action"
msgType = "action"
}
reviewMsgID := fmt.Sprintf("%s_r%d", msgID, i)
// 持久化每条审查消息 (action 角色映射为 assistant,LLM 模型不支持自定义角色)
@@ -473,6 +483,7 @@ func (h *ChatHandler) streamResponse(client *ws.Client, mode string, reqBody []b
SessionID: client.SessionID,
Timestamp: time.Now().UnixMilli(),
ClientInfo: clientInfo,
Metadata: rm.Metadata,
})
// 使用 MessageScheduler 计算的 per-message 延迟
if rm.DelayMs > 0 {
@@ -650,6 +661,96 @@ func (h *ChatHandler) handleVoiceInput(client *ws.Client, msg ws.ClientMessage)
}()
}
// handleVoiceStreamStart opens a streaming speech-to-text session for the
// client through the voice stream manager.
//
// The audio format falls back to "webm" and the language to "zh" when the
// incoming message leaves them empty. If the stream cannot be started, the
// failure is logged and an "error" message is sent to the client; otherwise
// an empty "voice_interim" message is sent.
func (h *ChatHandler) handleVoiceStreamStart(client *ws.Client, msg ws.ClientMessage) {
	audioFormat, lang := msg.Format, msg.Language
	if audioFormat == "" {
		audioFormat = "webm"
	}
	if lang == "" {
		lang = "zh"
	}

	startErr := h.voiceStream.StartStream(client, audioFormat, lang)
	if startErr == nil {
		// Success: reply with an interim message that carries no text yet.
		client.SendMessage(ws.ServerMessage{
			Type:      "voice_interim",
			MessageID: "voice_" + generateID(),
			Text:      "",
			Timestamp: time.Now().UnixMilli(),
		})
		return
	}

	logger.Printf("[voice-stream] 启动流式 STT 失败: %v", startErr)
	client.SendMessage(ws.ServerMessage{
		Type:      "error",
		MessageID: "msg_" + generateID(),
		Error:     "启动语音流失败: " + startErr.Error(),
		Timestamp: time.Now().UnixMilli(),
	})
}
// handleVoiceStreamChunk decodes one base64-encoded audio chunk from the
// message and hands it to the voice stream manager together with the
// client's ID, session ID and the chunk's sequence value.
//
// Messages without audio data are ignored. Decode and send failures are only
// logged; nothing is reported back to the client.
func (h *ChatHandler) handleVoiceStreamChunk(client *ws.Client, msg ws.ClientMessage) {
	encoded := msg.AudioData
	if encoded == "" {
		return
	}

	chunk, decodeErr := decodeBase64(encoded)
	if decodeErr != nil {
		logger.Printf("[voice-stream] 解码音频块失败: %v", decodeErr)
		return
	}

	sendErr := h.voiceStream.SendChunk(client.ClientID, client.SessionID, chunk, msg.Sequence)
	if sendErr != nil {
		logger.Printf("[voice-stream] 发送音频块失败: %v", sendErr)
	}
}
// handleVoiceStreamEnd finishes the client's voice stream and delivers the
// resulting transcription.
//
// The work runs in its own goroutine. If ending the stream fails, the error
// is logged and an "error" message is sent to the client. Otherwise a
// "voice_final" message carrying the transcription (possibly empty) is sent;
// a non-empty transcription is then passed to handleChatMessage as a
// "message"-type client message that keeps the original message's Mode.
func (h *ChatHandler) handleVoiceStreamEnd(client *ws.Client, msg ws.ClientMessage) {
	finish := func() {
		transcript, endErr := h.voiceStream.EndStream(client.ClientID, client.SessionID)
		if endErr != nil {
			logger.Printf("[voice-stream] 结束流式 STT 失败: %v", endErr)
			client.SendMessage(ws.ServerMessage{
				Type:      "error",
				MessageID: "msg_" + generateID(),
				Error:     "语音流处理失败: " + endErr.Error(),
				Timestamp: time.Now().UnixMilli(),
			})
			return
		}

		// The final transcription is always reported, even when it is empty.
		client.SendMessage(ws.ServerMessage{
			Type:      "voice_final",
			MessageID: "voice_" + generateID(),
			Text:      transcript,
			Timestamp: time.Now().UnixMilli(),
		})
		if transcript == "" {
			// Nothing was recognized, so there is no chat message to forward.
			return
		}

		// Feed the recognized text through the regular chat message path.
		h.handleChatMessage(client, ws.ClientMessage{
			Type:    "message",
			Content: transcript,
			Mode:    msg.Mode,
		})
	}
	go finish()
}
// transcribeAudio 将 base64 编码的音频发送到 voice-service 进行转录。
func (h *ChatHandler) transcribeAudio(audioB64 string, format string) (string, error) {
audioData, err := decodeBase64(audioB64)