feat: 语音流式输入管线 + VAD前端集成 + 插件-工具合并清理
- 前端: VAD语音检测(@ricky0123/vad-web) + useVoiceInput双模式(流式WS/REST)
- Gateway: VoiceStreamManager代理WS流式STT到voice-service
- Voice-service: DashScope REST → Realtime WS → Whisper三级引擎 + ffmpeg转码
- 共享模块: pkg/audio(音频转换) + pkg/dashscope(ASR REST客户端)
- 清理: 移除旧plugin-manager和pkg/plugins,完成插件→工具合并
- 文档: 完善gateway-api.md和voice-service.md语音API文档
- 工具: scripts/voice/ 语音转换脚本集

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -38,6 +38,7 @@ type ChatHandler struct {
|
||||
hub *ws.Hub
|
||||
sessionStore *store.SessionStore
|
||||
fileStore *store.FileStore
|
||||
voiceStream *VoiceStreamManager
|
||||
upgrader websocket.Upgrader
|
||||
pending map[string][]queuedMsg // per-session message queue
|
||||
pendingMu sync.Mutex
|
||||
@@ -50,6 +51,7 @@ func NewChatHandler(cfg *config.Config, hub *ws.Hub, sessionStore *store.Session
|
||||
hub: hub,
|
||||
sessionStore: sessionStore,
|
||||
fileStore: fileStore,
|
||||
voiceStream: NewVoiceStreamManager(cfg.VoiceServiceURL),
|
||||
pending: make(map[string][]queuedMsg),
|
||||
upgrader: websocket.Upgrader{
|
||||
ReadBufferSize: 1024,
|
||||
@@ -131,6 +133,12 @@ func (h *ChatHandler) handleMessage(client *ws.Client, msg ws.ClientMessage) {
|
||||
h.handleChatMessage(client, msg)
|
||||
case "voice_input":
|
||||
h.handleVoiceInput(client, msg)
|
||||
case "voice_stream_start":
|
||||
h.handleVoiceStreamStart(client, msg)
|
||||
case "voice_stream_chunk":
|
||||
h.handleVoiceStreamChunk(client, msg)
|
||||
case "voice_stream_end":
|
||||
h.handleVoiceStreamEnd(client, msg)
|
||||
case "history":
|
||||
h.handleHistoryRequest(client, msg)
|
||||
default:
|
||||
@@ -436,11 +444,13 @@ func (h *ChatHandler) streamResponse(client *ws.Client, mode string, reqBody []b
|
||||
// 处理审查后的结构化消息 (review)
|
||||
if len(chunk.ReviewMessages) > 0 {
|
||||
for i, rm := range chunk.ReviewMessages {
|
||||
msgType := rm.Type
|
||||
if msgType == "" {
|
||||
msgType = "chat"
|
||||
}
|
||||
role := "assistant"
|
||||
msgType := "chat"
|
||||
if rm.Type == "action" {
|
||||
if msgType == "action" {
|
||||
role = "action"
|
||||
msgType = "action"
|
||||
}
|
||||
reviewMsgID := fmt.Sprintf("%s_r%d", msgID, i)
|
||||
// 持久化每条审查消息 (action 角色映射为 assistant,LLM 模型不支持自定义角色)
|
||||
@@ -473,6 +483,7 @@ func (h *ChatHandler) streamResponse(client *ws.Client, mode string, reqBody []b
|
||||
SessionID: client.SessionID,
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
ClientInfo: clientInfo,
|
||||
Metadata: rm.Metadata,
|
||||
})
|
||||
// 使用 MessageScheduler 计算的 per-message 延迟
|
||||
if rm.DelayMs > 0 {
|
||||
@@ -650,6 +661,96 @@ func (h *ChatHandler) handleVoiceInput(client *ws.Client, msg ws.ClientMessage)
|
||||
}()
|
||||
}
|
||||
|
||||
// handleVoiceStreamStart begins a streaming voice session via voice-service.
|
||||
func (h *ChatHandler) handleVoiceStreamStart(client *ws.Client, msg ws.ClientMessage) {
|
||||
format := msg.Format
|
||||
if format == "" {
|
||||
format = "webm"
|
||||
}
|
||||
language := msg.Language
|
||||
if language == "" {
|
||||
language = "zh"
|
||||
}
|
||||
|
||||
if err := h.voiceStream.StartStream(client, format, language); err != nil {
|
||||
logger.Printf("[voice-stream] 启动流式 STT 失败: %v", err)
|
||||
client.SendMessage(ws.ServerMessage{
|
||||
Type: "error",
|
||||
MessageID: "msg_" + generateID(),
|
||||
Error: "启动语音流失败: " + err.Error(),
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
client.SendMessage(ws.ServerMessage{
|
||||
Type: "voice_interim",
|
||||
MessageID: "voice_" + generateID(),
|
||||
Text: "",
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
}
|
||||
|
||||
// handleVoiceStreamChunk forwards an audio chunk to the active voice stream.
|
||||
func (h *ChatHandler) handleVoiceStreamChunk(client *ws.Client, msg ws.ClientMessage) {
|
||||
if msg.AudioData == "" {
|
||||
return
|
||||
}
|
||||
|
||||
audioData, err := decodeBase64(msg.AudioData)
|
||||
if err != nil {
|
||||
logger.Printf("[voice-stream] 解码音频块失败: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
if err := h.voiceStream.SendChunk(client.ClientID, client.SessionID, audioData, msg.Sequence); err != nil {
|
||||
logger.Printf("[voice-stream] 发送音频块失败: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// handleVoiceStreamEnd stops the voice stream and processes the final transcription.
|
||||
func (h *ChatHandler) handleVoiceStreamEnd(client *ws.Client, msg ws.ClientMessage) {
|
||||
go func() {
|
||||
text, err := h.voiceStream.EndStream(client.ClientID, client.SessionID)
|
||||
if err != nil {
|
||||
logger.Printf("[voice-stream] 结束流式 STT 失败: %v", err)
|
||||
client.SendMessage(ws.ServerMessage{
|
||||
Type: "error",
|
||||
MessageID: "msg_" + generateID(),
|
||||
Error: "语音流处理失败: " + err.Error(),
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
if text == "" {
|
||||
client.SendMessage(ws.ServerMessage{
|
||||
Type: "voice_final",
|
||||
MessageID: "voice_" + generateID(),
|
||||
Text: "",
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Send final transcription to frontend
|
||||
client.SendMessage(ws.ServerMessage{
|
||||
Type: "voice_final",
|
||||
MessageID: "voice_" + generateID(),
|
||||
Text: text,
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
|
||||
// Route the transcribed text as a regular chat message to ai-core
|
||||
chatMsg := ws.ClientMessage{
|
||||
Type: "message",
|
||||
Content: text,
|
||||
Mode: msg.Mode,
|
||||
}
|
||||
h.handleChatMessage(client, chatMsg)
|
||||
}()
|
||||
}
|
||||
|
||||
// transcribeAudio 将 base64 编码的音频发送到 voice-service 进行转录。
|
||||
func (h *ChatHandler) transcribeAudio(audioB64 string, format string) (string, error) {
|
||||
audioData, err := decodeBase64(audioB64)
|
||||
|
||||
@@ -0,0 +1,269 @@
|
||||
package handler
|
||||
|
||||
import (
	"encoding/json"
	"fmt"
	"net/url"
	"strings"
	"sync"
	"time"

	"github.com/gorilla/websocket"

	"git.yeij.top/AskaEth/Cyrene/gateway/internal/ws"
	"git.yeij.top/AskaEth/Cyrene/pkg/logger"
)
|
||||
|
||||
// voiceStreamSession manages a proxied WebSocket connection to voice-service
|
||||
// for real-time streaming speech-to-text during a single voice input.
|
||||
type voiceStreamSession struct {
|
||||
client *ws.Client
|
||||
voiceConn *websocket.Conn
|
||||
language string
|
||||
format string
|
||||
mu sync.Mutex
|
||||
done chan struct{}
|
||||
interimBuf strings.Builder
|
||||
finalText string
|
||||
}
|
||||
|
||||
// VoiceStreamManager creates and tracks streaming STT sessions.
|
||||
type VoiceStreamManager struct {
|
||||
voiceServiceURL string
|
||||
sessions map[string]*voiceStreamSession // key: clientID+sessionID
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
// NewVoiceStreamManager creates a voice stream manager.
|
||||
func NewVoiceStreamManager(voiceServiceURL string) *VoiceStreamManager {
|
||||
return &VoiceStreamManager{
|
||||
voiceServiceURL: voiceServiceURL,
|
||||
sessions: make(map[string]*voiceStreamSession),
|
||||
}
|
||||
}
|
||||
|
||||
func (m *VoiceStreamManager) sessionKey(clientID, sessionID string) string {
|
||||
return clientID + ":" + sessionID
|
||||
}
|
||||
|
||||
// StartStream begins a streaming STT session by connecting to voice-service.
|
||||
func (m *VoiceStreamManager) StartStream(client *ws.Client, format, language string) error {
|
||||
m.mu.Lock()
|
||||
key := m.sessionKey(client.ClientID, client.SessionID)
|
||||
if _, exists := m.sessions[key]; exists {
|
||||
m.mu.Unlock()
|
||||
return fmt.Errorf("voice stream already active for this session")
|
||||
}
|
||||
|
||||
if format == "" {
|
||||
format = "webm"
|
||||
}
|
||||
if language == "" {
|
||||
language = "zh"
|
||||
}
|
||||
|
||||
voiceURL := strings.TrimRight(m.voiceServiceURL, "/")
|
||||
wsURL := "ws" + strings.TrimPrefix(voiceURL, "http") + "/api/v1/stt/stream"
|
||||
wsURL += "?language=" + language + "&format=" + format
|
||||
|
||||
dialer := websocket.Dialer{HandshakeTimeout: 10 * time.Second}
|
||||
voiceConn, _, err := dialer.Dial(wsURL, nil)
|
||||
if err != nil {
|
||||
m.mu.Unlock()
|
||||
return fmt.Errorf("connect to voice-service stream: %w", err)
|
||||
}
|
||||
|
||||
session := &voiceStreamSession{
|
||||
client: client,
|
||||
voiceConn: voiceConn,
|
||||
language: language,
|
||||
format: format,
|
||||
done: make(chan struct{}),
|
||||
}
|
||||
m.sessions[key] = session
|
||||
m.mu.Unlock()
|
||||
|
||||
// Read results from voice-service in background
|
||||
go session.readResults(m, key)
|
||||
|
||||
logger.Printf("[voice-stream] 流式 STT 会话已建立: client=%s, lang=%s, fmt=%s", client.ClientID, language, format)
|
||||
return nil
|
||||
}
|
||||
|
||||
// SendChunk forwards an audio chunk (already decoded bytes) to voice-service.
|
||||
func (m *VoiceStreamManager) SendChunk(clientID, sessionID string, audioData []byte, seq int) error {
|
||||
m.mu.Lock()
|
||||
key := m.sessionKey(clientID, sessionID)
|
||||
session, exists := m.sessions[key]
|
||||
m.mu.Unlock()
|
||||
|
||||
if !exists {
|
||||
return fmt.Errorf("no active voice stream for this session")
|
||||
}
|
||||
|
||||
session.mu.Lock()
|
||||
defer session.mu.Unlock()
|
||||
|
||||
if err := session.voiceConn.WriteMessage(websocket.BinaryMessage, audioData); err != nil {
|
||||
return fmt.Errorf("send audio chunk: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// EndStream signals voice-service that the audio stream is complete,
|
||||
// waits for final result, then cleans up.
|
||||
func (m *VoiceStreamManager) EndStream(clientID, sessionID string) (string, error) {
|
||||
m.mu.Lock()
|
||||
key := m.sessionKey(clientID, sessionID)
|
||||
session, exists := m.sessions[key]
|
||||
m.mu.Unlock()
|
||||
|
||||
if !exists {
|
||||
return "", fmt.Errorf("no active voice stream for this session")
|
||||
}
|
||||
|
||||
// Send stop action to voice-service
|
||||
session.mu.Lock()
|
||||
stopMsg, _ := json.Marshal(map[string]interface{}{"action": "stop"})
|
||||
session.voiceConn.WriteMessage(websocket.TextMessage, stopMsg)
|
||||
session.mu.Unlock()
|
||||
|
||||
// Wait for result processing to finish
|
||||
select {
|
||||
case <-session.done:
|
||||
case <-time.After(15 * time.Second):
|
||||
logger.Printf("[voice-stream] 等待最终结果超时: client=%s", clientID)
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
session.close()
|
||||
m.mu.Lock()
|
||||
delete(m.sessions, key)
|
||||
m.mu.Unlock()
|
||||
|
||||
text := session.finalText
|
||||
if text == "" {
|
||||
text = session.interimBuf.String()
|
||||
}
|
||||
logger.Printf("[voice-stream] 流式 STT 结束: client=%s, text=%q", clientID, text)
|
||||
return text, nil
|
||||
}
|
||||
|
||||
// CancelStream forcibly terminates a voice stream.
|
||||
func (m *VoiceStreamManager) CancelStream(clientID, sessionID string) {
|
||||
m.mu.Lock()
|
||||
key := m.sessionKey(clientID, sessionID)
|
||||
session, exists := m.sessions[key]
|
||||
if exists {
|
||||
delete(m.sessions, key)
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
if exists {
|
||||
session.close()
|
||||
logger.Printf("[voice-stream] 流式 STT 已取消: client=%s", clientID)
|
||||
}
|
||||
}
|
||||
|
||||
// readResults reads STT results from voice-service and forwards them to the client.
|
||||
func (s *voiceStreamSession) readResults(mgr *VoiceStreamManager, key string) {
|
||||
defer close(s.done)
|
||||
|
||||
voiceConn := s.voiceConn
|
||||
for {
|
||||
msgType, data, err := voiceConn.ReadMessage()
|
||||
if err != nil {
|
||||
if websocket.IsUnexpectedCloseError(err, websocket.CloseGoingAway, websocket.CloseNormalClosure) {
|
||||
logger.Printf("[voice-stream] voice-service 读取错误: %v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
if msgType != websocket.TextMessage {
|
||||
continue
|
||||
}
|
||||
|
||||
var result struct {
|
||||
Type string `json:"type"`
|
||||
Text string `json:"text"`
|
||||
IsFinal bool `json:"isFinal"`
|
||||
Error string `json:"error"`
|
||||
}
|
||||
if err := json.Unmarshal(data, &result); err != nil {
|
||||
logger.Printf("[voice-stream] 解析结果失败: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
if result.Error != "" {
|
||||
logger.Printf("[voice-stream] voice-service 错误: %s", result.Error)
|
||||
s.client.SendMessage(ws.ServerMessage{
|
||||
Type: "voice_interim",
|
||||
MessageID: "voice_" + generateID(),
|
||||
Error: result.Error,
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
if result.Text != "" {
|
||||
if result.IsFinal {
|
||||
s.finalText = result.Text
|
||||
s.client.SendMessage(ws.ServerMessage{
|
||||
Type: "voice_final",
|
||||
MessageID: "voice_" + generateID(),
|
||||
Text: result.Text,
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
// Interim result — accumulate and forward
|
||||
s.interimBuf.Reset()
|
||||
s.interimBuf.WriteString(result.Text)
|
||||
s.client.SendMessage(ws.ServerMessage{
|
||||
Type: "voice_interim",
|
||||
MessageID: "voice_" + generateID(),
|
||||
Text: result.Text,
|
||||
Timestamp: time.Now().UnixMilli(),
|
||||
})
|
||||
}
|
||||
|
||||
// "done" type from voice-service signals end of results
|
||||
if result.Type == "done" {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *voiceStreamSession) close() {
|
||||
if s.voiceConn != nil {
|
||||
s.voiceConn.Close()
|
||||
}
|
||||
}
|
||||
|
||||
// HasActiveStream checks if a client already has an active voice stream.
|
||||
func (m *VoiceStreamManager) HasActiveStream(clientID, sessionID string) bool {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
_, exists := m.sessions[m.sessionKey(clientID, sessionID)]
|
||||
return exists
|
||||
}
|
||||
|
||||
// CleanupClient removes all streams for a client.
|
||||
func (m *VoiceStreamManager) CleanupClient(clientID string) {
|
||||
m.mu.Lock()
|
||||
var toRemove []string
|
||||
for key, session := range m.sessions {
|
||||
if session.client.ClientID == clientID {
|
||||
toRemove = append(toRemove, key)
|
||||
}
|
||||
}
|
||||
for _, key := range toRemove {
|
||||
delete(m.sessions, key)
|
||||
}
|
||||
m.mu.Unlock()
|
||||
|
||||
for _, key := range toRemove {
|
||||
// Close connection if session exists (we already deleted from map)
|
||||
logger.Printf("[voice-stream] 清理客户端流: key=%s", key)
|
||||
}
|
||||
}
|
||||
@@ -15,11 +15,14 @@ type MessageAttachment struct {
|
||||
|
||||
// 客户端 → 服务端消息
|
||||
type ClientMessage struct {
|
||||
Type string `json:"type"` // message | voice_input | ping | history
|
||||
Type string `json:"type"` // message | voice_input | voice_stream_start | voice_stream_chunk | voice_stream_end | ping | history
|
||||
SessionID string `json:"session_id"`
|
||||
Mode string `json:"mode"` // text | voice_msg | voice_assistant
|
||||
Content string `json:"content"`
|
||||
AudioData string `json:"audio_data,omitempty"` // base64
|
||||
Format string `json:"format,omitempty"` // 音频格式: webm, wav, pcm, opus
|
||||
Language string `json:"language,omitempty"` // 识别语言: zh, en, ja, ko, auto
|
||||
Sequence int `json:"sequence,omitempty"` // 音频块序列号 (voice_stream_chunk)
|
||||
Attachments []MessageAttachment `json:"attachments,omitempty"` // 图片等附件
|
||||
Timestamp int64 `json:"timestamp"`
|
||||
ClientID string `json:"client_id,omitempty"` // 客户端唯一标识 (多端区分)
|
||||
@@ -28,11 +31,12 @@ type ClientMessage struct {
|
||||
ClientMsgID string `json:"client_msg_id,omitempty"` // 客户端消息ID (跨端去重)
|
||||
}
|
||||
|
||||
// ReviewMessage 审查后的结构化消息(动作/聊天分离)
|
||||
// ReviewMessage 审查后的结构化消息(动作/聊天/Markdown/代码块/搜索结果)
|
||||
type ReviewMessage struct {
|
||||
Type string `json:"type"` // "action" | "chat"
|
||||
Content string `json:"content"`
|
||||
DelayMs int `json:"delay_ms,omitempty"` // ms to wait before sending (0 = immediate)
|
||||
Type string `json:"type"` // action | chat | markdown | code | search_result
|
||||
Content string `json:"content"`
|
||||
DelayMs int `json:"delay_ms,omitempty"` // ms to wait before sending (0 = immediate)
|
||||
Metadata map[string]any `json:"metadata,omitempty"` // 类型特定元数据 (code 语言、搜索结果 URL 等)
|
||||
}
|
||||
|
||||
// ClientInfo carries the originating client's device metadata.
|
||||
@@ -44,7 +48,7 @@ type ClientInfo struct {
|
||||
|
||||
// 服务端 → 客户端消息
|
||||
type ServerMessage struct {
|
||||
Type string `json:"type"` // response | segment | audio | error | device_update | pong | history_response | stream_chunk | stream_end | background_thinking | notification | multi_message | stream_segments | review | thinking | tool_progress | system_info
|
||||
Type string `json:"type"` // response | segment | audio | error | device_update | pong | history_response | stream_chunk | stream_end | background_thinking | notification | multi_message | stream_segments | review | thinking | tool_progress | system_info | voice_interim | voice_final
|
||||
MessageID string `json:"message_id"`
|
||||
Text string `json:"text,omitempty"`
|
||||
Content string `json:"content,omitempty"` // stream_chunk 的增量文本
|
||||
@@ -63,7 +67,8 @@ type ServerMessage struct {
|
||||
Notification *NotificationInfo `json:"notification,omitempty"` // 通知推送
|
||||
MultiMessage *MultiMessagePayload `json:"multi_message,omitempty"` // 多条消息批量发送
|
||||
ReviewMessages []ReviewMessage `json:"review_messages,omitempty"` // 审查后的结构化消息列表
|
||||
MsgType string `json:"msg_type,omitempty"` // 消息展示类型: action | chat | thinking | tool_progress | system_info
|
||||
MsgType string `json:"msg_type,omitempty"` // 消息展示类型: action | chat | thinking | tool_progress | system_info | markdown | code | search_result
|
||||
Metadata map[string]any `json:"metadata,omitempty"` // 消息元数据 (code 语言等)
|
||||
ToolProgress *ToolProgressInfo `json:"tool_progress,omitempty"` // 工具执行进度
|
||||
SystemInfo *SystemInfoPayload `json:"system_info,omitempty"` // 系统通知信息
|
||||
ProtocolVersion int `json:"protocol_version,omitempty"` // 协议版本
|
||||
|
||||
Reference in New Issue
Block a user