feat: ASR语音转写管线 + 群聊身份混淆修复

- 新增ASR语音识别管线: QQ语音→下载音频→qwen3-asr-flash转录→注入用户消息
- 模型名称全部从models.json路由获取,无硬编码
- 修复群聊中AI将非管理员用户误称为管理员昵称(叶酱)的问题
  - 助手回复缓存时标注[回复 昵称 (UID)],防止对话历史中身份混淆
  - 群聊上下文指令改为肯定性表述,移除具体名称提及
- trace面板时间戳改为YYYY-MM-DD HH:MM:SS格式,耗时统一显示为秒
- 修复Go time.Duration纳秒值在前端显示问题(Duration/1e6转毫秒)
- 新增video_tool插件模板
- 优化OpenAI adapter reasoning_content处理

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-31 16:46:47 +08:00
parent d112fdd540
commit a9c79d7887
16 changed files with 780 additions and 67 deletions
@@ -41,6 +41,8 @@ type Orchestrator struct {
toolRegistry *plgManager.ToolRegistry
visionProvider llm.LLMProvider // 视觉模型 (图片预处理)
ocrProvider llm.LLMProvider // OCR 模型 (文字提取,与视觉模型并行调用)
videoProvider llm.LLMProvider // 视频模型 (短视频理解)
asrProvider llm.ASRProvider // ASR 语音识别 (语音消息转录)
}
// SetResponseCache sets the response cache (optional, for Phase 0.2).
@@ -84,6 +86,16 @@ func (o *Orchestrator) SetOCRProvider(op llm.LLMProvider) {
o.ocrProvider = op
}
// SetVideoProvider installs the LLM provider used to analyze short videos.
// While no provider is set (nil), preprocessVideos returns the message unchanged.
func (o *Orchestrator) SetVideoProvider(vp llm.LLMProvider) {
	o.videoProvider = vp
}
// SetASRProvider installs the speech-recognition provider used to transcribe
// voice messages. While no provider is set (nil), preprocessVoice returns the
// message unchanged.
func (o *Orchestrator) SetASRProvider(ap llm.ASRProvider) {
	o.asrProvider = ap
}
// getBus returns the bus or a nop fallback.
func (o *Orchestrator) getBus() bus.Bus {
if o.eventBus == nil {
@@ -121,6 +133,8 @@ type ProcessParams struct {
SessionID string
Message string
Images []string // 图片 base64 data URL (多模态)
VideoURLs []string // 视频 URL (多模态), ≤20s short videos
VoiceURLs []string // 语音 URL (ASR 转录)
Mode string // text / voice_msg / voice_assistant
Nickname string
ChannelType string // direct / group
@@ -174,6 +188,34 @@ func (o *Orchestrator) ProcessInput(
}
// 预处理后清空原始图片,避免后续传给不支持多模态的 Chat 模型
params.Images = nil
// 0.6 视频预处理: 使用视频模型分析短视频 (≤20s),将描述注入消息
if len(params.VideoURLs) > 0 && o.videoProvider != nil {
startTime := time.Now()
augmented := o.preprocessVideos(ctx, params.Message, params.VideoURLs)
if augmented != params.Message {
params.Message = augmented
logger.Printf("[orchestrator] 视频预处理耗时: %v", time.Since(startTime))
}
params.VideoURLs = nil
} else if len(params.VideoURLs) > 0 {
logger.Printf("[orchestrator] 视频模型未配置,丢弃 %d 个视频", len(params.VideoURLs))
params.VideoURLs = nil
}
// 0.7 语音预处理: 使用 ASR 模型转录语音消息,将文本注入消息
if len(params.VoiceURLs) > 0 && o.asrProvider != nil && o.asrProvider.IsAvailable() {
startTime := time.Now()
augmented := o.preprocessVoice(ctx, params.Message, params.VoiceURLs)
if augmented != params.Message {
params.Message = augmented
logger.Printf("[orchestrator] 语音预处理耗时: %v", time.Since(startTime))
}
params.VoiceURLs = nil
} else if len(params.VoiceURLs) > 0 {
logger.Printf("[orchestrator] ASR模型未配置,丢弃 %d 个语音", len(params.VoiceURLs))
params.VoiceURLs = nil
}
} else if len(params.Images) > 0 {
// 未配置 Vision 模型时,告知用户该模型不支持图片,并清空图片避免报错
if params.Message == "" {
@@ -234,7 +276,7 @@ func (o *Orchestrator) ProcessInput(
eventCh <- model.StreamEvent{Type: model.StreamSegments, Segments: segments}
}
eventCh <- model.StreamEvent{Type: model.StreamDone}
o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, fullContent)
o.cacheAssistantMessage(params, fullContent)
logger.Printf("[orchestrator] 缓存响应完成: len=%d", len([]rune(fullContent)))
return
}
@@ -478,7 +520,7 @@ func (o *Orchestrator) ProcessInput(
// 10. 后处理:缓存回复
if fullContent != "" {
o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, fullContent)
o.cacheAssistantMessage(params, fullContent)
if o.responseCache != nil {
o.responseCache.Set(params.Message, fullContent)
}
@@ -694,6 +736,19 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
}
}
// cacheAssistantMessage stores the assistant reply in the context builder's
// message cache under params.SessionID.
//
// For group chats with a known nickname the stored text is prefixed with a
// "[回复 <nickname>]" line so the cached history records whom the reply was
// addressed to. In every other case the reply is stored verbatim. The call is
// a no-op when no context builder is configured.
func (o *Orchestrator) cacheAssistantMessage(params ProcessParams, fullContent string) {
	if o.contextBuilder == nil {
		return
	}

	// Direct chats, or group messages without a nickname: nothing to tag.
	if params.ChannelType != "group" || params.Nickname == "" {
		o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, fullContent)
		return
	}

	tagged := fmt.Sprintf("[回复 %s]\n%s", params.Nickname, fullContent)
	o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, tagged)
}
// preprocessImages uses vision and OCR models to analyze images and augments the user message.
// When both vision and OCR providers are available (and are different models), they are called
// in parallel and both results are passed to the chat model for autonomous judgment.
@@ -781,6 +836,74 @@ func (o *Orchestrator) preprocessImages(ctx context.Context, message string, ima
return augmented
}
// preprocessVideos asks the video provider for a short description of each
// video and folds the descriptions into the user message.
//
// Parameters:
//   - ctx: passed through to every provider call.
//   - message: the user's original text; may be empty.
//   - videoURLs: videos to analyze, one provider call per URL.
//
// Returns:
//   - message unchanged when no video provider is set or no video yielded a
//     non-empty description;
//   - the descriptions joined by blank lines when message is empty;
//   - otherwise message followed by one "[视频N的分析]: ..." section per
//     described video.
//
// N is the 1-based position of the video in videoURLs, not its position among
// the successful results, so a failed or empty analysis of an earlier video
// does not shift the labels of later ones. Failures are logged with the same
// 1-based number and skipped.
func (o *Orchestrator) preprocessVideos(ctx context.Context, message string, videoURLs []string) string {
	if o.videoProvider == nil {
		return message
	}

	// analysis pairs a description with the position of the video it belongs to.
	type analysis struct {
		pos  int // 1-based index into videoURLs
		text string
	}

	analyses := make([]analysis, 0, len(videoURLs))
	for i, url := range videoURLs {
		resp, err := o.videoProvider.Chat(ctx, []model.LLMMessage{
			{Role: model.RoleUser, Content: "请用简短的中文描述这个视频的内容,包括场景、人物、动作等。控制在100字以内。", VideoURLs: []string{url}},
		})
		if err != nil {
			logger.Printf("[orchestrator] 视频 %d 分析失败: %v", i+1, err)
			continue
		}
		if resp.Content != "" {
			analyses = append(analyses, analysis{pos: i + 1, text: resp.Content})
		}
	}
	if len(analyses) == 0 {
		return message
	}

	// With no user text, the descriptions themselves become the message.
	if message == "" {
		texts := make([]string, len(analyses))
		for i, a := range analyses {
			texts[i] = a.text
		}
		return strings.Join(texts, "\n\n")
	}

	var b strings.Builder
	b.WriteString(message)
	for _, a := range analyses {
		fmt.Fprintf(&b, "\n\n[视频%d的分析]: %s", a.pos, a.text)
	}
	return b.String()
}
// preprocessVoice transcribes each voice clip with the ASR provider and folds
// the transcriptions into the user message.
//
// Parameters:
//   - ctx: passed through to every Transcribe call.
//   - message: the user's original text; may be empty.
//   - voiceURLs: voice clips to transcribe, one provider call per URL.
//
// Returns:
//   - message unchanged when the ASR provider is nil or reports itself
//     unavailable, or when no clip yielded a non-empty transcription;
//   - the transcriptions joined by blank lines when message is empty;
//   - otherwise message followed by one "[语音N的转写]: ..." section per
//     transcribed clip.
//
// N is the 1-based position of the clip in voiceURLs, not its position among
// the successful results, so a failed or empty transcription of an earlier
// clip does not shift the labels of later ones. Failures are logged with the
// same 1-based number and skipped.
func (o *Orchestrator) preprocessVoice(ctx context.Context, message string, voiceURLs []string) string {
	if o.asrProvider == nil || !o.asrProvider.IsAvailable() {
		return message
	}

	// transcript pairs a transcription with the position of its source clip.
	type transcript struct {
		pos  int // 1-based index into voiceURLs
		text string
	}

	transcripts := make([]transcript, 0, len(voiceURLs))
	for i, url := range voiceURLs {
		text, err := o.asrProvider.Transcribe(ctx, url)
		if err != nil {
			logger.Printf("[orchestrator] 语音 %d 转录失败: %v", i+1, err)
			continue
		}
		if text != "" {
			transcripts = append(transcripts, transcript{pos: i + 1, text: text})
		}
	}
	if len(transcripts) == 0 {
		return message
	}

	// With no user text, the transcriptions themselves become the message.
	if message == "" {
		texts := make([]string, len(transcripts))
		for i, t := range transcripts {
			texts[i] = t.text
		}
		return strings.Join(texts, "\n\n")
	}

	var b strings.Builder
	b.WriteString(message)
	for _, t := range transcripts {
		fmt.Fprintf(&b, "\n\n[语音%d的转写]: %s", t.pos, t.text)
	}
	return b.String()
}
// Blank-identifier references that keep the time and memory imports in use.
var (
	_ = time.Now
	_ = memory.NewRetriever
)
@@ -35,6 +35,7 @@ type SynthesizeParams struct {
SessionID string
UserMessage string
Images []string // 图片 base64 data URL (多模态)
VideoURLs []string // 视频 URL (多模态)
Nickname string
PersonaPrompt string // 完整人格提示词
DialogHistory []model.LLMMessage // 对话历史
@@ -215,7 +216,7 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
if params.ChannelType == "group" {
messages = append(messages, model.LLMMessage{
Role: model.RoleSystem,
Content: "【群聊上下文】这条消息来自QQ群聊。消息前缀 [群聊 群号] 昵称 (QQ号) 标注了真实发送者。你不是在和开拓者一对一私聊,而是在群聊中和不同成员交流。请用发送者的真实名字称呼,不要叫所有人开拓者或叶酱。只在对你说话或延续已有对话时才回复。",
Content: "【群聊上下文】这条消息来自QQ群聊。消息前缀 [群聊 群号] 昵称 (QQ号) 标注了真实发送者。你不是在和开拓者一对一私聊,而是在群聊中和不同成员交流。请根据消息前缀中的发送者名字称呼对方,不同的人有不同的名字。只在对你说话或延续已有对话时才回复。",
})
}
@@ -280,11 +281,12 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
messages = append(messages, history...)
}
// 当前用户消息 (支持多模态图片)
// 当前用户消息 (支持多模态图片和视频)
messages = append(messages, model.LLMMessage{
Role: model.RoleUser,
Content: params.UserMessage,
Images: params.Images,
Role: model.RoleUser,
Content: params.UserMessage,
Images: params.Images,
VideoURLs: params.VideoURLs,
})
return messages