fix: 后台思考身份混淆 + 静默模式视觉理解 + QQ卡片解析 + 仪表盘状态修复

- 后台思考对话历史增加标签说明,严格区分群聊中不同发送者
- 静默观察模式传入图片URL并预处理,供后台思考参考
- 视觉+OCR双模型结果合并格式优化,避免LLM误认为多张图片
- QQ卡片消息(CQ:json)正确解析标题/类型,不再丢失为[JSON]
- 进程管理器stop()在进程为null时重置pid/startTime,消除矛盾状态

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-31 21:07:25 +08:00
parent a9c79d7887
commit b085e58031
7 changed files with 179 additions and 33 deletions
@@ -180,7 +180,7 @@ func (o *Orchestrator) ProcessInput(
// 0.5 图片预处理: 使用视觉模型分析图片,将描述注入消息
if len(params.Images) > 0 && o.visionProvider != nil {
startTime := time.Now()
augmented := o.preprocessImages(ctx, params.Message, params.Images)
augmented := o.PreprocessImages(ctx, params.Message, params.Images)
if augmented != params.Message {
params.Message = augmented
logger.Printf("[orchestrator] 图片预处理耗时: %v, 原消息=%d字, 增强后=%d字",
@@ -736,25 +736,20 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
}
}
// cacheAssistantMessage caches the assistant response, tagging it with the recipient
// in group chats so dialog history shows who the AI was addressing.
// cacheAssistantMessage caches the assistant response.
//
// NOTE(review): this span looks like a rendered commit diff whose +/- markers were
// stripped, so pre-change and post-change lines appear to be interleaved — confirm
// against the real file. Read literally, the body calls CacheMessage twice: once with
// the "[回复 <nickname>]"-prefixed text and once with the raw fullContent.
func (o *Orchestrator) cacheAssistantMessage(params ProcessParams, fullContent string) {
// Nothing to cache into when no context builder is configured.
if o.contextBuilder == nil {
return
}
// Start from the untagged response text.
cached := fullContent
// Group-channel replies with a non-empty nickname get a "[回复 <nickname>]" header line prepended.
if params.ChannelType == "group" && params.Nickname != "" {
cached = fmt.Sprintf("[回复 %s]\n%s", params.Nickname, fullContent)
}
// Store the (possibly tagged) text under the assistant role for this session.
o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, cached)
// Store the raw response under the assistant role.
// NOTE(review): presumably this call replaces the tagged one above in the post-commit file — TODO confirm.
o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, fullContent)
}
// preprocessImages uses vision and OCR models to analyze images and augments the user message.
// PreprocessImages uses vision and OCR models to analyze images and augments the user message.
// When both vision and OCR providers are available (and are different models), they are called
// in parallel and both results are passed to the chat model for autonomous judgment.
// For standalone images (no text): generates a comprehensive description as the message.
// For text+images: appends image descriptions as contextual annotations.
func (o *Orchestrator) preprocessImages(ctx context.Context, message string, images []string) string {
func (o *Orchestrator) PreprocessImages(ctx context.Context, message string, images []string) string {
visionPromptBase := "请详细描述这张图片的内容,包括场景、物体、人物、文字(如有)、颜色、氛围等所有视觉信息。"
ocrPromptBase := `请逐字逐句完整提取图片中的所有文字内容,保持原有格式和排版。如果图片中没有文字,请回复"无文字"。`
@@ -809,7 +804,7 @@ func (o *Orchestrator) preprocessImages(ctx context.Context, message string, ima
var combined string
switch {
case visionDesc != "" && ocrDesc != "":
combined = fmt.Sprintf("[视觉分析]: %s\n[文字提取(OCR)]: %s", visionDesc, ocrDesc)
combined = fmt.Sprintf("视觉描述:%s\n(图中文字:%s", visionDesc, ocrDesc)
case visionDesc != "":
combined = visionDesc
case ocrDesc != "":
@@ -831,7 +826,7 @@ func (o *Orchestrator) preprocessImages(ctx context.Context, message string, ima
augmented := message
for i, desc := range descriptions {
augmented += fmt.Sprintf("\n\n[图片%d的视觉分析]: %s", i+1, desc)
augmented += fmt.Sprintf("\n\n[图片%d分析结果]: %s", i+1, desc)
}
return augmented
}
@@ -216,7 +216,7 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
if params.ChannelType == "group" {
messages = append(messages, model.LLMMessage{
Role: model.RoleSystem,
Content: "【群聊上下文】这条消息来自QQ群聊。消息前缀 [群聊 群号] 昵称 (QQ号) 标注了真实发送者。你不是在和开拓者一对一私聊,而是在群聊中和不同成员交流。请根据消息前缀中的发送者名字称呼对方不同的人有不同的名字。只在对你说话或延续已有对话时才回复。",
Content: "【群聊上下文】这条消息来自QQ群聊。消息前缀 [群聊 群号] 昵称 (QQ号) 标注了真实发送者。你不是在和开拓者一对一私聊,而是在群聊中和不同成员交流。请根据当前这条消息前缀中的发送者名字称呼对方——即使你之前在历史对话中称呼过别人,也不要把之前用的称呼套在当前发送者身上。不同的人有不同的名字。只在对你说话或延续已有对话时才回复。",
})
}