fix: 后台思考身份混淆 + 静默模式视觉理解 + QQ卡片解析 + 仪表盘状态修复

- 后台思考对话历史增加标签说明，严格区分群聊中不同发送者
- 静默观察模式传入图片URL并预处理，供后台思考参考
- 视觉+OCR双模型结果合并格式优化，避免LLM误认为多张图片
- QQ卡片消息(CQ:json)正确解析标题/类型，不再丢失为[JSON]
- 进程管理器stop()在进程为null时重置pid/startTime，消除矛盾状态

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-31 21:07:25 +08:00
parent a9c79d7887
commit b085e58031
7 changed files with 179 additions and 33 deletions
@@ -180,7 +180,7 @@ func (o *Orchestrator) ProcessInput(
 		// 0.5 图片预处理: 使用视觉模型分析图片，将描述注入消息
 	if len(params.Images) > 0 && o.visionProvider != nil {
 		startTime := time.Now()
-		augmented := o.preprocessImages(ctx, params.Message, params.Images)
+		augmented := o.PreprocessImages(ctx, params.Message, params.Images)
 		if augmented != params.Message {
 			params.Message = augmented
 			logger.Printf("[orchestrator] 图片预处理耗时: %v, 原消息=%d字, 增强后=%d字",
@@ -736,25 +736,20 @@ func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content s
 	}
 }

-// cacheAssistantMessage caches the assistant response, tagging it with the recipient
-// in group chats so dialog history shows who the AI was addressing.
+// cacheAssistantMessage caches the assistant response.
 func (o *Orchestrator) cacheAssistantMessage(params ProcessParams, fullContent string) {
 	if o.contextBuilder == nil {
 		return
 	}
-	cached := fullContent
-	if params.ChannelType == "group" && params.Nickname != "" {
-		cached = fmt.Sprintf("[回复 %s]\n%s", params.Nickname, fullContent)
-	}
-	o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, cached)
+	o.contextBuilder.CacheMessage(params.SessionID, model.RoleAssistant, fullContent)
 }

-// preprocessImages uses vision and OCR models to analyze images and augments the user message.
+// PreprocessImages uses vision and OCR models to analyze images and augments the user message.
 // When both vision and OCR providers are available (and are different models), they are called
 // in parallel and both results are passed to the chat model for autonomous judgment.
 // For standalone images (no text): generates a comprehensive description as the message.
 // For text+images: appends image descriptions as contextual annotations.
-func (o *Orchestrator) preprocessImages(ctx context.Context, message string, images []string) string {
+func (o *Orchestrator) PreprocessImages(ctx context.Context, message string, images []string) string {
 	visionPromptBase := "请详细描述这张图片的内容，包括场景、物体、人物、文字（如有）、颜色、氛围等所有视觉信息。"
 	ocrPromptBase := `请逐字逐句完整提取图片中的所有文字内容，保持原有格式和排版。如果图片中没有文字，请回复"无文字"。`

@@ -809,7 +804,7 @@ func (o *Orchestrator) preprocessImages(ctx context.Context, message string, ima
 		var combined string
 		switch {
 		case visionDesc != "" && ocrDesc != "":
-			combined = fmt.Sprintf("[视觉分析]: %s\n[文字提取(OCR)]: %s", visionDesc, ocrDesc)
+			combined = fmt.Sprintf("视觉描述：%s\n（图中文字：%s）", visionDesc, ocrDesc)
 		case visionDesc != "":
 			combined = visionDesc
 		case ocrDesc != "":
@@ -831,7 +826,7 @@ func (o *Orchestrator) preprocessImages(ctx context.Context, message string, ima

 	augmented := message
 	for i, desc := range descriptions {
-		augmented += fmt.Sprintf("\n\n[图片%d的视觉分析]: %s", i+1, desc)
+		augmented += fmt.Sprintf("\n\n[图片%d分析结果]: %s", i+1, desc)
 	}
 	return augmented
 }
@@ -216,7 +216,7 @@ func (s *Synthesizer) buildSynthesizeMessages(params SynthesizeParams) []model.L
 	if params.ChannelType == "group" {
 		messages = append(messages, model.LLMMessage{
 			Role:    model.RoleSystem,
-			Content: "【群聊上下文】这条消息来自QQ群聊。消息前缀 [群聊 群号] 昵称 (QQ号) 标注了真实发送者。你不是在和开拓者一对一私聊，而是在群聊中和不同成员交流。请根据消息前缀中的发送者名字称呼对方，不同的人有不同的名字。只在对你说话或延续已有对话时才回复。",
+			Content: "【群聊上下文】这条消息来自QQ群聊。消息前缀 [群聊 群号] 昵称 (QQ号) 标注了真实发送者。你不是在和开拓者一对一私聊，而是在群聊中和不同成员交流。请根据当前这条消息前缀中的发送者名字来称呼对方——即使你之前在历史对话中称呼过别人，也不要把之前用的称呼套在当前发送者身上。不同的人有不同的名字。只在对你说话或延续已有对话时才回复。",
 		})
 	}