fix: XML动作标签 + 意图分析上下文 + 图片file_id引用

- 动作消息改用 <action>...</action> XML 标签（注入器 + 解析器 + 测试）
- 括号解析保留为降级方案，确保向后兼容
- 意图分析传入最近对话历史，防止短追问误判为 iot_query
- 意图提示词强化：短追问明确归为 question，iot_query 需设备名词
- 图片附件支持 file_id 轻量引用（Gateway FileStore 解析 + 上传端点复用）
- API 文档更新：附件新格式 + 图片传递链路

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 19:27:25 +08:00
parent 3e15285065
commit c4de813629
9 changed files with 140 additions and 35 deletions
@@ -28,7 +28,7 @@ func NewIntentAnalyzer(llmAdapter *llm.Adapter) *IntentAnalyzer {

 // Analyze 分析用户消息意图
 // 优先使用 LLM，对于简单问候使用关键词快速通道（跳过 LLM 调用）
-func (a *IntentAnalyzer) Analyze(ctx context.Context, userMessage string) (*model.IntentResult, error) {
+func (a *IntentAnalyzer) Analyze(ctx context.Context, userMessage string, historyHint ...string) (*model.IntentResult, error) {
 	// 快速通道：简单问候/闲聊直接返回，跳过 LLM 调用
 	if a.isSimpleGreeting(userMessage) {
 		logger.Printf("[intent] 快速通道: 检测到简单问候，跳过 LLM 分析")
@@ -55,6 +55,10 @@ func (a *IntentAnalyzer) Analyze(ctx context.Context, userMessage string) (*mode
 	}

 	// 构建轻量意图分析提示词
+	userContent := userMessage
+	if len(historyHint) > 0 && historyHint[0] != "" {
+		userContent = fmt.Sprintf("对话上下文: %s\n\n用户消息: %s", historyHint[0], userMessage)
+	}
 	messages := []model.LLMMessage{
 		{
 			Role:    model.RoleSystem,
@@ -62,7 +66,7 @@ func (a *IntentAnalyzer) Analyze(ctx context.Context, userMessage string) (*mode
 		},
 		{
 			Role:    model.RoleUser,
-			Content: fmt.Sprintf("用户消息: %s", userMessage),
+			Content: userContent,
 		},
 	}

@@ -223,13 +227,14 @@ const intentAnalysisSystemPrompt = `分析以下用户消息的意图。只需
 - primary: 用户的主要意图
  - chat: 日常闲聊
  - iot_control: 需要控制智能设备
-  - iot_query: 查询设备状态
-  - question: 提问
+  - iot_query: 查询设备状态（仅当明确提到设备名时才用，如灯/空调/温度）
+  - question: 提问（短追问如"看到了什么""什么意思""然后呢"归此类）
  - emotional: 情绪表达/倾诉
- needs_iot: 是否需要调用 IoT 相关功能
+- needs_iot: 是否需要调用 IoT 相关功能（仅当明确提到设备名词时才为 true）
 - needs_memory: 是否需要检索用户记忆（大部分情况为 true）
 - sentiment: 用户情绪
- urgency: low=普通闲聊, medium=需要回应, high=紧急求助`
+- urgency: low=普通闲聊, medium=需要回应, high=紧急求助
+- 重要：短追问绝不判定为 iot_control 或 iot_query，应判定为 question`

 // parseIntentResponse 从 LLM 响应中解析意图 JSON
 func parseIntentResponse(content string) (*model.IntentResult, error) {
@@ -184,7 +184,8 @@ func (o *Orchestrator) ProcessInput(

 	// 1. 意图分析
 		startTime := time.Now()
-		intent, err := o.intentAnalyzer.Analyze(ctx, params.Message)
+		historyHint := o.buildHistoryHint(params.SessionID)
+		intent, err := o.intentAnalyzer.Analyze(ctx, params.Message, historyHint)
 		if err != nil || intent == nil {
 			logger.Printf("[orchestrator] 意图分析失败: %v，使用默认值", err)
 			intent = &model.IntentResult{
@@ -650,6 +651,31 @@ func (o *Orchestrator) GetHistory(sessionID string, limit int) []model.LLMMessag
 	return o.contextBuilder.GetHistory(sessionID, limit)
 }

+// buildHistoryHint condenses the most recent turns of a session into a short
+// transcript, one line per turn, in the form "<speaker>: <text>".
+// The intent analyzer uses it to tell follow-up questions apart from IoT queries.
+//
+// It returns "" when there is no context builder, no history, or no usable turn.
+// Each turn is flattened to a single line and cut to maxRunesPerTurn runes, so
+// that a multi-line message cannot spill over the line-per-turn layout or mimic
+// the blank-line separator the analyzer places before the current user message.
+func (o *Orchestrator) buildHistoryHint(sessionID string) string {
+	const (
+		maxTurns        = 3  // number of recent messages requested from the history
+		maxRunesPerTurn = 60 // per-turn cap, counted in runes so CJK text is not split mid-character
+	)
+	if o.contextBuilder == nil {
+		return ""
+	}
+	history := o.contextBuilder.GetHistory(sessionID, maxTurns)
+	if len(history) == 0 {
+		return ""
+	}
+	parts := make([]string, 0, len(history))
+	for _, m := range history {
+		// A system prompt is not a dialogue turn; labelling it as the user
+		// would mislead the intent classifier.
+		if m.Role == model.RoleSystem {
+			continue
+		}
+		roleLabel := "用户"
+		if m.Role == model.RoleAssistant {
+			roleLabel = "昔涟"
+		}
+		// Collapse every whitespace run (including newlines) to one space
+		// before truncating, so the rune budget is spent on real content.
+		text := strings.Join(strings.Fields(m.Content), " ")
+		if text == "" {
+			continue // nothing to show for this turn
+		}
+		if runes := []rune(text); len(runes) > maxRunesPerTurn {
+			text = string(runes[:maxRunesPerTurn])
+		}
+		parts = append(parts, fmt.Sprintf("%s: %s", roleLabel, text))
+	}
+	return strings.Join(parts, "\n")
+}
+
 // CacheMessage 缓存消息
 func (o *Orchestrator) CacheMessage(sessionID string, role model.Role, content string) {
 	if o.contextBuilder != nil {
@@ -19,9 +19,17 @@ func TestParseReviewMessages(t *testing.T) {
 		{"动作+聊天", "(歪着头看你) 叶酱，客厅灯早就开着啦♪", 2, []model.ReviewMessageType{model.ReviewMessageAction, model.ReviewMessageChat}},
 		{"聊天+动作", "我帮你关掉了哦 (轻轻按下遥控器)", 2, []model.ReviewMessageType{model.ReviewMessageChat, model.ReviewMessageAction}},
 		{"只有括号但无内容", "", 0, nil},
-		{"空括号", "()", 1, []model.ReviewMessageType{model.ReviewMessageChat}}, // fallback to chat for unparseable bracket
+		{"空括号", "()", 1, []model.ReviewMessageType{model.ReviewMessageChat}},
 		{"多段落", "第一段内容\n\n第二段内容", 2, []model.ReviewMessageType{model.ReviewMessageChat, model.ReviewMessageChat}},
 		{"动作+多段聊天", "(歪头) 第一段\n\n第二段内容", 3, []model.ReviewMessageType{model.ReviewMessageAction, model.ReviewMessageChat, model.ReviewMessageChat}},
+		// XML action tag tests
+		{"XML纯动作", "<action>轻轻晃了晃手指</action>", 1, []model.ReviewMessageType{model.ReviewMessageAction}},
+		{"XML动作+聊天", "<action>歪头看着你</action> 叶酱，今天好开心呀♪", 2, []model.ReviewMessageType{model.ReviewMessageAction, model.ReviewMessageChat}},
+		{"XML聊天+动作+聊天", "你说的对 <action>轻轻敲了敲桌子</action> 不过我还有一个想法", 3, []model.ReviewMessageType{model.ReviewMessageChat, model.ReviewMessageAction, model.ReviewMessageChat}},
+		{"XML多个动作", "<action>歪头</action> <action>轻轻按下遥控器</action> 帮你关掉啦~", 3, []model.ReviewMessageType{model.ReviewMessageAction, model.ReviewMessageAction, model.ReviewMessageChat}},
+		{"XML混合括号降级", "开头聊天 <action>歪头</action> 中间聊天 (括号动作) 结尾聊天", 5, []model.ReviewMessageType{model.ReviewMessageChat, model.ReviewMessageAction, model.ReviewMessageChat, model.ReviewMessageAction, model.ReviewMessageChat}},
+		{"XML空标签忽略", "<action></action> 正常聊天", 1, []model.ReviewMessageType{model.ReviewMessageChat}},
+		{"XML多行动作", "<action>走到窗边\n拉开窗帘</action> 今天阳光真好呢♪", 2, []model.ReviewMessageType{model.ReviewMessageAction, model.ReviewMessageChat}},
 	}

 	for _, tt := range tests {
@@ -10,6 +10,9 @@ import (
 // codeBlockPattern matches fenced code blocks: ```lang\n...\n```
 var codeBlockPattern = regexp.MustCompile("`{3}([^\n]*)\n([\\s\\S]*?)`{3}")

+// actionTagPattern matches one <action>...</action> pair and captures the text
+// between the tags in group 1. The (?s) flag lets "." match newlines, so an
+// action may span several lines; the non-greedy ".*?" stops at the first
+// closing tag, so adjacent tags are matched separately instead of being merged.
+var actionTagPattern = regexp.MustCompile(`(?s)<action>(.*?)</action>`)
+
 // markdownPatterns detects common Markdown syntax for auto-classification.
 var markdownPatterns = []*regexp.Regexp{
 	regexp.MustCompile(`^#{1,6}\s`),                // headings
@@ -73,8 +76,46 @@ func parseReviewMessages(text string) []model.ReviewMessage {
 		})
 	}

-	// Phase 2: bracket-action parser on non-code text
+	// Phase 2: XML action tags + bracket-based fallback
+	var processBracketText func(t string) // pre-declare for mutual reference
+
 	processText := func(t string) {
+		// Walk the <action> tags in order of appearance. Text between tags
+		// (and after the last one) is handed to the bracket-based parser;
+		// each non-empty tag body becomes an action message.
+		cursor := 0
+		for _, loc := range actionTagPattern.FindAllStringSubmatchIndex(t, -1) {
+			// loc[0]:loc[1] spans the whole tag, loc[2]:loc[3] the captured body.
+			tagStart, tagEnd := loc[0], loc[1]
+			if tagStart > cursor {
+				processBracketText(t[cursor:tagStart])
+			}
+			if body := strings.TrimSpace(t[loc[2]:loc[3]]); body != "" {
+				messages = append(messages, model.ReviewMessage{
+					Type:    model.ReviewMessageAction,
+					Content: body,
+				})
+			}
+			cursor = tagEnd
+		}
+		// Trailing text after the final tag (or the whole input when no tag matched).
+		if cursor < len(t) {
+			processBracketText(t[cursor:])
+		}
+	}
+
+	// processBracketText is the bracket-based action parser (backward compat).
+	// Detects (action) and （action） patterns in text that wasn't already handled by XML tags.
+	processBracketText = func(t string) {
 		remaining := t
 		for len(remaining) > 0 {
 			actionStart := -1
@@ -83,11 +124,11 @@ func parseReviewMessages(text string) []model.ReviewMessage {

 			runes := []rune(remaining)
 			for ri, r := range runes {
-				if r == '(' || r == '（' { // fullwidth (
+				if r == '(' || r == '（' {
 					actionStart = len(string(runes[:ri]))
 					closeRune := ')'
 					if r == '（' {
-						closeRune = '）' // fullwidth )
+						closeRune = '）'
 					}
 					for rj := ri + 1; rj < len(runes); rj++ {
 						if runes[rj] == closeRune {
@@ -269,7 +269,8 @@ func (pc *PersonaConfig) buildConversationStyle() string {
 	}
 	sb.WriteString("- 像 LINE 聊天一样，随意、亲切、有温度\n")
 	sb.WriteString("- 偶尔可以用语气词开头：\"嗯...\"、\"啊\"、\"诶\"\n")
-	sb.WriteString("- 执行操作时（开关设备、查询状态等），用括号包裹动作描述，后面跟自然对话。例如：\"(帮你把客厅灯关掉啦) 嗯，已经关好了~\"\n")
+	sb.WriteString("- 表达动作、表情、肢体语言或执行操作时，使用 <action>...</action> 标签包裹，后面跟自然对话。例如：\"<action>帮你把客厅灯关掉啦</action> 嗯，已经关好了~\"\n")
+	sb.WriteString("- 动作标签只能包含纯动作描述，不要把对话内容放进 <action> 标签里\n")

 	if len(cs.SentenceEnders) > 0 {
 		sb.WriteString(fmt.Sprintf("- 句尾可以带这些语气符：%s\n", strings.Join(cs.SentenceEnders, " ")))