2. Private / Hybrid Architecture
2.1 Architecture Overview
A hybrid architecture combines the convenience of cloud services with the security of a private deployment, and is the preferred approach for enterprise-grade applications.
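The dashed cloud links in the architecture diagram below capture the core trade-off: sensitive workloads stay on the private clusters, while non-sensitive traffic may burst to cloud services. As a rough, hypothetical illustration of that routing decision (the function, fields, and threshold below are assumptions, not part of any specific product), a gateway-side policy might look like this:

from dataclasses import dataclass

@dataclass
class AgentRequest:
    text: str
    contains_internal_data: bool  # e.g. flagged by a data-classification step at the gateway

def select_llm_backend(request: AgentRequest,
                       private_queue_depth: int,
                       max_private_queue: int = 32) -> str:
    """Choose which LLM pool should serve a request (illustrative policy)."""
    if request.contains_internal_data:
        # Sensitive content must never leave the private network.
        return "private_llm"
    if private_queue_depth > max_private_queue:
        # Non-sensitive traffic may fall back to the cloud LLM when the private cluster is saturated.
        return "cloud_llm"
    return "private_llm"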
graph TB
subgraph "Private Network"
subgraph "DMZ"
LB[Load Balancer]
Gateway[API Gateway]
end
subgraph "Application Cluster"
AS1[Agent Service 1]
AS2[Agent Service 2]
AS3[Agent Service 3]
end
subgraph "AI Cluster"
LLM1[Private LLM]
LLM2[Open-Source Model]
LLM3[Quantized Model]
end
subgraph "Data Cluster"
Vector1[(Private Vector Store)]
Vector2[(Distributed Vector Store)]
Cache1[(Redis Cluster)]
DB1[(Private Database)]
end
subgraph "Private Tools"
Tools[Internal Toolset]
Workflow[Workflow Engine]
end
end
subgraph "Cloud Services"
CloudLLM[Cloud LLM]
CloudSearch[Cloud Search]
CloudCache[Cloud Cache]
end
subgraph "Data Sources"
Internal[Internal Data Sources]
External[External Data Sources]
end
LB --> Gateway
Gateway --> AS1
Gateway --> AS2
Gateway --> AS3
AS1 --> LLM1
AS2 --> LLM2
AS3 --> LLM3
AS1 --> Vector1
AS2 --> Vector2
AS3 --> Cache1
AS3 --> DB1
AS1 --> Tools
AS2 --> Workflow
AS3 --> Tools
AS1 -.-> CloudLLM
AS2 -.-> CloudSearch
AS3 -.-> CloudCache
Internal --> AS1
External --> AS2

2.2 Private Agent Implementation
import torch
import transformers
from typing import Dict, List, Optional
import asyncio
import json
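Building on these imports, a minimal privately hosted Agent service could be sketched as follows. This is an illustrative skeleton under assumptions (the PrivateAgent class, the local model path, and the generation settings are hypothetical), not a production implementation:

class PrivateAgent:
    """Sketch of an Agent backed by a locally hosted model; all inference stays in the private network."""

    def __init__(self, model_path: str = "/models/local-llm"):
        # model_path is a hypothetical location of a locally stored Hugging Face checkpoint.
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map="auto"
        )

    async def process(self, user_input: str, tools: Optional[List[Dict]] = None) -> Dict:
        # Run blocking generation in a worker thread so the async service loop is not stalled.
        return await asyncio.to_thread(self._generate, user_input, tools)

    def _generate(self, user_input: str, tools: Optional[List[Dict]]) -> Dict:
        prompt = user_input
        if tools:
            # Expose available tools to the model as part of the prompt.
            prompt = "Available tools:\n" + json.dumps(tools, ensure_ascii=False) + "\n\n" + user_input
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        output_ids = self.model.generate(**inputs, max_new_tokens=512)
        completion = self.tokenizer.decode(
            output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
        return {"content": completion}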

Quality Evaluation and Monitoring
In an LLM Agent system, quality evaluation and monitoring are key to ensuring reliability, accuracy, and a good user experience. Because Agent systems involve multi-turn interaction, tool calls, and complex decision processes, evaluating their quality is considerably more complex and varied than for a conventional LLM application. This section analyzes the LLM Agent quality-assurance system along three dimensions: offline evaluation, online evaluation, and monitoring metrics.
1. Offline Evaluation System
1.1 Building the Evaluation Dataset
class AgentEvaluationDataset:
"""LLM Agent评测数据集构建器"""
def __init__(self):
self.scenarios = {
"single_tool": self._build_single_tool_scenarios(),
"multi_tool": self._build_multi_tool_scenarios(),
"planning": self._build_planning_scenarios(),
"memory": self._build_memory_scenarios(),
"multi_agent": self._build_multi_agent_scenarios()
}
def _build_single_tool_scenarios(self):
"""单工具调用场景数据集"""
return [
{
"task_id": "st_001",
"description": "查询天气信息",
"input": "北京今天天气怎么样?",
"expected_tool": "weather_api",
"expected_params": {"city": "北京"},
"expected_output_type": "weather_info",
"gold_answer": "北京今天多云,气温15-22°C",
"evaluation_criteria": [
"tool_correctness",
"parameter_accuracy",
"answer_relevance"
]
},
{
"task_id": "st_002",
"description": "预订会议室",
"input": "明天下午2点预订容纳10人的会议室",
"expected_tool": "meeting_room_booking",
"expected_params": {
"date": "tomorrow",
"time": "14:00",
"capacity": 10
},
"expected_output_type": "booking_confirmation",
"gold_answer": "会议室预订成功,明天下午2点,容纳10人",
"evaluation_criteria": [
"tool_correctness",
"parameter_completeness",
"booking_success"
]
}
]
def _build_planning_scenarios(self):
"""多步规划场景数据集"""
return [
{
"task_id": "pl_001",
"description": "复杂任务分解与执行",
"input": "帮我分析竞争对手的产品策略",
"expected_steps": [
{"step": 1, "action": "search", "tool": "web_search",
"params": {"query": "竞争对手产品分析"}},
{"step": 2, "action": "analyze", "tool": "data_analysis",
"params": {"data_source": "search_results"}},
{"step": 3, "action": "report", "tool": "report_generator",
"params": {"format": "markdown"}}
],
"expected_output_structure": {
"sections": ["executive_summary", "competitive_analysis",
"recommendations", "data_sources"],
"reference_count": ">5"
},
"gold_answer": "包含执行摘要、竞争分析、建议和数据源的完整报告",
"evaluation_criteria": [
"planning_logic",
"step_execution",
"output_completeness",
"analysis_depth"
]
}
        ]
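    # The remaining scenario builders referenced in __init__ (multi-tool, memory,
    # multi-agent) are not shown here; minimal placeholder stubs are assumed so the
    # class can be instantiated. Real datasets would mirror the structures above.
    def _build_multi_tool_scenarios(self):
        return []

    def _build_memory_scenarios(self):
        return []

    def _build_multi_agent_scenarios(self):
        return []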

1.2 Automated Evaluation Framework
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# EvaluationResult and MetricsCollector are helper classes assumed to be defined elsewhere.
class AgentEvaluator:
    """Automated quality evaluator for Agents."""
def __init__(self, agent, evaluation_dataset):
self.agent = agent
self.dataset = evaluation_dataset
self.metrics = MetricsCollector()
def evaluate_single_scenario(self, scenario):
"""单场景评测"""
result = EvaluationResult(scenario["task_id"])
try:
            # Execute the Agent task
agent_response = self.agent.process(scenario["input"])
            # Evaluate tool-call correctness
tool_correctness = self._evaluate_tool_usage(
agent_response, scenario
)
            # Evaluate parameter accuracy
param_accuracy = self._evaluate_parameters(
agent_response, scenario
)
            # Evaluate output quality
output_quality = self._evaluate_output_quality(
agent_response, scenario
)
            # Overall score
overall_score = self._calculate_overall_score({
"tool_correctness": tool_correctness,
"parameter_accuracy": param_accuracy,
"output_quality": output_quality
})
result.update_metrics({
"tool_correctness": tool_correctness,
"parameter_accuracy": param_accuracy,
"output_quality": output_quality,
"overall_score": overall_score,
"execution_time": agent_response.execution_time,
"tokens_used": agent_response.token_count
})
except Exception as e:
result.mark_failed(str(e))
return result
def _evaluate_tool_usage(self, response, scenario):
"""评估工具使用正确性"""
if "expected_tool" not in scenario:
            return 1.0  # no specific tool required
actual_tool = response.selected_tool
expected_tool = scenario["expected_tool"]
if actual_tool == expected_tool:
return 1.0
elif self._is_equivalent_tool(actual_tool, expected_tool):
            return 0.8  # functionally equivalent tool
else:
return 0.0
def _evaluate_parameters(self, response, scenario):
"""评估参数准确性"""
if "expected_params" not in scenario:
return 1.0
actual_params = response.tool_parameters
expected_params = scenario["expected_params"]
param_matches = 0
total_params = len(expected_params)
for key, expected_value in expected_params.items():
if key in actual_params:
if self._compare_values(actual_params[key], expected_value):
param_matches += 1
return param_matches / total_params if total_params > 0 else 1.0
def _evaluate_output_quality(self, response, scenario):
"""评估输出质量"""
criteria = scenario.get("evaluation_criteria", [])
scores = {}
for criterion in criteria:
if criterion == "answer_relevance":
scores[criterion] = self._evaluate_relevance(
response.content, scenario.get("gold_answer", "")
)
elif criterion == "output_completeness":
scores[criterion] = self._evaluate_completeness(
response.content, scenario.get("expected_output_structure", {})
)
elif criterion == "planning_logic":
scores[criterion] = self._evaluate_planning_logic(
response.execution_trace, scenario.get("expected_steps", [])
)
return np.mean(list(scores.values())) if scores else 0.8
def _evaluate_relevance(self, response, gold_answer):
"""相关性评估(使用embedding相似度)"""
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
response_embedding = model.encode(response)
gold_embedding = model.encode(gold_answer)
similarity = cosine_similarity([response_embedding], [gold_embedding])[0][0]
return float(similarity)
def _evaluate_completeness(self, response, expected_structure):
"""完整性评估"""
if not expected_structure:
            return 0.8  # default score
sections = expected_structure.get("sections", [])
if not sections:
return 0.8
present_sections = 0
for section in sections:
if section.lower() in response.lower():
present_sections += 1
return present_sections / len(sections)
def _evaluate_planning_logic(self, execution_trace, expected_steps):
"""规划逻辑评估"""
if not expected_steps:
return 0.8
actual_steps = len(execution_trace)
expected_count = len(expected_steps)
        # Degree of match in step count
count_score = min(actual_steps / expected_count, 1.0)
        # Reasonableness of step ordering
order_score = self._evaluate_step_order(execution_trace, expected_steps)
return (count_score + order_score) / 2
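    # The helper methods referenced above (_calculate_overall_score, _is_equivalent_tool,
    # _compare_values, _evaluate_step_order) are not shown here; the versions below are
    # minimal, assumed sketches so the class runs end to end.
    def _calculate_overall_score(self, scores):
        # Unweighted mean of the individual criterion scores.
        return sum(scores.values()) / len(scores) if scores else 0.0

    def _is_equivalent_tool(self, actual_tool, expected_tool):
        # Hypothetical equivalence map; extend with domain-specific tool aliases.
        equivalents = {"weather_api": {"weather_service"}, "web_search": {"search_engine"}}
        return actual_tool in equivalents.get(expected_tool, set())

    def _compare_values(self, actual, expected):
        # Case-insensitive match for strings, exact equality otherwise.
        if isinstance(actual, str) and isinstance(expected, str):
            return actual.strip().lower() == expected.strip().lower()
        return actual == expected

    def _evaluate_step_order(self, execution_trace, expected_steps):
        # Fraction of expected actions that appear in the trace in the expected order.
        expected_actions = [step.get("action") for step in expected_steps]
        trace_actions = [
            step.get("action") if isinstance(step, dict) else getattr(step, "action", None)
            for step in execution_trace
        ]
        matched, next_expected = 0, 0
        for action in trace_actions:
            if next_expected < len(expected_actions) and action == expected_actions[next_expected]:
                matched += 1
                next_expected += 1
        return matched / len(expected_actions) if expected_actions else 1.0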
def run_full_evaluation(self):
"""运行完整评测"""
results = []
for scenario in self.dataset:
results.append(self.evaluate_single_scenario(scenario))
return results
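Finally, the two classes can be wired together roughly as in the sketch below. This is an illustrative usage example rather than code from the article: MyAgent is a hypothetical stand-in for any agent whose process() call returns an object with the attributes the evaluator reads (selected_tool, tool_parameters, content, execution_trace, execution_time, token_count), and the aggregation assumes EvaluationResult exposes a metrics dict and a failed flag.

# Hypothetical wiring of the evaluation pipeline (MyAgent is a placeholder).
dataset = AgentEvaluationDataset()
scenarios = dataset.scenarios["single_tool"] + dataset.scenarios["planning"]

evaluator = AgentEvaluator(agent=MyAgent(), evaluation_dataset=scenarios)
results = evaluator.run_full_evaluation()

# Simple aggregation, assuming EvaluationResult stores metrics from update_metrics()
# and marks failures via mark_failed().
completed = [r for r in results if not getattr(r, "failed", False)]
if completed:
    mean_score = sum(r.metrics["overall_score"] for r in completed) / len(completed)
    print(f"completed {len(completed)}/{len(results)} scenarios, mean overall score {mean_score:.2f}")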