2. Private / Hybrid Architecture
2.1 Architecture Overview
A hybrid architecture combines the convenience of cloud services with the security of a private deployment, and is the preferred approach for enterprise-grade applications.
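The dashed cloud links in the architecture diagram below capture the core trade-off: sensitive workloads stay on the private clusters, while non-sensitive traffic may burst to cloud services. As a rough, hypothetical illustration of that routing decision (the function, fields, and threshold below are assumptions, not part of any specific product), a gateway-side policy might look like this:

from dataclasses import dataclass

@dataclass
class AgentRequest:
    text: str
    contains_internal_data: bool  # e.g. flagged by a data-classification step at the gateway

def select_llm_backend(request: AgentRequest,
                       private_queue_depth: int,
                       max_private_queue: int = 32) -> str:
    """Choose which LLM pool should serve a request (illustrative policy)."""
    if request.contains_internal_data:
        # Sensitive content must never leave the private network.
        return "private_llm"
    if private_queue_depth > max_private_queue:
        # Non-sensitive traffic may fall back to the cloud LLM when the private cluster is saturated.
        return "cloud_llm"
    return "private_llm"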
graph TB
subgraph "Private Network"
subgraph "DMZ"
LB[Load Balancer]
Gateway[API Gateway]
end
subgraph "Application Cluster"
AS1[Agent Service 1]
AS2[Agent Service 2]
AS3[Agent Service 3]
end
subgraph "AI Cluster"
LLM1[Private LLM]
LLM2[Open-Source Model]
LLM3[Quantized Model]
end
subgraph "Data Cluster"
Vector1[(Private Vector Store)]
Vector2[(Distributed Vector Store)]
Cache1[(Redis Cluster)]
DB1[(Private Database)]
end
subgraph "Private Tools"
Tools[Internal Toolset]
Workflow[Workflow Engine]
end
end
subgraph "Cloud Services"
CloudLLM[Cloud LLM]
CloudSearch[Cloud Search]
CloudCache[Cloud Cache]
end
subgraph "Data Sources"
Internal[Internal Data Sources]
External[External Data Sources]
end
LB --> Gateway
Gateway --> AS1
Gateway --> AS2
Gateway --> AS3
AS1 --> LLM1
AS2 --> LLM2
AS3 --> LLM3
AS1 --> Vector1
AS2 --> Vector2
AS3 --> Cache1
AS3 --> DB1
AS1 --> Tools
AS2 --> Workflow
AS3 --> Tools
AS1 -.-> CloudLLM
AS2 -.-> CloudSearch
AS3 -.-> CloudCache
Internal --> AS1
External --> AS2

2.2 Private Agent Implementation
import torch
import transformers
from typing import Dict, List, Optional
import asyncio
import json
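Building on these imports, a minimal privately hosted Agent service could be sketched as follows. This is an illustrative skeleton under assumptions (the PrivateAgent class, the local model path, and the generation settings are hypothetical), not a production implementation:

class PrivateAgent:
    """Sketch of an Agent backed by a locally hosted model; all inference stays in the private network."""

    def __init__(self, model_path: str = "/models/local-llm"):
        # model_path is a hypothetical location of a locally stored Hugging Face checkpoint.
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
        self.model = transformers.AutoModelForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map="auto"
        )

    async def process(self, user_input: str, tools: Optional[List[Dict]] = None) -> Dict:
        # Run blocking generation in a worker thread so the async service loop is not stalled.
        return await asyncio.to_thread(self._generate, user_input, tools)

    def _generate(self, user_input: str, tools: Optional[List[Dict]]) -> Dict:
        prompt = user_input
        if tools:
            # Expose available tools to the model as part of the prompt.
            prompt = "Available tools:\n" + json.dumps(tools, ensure_ascii=False) + "\n\n" + user_input
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        output_ids = self.model.generate(**inputs, max_new_tokens=512)
        completion = self.tokenizer.decode(
            output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
        return {"content": completion}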

Quality Evaluation and Monitoring
In an LLM Agent system, quality evaluation and monitoring are key to ensuring reliability, accuracy, and a good user experience. Because Agent systems involve multi-turn interaction, tool calls, and complex decision processes, evaluating their quality is considerably more complex and varied than for a conventional LLM application. This section analyzes the LLM Agent quality-assurance system along three dimensions: offline evaluation, online evaluation, and monitoring metrics.
1. Offline Evaluation System
1.1 Building the Evaluation Dataset
class AgentEvaluationDataset:
"""LLM Agent评测数据集构建器"""
def __init__(self):
self.scenarios = {
"single_tool": self._build_single_tool_scenarios(),
"multi_tool": self._build_multi_tool_scenarios(),
"planning": self._build_planning_scenarios(),
"memory": self._build_memory_scenarios(),
"multi_agent": self._build_multi_agent_scenarios()
}
def _build_single_tool_scenarios(self):
"""单工具调用场景数据集"""
return [
{
"task_id": "st_001",
"description": "查询天气信息",
"input": "北京今天天气怎么样?",
"expected_tool": "weather_api",
"expected_params": {"city": "北京"},
"expected_output_type": "weather_info",
"gold_answer": "北京今天多云,气温15-22°C",
"evaluation_criteria": [
"tool_correctness",
"parameter_accuracy",
"answer_relevance"
]
},
{
"task_id": "st_002",
"description": "预订会议室",
"input": "明天下午2点预订容纳10人的会议室",
"expected_tool": "meeting_room_booking",
"expected_params": {
"date": "tomorrow",
"time": "14:00",
"capacity": 10
},
"expected_output_type": "booking_confirmation",
"gold_answer": "会议室预订成功,明天下午2点,容纳10人",
"evaluation_criteria": [
"tool_correctness",
"parameter_completeness",
"booking_success"
]
}
]
def _build_planning_scenarios(self):
"""多步规划场景数据集"""
return [
{
"task_id": "pl_001",
"description": "复杂任务分解与执行",
"input": "帮我分析竞争对手的产品策略",
"expected_steps": [
{"step": 1, "action": "search", "tool": "web_search",
"params": {"query": "竞争对手产品分析"}},
{"step": 2, "action": "analyze", "tool": "data_analysis",
"params": {"data_source": "search_results"}},
{"step": 3, "action": "report", "tool": "report_generator",
"params": {"format": "markdown"}}
],
"expected_output_structure": {
"sections": ["executive_summary", "competitive_analysis",
"recommendations", "data_sources"],
"reference_count": ">5"
},
"gold_answer": "包含执行摘要、竞争分析、建议和数据源的完整报告",
"evaluation_criteria": [
"planning_logic",
"step_execution",
"output_completeness",
"analysis_depth"
]
}
        ]
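    # The remaining scenario builders referenced in __init__ (multi-tool, memory,
    # multi-agent) are not shown here; minimal placeholder stubs are assumed so the
    # class can be instantiated. Real datasets would mirror the structures above.
    def _build_multi_tool_scenarios(self):
        return []

    def _build_memory_scenarios(self):
        return []

    def _build_multi_agent_scenarios(self):
        return []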

1.2 Automated Evaluation Framework
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# EvaluationResult and MetricsCollector are helper classes assumed to be defined elsewhere.
class AgentEvaluator:
    """Automated quality evaluator for Agents."""
def __init__(self, agent, evaluation_dataset):
self.agent = agent
self.dataset = evaluation_dataset
self.metrics = MetricsCollector()
def evaluate_single_scenario(self, scenario):
"""单场景评测"""
result = EvaluationResult(scenario["task_id"])
try:
            # Execute the Agent task
agent_response = self.agent.process(scenario["input"])
            # Evaluate tool-call correctness
tool_correctness = self._evaluate_tool_usage(
agent_response, scenario
)
            # Evaluate parameter accuracy
param_accuracy = self._evaluate_parameters(
agent_response, scenario
)
            # Evaluate output quality
output_quality = self._evaluate_output_quality(
agent_response, scenario
)
            # Overall score
overall_score = self._calculate_overall_score({
"tool_correctness": tool_correctness,
"parameter_accuracy": param_accuracy,
"output_quality": output_quality
})
result.update_metrics({
"tool_correctness": tool_correctness,
"parameter_accuracy": param_accuracy,
"output_quality": output_quality,
"overall_score": overall_score,
"execution_time": agent_response.execution_time,
"tokens_used": agent_response.token_count
})
except Exception as e:
result.mark_failed(str(e))
return result
def _evaluate_tool_usage(self, response, scenario):
"""评估工具使用正确性"""
if "expected_tool" not in scenario:
            return 1.0  # no specific tool required
actual_tool = response.selected_tool
expected_tool = scenario["expected_tool"]
if actual_tool == expected_tool:
return 1.0
elif self._is_equivalent_tool(actual_tool, expected_tool):
            return 0.8  # functionally equivalent tool
else:
return 0.0
def _evaluate_parameters(self, response, scenario):
"""评估参数准确性"""
if "expected_params" not in scenario:
return 1.0
actual_params = response.tool_parameters
expected_params = scenario["expected_params"]
param_matches = 0
total_params = len(expected_params)
for key, expected_value in expected_params.items():
if key in actual_params:
if self._compare_values(actual_params[key], expected_value):
param_matches += 1
return param_matches / total_params if total_params > 0 else 1.0
def _evaluate_output_quality(self, response, scenario):
"""评估输出质量"""
criteria = scenario.get("evaluation_criteria", [])
scores = {}
for criterion in criteria:
if criterion == "answer_relevance":
scores[criterion] = self._evaluate_relevance(
response.content, scenario.get("gold_answer", "")
)
elif criterion == "output_completeness":
scores[criterion] = self._evaluate_completeness(
response.content, scenario.get("expected_output_structure", {})
)
elif criterion == "planning_logic":
scores[criterion] = self._evaluate_planning_logic(
response.execution_trace, scenario.get("expected_steps", [])
)
return np.mean(list(scores.values())) if scores else 0.8
def _evaluate_relevance(self, response, gold_answer):
"""相关性评估(使用embedding相似度)"""
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
response_embedding = model.encode(response)
gold_embedding = model.encode(gold_answer)
similarity = cosine_similarity([response_embedding], [gold_embedding])[0][0]
return float(similarity)
def _evaluate_completeness(self, response, expected_structure):
"""完整性评估"""
if not expected_structure:
            return 0.8  # default score
sections = expected_structure.get("sections", [])
if not sections:
return 0.8
present_sections = 0
for section in sections:
if section.lower() in response.lower():
present_sections += 1
return present_sections / len(sections)
def _evaluate_planning_logic(self, execution_trace, expected_steps):
"""规划逻辑评估"""
if not expected_steps:
return 0.8
actual_steps = len(execution_trace)
expected_count = len(expected_steps)
        # Degree of match in step count
count_score = min(actual_steps / expected_count, 1.0)
        # Reasonableness of step ordering
order_score = self._evaluate_step_order(execution_trace, expected_steps)
return (count_score + order_score) / 2
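    # The helper methods referenced above (_calculate_overall_score, _is_equivalent_tool,
    # _compare_values, _evaluate_step_order) are not shown here; the versions below are
    # minimal, assumed sketches so the class runs end to end.
    def _calculate_overall_score(self, scores):
        # Unweighted mean of the individual criterion scores.
        return sum(scores.values()) / len(scores) if scores else 0.0

    def _is_equivalent_tool(self, actual_tool, expected_tool):
        # Hypothetical equivalence map; extend with domain-specific tool aliases.
        equivalents = {"weather_api": {"weather_service"}, "web_search": {"search_engine"}}
        return actual_tool in equivalents.get(expected_tool, set())

    def _compare_values(self, actual, expected):
        # Case-insensitive match for strings, exact equality otherwise.
        if isinstance(actual, str) and isinstance(expected, str):
            return actual.strip().lower() == expected.strip().lower()
        return actual == expected

    def _evaluate_step_order(self, execution_trace, expected_steps):
        # Fraction of expected actions that appear in the trace in the expected order.
        expected_actions = [step.get("action") for step in expected_steps]
        trace_actions = [
            step.get("action") if isinstance(step, dict) else getattr(step, "action", None)
            for step in execution_trace
        ]
        matched, next_expected = 0, 0
        for action in trace_actions:
            if next_expected < len(expected_actions) and action == expected_actions[next_expected]:
                matched += 1
                next_expected += 1
        return matched / len(expected_actions) if expected_actions else 1.0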
def run_full_evaluation(self):
"""运行完整评测"""
results = []
for scenario in self.dataset:
results.append(self.evaluate_single_scenario(scenario))
return results
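Finally, the two classes can be wired together roughly as in the sketch below. This is an illustrative usage example rather than code from the article: MyAgent is a hypothetical stand-in for any agent whose process() call returns an object with the attributes the evaluator reads (selected_tool, tool_parameters, content, execution_trace, execution_time, token_count), and the aggregation assumes EvaluationResult exposes a metrics dict and a failed flag.

# Hypothetical wiring of the evaluation pipeline (MyAgent is a placeholder).
dataset = AgentEvaluationDataset()
scenarios = dataset.scenarios["single_tool"] + dataset.scenarios["planning"]

evaluator = AgentEvaluator(agent=MyAgent(), evaluation_dataset=scenarios)
results = evaluator.run_full_evaluation()

# Simple aggregation, assuming EvaluationResult stores metrics from update_metrics()
# and marks failures via mark_failed().
completed = [r for r in results if not getattr(r, "failed", False)]
if completed:
    mean_score = sum(r.metrics["overall_score"] for r in completed) / len(completed)
    print(f"completed {len(completed)}/{len(results)} scenarios, mean overall score {mean_score:.2f}")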