dark 05ece1bffc feat: 标准化 LangGraph 运行链路并完善 MCP 接入
- 将 CLI/chat 部署执行切换为 action 级 LangGraph runtime
- 接入 LangGraph interrupt/checkpointer 处理人工确认与恢复
- 保留业务 checkpoint JSON 用于跨进程断点续跑
- 增加 MCP HTTP/SSE server_url 配置支持
- 增加 MCP 独立 OAuth token 鉴权,复用 HOME 的 client_credentials 方式
- 支持从 MCP server list_tools 自动发现 tools,action_tools 仅作为可选覆盖
- 更新 MCP 配置示例、README、打包说明和整体流程图
- 补充 MCP 配置、鉴权和 tool 自动发现测试
2026-06-02 10:44:42 +08:00

555 lines
23 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""PAM 部署 Agent 运行时。
本模块不强依赖 LangGraph可独立运行同一组节点也可在
`pam_deploy_graph.graph` 中接入 LangGraph。
"""
from __future__ import annotations
import time
from dataclasses import asdict
from pathlib import Path
from typing import Any
from .action_router import ActionRouter, build_action_backends
from .checkpoint_store import save_checkpoint
from .config_writer import write_config
from .constants import DEFAULT_PARAMS, GLOBAL_ACTION_SEQUENCE, IP_ACTION_SEQUENCE, REQUIRED_PARAMS
from .fake_runner import FakeActionRunner
from .llm import LlmClient, RuleBasedLlmClient, validate_deploy_plan, validate_intent_result
from .mcp_runner import McpActionRunner
from .models import AgentState, ExecutionStrategy, LlmDeployPlan, LlmIntentResult, LlmParamResult
from .script_runner import ScriptActionRunner, select_script_entry
from .skill_policy import load_skill_policy
class PamDeployAgent:
"""PAM 部署主 Agent串联 LLM、action 路由、确认和续跑状态。"""
def __init__(
self,
*,
skill_path: str | Path = "doc_scripts/PAM_AUTO_DEPLY_SKILL.md",
script_base_dir: str | Path = "doc_scripts",
mcp_runner: McpActionRunner | None = None,
fake_runner: FakeActionRunner | None = None,
llm_client: LlmClient | None = None,
action_analysis_enabled: bool = False,
) -> None:
"""初始化策略、脚本 runner、MCP runner、fake runner 和 LLM client。"""
self.skill_policy = load_skill_policy(skill_path)
self.script_base_dir = Path(script_base_dir)
self.script_runner = ScriptActionRunner(self.script_base_dir)
self.fake_runner = fake_runner or FakeActionRunner()
self.mcp_runner = mcp_runner
self.llm_client = llm_client or RuleBasedLlmClient()
self.action_analysis_enabled = action_analysis_enabled
self.router = ActionRouter(
script_runner=self.script_runner,
mcp_runner=mcp_runner,
fake_runner=self.fake_runner,
)
def understand_request(self, text: str) -> LlmIntentResult:
"""调用 LLM 识别用户意图,并执行基础校验。"""
result = self.llm_client.understand_request(text)
validate_intent_result(result)
return result
def extract_params(self, text: str, base_params: dict[str, Any] | None = None) -> LlmParamResult:
"""从自然语言中抽取部署参数和控制参数。"""
return self.llm_client.extract_params(text, base_params)
def generate_plan(
self,
*,
params: dict[str, Any],
intent: str,
strategy: ExecutionStrategy,
) -> LlmDeployPlan:
"""根据参数、意图和执行策略生成部署计划。"""
plan = self.llm_client.generate_plan(params=params, intent=intent, strategy=strategy)
validate_deploy_plan(plan)
return plan
def analyze_request(self, text: str, base_params: dict[str, Any] | None = None) -> dict[str, Any]:
"""完成意图识别、参数抽取和计划生成,供 analyze/chat 使用。"""
intent = self.understand_request(text)
params = self.extract_params(text, base_params)
strategy = self._choose_strategy(intent.strategy_preference)
plan = self.generate_plan(
params={**DEFAULT_PARAMS, **params.extracted_params},
intent=intent.intent,
strategy=strategy,
)
return {
"intent": intent,
"params": params,
"plan": plan,
}
def normalize_params(self, params: dict[str, Any]) -> dict[str, Any]:
"""合并默认参数并校验必填参数是否齐全。"""
normalized = {**DEFAULT_PARAMS, **params}
missing = [key for key in REQUIRED_PARAMS if not normalized.get(key)]
if missing:
raise ValueError(f"缺少必填参数: {', '.join(missing)}")
return normalized
def _choose_strategy(self, preference: str) -> ExecutionStrategy:
"""把 LLM 给出的策略偏好转换为内部执行策略。"""
if preference in ("hybrid_node_mcp", "script_only", "fake"):
return preference # type: ignore[return-value]
return "hybrid_node_mcp"
def create_state(
self,
*,
params: dict[str, Any],
execution_strategy: ExecutionStrategy = "hybrid_node_mcp",
run_id: str | None = None,
script_entry: str | None = None,
config_path: str | None = None,
trace_file_path: str | None = None,
checkpoint_path: str | None = None,
target_ips: list[str] | None = None,
) -> AgentState:
"""创建一次运行所需的 AgentState并写入脚本配置文件。"""
normalized = self.normalize_params(params)
actual_run_id = run_id or time.strftime("%Y%m%d_%H%M%S")
actual_script_entry = script_entry or select_script_entry()
runtime_dir = Path("runtime")
actual_config_path = config_path or str(runtime_dir / f"config_{actual_run_id}.txt")
actual_trace_path = trace_file_path or str(Path("logs") / f"api_trace_{actual_run_id}.log")
write_config(normalized, actual_config_path)
return AgentState(
run_id=actual_run_id,
params=normalized,
execution_strategy=execution_strategy,
action_backends=build_action_backends(execution_strategy),
script_entry=actual_script_entry,
script_base_dir=str(self.script_base_dir),
config_path=actual_config_path,
trace_file_path=actual_trace_path,
checkpoint_path=checkpoint_path or "",
target_ips=target_ips or [],
)
def preview(self, params: dict[str, Any], strategy: ExecutionStrategy = "hybrid_node_mcp") -> str:
"""渲染部署预览,展示参数和 action 路由。"""
normalized = self.normalize_params(params)
routes = build_action_backends(strategy)
if strategy == "hybrid_node_mcp":
home_backend = "脚本 action"
node_backend = "MCP"
elif strategy == "script_only":
home_backend = "脚本 action"
node_backend = "脚本 action"
else:
home_backend = "fake"
node_backend = "fake"
lines = [
"## PAM 部署预览",
"",
f"- 执行策略: {strategy}",
f"- PAM_HOME: {home_backend}",
f"- PAM_NODE: {node_backend}",
f"- 机场: {normalized['AIRPORT_CODE']}",
f"- 应用: {normalized['APP_NAME']}",
f"- 模块: {normalized['MODULE_NAME']}",
f"- 版本: {normalized['VERSION_NUMBER']}",
"",
"| action | backend |",
"| --- | --- |",
]
for action in GLOBAL_ACTION_SEQUENCE:
lines.append(f"| `{action}` | `{routes[action]}` |")
return "\n".join(lines)
def run_global_flow(self, state: AgentState) -> AgentState:
"""执行全局部署阶段,并跳过 checkpoint 中已完成的步骤。"""
while True:
action = self.next_global_action(state)
if action is None:
return state
self.run_global_action(state, action)
def next_global_action(self, state: AgentState) -> str | None:
"""返回下一个未完成的全局 action。"""
for action in GLOBAL_ACTION_SEQUENCE:
if action in state.completed_global_steps:
continue
return action
return None
def run_global_action(self, state: AgentState, action: str) -> AgentState:
"""执行一个全局 action并把结果写回 AgentState。"""
if action in state.completed_global_steps:
return state
kwargs: dict[str, Any] = {}
if action == "publish-version":
kwargs["hash_code"] = state.hash_code
result = self.router.run_action(state, action, **kwargs)
state.events.append(
{
"type": "ACTION_DONE" if result.ok else "ACTION_FAIL",
"stage": action,
"backend": result.backend,
"message": result.error_summary or "ok",
}
)
self._append_action_analysis(state, action, result)
if not result.ok:
state.last_failed_step = action
self._save_checkpoint(state)
raise RuntimeError(f"{action} 执行失败: {result.error_summary}")
self._apply_result(state, action, result.values)
state.completed_global_steps.append(action)
state.last_success_step = action
self._save_checkpoint(state)
return state
def run_deploy_flow(self, state: AgentState) -> AgentState:
"""执行完整部署流程:全局阶段后进入逐 IP 阶段。"""
if state.pending_confirmation:
self._save_checkpoint(state)
return state
self.run_global_flow(state)
self.run_ip_flow(state)
return state
def run_ip_flow(self, state: AgentState) -> AgentState:
"""执行逐 IP 部署流程,失败时停在人工确认点。"""
while True:
work = self.next_ip_action(state)
if work is None:
return state
ip, action = work
self.run_ip_action(state, ip, action)
def next_ip_action(self, state: AgentState) -> tuple[str, str] | None:
"""返回下一个待执行的单 IP action并按需初始化 IP 状态。"""
if state.pending_confirmation:
self._save_checkpoint(state)
return None
self._resolve_target_ips(state)
for ip in state.target_ips:
ip_state = state.ip_states.get(ip)
if ip_state and ip_state.get("status") == "SUCCESS":
continue
if ip_state and ip_state.get("status") == "FAILED":
if ip_state.get("rollback_status") == "PENDING_AGENT_CONFIRMATION":
state.pending_confirmation = f"rollback-ip:{ip}"
self._save_checkpoint(state)
return None
continue
if not ip_state:
state.events.append({"type": "IP_START", "ip": ip, "message": "start"})
ip_state = {
"status": "RUNNING",
"completed_steps": [],
"failed_stage": "",
"failure_reason": "",
"rollback_status": "ROLLBACK_NOT_RUN",
"rollback_stop_first": False,
"log_file": "",
}
state.ip_states[ip] = ip_state
completed_steps = ip_state.setdefault("completed_steps", [])
for action in IP_ACTION_SEQUENCE:
if action not in completed_steps:
return ip, action
ip_state["status"] = "SUCCESS"
state.events.append({"type": "IP_DONE", "ip": ip, "message": "success"})
self._save_checkpoint(state)
return None
def run_ip_action(self, state: AgentState, ip: str, action: str) -> AgentState:
"""执行一个单 IP action并在失败时设置人工确认点。"""
ip_state = state.ip_states[ip]
completed_steps = ip_state.setdefault("completed_steps", [])
if action in completed_steps:
return state
result = self.router.run_action(state, action, ip=ip)
failed = (not result.ok) or self._business_failed(action, result.values)
state.events.append(
{
"type": "ACTION_FAIL" if failed else "ACTION_DONE",
"stage": action,
"backend": result.backend,
"ip": ip,
"message": result.error_summary or result.values.get("MESSAGE", "ok"),
}
)
self._append_action_analysis(state, action, result, ip=ip)
if failed:
self._record_ip_failure(state, ip, action, result.error_summary or str(result.values))
if action != "download-log":
self._download_log_best_effort(state, ip)
state.pending_confirmation = f"rollback-ip:{ip}"
self._save_checkpoint(state)
return state
self._apply_ip_result(ip_state, action, result.values)
completed_steps.append(action)
self._save_checkpoint(state)
return state
def build_confirmation_request(self, state: AgentState) -> dict[str, Any]:
"""把 pending_confirmation 转换为面向用户的确认请求。"""
if not state.pending_confirmation:
return {}
kind, _, value = state.pending_confirmation.partition(":")
if kind == "rollback-ip":
ip_state = state.ip_states.get(value, {})
return {
"type": "rollback-ip",
"ip": value,
"failed_stage": ip_state.get("failed_stage", ""),
"failure_reason": ip_state.get("failure_reason", ""),
"rollback_stop_first": bool(ip_state.get("rollback_stop_first", False)),
"allowed_decisions": ["approve", "reject"],
}
return {
"type": kind,
"value": value,
"allowed_decisions": ["approve", "reject"],
}
def confirm_pending(self, state: AgentState, *, approved: bool, operator_note: str = "") -> AgentState:
"""处理人工确认结果;当前支持失败 IP 的回滚确认。"""
request = self.build_confirmation_request(state)
if not request:
raise ValueError("当前没有待确认事项")
if request["type"] != "rollback-ip":
raise ValueError(f"不支持的确认类型: {request['type']}")
ip = request["ip"]
ip_state = state.ip_states[ip]
if not approved:
ip_state["rollback_status"] = "REJECTED_BY_OPERATOR"
state.events.append(
{
"type": "CONFIRMATION_REJECTED",
"stage": "rollback-ip",
"ip": ip,
"message": operator_note or "rollback rejected by operator",
}
)
state.pending_confirmation = ""
self._save_checkpoint(state)
return state
result = self.router.run_action(
state,
"rollback-ip",
ip=ip,
stop_first=bool(ip_state.get("rollback_stop_first", False)),
)
ip_state["rollback_status"] = "ROLLBACK_DONE" if result.ok else "ROLLBACK_FAILED"
state.events.append(
{
"type": "ACTION_DONE" if result.ok else "ACTION_FAIL",
"stage": "rollback-ip",
"backend": result.backend,
"ip": ip,
"message": result.error_summary or result.values.get("MESSAGE", "ok"),
}
)
self._append_action_analysis(state, "rollback-ip", result, ip=ip)
if result.ok:
state.pending_confirmation = ""
state.last_success_step = "rollback-ip"
state.last_failed_step = ""
else:
state.pending_confirmation = f"rollback-ip:{ip}"
state.last_failed_step = "rollback-ip"
self._save_checkpoint(state)
return state
def _apply_result(self, state: AgentState, action: str, values: dict[str, Any]) -> None:
"""把全局 action 返回值写回 AgentState。"""
if "HASH_CODE" in values:
state.hash_code = str(values["HASH_CODE"])
if "NODE_URL" in values:
state.node_url = str(values["NODE_URL"])
if action == "get-online-ips":
ips = values.get("IP", [])
if isinstance(ips, str):
ips = [ips]
state.online_ips = list(ips)
state.target_ips = state.target_ips or state.online_ips.copy()
def _resolve_target_ips(self, state: AgentState) -> None:
"""根据在线 IP 和用户指定 IP 计算最终目标 IP。"""
if not state.target_ips:
state.target_ips = state.online_ips.copy()
return
online = set(state.online_ips)
requested = state.target_ips
state.target_ips = [ip for ip in requested if ip in online]
missing = [ip for ip in requested if ip not in online]
if missing:
state.events.append(
{
"type": "TARGET_SCOPE_CHANGED",
"message": "some requested IPs are not online",
"missing_ips": missing,
"target_ips": state.target_ips,
}
)
def _business_failed(self, action: str, values: dict[str, Any]) -> bool:
"""识别 exit code 之外的业务失败条件。"""
if action == "verify-ip":
success = values.get("SUCCESS")
if success is None:
return False
return str(success).lower() not in ("true", "1", "yes")
return False
def _apply_ip_result(self, ip_state: dict[str, Any], action: str, values: dict[str, Any]) -> None:
"""把逐 IP action 返回值写回单 IP 状态。"""
if action == "download-log":
ip_state["log_file"] = str(values.get("LOG_FILE", ""))
def _record_ip_failure(self, state: AgentState, ip: str, action: str, reason: str) -> None:
"""记录单 IP 失败,并设置待回滚确认状态。"""
ip_state = state.ip_states[ip]
stop_first = action in ("start-ip", "verify-ip")
ip_state.update(
{
"status": "FAILED",
"failed_stage": action,
"failure_reason": reason,
"rollback_status": "PENDING_AGENT_CONFIRMATION",
"rollback_stop_first": stop_first,
}
)
state.last_failed_step = action
state.events.append(
{
"type": "CONFIRMATION_REQUIRED",
"stage": "rollback-ip",
"ip": ip,
"stop_first": stop_first,
"message": f"{action} 执行失败,需要确认是否回滚",
}
)
def _download_log_best_effort(self, state: AgentState, ip: str) -> None:
"""失败后尽力下载日志,日志失败不覆盖原失败原因。"""
result = self.router.run_action(state, "download-log", ip=ip)
ip_state = state.ip_states[ip]
if result.ok:
ip_state["log_file"] = str(result.values.get("LOG_FILE", ""))
state.events.append(
{
"type": "ACTION_DONE",
"stage": "download-log",
"backend": result.backend,
"ip": ip,
"message": "已尽力下载日志",
}
)
else:
state.events.append(
{
"type": "ACTION_FAIL",
"stage": "download-log",
"backend": result.backend,
"ip": ip,
"message": result.error_summary or "尽力下载日志失败",
}
)
self._append_action_analysis(state, "download-log", result, ip=ip)
def _save_checkpoint(self, state: AgentState) -> None:
"""如果配置了 checkpoint 路径,则保存完整运行状态。"""
if state.checkpoint_path:
save_checkpoint(state, state.checkpoint_path, redact=False)
def _append_action_analysis(
self,
state: AgentState,
action: str,
result,
*,
ip: str | None = None,
) -> None:
"""启用 action 后分析时,把诊断结果追加到 events。"""
if not self.action_analysis_enabled:
return
try:
analysis = self.llm_client.analyze_action_result(
action=action,
result=result,
state_summary=self._state_summary_for_llm(state, ip=ip),
)
except Exception as exc: # pragma: no cover - 诊断失败不应影响部署主流程
state.events.append(
{
"type": "ACTION_ANALYSIS_FAIL",
"stage": action,
"ip": ip or "",
"message": str(exc),
}
)
return
payload = asdict(analysis)
payload.update({"type": "ACTION_ANALYSIS", "stage": action})
if ip:
payload["ip"] = ip
state.events.append(payload)
def _state_summary_for_llm(self, state: AgentState, *, ip: str | None = None) -> dict[str, Any]:
"""生成给 LLM action 分析使用的脱敏状态摘要。"""
return {
"run_id": state.run_id,
"execution_strategy": state.execution_strategy,
"completed_global_steps": state.completed_global_steps,
"online_ip_count": len(state.online_ips),
"target_ips": state.target_ips,
"current_ip": ip or "",
"current_ip_state": state.ip_states.get(ip, {}) if ip else {},
"pending_confirmation": state.pending_confirmation,
"last_success_step": state.last_success_step,
"last_failed_step": state.last_failed_step,
}
def render_report(self, state: AgentState) -> str:
"""渲染当前部署状态报告。"""
success = sum(1 for item in state.ip_states.values() if item.get("status") == "SUCCESS")
failed = sum(1 for item in state.ip_states.values() if item.get("status") == "FAILED")
lines = [
"## PAM 智能部署报告",
"",
f"- 执行策略: {state.execution_strategy}",
f"- 机场: {state.params['AIRPORT_CODE']}",
f"- 应用: {state.params['APP_NAME']}",
f"- 模块: {state.params['MODULE_NAME']}",
f"- 版本: {state.params['VERSION_NUMBER']}",
f"- 在线工作站数: {len(state.online_ips)}",
f"- 目标工作站数: {len(state.target_ips)}",
f"- 成功: {success}",
f"- 失败: {failed}",
f"- 待确认: {state.pending_confirmation or '-'}",
"",
"| IP | 状态 | 失败阶段 | 回滚状态 | 日志 |",
"| --- | --- | --- | --- | --- |",
]
for ip, ip_state in state.ip_states.items():
lines.append(
"| {ip} | {status} | {failed_stage} | {rollback_status} | {log_file} |".format(
ip=ip,
status=ip_state.get("status", ""),
failed_stage=ip_state.get("failed_stage") or "-",
rollback_status=ip_state.get("rollback_status") or "-",
log_file=ip_state.get("log_file") or "-",
)
)
return "\n".join(lines)