- 为 pam_deploy_graph 生产代码补充中文模块、类、函数/方法文档字符串 - 将原有英文说明和主要英文异常提示改为中文 - 新增当前整体逻辑结构流程图文档,覆盖模块结构、执行链路、action 路由、人工确认和 checkpoint 续跑 - 新增 Linux 自带运行环境打包脚本,使用 PyInstaller 生成解压即用目录和 tar.gz - 新增 Linux 打包说明,包含构建命令、运行方式、依赖说明和包大小评估 - 同步 README,补充流程图、打包方式、产物路径和大小预估 - 更新相关测试断言以匹配中文错误提示
468 lines
19 KiB
Python
468 lines
19 KiB
Python
"""PAM 部署 Agent 运行时。
|
||
|
||
本模块不强依赖 LangGraph,可独立运行;同一组节点也可在
|
||
`pam_deploy_graph.graph` 中接入 LangGraph。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import time
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
from .action_router import ActionRouter, build_action_backends
|
||
from .checkpoint_store import save_checkpoint
|
||
from .config_writer import write_config
|
||
from .constants import DEFAULT_PARAMS, GLOBAL_ACTION_SEQUENCE, IP_ACTION_SEQUENCE, REQUIRED_PARAMS
|
||
from .fake_runner import FakeActionRunner
|
||
from .llm import LlmClient, RuleBasedLlmClient, validate_deploy_plan, validate_intent_result
|
||
from .mcp_runner import McpActionRunner
|
||
from .models import AgentState, ExecutionStrategy, LlmDeployPlan, LlmIntentResult, LlmParamResult
|
||
from .script_runner import ScriptActionRunner, select_script_entry
|
||
from .skill_policy import load_skill_policy
|
||
|
||
|
||
class PamDeployAgent:
|
||
"""PAM 部署主 Agent,串联 LLM、action 路由、确认和续跑状态。"""
|
||
|
||
def __init__(
|
||
self,
|
||
*,
|
||
skill_path: str | Path = "doc_scripts/PAM_AUTO_DEPLY_SKILL.md",
|
||
script_base_dir: str | Path = "doc_scripts",
|
||
mcp_runner: McpActionRunner | None = None,
|
||
fake_runner: FakeActionRunner | None = None,
|
||
llm_client: LlmClient | None = None,
|
||
) -> None:
|
||
"""初始化策略、脚本 runner、MCP runner、fake runner 和 LLM client。"""
|
||
self.skill_policy = load_skill_policy(skill_path)
|
||
self.script_base_dir = Path(script_base_dir)
|
||
self.script_runner = ScriptActionRunner(self.script_base_dir)
|
||
self.fake_runner = fake_runner or FakeActionRunner()
|
||
self.mcp_runner = mcp_runner
|
||
self.llm_client = llm_client or RuleBasedLlmClient()
|
||
self.router = ActionRouter(
|
||
script_runner=self.script_runner,
|
||
mcp_runner=mcp_runner,
|
||
fake_runner=self.fake_runner,
|
||
)
|
||
|
||
def understand_request(self, text: str) -> LlmIntentResult:
|
||
"""调用 LLM 识别用户意图,并执行基础校验。"""
|
||
result = self.llm_client.understand_request(text)
|
||
validate_intent_result(result)
|
||
return result
|
||
|
||
def extract_params(self, text: str, base_params: dict[str, Any] | None = None) -> LlmParamResult:
|
||
"""从自然语言中抽取部署参数和控制参数。"""
|
||
return self.llm_client.extract_params(text, base_params)
|
||
|
||
def generate_plan(
|
||
self,
|
||
*,
|
||
params: dict[str, Any],
|
||
intent: str,
|
||
strategy: ExecutionStrategy,
|
||
) -> LlmDeployPlan:
|
||
"""根据参数、意图和执行策略生成部署计划。"""
|
||
plan = self.llm_client.generate_plan(params=params, intent=intent, strategy=strategy)
|
||
validate_deploy_plan(plan)
|
||
return plan
|
||
|
||
def analyze_request(self, text: str, base_params: dict[str, Any] | None = None) -> dict[str, Any]:
|
||
"""完成意图识别、参数抽取和计划生成,供 analyze/chat 使用。"""
|
||
intent = self.understand_request(text)
|
||
params = self.extract_params(text, base_params)
|
||
strategy = self._choose_strategy(intent.strategy_preference)
|
||
plan = self.generate_plan(
|
||
params={**DEFAULT_PARAMS, **params.extracted_params},
|
||
intent=intent.intent,
|
||
strategy=strategy,
|
||
)
|
||
return {
|
||
"intent": intent,
|
||
"params": params,
|
||
"plan": plan,
|
||
}
|
||
|
||
def normalize_params(self, params: dict[str, Any]) -> dict[str, Any]:
|
||
"""合并默认参数并校验必填参数是否齐全。"""
|
||
normalized = {**DEFAULT_PARAMS, **params}
|
||
missing = [key for key in REQUIRED_PARAMS if not normalized.get(key)]
|
||
if missing:
|
||
raise ValueError(f"缺少必填参数: {', '.join(missing)}")
|
||
return normalized
|
||
|
||
def _choose_strategy(self, preference: str) -> ExecutionStrategy:
|
||
"""把 LLM 给出的策略偏好转换为内部执行策略。"""
|
||
if preference in ("hybrid_node_mcp", "script_only", "fake"):
|
||
return preference # type: ignore[return-value]
|
||
return "hybrid_node_mcp"
|
||
|
||
def create_state(
|
||
self,
|
||
*,
|
||
params: dict[str, Any],
|
||
execution_strategy: ExecutionStrategy = "hybrid_node_mcp",
|
||
run_id: str | None = None,
|
||
script_entry: str | None = None,
|
||
config_path: str | None = None,
|
||
trace_file_path: str | None = None,
|
||
checkpoint_path: str | None = None,
|
||
target_ips: list[str] | None = None,
|
||
) -> AgentState:
|
||
"""创建一次运行所需的 AgentState,并写入脚本配置文件。"""
|
||
normalized = self.normalize_params(params)
|
||
actual_run_id = run_id or time.strftime("%Y%m%d_%H%M%S")
|
||
actual_script_entry = script_entry or select_script_entry()
|
||
runtime_dir = Path("runtime")
|
||
actual_config_path = config_path or str(runtime_dir / f"config_{actual_run_id}.txt")
|
||
actual_trace_path = trace_file_path or str(Path("logs") / f"api_trace_{actual_run_id}.log")
|
||
write_config(normalized, actual_config_path)
|
||
return AgentState(
|
||
run_id=actual_run_id,
|
||
params=normalized,
|
||
execution_strategy=execution_strategy,
|
||
action_backends=build_action_backends(execution_strategy),
|
||
script_entry=actual_script_entry,
|
||
script_base_dir=str(self.script_base_dir),
|
||
config_path=actual_config_path,
|
||
trace_file_path=actual_trace_path,
|
||
checkpoint_path=checkpoint_path or "",
|
||
target_ips=target_ips or [],
|
||
)
|
||
|
||
def preview(self, params: dict[str, Any], strategy: ExecutionStrategy = "hybrid_node_mcp") -> str:
|
||
"""渲染部署预演,展示参数和 action 路由。"""
|
||
normalized = self.normalize_params(params)
|
||
routes = build_action_backends(strategy)
|
||
if strategy == "hybrid_node_mcp":
|
||
home_backend = "脚本 action"
|
||
node_backend = "MCP"
|
||
elif strategy == "script_only":
|
||
home_backend = "脚本 action"
|
||
node_backend = "脚本 action"
|
||
else:
|
||
home_backend = "fake"
|
||
node_backend = "fake"
|
||
lines = [
|
||
"## PAM 部署预演",
|
||
"",
|
||
f"- 执行策略: {strategy}",
|
||
f"- PAM_HOME: {home_backend}",
|
||
f"- PAM_NODE: {node_backend}",
|
||
f"- 机场: {normalized['AIRPORT_CODE']}",
|
||
f"- 应用: {normalized['APP_NAME']}",
|
||
f"- 模块: {normalized['MODULE_NAME']}",
|
||
f"- 版本: {normalized['VERSION_NUMBER']}",
|
||
"",
|
||
"| action | backend |",
|
||
"| --- | --- |",
|
||
]
|
||
for action in GLOBAL_ACTION_SEQUENCE:
|
||
lines.append(f"| `{action}` | `{routes[action]}` |")
|
||
return "\n".join(lines)
|
||
|
||
def run_global_flow(self, state: AgentState) -> AgentState:
|
||
"""执行全局部署阶段,并跳过 checkpoint 中已完成的步骤。"""
|
||
for action in GLOBAL_ACTION_SEQUENCE:
|
||
if action in state.completed_global_steps:
|
||
continue
|
||
kwargs: dict[str, Any] = {}
|
||
if action == "publish-version":
|
||
kwargs["hash_code"] = state.hash_code
|
||
result = self.router.run_action(state, action, **kwargs)
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_DONE" if result.ok else "ACTION_FAIL",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"message": result.error_summary or "ok",
|
||
}
|
||
)
|
||
if not result.ok:
|
||
state.last_failed_step = action
|
||
self._save_checkpoint(state)
|
||
raise RuntimeError(f"{action} 执行失败: {result.error_summary}")
|
||
self._apply_result(state, action, result.values)
|
||
state.completed_global_steps.append(action)
|
||
state.last_success_step = action
|
||
self._save_checkpoint(state)
|
||
return state
|
||
|
||
def run_deploy_flow(self, state: AgentState) -> AgentState:
|
||
"""执行完整部署流程:全局阶段后进入逐 IP 阶段。"""
|
||
if state.pending_confirmation:
|
||
self._save_checkpoint(state)
|
||
return state
|
||
self.run_global_flow(state)
|
||
self.run_ip_flow(state)
|
||
return state
|
||
|
||
def run_ip_flow(self, state: AgentState) -> AgentState:
|
||
"""执行逐 IP 部署流程,失败时停在人工确认点。"""
|
||
if state.pending_confirmation:
|
||
self._save_checkpoint(state)
|
||
return state
|
||
self._resolve_target_ips(state)
|
||
for ip in state.target_ips:
|
||
ip_state = state.ip_states.get(ip)
|
||
if ip_state and ip_state.get("status") == "SUCCESS":
|
||
continue
|
||
if ip_state and ip_state.get("status") == "FAILED":
|
||
if ip_state.get("rollback_status") == "PENDING_AGENT_CONFIRMATION":
|
||
state.pending_confirmation = f"rollback-ip:{ip}"
|
||
self._save_checkpoint(state)
|
||
return state
|
||
continue
|
||
if not ip_state:
|
||
state.events.append({"type": "IP_START", "ip": ip, "message": "start"})
|
||
ip_state = {
|
||
"status": "RUNNING",
|
||
"completed_steps": [],
|
||
"failed_stage": "",
|
||
"failure_reason": "",
|
||
"rollback_status": "ROLLBACK_NOT_RUN",
|
||
"rollback_stop_first": False,
|
||
"log_file": "",
|
||
}
|
||
state.ip_states[ip] = ip_state
|
||
|
||
for action in IP_ACTION_SEQUENCE:
|
||
completed_steps = ip_state.setdefault("completed_steps", [])
|
||
if action in completed_steps:
|
||
continue
|
||
result = self.router.run_action(state, action, ip=ip)
|
||
failed = (not result.ok) or self._business_failed(action, result.values)
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_FAIL" if failed else "ACTION_DONE",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.error_summary or result.values.get("MESSAGE", "ok"),
|
||
}
|
||
)
|
||
|
||
if failed:
|
||
self._record_ip_failure(state, ip, action, result.error_summary or str(result.values))
|
||
if action != "download-log":
|
||
self._download_log_best_effort(state, ip)
|
||
state.pending_confirmation = f"rollback-ip:{ip}"
|
||
self._save_checkpoint(state)
|
||
return state
|
||
|
||
self._apply_ip_result(ip_state, action, result.values)
|
||
completed_steps.append(action)
|
||
self._save_checkpoint(state)
|
||
|
||
ip_state["status"] = "SUCCESS"
|
||
state.events.append({"type": "IP_DONE", "ip": ip, "message": "success"})
|
||
self._save_checkpoint(state)
|
||
return state
|
||
|
||
def build_confirmation_request(self, state: AgentState) -> dict[str, Any]:
|
||
"""把 pending_confirmation 转换为面向用户的确认请求。"""
|
||
if not state.pending_confirmation:
|
||
return {}
|
||
kind, _, value = state.pending_confirmation.partition(":")
|
||
if kind == "rollback-ip":
|
||
ip_state = state.ip_states.get(value, {})
|
||
return {
|
||
"type": "rollback-ip",
|
||
"ip": value,
|
||
"failed_stage": ip_state.get("failed_stage", ""),
|
||
"failure_reason": ip_state.get("failure_reason", ""),
|
||
"rollback_stop_first": bool(ip_state.get("rollback_stop_first", False)),
|
||
"allowed_decisions": ["approve", "reject"],
|
||
}
|
||
return {
|
||
"type": kind,
|
||
"value": value,
|
||
"allowed_decisions": ["approve", "reject"],
|
||
}
|
||
|
||
def confirm_pending(self, state: AgentState, *, approved: bool, operator_note: str = "") -> AgentState:
|
||
"""处理人工确认结果;当前支持失败 IP 的回滚确认。"""
|
||
request = self.build_confirmation_request(state)
|
||
if not request:
|
||
raise ValueError("当前没有待确认事项")
|
||
if request["type"] != "rollback-ip":
|
||
raise ValueError(f"不支持的确认类型: {request['type']}")
|
||
|
||
ip = request["ip"]
|
||
ip_state = state.ip_states[ip]
|
||
if not approved:
|
||
ip_state["rollback_status"] = "REJECTED_BY_OPERATOR"
|
||
state.events.append(
|
||
{
|
||
"type": "CONFIRMATION_REJECTED",
|
||
"stage": "rollback-ip",
|
||
"ip": ip,
|
||
"message": operator_note or "rollback rejected by operator",
|
||
}
|
||
)
|
||
state.pending_confirmation = ""
|
||
self._save_checkpoint(state)
|
||
return state
|
||
|
||
result = self.router.run_action(
|
||
state,
|
||
"rollback-ip",
|
||
ip=ip,
|
||
stop_first=bool(ip_state.get("rollback_stop_first", False)),
|
||
)
|
||
ip_state["rollback_status"] = "ROLLBACK_DONE" if result.ok else "ROLLBACK_FAILED"
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_DONE" if result.ok else "ACTION_FAIL",
|
||
"stage": "rollback-ip",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.error_summary or result.values.get("MESSAGE", "ok"),
|
||
}
|
||
)
|
||
if result.ok:
|
||
state.pending_confirmation = ""
|
||
state.last_success_step = "rollback-ip"
|
||
state.last_failed_step = ""
|
||
else:
|
||
state.pending_confirmation = f"rollback-ip:{ip}"
|
||
state.last_failed_step = "rollback-ip"
|
||
self._save_checkpoint(state)
|
||
return state
|
||
|
||
def _apply_result(self, state: AgentState, action: str, values: dict[str, Any]) -> None:
|
||
"""把全局 action 返回值写回 AgentState。"""
|
||
if "HASH_CODE" in values:
|
||
state.hash_code = str(values["HASH_CODE"])
|
||
if "NODE_URL" in values:
|
||
state.node_url = str(values["NODE_URL"])
|
||
if action == "get-online-ips":
|
||
ips = values.get("IP", [])
|
||
if isinstance(ips, str):
|
||
ips = [ips]
|
||
state.online_ips = list(ips)
|
||
state.target_ips = state.target_ips or state.online_ips.copy()
|
||
|
||
def _resolve_target_ips(self, state: AgentState) -> None:
|
||
"""根据在线 IP 和用户指定 IP 计算最终目标 IP。"""
|
||
if not state.target_ips:
|
||
state.target_ips = state.online_ips.copy()
|
||
return
|
||
online = set(state.online_ips)
|
||
requested = state.target_ips
|
||
state.target_ips = [ip for ip in requested if ip in online]
|
||
missing = [ip for ip in requested if ip not in online]
|
||
if missing:
|
||
state.events.append(
|
||
{
|
||
"type": "TARGET_SCOPE_CHANGED",
|
||
"message": "some requested IPs are not online",
|
||
"missing_ips": missing,
|
||
"target_ips": state.target_ips,
|
||
}
|
||
)
|
||
|
||
def _business_failed(self, action: str, values: dict[str, Any]) -> bool:
|
||
"""识别 exit code 之外的业务失败条件。"""
|
||
if action == "verify-ip":
|
||
success = values.get("SUCCESS")
|
||
if success is None:
|
||
return False
|
||
return str(success).lower() not in ("true", "1", "yes")
|
||
return False
|
||
|
||
def _apply_ip_result(self, ip_state: dict[str, Any], action: str, values: dict[str, Any]) -> None:
|
||
"""把逐 IP action 返回值写回单 IP 状态。"""
|
||
if action == "download-log":
|
||
ip_state["log_file"] = str(values.get("LOG_FILE", ""))
|
||
|
||
def _record_ip_failure(self, state: AgentState, ip: str, action: str, reason: str) -> None:
|
||
"""记录单 IP 失败,并设置待回滚确认状态。"""
|
||
ip_state = state.ip_states[ip]
|
||
stop_first = action in ("start-ip", "verify-ip")
|
||
ip_state.update(
|
||
{
|
||
"status": "FAILED",
|
||
"failed_stage": action,
|
||
"failure_reason": reason,
|
||
"rollback_status": "PENDING_AGENT_CONFIRMATION",
|
||
"rollback_stop_first": stop_first,
|
||
}
|
||
)
|
||
state.last_failed_step = action
|
||
state.events.append(
|
||
{
|
||
"type": "CONFIRMATION_REQUIRED",
|
||
"stage": "rollback-ip",
|
||
"ip": ip,
|
||
"stop_first": stop_first,
|
||
"message": f"{action} 执行失败,需要确认是否回滚",
|
||
}
|
||
)
|
||
|
||
def _download_log_best_effort(self, state: AgentState, ip: str) -> None:
|
||
"""失败后尽力下载日志,日志失败不覆盖原失败原因。"""
|
||
result = self.router.run_action(state, "download-log", ip=ip)
|
||
ip_state = state.ip_states[ip]
|
||
if result.ok:
|
||
ip_state["log_file"] = str(result.values.get("LOG_FILE", ""))
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_DONE",
|
||
"stage": "download-log",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": "已尽力下载日志",
|
||
}
|
||
)
|
||
else:
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_FAIL",
|
||
"stage": "download-log",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.error_summary or "尽力下载日志失败",
|
||
}
|
||
)
|
||
|
||
def _save_checkpoint(self, state: AgentState) -> None:
|
||
"""如果配置了 checkpoint 路径,则保存完整运行状态。"""
|
||
if state.checkpoint_path:
|
||
save_checkpoint(state, state.checkpoint_path, redact=False)
|
||
|
||
def render_report(self, state: AgentState) -> str:
|
||
"""渲染当前部署状态报告。"""
|
||
success = sum(1 for item in state.ip_states.values() if item.get("status") == "SUCCESS")
|
||
failed = sum(1 for item in state.ip_states.values() if item.get("status") == "FAILED")
|
||
lines = [
|
||
"## PAM 智能部署报告",
|
||
"",
|
||
f"- 执行策略: {state.execution_strategy}",
|
||
f"- 机场: {state.params['AIRPORT_CODE']}",
|
||
f"- 应用: {state.params['APP_NAME']}",
|
||
f"- 模块: {state.params['MODULE_NAME']}",
|
||
f"- 版本: {state.params['VERSION_NUMBER']}",
|
||
f"- 在线工作站数: {len(state.online_ips)}",
|
||
f"- 目标工作站数: {len(state.target_ips)}",
|
||
f"- 成功: {success}",
|
||
f"- 失败: {failed}",
|
||
f"- 待确认: {state.pending_confirmation or '-'}",
|
||
"",
|
||
"| IP | 状态 | 失败阶段 | 回滚状态 | 日志 |",
|
||
"| --- | --- | --- | --- | --- |",
|
||
]
|
||
for ip, ip_state in state.ip_states.items():
|
||
lines.append(
|
||
"| {ip} | {status} | {failed_stage} | {rollback_status} | {log_file} |".format(
|
||
ip=ip,
|
||
status=ip_state.get("status", ""),
|
||
failed_stage=ip_state.get("failed_stage") or "-",
|
||
rollback_status=ip_state.get("rollback_status") or "-",
|
||
log_file=ip_state.get("log_file") or "-",
|
||
)
|
||
)
|
||
return "\n".join(lines)
|