dark a11904b7c5 docs/build: 补齐中文注释、流程图和 Linux 解压即用打包脚本
- 为 pam_deploy_graph 生产代码补充中文模块、类、函数/方法文档字符串
- 将原有英文说明和主要英文异常提示改为中文
- 新增当前整体逻辑结构流程图文档,覆盖模块结构、执行链路、action 路由、人工确认和 checkpoint 续跑
- 新增 Linux 自带运行环境打包脚本,使用 PyInstaller 生成解压即用目录和 tar.gz
- 新增 Linux 打包说明,包含构建命令、运行方式、依赖说明和包大小评估
- 同步 README,补充流程图、打包方式、产物路径和大小预估
- 更新相关测试断言以匹配中文错误提示
2026-06-01 11:21:42 +08:00

468 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""PAM 部署 Agent 运行时。
本模块不强依赖 LangGraph可独立运行同一组节点也可在
`pam_deploy_graph.graph` 中接入 LangGraph。
"""
from __future__ import annotations
import time
from pathlib import Path
from typing import Any
from .action_router import ActionRouter, build_action_backends
from .checkpoint_store import save_checkpoint
from .config_writer import write_config
from .constants import DEFAULT_PARAMS, GLOBAL_ACTION_SEQUENCE, IP_ACTION_SEQUENCE, REQUIRED_PARAMS
from .fake_runner import FakeActionRunner
from .llm import LlmClient, RuleBasedLlmClient, validate_deploy_plan, validate_intent_result
from .mcp_runner import McpActionRunner
from .models import AgentState, ExecutionStrategy, LlmDeployPlan, LlmIntentResult, LlmParamResult
from .script_runner import ScriptActionRunner, select_script_entry
from .skill_policy import load_skill_policy
class PamDeployAgent:
"""PAM 部署主 Agent串联 LLM、action 路由、确认和续跑状态。"""
def __init__(
self,
*,
skill_path: str | Path = "doc_scripts/PAM_AUTO_DEPLY_SKILL.md",
script_base_dir: str | Path = "doc_scripts",
mcp_runner: McpActionRunner | None = None,
fake_runner: FakeActionRunner | None = None,
llm_client: LlmClient | None = None,
) -> None:
"""初始化策略、脚本 runner、MCP runner、fake runner 和 LLM client。"""
self.skill_policy = load_skill_policy(skill_path)
self.script_base_dir = Path(script_base_dir)
self.script_runner = ScriptActionRunner(self.script_base_dir)
self.fake_runner = fake_runner or FakeActionRunner()
self.mcp_runner = mcp_runner
self.llm_client = llm_client or RuleBasedLlmClient()
self.router = ActionRouter(
script_runner=self.script_runner,
mcp_runner=mcp_runner,
fake_runner=self.fake_runner,
)
def understand_request(self, text: str) -> LlmIntentResult:
"""调用 LLM 识别用户意图,并执行基础校验。"""
result = self.llm_client.understand_request(text)
validate_intent_result(result)
return result
def extract_params(self, text: str, base_params: dict[str, Any] | None = None) -> LlmParamResult:
"""从自然语言中抽取部署参数和控制参数。"""
return self.llm_client.extract_params(text, base_params)
def generate_plan(
self,
*,
params: dict[str, Any],
intent: str,
strategy: ExecutionStrategy,
) -> LlmDeployPlan:
"""根据参数、意图和执行策略生成部署计划。"""
plan = self.llm_client.generate_plan(params=params, intent=intent, strategy=strategy)
validate_deploy_plan(plan)
return plan
def analyze_request(self, text: str, base_params: dict[str, Any] | None = None) -> dict[str, Any]:
"""完成意图识别、参数抽取和计划生成,供 analyze/chat 使用。"""
intent = self.understand_request(text)
params = self.extract_params(text, base_params)
strategy = self._choose_strategy(intent.strategy_preference)
plan = self.generate_plan(
params={**DEFAULT_PARAMS, **params.extracted_params},
intent=intent.intent,
strategy=strategy,
)
return {
"intent": intent,
"params": params,
"plan": plan,
}
def normalize_params(self, params: dict[str, Any]) -> dict[str, Any]:
"""合并默认参数并校验必填参数是否齐全。"""
normalized = {**DEFAULT_PARAMS, **params}
missing = [key for key in REQUIRED_PARAMS if not normalized.get(key)]
if missing:
raise ValueError(f"缺少必填参数: {', '.join(missing)}")
return normalized
def _choose_strategy(self, preference: str) -> ExecutionStrategy:
"""把 LLM 给出的策略偏好转换为内部执行策略。"""
if preference in ("hybrid_node_mcp", "script_only", "fake"):
return preference # type: ignore[return-value]
return "hybrid_node_mcp"
def create_state(
self,
*,
params: dict[str, Any],
execution_strategy: ExecutionStrategy = "hybrid_node_mcp",
run_id: str | None = None,
script_entry: str | None = None,
config_path: str | None = None,
trace_file_path: str | None = None,
checkpoint_path: str | None = None,
target_ips: list[str] | None = None,
) -> AgentState:
"""创建一次运行所需的 AgentState并写入脚本配置文件。"""
normalized = self.normalize_params(params)
actual_run_id = run_id or time.strftime("%Y%m%d_%H%M%S")
actual_script_entry = script_entry or select_script_entry()
runtime_dir = Path("runtime")
actual_config_path = config_path or str(runtime_dir / f"config_{actual_run_id}.txt")
actual_trace_path = trace_file_path or str(Path("logs") / f"api_trace_{actual_run_id}.log")
write_config(normalized, actual_config_path)
return AgentState(
run_id=actual_run_id,
params=normalized,
execution_strategy=execution_strategy,
action_backends=build_action_backends(execution_strategy),
script_entry=actual_script_entry,
script_base_dir=str(self.script_base_dir),
config_path=actual_config_path,
trace_file_path=actual_trace_path,
checkpoint_path=checkpoint_path or "",
target_ips=target_ips or [],
)
def preview(self, params: dict[str, Any], strategy: ExecutionStrategy = "hybrid_node_mcp") -> str:
"""渲染部署预演,展示参数和 action 路由。"""
normalized = self.normalize_params(params)
routes = build_action_backends(strategy)
if strategy == "hybrid_node_mcp":
home_backend = "脚本 action"
node_backend = "MCP"
elif strategy == "script_only":
home_backend = "脚本 action"
node_backend = "脚本 action"
else:
home_backend = "fake"
node_backend = "fake"
lines = [
"## PAM 部署预演",
"",
f"- 执行策略: {strategy}",
f"- PAM_HOME: {home_backend}",
f"- PAM_NODE: {node_backend}",
f"- 机场: {normalized['AIRPORT_CODE']}",
f"- 应用: {normalized['APP_NAME']}",
f"- 模块: {normalized['MODULE_NAME']}",
f"- 版本: {normalized['VERSION_NUMBER']}",
"",
"| action | backend |",
"| --- | --- |",
]
for action in GLOBAL_ACTION_SEQUENCE:
lines.append(f"| `{action}` | `{routes[action]}` |")
return "\n".join(lines)
def run_global_flow(self, state: AgentState) -> AgentState:
"""执行全局部署阶段,并跳过 checkpoint 中已完成的步骤。"""
for action in GLOBAL_ACTION_SEQUENCE:
if action in state.completed_global_steps:
continue
kwargs: dict[str, Any] = {}
if action == "publish-version":
kwargs["hash_code"] = state.hash_code
result = self.router.run_action(state, action, **kwargs)
state.events.append(
{
"type": "ACTION_DONE" if result.ok else "ACTION_FAIL",
"stage": action,
"backend": result.backend,
"message": result.error_summary or "ok",
}
)
if not result.ok:
state.last_failed_step = action
self._save_checkpoint(state)
raise RuntimeError(f"{action} 执行失败: {result.error_summary}")
self._apply_result(state, action, result.values)
state.completed_global_steps.append(action)
state.last_success_step = action
self._save_checkpoint(state)
return state
def run_deploy_flow(self, state: AgentState) -> AgentState:
"""执行完整部署流程:全局阶段后进入逐 IP 阶段。"""
if state.pending_confirmation:
self._save_checkpoint(state)
return state
self.run_global_flow(state)
self.run_ip_flow(state)
return state
def run_ip_flow(self, state: AgentState) -> AgentState:
"""执行逐 IP 部署流程,失败时停在人工确认点。"""
if state.pending_confirmation:
self._save_checkpoint(state)
return state
self._resolve_target_ips(state)
for ip in state.target_ips:
ip_state = state.ip_states.get(ip)
if ip_state and ip_state.get("status") == "SUCCESS":
continue
if ip_state and ip_state.get("status") == "FAILED":
if ip_state.get("rollback_status") == "PENDING_AGENT_CONFIRMATION":
state.pending_confirmation = f"rollback-ip:{ip}"
self._save_checkpoint(state)
return state
continue
if not ip_state:
state.events.append({"type": "IP_START", "ip": ip, "message": "start"})
ip_state = {
"status": "RUNNING",
"completed_steps": [],
"failed_stage": "",
"failure_reason": "",
"rollback_status": "ROLLBACK_NOT_RUN",
"rollback_stop_first": False,
"log_file": "",
}
state.ip_states[ip] = ip_state
for action in IP_ACTION_SEQUENCE:
completed_steps = ip_state.setdefault("completed_steps", [])
if action in completed_steps:
continue
result = self.router.run_action(state, action, ip=ip)
failed = (not result.ok) or self._business_failed(action, result.values)
state.events.append(
{
"type": "ACTION_FAIL" if failed else "ACTION_DONE",
"stage": action,
"backend": result.backend,
"ip": ip,
"message": result.error_summary or result.values.get("MESSAGE", "ok"),
}
)
if failed:
self._record_ip_failure(state, ip, action, result.error_summary or str(result.values))
if action != "download-log":
self._download_log_best_effort(state, ip)
state.pending_confirmation = f"rollback-ip:{ip}"
self._save_checkpoint(state)
return state
self._apply_ip_result(ip_state, action, result.values)
completed_steps.append(action)
self._save_checkpoint(state)
ip_state["status"] = "SUCCESS"
state.events.append({"type": "IP_DONE", "ip": ip, "message": "success"})
self._save_checkpoint(state)
return state
def build_confirmation_request(self, state: AgentState) -> dict[str, Any]:
"""把 pending_confirmation 转换为面向用户的确认请求。"""
if not state.pending_confirmation:
return {}
kind, _, value = state.pending_confirmation.partition(":")
if kind == "rollback-ip":
ip_state = state.ip_states.get(value, {})
return {
"type": "rollback-ip",
"ip": value,
"failed_stage": ip_state.get("failed_stage", ""),
"failure_reason": ip_state.get("failure_reason", ""),
"rollback_stop_first": bool(ip_state.get("rollback_stop_first", False)),
"allowed_decisions": ["approve", "reject"],
}
return {
"type": kind,
"value": value,
"allowed_decisions": ["approve", "reject"],
}
def confirm_pending(self, state: AgentState, *, approved: bool, operator_note: str = "") -> AgentState:
"""处理人工确认结果;当前支持失败 IP 的回滚确认。"""
request = self.build_confirmation_request(state)
if not request:
raise ValueError("当前没有待确认事项")
if request["type"] != "rollback-ip":
raise ValueError(f"不支持的确认类型: {request['type']}")
ip = request["ip"]
ip_state = state.ip_states[ip]
if not approved:
ip_state["rollback_status"] = "REJECTED_BY_OPERATOR"
state.events.append(
{
"type": "CONFIRMATION_REJECTED",
"stage": "rollback-ip",
"ip": ip,
"message": operator_note or "rollback rejected by operator",
}
)
state.pending_confirmation = ""
self._save_checkpoint(state)
return state
result = self.router.run_action(
state,
"rollback-ip",
ip=ip,
stop_first=bool(ip_state.get("rollback_stop_first", False)),
)
ip_state["rollback_status"] = "ROLLBACK_DONE" if result.ok else "ROLLBACK_FAILED"
state.events.append(
{
"type": "ACTION_DONE" if result.ok else "ACTION_FAIL",
"stage": "rollback-ip",
"backend": result.backend,
"ip": ip,
"message": result.error_summary or result.values.get("MESSAGE", "ok"),
}
)
if result.ok:
state.pending_confirmation = ""
state.last_success_step = "rollback-ip"
state.last_failed_step = ""
else:
state.pending_confirmation = f"rollback-ip:{ip}"
state.last_failed_step = "rollback-ip"
self._save_checkpoint(state)
return state
def _apply_result(self, state: AgentState, action: str, values: dict[str, Any]) -> None:
"""把全局 action 返回值写回 AgentState。"""
if "HASH_CODE" in values:
state.hash_code = str(values["HASH_CODE"])
if "NODE_URL" in values:
state.node_url = str(values["NODE_URL"])
if action == "get-online-ips":
ips = values.get("IP", [])
if isinstance(ips, str):
ips = [ips]
state.online_ips = list(ips)
state.target_ips = state.target_ips or state.online_ips.copy()
def _resolve_target_ips(self, state: AgentState) -> None:
"""根据在线 IP 和用户指定 IP 计算最终目标 IP。"""
if not state.target_ips:
state.target_ips = state.online_ips.copy()
return
online = set(state.online_ips)
requested = state.target_ips
state.target_ips = [ip for ip in requested if ip in online]
missing = [ip for ip in requested if ip not in online]
if missing:
state.events.append(
{
"type": "TARGET_SCOPE_CHANGED",
"message": "some requested IPs are not online",
"missing_ips": missing,
"target_ips": state.target_ips,
}
)
def _business_failed(self, action: str, values: dict[str, Any]) -> bool:
"""识别 exit code 之外的业务失败条件。"""
if action == "verify-ip":
success = values.get("SUCCESS")
if success is None:
return False
return str(success).lower() not in ("true", "1", "yes")
return False
def _apply_ip_result(self, ip_state: dict[str, Any], action: str, values: dict[str, Any]) -> None:
"""把逐 IP action 返回值写回单 IP 状态。"""
if action == "download-log":
ip_state["log_file"] = str(values.get("LOG_FILE", ""))
def _record_ip_failure(self, state: AgentState, ip: str, action: str, reason: str) -> None:
"""记录单 IP 失败,并设置待回滚确认状态。"""
ip_state = state.ip_states[ip]
stop_first = action in ("start-ip", "verify-ip")
ip_state.update(
{
"status": "FAILED",
"failed_stage": action,
"failure_reason": reason,
"rollback_status": "PENDING_AGENT_CONFIRMATION",
"rollback_stop_first": stop_first,
}
)
state.last_failed_step = action
state.events.append(
{
"type": "CONFIRMATION_REQUIRED",
"stage": "rollback-ip",
"ip": ip,
"stop_first": stop_first,
"message": f"{action} 执行失败,需要确认是否回滚",
}
)
def _download_log_best_effort(self, state: AgentState, ip: str) -> None:
"""失败后尽力下载日志,日志失败不覆盖原失败原因。"""
result = self.router.run_action(state, "download-log", ip=ip)
ip_state = state.ip_states[ip]
if result.ok:
ip_state["log_file"] = str(result.values.get("LOG_FILE", ""))
state.events.append(
{
"type": "ACTION_DONE",
"stage": "download-log",
"backend": result.backend,
"ip": ip,
"message": "已尽力下载日志",
}
)
else:
state.events.append(
{
"type": "ACTION_FAIL",
"stage": "download-log",
"backend": result.backend,
"ip": ip,
"message": result.error_summary or "尽力下载日志失败",
}
)
def _save_checkpoint(self, state: AgentState) -> None:
"""如果配置了 checkpoint 路径,则保存完整运行状态。"""
if state.checkpoint_path:
save_checkpoint(state, state.checkpoint_path, redact=False)
def render_report(self, state: AgentState) -> str:
"""渲染当前部署状态报告。"""
success = sum(1 for item in state.ip_states.values() if item.get("status") == "SUCCESS")
failed = sum(1 for item in state.ip_states.values() if item.get("status") == "FAILED")
lines = [
"## PAM 智能部署报告",
"",
f"- 执行策略: {state.execution_strategy}",
f"- 机场: {state.params['AIRPORT_CODE']}",
f"- 应用: {state.params['APP_NAME']}",
f"- 模块: {state.params['MODULE_NAME']}",
f"- 版本: {state.params['VERSION_NUMBER']}",
f"- 在线工作站数: {len(state.online_ips)}",
f"- 目标工作站数: {len(state.target_ips)}",
f"- 成功: {success}",
f"- 失败: {failed}",
f"- 待确认: {state.pending_confirmation or '-'}",
"",
"| IP | 状态 | 失败阶段 | 回滚状态 | 日志 |",
"| --- | --- | --- | --- | --- |",
]
for ip, ip_state in state.ip_states.items():
lines.append(
"| {ip} | {status} | {failed_stage} | {rollback_status} | {log_file} |".format(
ip=ip,
status=ip_state.get("status", ""),
failed_stage=ip_state.get("failed_stage") or "-",
rollback_status=ip_state.get("rollback_status") or "-",
log_file=ip_state.get("log_file") or "-",
)
)
return "\n".join(lines)