llm 提示词和规则:新增 progress_complete 判断字段。 deploy.sh / deploy.ps1:poll-* action 入口改为单次查询。 interactive.py:chat 会播报进度更新。 config.txt.example / README / packaging 文档 / Skill 文档:同步进度查询参数和新 workflow 语义。 测试补充了进度重复查询、超时暂停、chat 进度播报。
1363 lines
55 KiB
Python
1363 lines
55 KiB
Python
"""PAM 部署 Agent 运行时。
|
||
|
||
本模块不强依赖 LangGraph,可独立运行;同一组节点也可在
|
||
`pam_deploy_graph.graph` 中接入 LangGraph。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
import time
|
||
import logging
|
||
from dataclasses import asdict
|
||
from pathlib import Path
|
||
from typing import Any, Callable
|
||
|
||
from .action_router import ActionRouter, build_action_backends
|
||
from .checkpoint_store import save_checkpoint
|
||
from .config_writer import write_config
|
||
from .constants import DEFAULT_PARAMS, GLOBAL_ACTION_SEQUENCE, IP_ACTION_SEQUENCE, REQUIRED_PARAMS
|
||
from .fake_runner import FakeActionRunner
|
||
from .llm import LlmClient, RuleBasedLlmClient, validate_deploy_plan, validate_intent_result
|
||
from .logging_utils import configure_logging, json_for_log
|
||
from .mcp_runner import McpActionRunner
|
||
from .models import ActionResult, AgentState, ExecutionStrategy, LlmActionAnalysis, LlmDeployPlan, LlmIntentResult, LlmParamResult
|
||
from .script_runner import ScriptActionRunner, select_script_entry
|
||
from .skill_policy import load_skill_policy
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
REQUIRED_ACTION_VALUES = {
|
||
"upload-package": ("HASH_CODE",),
|
||
"get-node-url": ("NODE_URL",),
|
||
}
|
||
|
||
PROGRESS_ACTIONS = {"poll-download-progress", "poll-upgrade-progress"}
|
||
|
||
|
||
class PamDeployAgent:
|
||
"""PAM 部署主 Agent,串联 LLM、action 路由、确认和续跑状态。"""
|
||
|
||
def __init__(
|
||
self,
|
||
*,
|
||
skill_path: str | Path = "doc_scripts/PAM_AUTO_DEPLY_SKILL.md",
|
||
script_base_dir: str | Path = "doc_scripts",
|
||
mcp_runner: McpActionRunner | None = None,
|
||
fake_runner: FakeActionRunner | None = None,
|
||
llm_client: LlmClient | None = None,
|
||
action_analysis_enabled: bool = False,
|
||
progress_callback: Callable[[dict[str, Any]], None] | None = None,
|
||
) -> None:
|
||
"""初始化策略、脚本 runner、MCP runner、fake runner 和 LLM client。"""
|
||
self.log_path = configure_logging()
|
||
self.skill_policy = load_skill_policy(skill_path)
|
||
self.script_base_dir = Path(script_base_dir)
|
||
self.script_runner = ScriptActionRunner(self.script_base_dir)
|
||
self.fake_runner = fake_runner or FakeActionRunner()
|
||
self.mcp_runner = mcp_runner
|
||
self.llm_client = llm_client or RuleBasedLlmClient()
|
||
self.action_analysis_enabled = action_analysis_enabled
|
||
self.progress_callback = progress_callback
|
||
self.router = ActionRouter(
|
||
script_runner=self.script_runner,
|
||
mcp_runner=mcp_runner,
|
||
fake_runner=self.fake_runner,
|
||
)
|
||
logger.info(
|
||
"Agent 初始化完成 skill=%s script_base_dir=%s llm_client=%s mcp_runner=%s action_analysis_events=%s",
|
||
skill_path,
|
||
self.script_base_dir,
|
||
type(self.llm_client).__name__,
|
||
type(self.mcp_runner).__name__ if self.mcp_runner else "",
|
||
self.action_analysis_enabled,
|
||
)
|
||
|
||
def understand_request(self, text: str) -> LlmIntentResult:
|
||
"""调用 LLM 识别用户意图,并执行基础校验。"""
|
||
logger.info("LLM 意图识别开始 client=%s text_len=%s", type(self.llm_client).__name__, len(text))
|
||
try:
|
||
result = self.llm_client.understand_request(text)
|
||
except Exception:
|
||
logger.exception("LLM 意图识别失败 client=%s", type(self.llm_client).__name__)
|
||
raise
|
||
validate_intent_result(result)
|
||
logger.info("LLM 意图识别完成 result=%s", json_for_log(asdict(result)))
|
||
return result
|
||
|
||
def extract_params(self, text: str, base_params: dict[str, Any] | None = None) -> LlmParamResult:
|
||
"""从自然语言中抽取部署参数和控制参数。"""
|
||
logger.info(
|
||
"LLM 参数抽取开始 client=%s text_len=%s base_params=%s",
|
||
type(self.llm_client).__name__,
|
||
len(text),
|
||
json_for_log(base_params or {}),
|
||
)
|
||
try:
|
||
result = self.llm_client.extract_params(text, base_params)
|
||
except Exception:
|
||
logger.exception("LLM 参数抽取失败 client=%s", type(self.llm_client).__name__)
|
||
raise
|
||
logger.info("LLM 参数抽取完成 result=%s", json_for_log(asdict(result)))
|
||
return result
|
||
|
||
def generate_plan(
|
||
self,
|
||
*,
|
||
params: dict[str, Any],
|
||
intent: str,
|
||
strategy: ExecutionStrategy,
|
||
) -> LlmDeployPlan:
|
||
"""根据参数、意图和执行策略生成部署计划。"""
|
||
logger.info(
|
||
"LLM 计划生成开始 client=%s intent=%s strategy=%s params=%s",
|
||
type(self.llm_client).__name__,
|
||
intent,
|
||
strategy,
|
||
json_for_log(params),
|
||
)
|
||
try:
|
||
plan = self.llm_client.generate_plan(params=params, intent=intent, strategy=strategy)
|
||
except Exception:
|
||
logger.exception("LLM 计划生成失败 client=%s intent=%s strategy=%s", type(self.llm_client).__name__, intent, strategy)
|
||
raise
|
||
validate_deploy_plan(plan)
|
||
logger.info("LLM 计划生成完成 result=%s", json_for_log(asdict(plan)))
|
||
return plan
|
||
|
||
def analyze_request(self, text: str, base_params: dict[str, Any] | None = None) -> dict[str, Any]:
|
||
"""完成意图识别、参数抽取和计划生成,供 analyze/chat 使用。"""
|
||
logger.info("需求分析开始 text_len=%s base_params=%s", len(text), json_for_log(base_params or {}))
|
||
intent = self.understand_request(text)
|
||
params = self.extract_params(text, base_params)
|
||
strategy = self._choose_strategy(intent.strategy_preference)
|
||
plan = self.generate_plan(
|
||
params={**DEFAULT_PARAMS, **params.extracted_params},
|
||
intent=intent.intent,
|
||
strategy=strategy,
|
||
)
|
||
result = {
|
||
"intent": intent,
|
||
"params": params,
|
||
"plan": plan,
|
||
}
|
||
logger.info(
|
||
"需求分析完成 intent=%s strategy=%s missing=%s plan_actions=%s",
|
||
intent.intent,
|
||
strategy,
|
||
params.missing_required_params,
|
||
plan.planned_actions,
|
||
)
|
||
return result
|
||
|
||
def normalize_params(self, params: dict[str, Any]) -> dict[str, Any]:
|
||
"""合并默认参数并校验必填参数是否齐全。"""
|
||
normalized = {**DEFAULT_PARAMS, **params}
|
||
missing = [key for key in REQUIRED_PARAMS if not normalized.get(key)]
|
||
if missing:
|
||
raise ValueError(f"缺少必填参数: {', '.join(missing)}")
|
||
normalized["ZIP_FILE_PATH"] = _normalize_local_file_path(str(normalized["ZIP_FILE_PATH"]).strip())
|
||
return normalized
|
||
|
||
def _choose_strategy(self, preference: str) -> ExecutionStrategy:
|
||
"""把 LLM 给出的策略偏好转换为内部执行策略。"""
|
||
if preference in ("hybrid_node_mcp", "script_only", "fake"):
|
||
return preference # type: ignore[return-value]
|
||
return "hybrid_node_mcp"
|
||
|
||
def create_state(
|
||
self,
|
||
*,
|
||
params: dict[str, Any],
|
||
execution_strategy: ExecutionStrategy = "hybrid_node_mcp",
|
||
run_id: str | None = None,
|
||
script_entry: str | None = None,
|
||
config_path: str | None = None,
|
||
trace_file_path: str | None = None,
|
||
checkpoint_path: str | None = None,
|
||
target_ips: list[str] | None = None,
|
||
) -> AgentState:
|
||
"""创建一次运行所需的 AgentState,并写入脚本配置文件。"""
|
||
logger.info(
|
||
"创建 AgentState 开始 strategy=%s checkpoint=%s target_ips=%s params=%s",
|
||
execution_strategy,
|
||
checkpoint_path or "",
|
||
target_ips or [],
|
||
json_for_log(params),
|
||
)
|
||
normalized = self.normalize_params(params)
|
||
actual_run_id = run_id or time.strftime("%Y%m%d_%H%M%S")
|
||
actual_script_entry = script_entry or select_script_entry()
|
||
runtime_dir = Path("runtime")
|
||
actual_config_path = _absolute_path(config_path or runtime_dir / f"config_{actual_run_id}.txt")
|
||
actual_trace_path = _absolute_path(trace_file_path or Path("logs") / f"api_trace_{actual_run_id}.log")
|
||
write_config(normalized, actual_config_path)
|
||
state = AgentState(
|
||
run_id=actual_run_id,
|
||
params=normalized,
|
||
execution_strategy=execution_strategy,
|
||
action_backends=build_action_backends(execution_strategy),
|
||
script_entry=actual_script_entry,
|
||
script_base_dir=str(self.script_base_dir),
|
||
config_path=str(actual_config_path),
|
||
trace_file_path=str(actual_trace_path),
|
||
checkpoint_path=checkpoint_path or "",
|
||
target_ips=target_ips or [],
|
||
)
|
||
logger.info(
|
||
"创建 AgentState 完成 run_id=%s config=%s trace=%s script_entry=%s backends=%s",
|
||
state.run_id,
|
||
state.config_path,
|
||
state.trace_file_path,
|
||
state.script_entry,
|
||
json_for_log(state.action_backends),
|
||
)
|
||
return state
|
||
|
||
def pause_state(
|
||
self,
|
||
state: AgentState,
|
||
*,
|
||
reason: str,
|
||
review_context: dict[str, Any] | None = None,
|
||
) -> AgentState:
|
||
"""将当前 state 标记为暂停,并持久化 checkpoint。"""
|
||
logger.info(
|
||
"暂停 state run_id=%s reason=%s checkpoint=%s context=%s",
|
||
state.run_id,
|
||
reason,
|
||
state.checkpoint_path,
|
||
json_for_log(review_context or {}),
|
||
)
|
||
state.paused = True
|
||
state.pause_reason = reason
|
||
state.review_context = dict(review_context or {})
|
||
self._save_checkpoint(state)
|
||
return state
|
||
|
||
def resume_state(self, state: AgentState) -> AgentState:
|
||
"""清理暂停标记,允许后续继续执行。"""
|
||
logger.info("恢复 state run_id=%s previous_reason=%s checkpoint=%s", state.run_id, state.pause_reason, state.checkpoint_path)
|
||
state.paused = False
|
||
state.pause_reason = ""
|
||
state.review_context = {}
|
||
self._save_checkpoint(state)
|
||
return state
|
||
|
||
def update_state_params(self, state: AgentState, updates: dict[str, Any]) -> AgentState:
|
||
"""热更新 state 中的参数,并回写 config 文件。"""
|
||
logger.info("热更新 state 参数开始 run_id=%s updates=%s", state.run_id, json_for_log(updates))
|
||
merged = {**state.params, **updates}
|
||
normalized = self.normalize_params(merged)
|
||
state.params = normalized
|
||
if state.config_path:
|
||
write_config(normalized, state.config_path)
|
||
self._save_checkpoint(state)
|
||
logger.info("热更新 state 参数完成 run_id=%s config=%s params=%s", state.run_id, state.config_path, json_for_log(state.params))
|
||
return state
|
||
|
||
def preview(self, params: dict[str, Any], strategy: ExecutionStrategy = "hybrid_node_mcp") -> str:
|
||
"""渲染部署预览,展示参数和 action 路由。"""
|
||
normalized = self.normalize_params(params)
|
||
routes = build_action_backends(strategy)
|
||
if strategy == "hybrid_node_mcp":
|
||
home_backend = "脚本 action"
|
||
node_backend = "MCP"
|
||
elif strategy == "script_only":
|
||
home_backend = "脚本 action"
|
||
node_backend = "脚本 action"
|
||
else:
|
||
home_backend = "fake"
|
||
node_backend = "fake"
|
||
lines = [
|
||
"## PAM 部署预览",
|
||
"",
|
||
f"- 执行策略: {strategy}",
|
||
f"- PAM_HOME: {home_backend}",
|
||
f"- PAM_NODE: {node_backend}",
|
||
f"- 机场: {normalized['AIRPORT_CODE']}",
|
||
f"- 应用: {normalized['APP_NAME']}",
|
||
f"- 模块: {normalized['MODULE_NAME']}",
|
||
f"- 版本: {normalized['VERSION_NUMBER']}",
|
||
"",
|
||
"| action | backend |",
|
||
"| --- | --- |",
|
||
]
|
||
for action in GLOBAL_ACTION_SEQUENCE:
|
||
lines.append(f"| `{action}` | `{routes[action]}` |")
|
||
return "\n".join(lines)
|
||
|
||
def run_global_flow(self, state: AgentState) -> AgentState:
|
||
"""执行全局部署阶段,并跳过 checkpoint 中已完成的步骤。"""
|
||
logger.info(
|
||
"全局流程开始 run_id=%s paused=%s completed=%s",
|
||
state.run_id,
|
||
state.paused,
|
||
state.completed_global_steps,
|
||
)
|
||
if state.paused:
|
||
self._save_checkpoint(state)
|
||
return state
|
||
while True:
|
||
action = self.next_global_action(state)
|
||
if action is None:
|
||
logger.info("全局流程完成 run_id=%s completed=%s", state.run_id, state.completed_global_steps)
|
||
return state
|
||
self.run_global_action(state, action)
|
||
|
||
def next_global_action(self, state: AgentState) -> str | None:
|
||
"""返回下一个未完成的全局 action。"""
|
||
if state.paused:
|
||
return None
|
||
for action in GLOBAL_ACTION_SEQUENCE:
|
||
if action in state.completed_global_steps:
|
||
continue
|
||
return action
|
||
return None
|
||
|
||
def run_global_action(self, state: AgentState, action: str) -> AgentState:
|
||
"""执行一个全局 action,并把结果写回 AgentState。"""
|
||
if action in state.completed_global_steps:
|
||
logger.info("跳过已完成全局 action run_id=%s action=%s", state.run_id, action)
|
||
return state
|
||
kwargs: dict[str, Any] = {}
|
||
if action == "publish-version":
|
||
if not state.hash_code:
|
||
state.last_failed_step = action
|
||
self._save_checkpoint(state)
|
||
raise RuntimeError("publish-version 缺少 HASH_CODE,请确认 upload-package 是否成功返回 HASH_CODE")
|
||
kwargs["hash_code"] = state.hash_code
|
||
backend = state.action_backends.get(action, "script")
|
||
logger.info(
|
||
"全局 action 开始 run_id=%s action=%s backend=%s kwargs=%s",
|
||
state.run_id,
|
||
action,
|
||
backend,
|
||
json_for_log(kwargs),
|
||
)
|
||
self._emit_progress({"type": "ACTION_START", "stage": action, "backend": backend})
|
||
try:
|
||
result = self.router.run_action(state, action, **kwargs)
|
||
except Exception as exc:
|
||
logger.exception("全局 action 调用异常 run_id=%s action=%s backend=%s", state.run_id, action, backend)
|
||
result = ActionResult(
|
||
action=action,
|
||
backend=backend,
|
||
ok=False,
|
||
error_summary=str(exc),
|
||
)
|
||
logger.info("全局 action 返回 run_id=%s result=%s", state.run_id, _action_result_for_log(result))
|
||
analysis = self._append_action_analysis(state, action, result)
|
||
if not result.ok:
|
||
fail_event = {
|
||
"type": "ACTION_FAIL",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"message": result.error_summary or "action 执行失败",
|
||
}
|
||
state.events.append(fail_event)
|
||
self._emit_progress(fail_event)
|
||
state.last_failed_step = action
|
||
self.pause_state(
|
||
state,
|
||
reason="action_failed",
|
||
review_context=self._review_context(action=action, analysis=analysis, result=result),
|
||
)
|
||
self._save_checkpoint(state)
|
||
raise RuntimeError(f"{action} 执行失败: {result.error_summary}")
|
||
missing_values = self._missing_required_values(action, result.values)
|
||
if missing_values:
|
||
message = f"{action} 返回缺少必要字段: {', '.join(missing_values)}"
|
||
fail_event = {
|
||
"type": "ACTION_FAIL",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"message": message,
|
||
}
|
||
state.events.append(fail_event)
|
||
self._emit_progress(fail_event)
|
||
state.last_failed_step = action
|
||
self.pause_state(
|
||
state,
|
||
reason="action_missing_required_values",
|
||
review_context={
|
||
"type": "action_review",
|
||
"stage": action,
|
||
"message": message,
|
||
"missing_values": missing_values,
|
||
},
|
||
)
|
||
self._save_checkpoint(state)
|
||
raise RuntimeError(message)
|
||
if analysis is not None and not analysis.should_continue:
|
||
state.last_failed_step = action
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_BLOCKED",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"message": analysis.suggested_action or analysis.possible_reason or "LLM 审核要求暂停",
|
||
}
|
||
)
|
||
self.pause_state(
|
||
state,
|
||
reason="llm_review_blocked",
|
||
review_context=self._review_context(action=action, analysis=analysis, result=result),
|
||
)
|
||
logger.info("全局 action 被 LLM 审核拦截 run_id=%s action=%s analysis=%s", state.run_id, action, json_for_log(asdict(analysis)))
|
||
return state
|
||
if self._handle_progress_action(state, action, result, analysis):
|
||
return state
|
||
self._apply_result(state, action, result.values)
|
||
state.completed_global_steps.append(action)
|
||
state.last_success_step = action
|
||
if state.last_failed_step == action:
|
||
state.last_failed_step = ""
|
||
done_message = self._progress_message(action, result) if action in PROGRESS_ACTIONS else result.values.get("MESSAGE", "ok")
|
||
done_event = {
|
||
"type": "ACTION_DONE",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"message": done_message,
|
||
}
|
||
state.events.append(done_event)
|
||
self._emit_progress(done_event)
|
||
self._save_checkpoint(state)
|
||
logger.info("全局 action 完成 run_id=%s action=%s completed=%s", state.run_id, action, state.completed_global_steps)
|
||
return state
|
||
|
||
def _missing_required_values(self, action: str, values: dict[str, Any]) -> list[str]:
|
||
"""检查 action 成功返回后是否带回后续步骤必需字段。"""
|
||
required = REQUIRED_ACTION_VALUES.get(action, ())
|
||
return [key for key in required if not values.get(key)]
|
||
|
||
def run_deploy_flow(self, state: AgentState) -> AgentState:
|
||
"""执行完整部署流程:全局阶段后进入逐 IP 阶段。"""
|
||
logger.info(
|
||
"部署流程开始 run_id=%s paused=%s pending=%s strategy=%s",
|
||
state.run_id,
|
||
state.paused,
|
||
state.pending_confirmation,
|
||
state.execution_strategy,
|
||
)
|
||
if state.pending_confirmation or state.paused:
|
||
self._save_checkpoint(state)
|
||
return state
|
||
self.run_global_flow(state)
|
||
self.run_ip_flow(state)
|
||
logger.info("部署流程结束 run_id=%s paused=%s pending=%s", state.run_id, state.paused, state.pending_confirmation)
|
||
return state
|
||
|
||
def run_ip_flow(self, state: AgentState) -> AgentState:
|
||
"""执行逐 IP 部署流程,失败时暂停在当前 action,等待修复后重试。"""
|
||
logger.info(
|
||
"逐 IP 流程开始 run_id=%s paused=%s target_ips=%s online_ips=%s",
|
||
state.run_id,
|
||
state.paused,
|
||
state.target_ips,
|
||
state.online_ips,
|
||
)
|
||
if state.paused:
|
||
self._save_checkpoint(state)
|
||
return state
|
||
while True:
|
||
work = self.next_ip_action(state)
|
||
if work is None:
|
||
logger.info("逐 IP 流程完成或等待确认 run_id=%s pending=%s ip_states=%s", state.run_id, state.pending_confirmation, json_for_log(state.ip_states))
|
||
return state
|
||
ip, action = work
|
||
self.run_ip_action(state, ip, action)
|
||
|
||
def next_ip_action(self, state: AgentState) -> tuple[str, str] | None:
|
||
"""返回下一个待执行的单 IP action,并按需初始化 IP 状态。"""
|
||
if state.pending_confirmation or state.paused:
|
||
self._save_checkpoint(state)
|
||
return None
|
||
self._resolve_target_ips(state)
|
||
logger.info("计算下一个 IP action run_id=%s target_ips=%s", state.run_id, state.target_ips)
|
||
for ip in state.target_ips:
|
||
ip_state = state.ip_states.get(ip)
|
||
if ip_state and ip_state.get("status") == "SUCCESS":
|
||
continue
|
||
if ip_state and ip_state.get("status") == "FAILED":
|
||
if ip_state.get("rollback_status") == "ROLLBACK_DONE":
|
||
continue
|
||
failed_stage = str(ip_state.get("failed_stage", ""))
|
||
completed_steps = ip_state.setdefault("completed_steps", [])
|
||
if failed_stage and failed_stage not in completed_steps:
|
||
return ip, failed_stage
|
||
continue
|
||
if not ip_state:
|
||
logger.info("初始化 IP 状态 run_id=%s ip=%s", state.run_id, ip)
|
||
state.events.append({"type": "IP_START", "ip": ip, "message": "start"})
|
||
ip_state = {
|
||
"status": "RUNNING",
|
||
"completed_steps": [],
|
||
"failed_stage": "",
|
||
"failure_reason": "",
|
||
"rollback_status": "ROLLBACK_NOT_RUN",
|
||
"rollback_stop_first": False,
|
||
"log_file": "",
|
||
}
|
||
state.ip_states[ip] = ip_state
|
||
|
||
completed_steps = ip_state.setdefault("completed_steps", [])
|
||
for action in IP_ACTION_SEQUENCE:
|
||
if action not in completed_steps:
|
||
return ip, action
|
||
|
||
ip_state["status"] = "SUCCESS"
|
||
state.events.append({"type": "IP_DONE", "ip": ip, "message": "success"})
|
||
self._save_checkpoint(state)
|
||
logger.info("IP 部署完成 run_id=%s ip=%s", state.run_id, ip)
|
||
return None
|
||
|
||
def run_ip_action(self, state: AgentState, ip: str, action: str) -> AgentState:
|
||
"""执行一个单 IP action;失败时暂停并保留该 action 供 resume 重试。"""
|
||
ip_state = state.ip_states[ip]
|
||
completed_steps = ip_state.setdefault("completed_steps", [])
|
||
if action in completed_steps:
|
||
logger.info("跳过已完成 IP action run_id=%s ip=%s action=%s", state.run_id, ip, action)
|
||
return state
|
||
logger.info(
|
||
"IP action 开始 run_id=%s ip=%s action=%s backend=%s ip_state=%s",
|
||
state.run_id,
|
||
ip,
|
||
action,
|
||
state.action_backends.get(action, ""),
|
||
json_for_log(ip_state),
|
||
)
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_START",
|
||
"stage": action,
|
||
"backend": state.action_backends.get(action, ""),
|
||
"ip": ip,
|
||
}
|
||
)
|
||
backend = state.action_backends.get(action, "script")
|
||
try:
|
||
result = self.router.run_action(state, action, ip=ip)
|
||
except Exception as exc:
|
||
logger.exception("IP action 调用异常 run_id=%s ip=%s action=%s backend=%s", state.run_id, ip, action, backend)
|
||
result = ActionResult(
|
||
action=action,
|
||
backend=backend,
|
||
ok=False,
|
||
error_summary=str(exc),
|
||
)
|
||
failed = (not result.ok) or self._business_failed(action, result.values)
|
||
logger.info(
|
||
"IP action 返回 run_id=%s ip=%s action=%s failed=%s result=%s",
|
||
state.run_id,
|
||
ip,
|
||
action,
|
||
failed,
|
||
_action_result_for_log(result),
|
||
)
|
||
analysis = self._append_action_analysis(state, action, result, ip=ip)
|
||
|
||
if failed:
|
||
fail_event = {
|
||
"type": "ACTION_FAIL",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.error_summary or result.values.get("MESSAGE", "action 执行失败"),
|
||
}
|
||
state.events.append(fail_event)
|
||
self._emit_progress(fail_event)
|
||
self._record_ip_failure(state, ip, action, result.error_summary or str(result.values))
|
||
self.pause_state(
|
||
state,
|
||
reason="action_failed",
|
||
review_context=self._review_context(action=action, analysis=analysis, result=result, ip=ip),
|
||
)
|
||
self._save_checkpoint(state)
|
||
logger.info("IP action 失败并暂停等待重试 run_id=%s ip=%s action=%s", state.run_id, ip, action)
|
||
return state
|
||
|
||
if analysis is not None and not analysis.should_continue:
|
||
ip_state["failed_stage"] = action
|
||
ip_state["failure_reason"] = analysis.possible_reason or analysis.suggested_action or "LLM 审核要求暂停"
|
||
state.last_failed_step = action
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_BLOCKED",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": analysis.suggested_action or analysis.possible_reason or "LLM 审核要求暂停",
|
||
}
|
||
)
|
||
self.pause_state(
|
||
state,
|
||
reason="llm_review_blocked",
|
||
review_context=self._review_context(action=action, analysis=analysis, result=result, ip=ip),
|
||
)
|
||
logger.info("IP action 被 LLM 审核拦截 run_id=%s ip=%s action=%s analysis=%s", state.run_id, ip, action, json_for_log(asdict(analysis)))
|
||
return state
|
||
if self._handle_progress_action(state, action, result, analysis, ip=ip):
|
||
ip_state["status"] = "RUNNING"
|
||
return state
|
||
self._apply_ip_result(ip_state, action, result.values)
|
||
ip_state["status"] = "RUNNING"
|
||
ip_state["failed_stage"] = ""
|
||
ip_state["failure_reason"] = ""
|
||
completed_steps.append(action)
|
||
if state.last_failed_step == action:
|
||
state.last_failed_step = ""
|
||
done_message = self._progress_message(action, result, ip=ip) if action in PROGRESS_ACTIONS else result.values.get("MESSAGE", "ok")
|
||
done_event = {
|
||
"type": "ACTION_DONE",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": done_message,
|
||
}
|
||
state.events.append(done_event)
|
||
self._emit_progress(done_event)
|
||
self._save_checkpoint(state)
|
||
logger.info("IP action 完成 run_id=%s ip=%s action=%s completed=%s", state.run_id, ip, action, completed_steps)
|
||
return state
|
||
|
||
def build_confirmation_request(self, state: AgentState) -> dict[str, Any]:
|
||
"""把 pending_confirmation 转换为面向用户的确认请求。"""
|
||
if not state.pending_confirmation:
|
||
return {}
|
||
kind, _, value = state.pending_confirmation.partition(":")
|
||
if kind == "rollback-ip":
|
||
ip_state = state.ip_states.get(value, {})
|
||
return {
|
||
"type": "rollback-ip",
|
||
"ip": value,
|
||
"failed_stage": ip_state.get("failed_stage", ""),
|
||
"failure_reason": ip_state.get("failure_reason", ""),
|
||
"rollback_stop_first": bool(ip_state.get("rollback_stop_first", False)),
|
||
"allowed_decisions": ["approve", "reject"],
|
||
}
|
||
return {
|
||
"type": kind,
|
||
"value": value,
|
||
"allowed_decisions": ["approve", "reject"],
|
||
}
|
||
|
||
def confirm_pending(self, state: AgentState, *, approved: bool, operator_note: str = "") -> AgentState:
|
||
"""处理人工确认结果;当前支持失败 IP 的回滚确认。"""
|
||
request = self.build_confirmation_request(state)
|
||
if not request:
|
||
raise ValueError("当前没有待确认事项")
|
||
if request["type"] != "rollback-ip":
|
||
raise ValueError(f"不支持的确认类型: {request['type']}")
|
||
|
||
ip = request["ip"]
|
||
ip_state = state.ip_states[ip]
|
||
logger.info(
|
||
"人工确认开始 run_id=%s approved=%s request=%s note_len=%s",
|
||
state.run_id,
|
||
approved,
|
||
json_for_log(request),
|
||
len(operator_note),
|
||
)
|
||
if not approved:
|
||
ip_state["rollback_status"] = "REJECTED_BY_OPERATOR"
|
||
state.events.append(
|
||
{
|
||
"type": "CONFIRMATION_REJECTED",
|
||
"stage": "rollback-ip",
|
||
"ip": ip,
|
||
"message": operator_note or "rollback rejected by operator",
|
||
}
|
||
)
|
||
state.pending_confirmation = ""
|
||
state.paused = False
|
||
state.pause_reason = ""
|
||
state.review_context = {}
|
||
self._save_checkpoint(state)
|
||
logger.info("人工确认拒绝回滚 run_id=%s ip=%s", state.run_id, ip)
|
||
return state
|
||
|
||
backend = state.action_backends.get("rollback-ip", "script")
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_START",
|
||
"stage": "rollback-ip",
|
||
"backend": backend,
|
||
"ip": ip,
|
||
}
|
||
)
|
||
try:
|
||
result = self.router.run_action(
|
||
state,
|
||
"rollback-ip",
|
||
ip=ip,
|
||
stop_first=bool(ip_state.get("rollback_stop_first", False)),
|
||
)
|
||
except Exception as exc:
|
||
logger.exception("rollback action 调用异常 run_id=%s ip=%s backend=%s", state.run_id, ip, backend)
|
||
result = ActionResult(
|
||
action="rollback-ip",
|
||
backend=backend,
|
||
ok=False,
|
||
error_summary=str(exc),
|
||
)
|
||
logger.info("rollback action 返回 run_id=%s ip=%s result=%s", state.run_id, ip, _action_result_for_log(result))
|
||
ip_state["rollback_status"] = "ROLLBACK_DONE" if result.ok else "ROLLBACK_FAILED"
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_DONE" if result.ok else "ACTION_FAIL",
|
||
"stage": "rollback-ip",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.error_summary or result.values.get("MESSAGE", "ok"),
|
||
}
|
||
)
|
||
self._append_action_analysis(state, "rollback-ip", result, ip=ip)
|
||
if result.ok:
|
||
state.pending_confirmation = ""
|
||
state.last_success_step = "rollback-ip"
|
||
state.last_failed_step = ""
|
||
state.paused = False
|
||
state.pause_reason = ""
|
||
state.review_context = {}
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_DONE",
|
||
"stage": "rollback-ip",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.values.get("MESSAGE", "ok"),
|
||
}
|
||
)
|
||
else:
|
||
state.pending_confirmation = f"rollback-ip:{ip}"
|
||
state.last_failed_step = "rollback-ip"
|
||
state.paused = True
|
||
state.pause_reason = "rollback_failed"
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_FAIL",
|
||
"stage": "rollback-ip",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.error_summary or result.values.get("MESSAGE", "rollback 执行失败"),
|
||
}
|
||
)
|
||
self._save_checkpoint(state)
|
||
logger.info(
|
||
"人工确认处理完成 run_id=%s ip=%s rollback_status=%s pending=%s paused=%s",
|
||
state.run_id,
|
||
ip,
|
||
ip_state.get("rollback_status"),
|
||
state.pending_confirmation,
|
||
state.paused,
|
||
)
|
||
return state
|
||
|
||
def rollback_ip(
|
||
self,
|
||
state: AgentState,
|
||
ip: str,
|
||
*,
|
||
stop_first: bool | None = None,
|
||
operator_note: str = "",
|
||
) -> AgentState:
|
||
"""显式执行单个 IP 的回滚;该动作不属于主 workflow 自动分支。"""
|
||
if ip not in state.ip_states:
|
||
raise ValueError(f"IP 不在当前运行状态中: {ip}")
|
||
ip_state = state.ip_states[ip]
|
||
actual_stop_first = bool(ip_state.get("rollback_stop_first", False)) if stop_first is None else stop_first
|
||
backend = state.action_backends.get("rollback-ip", "script")
|
||
logger.info(
|
||
"显式回滚开始 run_id=%s ip=%s backend=%s stop_first=%s note_len=%s",
|
||
state.run_id,
|
||
ip,
|
||
backend,
|
||
actual_stop_first,
|
||
len(operator_note),
|
||
)
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_START",
|
||
"stage": "rollback-ip",
|
||
"backend": backend,
|
||
"ip": ip,
|
||
}
|
||
)
|
||
try:
|
||
result = self.router.run_action(
|
||
state,
|
||
"rollback-ip",
|
||
ip=ip,
|
||
stop_first=actual_stop_first,
|
||
)
|
||
except Exception as exc:
|
||
logger.exception("显式回滚 action 调用异常 run_id=%s ip=%s backend=%s", state.run_id, ip, backend)
|
||
result = ActionResult(
|
||
action="rollback-ip",
|
||
backend=backend,
|
||
ok=False,
|
||
error_summary=str(exc),
|
||
)
|
||
logger.info("显式回滚 action 返回 run_id=%s ip=%s result=%s", state.run_id, ip, _action_result_for_log(result))
|
||
ip_state["rollback_status"] = "ROLLBACK_DONE" if result.ok else "ROLLBACK_FAILED"
|
||
ip_state["rollback_stop_first"] = actual_stop_first
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_DONE" if result.ok else "ACTION_FAIL",
|
||
"stage": "rollback-ip",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.error_summary or result.values.get("MESSAGE", "ok"),
|
||
"operator_note": operator_note,
|
||
}
|
||
)
|
||
self._append_action_analysis(state, "rollback-ip", result, ip=ip)
|
||
if result.ok:
|
||
state.pending_confirmation = ""
|
||
state.last_success_step = "rollback-ip"
|
||
state.last_failed_step = ""
|
||
state.paused = False
|
||
state.pause_reason = ""
|
||
state.review_context = {}
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_DONE",
|
||
"stage": "rollback-ip",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.values.get("MESSAGE", "ok"),
|
||
}
|
||
)
|
||
else:
|
||
state.last_failed_step = "rollback-ip"
|
||
state.paused = True
|
||
state.pause_reason = "rollback_failed"
|
||
state.review_context = self._review_context(action="rollback-ip", analysis=None, result=result, ip=ip)
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_FAIL",
|
||
"stage": "rollback-ip",
|
||
"backend": result.backend,
|
||
"ip": ip,
|
||
"message": result.error_summary or result.values.get("MESSAGE", "rollback 执行失败"),
|
||
}
|
||
)
|
||
self._save_checkpoint(state)
|
||
logger.info(
|
||
"显式回滚结束 run_id=%s ip=%s rollback_status=%s paused=%s",
|
||
state.run_id,
|
||
ip,
|
||
ip_state.get("rollback_status"),
|
||
state.paused,
|
||
)
|
||
return state
|
||
|
||
def _emit_progress(self, payload: dict[str, Any]) -> None:
|
||
"""向 CLI/chat 回调 action 执行进度,回调失败不影响主流程。"""
|
||
if self.progress_callback is None:
|
||
return
|
||
try:
|
||
self.progress_callback(payload)
|
||
except Exception:
|
||
return
|
||
|
||
def _apply_result(self, state: AgentState, action: str, values: dict[str, Any]) -> None:
|
||
"""把全局 action 返回值写回 AgentState。"""
|
||
if "HASH_CODE" in values:
|
||
state.hash_code = str(values["HASH_CODE"])
|
||
if "NODE_URL" in values:
|
||
state.node_url = str(values["NODE_URL"])
|
||
if action == "get-online-ips":
|
||
ips = values.get("IP", [])
|
||
if isinstance(ips, str):
|
||
ips = [ips]
|
||
state.online_ips = list(ips)
|
||
state.target_ips = state.target_ips or state.online_ips.copy()
|
||
|
||
def _resolve_target_ips(self, state: AgentState) -> None:
|
||
"""根据在线 IP 和用户指定 IP 计算最终目标 IP。"""
|
||
if not state.target_ips:
|
||
state.target_ips = state.online_ips.copy()
|
||
logger.info("目标 IP 未指定,使用全部在线 IP run_id=%s target_ips=%s", state.run_id, state.target_ips)
|
||
return
|
||
online = set(state.online_ips)
|
||
requested = state.target_ips
|
||
state.target_ips = [ip for ip in requested if ip in online]
|
||
missing = [ip for ip in requested if ip not in online]
|
||
if missing:
|
||
state.events.append(
|
||
{
|
||
"type": "TARGET_SCOPE_CHANGED",
|
||
"message": "some requested IPs are not online",
|
||
"missing_ips": missing,
|
||
"target_ips": state.target_ips,
|
||
}
|
||
)
|
||
logger.info(
|
||
"目标 IP 范围调整 run_id=%s requested=%s target_ips=%s missing=%s",
|
||
state.run_id,
|
||
requested,
|
||
state.target_ips,
|
||
missing,
|
||
)
|
||
|
||
def _handle_progress_action(
|
||
self,
|
||
state: AgentState,
|
||
action: str,
|
||
result: ActionResult,
|
||
analysis: LlmActionAnalysis | None,
|
||
*,
|
||
ip: str | None = None,
|
||
) -> bool:
|
||
"""处理进度查询 action;未完成时保留当前 action 等待下一次查询。"""
|
||
if action not in PROGRESS_ACTIONS:
|
||
return False
|
||
if ip:
|
||
ip_state = state.ip_states.get(ip, {})
|
||
ip_state["progress"] = dict(result.values)
|
||
|
||
key = self._poll_attempt_key(action, ip=ip)
|
||
if self._progress_complete(action, result, analysis):
|
||
state.poll_attempts.pop(key, None)
|
||
logger.info(
|
||
"进度 action 已完成 run_id=%s action=%s ip=%s values=%s",
|
||
state.run_id,
|
||
action,
|
||
ip or "",
|
||
json_for_log(result.values),
|
||
)
|
||
return False
|
||
|
||
max_attempts, interval_sec = self._poll_limits(state, action)
|
||
attempt = state.poll_attempts.get(key, 0) + 1
|
||
state.poll_attempts[key] = attempt
|
||
message = self._progress_message(action, result, ip=ip, attempt=attempt, max_attempts=max_attempts)
|
||
progress_event = {
|
||
"type": "ACTION_PROGRESS",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"ip": ip or "",
|
||
"message": message,
|
||
"attempt": attempt,
|
||
"max_attempts": max_attempts,
|
||
"values": dict(result.values),
|
||
}
|
||
state.events.append(progress_event)
|
||
self._emit_progress(progress_event)
|
||
|
||
if attempt >= max_attempts:
|
||
timeout_message = f"{action} 进度查询达到最大次数 {max_attempts},当前仍未完成。{message}"
|
||
logger.warning(
|
||
"进度 action 超时 run_id=%s action=%s ip=%s attempt=%s max=%s values=%s",
|
||
state.run_id,
|
||
action,
|
||
ip or "",
|
||
attempt,
|
||
max_attempts,
|
||
json_for_log(result.values),
|
||
)
|
||
fail_event = {
|
||
"type": "ACTION_FAIL",
|
||
"stage": action,
|
||
"backend": result.backend,
|
||
"ip": ip or "",
|
||
"message": timeout_message,
|
||
}
|
||
state.events.append(fail_event)
|
||
self._emit_progress(fail_event)
|
||
state.last_failed_step = action
|
||
self.pause_state(
|
||
state,
|
||
reason="progress_timeout",
|
||
review_context={
|
||
"type": "action_review",
|
||
"stage": action,
|
||
"ip": ip or "",
|
||
"backend": result.backend,
|
||
"ok": result.ok,
|
||
"error_summary": timeout_message,
|
||
"possible_reason": "进度查询超过最大次数但未达到完成条件。",
|
||
"suggested_action": "请检查 PAM_HOME/PAM_NODE 任务状态;确认外部任务仍在运行时,可调大轮询次数后 resume 重试当前 action。",
|
||
"should_continue": False,
|
||
"progress_values": dict(result.values),
|
||
"attempt": attempt,
|
||
"max_attempts": max_attempts,
|
||
},
|
||
)
|
||
return True
|
||
|
||
self._save_checkpoint(state)
|
||
logger.info(
|
||
"进度 action 未完成,等待下一次查询 run_id=%s action=%s ip=%s attempt=%s max=%s interval=%s message=%s",
|
||
state.run_id,
|
||
action,
|
||
ip or "",
|
||
attempt,
|
||
max_attempts,
|
||
interval_sec,
|
||
message,
|
||
)
|
||
if interval_sec > 0:
|
||
time.sleep(interval_sec)
|
||
return True
|
||
|
||
def _poll_attempt_key(self, action: str, *, ip: str | None = None) -> str:
|
||
"""生成 progress action 的 checkpoint 计数 key。"""
|
||
return f"ip:{ip}:{action}" if ip else f"global:{action}"
|
||
|
||
def _poll_limits(self, state: AgentState, action: str) -> tuple[int, float]:
|
||
"""从运行参数读取轮询最大次数和间隔。"""
|
||
interval_sec = _safe_float(state.params.get("POLL_INTERVAL_SEC"), float(DEFAULT_PARAMS["POLL_INTERVAL_SEC"]))
|
||
if action == "poll-upgrade-progress":
|
||
max_attempts = _safe_int(
|
||
state.params.get("UPGRADE_POLL_MAX_ATTEMPTS"),
|
||
int(DEFAULT_PARAMS["UPGRADE_POLL_MAX_ATTEMPTS"]),
|
||
)
|
||
else:
|
||
max_attempts = _safe_int(
|
||
state.params.get("DOWNLOAD_POLL_MAX_ATTEMPTS"),
|
||
int(DEFAULT_PARAMS["DOWNLOAD_POLL_MAX_ATTEMPTS"]),
|
||
)
|
||
return max(max_attempts, 1), max(interval_sec, 0.0)
|
||
|
||
def _progress_complete(
|
||
self,
|
||
action: str,
|
||
result: ActionResult,
|
||
analysis: LlmActionAnalysis | None,
|
||
) -> bool:
|
||
"""判断进度 action 是否完成,优先尊重 LLM 明确结论。"""
|
||
if analysis is not None and analysis.progress_complete is not None:
|
||
return bool(analysis.progress_complete)
|
||
return _progress_values_complete(action, result.values)
|
||
|
||
def _progress_message(
|
||
self,
|
||
action: str,
|
||
result: ActionResult,
|
||
*,
|
||
ip: str | None = None,
|
||
attempt: int | None = None,
|
||
max_attempts: int | None = None,
|
||
) -> str:
|
||
"""把进度字段格式化为用户和日志可读的短消息。"""
|
||
values = result.values
|
||
parts: list[str] = []
|
||
if ip:
|
||
parts.append(f"IP={ip}")
|
||
if attempt is not None and max_attempts is not None:
|
||
parts.append(f"第 {attempt}/{max_attempts} 次查询")
|
||
for key in ("RATE_OF_PROGRESS", "STEP", "MSG", "STATUS", "SUCCESS", "CODE", "FINISH", "MESSAGE"):
|
||
value = values.get(key)
|
||
if value not in (None, ""):
|
||
parts.append(f"{key}={value}")
|
||
if not parts:
|
||
parts.append("进度接口已返回,但未包含明确进度字段")
|
||
return ",".join(parts)
|
||
|
||
def _business_failed(self, action: str, values: dict[str, Any]) -> bool:
|
||
"""识别 exit code 之外的业务失败条件。"""
|
||
if action == "verify-ip":
|
||
success = values.get("SUCCESS")
|
||
if success is None:
|
||
return False
|
||
return str(success).lower() not in ("true", "1", "yes")
|
||
return False
|
||
|
||
def _apply_ip_result(self, ip_state: dict[str, Any], action: str, values: dict[str, Any]) -> None:
|
||
"""把逐 IP action 返回值写回单 IP 状态。"""
|
||
if action == "download-log":
|
||
ip_state["log_file"] = str(values.get("LOG_FILE", ""))
|
||
|
||
def _record_ip_failure(self, state: AgentState, ip: str, action: str, reason: str) -> None:
|
||
"""记录单 IP 失败,并保留当前 action 供 resume 重试。"""
|
||
ip_state = state.ip_states[ip]
|
||
stop_first = action in ("start-ip", "verify-ip")
|
||
logger.info(
|
||
"记录 IP 失败 run_id=%s ip=%s action=%s reason=%s stop_first=%s",
|
||
state.run_id,
|
||
ip,
|
||
action,
|
||
reason,
|
||
stop_first,
|
||
)
|
||
ip_state.update(
|
||
{
|
||
"status": "FAILED",
|
||
"failed_stage": action,
|
||
"failure_reason": reason,
|
||
"rollback_status": ip_state.get("rollback_status") or "ROLLBACK_NOT_RUN",
|
||
"rollback_stop_first": stop_first,
|
||
}
|
||
)
|
||
state.last_failed_step = action
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_RETRY_REQUIRED",
|
||
"stage": action,
|
||
"ip": ip,
|
||
"stop_first": stop_first,
|
||
"message": f"{action} 执行失败,流程已暂停;修复后 resume 将从该 action 重试,需回滚时请显式执行 rollback",
|
||
}
|
||
)
|
||
|
||
def _save_checkpoint(self, state: AgentState) -> None:
|
||
"""如果配置了 checkpoint 路径,则保存完整运行状态。"""
|
||
if state.checkpoint_path:
|
||
logger.info(
|
||
"保存 checkpoint run_id=%s path=%s paused=%s pending=%s last_success=%s last_failed=%s",
|
||
state.run_id,
|
||
state.checkpoint_path,
|
||
state.paused,
|
||
state.pending_confirmation,
|
||
state.last_success_step,
|
||
state.last_failed_step,
|
||
)
|
||
save_checkpoint(state, state.checkpoint_path, redact=False)
|
||
|
||
def _append_action_analysis(
|
||
self,
|
||
state: AgentState,
|
||
action: str,
|
||
result,
|
||
*,
|
||
ip: str | None = None,
|
||
) -> Any:
|
||
"""启用 action 后分析时,把诊断结果追加到 events。"""
|
||
logger.info(
|
||
"LLM action 审核开始 run_id=%s action=%s ip=%s result=%s",
|
||
state.run_id,
|
||
action,
|
||
ip or "",
|
||
_action_result_for_log(result),
|
||
)
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_REVIEW_START",
|
||
"stage": action,
|
||
"ip": ip or "",
|
||
"message": "LLM 开始分析 action 结果",
|
||
}
|
||
)
|
||
try:
|
||
analysis = self.llm_client.analyze_action_result(
|
||
action=action,
|
||
result=result,
|
||
state_summary=self._state_summary_for_llm(state, ip=ip),
|
||
)
|
||
except Exception as exc: # pragma: no cover - 审核失败时也要显式暂停,避免黑盒继续执行
|
||
logger.exception("LLM action 审核失败 run_id=%s action=%s ip=%s", state.run_id, action, ip or "")
|
||
state.events.append(
|
||
{
|
||
"type": "ACTION_ANALYSIS_FAIL",
|
||
"stage": action,
|
||
"ip": ip or "",
|
||
"message": str(exc),
|
||
}
|
||
)
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_REVIEW_FAIL",
|
||
"stage": action,
|
||
"ip": ip or "",
|
||
"message": str(exc),
|
||
}
|
||
)
|
||
return LlmActionAnalysis(
|
||
action=action,
|
||
has_anomaly=True,
|
||
severity="high",
|
||
possible_reason=f"LLM 审核失败: {exc}",
|
||
suggested_action="请检查 LLM 配置、网络或 action 审核提示词文件后再继续。",
|
||
requires_confirmation=True,
|
||
should_continue=False,
|
||
notes=["action 结果未完成 LLM 审核,流程已自动暂停。"],
|
||
)
|
||
payload = asdict(analysis)
|
||
payload.update({"type": "ACTION_ANALYSIS", "stage": action})
|
||
if ip:
|
||
payload["ip"] = ip
|
||
if self.action_analysis_enabled:
|
||
state.events.append(payload)
|
||
self._emit_progress(
|
||
{
|
||
"type": "ACTION_REVIEW_DONE",
|
||
"stage": action,
|
||
"ip": ip or "",
|
||
"message": analysis.suggested_action or analysis.possible_reason or "LLM 审核完成",
|
||
"has_anomaly": analysis.has_anomaly,
|
||
"severity": analysis.severity,
|
||
"should_continue": analysis.should_continue,
|
||
"progress_complete": analysis.progress_complete,
|
||
}
|
||
)
|
||
logger.info(
|
||
"LLM action 审核完成 run_id=%s action=%s ip=%s analysis=%s",
|
||
state.run_id,
|
||
action,
|
||
ip or "",
|
||
json_for_log(payload),
|
||
)
|
||
return analysis
|
||
|
||
def _state_summary_for_llm(self, state: AgentState, *, ip: str | None = None) -> dict[str, Any]:
|
||
"""生成给 LLM action 分析使用的脱敏状态摘要。"""
|
||
return {
|
||
"run_id": state.run_id,
|
||
"execution_strategy": state.execution_strategy,
|
||
"completed_global_steps": state.completed_global_steps,
|
||
"online_ip_count": len(state.online_ips),
|
||
"target_ips": state.target_ips,
|
||
"current_ip": ip or "",
|
||
"current_ip_state": state.ip_states.get(ip, {}) if ip else {},
|
||
"pending_confirmation": state.pending_confirmation,
|
||
"paused": state.paused,
|
||
"pause_reason": state.pause_reason,
|
||
"last_success_step": state.last_success_step,
|
||
"last_failed_step": state.last_failed_step,
|
||
"poll_attempts": state.poll_attempts,
|
||
}
|
||
|
||
def _review_context(
|
||
self,
|
||
*,
|
||
action: str,
|
||
analysis,
|
||
result,
|
||
ip: str | None = None,
|
||
) -> dict[str, Any]:
|
||
"""构造面向用户展示的审核暂停上下文。"""
|
||
context = {
|
||
"type": "action_review",
|
||
"stage": action,
|
||
"ip": ip or "",
|
||
"backend": result.backend,
|
||
"ok": result.ok,
|
||
"error_summary": result.error_summary,
|
||
}
|
||
if analysis is not None:
|
||
context.update(
|
||
{
|
||
"severity": analysis.severity,
|
||
"has_anomaly": analysis.has_anomaly,
|
||
"possible_reason": analysis.possible_reason,
|
||
"suggested_action": analysis.suggested_action,
|
||
"should_continue": analysis.should_continue,
|
||
"progress_complete": analysis.progress_complete,
|
||
"notes": list(analysis.notes),
|
||
}
|
||
)
|
||
return context
|
||
|
||
def render_report(self, state: AgentState) -> str:
|
||
"""渲染当前部署状态报告。"""
|
||
success = sum(1 for item in state.ip_states.values() if item.get("status") == "SUCCESS")
|
||
failed = sum(1 for item in state.ip_states.values() if item.get("status") == "FAILED")
|
||
lines = [
|
||
"## PAM 智能部署报告",
|
||
"",
|
||
f"- 执行策略: {state.execution_strategy}",
|
||
f"- 机场: {state.params['AIRPORT_CODE']}",
|
||
f"- 应用: {state.params['APP_NAME']}",
|
||
f"- 模块: {state.params['MODULE_NAME']}",
|
||
f"- 版本: {state.params['VERSION_NUMBER']}",
|
||
f"- 在线工作站数: {len(state.online_ips)}",
|
||
f"- 目标工作站数: {len(state.target_ips)}",
|
||
f"- 成功: {success}",
|
||
f"- 失败: {failed}",
|
||
f"- 待确认: {state.pending_confirmation or '-'}",
|
||
f"- 暂停状态: {'是' if state.paused else '否'}",
|
||
f"- 暂停原因: {state.pause_reason or '-'}",
|
||
"",
|
||
"| IP | 状态 | 失败阶段 | 回滚状态 | 日志 |",
|
||
"| --- | --- | --- | --- | --- |",
|
||
]
|
||
for ip, ip_state in state.ip_states.items():
|
||
lines.append(
|
||
"| {ip} | {status} | {failed_stage} | {rollback_status} | {log_file} |".format(
|
||
ip=ip,
|
||
status=ip_state.get("status", ""),
|
||
failed_stage=ip_state.get("failed_stage") or "-",
|
||
rollback_status=ip_state.get("rollback_status") or "-",
|
||
log_file=ip_state.get("log_file") or "-",
|
||
)
|
||
)
|
||
return "\n".join(lines)
|
||
|
||
|
||
def _absolute_path(path: str | Path) -> Path:
|
||
"""把传给脚本的文件路径转换为绝对路径,避免 cwd 切换后读错文件。"""
|
||
return Path(path).expanduser().resolve()
|
||
|
||
|
||
def _normalize_local_file_path(path: str) -> str:
|
||
"""把本地相对文件路径转换为绝对路径,Windows/Unix 绝对路径保持不变。"""
|
||
if re.match(r"^[A-Za-z]:[\\/]", path):
|
||
return path
|
||
if path.startswith("/"):
|
||
return path
|
||
value = Path(path).expanduser()
|
||
if value.is_absolute():
|
||
return str(value)
|
||
return str(value.resolve())
|
||
|
||
|
||
def _action_result_for_log(result: ActionResult) -> str:
|
||
"""生成 action 结果日志摘要,避免写入完整 stdout/stderr。"""
|
||
return json_for_log(
|
||
{
|
||
"action": result.action,
|
||
"backend": result.backend,
|
||
"ok": result.ok,
|
||
"exit_code": result.exit_code,
|
||
"tool_name": result.tool_name,
|
||
"values": result.values,
|
||
"stderr": result.stderr,
|
||
"error_summary": result.error_summary,
|
||
},
|
||
max_text_len=1000,
|
||
)
|
||
|
||
|
||
def _progress_values_complete(action: str, values: dict[str, Any]) -> bool:
|
||
"""根据 action 返回字段判断下载/推送进度是否完成。"""
|
||
step = _lower_value(values.get("STEP"))
|
||
status = _lower_value(values.get("STATUS"))
|
||
msg = _lower_value(values.get("MSG"))
|
||
success = _lower_value(values.get("SUCCESS"))
|
||
finish = _lower_value(values.get("FINISH"))
|
||
code = _lower_value(values.get("CODE"))
|
||
rate = _lower_value(values.get("RATE_OF_PROGRESS"))
|
||
if step == "done":
|
||
return True
|
||
if status in ("completed", "complete", "done", "success", "succeeded"):
|
||
return True
|
||
if success in ("true", "1", "yes"):
|
||
return True
|
||
if action == "poll-upgrade-progress" and finish in ("true", "1", "yes"):
|
||
return True
|
||
return msg == "success" and rate == "100" and (not code or code == "0")
|
||
|
||
|
||
def _lower_value(value: Any) -> str:
|
||
"""把结构化字段转换为小写字符串,便于规则判断。"""
|
||
return str(value).strip().lower() if value is not None else ""
|
||
|
||
|
||
def _safe_int(value: Any, default: int) -> int:
|
||
"""安全读取整数参数。"""
|
||
try:
|
||
return int(str(value).strip())
|
||
except (TypeError, ValueError):
|
||
return default
|
||
|
||
|
||
def _safe_float(value: Any, default: float) -> float:
|
||
"""安全读取浮点参数。"""
|
||
try:
|
||
return float(str(value).strip())
|
||
except (TypeError, ValueError):
|
||
return default
|