From e572a26e6fc12f2a38d78dddd0d9cbf5d0fa3119 Mon Sep 17 00:00:00 2001 From: dark Date: Thu, 4 Jun 2026 16:28:18 +0800 Subject: [PATCH] =?UTF-8?q?pam=5Fdeploy=5Fgraph/agent.py=EF=BC=9Aprogress?= =?UTF-8?q?=20action=20=E6=9C=AA=E5=AE=8C=E6=88=90=E4=B8=8D=E6=A0=87?= =?UTF-8?q?=E8=AE=B0=20completed=EF=BC=8C=E8=B6=85=E6=97=B6=E6=9A=82?= =?UTF-8?q?=E5=81=9C=E5=9C=A8=E5=BD=93=E5=89=8D=20action=EF=BC=8C=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E6=96=AD=E7=82=B9=E7=BB=A7=E7=BB=AD=E3=80=82=20llm=20?= =?UTF-8?q?=E6=8F=90=E7=A4=BA=E8=AF=8D=E5=92=8C=E8=A7=84=E5=88=99=EF=BC=9A?= =?UTF-8?q?=E6=96=B0=E5=A2=9E=20progress=5Fcomplete=20=E5=88=A4=E6=96=AD?= =?UTF-8?q?=E5=AD=97=E6=AE=B5=E3=80=82=20deploy.sh=20/=20deploy.ps1?= =?UTF-8?q?=EF=BC=9Apoll-*=20action=20=E5=85=A5=E5=8F=A3=E6=94=B9=E4=B8=BA?= =?UTF-8?q?=E5=8D=95=E6=AC=A1=E6=9F=A5=E8=AF=A2=E3=80=82=20interactive.py?= =?UTF-8?q?=EF=BC=9Achat=20=E4=BC=9A=E6=92=AD=E6=8A=A5=E8=BF=9B=E5=BA=A6?= =?UTF-8?q?=E6=9B=B4=E6=96=B0=E3=80=82=20config.txt.example=20/=20README?= =?UTF-8?q?=20/=20packaging=20=E6=96=87=E6=A1=A3=20/=20Skill=20=E6=96=87?= =?UTF-8?q?=E6=A1=A3=EF=BC=9A=E5=90=8C=E6=AD=A5=E8=BF=9B=E5=BA=A6=E6=9F=A5?= =?UTF-8?q?=E8=AF=A2=E5=8F=82=E6=95=B0=E5=92=8C=E6=96=B0=20workflow=20?= =?UTF-8?q?=E8=AF=AD=E4=B9=89=E3=80=82=20=E6=B5=8B=E8=AF=95=E8=A1=A5?= =?UTF-8?q?=E5=85=85=E4=BA=86=E8=BF=9B=E5=BA=A6=E9=87=8D=E5=A4=8D=E6=9F=A5?= =?UTF-8?q?=E8=AF=A2=E3=80=81=E8=B6=85=E6=97=B6=E6=9A=82=E5=81=9C=E3=80=81?= =?UTF-8?q?chat=20=E8=BF=9B=E5=BA=A6=E6=92=AD=E6=8A=A5=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 11 +- doc_scripts/PAM_AUTO_DEPLY_SKILL.md | 93 +++--- doc_scripts/config.txt.example | 3 + doc_scripts/deploy.ps1 | 164 ++++++++++- doc_scripts/deploy.sh | 293 ++++++++++--------- docs/current_logic_flow.md | 37 ++- packaging/README_linux_package.md | 2 + packaging/README_packaged_agent.md | 2 + pam_deploy_graph/agent.py | 328 ++++++++++++++++++---- pam_deploy_graph/config_writer.py | 3 + pam_deploy_graph/constants.py | 3 + pam_deploy_graph/fake_runner.py | 9 + pam_deploy_graph/interactive.py | 5 + pam_deploy_graph/langgraph_runtime.py | 2 +- pam_deploy_graph/llm/openai_compatible.py | 18 ++ pam_deploy_graph/llm/prompts.py | 10 +- pam_deploy_graph/llm/rule_based.py | 67 +++++ pam_deploy_graph/models.py | 2 + pam_deploy_graph/tool_catalog.py | 4 +- prompts/action_review.txt | 5 + tests/test_agent_flow.py | 86 ++++++ tests/test_interactive_cli.py | 39 ++- 22 files changed, 934 insertions(+), 252 deletions(-) diff --git a/README.md b/README.md index 754dd6a..f324c23 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,7 @@ packaging/ - chat 执行前会归一化参数并展示实际写入脚本配置的值;`script_only` / `hybrid_node_mcp` 会提前检查 `ZIP_FILE_PATH` 是否存在。 - chat 执行中会播报每个 action 的开始、完成或失败;action 执行失败会停在当前 checkpoint,不再误报 LangGraph 不可用。 - 每个 action 完成后都会进入一次 LLM/规则审核;只有审核通过才会把 action 记为 completed,如果审核建议停止,流程会暂停并等待用户 `resume` 重试当前 action。 +- `poll-download-progress` 和 `poll-upgrade-progress` 已改为单次进度查询;workflow 负责按配置重复调用,每次查询结果都会交给 LLM/规则审核判断是否完成,并通过 chat 播报进度。 - `--analyze-actions` 和 `llm action-analysis on` 改为只控制是否把详细审核结果写入 `events`,不再控制审核是否执行。 - chat 会播报 action 审核开始、审核完成和审核失败,避免黑盒执行。 - chat 支持执行中按 `Ctrl+C` 中断,保存 checkpoint 后再 `resume`。 @@ -90,7 +91,7 @@ packaging/ - 支持通过 `--llm-action-analysis-prompt-file`、`PAM_LLM_ACTION_ANALYSIS_PROMPT_FILE` 或 chat 内 `llm config action_analysis_prompt_file=...` 自定义 action 审核提示词。 - 增加统一运行日志,默认写入 `logs/pam_deploy_agent.log`,覆盖 CLI/chat、LLM 调用、action 路由、脚本/MCP 调用、LangGraph、checkpoint 等关键流程。 - chat 支持 `llm test [文本]`,可用当前 LLM client 做一次轻量调用,确认真实 LLM 或规则 fallback 是否正常加载。 -- 添加基础测试,当前本地结果为 `62 passed, 2 skipped`。 +- 添加基础测试,当前本地结果为 `66 passed, 3 skipped`。 未完成: @@ -299,7 +300,13 @@ PAM> resume PAM> exit ``` -`chat` 默认仍要求在会话内显式输入 `run`,并确认参数、目标 IP 范围和最终执行后才会执行 action。输入 `你好`、`hello` 这类问候不会触发 LLM/结构化分析;需要分析部署需求时可直接描述部署任务,或显式使用 `analyze <需求>`。每个 action 完成后都会自动进入一次 LLM/规则审核,并播报审核开始/结束;只有审核通过才会把 action 记为 completed;如果审核建议停止或审核本身失败,流程会暂停并输出建议,等待用户决定是否 `resume` 重试当前 action。逐 IP action 失败时也会暂停,修复外部环境后输入 `resume` 会从当前 action 重试;如果确实需要回滚,使用 `rollback [IP]` 显式执行。`llm test [文本]` 可测试当前 LLM client 是否可用。`--analyze-actions` 仅控制详细审核结果是否写入 `events`。执行中可按 `Ctrl+C` 中断,chat 会保存当前 checkpoint 并把流程标记为 `user_interrupted`。`set KEY=VALUE` 和 `load params <路径>` 会把更新同步到当前运行 state、`config.txt` 和 checkpoint。`chat` 也支持 `--llm-base-url` / `--llm-api-key` / `--llm-model` / `--llm-action-analysis-prompt-file`、`--mcp-config` 和 `--analyze-actions`。 +`chat` 默认仍要求在会话内显式输入 `run`,并确认参数、目标 IP 范围和最终执行后才会执行 action。输入 `你好`、`hello` 这类问候不会触发 LLM/结构化分析;需要分析部署需求时可直接描述部署任务,或显式使用 `analyze <需求>`。每个 action 完成后都会自动进入一次 LLM/规则审核,并播报审核开始/结束;只有审核通过才会把 action 记为 completed;如果审核建议停止或审核本身失败,流程会暂停并输出建议,等待用户决定是否 `resume` 重试当前 action。`poll-download-progress` 和 `poll-upgrade-progress` 每次只查询一次进度,workflow 会按 `POLL_INTERVAL_SEC`、`DOWNLOAD_POLL_MAX_ATTEMPTS`、`UPGRADE_POLL_MAX_ATTEMPTS` 重复调用,并在每次返回后让 LLM/规则判断是否完成、播报进度;未完成时不会跳到下一个 action。逐 IP action 失败时也会暂停,修复外部环境后输入 `resume` 会从当前 action 重试;如果确实需要回滚,使用 `rollback [IP]` 显式执行。`llm test [文本]` 可测试当前 LLM client 是否可用。`--analyze-actions` 仅控制详细审核结果是否写入 `events`。执行中可按 `Ctrl+C` 中断,chat 会保存当前 checkpoint 并把流程标记为 `user_interrupted`。`set KEY=VALUE` 和 `load params <路径>` 会把更新同步到当前运行 state、`config.txt` 和 checkpoint。`chat` 也支持 `--llm-base-url` / `--llm-api-key` / `--llm-model` / `--llm-action-analysis-prompt-file`、`--mcp-config` 和 `--analyze-actions`。 + +进度查询相关参数: + +- `POLL_INTERVAL_SEC`:两次进度查询之间的等待秒数,默认 `2`。 +- `DOWNLOAD_POLL_MAX_ATTEMPTS`:云下载进度最大查询次数,默认 `60`。 +- `UPGRADE_POLL_MAX_ATTEMPTS`:单 IP 推送进度最大查询次数,默认 `600`。 ## 日志 diff --git a/doc_scripts/PAM_AUTO_DEPLY_SKILL.md b/doc_scripts/PAM_AUTO_DEPLY_SKILL.md index 713d92c..91e555d 100644 --- a/doc_scripts/PAM_AUTO_DEPLY_SKILL.md +++ b/doc_scripts/PAM_AUTO_DEPLY_SKILL.md @@ -1,6 +1,6 @@ --- name: pam-auto-deply -description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解用户需求、收集并确认参数、选择执行模式、编排主流程、控制回滚确认与最终汇总;由现有 deploy.sh / deploy.ps1 提供 action 能力执行建版、上传、发布、节点发现、云下载、升级、启停、校验、日志下载和手动回滚。禁止自动生成或修改脚本,禁止使用脚本主流程做部署。 +description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解用户需求、收集并确认参数、选择执行模式、编排主流程、控制进度查询与最终汇总;由现有 deploy.sh / deploy.ps1 提供 action 能力执行建版、上传、发布、节点发现、云下载、升级、启停、校验、日志下载和手动回滚。禁止自动生成或修改脚本,禁止使用脚本主流程做部署。 --- # PAM_AUTO_DEPLY Skill @@ -22,7 +22,7 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 - 禁止自动生成、重建、覆盖或修改 `deploy.sh`、`deploy.ps1`、`deploy.bat`、`test_deploy.sh`、`test_deploy.ps1`、`test_deploy.bat`。 - 在任何真实调用前,必须先向用户展示归一化后的参数并得到确认。 - 在真实部署执行过程中,必须持续向用户展示当前阶段、下一步动作和阶段结果,禁止长时间静默执行。 -- 回滚不得自动执行。脚本只能输出 `PENDING_AGENT_CONFIRMATION(...)`,必须由 Agent 先向用户确认。 +- 回滚不得自动执行;主 workflow 失败后只暂停在当前 action。需要回滚时,必须由用户显式输入 `rollback [IP]` 或直接调用 `rollback-ip` action。 ## 2. 执行模式选择 @@ -68,6 +68,9 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 | `actionType` | `ACTION_TYPE` | 否 | 升级类型,默认 `FULL` | | `timeOut` | `TIMEOUT` | 否 | 接口级超时参数,默认 `120` | | `logName` | `LOG_NAME` | 否 | 日志文件名,默认 `app.log` | +| `pollIntervalSec` | `POLL_INTERVAL_SEC` | 否 | 两次进度查询间隔,默认 `2` 秒 | +| `downloadPollMaxAttempts` | `DOWNLOAD_POLL_MAX_ATTEMPTS` | 否 | 云下载进度最大查询次数,默认 `60` | +| `upgradePollMaxAttempts` | `UPGRADE_POLL_MAX_ATTEMPTS` | 否 | 单 IP 推送进度最大查询次数,默认 `600` | ### 3.2 运行控制参数 @@ -77,13 +80,12 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 - `showUsageOnly`: 是否只说明现有脚本用法而不执行 - `userSpecifiedIps`: 用户指定的目标 IP 子集 - `allOrNothing`: 是否要求全有或全无 -- `rollbackApproved`: 用户是否已确认回滚 +- `rollbackApproved`: 用户是否已明确要求执行回滚 - `osTarget`: 目标脚本入口环境 - `checkpointPath`: 检查点文件路径 - `resumeFromCheckpoint`: 是否按已有检查点断点续试 - `traceFilePath`: 当前部署统一复用的接口跟踪日志文件路径 - `stepIntervalSec`: 全局 action 与 action 之间的执行间隔 -- `firstPollDelaySec`: 创建下载任务后,到首次轮询下载进度前的等待间隔 - `perIpStepIntervalSec`: 同一台 IP 内部步骤之间的执行间隔 - `perIpIntervalSec`: 一台 IP 完成后到下一台 IP 开始前的间隔 - `failurePauseSec`: 某步骤失败后进入下一分支前的等待间隔 @@ -91,7 +93,6 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 推荐默认值: - `stepIntervalSec = 2` -- `firstPollDelaySec = 2` - `perIpStepIntervalSec = 1` - `perIpIntervalSec = 3` - `failurePauseSec = 0` @@ -160,6 +161,9 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 - `ACTION_TYPE` - `TIMEOUT` - `LOG_NAME` + - `POLL_INTERVAL_SEC` + - `DOWNLOAD_POLL_MAX_ATTEMPTS` + - `UPGRADE_POLL_MAX_ATTEMPTS` - 命令行只传 action 级控制参数: - `--action` / `-Action` - `--ip` / `-Ip` @@ -168,7 +172,8 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 - 不要把整套业务参数直接拼接到命令行。 - `client_secret` 等敏感字段不得通过命令行透传。 - 如果用户明确要求“不落地配置文件”,则本 Skill 不执行真实部署,只说明限制和原因。 -- `traceFilePath` 与间隔控制参数不写入 `config.txt`,由 Agent 在运行时持有并应用。 +- `traceFilePath` 不写入 `config.txt`,由 Agent 在运行时持有并应用。 +- 进度查询间隔和最大次数写入 `config.txt`,由 Agent workflow 和脚本调试流程共同读取。 ## 4. 主流程(硬约束) @@ -194,22 +199,22 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 12. 调用 `get-online-ips`。 13. 若用户指定了目标 IP,则基于在线 IP 列表做过滤。 14. 调用 `create-download-task`。 -15. 调用 `poll-download-progress`,直到下载完成、失败或超时。 +15. 重复调用 `poll-download-progress` 单次查询进度;每次返回后交给 LLM/规则判断,直到下载完成、失败或达到最大查询次数。 16. 按在线 IP 或过滤后的目标 IP 列表逐台执行: - `upgrade-ip` - - `poll-upgrade-progress` + - 重复调用 `poll-upgrade-progress` 单次查询进度;每次返回后交给 LLM/规则判断,直到推送完成、失败或达到最大查询次数 - `start-ip` - `verify-ip` - `download-log` 17. 汇总每台 IP 的结果。 -18. 若出现 `PENDING_AGENT_CONFIRMATION(...)`,立即中止自动后续动作,转入回滚确认分支。 -19. 输出最终报告。 +18. 若 action 失败、LLM/规则审核要求停止,或出现 legacy `PENDING_AGENT_CONFIRMATION(...)`,暂停在当前 action 并输出建议。 +19. 输出最终报告;需要回滚时,等待用户显式执行 `rollback [IP]`。 主流程补充规则: 1. 一次完整部署中的所有 action 调用,应复用同一个 `traceFilePath`,禁止每个 action 各自新建独立 trace 文件。 2. 全局 action 与下一 action 之间,按 `stepIntervalSec` 等待。 -3. `create-download-task` 成功后,到首次 `poll-download-progress` 前,按 `firstPollDelaySec` 等待。 +3. `create-download-task` 成功后,直接进入 `poll-download-progress`;未完成时按 `POLL_INTERVAL_SEC` 等待后再次查询当前 action。 4. 同一台 IP 内部: - `upgrade-ip -> poll-upgrade-progress` - `poll-upgrade-progress -> start-ip` @@ -219,13 +224,14 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 5. 当前一台 IP 处理完成后,到下一台 IP 开始前,按 `perIpIntervalSec` 等待。 6. 若某步骤失败后需要进入提示、确认或分支流程,可按 `failurePauseSec` 等待。 7. 若某个间隔值为 `0`,表示该层级不等待,直接进入下一动作。 +8. `poll-download-progress` 和 `poll-upgrade-progress` 的脚本 action 只执行一次进度查询;正式 workflow 的循环、checkpoint、LLM 判断和进度播报由 Agent Runtime 负责。 ### 4.2 主流程中的强制确认点 以下节点必须等待用户确认,不能自动越过: 1. 参数确认单确认前。 -2. 出现回滚条件时。 +2. 执行 `rollback [IP]` 或 `rollback-ip` 前。 3. 用户指定 IP 与在线 IP 过滤结果不一致,且会影响部署范围时。 4. 用户显式要求修改默认间隔策略时。 @@ -238,9 +244,9 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 3. 在每个全局步骤成功后,告知用户该步骤已完成,并说明关键结果。 4. 在每个全局步骤失败后,立即告知用户失败阶段、失败原因和后续处理。 5. 在逐台 IP 处理时,必须告知当前正在处理哪一台 IP。 -6. 在云下载进度轮询阶段,必须持续汇报当前进度,不能静默等待完成。 +6. 在云下载和单 IP 推送进度查询阶段,每次 `poll-*` 返回后都必须汇报当前进度,不能静默等待完成。 7. 若执行耗时较长,必须按阶段持续播报,不能等全部结束后一次性汇总。 -8. 若进入回滚确认状态,必须明确告诉用户: +8. 若失败后建议回滚,必须明确告诉用户: - 哪一台 IP 失败 - 失败阶段 - 建议是否回滚 @@ -349,9 +355,10 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 4. 若部分 IP 已成功完成: - 默认跳过成功 IP - 只继续未完成或失败的 IP -5. 若存在 `PENDING_AGENT_CONFIRMATION(...)`: - - 检查点中必须保留该状态 - - 未得到用户确认前,不得自动继续后续动作 +5. 若存在失败暂停或 legacy `PENDING_AGENT_CONFIRMATION(...)`: + - 检查点中必须保留失败阶段、失败原因和审核建议 + - 修复后 `resume` 默认从当前失败 action 重试 + - 需要回滚时必须由用户显式执行 `rollback [IP]` 6. 若用户要求“从头重新开始”: - 先明确说明将忽略现有检查点 - 再从第 1 步重新执行 @@ -430,14 +437,14 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 | 12 | 获取在线 IP | `get-online-ips` | 返回 `COUNT>0` 且有 `IP=...` 行 | 停止并报告 `GET_ONLINE_IPS` 失败 | | 13 | 过滤目标 IP | 按用户指定 IP 与在线 IP 交集过滤 | 过滤结果明确 | 过滤后为空时停止;范围变化需确认 | | 14 | 创建云下载任务 | `create-download-task` | 返回 `RESULT=TASK_CREATED` | 停止并报告 `CREATE_DOWNLOAD_TASK` 失败 | -| 15 | 轮询下载进度 | `poll-download-progress` | `STEP=DONE` 或 `MSG=success` 且 `RATE_OF_PROGRESS=100` | 停止并报告 `POLL_DOWNLOAD_PROGRESS` 失败或超时 | -| 16.1 | 创建单 IP 推送任务 | `upgrade-ip --ip ...` | 返回 `RESULT=TASK_CREATED` | 记录失败,标记 `PENDING_AGENT_CONFIRMATION(stopFirst=false)` | -| 16.2 | 轮询单 IP 推送进度 | `poll-upgrade-progress --ip ...` | `STEP=DONE` 或 `FINISH=true` 或 `MSG=success` 且 `RATE_OF_PROGRESS=100` | 记录失败,标记 `PENDING_AGENT_CONFIRMATION(stopFirst=false)` | -| 16.3 | 启动单 IP | `start-ip --ip ...` | action 成功返回 | 记录失败,标记 `PENDING_AGENT_CONFIRMATION(stopFirst=true)` | -| 16.4 | 校验单 IP | `verify-ip --ip ...` | 返回 `SUCCESS=true` | 记录失败,标记 `PENDING_AGENT_CONFIRMATION(stopFirst=true)` | +| 15 | 查询下载进度 | 重复调用单次 `poll-download-progress` | LLM/规则判断 `progress_complete=true`;或 `STEP=DONE` / `MSG=success` 且 `RATE_OF_PROGRESS=100` | 停止并报告 `POLL_DOWNLOAD_PROGRESS` 失败或超时 | +| 16.1 | 创建单 IP 推送任务 | `upgrade-ip --ip ...` | 返回 `RESULT=TASK_CREATED` | 暂停在当前 action,修复后 `resume` 重试;需要回滚时显式执行 rollback | +| 16.2 | 查询单 IP 推送进度 | 重复调用单次 `poll-upgrade-progress --ip ...` | LLM/规则判断 `progress_complete=true`;或 `STEP=DONE` / `FINISH=true` / `MSG=success` 且 `RATE_OF_PROGRESS=100` | 暂停在当前 action,修复后 `resume` 重试;需要回滚时显式执行 rollback | +| 16.3 | 启动单 IP | `start-ip --ip ...` | action 成功返回 | 暂停在当前 action,修复后 `resume` 重试;需要回滚时显式执行 rollback | +| 16.4 | 校验单 IP | `verify-ip --ip ...` | 返回 `SUCCESS=true` | 暂停在当前 action,修复后 `resume` 重试;需要回滚时显式执行 rollback | | 16.5 | 下载日志 | `download-log --ip ...` | 返回 `LOG_FILE=...` | 记录日志下载失败,但不覆盖原主失败原因 | | 17 | 汇总结果 | 汇总每台 IP 的阶段、失败原因、回滚状态、日志路径 | 报告内容完整 | 若汇总失败,至少保留原始 action 输出 | -| 18 | 回滚确认分支 | 发现 `PENDING_AGENT_CONFIRMATION(...)` 时进入回滚确认 | 用户明确是否回滚 | 未确认时停止,不自动回滚 | +| 18 | 失败暂停或显式回滚 | 失败后默认停在当前 action;用户输入 `rollback [IP]` 后才执行回滚 | 用户明确要求回滚或修复后 `resume` | 未显式要求回滚时不自动回滚 | | 19 | 最终报告 | 输出最终报告 | 报告包含模式、入口、阶段结果、日志、回滚状态 | 不省略失败细节 | ## 5. 通用执行原则 @@ -456,7 +463,7 @@ description: 面向 PAM HOME/NODE 的智能部署 Skill。由 Skill 负责理解 - `[FLOW][FAIL]` 10. 只允许调用脚本 `action` 入口,禁止调用脚本主流程。 11. 脚本 action 输出以 `key=value` 为主,Agent 应优先读取这些结果行。 -12. 遇到需要回滚的场景,脚本只返回 `PENDING_AGENT_CONFIRMATION(stopFirst=...)`,Agent 必须先确认。 +12. 遇到需要回滚的场景,Agent 只能提示风险和建议;不得自动回滚,必须等待用户显式执行 rollback。 ## 6. 脚本 action 能力 @@ -485,10 +492,10 @@ powershell -File .\deploy.ps1 -ConfigPath .\config.txt -Action [-Ip | `get-node-url` | 获取目标 Node 地址 | 无 | | `get-online-ips` | 获取在线工作站 IP 列表 | 无 | | `create-download-task` | 创建云下载任务 | 无 | -| `poll-download-progress` | 轮询下载进度 | 无 | +| `poll-download-progress` | 单次查询下载进度;是否继续查询由 Agent workflow 和 LLM/规则决定 | 无 | | `download-cloud-to-node` | 创建下载任务并轮询至完成,仅调试使用,不得进入正式主流程 | 无 | | `upgrade-ip` | 为指定 IP 创建推送任务,固定使用 `timeOut=0` | `--ip` / `-Ip` | -| `poll-upgrade-progress` | 轮询指定 IP 的推送进度 | `--ip` / `-Ip` | +| `poll-upgrade-progress` | 单次查询指定 IP 的推送进度;是否继续查询由 Agent workflow 和 LLM/规则决定 | `--ip` / `-Ip` | | `start-ip` | 启动指定 IP 应用 | `--ip` / `-Ip` | | `stop-ip` | 停止指定 IP 应用 | `--ip` / `-Ip` | | `verify-ip` | 校验指定 IP | `--ip` / `-Ip` | @@ -559,9 +566,9 @@ Agent 读取时: - `create-download-task` - `upgrade-ip` -### 7.4 手动回滚分支 +### 7.4 显式回滚命令 -当部署结果出现 `PENDING_AGENT_CONFIRMATION(...)` 且用户明确同意回滚时: +当用户明确输入 `rollback [IP]` 或直接要求对指定 IP 回滚时: 1. 再次向用户确认目标 IP 和 `stopFirst` 值。 2. 调用 `rollback-ip` action。 @@ -613,19 +620,16 @@ Agent 读取时: ### 8.3 回滚规则 -回滚只允许在 Agent 与用户确认后执行。 +回滚只允许在用户显式要求后执行。 -回滚状态有三类: +回滚状态包括: - `ROLLBACK_NOT_RUN` -- `PENDING_AGENT_CONFIRMATION(stopFirst=true|false)` -- 真正执行后的结果: - - `ROLLBACK_SUCCESS` - - `ROLLBACK_FAILED` - - `ROLLBACK_REQUEST_FAILED` - - `ROLLBACK_VERIFY_FAILED` +- `ROLLBACK_DONE` +- `ROLLBACK_FAILED` +- `REJECTED_BY_OPERATOR` -默认确认逻辑: +默认建议: - 升级失败:建议回滚,`stopFirst=false` - 启动失败:建议回滚,`stopFirst=true` @@ -674,7 +678,9 @@ powershell -File .\deploy.ps1 -ConfigPath .\config.txt -Action rollback-ip -Ip 1 - 失败: 1 - 间隔控制: - stepIntervalSec: 2 - - firstPollDelaySec: 2 + - pollIntervalSec: 2 + - downloadPollMaxAttempts: 60 + - upgradePollMaxAttempts: 600 - perIpStepIntervalSec: 1 - perIpIntervalSec: 3 - failurePauseSec: 0 @@ -684,7 +690,7 @@ powershell -File .\deploy.ps1 -ConfigPath .\config.txt -Action rollback-ip -Ip 1 | --- | --- | --- | --- | --- | | 192.168.1.10 | SUCCESS | - | - | logs/deploy_192.168.1.10.zip | | 192.168.1.11 | SUCCESS | - | - | logs/deploy_192.168.1.11.zip | -| 192.168.1.12 | FAILED | VERIFY | PENDING_AGENT_CONFIRMATION(stopFirst=true) | logs/deploy_192.168.1.12.zip | +| 192.168.1.12 | FAILED | VERIFY | ROLLBACK_NOT_RUN | logs/deploy_192.168.1.12.zip | ``` 更完整的最终报告模板: @@ -709,7 +715,7 @@ powershell -File .\deploy.ps1 -ConfigPath .\config.txt -Action rollback-ip -Ip 1 | IP | 状态 | 失败阶段 | 失败原因 | 回滚状态 | 日志 | | --- | --- | --- | --- | --- | --- | | 192.168.1.10 | SUCCESS | - | - | - | logs/deploy_192.168.1.10.log | -| 192.168.1.12 | FAILED | VERIFY | Health check failed | PENDING_AGENT_CONFIRMATION(stopFirst=true) | logs/deploy_192.168.1.12.log | +| 192.168.1.12 | FAILED | VERIFY | Health check failed | ROLLBACK_NOT_RUN | logs/deploy_192.168.1.12.log | ## 检查点摘要 @@ -724,9 +730,10 @@ powershell -File .\deploy.ps1 -ConfigPath .\config.txt -Action rollback-ip -Ip 1 - get-online-ips - create-download-task -## 待确认事项 +## 后续建议 -- 是否对 192.168.1.12 执行回滚 +- 192.168.1.12 停在 verify-ip;修复后可 resume 重试当前 action +- 如确认需要回滚,可执行 rollback 192.168.1.12 ``` ## 10. Agent 执行建议 @@ -740,7 +747,7 @@ powershell -File .\deploy.ps1 -ConfigPath .\config.txt -Action rollback-ip -Ip 1 - 回滚需要确认 4. 参数未确认前,不触发任何真实部署 action。 5. 用户只要求“生成脚本不执行”时,由于本 Skill 禁止自动生成或修改脚本,应直接说明限制,而不是自动产出脚本文件。 -6. 如果 action 输出中出现 `PENDING_AGENT_CONFIRMATION(...)`,立即中止自动后续动作并请求确认。 +6. 如果 action 输出中出现 legacy `PENDING_AGENT_CONFIRMATION(...)`,立即暂停当前 workflow,输出建议;需要回滚时等待用户显式执行 rollback。 7. 如果存在检查点,优先评估能否从断点续试,而不是默认从头执行。 8. 任何长耗时阶段都要主动播报进度,尤其是: - `create-download-task` diff --git a/doc_scripts/config.txt.example b/doc_scripts/config.txt.example index 8fb2c49..3ebbeb3 100644 --- a/doc_scripts/config.txt.example +++ b/doc_scripts/config.txt.example @@ -9,3 +9,6 @@ ZIP_FILE_PATH=C:\path\to\pam-2.0.5.zip ACTION_TYPE=FULL TIMEOUT=120 LOG_NAME=app.log +POLL_INTERVAL_SEC=2 +DOWNLOAD_POLL_MAX_ATTEMPTS=60 +UPGRADE_POLL_MAX_ATTEMPTS=600 diff --git a/doc_scripts/deploy.ps1 b/doc_scripts/deploy.ps1 index 690d8fe..56b9335 100644 --- a/doc_scripts/deploy.ps1 +++ b/doc_scripts/deploy.ps1 @@ -23,6 +23,8 @@ Notes: - deploy.bat is only a wrapper for this script. - The wrapper avoids cmd.exe delayed-expansion issues with CLIENT_SECRET values containing exclamation marks. + - poll-download-progress and poll-upgrade-progress only query progress once. + The Agent workflow repeats them and asks LLM/rules to judge completion. '@ | Write-Host } @@ -366,6 +368,9 @@ function Get-PamConfig { 'ACTION_TYPE' { $config[$key] = $value } 'TIMEOUT' { $config[$key] = $value } 'LOG_NAME' { $config[$key] = $value } + 'POLL_INTERVAL_SEC' { $config[$key] = $value } + 'DOWNLOAD_POLL_MAX_ATTEMPTS' { $config[$key] = $value } + 'UPGRADE_POLL_MAX_ATTEMPTS' { $config[$key] = $value } } } } else { @@ -384,6 +389,9 @@ function Get-PamConfig { ACTION_TYPE = 'FULL' TIMEOUT = '120' LOG_NAME = 'app.log' + POLL_INTERVAL_SEC = '2' + DOWNLOAD_POLL_MAX_ATTEMPTS = '60' + UPGRADE_POLL_MAX_ATTEMPTS = '600' } foreach ($name in $defaults.Keys) { @@ -647,8 +655,14 @@ function Wait-DownloadProgress { RateOfProgress = '' RawResponse = '' } + $maxAttempts = 60 + [int]::TryParse([string]$Config.DOWNLOAD_POLL_MAX_ATTEMPTS, [ref]$maxAttempts) | Out-Null + if ($maxAttempts -lt 1) { $maxAttempts = 60 } + $pollIntervalSec = 2 + [int]::TryParse([string]$Config.POLL_INTERVAL_SEC, [ref]$pollIntervalSec) | Out-Null + if ($pollIntervalSec -lt 0) { $pollIntervalSec = 2 } - for ($attempt = 0; $attempt -lt 60; $attempt++) { + for ($attempt = 0; $attempt -lt $maxAttempts; $attempt++) { $response = Invoke-PamWebRequest -Method GET -Url $progressUrl -Token $Token -Headers @{ 'Target-Node' = $NodeUrl } @@ -681,7 +695,7 @@ function Wait-DownloadProgress { if ($progressParts.Count -gt 0) { Write-Info ("Step 3.3b: async download progress -> {0}" -f ($progressParts -join ', ')) } else { - Write-Info ("Step 3.3b: async download progress polling... ({0}/60)" -f ($attempt + 1)) + Write-Info ("Step 3.3b: async download progress polling... ({0}/{1})" -f ($attempt + 1), $maxAttempts) } if ($step -eq 'DONE' -or $status -eq 'completed' -or $successFlag -eq 'true' -or (($msg -eq 'success') -and ($progressValue -eq '100'))) { @@ -694,12 +708,64 @@ function Wait-DownloadProgress { throw "Node download failed: $message" } - Start-Sleep -Seconds 2 + Start-Sleep -Seconds $pollIntervalSec } throw 'Node download timed out.' } +function Read-DownloadProgress { + param($Config, [string]$Token, [string]$NodeUrl) + + $query = Join-RequestPairs ([ordered]@{ + applicationName = $Config.APP_NAME + moduleName = $Config.MODULE_NAME + airportCode = $Config.AIRPORT_CODE + versionNumber = $Config.VERSION_NUMBER + }) + $progressUrl = "$($Config.HOME_BASE_URL)/node-proxy/$($Config.AIRPORT_CODE)/api/mcp/version/upgrade/download-cloud/progress?$query" + $response = Invoke-PamWebRequest -Method GET -Url $progressUrl -Token $Token -Headers @{ + 'Target-Node' = $NodeUrl + } + + $status = Get-ResponseValue -Response $response -Candidates @('status') + $successFlag = Get-ResponseValue -Response $response -Candidates @('success') + $step = Get-ResponseValue -Response $response -Candidates @('step') + $msg = Get-ResponseValue -Response $response -Candidates @('msg') + $progressValue = Get-ResponseValue -Response $response -Candidates @('rateOfProgress', 'progress', 'percent', 'data.rateOfProgress', 'data.progress', 'data.percent') + $message = Get-ResponseValue -Response $response -Candidates @('message') + if (-not $message) { $message = $msg } + $script:DownloadProgressState = [ordered]@{ + Status = [string]$status + Success = [string]$successFlag + Step = [string]$step + Msg = [string]$msg + Message = [string]$message + RateOfProgress = [string]$progressValue + RawResponse = [string]$response + } + + $progressParts = [System.Collections.Generic.List[string]]::new() + if ($msg) { $progressParts.Add("msg=$msg") } + if ($step) { $progressParts.Add("step=$step") } + if ($progressValue) { $progressParts.Add("rateOfProgress=$progressValue") } + if ($status) { $progressParts.Add("status=$status") } + if ($successFlag) { $progressParts.Add("success=$successFlag") } + if ($message -and $message -ne $msg) { $progressParts.Add("message=$message") } + + if ($progressParts.Count -gt 0) { + Write-Info ("Step 3.3b: async download progress single query -> {0}" -f ($progressParts -join ', ')) + } else { + Write-Info 'Step 3.3b: async download progress single query returned no explicit progress fields.' + } + + if ((@($step, $message, $msg, $status) -join ' ') -match '(?i)fail|error') { + if (-not $message) { $message = $step } + if (-not $message) { $message = $msg } + throw "Node download failed: $message" + } +} + function Create-DownloadTask { param($Config, [string]$Token, [string]$NodeUrl) @@ -751,8 +817,14 @@ function Wait-UpgradeProgress { LastModify = '' RawResponse = '' } + $maxAttempts = 600 + [int]::TryParse([string]$Config.UPGRADE_POLL_MAX_ATTEMPTS, [ref]$maxAttempts) | Out-Null + if ($maxAttempts -lt 1) { $maxAttempts = 600 } + $pollIntervalSec = 2 + [int]::TryParse([string]$Config.POLL_INTERVAL_SEC, [ref]$pollIntervalSec) | Out-Null + if ($pollIntervalSec -lt 0) { $pollIntervalSec = 2 } - for ($attempt = 0; $attempt -lt 60; $attempt++) { + for ($attempt = 0; $attempt -lt $maxAttempts; $attempt++) { $response = Invoke-PamWebRequest -Method GET -Url $progressUrl -Token $Token -Headers @{ 'Target-Node' = $NodeUrl } @@ -797,7 +869,7 @@ function Wait-UpgradeProgress { if ($progressParts.Count -gt 1) { Write-Info ("Step 3.4a: async upgrade progress -> {0}" -f ($progressParts -join ', ')) } else { - Write-Info ("Step 3.4a: async upgrade progress polling... ip={0} ({1}/60)" -f $Ip, ($attempt + 1)) + Write-Info ("Step 3.4a: async upgrade progress polling... ip={0} ({1}/{2})" -f $Ip, ($attempt + 1), $maxAttempts) } if ($step -eq 'DONE' -or $finish -eq 'true' -or $status -eq 'completed' -or $successFlag -eq 'true') { @@ -821,12 +893,88 @@ function Wait-UpgradeProgress { throw "Node upgrade failed: ip=$Ip, message=$message" } - Start-Sleep -Seconds 2 + Start-Sleep -Seconds $pollIntervalSec } throw "Node upgrade timed out: ip=$Ip" } +function Read-UpgradeProgress { + param( + $Config, + [string]$Token, + [string]$NodeUrl, + [string]$Ip + ) + + $query = Join-RequestPairs ([ordered]@{ + applicationName = $Config.APP_NAME + moduleName = $Config.MODULE_NAME + airportCode = $Config.AIRPORT_CODE + versionNumber = $Config.VERSION_NUMBER + }) + $progressUrl = "$($Config.HOME_BASE_URL)/node-proxy/$($Config.AIRPORT_CODE)/api/mcp/version/upgrade/progress?$query" + $response = Invoke-PamWebRequest -Method GET -Url $progressUrl -Token $Token -Headers @{ + 'Target-Node' = $NodeUrl + } + $progressResponse = Get-ScopedResponseObject -Response $response -ScopeKey $Ip + + $status = Get-ResponseValue -Response $progressResponse -Candidates @('status') + $successFlag = Get-ResponseValue -Response $progressResponse -Candidates @('success') + $step = Get-ResponseValue -Response $progressResponse -Candidates @('step') + $msg = Get-ResponseValue -Response $progressResponse -Candidates @('msg') + $progressValue = Get-ResponseValue -Response $progressResponse -Candidates @('rateOfProgress', 'progress', 'percent', 'data.rateOfProgress', 'data.progress', 'data.percent') + $message = Get-ResponseValue -Response $progressResponse -Candidates @('message') + $code = Get-ResponseValue -Response $progressResponse -Candidates @('code') + $finish = Get-ResponseValue -Response $progressResponse -Candidates @('finish') + $lastModify = Get-ResponseValue -Response $progressResponse -Candidates @('lastModify') + if (-not $message) { $message = $msg } + + $script:UpgradeProgressState = [ordered]@{ + Status = [string]$status + Success = [string]$successFlag + Step = [string]$step + Msg = [string]$msg + Message = [string]$message + RateOfProgress = [string]$progressValue + Code = [string]$code + Finish = [string]$finish + LastModify = [string]$lastModify + RawResponse = [string]$response + } + + $progressParts = [System.Collections.Generic.List[string]]::new() + $progressParts.Add("ip=$Ip") + if ($msg) { $progressParts.Add("msg=$msg") } + if ($step) { $progressParts.Add("step=$step") } + if ($progressValue) { $progressParts.Add("rateOfProgress=$progressValue") } + if ($code) { $progressParts.Add("code=$code") } + if ($finish) { $progressParts.Add("finish=$finish") } + if ($status) { $progressParts.Add("status=$status") } + if ($successFlag) { $progressParts.Add("success=$successFlag") } + if ($lastModify) { $progressParts.Add("lastModify=$lastModify") } + if ($message -and $message -ne $msg) { $progressParts.Add("message=$message") } + + if ($progressParts.Count -gt 1) { + Write-Info ("Step 3.4a: async upgrade progress single query -> {0}" -f ($progressParts -join ', ')) + } else { + Write-Info ("Step 3.4a: async upgrade progress single query returned no explicit progress fields: ip={0}" -f $Ip) + } + + if ($code -and $code -ne '0') { + if (-not $message) { $message = $msg } + if (-not $message) { $message = $step } + if (-not $message) { $message = "code=$code" } + throw "Node upgrade failed: ip=$Ip, message=$message" + } + + if ((@($step, $message, $msg, $status) -join ' ') -match '(?i)fail|error') { + if (-not $message) { $message = $step } + if (-not $message) { $message = $msg } + throw "Node upgrade failed: ip=$Ip, message=$message" + } +} + function Invoke-UpgradeRequest { param($Config, [string]$Token, [string]$NodeUrl, [string]$Ip) @@ -1273,7 +1421,7 @@ function Invoke-PamAction { 'poll-download-progress' { $token = Invoke-FlowStep -Name 'Get-Token' -Action { Get-Token -Config $config } $nodeUrl = Invoke-FlowStep -Name 'Get-NodeUrl' -Action { Get-NodeUrl -Config $config -Token $token } - Invoke-FlowStep -Name 'Wait-DownloadProgress' -Action { Wait-DownloadProgress -Config $config -Token $token -NodeUrl $nodeUrl } | Out-Null + Invoke-FlowStep -Name 'Read-DownloadProgress' -Action { Read-DownloadProgress -Config $config -Token $token -NodeUrl $nodeUrl } | Out-Null Write-DownloadProgressResult } 'download-cloud-to-node' { @@ -1287,7 +1435,7 @@ function Invoke-PamAction { Require-IpArgument -TargetIp $Ip $token = Invoke-FlowStep -Name 'Get-Token' -Action { Get-Token -Config $config } $nodeUrl = Invoke-FlowStep -Name 'Get-NodeUrl' -Action { Get-NodeUrl -Config $config -Token $token } - Invoke-FlowStep -Name "Wait-UpgradeProgress[$Ip]" -Action { Wait-UpgradeProgress -Config $config -Token $token -NodeUrl $nodeUrl -Ip $Ip } | Out-Null + Invoke-FlowStep -Name "Read-UpgradeProgress[$Ip]" -Action { Read-UpgradeProgress -Config $config -Token $token -NodeUrl $nodeUrl -Ip $Ip } | Out-Null Write-UpgradeProgressResult -Ip $Ip } 'upgrade-ip' { diff --git a/doc_scripts/deploy.sh b/doc_scripts/deploy.sh index 8da979d..9110874 100644 --- a/doc_scripts/deploy.sh +++ b/doc_scripts/deploy.sh @@ -57,6 +57,13 @@ usage() { ACTION_TYPE TIMEOUT LOG_NAME + POLL_INTERVAL_SEC + DOWNLOAD_POLL_MAX_ATTEMPTS + UPGRADE_POLL_MAX_ATTEMPTS + +说明: + --action poll-download-progress 和 poll-upgrade-progress 只执行一次进度查询。 + Agent workflow 会重复调用单次进度查询,并在每次返回后交给 LLM/规则审核判断是否完成。 EOF } @@ -342,6 +349,9 @@ set_defaults() { : "${ACTION_TYPE:=FULL}" : "${TIMEOUT:=120}" : "${LOG_NAME:=app.log}" + : "${POLL_INTERVAL_SEC:=2}" + : "${DOWNLOAD_POLL_MAX_ATTEMPTS:=60}" + : "${UPGRADE_POLL_MAX_ATTEMPTS:=600}" } load_config() { @@ -366,7 +376,7 @@ load_config() { value="$(strip_inline_comment "$value")" case "$key" in - HOME_BASE_URL|CLIENT_ID|CLIENT_SECRET|AIRPORT_CODE|APP_NAME|MODULE_NAME|VERSION_NUMBER|ZIP_FILE_PATH|ACTION_TYPE|TIMEOUT|LOG_NAME) + HOME_BASE_URL|CLIENT_ID|CLIENT_SECRET|AIRPORT_CODE|APP_NAME|MODULE_NAME|VERSION_NUMBER|ZIP_FILE_PATH|ACTION_TYPE|TIMEOUT|LOG_NAME|POLL_INTERVAL_SEC|DOWNLOAD_POLL_MAX_ATTEMPTS|UPGRADE_POLL_MAX_ATTEMPTS) printf -v "$key" '%s' "$value" ;; esac @@ -961,8 +971,6 @@ get_online_ips() { poll_download_progress() { local progress_url="${HOME_BASE_URL}/node-proxy/${AIRPORT_CODE}/api/mcp/version/upgrade/download-cloud/progress?applicationName=${APP_NAME}&moduleName=${MODULE_NAME}&airportCode=${AIRPORT_CODE}&versionNumber=${VERSION_NUMBER}" - local attempt=0 - local max_attempts=60 local error_regex='[Ff]ail|[Ee]rror' DOWNLOAD_PROGRESS_STATUS="" @@ -973,65 +981,78 @@ poll_download_progress() { DOWNLOAD_PROGRESS_RATE="" DOWNLOAD_PROGRESS_RESPONSE="" - while (( attempt < max_attempts )); do - local response - response=$(http_request "GET" "$progress_url" "" "" "Target-Node: ${NODE_URL}") || return 1 + local response + response=$(http_request "GET" "$progress_url" "" "" "Target-Node: ${NODE_URL}") || return 1 - local status - status="$(json_value "$response" '.status')" - local success_flag - success_flag="$(json_value "$response" '.success')" - local step_value - step_value="$(json_value "$response" '.step')" - local msg_value - msg_value="$(json_value "$response" '.msg')" - local message - message="$(json_value "$response" '.message')" - local progress_value - progress_value="$(json_value "$response" '.rateOfProgress')" - [[ -z "$progress_value" ]] && progress_value="$(json_value "$response" '.progress')" - [[ -z "$progress_value" ]] && progress_value="$(json_value "$response" '.percent')" - [[ -z "$progress_value" ]] && progress_value="$(json_value "$response" '.data.progress')" - [[ -z "$progress_value" ]] && progress_value="$(json_value "$response" '.data.percent')" + local status + status="$(json_value "$response" '.status')" + local success_flag + success_flag="$(json_value "$response" '.success')" + local step_value + step_value="$(json_value "$response" '.step')" + local msg_value + msg_value="$(json_value "$response" '.msg')" + local message + message="$(json_value "$response" '.message')" + local progress_value + progress_value="$(json_value "$response" '.rateOfProgress')" + [[ -z "$progress_value" ]] && progress_value="$(json_value "$response" '.progress')" + [[ -z "$progress_value" ]] && progress_value="$(json_value "$response" '.percent')" + [[ -z "$progress_value" ]] && progress_value="$(json_value "$response" '.data.progress')" + [[ -z "$progress_value" ]] && progress_value="$(json_value "$response" '.data.percent')" + [[ -z "$message" ]] && message="$msg_value" + DOWNLOAD_PROGRESS_STATUS="$status" + DOWNLOAD_PROGRESS_SUCCESS="$success_flag" + DOWNLOAD_PROGRESS_STEP="$step_value" + DOWNLOAD_PROGRESS_MSG="$msg_value" + DOWNLOAD_PROGRESS_MESSAGE="$message" + DOWNLOAD_PROGRESS_RATE="$progress_value" + DOWNLOAD_PROGRESS_RESPONSE="$response" + + if [[ -n "$msg_value" || -n "$step_value" || -n "$progress_value" || -n "$status" || -n "$success_flag" || -n "$message" ]]; then + local -a progress_parts=() + [[ -n "$msg_value" ]] && progress_parts+=("msg=${msg_value}") + [[ -n "$step_value" ]] && progress_parts+=("step=${step_value}") + [[ -n "$progress_value" ]] && progress_parts+=("rateOfProgress=${progress_value}") + [[ -n "$status" ]] && progress_parts+=("status=${status}") + [[ -n "$success_flag" ]] && progress_parts+=("success=${success_flag}") + [[ -n "$message" && "$message" != "$msg_value" ]] && progress_parts+=("message=${message}") + log_info "Step 3.3b: 异步下载进度单次查询 -> ${progress_parts[*]}" + else + log_info "Step 3.3b: 异步下载进度单次查询未返回明确进度字段。" + fi + + if [[ "${step_value} ${message} ${msg_value} ${status}" =~ $error_regex ]]; then + [[ -z "$message" ]] && message="$step_value" [[ -z "$message" ]] && message="$msg_value" - DOWNLOAD_PROGRESS_STATUS="$status" - DOWNLOAD_PROGRESS_SUCCESS="$success_flag" - DOWNLOAD_PROGRESS_STEP="$step_value" - DOWNLOAD_PROGRESS_MSG="$msg_value" - DOWNLOAD_PROGRESS_MESSAGE="$message" - DOWNLOAD_PROGRESS_RATE="$progress_value" - DOWNLOAD_PROGRESS_RESPONSE="$response" + log_error "Node 下载失败: $message" + return 1 + fi - if [[ -n "$msg_value" || -n "$step_value" || -n "$progress_value" || -n "$status" || -n "$success_flag" || -n "$message" ]]; then - local -a progress_parts=() - [[ -n "$msg_value" ]] && progress_parts+=("msg=${msg_value}") - [[ -n "$step_value" ]] && progress_parts+=("step=${step_value}") - [[ -n "$progress_value" ]] && progress_parts+=("rateOfProgress=${progress_value}") - [[ -n "$status" ]] && progress_parts+=("status=${status}") - [[ -n "$success_flag" ]] && progress_parts+=("success=${success_flag}") - [[ -n "$message" && "$message" != "$msg_value" ]] && progress_parts+=("message=${message}") - log_info "Step 3.3b: 异步下载进度 -> ${progress_parts[*]}" - else - log_info "Step 3.3b: 异步下载进度轮询中... ($((attempt + 1))/${max_attempts})" - fi + return 0 +} - if [[ "$step_value" == "DONE" || "$status" == "completed" || "$success_flag" == "true" ]]; then +download_progress_complete() { + [[ "$DOWNLOAD_PROGRESS_STEP" == "DONE" || "$DOWNLOAD_PROGRESS_STATUS" == "completed" || "$DOWNLOAD_PROGRESS_SUCCESS" == "true" ]] && return 0 + [[ "$DOWNLOAD_PROGRESS_MSG" == "success" && "$DOWNLOAD_PROGRESS_RATE" == "100" ]] && return 0 + return 1 +} + +wait_download_progress() { + local attempt=0 + local max_attempts="${DOWNLOAD_POLL_MAX_ATTEMPTS:-60}" + local interval_sec="${POLL_INTERVAL_SEC:-2}" + [[ "$max_attempts" =~ ^[0-9]+$ ]] || max_attempts=60 + [[ -n "$interval_sec" ]] || interval_sec=2 + + while (( attempt < max_attempts )); do + poll_download_progress || return 1 + if download_progress_complete; then return 0 fi - - if [[ "$msg_value" == "success" && "$progress_value" == "100" ]]; then - return 0 - fi - - if [[ "${step_value} ${message} ${msg_value}" =~ $error_regex ]]; then - [[ -z "$message" ]] && message="$step_value" - [[ -z "$message" ]] && message="$msg_value" - log_error "Node 下载失败: $message" - return 1 - fi - attempt=$((attempt + 1)) - sleep 2 + log_info "Step 3.3b: 异步下载进度未完成,等待下一次查询... (${attempt}/${max_attempts})" + sleep "$interval_sec" done log_error "Node 下载超时。" @@ -1050,14 +1071,12 @@ create_download_task() { download_cloud_to_node() { create_download_task || return 1 - poll_download_progress + wait_download_progress } poll_upgrade_progress() { local ip="$1" local progress_url="${HOME_BASE_URL}/node-proxy/${AIRPORT_CODE}/api/mcp/version/upgrade/progress?applicationName=${APP_NAME}&moduleName=${MODULE_NAME}&airportCode=${AIRPORT_CODE}&versionNumber=${VERSION_NUMBER}" - local attempt=0 - local max_attempts=600 local error_regex='[Ff]ail|[Ee]rror' UPGRADE_PROGRESS_STATUS="" @@ -1071,85 +1090,99 @@ poll_upgrade_progress() { UPGRADE_PROGRESS_LAST_MODIFY="" UPGRADE_PROGRESS_RESPONSE="" - while (( attempt < max_attempts )); do - local response - response=$(http_request "GET" "$progress_url" "" "" "Target-Node: ${NODE_URL}") || return 1 + local response + response=$(http_request "GET" "$progress_url" "" "" "Target-Node: ${NODE_URL}") || return 1 - local status - status="$(json_scoped_value "$response" "$ip" '.status')" - local success_flag - success_flag="$(json_scoped_value "$response" "$ip" '.success')" - local step_value - step_value="$(json_scoped_value "$response" "$ip" '.step')" - local msg_value - msg_value="$(json_scoped_value "$response" "$ip" '.msg')" - local message - message="$(json_scoped_value "$response" "$ip" '.message')" - local progress_value - progress_value="$(json_scoped_value "$response" "$ip" '.rateOfProgress')" - [[ -z "$progress_value" ]] && progress_value="$(json_scoped_value "$response" "$ip" '.progress')" - [[ -z "$progress_value" ]] && progress_value="$(json_scoped_value "$response" "$ip" '.percent')" - local code_value - code_value="$(json_scoped_value "$response" "$ip" '.code')" - local finish_value - finish_value="$(json_scoped_value "$response" "$ip" '.finish')" - local last_modify_value - last_modify_value="$(json_scoped_value "$response" "$ip" '.lastModify')" + local status + status="$(json_scoped_value "$response" "$ip" '.status')" + local success_flag + success_flag="$(json_scoped_value "$response" "$ip" '.success')" + local step_value + step_value="$(json_scoped_value "$response" "$ip" '.step')" + local msg_value + msg_value="$(json_scoped_value "$response" "$ip" '.msg')" + local message + message="$(json_scoped_value "$response" "$ip" '.message')" + local progress_value + progress_value="$(json_scoped_value "$response" "$ip" '.rateOfProgress')" + [[ -z "$progress_value" ]] && progress_value="$(json_scoped_value "$response" "$ip" '.progress')" + [[ -z "$progress_value" ]] && progress_value="$(json_scoped_value "$response" "$ip" '.percent')" + local code_value + code_value="$(json_scoped_value "$response" "$ip" '.code')" + local finish_value + finish_value="$(json_scoped_value "$response" "$ip" '.finish')" + local last_modify_value + last_modify_value="$(json_scoped_value "$response" "$ip" '.lastModify')" + [[ -z "$message" ]] && message="$msg_value" + + UPGRADE_PROGRESS_STATUS="$status" + UPGRADE_PROGRESS_SUCCESS="$success_flag" + UPGRADE_PROGRESS_STEP="$step_value" + UPGRADE_PROGRESS_MSG="$msg_value" + UPGRADE_PROGRESS_MESSAGE="$message" + UPGRADE_PROGRESS_RATE="$progress_value" + UPGRADE_PROGRESS_CODE="$code_value" + UPGRADE_PROGRESS_FINISH="$finish_value" + UPGRADE_PROGRESS_LAST_MODIFY="$last_modify_value" + UPGRADE_PROGRESS_RESPONSE="$response" + + if [[ -n "$msg_value" || -n "$step_value" || -n "$progress_value" || -n "$status" || -n "$success_flag" || -n "$message" || -n "$code_value" || -n "$finish_value" || -n "$last_modify_value" ]]; then + local -a progress_parts=() + progress_parts+=("ip=${ip}") + [[ -n "$msg_value" ]] && progress_parts+=("msg=${msg_value}") + [[ -n "$step_value" ]] && progress_parts+=("step=${step_value}") + [[ -n "$progress_value" ]] && progress_parts+=("rateOfProgress=${progress_value}") + [[ -n "$code_value" ]] && progress_parts+=("code=${code_value}") + [[ -n "$finish_value" ]] && progress_parts+=("finish=${finish_value}") + [[ -n "$status" ]] && progress_parts+=("status=${status}") + [[ -n "$success_flag" ]] && progress_parts+=("success=${success_flag}") + [[ -n "$last_modify_value" ]] && progress_parts+=("lastModify=${last_modify_value}") + [[ -n "$message" && "$message" != "$msg_value" ]] && progress_parts+=("message=${message}") + log_info "Step 3.4a: async push progress single query -> ${progress_parts[*]}" + else + log_info "Step 3.4a: async push progress single query returned no explicit progress fields: ip=${ip}" + fi + + if [[ -n "$code_value" && "$code_value" != "0" ]]; then [[ -z "$message" ]] && message="$msg_value" + [[ -z "$message" ]] && message="$step_value" + [[ -z "$message" ]] && message="code=${code_value}" + log_error "Node push failed: ip=${ip}, message=${message}" + return 1 + fi - UPGRADE_PROGRESS_STATUS="$status" - UPGRADE_PROGRESS_SUCCESS="$success_flag" - UPGRADE_PROGRESS_STEP="$step_value" - UPGRADE_PROGRESS_MSG="$msg_value" - UPGRADE_PROGRESS_MESSAGE="$message" - UPGRADE_PROGRESS_RATE="$progress_value" - UPGRADE_PROGRESS_CODE="$code_value" - UPGRADE_PROGRESS_FINISH="$finish_value" - UPGRADE_PROGRESS_LAST_MODIFY="$last_modify_value" - UPGRADE_PROGRESS_RESPONSE="$response" + if [[ "${step_value} ${message} ${msg_value} ${status}" =~ $error_regex ]]; then + [[ -z "$message" ]] && message="$step_value" + [[ -z "$message" ]] && message="$msg_value" + log_error "Node push failed: ip=${ip}, message=${message}" + return 1 + fi - if [[ -n "$msg_value" || -n "$step_value" || -n "$progress_value" || -n "$status" || -n "$success_flag" || -n "$message" || -n "$code_value" || -n "$finish_value" || -n "$last_modify_value" ]]; then - local -a progress_parts=() - progress_parts+=("ip=${ip}") - [[ -n "$msg_value" ]] && progress_parts+=("msg=${msg_value}") - [[ -n "$step_value" ]] && progress_parts+=("step=${step_value}") - [[ -n "$progress_value" ]] && progress_parts+=("rateOfProgress=${progress_value}") - [[ -n "$code_value" ]] && progress_parts+=("code=${code_value}") - [[ -n "$finish_value" ]] && progress_parts+=("finish=${finish_value}") - [[ -n "$status" ]] && progress_parts+=("status=${status}") - [[ -n "$success_flag" ]] && progress_parts+=("success=${success_flag}") - [[ -n "$last_modify_value" ]] && progress_parts+=("lastModify=${last_modify_value}") - [[ -n "$message" && "$message" != "$msg_value" ]] && progress_parts+=("message=${message}") - log_info "Step 3.4a: async push progress -> ${progress_parts[*]}" - else - log_info "Step 3.4a: async push progress polling... ip=${ip} ($((attempt + 1))/${max_attempts})" - fi + return 0 +} - if [[ "$step_value" == "DONE" || "$finish_value" == "true" || "$status" == "completed" || "$success_flag" == "true" ]]; then +upgrade_progress_complete() { + [[ "$UPGRADE_PROGRESS_STEP" == "DONE" || "$UPGRADE_PROGRESS_FINISH" == "true" || "$UPGRADE_PROGRESS_STATUS" == "completed" || "$UPGRADE_PROGRESS_SUCCESS" == "true" ]] && return 0 + [[ "$UPGRADE_PROGRESS_MSG" == "success" && "$UPGRADE_PROGRESS_RATE" == "100" ]] && [[ -z "$UPGRADE_PROGRESS_CODE" || "$UPGRADE_PROGRESS_CODE" == "0" ]] && return 0 + return 1 +} + +wait_upgrade_progress() { + local ip="$1" + local attempt=0 + local max_attempts="${UPGRADE_POLL_MAX_ATTEMPTS:-600}" + local interval_sec="${POLL_INTERVAL_SEC:-2}" + [[ "$max_attempts" =~ ^[0-9]+$ ]] || max_attempts=600 + [[ -n "$interval_sec" ]] || interval_sec=2 + + while (( attempt < max_attempts )); do + poll_upgrade_progress "$ip" || return 1 + if upgrade_progress_complete; then return 0 fi - - if [[ "$msg_value" == "success" && "$progress_value" == "100" ]] && [[ -z "$code_value" || "$code_value" == "0" ]]; then - return 0 - fi - - if [[ -n "$code_value" && "$code_value" != "0" ]]; then - [[ -z "$message" ]] && message="$msg_value" - [[ -z "$message" ]] && message="$step_value" - [[ -z "$message" ]] && message="code=${code_value}" - log_error "Node push failed: ip=${ip}, message=${message}" - return 1 - fi - - if [[ "${step_value} ${message} ${msg_value} ${status}" =~ $error_regex ]]; then - [[ -z "$message" ]] && message="$step_value" - [[ -z "$message" ]] && message="$msg_value" - log_error "Node push failed: ip=${ip}, message=${message}" - return 1 - fi - attempt=$((attempt + 1)) - sleep 2 + log_info "Step 3.4a: async push progress not complete, waiting for next query... ip=${ip} (${attempt}/${max_attempts})" + sleep "$interval_sec" done log_error "Node push timed out: ip=${ip}" @@ -1522,7 +1555,7 @@ deploy_one_ip() { return fi - if ! run_flow_step "poll_upgrade_progress[${ip}]" poll_upgrade_progress "$ip"; then + if ! run_flow_step "wait_upgrade_progress[${ip}]" wait_upgrade_progress "$ip"; then local message message="$UPGRADE_PROGRESS_MESSAGE" [[ -z "$message" ]] && message="$UPGRADE_PROGRESS_MSG" diff --git a/docs/current_logic_flow.md b/docs/current_logic_flow.md index f8fa12f..b2c1c40 100644 --- a/docs/current_logic_flow.md +++ b/docs/current_logic_flow.md @@ -79,13 +79,19 @@ flowchart TD G4 --> G5[get-node-url] G5 --> G6[get-online-ips] G6 --> G7[create-download-task] - G7 --> G8[poll-download-progress] - G8 --> H[prepare_ip 节点选择下一个 IP action] + G7 --> G8[poll-download-progress 单次查询] + G8 --> G9{LLM/规则判断下载完成} + G9 -- 未完成且正常 --> G8 + G9 -- 已完成 --> H[prepare_ip 节点选择下一个 IP action] + G9 -- 异常或超时 --> R H --> I[resolve_target_ips 计算目标 IP] I --> J[ip_action 节点执行 upgrade-ip] - J --> K[ip_action 节点执行 poll-upgrade-progress] - K --> L[ip_action 节点执行 start-ip] + J --> K[ip_action 节点执行 poll-upgrade-progress 单次查询] + K --> K1{LLM/规则判断推送完成} + K1 -- 未完成且正常 --> K + K1 -- 已完成 --> L[ip_action 节点执行 start-ip] + K1 -- 异常或超时 --> R L --> M[ip_action 节点执行 verify-ip] M --> N[ip_action 节点执行 download-log] N --> O{还有下一个 IP} @@ -133,6 +139,29 @@ flowchart TD - 如果审核建议停止或审核本身失败,当前 action 不会计入 completed,`resume` 会重试当前 action。 - 如果审核本身失败,也会生成“停止继续”的审核结果并暂停流程,避免黑盒继续执行。 +## 进度查询 action 语义 + +```mermaid +flowchart TD + A[poll-download-progress / poll-upgrade-progress] --> B[执行一次进度查询] + B --> C[ActionResult 返回结构化进度字段] + C --> D[LLM/规则审核 progress_complete] + D --> E{是否完成} + E -- 是 --> F[写入 completed,进入下一个 action] + E -- 否但正常 --> G[追加 ACTION_PROGRESS,保存 checkpoint] + G --> H[按 POLL_INTERVAL_SEC 等待] + H --> A + E -- 异常 --> I[暂停在当前 progress action] + G --> J{达到最大查询次数} + J -- 是 --> I + J -- 否 --> H +``` + +- `poll-download-progress` 和 `poll-upgrade-progress` 不再在脚本内部长时间循环;脚本/MCP/fake 每次只返回一次进度查询结果。 +- LLM/规则通过 `progress_complete` 判断进度是否完成。未完成但正常时,`should_continue=true`、`progress_complete=false`,workflow 会保留当前 action 并再次查询。 +- 查询间隔由 `POLL_INTERVAL_SEC` 控制,下载最大次数由 `DOWNLOAD_POLL_MAX_ATTEMPTS` 控制,单 IP 推送最大次数由 `UPGRADE_POLL_MAX_ATTEMPTS` 控制。 +- 每次进度查询都会播报 `ACTION_PROGRESS` 并保存 checkpoint;中断或失败后 `resume` 会从同一个 progress action 继续。 + ## 失败、显式回滚和续跑 ```mermaid diff --git a/packaging/README_linux_package.md b/packaging/README_linux_package.md index e9f91aa..67b6edc 100644 --- a/packaging/README_linux_package.md +++ b/packaging/README_linux_package.md @@ -71,11 +71,13 @@ cd pam-deploy-agent-linux-x86_64 本次发布包对应的运行时行为也已同步到包内 `README.md`: - 每个 action 完成后都会自动执行一次 LLM/规则审核,只有审核通过才会把 action 记为 completed。 +- `poll-download-progress` 和 `poll-upgrade-progress` 是单次进度查询 action;Agent workflow 会按配置重复调用,每次返回后交给 LLM/规则判断是否完成并播报进度。 - `--analyze-actions` 只控制是否把详细审核结果写入 `events`。 - action 失败或审核阻断后会保存 checkpoint 并暂停;修复外部环境后通过 `resume` 从当前 action 重试。 - 回滚不再属于主 workflow 自动分支;需要时使用 chat 内 `rollback [IP]` 或 CLI `rollback --checkpoint ...` 显式执行。 - chat 支持执行中 `Ctrl+C` 中断后保存 checkpoint,再通过 `resume` 重试当前 action。 - chat 支持 `set KEY=VALUE` 和 `load params <路径>` 热更新当前运行任务参数。 +- 进度查询间隔和最大次数可通过 `POLL_INTERVAL_SEC`、`DOWNLOAD_POLL_MAX_ATTEMPTS`、`UPGRADE_POLL_MAX_ATTEMPTS` 配置。 - 支持通过 `--llm-action-analysis-prompt-file` 或 chat 内 `llm config action_analysis_prompt_file=...` 自定义 action 审核提示词。 - chat 支持 `llm test [文本]` 测试当前 LLM client 是否正常加载。 - 默认运行日志写入 `logs/pam_deploy_agent.log`,可通过 `PAM_AGENT_LOG_FILE` 和 `PAM_AGENT_LOG_LEVEL` 调整。 diff --git a/packaging/README_packaged_agent.md b/packaging/README_packaged_agent.md index b0266a1..d246f28 100644 --- a/packaging/README_packaged_agent.md +++ b/packaging/README_packaged_agent.md @@ -37,6 +37,7 @@ pam-deploy-agent-linux-x86_64/ 发布包默认会优先使用 `prompt_toolkit` 增强输入,支持更稳定的退格、历史记录和补全;如果增强输入初始化失败,会自动降级到普通 `input()`。输出仍会在可用时使用 `rich` 做更清晰的文本展示。 action 失败或审核阻断后会保存 checkpoint 并暂停;修复外部环境后输入 `resume` 会从当前 action 重试。回滚不再属于主 workflow 自动分支,需要时在 chat 内输入 `rollback [IP]` 显式执行。 chat 会在执行前归一化并展示实际写入脚本配置的参数;`script_only` / `hybrid_node_mcp` 会先检查 `ZIP_FILE_PATH` 是否存在,避免脚本运行后才用默认路径失败。执行过程中每个 action 都会输出开始、完成或失败状态;每个 action 完成后还会自动进入一次 LLM/规则审核,并播报审核开始和审核结果;只有审核通过才会把 action 记为 completed。 +`poll-download-progress` 和 `poll-upgrade-progress` 每次只查询一次进度,Agent workflow 会按 `POLL_INTERVAL_SEC`、`DOWNLOAD_POLL_MAX_ATTEMPTS`、`UPGRADE_POLL_MAX_ATTEMPTS` 重复调用,并在每次返回后交给 LLM/规则判断是否完成、向 chat 播报进度。 ## 交互式使用 @@ -243,6 +244,7 @@ MCP token 获取方式与 HOME 一致,默认按 `client_credentials` POST 到 - 执行真实 action 前请确认配置文件中的 `HOME_BASE_URL`、`CLIENT_ID`、`CLIENT_SECRET`、`AIRPORT_CODE`、`APP_NAME`、`MODULE_NAME`、`VERSION_NUMBER`、`ZIP_FILE_PATH`。 - `chat` 中输入 `你好`、`hello` 这类问候不会触发 LLM/结构化分析;需要分析部署需求时请直接描述部署任务,或显式使用 `analyze <需求>`。 - 每个 action 完成后都会自动执行一次 LLM/规则审核;`--analyze-actions` 和 `llm action-analysis on` 只控制是否把详细审核结果写入 `events`。 +- `poll-download-progress` 和 `poll-upgrade-progress` 是单次进度查询 action,未完成时不会进入下一个 action;最大查询次数和间隔可通过 `config.txt` 或 chat `set` 热更新。 - `llm test [文本]` 可测试当前 LLM client 是否可用。 - 如果审核建议停止、审核本身失败,或用户在执行中按下 `Ctrl+C`,流程都会保存 checkpoint 并进入暂停状态;后续可使用 `resume` 重试当前 action。 - `set KEY=VALUE` 和 `load params <路径>` 会热更新当前运行任务的参数,并回写运行中的 `config.txt` 和 checkpoint。 diff --git a/pam_deploy_graph/agent.py b/pam_deploy_graph/agent.py index 5e998e5..927f90b 100644 --- a/pam_deploy_graph/agent.py +++ b/pam_deploy_graph/agent.py @@ -32,6 +32,8 @@ REQUIRED_ACTION_VALUES = { "get-node-url": ("NODE_URL",), } +PROGRESS_ACTIONS = {"poll-download-progress", "poll-upgrade-progress"} + class PamDeployAgent: """PAM 部署主 Agent,串联 LLM、action 路由、确认和续跑状态。""" @@ -343,26 +345,18 @@ class PamDeployAgent: backend=backend, ok=False, error_summary=str(exc), - ) - logger.info("全局 action 返回 run_id=%s result=%s", state.run_id, _action_result_for_log(result)) - state.events.append( - { - "type": "ACTION_DONE" if result.ok else "ACTION_FAIL", - "stage": action, - "backend": result.backend, - "message": result.error_summary or "ok", - } ) + logger.info("全局 action 返回 run_id=%s result=%s", state.run_id, _action_result_for_log(result)) analysis = self._append_action_analysis(state, action, result) if not result.ok: - self._emit_progress( - { - "type": "ACTION_FAIL", - "stage": action, - "backend": result.backend, - "message": result.error_summary or "action 执行失败", - } - ) + fail_event = { + "type": "ACTION_FAIL", + "stage": action, + "backend": result.backend, + "message": result.error_summary or "action 执行失败", + } + state.events.append(fail_event) + self._emit_progress(fail_event) state.last_failed_step = action self.pause_state( state, @@ -374,14 +368,14 @@ class PamDeployAgent: missing_values = self._missing_required_values(action, result.values) if missing_values: message = f"{action} 返回缺少必要字段: {', '.join(missing_values)}" - self._emit_progress( - { - "type": "ACTION_FAIL", - "stage": action, - "backend": result.backend, - "message": message, - } - ) + fail_event = { + "type": "ACTION_FAIL", + "stage": action, + "backend": result.backend, + "message": message, + } + state.events.append(fail_event) + self._emit_progress(fail_event) state.last_failed_step = action self.pause_state( state, @@ -397,6 +391,14 @@ class PamDeployAgent: raise RuntimeError(message) if analysis is not None and not analysis.should_continue: state.last_failed_step = action + state.events.append( + { + "type": "ACTION_BLOCKED", + "stage": action, + "backend": result.backend, + "message": analysis.suggested_action or analysis.possible_reason or "LLM 审核要求暂停", + } + ) self.pause_state( state, reason="llm_review_blocked", @@ -404,19 +406,22 @@ class PamDeployAgent: ) logger.info("全局 action 被 LLM 审核拦截 run_id=%s action=%s analysis=%s", state.run_id, action, json_for_log(asdict(analysis))) return state + if self._handle_progress_action(state, action, result, analysis): + return state self._apply_result(state, action, result.values) state.completed_global_steps.append(action) state.last_success_step = action if state.last_failed_step == action: state.last_failed_step = "" - self._emit_progress( - { - "type": "ACTION_DONE", - "stage": action, - "backend": result.backend, - "message": result.values.get("MESSAGE", "ok"), - } - ) + done_message = self._progress_message(action, result) if action in PROGRESS_ACTIONS else result.values.get("MESSAGE", "ok") + done_event = { + "type": "ACTION_DONE", + "stage": action, + "backend": result.backend, + "message": done_message, + } + state.events.append(done_event) + self._emit_progress(done_event) self._save_checkpoint(state) logger.info("全局 action 完成 run_id=%s action=%s completed=%s", state.run_id, action, state.completed_global_steps) return state @@ -550,27 +555,18 @@ class PamDeployAgent: failed, _action_result_for_log(result), ) - state.events.append( - { - "type": "ACTION_FAIL" if failed else "ACTION_DONE", - "stage": action, - "backend": result.backend, - "ip": ip, - "message": result.error_summary or result.values.get("MESSAGE", "ok"), - } - ) analysis = self._append_action_analysis(state, action, result, ip=ip) if failed: - self._emit_progress( - { - "type": "ACTION_FAIL", - "stage": action, - "backend": result.backend, - "ip": ip, - "message": result.error_summary or result.values.get("MESSAGE", "action 执行失败"), - } - ) + fail_event = { + "type": "ACTION_FAIL", + "stage": action, + "backend": result.backend, + "ip": ip, + "message": result.error_summary or result.values.get("MESSAGE", "action 执行失败"), + } + state.events.append(fail_event) + self._emit_progress(fail_event) self._record_ip_failure(state, ip, action, result.error_summary or str(result.values)) self.pause_state( state, @@ -585,6 +581,15 @@ class PamDeployAgent: ip_state["failed_stage"] = action ip_state["failure_reason"] = analysis.possible_reason or analysis.suggested_action or "LLM 审核要求暂停" state.last_failed_step = action + state.events.append( + { + "type": "ACTION_BLOCKED", + "stage": action, + "backend": result.backend, + "ip": ip, + "message": analysis.suggested_action or analysis.possible_reason or "LLM 审核要求暂停", + } + ) self.pause_state( state, reason="llm_review_blocked", @@ -592,6 +597,9 @@ class PamDeployAgent: ) logger.info("IP action 被 LLM 审核拦截 run_id=%s ip=%s action=%s analysis=%s", state.run_id, ip, action, json_for_log(asdict(analysis))) return state + if self._handle_progress_action(state, action, result, analysis, ip=ip): + ip_state["status"] = "RUNNING" + return state self._apply_ip_result(ip_state, action, result.values) ip_state["status"] = "RUNNING" ip_state["failed_stage"] = "" @@ -599,15 +607,16 @@ class PamDeployAgent: completed_steps.append(action) if state.last_failed_step == action: state.last_failed_step = "" - self._emit_progress( - { - "type": "ACTION_DONE", - "stage": action, - "backend": result.backend, - "ip": ip, - "message": result.values.get("MESSAGE", "ok"), - } - ) + done_message = self._progress_message(action, result, ip=ip) if action in PROGRESS_ACTIONS else result.values.get("MESSAGE", "ok") + done_event = { + "type": "ACTION_DONE", + "stage": action, + "backend": result.backend, + "ip": ip, + "message": done_message, + } + state.events.append(done_event) + self._emit_progress(done_event) self._save_checkpoint(state) logger.info("IP action 完成 run_id=%s ip=%s action=%s completed=%s", state.run_id, ip, action, completed_steps) return state @@ -893,6 +902,161 @@ class PamDeployAgent: missing, ) + def _handle_progress_action( + self, + state: AgentState, + action: str, + result: ActionResult, + analysis: LlmActionAnalysis | None, + *, + ip: str | None = None, + ) -> bool: + """处理进度查询 action;未完成时保留当前 action 等待下一次查询。""" + if action not in PROGRESS_ACTIONS: + return False + if ip: + ip_state = state.ip_states.get(ip, {}) + ip_state["progress"] = dict(result.values) + + key = self._poll_attempt_key(action, ip=ip) + if self._progress_complete(action, result, analysis): + state.poll_attempts.pop(key, None) + logger.info( + "进度 action 已完成 run_id=%s action=%s ip=%s values=%s", + state.run_id, + action, + ip or "", + json_for_log(result.values), + ) + return False + + max_attempts, interval_sec = self._poll_limits(state, action) + attempt = state.poll_attempts.get(key, 0) + 1 + state.poll_attempts[key] = attempt + message = self._progress_message(action, result, ip=ip, attempt=attempt, max_attempts=max_attempts) + progress_event = { + "type": "ACTION_PROGRESS", + "stage": action, + "backend": result.backend, + "ip": ip or "", + "message": message, + "attempt": attempt, + "max_attempts": max_attempts, + "values": dict(result.values), + } + state.events.append(progress_event) + self._emit_progress(progress_event) + + if attempt >= max_attempts: + timeout_message = f"{action} 进度查询达到最大次数 {max_attempts},当前仍未完成。{message}" + logger.warning( + "进度 action 超时 run_id=%s action=%s ip=%s attempt=%s max=%s values=%s", + state.run_id, + action, + ip or "", + attempt, + max_attempts, + json_for_log(result.values), + ) + fail_event = { + "type": "ACTION_FAIL", + "stage": action, + "backend": result.backend, + "ip": ip or "", + "message": timeout_message, + } + state.events.append(fail_event) + self._emit_progress(fail_event) + state.last_failed_step = action + self.pause_state( + state, + reason="progress_timeout", + review_context={ + "type": "action_review", + "stage": action, + "ip": ip or "", + "backend": result.backend, + "ok": result.ok, + "error_summary": timeout_message, + "possible_reason": "进度查询超过最大次数但未达到完成条件。", + "suggested_action": "请检查 PAM_HOME/PAM_NODE 任务状态;确认外部任务仍在运行时,可调大轮询次数后 resume 重试当前 action。", + "should_continue": False, + "progress_values": dict(result.values), + "attempt": attempt, + "max_attempts": max_attempts, + }, + ) + return True + + self._save_checkpoint(state) + logger.info( + "进度 action 未完成,等待下一次查询 run_id=%s action=%s ip=%s attempt=%s max=%s interval=%s message=%s", + state.run_id, + action, + ip or "", + attempt, + max_attempts, + interval_sec, + message, + ) + if interval_sec > 0: + time.sleep(interval_sec) + return True + + def _poll_attempt_key(self, action: str, *, ip: str | None = None) -> str: + """生成 progress action 的 checkpoint 计数 key。""" + return f"ip:{ip}:{action}" if ip else f"global:{action}" + + def _poll_limits(self, state: AgentState, action: str) -> tuple[int, float]: + """从运行参数读取轮询最大次数和间隔。""" + interval_sec = _safe_float(state.params.get("POLL_INTERVAL_SEC"), float(DEFAULT_PARAMS["POLL_INTERVAL_SEC"])) + if action == "poll-upgrade-progress": + max_attempts = _safe_int( + state.params.get("UPGRADE_POLL_MAX_ATTEMPTS"), + int(DEFAULT_PARAMS["UPGRADE_POLL_MAX_ATTEMPTS"]), + ) + else: + max_attempts = _safe_int( + state.params.get("DOWNLOAD_POLL_MAX_ATTEMPTS"), + int(DEFAULT_PARAMS["DOWNLOAD_POLL_MAX_ATTEMPTS"]), + ) + return max(max_attempts, 1), max(interval_sec, 0.0) + + def _progress_complete( + self, + action: str, + result: ActionResult, + analysis: LlmActionAnalysis | None, + ) -> bool: + """判断进度 action 是否完成,优先尊重 LLM 明确结论。""" + if analysis is not None and analysis.progress_complete is not None: + return bool(analysis.progress_complete) + return _progress_values_complete(action, result.values) + + def _progress_message( + self, + action: str, + result: ActionResult, + *, + ip: str | None = None, + attempt: int | None = None, + max_attempts: int | None = None, + ) -> str: + """把进度字段格式化为用户和日志可读的短消息。""" + values = result.values + parts: list[str] = [] + if ip: + parts.append(f"IP={ip}") + if attempt is not None and max_attempts is not None: + parts.append(f"第 {attempt}/{max_attempts} 次查询") + for key in ("RATE_OF_PROGRESS", "STEP", "MSG", "STATUS", "SUCCESS", "CODE", "FINISH", "MESSAGE"): + value = values.get(key) + if value not in (None, ""): + parts.append(f"{key}={value}") + if not parts: + parts.append("进度接口已返回,但未包含明确进度字段") + return ",".join(parts) + def _business_failed(self, action: str, values: dict[str, Any]) -> bool: """识别 exit code 之外的业务失败条件。""" if action == "verify-ip": @@ -1026,6 +1190,7 @@ class PamDeployAgent: "has_anomaly": analysis.has_anomaly, "severity": analysis.severity, "should_continue": analysis.should_continue, + "progress_complete": analysis.progress_complete, } ) logger.info( @@ -1052,6 +1217,7 @@ class PamDeployAgent: "pause_reason": state.pause_reason, "last_success_step": state.last_success_step, "last_failed_step": state.last_failed_step, + "poll_attempts": state.poll_attempts, } def _review_context( @@ -1079,6 +1245,7 @@ class PamDeployAgent: "possible_reason": analysis.possible_reason, "suggested_action": analysis.suggested_action, "should_continue": analysis.should_continue, + "progress_complete": analysis.progress_complete, "notes": list(analysis.notes), } ) @@ -1152,3 +1319,44 @@ def _action_result_for_log(result: ActionResult) -> str: }, max_text_len=1000, ) + + +def _progress_values_complete(action: str, values: dict[str, Any]) -> bool: + """根据 action 返回字段判断下载/推送进度是否完成。""" + step = _lower_value(values.get("STEP")) + status = _lower_value(values.get("STATUS")) + msg = _lower_value(values.get("MSG")) + success = _lower_value(values.get("SUCCESS")) + finish = _lower_value(values.get("FINISH")) + code = _lower_value(values.get("CODE")) + rate = _lower_value(values.get("RATE_OF_PROGRESS")) + if step == "done": + return True + if status in ("completed", "complete", "done", "success", "succeeded"): + return True + if success in ("true", "1", "yes"): + return True + if action == "poll-upgrade-progress" and finish in ("true", "1", "yes"): + return True + return msg == "success" and rate == "100" and (not code or code == "0") + + +def _lower_value(value: Any) -> str: + """把结构化字段转换为小写字符串,便于规则判断。""" + return str(value).strip().lower() if value is not None else "" + + +def _safe_int(value: Any, default: int) -> int: + """安全读取整数参数。""" + try: + return int(str(value).strip()) + except (TypeError, ValueError): + return default + + +def _safe_float(value: Any, default: float) -> float: + """安全读取浮点参数。""" + try: + return float(str(value).strip()) + except (TypeError, ValueError): + return default diff --git a/pam_deploy_graph/config_writer.py b/pam_deploy_graph/config_writer.py index 53c8e4d..58c59fc 100644 --- a/pam_deploy_graph/config_writer.py +++ b/pam_deploy_graph/config_writer.py @@ -17,6 +17,9 @@ CONFIG_KEYS = ( "ACTION_TYPE", "TIMEOUT", "LOG_NAME", + "POLL_INTERVAL_SEC", + "DOWNLOAD_POLL_MAX_ATTEMPTS", + "UPGRADE_POLL_MAX_ATTEMPTS", ) diff --git a/pam_deploy_graph/constants.py b/pam_deploy_graph/constants.py index 1dcebf0..1868de7 100644 --- a/pam_deploy_graph/constants.py +++ b/pam_deploy_graph/constants.py @@ -64,6 +64,9 @@ DEFAULT_PARAMS = { "ACTION_TYPE": "FULL", "TIMEOUT": 120, "LOG_NAME": "app.log", + "POLL_INTERVAL_SEC": 2, + "DOWNLOAD_POLL_MAX_ATTEMPTS": 60, + "UPGRADE_POLL_MAX_ATTEMPTS": 600, } # 日志、报告和 LLM 输入中需要脱敏的字段。 diff --git a/pam_deploy_graph/fake_runner.py b/pam_deploy_graph/fake_runner.py index d5b72f0..298a4e5 100644 --- a/pam_deploy_graph/fake_runner.py +++ b/pam_deploy_graph/fake_runner.py @@ -43,6 +43,14 @@ class FakeActionRunner: return {"ACTION": action, "NODE_URL": "https://fake-node.local"} if action == "get-online-ips": return {"ACTION": action, "COUNT": "2", "IP": ["192.168.1.10", "192.168.1.11"]} + if action == "poll-download-progress": + return { + "ACTION": action, + "STEP": "DONE", + "RATE_OF_PROGRESS": "100", + "MSG": "success", + "MESSAGE": "success", + } if action == "upgrade-ip": return {"ACTION": action, "IP": kwargs.get("ip", ""), "RESULT": "TASK_CREATED"} if action == "poll-upgrade-progress": @@ -51,6 +59,7 @@ class FakeActionRunner: "IP": kwargs.get("ip", ""), "STEP": "DONE", "RATE_OF_PROGRESS": "100", + "MSG": "success", "MESSAGE": "success", } if action == "start-ip": diff --git a/pam_deploy_graph/interactive.py b/pam_deploy_graph/interactive.py index bc6bb32..5e5938e 100644 --- a/pam_deploy_graph/interactive.py +++ b/pam_deploy_graph/interactive.py @@ -824,6 +824,8 @@ class InteractiveCliSession: ip = context.get("ip") rollback_hint = f"rollback {ip}" if ip else "rollback " self.output(f"请修复失败原因后输入 resume 重试当前 action;如需回滚,输入 {rollback_hint}。") + elif reason == "progress_timeout": + self.output("请检查外部任务状态;如任务仍在运行,可调大最大查询次数或等待后输入 resume 重试当前进度 action。") elif reason == "rollback_failed": self.output("请检查回滚失败原因;修复后可再次输入 rollback 重试,或人工处理后再 resume。") @@ -848,6 +850,9 @@ class InteractiveCliSession: elif event_type == "ACTION_FAIL": detail = f": {message}" if message else "" self.output(f"失败 action: {stage}{suffix}{detail}") + elif event_type == "ACTION_PROGRESS": + detail = f": {message}" if message else "" + self.output(f"进度更新: {stage}{suffix}{detail}") elif event_type == "ACTION_REVIEW_START": self.output(f"开始分析 action 结果: {stage}{suffix}") elif event_type == "ACTION_REVIEW_DONE": diff --git a/pam_deploy_graph/langgraph_runtime.py b/pam_deploy_graph/langgraph_runtime.py index f42e3b5..f580979 100644 --- a/pam_deploy_graph/langgraph_runtime.py +++ b/pam_deploy_graph/langgraph_runtime.py @@ -80,7 +80,7 @@ class LangGraphDeploymentRuntime: def _config(self) -> dict[str, Any]: """生成 LangGraph checkpointer 使用的线程配置。""" - return {"configurable": {"thread_id": self.thread_id}} + return {"configurable": {"thread_id": self.thread_id}, "recursion_limit": 10000} def _consume(self, chunks: Any) -> LangGraphRunResult: """消费 LangGraph stream 输出,提取状态、报告和旧版 interrupt 请求。""" diff --git a/pam_deploy_graph/llm/openai_compatible.py b/pam_deploy_graph/llm/openai_compatible.py index c490ce0..d778127 100644 --- a/pam_deploy_graph/llm/openai_compatible.py +++ b/pam_deploy_graph/llm/openai_compatible.py @@ -170,6 +170,7 @@ class OpenAICompatibleLlmClient: suggested_action=_string(payload, "suggested_action", ""), requires_confirmation=bool(payload.get("requires_confirmation", False)), should_continue=bool(payload.get("should_continue", True)), + progress_complete=_optional_bool(payload.get("progress_complete")), notes=_string_list(payload.get("notes")), ) @@ -385,6 +386,23 @@ def _float(payload: dict[str, Any], key: str, default: float) -> float: return default +def _optional_bool(value: Any) -> bool | None: + """解析可选布尔值,字段缺失时保留 None。""" + if value is None: + return None + if isinstance(value, bool): + return value + if isinstance(value, str): + lowered = value.strip().lower() + if lowered in ("", "null", "none"): + return None + if lowered in ("true", "1", "yes", "y"): + return True + if lowered in ("false", "0", "no", "n"): + return False + return bool(value) + + def _dict(value: Any) -> dict[str, Any]: """确保返回 dict,非法值降级为空 dict。""" return value if isinstance(value, dict) else {} diff --git a/pam_deploy_graph/llm/prompts.py b/pam_deploy_graph/llm/prompts.py index e8a03ee..5a9280f 100644 --- a/pam_deploy_graph/llm/prompts.py +++ b/pam_deploy_graph/llm/prompts.py @@ -38,7 +38,10 @@ PARAM_PROMPT = """从用户输入中抽取 PAM 部署参数和控制信息。 "ZIP_FILE_PATH": "...", "ACTION_TYPE": "...", "TIMEOUT": "...", - "LOG_NAME": "..." + "LOG_NAME": "...", + "POLL_INTERVAL_SEC": "...", + "DOWNLOAD_POLL_MAX_ATTEMPTS": "...", + "UPGRADE_POLL_MAX_ATTEMPTS": "..." }, "extracted_control": { "user_specified_ips": ["..."] @@ -77,12 +80,17 @@ ACTION_ANALYSIS_PROMPT = """分析一次 PAM action 执行结果。 "suggested_action": "...", "requires_confirmation": false, "should_continue": true, + "progress_complete": null, "notes": ["..."] } 要求: - 必须明确给出 `should_continue`:没有问题时为 true;存在需要人工判断的问题时为 false。 - 如果 exit_code 非 0、ok=false、verify-ip SUCCESS=false、出现 legacy pending_confirmation,应标记异常。 +- 对 `poll-download-progress`、`poll-upgrade-progress` 必须判断 `progress_complete`:已完成为 true;未完成但正常为 false;非进度 action 可为 null。 +- 进度 action 未完成但正常时,`has_anomaly=false`、`should_continue=true`、`progress_complete=false`,建议继续查询进度。 +- 进度 action 完成条件优先看 `STEP=DONE`、`STATUS=completed/done/success`、`SUCCESS=true`、`FINISH=true`,或 `MSG=success` 且 `RATE_OF_PROGRESS=100` 且 `CODE` 为空或 0。 +- 进度 action 出现 `CODE` 非 0,或 `STEP/MSG/STATUS/MESSAGE` 含 fail/error,应标记异常并 `should_continue=false`。 - 主要依据结构化字段 `ok`、`exit_code`、`values`、`error_summary` 判断;只有输入里存在 `diagnostic_log` 时,才把它当作异常诊断上下文。 - 脚本正常过程日志不会作为错误依据,不能因为日志来自 stderr 就判定异常。 - 不要输出密钥、token、Authorization 或完整日志原文。 diff --git a/pam_deploy_graph/llm/rule_based.py b/pam_deploy_graph/llm/rule_based.py index f12f9db..5286650 100644 --- a/pam_deploy_graph/llm/rule_based.py +++ b/pam_deploy_graph/llm/rule_based.py @@ -192,6 +192,7 @@ class RuleBasedLlmClient: suggested_action = "继续观察。" requires_confirmation = False should_continue = True + progress_complete: bool | None = None if not result.ok: severity = "medium" @@ -218,6 +219,25 @@ class RuleBasedLlmClient: notes.append("rollback-ip 失败需要人工处理。") should_continue = False + if action in ("poll-download-progress", "poll-upgrade-progress"): + progress_complete, progress_has_anomaly, progress_reason, progress_note = _analyze_progress_values(action, result.values) + if progress_note: + notes.append(progress_note) + if progress_has_anomaly: + has_anomaly = True + severity = "high" + possible_reason = progress_reason or possible_reason or "进度接口返回失败状态。" + suggested_action = "停止后续 action,检查下载/推送任务状态、PAM_HOME/PAM_NODE 日志和接口返回。" + should_continue = False + elif progress_complete: + has_anomaly = has_anomaly or False + suggested_action = "进度已完成,可以继续下一个 action。" + should_continue = should_continue and True + elif result.ok: + severity = severity if has_anomaly else "info" + suggested_action = "进度未完成,继续查询进度。" + should_continue = should_continue and True + if result.values.get("PENDING_AGENT_CONFIRMATION"): has_anomaly = True severity = "high" @@ -235,6 +255,7 @@ class RuleBasedLlmClient: suggested_action=suggested_action, requires_confirmation=requires_confirmation, should_continue=should_continue, + progress_complete=progress_complete, notes=notes, ) logger.info("规则 LLM action 审核完成 analysis=%s", json_for_log(asdict(analysis))) @@ -265,3 +286,49 @@ class RuleBasedLlmClient: if match: params[key] = match.group(1) return params + + +def _analyze_progress_values(action: str, values: dict[str, Any]) -> tuple[bool, bool, str, str]: + """分析进度字段,返回完成状态、异常状态、原因和备注。""" + step = _lower_value(values.get("STEP")) + status = _lower_value(values.get("STATUS")) + msg = _lower_value(values.get("MSG")) + message = _lower_value(values.get("MESSAGE")) + success = _lower_value(values.get("SUCCESS")) + finish = _lower_value(values.get("FINISH")) + code = _lower_value(values.get("CODE")) + rate = _lower_value(values.get("RATE_OF_PROGRESS")) + + complete = False + if step == "done": + complete = True + elif status in ("completed", "complete", "done", "success", "succeeded"): + complete = True + elif success in ("true", "1", "yes"): + complete = True + elif action == "poll-upgrade-progress" and finish in ("true", "1", "yes"): + complete = True + elif msg == "success" and rate == "100" and (not code or code == "0"): + complete = True + + if code and code != "0": + return complete, True, f"进度接口返回非 0 CODE: {code}", _progress_note(values) + combined = " ".join(item for item in (step, status, msg, message) if item) + if re.search(r"fail|error", combined, flags=re.IGNORECASE): + return complete, True, values.get("MESSAGE") or values.get("MSG") or values.get("STEP") or "进度接口返回失败状态", _progress_note(values) + return complete, False, "", _progress_note(values) + + +def _progress_note(values: dict[str, Any]) -> str: + """把进度核心字段整理成一条备注。""" + parts = [] + for key in ("RATE_OF_PROGRESS", "STEP", "MSG", "STATUS", "SUCCESS", "CODE", "FINISH", "MESSAGE"): + value = values.get(key) + if value not in (None, ""): + parts.append(f"{key}={value}") + return "当前进度: " + ", ".join(parts) if parts else "进度接口未返回明确进度字段。" + + +def _lower_value(value: Any) -> str: + """把字段值转成小写字符串。""" + return str(value).strip().lower() if value is not None else "" diff --git a/pam_deploy_graph/models.py b/pam_deploy_graph/models.py index 2f645bd..dacae5d 100644 --- a/pam_deploy_graph/models.py +++ b/pam_deploy_graph/models.py @@ -100,6 +100,7 @@ class LlmActionAnalysis: suggested_action: str = "" requires_confirmation: bool = False should_continue: bool = True + progress_complete: bool | None = None notes: list[str] = field(default_factory=list) @@ -131,3 +132,4 @@ class AgentState: pause_reason: str = "" review_context: dict[str, Any] = field(default_factory=dict) events: list[dict[str, Any]] = field(default_factory=list) + poll_attempts: dict[str, int] = field(default_factory=dict) diff --git a/pam_deploy_graph/tool_catalog.py b/pam_deploy_graph/tool_catalog.py index 62ef370..5aabc0d 100644 --- a/pam_deploy_graph/tool_catalog.py +++ b/pam_deploy_graph/tool_catalog.py @@ -70,7 +70,7 @@ ACTION_TOOL_SPECS: dict[str, ActionToolSpec] = { name="poll_download_progress", action="poll-download-progress", scope="global", - description="轮询云下载任务进度。", + description="单次查询云下载任务进度;是否继续查询由 Agent workflow 和 LLM 审核决定。", risk_level="medium", ), "upgrade-ip": ActionToolSpec( @@ -85,7 +85,7 @@ ACTION_TOOL_SPECS: dict[str, ActionToolSpec] = { name="poll_upgrade_progress", action="poll-upgrade-progress", scope="ip", - description="轮询单个工作站升级进度。", + description="单次查询单个工作站升级进度;是否继续查询由 Agent workflow 和 LLM 审核决定。", risk_level="medium", ), "start-ip": ActionToolSpec( diff --git a/prompts/action_review.txt b/prompts/action_review.txt index d8b4ba9..c5dd4fa 100644 --- a/prompts/action_review.txt +++ b/prompts/action_review.txt @@ -9,12 +9,17 @@ "suggested_action": "...", "requires_confirmation": false, "should_continue": true, + "progress_complete": null, "notes": ["..."] } 要求: - 必须明确给出 `should_continue`:没有问题时为 true;存在需要人工判断的问题时为 false。 - 如果 exit_code 非 0、ok=false、verify-ip SUCCESS=false、出现旧版 pending_confirmation,应标记异常。 +- 对 `poll-download-progress`、`poll-upgrade-progress` 必须判断 `progress_complete`:已完成为 true;未完成但正常为 false;非进度 action 可为 null。 +- 进度 action 未完成但正常时,`has_anomaly=false`、`should_continue=true`、`progress_complete=false`,建议继续查询进度。 +- 进度 action 完成条件优先看 `STEP=DONE`、`STATUS=completed/done/success`、`SUCCESS=true`、`FINISH=true`,或 `MSG=success` 且 `RATE_OF_PROGRESS=100` 且 `CODE` 为空或 0。 +- 进度 action 出现 `CODE` 非 0,或 `STEP/MSG/STATUS/MESSAGE` 含 fail/error,应标记异常并 `should_continue=false`。 - 主要依据结构化字段 `ok`、`exit_code`、`values`、`error_summary` 判断;只有输入里存在 `diagnostic_log` 时,才把它当作异常诊断上下文。 - 脚本正常过程日志不会作为错误依据,不能因为日志来自 stderr 就判定异常。 - 不要输出密钥、token、Authorization 或完整日志原文。 diff --git a/tests/test_agent_flow.py b/tests/test_agent_flow.py index 0da31ea..8ad73d1 100644 --- a/tests/test_agent_flow.py +++ b/tests/test_agent_flow.py @@ -60,6 +60,39 @@ class BrokenReviewLlmClient: raise RuntimeError("review transport failed") +class ProgressivePollRunner(FakeActionRunner): + """模拟下载和推送进度多次查询后才完成。""" + + def __init__(self) -> None: + super().__init__() + self.download_progress = ["10", "55", "100"] + self.upgrade_progress: dict[str, list[str]] = {} + + def _fixture_for(self, action, kwargs): + if action == "poll-download-progress": + rate = self.download_progress.pop(0) if self.download_progress else "100" + return { + "ACTION": action, + "STEP": "DONE" if rate == "100" else "RUNNING", + "RATE_OF_PROGRESS": rate, + "MSG": "success" if rate == "100" else "running", + "MESSAGE": f"download {rate}%", + } + if action == "poll-upgrade-progress": + ip = kwargs.get("ip", "") + values = self.upgrade_progress.setdefault(str(ip), ["30", "100"]) + rate = values.pop(0) if values else "100" + return { + "ACTION": action, + "IP": ip, + "STEP": "DONE" if rate == "100" else "RUNNING", + "RATE_OF_PROGRESS": rate, + "MSG": "success" if rate == "100" else "running", + "MESSAGE": f"upgrade {rate}%", + } + return super()._fixture_for(action, kwargs) + + def test_run_deploy_flow_success(tmp_path: Path): agent = PamDeployAgent(fake_runner=FakeActionRunner()) state = agent.create_state( @@ -75,6 +108,59 @@ def test_run_deploy_flow_success(tmp_path: Path): assert all(item["status"] == "SUCCESS" for item in state.ip_states.values()) +def test_progress_actions_repeat_until_llm_marks_complete(tmp_path: Path): + fake = ProgressivePollRunner() + agent = PamDeployAgent(fake_runner=fake) + state = agent.create_state( + params={**PARAMS, "POLL_INTERVAL_SEC": 0}, + execution_strategy="fake", + config_path=str(tmp_path / "config.txt"), + checkpoint_path=str(tmp_path / "checkpoint.json"), + ) + + agent.run_deploy_flow(state) + + calls = [call[0] for call in fake.calls] + assert calls.count("poll-download-progress") == 3 + assert calls.count("poll-upgrade-progress") == 4 + assert "poll-download-progress" in state.completed_global_steps + assert state.poll_attempts == {} + assert all(item["status"] == "SUCCESS" for item in state.ip_states.values()) + progress_events = [event for event in state.events if event["type"] == "ACTION_PROGRESS"] + assert any(event["stage"] == "poll-download-progress" and "RATE_OF_PROGRESS=10" in event["message"] for event in progress_events) + assert any(event["stage"] == "poll-upgrade-progress" and event["ip"] == "192.168.1.10" for event in progress_events) + + +def test_progress_timeout_pauses_on_current_action(tmp_path: Path): + fake = FakeActionRunner( + { + "poll-download-progress": { + "ACTION": "poll-download-progress", + "STEP": "RUNNING", + "RATE_OF_PROGRESS": "20", + "MSG": "running", + "MESSAGE": "download 20%", + } + } + ) + agent = PamDeployAgent(fake_runner=fake) + state = agent.create_state( + params={**PARAMS, "POLL_INTERVAL_SEC": 0, "DOWNLOAD_POLL_MAX_ATTEMPTS": 2}, + execution_strategy="fake", + config_path=str(tmp_path / "config.txt"), + checkpoint_path=str(tmp_path / "checkpoint.json"), + ) + + agent.run_deploy_flow(state) + + assert state.paused is True + assert state.pause_reason == "progress_timeout" + assert state.last_failed_step == "poll-download-progress" + assert "poll-download-progress" not in state.completed_global_steps + assert state.review_context["stage"] == "poll-download-progress" + assert state.poll_attempts["global:poll-download-progress"] == 2 + + def test_create_state_writes_absolute_script_config_path_and_normalized_zip(tmp_path: Path): package_path = tmp_path / "pkg.zip" params = {**PARAMS, "ZIP_FILE_PATH": str(package_path)} diff --git a/tests/test_interactive_cli.py b/tests/test_interactive_cli.py index aaa4a9f..a486983 100644 --- a/tests/test_interactive_cli.py +++ b/tests/test_interactive_cli.py @@ -80,6 +80,26 @@ class FlakyVerifyRunner(FakeActionRunner): return super()._fixture_for(action, kwargs) +class ChatProgressRunner(FakeActionRunner): + """让 chat fake 部署产生一次可见的进度更新。""" + + def __init__(self) -> None: + super().__init__() + self.download_progress = ["40", "100"] + + def _fixture_for(self, action, kwargs): + if action == "poll-download-progress": + rate = self.download_progress.pop(0) if self.download_progress else "100" + return { + "ACTION": action, + "STEP": "DONE" if rate == "100" else "RUNNING", + "RATE_OF_PROGRESS": rate, + "MSG": "success" if rate == "100" else "running", + "MESSAGE": f"download {rate}%", + } + return super()._fixture_for(action, kwargs) + + def run_session(session: InteractiveCliSession, inputs: list[str]) -> list[str]: output: list[str] = [] iterator = iter(inputs) @@ -138,6 +158,23 @@ def test_chat_run_prints_action_progress(tmp_path: Path): assert any("分析完成: verify-ip" in item for item in output) +def test_chat_run_prints_progress_poll_updates(tmp_path: Path): + checkpoint = tmp_path / "checkpoint.json" + session = InteractiveCliSession( + agent=PamDeployAgent(fake_runner=ChatProgressRunner()), + params={**PARAMS, "POLL_INTERVAL_SEC": 0}, + strategy="fake", + checkpoint_path=str(checkpoint), + ) + + output = run_session(session, ["run", "yes", "yes", "yes", "exit"]) + + assert any("进度更新: poll-download-progress" in item for item in output) + assert any("RATE_OF_PROGRESS=40" in item for item in output) + assert session.state is not None + assert "poll-download-progress" in session.state.completed_global_steps + + def test_chat_greeting_does_not_trigger_structured_analysis(tmp_path: Path): session = InteractiveCliSession( agent=PamDeployAgent(), @@ -253,7 +290,7 @@ def test_chat_params_events_and_checkpoint_commands(tmp_path: Path): "yes", "yes", "yes", - "events 2", + "events 20", "list checkpoints", "load checkpoint " + str(checkpoint), "exit",