一套可直接落地的 **Hyperf 事故复盘与演练平台(工程版)** 开源方案,覆盖“从 0 搭建到持续维护”,并给出关键代码骨架(可运行方向)。

---

## 1)平台目标(工程版 MVP)

先做这 8 个核心能力:

1. 事故登记与状态流转(发现 -> 处理中 -> 已恢复 -> 已复盘)
2. 事故时间线(Timeline)自动沉淀
3. 复盘报告模板化生成(5 Whys + CAPA)
4. 演练计划(定时演练、手动演练)
5. 演练执行记录(步骤、结果、评分)
6. 行动项(Owner、截止时间、跟踪状态)
7. 指标看板(MTTA、MTTR、复发率、演练通过率)
8. API + Webhook(对接告警平台/IM)

---

## 2)推荐仓库结构

```text
hyperf-incident-drill/
├─ app/
│  ├─ Controller/
│  │  ├─ IncidentController.php
│  │  ├─ PostmortemController.php
│  │  ├─ DrillPlanController.php
│  │  └─ MetricsController.php
│  ├─ Model/
│  │  ├─ Incident.php
│  │  ├─ IncidentTimeline.php
│  │  ├─ Postmortem.php
│  │  ├─ ActionItem.php
│  │  ├─ DrillPlan.php
│  │  └─ DrillRun.php
│  ├─ Service/
│  │  ├─ IncidentService.php
│  │  ├─ PostmortemService.php
│  │  ├─ DrillService.php
│  │  └─ MetricsService.php
│  ├─ Job/
│  │  ├─ DrillRunJob.php
│  │  └─ ReminderActionItemJob.php
│  ├─ Crontab/
│  │  └─ DrillSchedulerCrontab.php
│  └─ Middleware/
│     └─ AuthzMiddleware.php
├─ config/autoload/
│  ├─ routes.php
│  ├─ async_queue.php
│  ├─ crontab.php
│  └─ databases.php
├─ migrations/
├─ tests/
├─ docker-compose.yml
├─ .github/workflows/ci.yml
├─ README.md
├─ SECURITY.md
└─ LICENSE
```

---

## 3)从 0 初始化

```bash
composer create-project hyperf/hyperf-skeleton hyperf-incident-drill
cd hyperf-incident-drill
composer require hyperf/db-connection hyperf/database
composer require hyperf/async-queue hyperf/redis
composer require hyperf/crontab hyperf/validation
composer require ramsey/uuid nesbot/carbon
composer require --dev phpunit/phpunit phpstan/phpstan friendsofphp/php-cs-fixer
```

---

## 4)数据库设计(关键表)

### 4.1 `incidents`(事故主表)
```php
// Incident master table: one row per incident, with lifecycle timestamps.
Schema::create('incidents', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->string('incident_no', 64)->unique();
    $table->string('title', 200);
    $table->tinyInteger('severity')->default(2); // 1~4
    $table->tinyInteger('status')->default(1);   // 1 open, 2 mitigating, 3 recovered, 4 reviewed
    $table->string('service', 100)->nullable();
    $table->string('commander', 64)->nullable();
    $table->timestamp('detected_at')->nullable();
    $table->timestamp('mitigated_at')->nullable();
    $table->timestamp('resolved_at')->nullable();
    $table->timestamp('reviewed_at')->nullable();
    $table->json('tags')->nullable();
    $table->timestamps();

    $table->index(['status', 'severity']);
    $table->index(['detected_at']);
});
```

### 4.2 `incident_timelines`(时间线)

```php
// Timeline entries attached to an incident, indexed for chronological reads.
Schema::create('incident_timelines', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->unsignedBigInteger('incident_id');
    $table->timestamp('event_time');
    $table->string('event_type', 50); // alert/mitigation/recovery/note
    $table->text('content');
    $table->string('operator', 64)->nullable();
    $table->timestamps();

    $table->index(['incident_id', 'event_time']);
});
```

### 4.3 `postmortems`(复盘报告)

```php
// Postmortem report; the unique index allows at most one report per incident.
Schema::create('postmortems', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->unsignedBigInteger('incident_id')->unique();
    $table->text('impact_summary')->nullable();
    $table->json('root_causes')->nullable();
    $table->json('five_whys')->nullable();
    $table->json('lessons')->nullable();
    $table->tinyInteger('status')->default(1); // 1 draft, 2 published
    $table->timestamps();
});
```

### 4.4 `action_items`(改进行动项)

```php
// Follow-up action items, optionally linked to an incident and/or a postmortem.
Schema::create('action_items', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->unsignedBigInteger('incident_id')->nullable();
    $table->unsignedBigInteger('postmortem_id')->nullable();
    $table->string('title', 200);
    $table->string('owner', 64);
    $table->date('due_date')->nullable();
    $table->tinyInteger('priority')->default(2); // 1 high, 2 med, 3 low
    $table->tinyInteger('status')->default(1);   // 1 todo, 2 doing, 3 done, 4 overdue
    $table->timestamps();

    $table->index(['owner', 'status']);
    $table->index(['due_date', 'status']);
});
```

### 4.5 `drill_plans` / `drill_runs`(演练计划与执行)
```php
// Drill plan definition; cron_expr is optional, next_run_at is maintained by the scheduler.
Schema::create('drill_plans', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->string('name', 150);
    $table->string('target_service', 100);
    $table->string('scenario', 200);
    $table->string('cron_expr', 64)->nullable();
    $table->json('steps'); // drill steps
    $table->tinyInteger('enabled')->default(1);
    $table->timestamp('next_run_at')->nullable();
    $table->timestamps();
});

// One row per execution of a drill plan.
Schema::create('drill_runs', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->unsignedBigInteger('plan_id');
    $table->string('run_no', 64)->unique();
    $table->tinyInteger('status')->default(1); // 1 running, 2 pass, 3 fail
    $table->unsignedInteger('score')->default(0);
    $table->json('result')->nullable();
    $table->timestamp('started_at')->nullable();
    $table->timestamp('finished_at')->nullable();
    $table->timestamps();

    $table->index(['plan_id', 'created_at']);
});
```

---

## 5)核心代码骨架

### 5.1 事故服务(状态流转 + 时间线)

`app/Service/IncidentService.php`

```php
<?php

declare(strict_types=1);

namespace App\Service;

use App\Model\Incident;
use App\Model\IncidentTimeline;
use Hyperf\DbConnection\Db;
use Ramsey\Uuid\Uuid;

/**
 * Creates incidents, moves them through their status lifecycle and records
 * every change on the incident timeline.
 */
final class IncidentService
{
    /**
     * Create an incident (status 1) and its first timeline entry in one transaction.
     *
     * @param array $data Requires `title`; optional `severity`, `service`, `commander`,
     *                    `detected_at`, `tags` and `operator`.
     */
    public function create(array $data): Incident
    {
        return Db::transaction(function () use ($data) {
            $incident = Incident::query()->create([
                'incident_no' => 'INC-' . date('Ymd') . '-' . substr(Uuid::uuid4()->toString(), 0, 8),
                'title' => $data['title'],
                'severity' => (int) ($data['severity'] ?? 2),
                'status' => 1,
                'service' => $data['service'] ?? null,
                'commander' => $data['commander'] ?? null,
                'detected_at' => $data['detected_at'] ?? date('Y-m-d H:i:s'),
                'tags' => $data['tags'] ?? [],
            ]);

            $this->appendTimeline(
                (int) $incident->id,
                'alert',
                'incident created',
                $data['operator'] ?? 'system'
            );

            return $incident;
        });
    }

    /**
     * Set the incident status, stamp the matching lifecycle column and log the change.
     *
     * @param int $toStatus 1 open, 2 mitigating, 3 recovered, 4 reviewed
     *
     * @throws \InvalidArgumentException when $toStatus is not one of 1..4
     */
    public function transition(int $incidentId, int $toStatus, string $operator, string $note): void
    {
        // Reject unknown statuses up front so an arbitrary integer is never persisted.
        $stampColumn = match ($toStatus) {
            1 => null,
            2 => 'mitigated_at',
            3 => 'resolved_at',
            4 => 'reviewed_at',
            default => throw new \InvalidArgumentException("Unknown incident status: {$toStatus}"),
        };

        Db::transaction(function () use ($incidentId, $toStatus, $operator, $note, $stampColumn) {
            $incident = Incident::query()->findOrFail($incidentId);
            $incident->status = $toStatus;

            if ($stampColumn !== null) {
                $incident->{$stampColumn} = date('Y-m-d H:i:s');
            }

            $incident->save();

            $this->appendTimeline($incidentId, 'status_change', "status->{$toStatus}; {$note}", $operator);
        });
    }

    /**
     * Append a timeline entry stamped with the current time.
     */
    public function appendTimeline(int $incidentId, string $type, string $content, string $operator): void
    {
        IncidentTimeline::query()->create([
            'incident_id' => $incidentId,
            'event_time' => date('Y-m-d H:i:s'),
            'event_type' => $type,
            'content' => $content,
            'operator' => $operator,
        ]);
    }
}
```

### 5.2 演练执行 Job(异步)

`app/Job/DrillRunJob.php`

```php
<?php

declare(strict_types=1);

namespace App\Job;

use App\Model\DrillPlan;
use App\Model\DrillRun;
use Hyperf\AsyncQueue\Job;
use Ramsey\Uuid\Uuid;

/**
 * Queue job that executes one drill plan and stores the outcome as a DrillRun.
 */
final class DrillRunJob extends Job
{
    public function __construct(public int $planId)
    {
    }

    /**
     * Create the run record, walk the plan steps and save the status, score and per-step result.
     */
    public function handle(): void
    {
        $plan = DrillPlan::query()->findOrFail($this->planId);

        $run = DrillRun::query()->create([
            'plan_id' => $plan->id,
            'run_no' => 'DR-' . date('YmdHis') . '-' . substr(Uuid::uuid4()->toString(), 0, 6),
            'status' => 1,
            'started_at' => date('Y-m-d H:i:s'),
        ]);

        // NOTE(review): assumes the DrillPlan model casts `steps` to an array — confirm.
        // array_values() guarantees integer keys so the 1-based step number is always valid.
        $steps = array_values($plan->steps ?? []);
        $result = [];
        $score = 100;
        $failed = false;

        foreach ($steps as $idx => $step) {
            try {
                // Replace with the real drill action: call the load-test platform,
                // the fault-injection API or the script gateway.
                $result[] = ['step' => $idx + 1, 'name' => $step['name'] ?? 'step', 'ok' => true];
            } catch (\Throwable $e) {
                // A failing step is recorded, not rethrown, so the remaining steps still run.
                $failed = true;
                $score -= 20;
                $result[] = [
                    'step' => $idx + 1,
                    'name' => $step['name'] ?? 'step',
                    'ok' => false,
                    'error' => $e->getMessage(),
                ];
            }
        }

        $run->status = $failed ? 3 : 2;
        $run->score = max(0, $score); // clamp: the column is unsigned
        $run->result = $result;
        $run->finished_at = date('Y-m-d H:i:s');
        $run->save();
    }
}
```

### 5.3 定时调度演练(每分钟扫描)

`app/Crontab/DrillSchedulerCrontab.php`

```php
<?php

declare(strict_types=1);

namespace App\Crontab;

use App\Job\DrillRunJob;
use App\Model\DrillPlan;
use Cron\CronExpression;
use Hyperf\AsyncQueue\Driver\DriverFactory;
use Hyperf\Crontab\Annotation\Crontab;
use Hyperf\DbConnection\Db;

/**
 * Scans enabled drill plans every minute and enqueues a DrillRunJob for each due plan.
 */
final class DrillSchedulerCrontab
{
    public function __construct(private DriverFactory $driverFactory)
    {
    }

    #[Crontab(rule: '* * * * *', memo: 'schedule drill plans', singleton: true)]
    public function execute(): void
    {
        // Step 1: inside one transaction, lock the candidate plans, advance next_run_at
        // for every due plan and collect their ids.
        $duePlanIds = Db::transaction(function (): array {
            $due = [];
            $now = time();

            $plans = DrillPlan::query()
                ->where('enabled', 1)
                ->whereNotNull('cron_expr')
                ->lockForUpdate()
                ->get();

            foreach ($plans as $plan) {
                // An empty-string expression is treated like "no schedule".
                if (! $plan->cron_expr) {
                    continue;
                }

                // Not due yet. A null next_run_at means "never scheduled" and runs now.
                if ($plan->next_run_at && strtotime((string) $plan->next_run_at) > $now) {
                    continue;
                }

                $plan->next_run_at = CronExpression::factory($plan->cron_expr)
                    ->getNextRunDate()
                    ->format('Y-m-d H:i:s');
                $plan->save();

                $due[] = (int) $plan->id;
            }

            return $due;
        });

        // Step 2: enqueue only after the commit. Pushing inside the transaction would leave
        // jobs in the queue if the transaction rolled back, and the same plans would be
        // pushed again on the next tick.
        $queue = $this->driverFactory->get('default');
        foreach ($duePlanIds as $planId) {
            $queue->push(new DrillRunJob($planId));
        }
    }
}
```

### 5.4 指标服务(MTTA / MTTR)

`app/Service/MetricsService.php`

```php
<?php

declare(strict_types=1);

namespace App\Service;

use App\Model\Incident;
use Carbon\Carbon;

/**
 * Computes headline incident metrics from the lifecycle timestamps.
 */
final class MetricsService
{
    /**
     * @return array{incident_total:int, mtta_min:?float, mttr_min:?float}
     *         MTTA is measured detected_at -> mitigated_at, MTTR detected_at -> resolved_at,
     *         both in minutes; each average is null when no incident has that timestamp.
     */
    public function summary(): array
    {
        $incidents = Incident::query()->whereNotNull('detected_at')->get();

        $mtta = [];
        $mttr = [];

        foreach ($incidents as $incident) {
            $detectedAt = Carbon::parse($incident->detected_at);

            if ($incident->mitigated_at) {
                $mtta[] = $detectedAt->diffInMinutes(Carbon::parse($incident->mitigated_at));
            }

            if ($incident->resolved_at) {
                $mttr[] = $detectedAt->diffInMinutes(Carbon::parse($incident->resolved_at));
            }
        }

        return [
            'incident_total' => $incidents->count(),
            'mtta_min' => $this->average($mtta),
            'mttr_min' => $this->average($mttr),
        ];
    }

    /**
     * Mean of the samples rounded to two decimals, or null for an empty list.
     *
     * @param list<int|float> $samples
     */
    private function average(array $samples): ?float
    {
        if ($samples === []) {
            return null;
        }

        return round(array_sum($samples) / count($samples), 2);
    }
}
```

---

## 6)API 路由(最小闭环)

`config/autoload/routes.php`

```php
<?php

use App\Controller\DrillPlanController;
use App\Controller\IncidentController;
use App\Controller\MetricsController;
use App\Controller\PostmortemController;
use Hyperf\HttpServer\Router\Router;

Router::addGroup('/api', function () {
    // `\d+` (one or more digits): a bare `\d` would only match single-digit ids.
    Router::post('/incidents', [IncidentController::class, 'create']);
    Router::post('/incidents/{id:\d+}/transition', [IncidentController::class, 'transition']);
    Router::post('/incidents/{id:\d+}/timeline', [IncidentController::class, 'appendTimeline']);
    Router::post('/incidents/{id:\d+}/postmortem', [PostmortemController::class, 'upsert']);

    Router::post('/drill/plans', [DrillPlanController::class, 'create']);
    Router::post('/drill/plans/{id:\d+}/run', [DrillPlanController::class, 'runNow']);

    Router::get('/metrics/summary', [MetricsController::class, 'summary']);
});
```

---

## 7)本地运行与基础设施

`docker-compose.yml`(最小)

```yaml
version: "3.8"
services:
  mysql:
    image: mysql:8.0
    environment:
      MYSQL_ROOT_PASSWORD: root
      MYSQL_DATABASE: incident
    ports: ["3306:3306"]
  redis:
    image: redis:7
    ports: ["6379:6379"]
```

---

## 8)CI/CD(开源必须)

`.github/workflows/ci.yml` 至少包含:

- `composer validate`
- `php-cs-fixer --dry-run`
- `phpstan analyse`
- `phpunit`
- 集成测试(创建 incident -> 状态流转 -> 生成 postmortem -> 触发演练 run)

---

## 9)开源发布完整流程

1. LICENSE(MIT / Apache-2.0)
2. README.md(快速启动、状态机、API 示例、架构图)
3. SECURITY.md(漏洞提交通道和 SLA)
4. Issue/PR 模板
5. 首版标签:v0.1.0
6. 发布后维护 CHANGELOG.md(Breaking Change 要写迁移步骤)

---

## 10)持续维护路线图(建议)

- v0.1:事故、时间线、复盘、演练、指标闭环
- v0.2:RBAC、多租户、Webhook(飞书/Slack/钉钉)
- v0.3:演练评分模型(RTO/RPO/响应协作得分)
- v1.0:审计日志、SSO、插件化演练动作(K8s/DB/Cache)

---

## 11)工程版最容易踩坑的点

1. 演练任务与生产任务混跑(缺少资源隔离)
2. 时间线依赖人工补录,导致复盘不完整
3. MTTR 口径不统一(检测时间/恢复时间定义混乱)
4. Action Item 没有逾期提醒和闭环追踪
5. 演练“只执行不复盘”,无法沉淀改进资产

---

这套骨架已经能作为开源首版:先上线 Incident + Timeline + Postmortem + Drill + Metrics 主链路,再迭代权限、通知和插件能力。