diff --git a/checklists/docker/DOCKER-CHECKLIST.md b/checklists/docker/DOCKER-CHECKLIST.md index 78092fb..b0448c9 100644 --- a/checklists/docker/DOCKER-CHECKLIST.md +++ b/checklists/docker/DOCKER-CHECKLIST.md @@ -1,9 +1,27 @@ -# Docker Checklist — Pulse Agent +# Docker Health Checklist — Pulse Agent -## Antes de sessoes de trabalho -- [ ] \`docker ps\` — verificar 19 containers rodando a cada alteracao -- [ ] \`docker ps -a -f status=exited --format '{{.Names}}'\` — limpar orfaos +_Executar ao fim de cada alteracao em stacks Docker e no inicio/fim de todas as sessoes._ -## Quando um servico falhar -- [ ] Identificar stack: \`docker stack ps \` -- [ ] Aplicar recovery: \`docker service update --force _\` +## Fase 1 — Servicos rodando +```bash +docker ps --format 'table {{.Names}}\t{{.Image}}\t{{.Status}}' +docker service ls +``` + +## Fase 2 — Servicos em alerta +```bash +docker stack ps <stack> --no-trunc --no-resolve | grep -Ei "FAILED|Exit|Complete" +``` + +## Fase 3 — Containers orfaos (limpar) +```bash +docker ps -a -f 'status=exited' --format '{{.Names}}' +docker ps -a -f 'status=dead' --format '{{.Names}}' +``` + +## Fase 4 — Log de mudancas +```md +# Mudancas Docker — <YYYY-MM-DD> +- Servico X: scale 2→1 +- Servico Y: force restart +``` diff --git a/checklists/session/SESSION-CHECKLIST.md b/checklists/session/SESSION-CHECKLIST.md index 175cd57..bb5711c 100644 --- a/checklists/session/SESSION-CHECKLIST.md +++ b/checklists/session/SESSION-CHECKLIST.md @@ -1,13 +1,20 @@ # Session Checklist — Pulse Agent Auto-Check -## Início -- [ ] Ler MEMORY.md -- [ ] Ler SESSION-STATE.md -- [ ] Ler LEARNINGS.md | ERRORS.md | PATTERN_COUNTER.md -- [ ] \`docker ps\` — serviços -- [ ] \`df -h\` — disco -- [ ] \`uptime\` — load +_Executar no início e fim de cada sessão._ -## Fim -- [ ] Atualizar \`memory/.md\` -- [ ] Commit de tudo +## Início de Sessão +- [ ] Ler MEMORY.md (memória curada) +- [ ] Ler SESSION-STATE.md (estado atual) +- [ ] Ler LEARNINGS.md | ERRORS.md | 
PATTERN_COUNTER.md +- [ ] `docker ps` — verificar serviços +- [ ] `df -h` — verificar disco +- [ ] `uptime` — verificar load + +## Meio de Sessão (a cada 30min) +- [ ] Checar se processos críticos estão vivos +- [ ] Logar aprendizados em LEARNINGS.md se surgir algo novo +- [ ] Logar erros em ERRORS.md se houver falha + +## Fim de Sessão +- [ ] Atualizar `memory/<YYYY-MM-DD>.md` com resumo do dia +- [ ] Ler `.learnings/LEARNINGS.md` e mencionar diff --git a/runbook/DOCKER-SWARM-RUNBOOK.md b/runbook/DOCKER-SWARM-RUNBOOK.md index fda76ed..3a6fe98 100644 --- a/runbook/DOCKER-SWARM-RUNBOOK.md +++ b/runbook/DOCKER-SWARM-RUNBOOK.md @@ -2,9 +2,9 @@ _Atualizado: 2026-05-20 | Responsável: Pulse Agent_ -## \u0001Servicos por Stack +## 📋 Inventário de Stacks (8 ativos) -| Stack | Servicos | Status | +| Stack | Serviços | Status | |-------|----------|--------| | bot | beebot | 🟢 | | code | file (8dcode) | 🟢 | @@ -15,23 +15,35 @@ _Atualizado: 2026-05-20 | Responsável: Pulse Agent_ | pro | leantime, leantime-db | 🟡 | | proxy | caddy (80/443) | 🟢 | -## Recuperacao rapida +## 🚨 Serviços críticos e seus riscos -\`\`\`bash +| Serviço | Risco | Recuperação | +|---------|-------|-------------| +| `bot_office` | HIGH — OOM kill (exit 137), agora UP porém frágil | `docker service scale bot_office=2` | +| `database_mongos-master` | HIGH — 4 containers falharam exit(139) SIGSEGV | `docker service update --force database_mongos-master` | +| `pro_leantime` | HIGH — 4 containers unhealthy, exit(137) | `docker service update --force pro_leantime` | +| `dock_portainer` | MEDIUM — múltiplos Failed | `docker service update --force dock_portainer` | +| `proxy_caddy` | MEDIUM — mount path inválido em réplicas antigas | fix compose mount | + +## 🔧 Comandos de recuperação rápida + +```bash # Status detalhado docker stack ps <stack> --no-trunc --no-resolve -# Forcar restart +# Forçar recriação docker service update --force <stack>_<service> -# Escalar (forcar nova réplica) +# Escalar (forçar nova réplica) docker service scale <stack>_<service>=2 docker 
service scale <stack>_<service>=1 -# Limpar orfaos -docker ps -a -f 'status=exited' --format '{{.Names}}' | xargs -r docker rm -f -\`\`\` +# Limpar órfãos +docker ps -a -f 'status=exited' --format '{{.Names}}' | xargs -r docker rm -f +docker ps -a -f 'status=dead' --format '{{.Names}}' | xargs -r docker rm -f +``` -## Health check coverage -- 3/19 containers com health check definido -- TODO: adicionar para bot_office, gitea, pro_leantime-db, todos do design stack +## 📊 Health check coverage + +- **3/19** containers com health check definido +- **TODO**: adicionar health check para bot_office, gitea, pro_leantime-db, todos do design stack diff --git a/state/docker-state.json b/state/docker-state.json index b68a4e4..789586e 100644 --- a/state/docker-state.json +++ b/state/docker-state.json @@ -1,16 +1,58 @@ { "lastUpdated": "2026-05-20T11:55:00-03:00", - "generatedBy": "Pulse Agent", - "stacks": { - "bot": {"status": "stable"}, - "code": {"status": "stable"}, - "database": {"status": "warning", "note": "mongos-master com containers Failed"}, - "design": {"status": "stable", "running": 7}, - "dock": {"status": "warning"}, - "git": {"status": "stable"}, - "pro": {"status": "warning", "note": "leantime unhealthy containers"}, - "proxy": {"status": "stable"} + "generatedBy": "Pulse Agent — auto-administracao Docker", + "swarm": { + "nodeID": "x3fm004yzn3j7pvhz0fuuskez", + "hostname": "s1", + "role": "manager-leader", + "engine": "29.4.3", + "clusterID": "plz2xbh64yzhgy88jb9stm0pc" }, - "containersRunning": 19, - "containersWithHealthCheck": 2 + "stacks": { + "bot": { + "services": ["beebot"], + "status": "stable", + "runningContainers": 2, + "note": "1 Running / 1 Failed(exit137)" + }, + "code": { + "services": ["file"], + "status": "stable", + "runningContainers": 1 + }, + "database": { + "services": ["mongos-master", "dbadmin"], + "status": "warning", + "runningContainers": 2, + "note": "mongos-master com replicas Failed + 1 UP" + }, + "design": { + "services": ["penpot", "postgres", 
"valkey", "backend", "frontend", "mcp", "exporter", "mailcatcher"], + "status": "stable", + "runningContainers": 7 + }, + "dock": { + "services": ["portainer", "agent"], + "status": "warning", + "runningContainers": 2, + "note": "portainer com multiplas replicas Failed antigas" + }, + "git": { + "services": ["gitea"], + "status": "stable", + "runningContainers": 1, + "note": "sem health check definido" + }, + "pro": { + "services": ["leantime", "leantime-db"], + "status": "warning", + "runningContainers": 2, + "note": "leantime com 3 containers Failed(exit137, unhealthy)" + }, + "proxy": { + "services": ["caddy"], + "status": "stable", + "runningContainers": 1 + } + } }