Files
workshop/aula-12/alerts/vmalert-rules.yaml
ArgoCD Setup 4b92838ac3 aula-12: Victoria Metrics + Grafana via GitOps
Stack completo de observabilidade:
- Victoria Metrics (vmsingle, vmagent, vmalert)
- Grafana com dashboards built-in
- Alertas customizados (PVC, pods, nodes, deployments)
- pvc-autoresizer para expansão automática de volumes
- Queries PromQL documentadas

Instalação via ArgoCD seguindo padrão GitOps da aula-11.
2026-01-08 17:11:28 -03:00

101 lines
3.7 KiB
YAML

# =============================================================================
# VMRule - Regras de Alerta (Referência)
# =============================================================================
#
# Este arquivo é apenas referência. A versão aplicada está em:
# gitops/apps/victoria-metrics/templates/alerts.yaml
#
# Para aplicar manualmente (sem GitOps):
# kubectl apply -f alerts/vmalert-rules.yaml -n monitoring
#
# =============================================================================
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: custom-alerts
  namespace: monitoring
spec:
  groups:
    # -------------------------------------------------------------------------
    # Storage / PVC alerts
    # -------------------------------------------------------------------------
    - name: storage-alerts
      rules:
        # Warning band: PVC usage ratio in (80%, 95%]. The upper bound is
        # inclusive so that a volume sitting at exactly 95% is still covered
        # here; PVCFull below only fires strictly above 95%.
        - alert: PVCAlmostFull
          expr: |
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.8
            and
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) <= 0.95
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC {{ $labels.persistentvolumeclaim }} está acima de 80%"
            description: "PVC {{ $labels.persistentvolumeclaim }} no namespace {{ $labels.namespace }} está usando {{ $value | humanizePercentage }} do espaço."
        # Critical band: PVC usage ratio strictly above 95%, held for 1 minute.
        - alert: PVCFull
          expr: |
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.95
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "PVC {{ $labels.persistentvolumeclaim }} está CHEIO (>95%)"
            description: "AÇÃO URGENTE NECESSÁRIA!"
    # -------------------------------------------------------------------------
    # Pod alerts
    # -------------------------------------------------------------------------
    - name: pod-alerts
      rules:
        # More than 5 container restarts, summed per (pod, namespace), over the
        # last hour, sustained for 5 minutes.
        - alert: PodCrashLooping
          expr: |
            sum(increase(kube_pod_container_status_restarts_total[1h])) by (pod, namespace) > 5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.pod }} está em CrashLoop"
        # Pod reports Ready=false for 5 minutes. Pods in phase Succeeded are
        # excluded: kube-state-metrics also reports Ready=false for pods that
        # ran to completion (e.g. finished Job pods), which would otherwise
        # keep this alert firing for work that ended normally.
        - alert: PodNotReady
          expr: |
            (kube_pod_status_ready{condition="false"} == 1)
            unless on (namespace, pod)
            (kube_pod_status_phase{phase="Succeeded"} == 1)
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.pod }} não está Ready"
    # -------------------------------------------------------------------------
    # Node alerts
    # -------------------------------------------------------------------------
    - name: node-alerts
      rules:
        # CPU usage in percent, computed as 100 minus the per-instance average
        # idle rate (5m window) times 100; fires above 90%.
        - alert: NodeHighCPU
          expr: |
            100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node {{ $labels.instance }} com CPU alta"
        # Available memory below 10% of total memory.
        - alert: NodeLowMemory
          expr: |
            (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node {{ $labels.instance }} com pouca memória"
        # Available space below 15% of filesystem size, ignoring tmpfs and
        # overlay filesystems.
        # NOTE(review): the expression is not aggregated, so it presumably
        # yields one alert per filesystem while the summary only names the
        # instance - consider exposing the mountpoint label; confirm.
        - alert: NodeDiskFull
          expr: |
            (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Disco do Node {{ $labels.instance }} quase cheio"