# =============================================================================
# VMRule - Alert Rules (Reference)
# =============================================================================
#
# This file is a reference copy only. The applied version lives in:
#   gitops/apps/victoria-metrics/templates/alerts.yaml
#
# To apply manually (without GitOps):
#   kubectl apply -f alerts/vmalert-rules.yaml -n monitoring
#
# =============================================================================
---
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: custom-alerts
  namespace: monitoring
spec:
  groups:
    # -------------------------------------------------------------------------
    # Storage / PVC alerts
    # -------------------------------------------------------------------------
    - name: storage-alerts
      rules:
        # Warning band: used/capacity ratio above 80% and up to 95%.
        # The upper bound keeps this warning from firing together with
        # PVCFull; it is inclusive (<=) so a volume sitting at exactly 95%
        # is not missed by both rules.
        - alert: PVCAlmostFull
          expr: |
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.8
            and
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) <= 0.95
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC {{ $labels.persistentvolumeclaim }} está acima de 80%"
            description: "PVC {{ $labels.persistentvolumeclaim }} no namespace {{ $labels.namespace }} está usando {{ $value | humanizePercentage }} do espaço."

        # Critical: used/capacity ratio strictly above 95%, held for 1 minute.
        - alert: PVCFull
          expr: |
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.95
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "PVC {{ $labels.persistentvolumeclaim }} está CHEIO (>95%)"
            description: "AÇÃO URGENTE NECESSÁRIA!"

    # -------------------------------------------------------------------------
    # Pod alerts
    # -------------------------------------------------------------------------
    - name: pod-alerts
      rules:
        # More than 5 container restarts over the last hour, summed per
        # (pod, namespace).
        - alert: PodCrashLooping
          expr: |
            sum(increase(kube_pod_container_status_restarts_total[1h])) by (pod, namespace) > 5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.pod }} está em CrashLoop"

        # Pod has reported the Ready condition as "false" for 5 minutes.
        # NOTE(review): this may also match pods that have already finished
        # (e.g. completed Jobs) - confirm whether a phase filter is wanted.
        - alert: PodNotReady
          expr: |
            kube_pod_status_ready{condition="false"} == 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.pod }} não está Ready"

    # -------------------------------------------------------------------------
    # Node alerts
    # -------------------------------------------------------------------------
    - name: node-alerts
      rules:
        # CPU usage percentage = 100 minus the average idle rate (as a
        # percentage) per instance over 5 minutes; fires above 90%.
        - alert: NodeHighCPU
          expr: |
            100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node {{ $labels.instance }} com CPU alta"

        # Less than 10% of total memory is reported as available.
        - alert: NodeLowMemory
          expr: |
            (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node {{ $labels.instance }} com pouca memória"

        # Less than 15% of a filesystem is available; tmpfs and overlay
        # filesystems are excluded by the fstype matcher.
        - alert: NodeDiskFull
          expr: |
            (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Disco do Node {{ $labels.instance }} quase cheio"