Stack completo de observabilidade: - Victoria Metrics (vmsingle, vmagent, vmalert) - Grafana com dashboards built-in - Alertas customizados (PVC, pods, nodes, deployments) - pvc-autoresizer para expansão automática de volumes - Queries PromQL documentadas Instalação via ArgoCD seguindo padrão GitOps da aula-11.
101 lines
3.7 KiB
YAML
101 lines
3.7 KiB
YAML
# =============================================================================
# VMRule - Regras de Alerta (Referência)
# =============================================================================
#
# Este arquivo é apenas referência. A versão aplicada está em:
#   gitops/apps/victoria-metrics/templates/alerts.yaml
#
# Para aplicar manualmente (sem GitOps):
#   kubectl apply -f alerts/vmalert-rules.yaml -n monitoring
#
# =============================================================================

# VMRule (operator.victoriametrics.com) declaring the custom alerting rules,
# organised into three groups: storage/PVC, pods and nodes.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: custom-alerts
  namespace: monitoring
spec:
  groups:
    # -------------------------------------------------------------------------
    # Storage / PVC alerts
    # -------------------------------------------------------------------------
    - name: storage-alerts
      rules:
        # Warning band: usage above 80% up to and including 95%. The upper
        # bound is inclusive so a volume sitting at exactly 95% is still
        # covered here; PVCFull takes over strictly above 95%.
        - alert: PVCAlmostFull
          expr: |
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.8
            and
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) <= 0.95
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC {{ $labels.persistentvolumeclaim }} está acima de 80%"
            description: "PVC {{ $labels.persistentvolumeclaim }} no namespace {{ $labels.namespace }} está usando {{ $value | humanizePercentage }} do espaço."

        # Critical band: usage strictly above 95%. Short `for` window because
        # the volume is close to running out of space.
        - alert: PVCFull
          expr: |
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.95
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "PVC {{ $labels.persistentvolumeclaim }} está CHEIO (>95%)"
            # Namespace and current usage are included because PVC names are
            # only unique within a namespace.
            description: "AÇÃO URGENTE NECESSÁRIA! PVC {{ $labels.persistentvolumeclaim }} no namespace {{ $labels.namespace }} está usando {{ $value | humanizePercentage }} do espaço."

    # -------------------------------------------------------------------------
    # Pod alerts
    # -------------------------------------------------------------------------
    - name: pod-alerts
      rules:
        # More than 5 container restarts, summed per pod/namespace, over the
        # last hour.
        - alert: PodCrashLooping
          expr: |
            sum(increase(kube_pod_container_status_restarts_total[1h])) by (pod, namespace) > 5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.pod }} está em CrashLoop"

        # Pod has reported Ready=false for 5 minutes. Pods in phase Succeeded
        # (e.g. finished Job/CronJob pods) also report Ready=false, so they are
        # excluded to avoid alerting on normal completions.
        - alert: PodNotReady
          expr: |
            (kube_pod_status_ready{condition="false"} == 1)
            unless on (namespace, pod)
            (kube_pod_status_phase{phase="Succeeded"} == 1)
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ $labels.pod }} não está Ready"

    # -------------------------------------------------------------------------
    # Node alerts
    # -------------------------------------------------------------------------
    - name: node-alerts
      rules:
        # CPU busy percentage per instance, derived as 100 minus the average
        # idle rate over 5 minutes; fires above 90%.
        - alert: NodeHighCPU
          expr: |
            100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node {{ $labels.instance }} com CPU alta"

        # Less than 10% of total memory is available.
        - alert: NodeLowMemory
          expr: |
            (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node {{ $labels.instance }} com pouca memória"

        # Less than 15% free space on a filesystem (tmpfs and overlay are
        # ignored). The expression yields one series per filesystem, so the
        # summary names the mountpoint to tell the alerts of one node apart.
        - alert: NodeDiskFull
          expr: |
            (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Disco do Node {{ $labels.instance }} ({{ $labels.mountpoint }}) quase cheio"