aula-12: Victoria Metrics + Grafana via GitOps

Stack completo de observabilidade:
- Victoria Metrics (vmsingle, vmagent, vmalert)
- Grafana com dashboards built-in
- Alertas customizados (PVC, pods, nodes, deployments)
- pvc-autoresizer para expansão automática de volumes
- Queries PromQL documentadas

Instalação via ArgoCD seguindo padrão GitOps da aula-11.
This commit is contained in:
ArgoCD Setup
2026-01-08 17:11:28 -03:00
parent e75b245c3b
commit 4b92838ac3
9 changed files with 1939 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
# Helm chart metadata for the monitoring stack.
# The chart ships no workload of its own beyond its templates; the metrics
# stack itself is pulled in through the single dependency declared below.
apiVersion: v2
name: monitoring-stack
description: Victoria Metrics stack for Kubernetes monitoring
type: application

# Chart version and the version of the packaged application.
version: 1.0.0
appVersion: '1.0.0'

# Upstream chart that provides the actual components; its version is pinned.
dependencies:
  - name: victoria-metrics-k8s-stack
    version: '0.28.4'
    repository: https://victoriametrics.github.io/helm-charts/

View File

@@ -0,0 +1,179 @@
# =============================================================================
# VMRule - Custom Alerting Rules
# =============================================================================
# Documentation: https://docs.victoriametrics.com/operator/resources/vmrule/
#
# Helm template: annotation placeholders such as $labels / $value are wrapped
# in {{ "{{" }} ... {{ "}}" }} so that Helm emits literal double braces and the
# alert template is expanded later, at alert-evaluation time.
# =============================================================================
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: custom-alerts
  namespace: {{ .Release.Namespace }}
spec:
  groups:
    # -------------------------------------------------------------------------
    # Storage / PVC alerts
    # -------------------------------------------------------------------------
    - name: storage-alerts
      rules:
        # Warning band: usage above 80% and below 95% (PVCFull takes over at 95%).
        - alert: PVCAlmostFull
          expr: |
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.8
            and
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) < 0.95
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} está acima de 80%"
            description: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} está usando {{ "{{" }} $value | humanizePercentage {{ "}}" }} do espaço."
        # Critical band: uses >= so that a usage of exactly 95% is not missed by
        # both this rule and PVCAlmostFull (which stops at < 0.95).
        - alert: PVCFull
          expr: |
            (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) >= 0.95
          for: 1m
          labels:
            severity: critical
          annotations:
            summary: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} está CHEIO (>95%)"
            description: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} está usando {{ "{{" }} $value | humanizePercentage {{ "}}" }} do espaço. AÇÃO URGENTE NECESSÁRIA!"
        # Any PVC whose active phase is something other than Bound for 5 minutes.
        - alert: PVCNotBound
          expr: |
            kube_persistentvolumeclaim_status_phase{phase!="Bound"} == 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} não está Bound"
            description: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} está no estado {{ "{{" }} $labels.phase {{ "}}" }}."
    # -------------------------------------------------------------------------
    # Pod alerts
    # -------------------------------------------------------------------------
    - name: pod-alerts
      rules:
        # More than 5 container restarts per pod over the last hour.
        - alert: PodCrashLooping
          expr: |
            sum(increase(kube_pod_container_status_restarts_total[1h])) by (pod, namespace) > 5
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ "{{" }} $labels.pod {{ "}}" }} está em CrashLoop"
            description: "Pod {{ "{{" }} $labels.pod {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} reiniciou {{ "{{" }} $value {{ "}}" }} vezes na última hora."
        # Restricted to pods in phase Running: pods that already finished
        # (e.g. completed Job pods) also report ready=false and would otherwise
        # trigger this alert even though nothing is wrong.
        - alert: PodNotReady
          expr: |
            kube_pod_status_ready{condition="false"} == 1
            and on(namespace, pod)
            kube_pod_status_phase{phase="Running"} == 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ "{{" }} $labels.pod {{ "}}" }} não está Ready"
            description: "Pod {{ "{{" }} $labels.pod {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} não está pronto há mais de 5 minutos."
        # Pod stuck in Pending for 10 minutes.
        - alert: PodPending
          expr: |
            kube_pod_status_phase{phase="Pending"} == 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Pod {{ "{{" }} $labels.pod {{ "}}" }} está Pending"
            description: "Pod {{ "{{" }} $labels.pod {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} está pendente há mais de 10 minutos. Verifique recursos ou imagens."
        # The last-terminated-reason gauge keeps reporting OOMKilled until the
        # container terminates again for a different reason, so on its own it
        # never resolves. Gating it on a restart within the last 10 minutes makes
        # the alert fire on fresh OOM kills only and clear afterwards.
        - alert: ContainerOOMKilled
          expr: |
            increase(kube_pod_container_status_restarts_total[10m]) > 0
            and ignoring(reason)
            kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "Container {{ "{{" }} $labels.container {{ "}}" }} foi OOMKilled"
            description: "Container {{ "{{" }} $labels.container {{ "}}" }} no pod {{ "{{" }} $labels.pod {{ "}}" }} (namespace {{ "{{" }} $labels.namespace {{ "}}" }}) foi terminado por falta de memória."
    # -------------------------------------------------------------------------
    # Node alerts
    # -------------------------------------------------------------------------
    - name: node-alerts
      rules:
        # CPU busy percentage = 100 - average idle percentage across the cores.
        - alert: NodeHighCPU
          expr: |
            100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node {{ "{{" }} $labels.instance {{ "}}" }} com CPU alta"
            description: "Node {{ "{{" }} $labels.instance {{ "}}" }} está usando {{ "{{" }} $value | humanize {{ "}}" }}% de CPU há mais de 5 minutos."
        # Less than 10% of total memory available.
        - alert: NodeLowMemory
          expr: |
            (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Node {{ "{{" }} $labels.instance {{ "}}" }} com pouca memória"
            description: "Node {{ "{{" }} $labels.instance {{ "}}" }} tem apenas {{ "{{" }} $value | humanizePercentage {{ "}}" }} de memória disponível."
        # Less than 15% free on any filesystem other than tmpfs/overlay.
        - alert: NodeDiskFull
          expr: |
            (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Disco do Node {{ "{{" }} $labels.instance {{ "}}" }} quase cheio"
            description: "Disco {{ "{{" }} $labels.mountpoint {{ "}}" }} no node {{ "{{" }} $labels.instance {{ "}}" }} tem apenas {{ "{{" }} $value | humanizePercentage {{ "}}" }} livre."
        # Ready condition has not been "true" for 5 minutes.
        - alert: NodeNotReady
          expr: |
            kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Node {{ "{{" }} $labels.node {{ "}}" }} não está Ready"
            description: "Node {{ "{{" }} $labels.node {{ "}}" }} não está no estado Ready há mais de 5 minutos."
    # -------------------------------------------------------------------------
    # Deployment alerts
    # -------------------------------------------------------------------------
    - name: deployment-alerts
      rules:
        # The expression yields (desired - available) so that $value really is the
        # number of missing replicas quoted in the description. A plain `!=`
        # comparison would expose the desired replica count as $value instead.
        - alert: DeploymentReplicasMismatch
          expr: |
            (kube_deployment_spec_replicas - kube_deployment_status_replicas_available) > 0
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Deployment {{ "{{" }} $labels.deployment {{ "}}" }} com réplicas inconsistentes"
            description: "Deployment {{ "{{" }} $labels.deployment {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} tem {{ "{{" }} $value {{ "}}" }} réplicas faltando."
        # At least one replica reported unavailable for 5 minutes.
        - alert: DeploymentUnavailable
          expr: |
            kube_deployment_status_replicas_unavailable > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Deployment {{ "{{" }} $labels.deployment {{ "}}" }} tem réplicas indisponíveis"
            description: "Deployment {{ "{{" }} $labels.deployment {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} tem {{ "{{" }} $value {{ "}}" }} réplicas indisponíveis."
    # -------------------------------------------------------------------------
    # PVC auto-resize alerts
    # -------------------------------------------------------------------------
    - name: autoresize-alerts
      rules:
        # Informational: the requested storage of a PVC grew in the last 10 minutes.
        # NOTE(review): any growth of the request matches, whoever changed it; the
        # description attributes it to pvc-autoresizer.
        - alert: PVCAutoResized
          expr: |
            increase(kube_persistentvolumeclaim_resource_requests_storage_bytes[10m]) > 0
          for: 0m
          labels:
            severity: info
          annotations:
            summary: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} foi expandido automaticamente"
            description: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} foi redimensionado pelo pvc-autoresizer."

View File

@@ -0,0 +1,192 @@
# =============================================================================
# Victoria Metrics K8s Stack - Values
# =============================================================================
# Documentation: https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack
# The top-level key below carries the same name as the chart dependency, so
# everything in this file is configuration for that dependency.
# =============================================================================
victoria-metrics-k8s-stack:
# ---------------------------------------------------------------------------
# VMSingle - metrics storage (single-node)
# ---------------------------------------------------------------------------
vmsingle:
enabled: true
spec:
# Metrics are kept for 14 days.
retentionPeriod: "14d"
# Persistent volume request: 10Gi, ReadWriteOnce, storage class "hcloud-volumes".
storage:
storageClassName: "hcloud-volumes"
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
# CPU / memory requests and limits.
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
# ---------------------------------------------------------------------------
# VMAgent - metrics collection
# ---------------------------------------------------------------------------
vmagent:
enabled: true
spec:
# Scrape interval set to 30 seconds.
scrapeInterval: "30s"
# CPU / memory requests and limits.
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
# ---------------------------------------------------------------------------
# VMAlert - alerting engine
# ---------------------------------------------------------------------------
vmalert:
enabled: true
spec:
# Alerts are evaluated but not delivered anywhere, because AlertManager is
# disabled in this file.
extraArgs:
"notifier.blackhole": "true" # Does not send alerts (no AlertManager)
# CPU / memory requests and limits.
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 100m
memory: 128Mi
# ---------------------------------------------------------------------------
# AlertManager - notifications (disabled by default)
# ---------------------------------------------------------------------------
alertmanager:
enabled: false
# ---------------------------------------------------------------------------
# Grafana - visualization
# ---------------------------------------------------------------------------
grafana:
enabled: true
# Credentials
adminUser: admin
# adminPassword is generated automatically when not specified
# NOTE(review): confirm the generated password stays stable across GitOps
# re-renders; otherwise reference a pre-created secret instead.
# Resources
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
# Persistence for dashboards and settings
persistence:
enabled: false # Dashboards come from GitOps, no need to persist
# Ingress: served at grafana.kube.quest through the nginx ingress class; the
# cert-manager annotation references the "letsencrypt-prod" cluster issuer and
# the TLS certificate is stored in the "grafana-tls" secret.
ingress:
enabled: true
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: letsencrypt-prod
hosts:
- grafana.kube.quest
tls:
- secretName: grafana-tls
hosts:
- grafana.kube.quest
# Sidecar automatically loads the dashboards shipped with the chart
sidecar:
dashboards:
enabled: true
# Disable creation of a separate datasource (the sidecar one is used)
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether this key belongs under `grafana` or at the stack level; also confirm
# that the pinned chart version actually reads a `grafanaDatasource` key.
grafanaDatasource:
enabled: false
# ---------------------------------------------------------------------------
# Kube State Metrics - metrics about Kubernetes objects
# ---------------------------------------------------------------------------
kube-state-metrics:
enabled: true
# CPU / memory requests and limits.
resources:
requests:
cpu: 10m
memory: 32Mi
limits:
cpu: 100m
memory: 128Mi
# ---------------------------------------------------------------------------
# Prometheus Node Exporter - node-level metrics
# ---------------------------------------------------------------------------
prometheus-node-exporter:
enabled: true
# CPU / memory requests and limits.
resources:
requests:
cpu: 10m
memory: 32Mi
limits:
cpu: 100m
memory: 64Mi
# ---------------------------------------------------------------------------
# Prometheus Operator CRDs
# ---------------------------------------------------------------------------
prometheus-operator-crds:
enabled: true
# ---------------------------------------------------------------------------
# VM Operator (manages the Victoria Metrics CRDs)
# ---------------------------------------------------------------------------
victoria-metrics-operator:
enabled: true
# CPU / memory requests and limits.
resources:
requests:
cpu: 50m
memory: 64Mi
limits:
cpu: 100m
memory: 128Mi
# ---------------------------------------------------------------------------
# Default rules - on/off switches for the rule groups listed below
# (the original header called these "default Service Monitors", but the keys
# sit under `defaultRules`)
# NOTE(review): confirm that the pinned chart version (0.28.4) still accepts
# boolean switches under `defaultRules.rules`; verify against that version's
# values.yaml.
# NOTE(review): the Talos-related entries below only switch rules off; check
# whether the matching scrape targets need to be disabled separately.
# ---------------------------------------------------------------------------
defaultRules:
create: true
rules:
alertmanager: false # AlertManager is disabled
etcd: false # No access to etcd on Talos
configReloaders: true
general: true
k8s: true
kubeApiserver: true
kubeApiserverAvailability: true
kubeApiserverBurnrate: true
kubeApiserverHistogram: true
kubeApiserverSlos: true
kubeControllerManager: false # Not reachable on Talos
kubelet: true
kubeProxy: false # Not reachable on Talos
kubePrometheusGeneral: true
kubePrometheusNodeRecording: true
kubernetesApps: true
kubernetesResources: true
kubernetesStorage: true
kubernetesSystem: true
kubeScheduler: false # Not reachable on Talos
kubeStateMetrics: true
network: true
node: true
nodeExporterAlerting: true
nodeExporterRecording: true
prometheus: true
prometheusOperator: true
vmcluster: false # Using vmsingle
vmagent: true
vmsingle: true