# Componentes:
# - tempo-values.yaml: Grafana Tempo monolithic, 256Mi, 10Gi PVC
# - otel-collector-values.yaml: recebe OTLP, exporta traces -> Tempo,
#   gera span metrics (RED) -> Victoria Metrics via spanmetrics connector
# - demo-app/: Node.js com rotas /fast (1 query), /slow (N+1, 51 queries),
#   /fixed (JOIN), auto-instrumentado com OpenTelemetry
# - alerts/latency-alerts.yaml: VMRule com Doherty threshold (p95 > 400ms)
# - setup.sh: instala Tempo, OTel Collector, configura Grafana datasource,
#   deploy demo app via ConfigMap (sem Docker build necessário)
# - cleanup.sh: remove apenas recursos da aula-15, preserva aula-12
# Zero hardcoded hostnames. Tudo via .env e placeholders.
---
# VMRule - Latency and Error Alerts (APM)
# Based on metrics generated by the OpenTelemetry Collector's spanmetrics connector
# Metric: http_server_request_duration_seconds_bucket
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: latency-alerts
  namespace: monitoring
  labels:
    app: victoria-metrics
spec:
  groups:
    - name: apm.latency
      rules:
        # Doherty Threshold: responses above 400ms degrade user productivity
        - alert: DohertyThresholdExceeded
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service)
            ) > 0.4
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "P95 latency above Doherty threshold (400ms) for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has P95 latency of {{ $value | humanizeDuration }}. The Doherty threshold (400ms) states that productivity degrades when response time exceeds this limit."

        # HTTP 5xx error rate above 1%
        - alert: HighErrorRate
          expr: |
            (
              sum(rate(http_server_request_duration_seconds_count{http_status_code=~"5.."}[5m])) by (service)
              /
              sum(rate(http_server_request_duration_seconds_count[5m])) by (service)
            ) > 0.01
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Error rate above 1% for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has {{ $value | humanizePercentage }} error rate (5xx responses)."

        # P99 above 1 second - critical latency
        - alert: HighRequestLatencyP99
          expr: |
            histogram_quantile(0.99,
              sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service)
            ) > 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "P99 latency above 1s for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has P99 latency of {{ $value | humanizeDuration }}. This indicates severe performance degradation."