aula-15: implementação completa APM (Tempo + OTel + demo app)
Componentes: - tempo-values.yaml: Grafana Tempo monolithic, 256Mi, 10Gi PVC - otel-collector-values.yaml: recebe OTLP, exporta traces→Tempo, gera span metrics (RED)→Victoria Metrics via spanmetrics connector - demo-app/: Node.js com rotas /fast (1 query), /slow (N+1, 51 queries), /fixed (JOIN), auto-instrumentado com OpenTelemetry - alerts/latency-alerts.yaml: VMRule com Doherty threshold (p95>400ms) - setup.sh: instala Tempo, OTel Collector, configura Grafana datasource, deploy demo app via ConfigMap (sem Docker build necessário) - cleanup.sh: remove apenas recursos da aula-15, preserva aula-12 Zero hardcoded hostnames. Tudo via .env e placeholders.
This commit is contained in:
55
aula-15/alerts/latency-alerts.yaml
Normal file
55
aula-15/alerts/latency-alerts.yaml
Normal file
@@ -0,0 +1,55 @@
---
# VMRule - Latency and Error Alerts (APM)
# Based on metrics generated by the OpenTelemetry Collector's spanmetrics connector.
# Metric: http_server_request_duration_seconds_bucket
# NOTE(review): the `http_status_code` label below follows older OTel semantic
# conventions; newer collector versions may emit `http_response_status_code` —
# verify against the collector's rendered metrics.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: latency-alerts
  namespace: monitoring
  labels:
    app: victoria-metrics
spec:
  groups:
    - name: apm.latency
      rules:
        # Doherty Threshold: responses above 400ms degrade user productivity.
        - alert: DohertyThresholdExceeded
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service)
            ) > 0.4
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "P95 latency above Doherty threshold (400ms) for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has P95 latency of {{ $value | humanizeDuration }}. The Doherty threshold (400ms) states that productivity degrades when response time exceeds this limit."

        # HTTP 5xx error rate above 1%.
        # If a service has zero traffic the denominator is 0 and the ratio is NaN,
        # which never matches `> 0.01` — the alert stays silent by design.
        - alert: HighErrorRate
          expr: |
            (
              sum(rate(http_server_request_duration_seconds_count{http_status_code=~"5.."}[5m])) by (service)
              /
              sum(rate(http_server_request_duration_seconds_count[5m])) by (service)
            ) > 0.01
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Error rate above 1% for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has {{ $value | humanizePercentage }} error rate (5xx responses)."

        # P99 above 1 second - critical latency.
        - alert: HighRequestLatencyP99
          expr: |
            histogram_quantile(0.99,
              sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service)
            ) > 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "P99 latency above 1s for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has P99 latency of {{ $value | humanizeDuration }}. This indicates severe performance degradation."
Reference in New Issue
Block a user