---
# VMRule - Latency and Error Alerts (APM)
# Based on metrics generated by the OpenTelemetry Collector spanmetrics connector.
# Metric: http_server_request_duration_seconds_bucket
#
# NOTE(review): every rule below aggregates by the `service` label, and the
# error-rate rule filters on `http_status_code`. This assumes both labels are
# present on the exported series - confirm against the collector configuration.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: latency-alerts
  namespace: monitoring
  labels:
    app: victoria-metrics
spec:
  groups:
    - name: apm.latency
      rules:
        # Doherty Threshold: a response time above 400ms degrades user
        # productivity. Fires when the per-service P95, computed from the
        # 5-minute bucket rate, stays above 0.4 seconds for 5 minutes.
        - alert: DohertyThresholdExceeded
          expr: |
            histogram_quantile(0.95,
              sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service)
            ) > 0.4
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "P95 latency above Doherty threshold (400ms) for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has P95 latency of {{ $value | humanizeDuration }}. The Doherty threshold (400ms) states that productivity degrades when response time exceeds this limit."

        # HTTP 5xx error rate above 1%. The ratio is the per-service rate of
        # requests whose status code matches "5.." divided by the per-service
        # rate of all requests, both over a 5-minute window.
        - alert: HighErrorRate
          expr: |
            (
              sum(rate(http_server_request_duration_seconds_count{http_status_code=~"5.."}[5m])) by (service)
              /
              sum(rate(http_server_request_duration_seconds_count[5m])) by (service)
            ) > 0.01
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Error rate above 1% for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has {{ $value | humanizePercentage }} error rate (5xx responses)."

        # P99 above 1 second - critical latency. Same histogram as the P95
        # rule, with a higher quantile, a higher threshold and critical severity.
        - alert: HighRequestLatencyP99
          expr: |
            histogram_quantile(0.99,
              sum(rate(http_server_request_duration_seconds_bucket[5m])) by (le, service)
            ) > 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "P99 latency above 1s for {{ $labels.service }}"
            description: "Service {{ $labels.service }} has P99 latency of {{ $value | humanizeDuration }}. This indicates severe performance degradation."