From 9b3168b996799cba1a02c8241dc85248d9064ddb Mon Sep 17 00:00:00 2001 From: ArgoCD Setup Date: Sat, 14 Mar 2026 02:30:35 -0300 Subject: [PATCH] =?UTF-8?q?aula-15:=20implementa=C3=A7=C3=A3o=20completa?= =?UTF-8?q?=20APM=20(Tempo=20+=20OTel=20+=20demo=20app)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Componentes: - tempo-values.yaml: Grafana Tempo monolithic, 256Mi, 10Gi PVC - otel-collector-values.yaml: recebe OTLP, exporta traces→Tempo, gera span metrics (RED)→Victoria Metrics via spanmetrics connector - demo-app/: Node.js com rotas /fast (1 query), /slow (N+1, 51 queries), /fixed (JOIN), auto-instrumentado com OpenTelemetry - alerts/latency-alerts.yaml: VMRule com Doherty threshold (p95>400ms) - setup.sh: instala Tempo, OTel Collector, configura Grafana datasource, deploy demo app via ConfigMap (sem Docker build necessário) - cleanup.sh: remove apenas recursos da aula-15, preserva aula-12 Zero hardcoded hostnames. Tudo via .env e placeholders. 
---
 aula-15/alerts/latency-alerts.yaml   |  55 ++++
 aula-15/cleanup.sh                   | 129 ++++++++
 aula-15/demo-app/Dockerfile          |  12 +
 aula-15/demo-app/app.js              | 230 +++++++++++++
 aula-15/demo-app/k8s/deployment.yaml |  64 ++++
 aula-15/demo-app/k8s/ingress.yaml    |  24 ++
 aula-15/demo-app/k8s/namespace.yaml  |   4 +
 aula-15/demo-app/k8s/postgresql.yaml |  84 +++++
 aula-15/demo-app/k8s/service.yaml    |  15 +
 aula-15/demo-app/package.json        |  17 +
 aula-15/demo-app/tracing.js          |  45 +++
 aula-15/otel-collector-values.yaml   |  88 +++++
 aula-15/setup.sh                     | 466 +++++++++++++++++++++++++++
 aula-15/tempo-values.yaml            |  61 ++++
 14 files changed, 1294 insertions(+)
 create mode 100644 aula-15/alerts/latency-alerts.yaml
 create mode 100755 aula-15/cleanup.sh
 create mode 100644 aula-15/demo-app/Dockerfile
 create mode 100644 aula-15/demo-app/app.js
 create mode 100644 aula-15/demo-app/k8s/deployment.yaml
 create mode 100644 aula-15/demo-app/k8s/ingress.yaml
 create mode 100644 aula-15/demo-app/k8s/namespace.yaml
 create mode 100644 aula-15/demo-app/k8s/postgresql.yaml
 create mode 100644 aula-15/demo-app/k8s/service.yaml
 create mode 100644 aula-15/demo-app/package.json
 create mode 100644 aula-15/demo-app/tracing.js
 create mode 100644 aula-15/otel-collector-values.yaml
 create mode 100755 aula-15/setup.sh
 create mode 100644 aula-15/tempo-values.yaml

diff --git a/aula-15/alerts/latency-alerts.yaml b/aula-15/alerts/latency-alerts.yaml
new file mode 100644
index 0000000..cb15add
--- /dev/null
+++ b/aula-15/alerts/latency-alerts.yaml
@@ -0,0 +1,55 @@
+# VMRule - latency and error alerts (APM)
+# Built on the metrics emitted by the OpenTelemetry Collector spanmetrics connector.
+# Names assume otel-collector-values.yaml keeps `namespace: ""` and the default ms unit: duration_milliseconds_*, calls_total.
+
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMRule
+metadata:
+  name: latency-alerts
+  namespace: monitoring
+  labels:
+    app: victoria-metrics
+spec:
+  groups:
+    - name: apm.latency
+      rules:
+        # Doherty threshold: responses slower than 400ms degrade user productivity (server spans only; /1000 converts ms to s)
+        - alert: DohertyThresholdExceeded
+          expr: |
+            histogram_quantile(0.95,
+              sum(rate(duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])) by (le, service_name)
+            ) / 1000 > 0.4
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "P95 latency above Doherty threshold (400ms) for {{ $labels.service_name }}"
+            description: "Service {{ $labels.service_name }} has P95 latency of {{ $value | humanizeDuration }}. The Doherty threshold (400ms) states that productivity degrades when response time exceeds this limit."
+
+        # HTTP 5xx error rate above 1% (http_status_code comes from the connector's http.status_code dimension)
+        - alert: HighErrorRate
+          expr: |
+            (
+              sum(rate(calls_total{span_kind="SPAN_KIND_SERVER", http_status_code=~"5.."}[5m])) by (service_name)
+              /
+              sum(rate(calls_total{span_kind="SPAN_KIND_SERVER"}[5m])) by (service_name)
+            ) > 0.01
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Error rate above 1% for {{ $labels.service_name }}"
+            description: "Service {{ $labels.service_name }} has {{ $value | humanizePercentage }} error rate (5xx responses)."
+
+        # P99 above 1 second - critical latency
+        - alert: HighRequestLatencyP99
+          expr: |
+            histogram_quantile(0.99,
+              sum(rate(duration_milliseconds_bucket{span_kind="SPAN_KIND_SERVER"}[5m])) by (le, service_name)
+            ) / 1000 > 1
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "P99 latency above 1s for {{ $labels.service_name }}"
+            description: "Service {{ $labels.service_name }} has P99 latency of {{ $value | humanizeDuration }}. This indicates severe performance degradation."
diff --git a/aula-15/cleanup.sh b/aula-15/cleanup.sh
new file mode 100755
index 0000000..dc674ea
--- /dev/null
+++ b/aula-15/cleanup.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# =============================================================================
+# Aula 15 - Cleanup (removes APM: Tempo + OTel + demo app)
+# =============================================================================
+#
+# Removes:
+#   - Demo app namespace (demo)
+#   - Grafana Tempo (Helm release)
+#   - OpenTelemetry Collector (Helm release)
+#   - Grafana datasource ConfigMap (Tempo)
+#   - Latency alerts
+#   - The .env file
+#
+# Does NOT remove:
+#   - Victoria Metrics / Grafana (owned by aula-12)
+#   - The monitoring namespace
+#
+# =============================================================================
+
+set -e
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
+log_success() { echo -e "${GREEN}[OK]${NC} $1"; }
+log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
+
+# Runs a best-effort cleanup command: logs $1 on success and $2 as a warning on
+# failure. A failing step never aborts the script, but it is no longer reported
+# as a success.
+try_step() {
+  local ok_msg="$1" fail_msg="$2"
+  shift 2
+  if "$@" 2>/dev/null; then
+    log_success "$ok_msg"
+  else
+    log_warn "$fail_msg"
+  fi
+}
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo ""
+echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
+echo -e "${CYAN} Cleanup - Aula 15 (APM)${NC}"
+echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
+echo ""
+
+log_warn "Isso vai remover:"
+echo " - Namespace demo (demo app + PostgreSQL + dados)"
+echo " - Grafana Tempo (Helm release)"
+echo " - OpenTelemetry Collector (Helm release)"
+echo " - Datasource Tempo no Grafana"
+echo " - Alertas de latencia"
+echo ""
+echo -e "${GREEN}NAO sera removido:${NC}"
+echo " - Victoria Metrics (aula-12)"
+echo " - Grafana (aula-12)"
+echo " - Namespace monitoring"
+echo ""
+# -r keeps backslashes in the typed answer literal.
+read -r -p "Continuar? (digite 'sim' para confirmar): " confirm
+
+if [ "$confirm" != "sim" ]; then
+  log_info "Operacao cancelada"
+  exit 0
+fi
+
+echo ""
+
+# -------------------------------------------------------------------------
+# 1. Remove the demo namespace (demo app + PostgreSQL)
+# -------------------------------------------------------------------------
+log_info "Removendo namespace demo..."
+if kubectl get namespace demo &> /dev/null; then
+  try_step "Namespace demo removido" \
+    "Falha ao remover namespace demo (verifique: kubectl get namespace demo)" \
+    kubectl delete namespace demo --timeout=60s
+else
+  log_info "Namespace demo nao encontrado"
+fi
+
+# -------------------------------------------------------------------------
+# 2. Remove Grafana Tempo
+# -------------------------------------------------------------------------
+log_info "Removendo Grafana Tempo..."
+if helm status tempo -n monitoring &> /dev/null; then
+  try_step "Tempo removido" "Falha ao remover Tempo (helm uninstall)" \
+    helm uninstall tempo -n monitoring --wait
+else
+  log_info "Tempo nao encontrado"
+fi
+
+# Remove the Tempo PVC
+kubectl delete pvc -n monitoring -l app.kubernetes.io/name=tempo --wait=false 2>/dev/null || true
+
+# -------------------------------------------------------------------------
+# 3. Remove the OpenTelemetry Collector
+# -------------------------------------------------------------------------
+log_info "Removendo OpenTelemetry Collector..."
+if helm status otel-collector -n monitoring &> /dev/null; then
+  try_step "OTel Collector removido" "Falha ao remover OTel Collector (helm uninstall)" \
+    helm uninstall otel-collector -n monitoring --wait
+else
+  log_info "OTel Collector nao encontrado"
+fi
+
+# -------------------------------------------------------------------------
+# 4. Remove the Tempo datasource ConfigMap from Grafana
+# -------------------------------------------------------------------------
+log_info "Removendo datasource Tempo do Grafana..."
+try_step "Datasource removido" "Falha ao remover ConfigMap grafana-datasource-tempo" \
+  kubectl delete configmap grafana-datasource-tempo -n monitoring --ignore-not-found
+
+# Restart Grafana so it drops the datasource
+log_info "Reiniciando Grafana..."
+kubectl rollout restart deployment -n monitoring -l app.kubernetes.io/name=grafana 2>/dev/null || true
+
+# -------------------------------------------------------------------------
+# 5. Remove the latency alerts
+# -------------------------------------------------------------------------
+log_info "Removendo alertas de latencia..."
+try_step "Alertas removidos" "Falha ao remover alertas de latencia" \
+  kubectl delete -f "${SCRIPT_DIR}/alerts/latency-alerts.yaml" -n monitoring --ignore-not-found
+
+# -------------------------------------------------------------------------
+# 6. Remove .env
+# -------------------------------------------------------------------------
+if [[ -f "${SCRIPT_DIR}/.env" ]]; then
+  rm "${SCRIPT_DIR}/.env"
+  log_info ".env removido"
+fi
+
+echo ""
+log_success "Cleanup concluido!"
+echo ""
+echo "O namespace monitoring e Victoria Metrics + Grafana foram mantidos."
+echo "Para remover tudo, execute o cleanup da aula-12."
+echo ""
diff --git a/aula-15/demo-app/Dockerfile b/aula-15/demo-app/Dockerfile
new file mode 100644
index 0000000..e12d1ef
--- /dev/null
+++ b/aula-15/demo-app/Dockerfile
@@ -0,0 +1,12 @@
+FROM node:20-alpine
+
+WORKDIR /app
+
+COPY package.json package-lock.json* ./
+RUN npm install --production
+
+COPY . .
+
+EXPOSE 3000
+
+CMD ["node", "--require", "./tracing.js", "app.js"]
diff --git a/aula-15/demo-app/app.js b/aula-15/demo-app/app.js
new file mode 100644
index 0000000..518ce28
--- /dev/null
+++ b/aula-15/demo-app/app.js
@@ -0,0 +1,248 @@
+'use strict';
+
+const express = require('express');
+const { Pool } = require('pg');
+
+const app = express();
+const PORT = process.env.PORT || 3000;
+
+const pool = new Pool({
+  host: process.env.PG_HOST || 'localhost',
+  port: parseInt(process.env.PG_PORT || '5432', 10),
+  user: process.env.PG_USER || 'demo',
+  password: process.env.PG_PASSWORD || 'demo',
+  database: process.env.PG_DATABASE || 'demo',
+});
+
+// The pool emits 'error' for idle clients whose connection is lost (for example
+// when PostgreSQL restarts). An 'error' event without a listener is thrown by
+// Node.js and would take the whole process down, so log it instead.
+pool.on('error', (err) => {
+  console.error('Unexpected PostgreSQL pool error:', err.message);
+});
+
+// --- Database seeding ---
+
+// Creates the users/posts tables and inserts 50 users with 10 posts each, unless
+// the users table already has rows.
+async function seedDatabase() {
+  const client = await pool.connect();
+  try {
+    const tableCheck = await client.query(`
+      SELECT EXISTS (
+        SELECT FROM information_schema.tables WHERE table_name = 'users'
+      )
+    `);
+
+    if (tableCheck.rows[0].exists) {
+      const countResult = await client.query('SELECT COUNT(*) FROM users');
+      if (parseInt(countResult.rows[0].count, 10) > 0) {
+        console.log('Database already seeded, skipping.');
+        return;
+      }
+    }
+
+    console.log('Seeding database...');
+
+    // Seed inside one transaction: an interrupted seed must not leave a partial
+    // data set behind that the "already seeded" check would accept next start.
+    await client.query('BEGIN');
+    try {
+      await client.query(`
+        CREATE TABLE IF NOT EXISTS users (
+          id SERIAL PRIMARY KEY,
+          name VARCHAR(100) NOT NULL,
+          email VARCHAR(150) NOT NULL,
+          created_at TIMESTAMP DEFAULT NOW()
+        )
+      `);
+
+      await client.query(`
+        CREATE TABLE IF NOT EXISTS posts (
+          id SERIAL PRIMARY KEY,
+          user_id INTEGER REFERENCES users(id),
+          title VARCHAR(200) NOT NULL,
+          body TEXT NOT NULL,
+          created_at TIMESTAMP DEFAULT NOW()
+        )
+      `);
+
+      // Insert 50 users with 10 posts each (500 posts total)
+      for (let i = 1; i <= 50; i++) {
+        const inserted = await client.query(
+          'INSERT INTO users (name, email) VALUES ($1, $2) RETURNING id',
+          [`User ${i}`, `user${i}@example.com`]
+        );
+        // Reference the generated id rather than assuming the sequence starts at 1.
+        const userId = inserted.rows[0].id;
+
+        for (let p = 1; p <= 10; p++) {
+          await client.query(
+            'INSERT INTO posts (user_id, title, body) VALUES ($1, $2, $3)',
+            [userId, `Post ${p} by User ${i}`, `Content of post ${p} by user ${i}. Lorem ipsum dolor sit amet.`]
+          );
+        }
+      }
+
+      await client.query('COMMIT');
+    } catch (err) {
+      await client.query('ROLLBACK');
+      throw err;
+    }
+
+    console.log('Database seeded: 50 users, 500 posts.');
+  } finally {
+    client.release();
+  }
+}
+
+// --- Routes ---
+
+app.get('/health', (_req, res) => {
+  res.json({ status: 'ok' });
+});
+
+// Fast route: single query, returns 10 users
+app.get('/fast', async (_req, res) => {
+  const start = Date.now();
+  try {
+    const result = await pool.query('SELECT * FROM users LIMIT 10');
+    const duration = Date.now() - start;
+    res.json({
+      route: '/fast',
+      description: 'Single query - SELECT users LIMIT 10',
+      query_count: 1,
+      duration_ms: duration,
+      data: result.rows,
+    });
+  } catch (err) {
+    res.status(500).json({ error: err.message });
+  }
+});
+
+// Slow route: N+1 query pattern
+app.get('/slow', async (_req, res) => {
+  const start = Date.now();
+  try {
+    const usersResult = await pool.query('SELECT * FROM users');
+    const users = usersResult.rows;
+    let queryCount = 1;
+
+    const usersWithPosts = [];
+    for (const user of users) {
+      const postsResult = await pool.query(
+        'SELECT * FROM posts WHERE user_id = $1',
+        [user.id]
+      );
+      queryCount++;
+      usersWithPosts.push({
+        ...user,
+        posts: postsResult.rows,
+      });
+    }
+
+    const duration = Date.now() - start;
+    res.json({
+      route: '/slow',
+      description: 'N+1 pattern - 1 query for users + 1 query per user for posts',
+      query_count: queryCount,
+      user_count: users.length,
+      total_posts: usersWithPosts.reduce((sum, u) => sum + u.posts.length, 0),
+      duration_ms: duration,
+      data: usersWithPosts,
+    });
+  } catch (err) {
+    res.status(500).json({ error: err.message });
+  }
+});
+
+// Fixed route: single JOIN query
+app.get('/fixed', async (_req, res) => {
+  const start = Date.now();
+  try {
+    const result = await pool.query(`
+      SELECT
+        u.id AS user_id,
+        u.name,
+        u.email,
+        u.created_at AS user_created_at,
+        p.id AS post_id,
+        p.title,
+        p.body,
+        p.created_at AS post_created_at
+      FROM users u
+      LEFT JOIN posts p ON p.user_id = u.id
+      ORDER BY u.id, p.id
+    `);
+
+    // Group results by user
+    const usersMap = new Map();
+    for (const row of result.rows) {
+      if (!usersMap.has(row.user_id)) {
+        usersMap.set(row.user_id, {
+          id: row.user_id,
+          name: row.name,
+          email: row.email,
+          created_at: row.user_created_at,
+          posts: [],
+        });
+      }
+      if (row.post_id) {
+        usersMap.get(row.user_id).posts.push({
+          id: row.post_id,
+          title: row.title,
+          body: row.body,
+          created_at: row.post_created_at,
+        });
+      }
+    }
+
+    const usersWithPosts = Array.from(usersMap.values());
+    const duration = Date.now() - start;
+
+    res.json({
+      route: '/fixed',
+      description: 'Single JOIN query - the correct way',
+      query_count: 1,
+      user_count: usersWithPosts.length,
+      total_posts: usersWithPosts.reduce((sum, u) => sum + u.posts.length, 0),
+      duration_ms: duration,
+      data: usersWithPosts,
+    });
+  } catch (err) {
+    res.status(500).json({ error: err.message });
+  }
+});
+
+// --- Startup ---
+
+async function main() {
+  // Wait for PostgreSQL to be ready (with retries)
+  let retries = 10;
+  while (retries > 0) {
+    try {
+      await pool.query('SELECT 1');
+      console.log('Connected to PostgreSQL.');
+      break;
+    } catch (err) {
+      retries--;
+      if (retries === 0) {
+        console.error('Failed to connect to PostgreSQL after retries:', err.message);
+        process.exit(1);
+      }
+      console.log(`Waiting for PostgreSQL... (${retries} retries left)`);
+      await new Promise((resolve) => setTimeout(resolve, 3000));
+    }
+  }
+
+  await seedDatabase();
+
+  app.listen(PORT, () => {
+    console.log(`Demo app listening on port ${PORT}`);
+  });
+}
+
+main().catch((err) => {
+  console.error('Fatal error:', err);
+  process.exit(1);
+});
diff --git a/aula-15/demo-app/k8s/deployment.yaml b/aula-15/demo-app/k8s/deployment.yaml
new file mode 100644
index 0000000..2bf80f8
--- /dev/null
+++ b/aula-15/demo-app/k8s/deployment.yaml
@@ -0,0 +1,64 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: demo-app
+  namespace: demo
+  labels:
+    app: demo-app
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: demo-app
+  template:
+    metadata:
+      labels:
+        app: demo-app
+    spec:
+      containers:
+        - name: demo-app
+          image: REGISTRY_PLACEHOLDER/IMAGE_NAME_PLACEHOLDER:latest
+          ports:
+            - containerPort: 3000
+          env:
+            - name: PG_HOST
+              value: demo-postgresql
+            - name: PG_PORT
+              value: "5432"
+            - name: PG_USER
+              value: demo
+            - name: PG_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: demo-postgresql
+                  key: POSTGRES_PASSWORD
+            - name: PG_DATABASE
+              value: demo
+            - name: OTEL_EXPORTER_OTLP_ENDPOINT
+              value: http://otel-collector-opentelemetry-collector.monitoring:4317
+            - name: OTEL_SERVICE_NAME
+              value: demo-app
+            - name: NODE_OPTIONS
+              value: "--require ./tracing.js"
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "128Mi"
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 3000
+            initialDelaySeconds: 10
+            periodSeconds: 10
+            timeoutSeconds: 3
+            failureThreshold: 3
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 3000
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            timeoutSeconds: 3
+            failureThreshold: 3
diff --git a/aula-15/demo-app/k8s/ingress.yaml b/aula-15/demo-app/k8s/ingress.yaml
new file mode 100644
index 0000000..7f6a7bb
--- /dev/null
+++ b/aula-15/demo-app/k8s/ingress.yaml
@@ -0,0 +1,24 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: demo-app
+  namespace: demo
+  annotations:
+    cert-manager.io/cluster-issuer: CLUSTER_ISSUER_PLACEHOLDER
+spec:
+  ingressClassName: nginx
+  tls:
+    - hosts:
+        - DEMO_HOST_PLACEHOLDER
+      secretName: demo-app-tls
+  rules:
+    - host: DEMO_HOST_PLACEHOLDER
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: demo-app
+                port:
+                  number: 3000
diff --git a/aula-15/demo-app/k8s/namespace.yaml b/aula-15/demo-app/k8s/namespace.yaml
new file mode 100644
index 0000000..18434a6
--- /dev/null
+++ b/aula-15/demo-app/k8s/namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: demo
diff --git a/aula-15/demo-app/k8s/postgresql.yaml b/aula-15/demo-app/k8s/postgresql.yaml
new file mode 100644
index 0000000..c3d4105
--- /dev/null
+++ b/aula-15/demo-app/k8s/postgresql.yaml
@@ -0,0 +1,78 @@
+---
+# The demo-postgresql Secret (key POSTGRES_PASSWORD) is intentionally not declared
+# here: setup.sh creates it with a generated password before applying this file,
+# and a Secret declared in this manifest would overwrite that password on apply.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: demo-postgresql-data
+  namespace: demo
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: demo-postgresql
+  namespace: demo
+  labels:
+    app: demo-postgresql
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: demo-postgresql
+  template:
+    metadata:
+      labels:
+        app: demo-postgresql
+    spec:
+      containers:
+        - name: postgresql
+          image: postgres:17-alpine
+          ports:
+            - containerPort: 5432
+          env:
+            - name: POSTGRES_DB
+              value: demo
+            - name: POSTGRES_USER
+              value: demo
+            - name: POSTGRES_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: demo-postgresql
+                  key: POSTGRES_PASSWORD
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "128Mi"
+          volumeMounts:
+            - name: data
+              mountPath: /var/lib/postgresql/data
+              subPath: pgdata
+      volumes:
+        - name: data
+          persistentVolumeClaim:
+            claimName: demo-postgresql-data
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name:
demo-postgresql
+  namespace: demo
+  labels:
+    app: demo-postgresql
+spec:
+  type: ClusterIP
+  ports:
+    - port: 5432
+      targetPort: 5432
+      protocol: TCP
+  selector:
+    app: demo-postgresql
diff --git a/aula-15/demo-app/k8s/service.yaml b/aula-15/demo-app/k8s/service.yaml
new file mode 100644
index 0000000..fe58e2c
--- /dev/null
+++ b/aula-15/demo-app/k8s/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: demo-app
+  namespace: demo
+  labels:
+    app: demo-app
+spec:
+  type: ClusterIP
+  ports:
+    - port: 3000
+      targetPort: 3000
+      protocol: TCP
+  selector:
+    app: demo-app
diff --git a/aula-15/demo-app/package.json b/aula-15/demo-app/package.json
new file mode 100644
index 0000000..ba05d39
--- /dev/null
+++ b/aula-15/demo-app/package.json
@@ -0,0 +1,17 @@
+{
+  "name": "demo-app",
+  "version": "1.0.0",
+  "description": "Demo app for APM with OpenTelemetry - Workshop Aula 15",
+  "main": "app.js",
+  "scripts": {
+    "start": "node --require ./tracing.js app.js"
+  },
+  "dependencies": {
+    "express": "^4.21.2",
+    "pg": "^8.13.1",
+    "@opentelemetry/sdk-node": "^0.57.2",
+    "@opentelemetry/auto-instrumentations-node": "^0.56.1",
+    "@opentelemetry/exporter-trace-otlp-grpc": "^0.57.2",
+    "@opentelemetry/exporter-metrics-otlp-grpc": "^0.57.2"
+  }
+}
diff --git a/aula-15/demo-app/tracing.js b/aula-15/demo-app/tracing.js
new file mode 100644
index 0000000..0127691
--- /dev/null
+++ b/aula-15/demo-app/tracing.js
@@ -0,0 +1,54 @@
+'use strict';
+
+// OpenTelemetry bootstrap. Preloaded with `node --require ./tracing.js app.js`
+// (see the Dockerfile CMD and the package.json start script).
+
+const { NodeSDK, metrics } = require('@opentelemetry/sdk-node');
+const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
+const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc');
+const { OTLPMetricExporter } = require('@opentelemetry/exporter-metrics-otlp-grpc');
+
+// Taken from the sdk-node re-export: @opentelemetry/sdk-metrics is not declared in
+// package.json, so requiring it directly depended on an undeclared transitive package.
+const { PeriodicExportingMetricReader } = metrics;
+
+// The fallback matches the collector Service name used by the Kubernetes manifests
+// (Helm release "otel-collector" of the opentelemetry-collector chart).
+const otlpEndpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://otel-collector-opentelemetry-collector.monitoring:4317';
+const serviceName = process.env.OTEL_SERVICE_NAME || 'demo-app';
+
+const traceExporter = new OTLPTraceExporter({
+  url: otlpEndpoint,
+});
+
+const metricExporter = new OTLPMetricExporter({
+  url: otlpEndpoint,
+});
+
+const metricReader = new PeriodicExportingMetricReader({
+  exporter: metricExporter,
+  exportIntervalMillis: 15000,
+});
+
+const sdk = new NodeSDK({
+  serviceName,
+  traceExporter,
+  metricReader,
+  instrumentations: [
+    getNodeAutoInstrumentations({
+      '@opentelemetry/instrumentation-fs': { enabled: false },
+    }),
+  ],
+});
+
+sdk.start();
+
+// Shut the SDK down on SIGTERM, then exit with status 0 whatever the outcome.
+process.on('SIGTERM', () => {
+  sdk.shutdown()
+    .then(() => console.log('OpenTelemetry SDK shut down'))
+    .catch((err) => console.error('Error shutting down OpenTelemetry SDK', err))
+    .finally(() => process.exit(0));
+});
+
+console.log(`OpenTelemetry initialized for service "${serviceName}" -> ${otlpEndpoint}`);
diff --git a/aula-15/otel-collector-values.yaml b/aula-15/otel-collector-values.yaml
new file mode 100644
index 0000000..357a031
--- /dev/null
+++ b/aula-15/otel-collector-values.yaml
@@ -0,0 +1,88 @@
+# OpenTelemetry Collector
+# Chart: open-telemetry/opentelemetry-collector
+# Receives telemetry from the apps and forwards it to Tempo (traces) and Victoria Metrics (metrics)
+
+mode: deployment
+
+replicaCount: 1
+
+image:
+  repository: otel/opentelemetry-collector-contrib
+
+resources:
+  requests:
+    cpu: 50m
+    memory: 128Mi
+  limits:
+    cpu: 200m
+    memory: 256Mi
+
+# Only expose OTLP ports (disable jaeger, zipkin)
+ports:
+  otlp:
+    enabled: true
+    containerPort: 4317
+    servicePort: 4317
+    protocol: TCP
+    appProtocol: grpc
+  otlp-http:
+    enabled: true
+    containerPort: 4318
+    servicePort: 4318
+    protocol: TCP
+  jaeger-compact:
+    enabled: false
+  jaeger-thrift:
+    enabled: false
+  jaeger-grpc:
+    enabled: false
+  zipkin:
+    enabled: false
+
+config:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: "0.0.0.0:4317"
+        http:
+          endpoint: "0.0.0.0:4318"
+
+  connectors:
+    spanmetrics:
+      histogram:
+        explicit:
+          buckets:
[5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s] + dimensions: + - name: http.method + - name: http.route + - name: http.status_code + namespace: "" + + exporters: + otlp/tempo: + endpoint: "tempo.monitoring:4317" + tls: + insecure: true + prometheusremotewrite: + endpoint: "http://vmsingle-monitoring-victoria-metrics-k8s-stack.monitoring:8429/api/v1/write" + + processors: + batch: + timeout: 5s + send_batch_size: 1024 + memory_limiter: + check_interval: 5s + limit_percentage: 80 + spike_limit_percentage: 25 + + service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp/tempo, spanmetrics] + metrics/spanmetrics: + receivers: [spanmetrics] + processors: [batch] + exporters: [prometheusremotewrite] diff --git a/aula-15/setup.sh b/aula-15/setup.sh new file mode 100755 index 0000000..01d7ee8 --- /dev/null +++ b/aula-15/setup.sh @@ -0,0 +1,466 @@ +#!/bin/bash +# ============================================================================= +# Aula 15 - APM com Grafana Tempo + OpenTelemetry +# ============================================================================= +# +# Este script instala: +# 1. Grafana Tempo (backend de distributed tracing) +# 2. OpenTelemetry Collector (recebe e encaminha telemetria) +# 3. Datasource do Tempo no Grafana +# 4. Demo app instrumentada (Express + PostgreSQL + OTel) +# 5. 
Alertas de latencia (Doherty threshold) +# +# Pre-requisitos: +# - Cluster Kubernetes (aula-08) +# - kubectl e helm instalados +# - Monitoring namespace com Victoria Metrics + Grafana (aula-12) +# +# ============================================================================= + +set -e + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' + +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e "${GREEN}[OK]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENV_FILE="${SCRIPT_DIR}/.env" + +# ============================================================================= +# VERIFICAR PRE-REQUISITOS +# ============================================================================= + +log_info "Verificando pre-requisitos..." + +if ! command -v kubectl &> /dev/null; then + log_error "kubectl nao encontrado. Instale com: brew install kubectl" + exit 1 +fi + +if ! command -v helm &> /dev/null; then + log_error "helm nao encontrado. Instale com: brew install helm" + exit 1 +fi + +if ! kubectl cluster-info &> /dev/null; then + log_error "Nao foi possivel conectar ao cluster Kubernetes" + log_info "Verifique se KUBECONFIG esta configurado corretamente" + exit 1 +fi + +if ! kubectl get namespace monitoring &> /dev/null; then + log_error "Namespace 'monitoring' nao encontrado" + log_info "Execute primeiro a aula-12 para instalar Victoria Metrics + Grafana" + exit 1 +fi + +log_success "Pre-requisitos verificados" + +# ============================================================================= +# CARREGAR CONFIGURACAO +# ============================================================================= + +if [[ -f "$ENV_FILE" ]]; then + log_info "Carregando configuracao local..." 
+ source "$ENV_FILE" +fi + +# Herdar configuracao da aula-12 ou aula-10 +for ENV_SRC in "${SCRIPT_DIR}/../aula-12/.env" "${SCRIPT_DIR}/../aula-10/.env"; do + if [[ -f "$ENV_SRC" && -z "$DOMAIN" ]]; then + log_info "Herdando configuracao de $(basename $(dirname $ENV_SRC))..." + source "$ENV_SRC" + fi +done + +# ============================================================================= +# COLETAR CONFIGURACAO +# ============================================================================= + +echo "" +echo -e "${CYAN}═══════════════════════════════════════════════════${NC}" +echo -e "${CYAN} APM - Grafana Tempo + OpenTelemetry${NC}" +echo -e "${CYAN}═══════════════════════════════════════════════════${NC}" +echo "" + +# Dominio +if [[ -z "$DOMAIN" ]]; then + read -p "Dominio base (ex: kube.quest): " DOMAIN +fi +log_info "Dominio: ${DOMAIN}" + +# Demo host +if [[ -z "$DEMO_HOST" ]]; then + DEFAULT_DEMO="demo.${DOMAIN}" + read -p "Hostname da demo app [${DEFAULT_DEMO}]: " DEMO_HOST + DEMO_HOST="${DEMO_HOST:-$DEFAULT_DEMO}" +fi +log_info "Demo app: ${DEMO_HOST}" + +# TLS (herdar ou perguntar) +if [[ -z "$USE_CLOUDFLARE" && -z "$USE_LETSENCRYPT" ]]; then + echo "" + echo "Configuracao de TLS:" + echo " 1) Let's Encrypt (recomendado)" + echo " 2) CloudFlare (proxy)" + echo " 3) HTTP apenas" + read -p "Escolha [1-3]: " TLS_CHOICE + case $TLS_CHOICE in + 1) USE_CLOUDFLARE=false; USE_LETSENCRYPT=true ;; + 2) USE_CLOUDFLARE=true; USE_LETSENCRYPT=false ;; + *) USE_CLOUDFLARE=false; USE_LETSENCRYPT=false ;; + esac +fi + +# Senha do PostgreSQL da demo +if [[ -z "$DEMO_DB_PASSWORD" ]]; then + DEMO_DB_PASSWORD=$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24) + log_info "Senha PostgreSQL da demo gerada automaticamente" +fi + +# Salvar configuracao +cat > "$ENV_FILE" << EOF +# Configuracao gerada pelo setup.sh - $(date) +DOMAIN=${DOMAIN} +DEMO_HOST=${DEMO_HOST} +DEMO_DB_PASSWORD=${DEMO_DB_PASSWORD} +USE_CLOUDFLARE=${USE_CLOUDFLARE} +USE_LETSENCRYPT=${USE_LETSENCRYPT} 
+EOF + +log_success "Configuracao salva" + +# ============================================================================= +# INSTALAR GRAFANA TEMPO +# ============================================================================= + +echo "" +log_info "=== Instalando Grafana Tempo ===" + +helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true +helm repo update grafana + +if helm status tempo -n monitoring &> /dev/null; then + log_warn "Tempo ja instalado. Atualizando..." + TEMPO_CMD="upgrade" +else + TEMPO_CMD="install" +fi + +helm ${TEMPO_CMD} tempo grafana/tempo \ + --namespace monitoring \ + -f "${SCRIPT_DIR}/tempo-values.yaml" \ + --wait \ + --timeout 5m + +log_success "Grafana Tempo instalado!" + +# ============================================================================= +# INSTALAR OPENTELEMETRY COLLECTOR +# ============================================================================= + +echo "" +log_info "=== Instalando OpenTelemetry Collector ===" + +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts 2>/dev/null || true +helm repo update open-telemetry + +if helm status otel-collector -n monitoring &> /dev/null; then + log_warn "OTel Collector ja instalado. Atualizando..." + OTEL_CMD="upgrade" +else + OTEL_CMD="install" +fi + +helm ${OTEL_CMD} otel-collector open-telemetry/opentelemetry-collector \ + --namespace monitoring \ + -f "${SCRIPT_DIR}/otel-collector-values.yaml" \ + --wait \ + --timeout 5m + +log_success "OpenTelemetry Collector instalado!" 
+ +# ============================================================================= +# CONFIGURAR DATASOURCE DO TEMPO NO GRAFANA +# ============================================================================= + +echo "" +log_info "=== Configurando datasource do Tempo no Grafana ===" + +kubectl apply -f - <<'DATASOURCE_EOF' +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasource-tempo + namespace: monitoring + labels: + grafana_datasource: "1" +data: + tempo-datasource.yaml: | + apiVersion: 1 + datasources: + - name: Tempo + type: tempo + url: http://tempo.monitoring:3100 + access: proxy + isDefault: false + jsonData: + tracesToMetrics: + datasourceUid: victoriametrics + tags: [{key: "http.route"}] + serviceMap: + datasourceUid: victoriametrics + nodeGraph: + enabled: true + tracesToLogs: + datasourceUid: "" +DATASOURCE_EOF + +log_success "Datasource configurado" + +# Reiniciar Grafana para carregar o novo datasource +log_info "Reiniciando Grafana para carregar datasource..." 
+kubectl rollout restart deployment -n monitoring -l app.kubernetes.io/name=grafana 2>/dev/null || true +kubectl rollout status deployment -n monitoring -l app.kubernetes.io/name=grafana --timeout=120s 2>/dev/null || true + +log_success "Grafana reiniciado" + +# ============================================================================= +# DEPLOY DA DEMO APP +# ============================================================================= + +echo "" +log_info "=== Deploy da Demo App ===" + +# Criar namespace +kubectl apply -f "${SCRIPT_DIR}/demo-app/k8s/namespace.yaml" +log_success "Namespace demo criado" + +# Criar secret do PostgreSQL com senha gerada +kubectl create secret generic demo-postgresql \ + --namespace demo \ + --from-literal=POSTGRES_PASSWORD="${DEMO_DB_PASSWORD}" \ + --dry-run=client -o yaml | kubectl apply -f - +log_success "Secret demo-postgresql criado" + +# Deploy do PostgreSQL +kubectl apply -f "${SCRIPT_DIR}/demo-app/k8s/postgresql.yaml" +log_info "Aguardando PostgreSQL ficar pronto..." 
+# Wait (best-effort) for PostgreSQL. A timeout does not abort the script, but
+# success is only reported when the Deployment really became available.
+if kubectl wait --for=condition=available deployment/demo-postgresql -n demo --timeout=120s 2>/dev/null; then
+    log_success "PostgreSQL pronto"
+else
+    log_info "AVISO: PostgreSQL nao ficou pronto em 120s; continuando (verifique: kubectl get pods -n demo)"
+fi
+
+# Create the ConfigMap holding the app source (app.js, tracing.js, package.json)
+kubectl create configmap demo-app-code \
+    --from-file=app.js="${SCRIPT_DIR}/demo-app/app.js" \
+    --from-file=tracing.js="${SCRIPT_DIR}/demo-app/tracing.js" \
+    --from-file=package.json="${SCRIPT_DIR}/demo-app/package.json" \
+    --namespace demo \
+    --dry-run=client -o yaml | kubectl apply -f -
+log_success "ConfigMap demo-app-code criado"
+
+# Deploy the app using node:20-alpine: an init container copies the code from
+# the ConfigMap into a shared emptyDir and runs npm install there; the main
+# container then runs the app from that directory.
+# The heredoc delimiter is quoted, so nothing inside is expanded by the shell.
+kubectl apply -f - <<'APP_DEPLOY_EOF'
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: demo-app
+  namespace: demo
+  labels:
+    app: demo-app
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: demo-app
+  template:
+    metadata:
+      labels:
+        app: demo-app
+    spec:
+      initContainers:
+        - name: install-deps
+          image: node:20-alpine
+          # --omit=dev replaces the deprecated --production flag.
+          command: ["sh", "-c", "cp /code/* /app/ && cd /app && npm install --omit=dev"]
+          volumeMounts:
+            - name: code
+              mountPath: /code
+              readOnly: true
+            - name: app
+              mountPath: /app
+      containers:
+        - name: demo-app
+          image: node:20-alpine
+          # tracing.js is preloaded via --require before app.js starts.
+          command: ["node", "--require", "./tracing.js", "app.js"]
+          workingDir: /app
+          ports:
+            - containerPort: 3000
+          env:
+            - name: PG_HOST
+              value: demo-postgresql
+            - name: PG_PORT
+              value: "5432"
+            - name: PG_USER
+              value: demo
+            - name: PG_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: demo-postgresql
+                  key: POSTGRES_PASSWORD
+            - name: PG_DATABASE
+              value: demo
+            - name: OTEL_EXPORTER_OTLP_ENDPOINT
+              value: http://otel-collector-opentelemetry-collector.monitoring:4317
+            - name: OTEL_SERVICE_NAME
+              value: demo-app
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "256Mi"
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: 3000
+            initialDelaySeconds: 30
+            periodSeconds: 10
+            timeoutSeconds: 3
+            failureThreshold: 3
+          readinessProbe:
+            httpGet:
+              path: /health
+              port: 3000
+            initialDelaySeconds: 15
+            periodSeconds: 5
+            timeoutSeconds: 3
+            failureThreshold: 3
+          volumeMounts:
+            - name: app
+              mountPath: /app
+      volumes:
+        - name: code
+          configMap:
+            name: demo-app-code
+        - name: app
+          emptyDir: {}
+APP_DEPLOY_EOF
+
+log_success "Deployment demo-app criado"
+
+# Deploy the Service
+kubectl apply -f "${SCRIPT_DIR}/demo-app/k8s/service.yaml"
+log_success "Service demo-app criado"
+
+# Deploy the Ingress, rewriting its placeholders on the fly.
+# All edits are collected as sed expressions and streamed straight into
+# kubectl, so no temporary file (or sed -i backup) is left behind if the apply
+# fails. Expressions run in the order they are appended.
+INGRESS_SED_ARGS=(-e "s/DEMO_HOST_PLACEHOLDER/${DEMO_HOST}/g")
+
+if [[ "$USE_LETSENCRYPT" == "true" ]]; then
+    INGRESS_SED_ARGS+=(-e "s/CLUSTER_ISSUER_PLACEHOLDER/letsencrypt-prod/g")
+else
+    # Drop the cert-manager annotation
+    INGRESS_SED_ARGS+=(-e '/cert-manager.io\/cluster-issuer/d')
+fi
+
+if [[ "$USE_CLOUDFLARE" != "true" && "$USE_LETSENCRYPT" != "true" ]]; then
+    # Drop the TLS block (from the "tls:" line through "secretName:")
+    INGRESS_SED_ARGS+=(-e '/tls:/,/secretName:/d')
+fi
+
+sed "${INGRESS_SED_ARGS[@]}" "${SCRIPT_DIR}/demo-app/k8s/ingress.yaml" | kubectl apply -f -
+log_success "Ingress demo-app criado"
+
+# Wait for demo-app
+log_info "Aguardando demo-app ficar pronta..."
+# Best-effort wait for demo-app: a timeout does not abort the script, but the
+# outcome is reported instead of being silently discarded.
+if kubectl wait --for=condition=available deployment/demo-app -n demo --timeout=180s 2>/dev/null; then
+    log_success "demo-app pronta"
+else
+    log_info "AVISO: demo-app nao ficou pronta em 180s; continuando (verifique: kubectl get pods -n demo)"
+fi
+
+# =============================================================================
+# APPLY ALERTS
+# =============================================================================
+
+echo ""
+log_info "=== Aplicando alertas de latencia ==="
+
+# Applying the alert rules stays optional, but success is only logged when
+# kubectl actually accepted the manifest.
+if kubectl apply -f "${SCRIPT_DIR}/alerts/latency-alerts.yaml" -n monitoring 2>/dev/null; then
+    log_success "Alertas aplicados"
+else
+    log_info "AVISO: alertas nao aplicados (verifique se o CRD VMRule esta instalado)"
+fi
+
+# =============================================================================
+# FINAL SUMMARY
+# =============================================================================
+
+# Use the same test that decides whether the Ingress TLS block is removed
+# (anything other than "true" counts as disabled), so the printed URLs match
+# what was deployed.
+PROTOCOL="https"
+if [[ "$USE_CLOUDFLARE" != "true" && "$USE_LETSENCRYPT" != "true" ]]; then
+    PROTOCOL="http"
+fi
+
+# Default the Grafana host to grafana.<DOMAIN> when it was not provided.
+GRAFANA_HOST="${GRAFANA_HOST:-grafana.${DOMAIN}}"
+
+echo ""
+echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
+echo -e "${GREEN} APM Instalado com Sucesso!${NC}"
+echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
+echo ""
+echo "Demo App:"
+echo " URL: ${PROTOCOL}://${DEMO_HOST}"
+echo " Rotas:"
+echo " /health - Health check"
+echo " /fast - Query rapida (1 query)"
+echo " /slow - Query lenta (N+1 pattern - 51 queries)"
+echo " /fixed - Query otimizada (1 JOIN)"
+echo ""
+echo "Grafana (Traces):"
+echo " URL: ${PROTOCOL}://${GRAFANA_HOST}"
+echo " Datasource: Tempo (ja configurado)"
+echo " Explore: ${PROTOCOL}://${GRAFANA_HOST}/explore"
+echo ""
+echo -e "${CYAN}--- Como testar ---${NC}"
+echo ""
+echo " # Gerar traces (executar varias vezes)"
+echo " curl ${PROTOCOL}://${DEMO_HOST}/fast"
+echo " curl ${PROTOCOL}://${DEMO_HOST}/slow"
+echo " curl ${PROTOCOL}://${DEMO_HOST}/fixed"
+echo ""
+echo " # Ou via port-forward:"
+echo " kubectl port-forward -n demo svc/demo-app 3000:3000"
+echo " curl http://localhost:3000/slow"
+echo ""
+echo -e "${CYAN}--- O que observar no Grafana ---${NC}"
+echo ""
+echo " 1. Abrir Explore > Selecionar datasource 'Tempo'"
+echo " 2. Search > Service Name = demo-app"
+echo " 3. Comparar traces de /fast vs /slow"
+echo " - /fast: 1 span de query (~5ms)"
+echo " - /slow: 51 spans de query (~200ms+)"
+echo " - /fixed: 1 span de query com JOIN (~10ms)"
+echo ""
+echo "Verificar pods:"
+echo " kubectl get pods -n monitoring -l app.kubernetes.io/name=tempo"
+echo " kubectl get pods -n monitoring -l app.kubernetes.io/name=opentelemetry-collector"
+echo " kubectl get pods -n demo"
+echo ""
+echo "Desinstalar:"
+echo " ./cleanup.sh"
+echo ""
+echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
+echo ""
+
+# Purely informational pod listing; failures here are deliberately ignored.
+log_info "Status dos pods:"
+echo ""
+echo "--- monitoring ---"
+kubectl get pods -n monitoring -l "app.kubernetes.io/name in (tempo, opentelemetry-collector)" 2>/dev/null || true
+echo ""
+echo "--- demo ---"
+kubectl get pods -n demo 2>/dev/null || true
+echo ""
diff --git a/aula-15/tempo-values.yaml b/aula-15/tempo-values.yaml
new file mode 100644
index 0000000..3a591f6
--- /dev/null
+++ b/aula-15/tempo-values.yaml
@@ -0,0 +1,61 @@
+# Grafana Tempo - Monolithic mode (single binary)
+# Chart: grafana/tempo
+# Receives traces from the OpenTelemetry Collector and stores them locally
+
+# Single replica - minimal for workshop on Hetzner CAX11
+replicas: 1
+
+tempo:
+  # Storage config - local filesystem
+  storage:
+    trace:
+      backend: local
+      local:
+        path: /var/tempo/traces
+      wal:
+        path: /var/tempo/wal
+
+  # Retention (maps to compactor.compaction.block_retention in tempo config)
+  retention: 168h  # 7 days
+
+  # Receive traces via OTLP only (disable other receivers)
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: "0.0.0.0:4317"
+        http:
+          endpoint: "0.0.0.0:4318"
+
+  # Metrics generator - generates span metrics for Victoria Metrics
+  metricsGenerator:
+    enabled: true
+    remoteWriteUrl: "http://vmsingle-monitoring-victoria-metrics-k8s-stack.monitoring:8429/api/v1/write"
+    processor:  # NOTE(review): confirm the grafana/tempo chart renders this key - TODO verify
+      service_graphs:
+        dimensions:
+          - http.method
+          - http.route
+        enable_client_server_prefix: true
+        max_items: 10000
+      span_metrics:
+        dimensions:
+          - http.method
+          - http.route
+          - http.status_code
+        enable_target_info: true
+
+  # Resources for tempo container (under tempo key for this chart)
+  resources:
+    requests:
+      cpu: 100m
+      memory: 256Mi
+    limits:
+      cpu: 500m
+      memory: 512Mi
+
+# Persistence via Hetzner CSI
+persistence:
+  enabled: true
+  size: 10Gi
+  storageClassName: hcloud-volumes