aula-15: complete APM implementation (Tempo + OTel + demo app)

Components:
- tempo-values.yaml: Grafana Tempo in monolithic mode, 256Mi, 10Gi PVC
- otel-collector-values.yaml: receives OTLP, exports traces to Tempo,
  generates span metrics (RED) for Victoria Metrics via the spanmetrics connector
- demo-app/: Node.js app with routes /fast (1 query), /slow (N+1, 51 queries),
  /fixed (JOIN), auto-instrumented with OpenTelemetry
- alerts/latency-alerts.yaml: VMRule with Doherty threshold (p95 > 400ms)
- setup.sh: installs Tempo and the OTel Collector, configures the Grafana datasource,
  deploys the demo app from a ConfigMap (no Docker build required)
- cleanup.sh: removes only aula-15 resources, preserves aula-12

Zero hardcoded hostnames. Everything via .env and placeholders.
ArgoCD Setup
2026-03-14 02:30:35 -03:00
parent 6a8f076d8c
commit 9b3168b996
14 changed files with 1294 additions and 0 deletions
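
A quick end-to-end smoke test of the pipeline described above (a sketch; the service names and ports match the manifests in this commit):

  # Generate traffic so the demo app emits spans
  kubectl port-forward -n demo svc/demo-app 3000:3000 &
  for i in $(seq 1 20); do curl -s http://localhost:3000/slow > /dev/null; done

  # Confirm the traces reached Tempo (search API)
  kubectl port-forward -n monitoring svc/tempo 3100:3100 &
  curl -s 'http://localhost:3100/api/search?tags=service.name%3Ddemo-app' | head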

aula-15/alerts/latency-alerts.yaml Normal file

@@ -0,0 +1,55 @@
# VMRule - Latency and error alerts (APM)
# Based on the metrics generated by the OpenTelemetry Collector spanmetrics connector.
# Note: metric and label names assume the connector defaults with namespace "":
# the "duration" histogram (ms) and "calls" counter arrive in Victoria Metrics as
# duration_milliseconds_bucket and calls_total, labeled by service_name plus the
# configured dimensions (http_method, http_route, http_status_code).
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: latency-alerts
  namespace: monitoring
  labels:
    app: victoria-metrics
spec:
  groups:
    - name: apm.latency
      rules:
        # Doherty threshold: responses above 400ms degrade user productivity
        - alert: DohertyThresholdExceeded
          expr: |
            histogram_quantile(0.95,
              sum(rate(duration_milliseconds_bucket[5m])) by (le, service_name)
            ) > 400
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "P95 latency above Doherty threshold (400ms) for {{ $labels.service_name }}"
            description: "Service {{ $labels.service_name }} has a P95 latency of {{ $value }}ms. The Doherty threshold (400ms) states that productivity degrades when response time exceeds this limit."
        # HTTP 5xx error rate above 1%
        - alert: HighErrorRate
          expr: |
            (
              sum(rate(calls_total{http_status_code=~"5.."}[5m])) by (service_name)
              /
              sum(rate(calls_total[5m])) by (service_name)
            ) > 0.01
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Error rate above 1% for {{ $labels.service_name }}"
            description: "Service {{ $labels.service_name }} has a {{ $value | humanizePercentage }} error rate (5xx responses)."
        # P99 above 1 second - critical latency
        - alert: HighRequestLatencyP99
          expr: |
            histogram_quantile(0.99,
              sum(rate(duration_milliseconds_bucket[5m])) by (le, service_name)
            ) > 1000
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "P99 latency above 1s for {{ $labels.service_name }}"
            description: "Service {{ $labels.service_name }} has a P99 latency of {{ $value }}ms. This indicates severe performance degradation."
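
To evaluate the P95 expression by hand before trusting the alert, query Victoria Metrics directly (a sketch; the vmsingle service name is taken from otel-collector-values.yaml in this commit, and jq is assumed on the workstation):

  kubectl port-forward -n monitoring svc/vmsingle-monitoring-victoria-metrics-k8s-stack 8429:8429 &

  # Same expression as DohertyThresholdExceeded, evaluated instantly
  curl -s 'http://localhost:8429/api/v1/query' \
    --data-urlencode 'query=histogram_quantile(0.95, sum(rate(duration_milliseconds_bucket[5m])) by (le, service_name))' \
    | jq '.data.result[] | {service: .metric.service_name, p95_ms: .value[1]}'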

aula-15/cleanup.sh Executable file

@@ -0,0 +1,129 @@
#!/bin/bash
# =============================================================================
# Aula 15 - Cleanup (removes APM: Tempo + OTel + demo app)
# =============================================================================
#
# Removes:
#   - Demo app namespace (demo)
#   - Grafana Tempo (Helm release)
#   - OpenTelemetry Collector (Helm release)
#   - Grafana datasource ConfigMap (Tempo)
#   - Latency alerts
#   - The .env file
#
# Does NOT remove:
#   - Victoria Metrics / Grafana (they belong to aula-12)
#   - The monitoring namespace
#
# =============================================================================
set -e

RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[OK]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo ""
echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
echo -e "${CYAN}   Cleanup - Aula 15 (APM)${NC}"
echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
echo ""
log_warn "This will remove:"
echo "  - The demo namespace (demo app + PostgreSQL + data)"
echo "  - Grafana Tempo (Helm release)"
echo "  - OpenTelemetry Collector (Helm release)"
echo "  - The Tempo datasource in Grafana"
echo "  - Latency alerts"
echo ""
echo -e "${GREEN}Will NOT be removed:${NC}"
echo "  - Victoria Metrics (aula-12)"
echo "  - Grafana (aula-12)"
echo "  - The monitoring namespace"
echo ""
read -p "Continue? (type 'yes' to confirm): " confirm
if [ "$confirm" != "yes" ]; then
  log_info "Operation cancelled"
  exit 0
fi
echo ""

# -------------------------------------------------------------------------
# 1. Remove the demo namespace (demo app + PostgreSQL)
# -------------------------------------------------------------------------
log_info "Removing the demo namespace..."
if kubectl get namespace demo &> /dev/null; then
  kubectl delete namespace demo --timeout=60s 2>/dev/null || true
  log_success "Namespace demo removed"
else
  log_info "Namespace demo not found"
fi

# -------------------------------------------------------------------------
# 2. Remove Grafana Tempo
# -------------------------------------------------------------------------
log_info "Removing Grafana Tempo..."
if helm status tempo -n monitoring &> /dev/null; then
  helm uninstall tempo -n monitoring --wait 2>/dev/null || true
  log_success "Tempo removed"
else
  log_info "Tempo not found"
fi

# Remove Tempo's PVC
kubectl delete pvc -n monitoring -l app.kubernetes.io/name=tempo --wait=false 2>/dev/null || true

# -------------------------------------------------------------------------
# 3. Remove the OpenTelemetry Collector
# -------------------------------------------------------------------------
log_info "Removing the OpenTelemetry Collector..."
if helm status otel-collector -n monitoring &> /dev/null; then
  helm uninstall otel-collector -n monitoring --wait 2>/dev/null || true
  log_success "OTel Collector removed"
else
  log_info "OTel Collector not found"
fi

# -------------------------------------------------------------------------
# 4. Remove the Tempo datasource ConfigMap from Grafana
# -------------------------------------------------------------------------
log_info "Removing the Tempo datasource from Grafana..."
kubectl delete configmap grafana-datasource-tempo -n monitoring 2>/dev/null || true
log_success "Datasource removed"

# Restart Grafana so it drops the datasource
log_info "Restarting Grafana..."
kubectl rollout restart deployment -n monitoring -l app.kubernetes.io/name=grafana 2>/dev/null || true

# -------------------------------------------------------------------------
# 5. Remove the latency alerts
# -------------------------------------------------------------------------
log_info "Removing latency alerts..."
kubectl delete -f "${SCRIPT_DIR}/alerts/latency-alerts.yaml" -n monitoring 2>/dev/null || true
log_success "Alerts removed"

# -------------------------------------------------------------------------
# 6. Remove .env
# -------------------------------------------------------------------------
if [[ -f "${SCRIPT_DIR}/.env" ]]; then
  rm "${SCRIPT_DIR}/.env"
  log_info ".env removed"
fi

echo ""
log_success "Cleanup complete!"
echo ""
echo "The monitoring namespace and Victoria Metrics + Grafana were kept."
echo "To remove everything, run the aula-12 cleanup."
echo ""
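
After running it, a quick way to confirm that only the aula-15 resources are gone (a sketch; the vmsingle short name assumes the Victoria Metrics operator CRDs from aula-12):

  # These should all report "not found"
  kubectl get namespace demo
  helm status tempo -n monitoring
  helm status otel-collector -n monitoring

  # These should still exist (aula-12)
  kubectl get pods -n monitoring -l app.kubernetes.io/name=grafana
  kubectl get vmsingle -n monitoring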

aula-15/demo-app/Dockerfile Normal file

@@ -0,0 +1,12 @@
FROM node:20-alpine
WORKDIR /app
# Install production dependencies first so the layer is cached
COPY package.json package-lock.json* ./
RUN npm install --omit=dev
COPY . .
EXPOSE 3000
# Load the OpenTelemetry bootstrap (tracing.js) before the app itself
CMD ["node", "--require", "./tracing.js", "app.js"]
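
The image can also be built and exercised locally, without a cluster (a sketch, assuming Docker and a throwaway postgres container; the app defaults to the demo/demo credentials when PG_* env vars are unset):

  docker build -t demo-app:dev ./demo-app

  # Throwaway database on a shared network
  docker network create demo-net
  docker run -d --name pg --network demo-net \
    -e POSTGRES_USER=demo -e POSTGRES_PASSWORD=demo -e POSTGRES_DB=demo \
    postgres:17-alpine

  # Trace exports will fail without a reachable collector, which the SDK
  # tolerates (errors are logged, the app keeps serving)
  docker run --rm --network demo-net -p 3000:3000 -e PG_HOST=pg demo-app:dev
  curl http://localhost:3000/fast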

aula-15/demo-app/app.js Normal file

@@ -0,0 +1,230 @@
'use strict';

const express = require('express');
const { Pool } = require('pg');

const app = express();
const PORT = process.env.PORT || 3000;

const pool = new Pool({
  host: process.env.PG_HOST || 'localhost',
  port: parseInt(process.env.PG_PORT || '5432', 10),
  user: process.env.PG_USER || 'demo',
  password: process.env.PG_PASSWORD || 'demo',
  database: process.env.PG_DATABASE || 'demo',
});

// --- Database seeding ---
async function seedDatabase() {
  const client = await pool.connect();
  try {
    const tableCheck = await client.query(`
      SELECT EXISTS (
        SELECT FROM information_schema.tables WHERE table_name = 'users'
      )
    `);
    if (tableCheck.rows[0].exists) {
      const countResult = await client.query('SELECT COUNT(*) FROM users');
      if (parseInt(countResult.rows[0].count, 10) > 0) {
        console.log('Database already seeded, skipping.');
        return;
      }
    }
    console.log('Seeding database...');
    await client.query(`
      CREATE TABLE IF NOT EXISTS users (
        id SERIAL PRIMARY KEY,
        name VARCHAR(100) NOT NULL,
        email VARCHAR(150) NOT NULL,
        created_at TIMESTAMP DEFAULT NOW()
      )
    `);
    await client.query(`
      CREATE TABLE IF NOT EXISTS posts (
        id SERIAL PRIMARY KEY,
        user_id INTEGER REFERENCES users(id),
        title VARCHAR(200) NOT NULL,
        body TEXT NOT NULL,
        created_at TIMESTAMP DEFAULT NOW()
      )
    `);
    // Insert 50 users
    for (let i = 1; i <= 50; i++) {
      await client.query(
        'INSERT INTO users (name, email) VALUES ($1, $2)',
        [`User ${i}`, `user${i}@example.com`]
      );
    }
    // Insert 10 posts per user (500 posts total)
    for (let userId = 1; userId <= 50; userId++) {
      for (let p = 1; p <= 10; p++) {
        await client.query(
          'INSERT INTO posts (user_id, title, body) VALUES ($1, $2, $3)',
          [userId, `Post ${p} by User ${userId}`, `Content of post ${p} by user ${userId}. Lorem ipsum dolor sit amet.`]
        );
      }
    }
    console.log('Database seeded: 50 users, 500 posts.');
  } finally {
    client.release();
  }
}

// --- Routes ---
app.get('/health', (_req, res) => {
  res.json({ status: 'ok' });
});

// Fast route: single query, returns 10 users
app.get('/fast', async (_req, res) => {
  const start = Date.now();
  try {
    const result = await pool.query('SELECT * FROM users LIMIT 10');
    const duration = Date.now() - start;
    res.json({
      route: '/fast',
      description: 'Single query - SELECT users LIMIT 10',
      query_count: 1,
      duration_ms: duration,
      data: result.rows,
    });
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});

// Slow route: N+1 query pattern
app.get('/slow', async (_req, res) => {
  const start = Date.now();
  try {
    const usersResult = await pool.query('SELECT * FROM users');
    const users = usersResult.rows;
    let queryCount = 1;
    const usersWithPosts = [];
    for (const user of users) {
      const postsResult = await pool.query(
        'SELECT * FROM posts WHERE user_id = $1',
        [user.id]
      );
      queryCount++;
      usersWithPosts.push({
        ...user,
        posts: postsResult.rows,
      });
    }
    const duration = Date.now() - start;
    res.json({
      route: '/slow',
      description: 'N+1 pattern - 1 query for users + 1 query per user for posts',
      query_count: queryCount,
      user_count: users.length,
      total_posts: usersWithPosts.reduce((sum, u) => sum + u.posts.length, 0),
      duration_ms: duration,
      data: usersWithPosts,
    });
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});

// Fixed route: single JOIN query
app.get('/fixed', async (_req, res) => {
  const start = Date.now();
  try {
    const result = await pool.query(`
      SELECT
        u.id AS user_id,
        u.name,
        u.email,
        u.created_at AS user_created_at,
        p.id AS post_id,
        p.title,
        p.body,
        p.created_at AS post_created_at
      FROM users u
      LEFT JOIN posts p ON p.user_id = u.id
      ORDER BY u.id, p.id
    `);
    // Group results by user
    const usersMap = new Map();
    for (const row of result.rows) {
      if (!usersMap.has(row.user_id)) {
        usersMap.set(row.user_id, {
          id: row.user_id,
          name: row.name,
          email: row.email,
          created_at: row.user_created_at,
          posts: [],
        });
      }
      if (row.post_id) {
        usersMap.get(row.user_id).posts.push({
          id: row.post_id,
          title: row.title,
          body: row.body,
          created_at: row.post_created_at,
        });
      }
    }
    const usersWithPosts = Array.from(usersMap.values());
    const duration = Date.now() - start;
    res.json({
      route: '/fixed',
      description: 'Single JOIN query - the correct way',
      query_count: 1,
      user_count: usersWithPosts.length,
      total_posts: usersWithPosts.reduce((sum, u) => sum + u.posts.length, 0),
      duration_ms: duration,
      data: usersWithPosts,
    });
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});

// --- Startup ---
async function main() {
  // Wait for PostgreSQL to be ready (with retries)
  let retries = 10;
  while (retries > 0) {
    try {
      await pool.query('SELECT 1');
      console.log('Connected to PostgreSQL.');
      break;
    } catch (err) {
      retries--;
      if (retries === 0) {
        console.error('Failed to connect to PostgreSQL after retries:', err.message);
        process.exit(1);
      }
      console.log(`Waiting for PostgreSQL... (${retries} retries left)`);
      await new Promise((resolve) => setTimeout(resolve, 3000));
    }
  }
  await seedDatabase();
  app.listen(PORT, () => {
    console.log(`Demo app listening on port ${PORT}`);
  });
}

main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
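
The duration_ms field in each response makes the N+1 cost easy to compare from a shell (a sketch; assumes jq and a local port-forward):

  kubectl port-forward -n demo svc/demo-app 3000:3000 &

  for route in fast slow fixed; do
    ms=$(curl -s "http://localhost:3000/${route}" | jq '.duration_ms')
    echo "/${route}: ${ms}ms"
  done
  # Expected shape of the result: /slow sits an order of magnitude above
  # /fast and /fixed, because it issues 51 round trips instead of 1.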

aula-15/demo-app/k8s/deployment.yaml Normal file

@@ -0,0 +1,64 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: demo-app
  namespace: demo
  labels:
    app: demo-app
spec:
  replicas: 1
  selector:
    matchLabels:
      app: demo-app
  template:
    metadata:
      labels:
        app: demo-app
    spec:
      containers:
        - name: demo-app
          image: REGISTRY_PLACEHOLDER/IMAGE_NAME_PLACEHOLDER:latest
          ports:
            - containerPort: 3000
          env:
            - name: PG_HOST
              value: demo-postgresql
            - name: PG_PORT
              value: "5432"
            - name: PG_USER
              value: demo
            - name: PG_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: demo-postgresql
                  key: POSTGRES_PASSWORD
            - name: PG_DATABASE
              value: demo
            - name: OTEL_EXPORTER_OTLP_ENDPOINT
              value: http://otel-collector-opentelemetry-collector.monitoring:4317
            - name: OTEL_SERVICE_NAME
              value: demo-app
            - name: NODE_OPTIONS
              value: "--require ./tracing.js"
          resources:
            requests:
              memory: "64Mi"
              cpu: "50m"
            limits:
              memory: "128Mi"
          livenessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
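
This manifest is the registry-based alternative to the ConfigMap deployment that setup.sh performs; the placeholders would be filled in roughly like this (a sketch with a hypothetical registry and image name):

  REGISTRY=registry.example.com   # hypothetical
  IMAGE=demo-app                  # hypothetical

  docker build -t "${REGISTRY}/${IMAGE}:latest" ./demo-app
  docker push "${REGISTRY}/${IMAGE}:latest"

  sed -e "s|REGISTRY_PLACEHOLDER|${REGISTRY}|" \
      -e "s|IMAGE_NAME_PLACEHOLDER|${IMAGE}|" \
      demo-app/k8s/deployment.yaml | kubectl apply -f -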

aula-15/demo-app/k8s/ingress.yaml Normal file

@@ -0,0 +1,24 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: demo-app
  namespace: demo
  annotations:
    cert-manager.io/cluster-issuer: CLUSTER_ISSUER_PLACEHOLDER
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - DEMO_HOST_PLACEHOLDER
      secretName: demo-app-tls
  rules:
    - host: DEMO_HOST_PLACEHOLDER
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: demo-app
                port:
                  number: 3000
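
Before DNS for the demo host exists, the Ingress can be tested by pinning the hostname to the ingress controller's IP (a sketch; the ingress-nginx service name assumes a standard install):

  source aula-15/.env   # provides DEMO_HOST, written by setup.sh
  INGRESS_IP=$(kubectl get svc -n ingress-nginx ingress-nginx-controller \
    -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
  curl -k --resolve "${DEMO_HOST}:443:${INGRESS_IP}" "https://${DEMO_HOST}/health"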

aula-15/demo-app/k8s/namespace.yaml Normal file

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: demo

aula-15/demo-app/k8s/postgresql.yaml Normal file

@@ -0,0 +1,84 @@
---
apiVersion: v1
kind: Secret
metadata:
  name: demo-postgresql
  namespace: demo
type: Opaque
stringData:
  POSTGRES_PASSWORD: demo-secret-pw
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: demo-postgresql-data
  namespace: demo
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: demo-postgresql
  namespace: demo
  labels:
    app: demo-postgresql
spec:
  replicas: 1
  selector:
    matchLabels:
      app: demo-postgresql
  template:
    metadata:
      labels:
        app: demo-postgresql
    spec:
      containers:
        - name: postgresql
          image: postgres:17-alpine
          ports:
            - containerPort: 5432
          env:
            - name: POSTGRES_DB
              value: demo
            - name: POSTGRES_USER
              value: demo
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: demo-postgresql
                  key: POSTGRES_PASSWORD
          resources:
            requests:
              memory: "64Mi"
              cpu: "50m"
            limits:
              memory: "128Mi"
          volumeMounts:
            - name: data
              mountPath: /var/lib/postgresql/data
              subPath: pgdata
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: demo-postgresql-data
---
apiVersion: v1
kind: Service
metadata:
  name: demo-postgresql
  namespace: demo
  labels:
    app: demo-postgresql
spec:
  type: ClusterIP
  ports:
    - port: 5432
      targetPort: 5432
      protocol: TCP
  selector:
    app: demo-postgresql
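
Once the app has started and seeded, the data can be checked directly in the pod (a sketch):

  kubectl exec -n demo deploy/demo-postgresql -- \
    psql -U demo -d demo \
    -c 'SELECT count(*) AS users FROM users' \
    -c 'SELECT count(*) AS posts FROM posts'
  # Expected after seeding: 50 users, 500 posts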

aula-15/demo-app/k8s/service.yaml Normal file

@@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
  name: demo-app
  namespace: demo
  labels:
    app: demo-app
spec:
  type: ClusterIP
  ports:
    - port: 3000
      targetPort: 3000
      protocol: TCP
  selector:
    app: demo-app

aula-15/demo-app/package.json Normal file

@@ -0,0 +1,17 @@
{
  "name": "demo-app",
  "version": "1.0.0",
  "description": "Demo app for APM with OpenTelemetry - Workshop Aula 15",
  "main": "app.js",
  "scripts": {
    "start": "node --require ./tracing.js app.js"
  },
  "dependencies": {
    "express": "^4.21.2",
    "pg": "^8.13.1",
    "@opentelemetry/sdk-node": "^0.57.2",
    "@opentelemetry/sdk-metrics": "^1.30.1",
    "@opentelemetry/auto-instrumentations-node": "^0.56.1",
    "@opentelemetry/exporter-trace-otlp-grpc": "^0.57.2",
    "@opentelemetry/exporter-metrics-otlp-grpc": "^0.57.2"
  }
}
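
For local development the same package can run against the in-cluster collector (a sketch; the service name comes from the Helm release used in setup.sh, and a PostgreSQL reachable at localhost:5432 with the demo credentials is assumed):

  kubectl port-forward -n monitoring svc/otel-collector-opentelemetry-collector 4317:4317 &

  cd demo-app && npm install
  OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
  OTEL_SERVICE_NAME=demo-app-local \
  PG_HOST=localhost npm start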

aula-15/demo-app/tracing.js Normal file

@@ -0,0 +1,45 @@
'use strict';

const { NodeSDK } = require('@opentelemetry/sdk-node');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc');
const { OTLPMetricExporter } = require('@opentelemetry/exporter-metrics-otlp-grpc');
const { PeriodicExportingMetricReader } = require('@opentelemetry/sdk-metrics');

const otlpEndpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://otel-collector.monitoring:4317';
const serviceName = process.env.OTEL_SERVICE_NAME || 'demo-app';

const traceExporter = new OTLPTraceExporter({
  url: otlpEndpoint,
});
const metricExporter = new OTLPMetricExporter({
  url: otlpEndpoint,
});
const metricReader = new PeriodicExportingMetricReader({
  exporter: metricExporter,
  exportIntervalMillis: 15000,
});

const sdk = new NodeSDK({
  serviceName,
  traceExporter,
  metricReader,
  instrumentations: [
    getNodeAutoInstrumentations({
      // fs instrumentation is noisy and adds little value here
      '@opentelemetry/instrumentation-fs': { enabled: false },
    }),
  ],
});

sdk.start();

process.on('SIGTERM', () => {
  sdk.shutdown()
    .then(() => console.log('OpenTelemetry SDK shut down'))
    .catch((err) => console.error('Error shutting down OpenTelemetry SDK', err))
    .finally(() => process.exit(0));
});

console.log(`OpenTelemetry initialized for service "${serviceName}" -> ${otlpEndpoint}`);
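
If spans do not show up in Tempo, the quickest checks are on both ends of the export path (a sketch; adding the collector's debug exporter temporarily would give more detail, but it is not part of this commit's config):

  # SDK side: look for export errors
  kubectl logs -n demo deploy/demo-app | grep -i -E 'otel|export' | tail

  # Collector side: confirm it is up and receiving OTLP
  kubectl logs -n monitoring deploy/otel-collector-opentelemetry-collector --tail=50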

aula-15/otel-collector-values.yaml Normal file

@@ -0,0 +1,88 @@
# OpenTelemetry Collector
# Chart: open-telemetry/opentelemetry-collector
# Receives telemetry from the apps and forwards it to Tempo (traces) and Victoria Metrics (metrics)
mode: deployment
replicaCount: 1

image:
  repository: otel/opentelemetry-collector-contrib

resources:
  requests:
    cpu: 50m
    memory: 128Mi
  limits:
    cpu: 200m
    memory: 256Mi

# Only expose OTLP ports (disable jaeger, zipkin)
ports:
  otlp:
    enabled: true
    containerPort: 4317
    servicePort: 4317
    protocol: TCP
    appProtocol: grpc
  otlp-http:
    enabled: true
    containerPort: 4318
    servicePort: 4318
    protocol: TCP
  jaeger-compact:
    enabled: false
  jaeger-thrift:
    enabled: false
  jaeger-grpc:
    enabled: false
  zipkin:
    enabled: false

config:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
  connectors:
    spanmetrics:
      histogram:
        explicit:
          buckets: [5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s]
      dimensions:
        - name: http.method
        - name: http.route
        - name: http.status_code
      namespace: ""
  exporters:
    otlp/tempo:
      endpoint: "tempo.monitoring:4317"
      tls:
        insecure: true
    prometheusremotewrite:
      endpoint: "http://vmsingle-monitoring-victoria-metrics-k8s-stack.monitoring:8429/api/v1/write"
  processors:
    batch:
      timeout: 5s
      send_batch_size: 1024
    memory_limiter:
      check_interval: 5s
      limit_percentage: 80
      spike_limit_percentage: 25
  service:
    pipelines:
      traces:
        receivers: [otlp]
        processors: [memory_limiter, batch]
        exporters: [otlp/tempo, spanmetrics]
      metrics/spanmetrics:
        receivers: [spanmetrics]
        processors: [batch]
        exporters: [prometheusremotewrite]
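
The OTLP/HTTP port can be exercised without any SDK, which helps separate collector problems from app problems (a sketch; the trace/span IDs are arbitrary hex, and kind 2 means SPAN_KIND_SERVER):

  kubectl port-forward -n monitoring svc/otel-collector-opentelemetry-collector 4318:4318 &

  NOW=$(date +%s)000000000
  curl -s -X POST http://localhost:4318/v1/traces \
    -H 'Content-Type: application/json' \
    -d '{
      "resourceSpans": [{
        "resource": {"attributes": [{"key": "service.name", "value": {"stringValue": "curl-test"}}]},
        "scopeSpans": [{
          "spans": [{
            "traceId": "5b8aa5a2d2c872e8321cf37308d69df2",
            "spanId": "5fb397be34d26b51",
            "name": "manual-test-span",
            "kind": 2,
            "startTimeUnixNano": "'"$NOW"'",
            "endTimeUnixNano": "'"$NOW"'"
          }]
        }]
      }]
    }'
  # The span should then be searchable in Tempo under service.name=curl-test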

aula-15/setup.sh Executable file

@@ -0,0 +1,466 @@
#!/bin/bash
# =============================================================================
# Aula 15 - APM with Grafana Tempo + OpenTelemetry
# =============================================================================
#
# This script installs:
#   1. Grafana Tempo (distributed tracing backend)
#   2. OpenTelemetry Collector (receives and forwards telemetry)
#   3. The Tempo datasource in Grafana
#   4. An instrumented demo app (Express + PostgreSQL + OTel)
#   5. Latency alerts (Doherty threshold)
#
# Prerequisites:
#   - Kubernetes cluster (aula-08)
#   - kubectl and helm installed
#   - monitoring namespace with Victoria Metrics + Grafana (aula-12)
#
# =============================================================================
set -e

RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[OK]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ENV_FILE="${SCRIPT_DIR}/.env"

# =============================================================================
# CHECK PREREQUISITES
# =============================================================================
log_info "Checking prerequisites..."

if ! command -v kubectl &> /dev/null; then
  log_error "kubectl not found. Install it with: brew install kubectl"
  exit 1
fi

if ! command -v helm &> /dev/null; then
  log_error "helm not found. Install it with: brew install helm"
  exit 1
fi

if ! kubectl cluster-info &> /dev/null; then
  log_error "Could not connect to the Kubernetes cluster"
  log_info "Check that KUBECONFIG is set correctly"
  exit 1
fi

if ! kubectl get namespace monitoring &> /dev/null; then
  log_error "Namespace 'monitoring' not found"
  log_info "Run aula-12 first to install Victoria Metrics + Grafana"
  exit 1
fi

log_success "Prerequisites verified"
# =============================================================================
# LOAD CONFIGURATION
# =============================================================================
if [[ -f "$ENV_FILE" ]]; then
  log_info "Loading local configuration..."
  source "$ENV_FILE"
fi

# Inherit configuration from aula-12 or aula-10
for ENV_SRC in "${SCRIPT_DIR}/../aula-12/.env" "${SCRIPT_DIR}/../aula-10/.env"; do
  if [[ -f "$ENV_SRC" && -z "$DOMAIN" ]]; then
    log_info "Inheriting configuration from $(basename $(dirname $ENV_SRC))..."
    source "$ENV_SRC"
  fi
done

# =============================================================================
# COLLECT CONFIGURATION
# =============================================================================
echo ""
echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
echo -e "${CYAN}   APM - Grafana Tempo + OpenTelemetry${NC}"
echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
echo ""

# Base domain
if [[ -z "$DOMAIN" ]]; then
  read -p "Base domain (e.g. kube.quest): " DOMAIN
fi
log_info "Domain: ${DOMAIN}"

# Demo host
if [[ -z "$DEMO_HOST" ]]; then
  DEFAULT_DEMO="demo.${DOMAIN}"
  read -p "Demo app hostname [${DEFAULT_DEMO}]: " DEMO_HOST
  DEMO_HOST="${DEMO_HOST:-$DEFAULT_DEMO}"
fi
log_info "Demo app: ${DEMO_HOST}"

# TLS (inherit or ask)
if [[ -z "$USE_CLOUDFLARE" && -z "$USE_LETSENCRYPT" ]]; then
  echo ""
  echo "TLS configuration:"
  echo "  1) Let's Encrypt (recommended)"
  echo "  2) CloudFlare (proxy)"
  echo "  3) HTTP only"
  read -p "Choose [1-3]: " TLS_CHOICE
  case $TLS_CHOICE in
    1) USE_CLOUDFLARE=false; USE_LETSENCRYPT=true ;;
    2) USE_CLOUDFLARE=true; USE_LETSENCRYPT=false ;;
    *) USE_CLOUDFLARE=false; USE_LETSENCRYPT=false ;;
  esac
fi

# PostgreSQL password for the demo
if [[ -z "$DEMO_DB_PASSWORD" ]]; then
  DEMO_DB_PASSWORD=$(openssl rand -base64 24 | tr -dc 'a-zA-Z0-9' | head -c 24)
  log_info "Demo PostgreSQL password generated automatically"
fi

# Save configuration
cat > "$ENV_FILE" << EOF
# Configuration generated by setup.sh - $(date)
DOMAIN=${DOMAIN}
DEMO_HOST=${DEMO_HOST}
DEMO_DB_PASSWORD=${DEMO_DB_PASSWORD}
USE_CLOUDFLARE=${USE_CLOUDFLARE}
USE_LETSENCRYPT=${USE_LETSENCRYPT}
EOF
log_success "Configuration saved"
# =============================================================================
# INSTALL GRAFANA TEMPO
# =============================================================================
echo ""
log_info "=== Installing Grafana Tempo ==="

helm repo add grafana https://grafana.github.io/helm-charts 2>/dev/null || true
helm repo update grafana

if helm status tempo -n monitoring &> /dev/null; then
  log_warn "Tempo already installed. Upgrading..."
  TEMPO_CMD="upgrade"
else
  TEMPO_CMD="install"
fi

helm ${TEMPO_CMD} tempo grafana/tempo \
  --namespace monitoring \
  -f "${SCRIPT_DIR}/tempo-values.yaml" \
  --wait \
  --timeout 5m
log_success "Grafana Tempo installed!"

# =============================================================================
# INSTALL THE OPENTELEMETRY COLLECTOR
# =============================================================================
echo ""
log_info "=== Installing the OpenTelemetry Collector ==="

helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts 2>/dev/null || true
helm repo update open-telemetry

if helm status otel-collector -n monitoring &> /dev/null; then
  log_warn "OTel Collector already installed. Upgrading..."
  OTEL_CMD="upgrade"
else
  OTEL_CMD="install"
fi

helm ${OTEL_CMD} otel-collector open-telemetry/opentelemetry-collector \
  --namespace monitoring \
  -f "${SCRIPT_DIR}/otel-collector-values.yaml" \
  --wait \
  --timeout 5m
log_success "OpenTelemetry Collector installed!"

# =============================================================================
# CONFIGURE THE TEMPO DATASOURCE IN GRAFANA
# =============================================================================
echo ""
log_info "=== Configuring the Tempo datasource in Grafana ==="

kubectl apply -f - <<'DATASOURCE_EOF'
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-tempo
  namespace: monitoring
  labels:
    grafana_datasource: "1"
data:
  tempo-datasource.yaml: |
    apiVersion: 1
    datasources:
      - name: Tempo
        type: tempo
        url: http://tempo.monitoring:3100
        access: proxy
        isDefault: false
        jsonData:
          tracesToMetrics:
            datasourceUid: victoriametrics
            tags: [{key: "http.route"}]
          serviceMap:
            datasourceUid: victoriametrics
          nodeGraph:
            enabled: true
          tracesToLogs:
            datasourceUid: ""
DATASOURCE_EOF
log_success "Datasource configured"

# Restart Grafana to load the new datasource
log_info "Restarting Grafana to load the datasource..."
kubectl rollout restart deployment -n monitoring -l app.kubernetes.io/name=grafana 2>/dev/null || true
kubectl rollout status deployment -n monitoring -l app.kubernetes.io/name=grafana --timeout=120s 2>/dev/null || true
log_success "Grafana restarted"
# =============================================================================
# DEPLOY THE DEMO APP
# =============================================================================
echo ""
log_info "=== Deploying the Demo App ==="

# Create the namespace
kubectl apply -f "${SCRIPT_DIR}/demo-app/k8s/namespace.yaml"
log_success "Namespace demo created"

# Create the PostgreSQL secret with the generated password
kubectl create secret generic demo-postgresql \
  --namespace demo \
  --from-literal=POSTGRES_PASSWORD="${DEMO_DB_PASSWORD}" \
  --dry-run=client -o yaml | kubectl apply -f -
log_success "Secret demo-postgresql created"

# Deploy PostgreSQL
kubectl apply -f "${SCRIPT_DIR}/demo-app/k8s/postgresql.yaml"
log_info "Waiting for PostgreSQL to become ready..."
kubectl wait --for=condition=available deployment/demo-postgresql -n demo --timeout=120s 2>/dev/null || true
log_success "PostgreSQL ready"

# Create a ConfigMap with the app code
kubectl create configmap demo-app-code \
  --from-file=app.js="${SCRIPT_DIR}/demo-app/app.js" \
  --from-file=tracing.js="${SCRIPT_DIR}/demo-app/tracing.js" \
  --from-file=package.json="${SCRIPT_DIR}/demo-app/package.json" \
  --namespace demo \
  --dry-run=client -o yaml | kubectl apply -f -
log_success "ConfigMap demo-app-code created"

# Deploy the app (node:20-alpine + init container for npm install)
kubectl apply -f - <<'APP_DEPLOY_EOF'
apiVersion: apps/v1
kind: Deployment
metadata:
  name: demo-app
  namespace: demo
  labels:
    app: demo-app
spec:
  replicas: 1
  selector:
    matchLabels:
      app: demo-app
  template:
    metadata:
      labels:
        app: demo-app
    spec:
      initContainers:
        - name: install-deps
          image: node:20-alpine
          command: ["sh", "-c", "cp /code/* /app/ && cd /app && npm install --omit=dev"]
          volumeMounts:
            - name: code
              mountPath: /code
              readOnly: true
            - name: app
              mountPath: /app
      containers:
        - name: demo-app
          image: node:20-alpine
          command: ["node", "--require", "./tracing.js", "app.js"]
          workingDir: /app
          ports:
            - containerPort: 3000
          env:
            - name: PG_HOST
              value: demo-postgresql
            - name: PG_PORT
              value: "5432"
            - name: PG_USER
              value: demo
            - name: PG_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: demo-postgresql
                  key: POSTGRES_PASSWORD
            - name: PG_DATABASE
              value: demo
            - name: OTEL_EXPORTER_OTLP_ENDPOINT
              value: http://otel-collector-opentelemetry-collector.monitoring:4317
            - name: OTEL_SERVICE_NAME
              value: demo-app
          resources:
            requests:
              memory: "64Mi"
              cpu: "50m"
            limits:
              memory: "256Mi"
          livenessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 15
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          volumeMounts:
            - name: app
              mountPath: /app
      volumes:
        - name: code
          configMap:
            name: demo-app-code
        - name: app
          emptyDir: {}
APP_DEPLOY_EOF
log_success "Deployment demo-app created"

# Deploy the Service
kubectl apply -f "${SCRIPT_DIR}/demo-app/k8s/service.yaml"
log_success "Service demo-app created"

# Deploy the Ingress (substituting placeholders)
TEMP_INGRESS=$(mktemp)
sed "s/DEMO_HOST_PLACEHOLDER/${DEMO_HOST}/g" "${SCRIPT_DIR}/demo-app/k8s/ingress.yaml" > "$TEMP_INGRESS"

if [[ "$USE_LETSENCRYPT" == "true" ]]; then
  sed -i.bak "s/CLUSTER_ISSUER_PLACEHOLDER/letsencrypt-prod/g" "$TEMP_INGRESS"
  rm -f "$TEMP_INGRESS.bak"
else
  # Remove the cert-manager annotation
  sed -i.bak '/cert-manager.io\/cluster-issuer/d' "$TEMP_INGRESS"
  rm -f "$TEMP_INGRESS.bak"
fi

if [[ "$USE_CLOUDFLARE" != "true" && "$USE_LETSENCRYPT" != "true" ]]; then
  # Remove the TLS block
  sed -i.bak '/tls:/,/secretName:/d' "$TEMP_INGRESS"
  rm -f "$TEMP_INGRESS.bak"
fi

kubectl apply -f "$TEMP_INGRESS"
rm -f "$TEMP_INGRESS"
log_success "Ingress demo-app created"

# Wait for the demo app
log_info "Waiting for the demo app to become ready..."
kubectl wait --for=condition=available deployment/demo-app -n demo --timeout=180s 2>/dev/null || true
# =============================================================================
# APPLY ALERTS
# =============================================================================
echo ""
log_info "=== Applying latency alerts ==="
kubectl apply -f "${SCRIPT_DIR}/alerts/latency-alerts.yaml" -n monitoring 2>/dev/null || true
log_success "Alerts applied"

# =============================================================================
# FINAL SUMMARY
# =============================================================================
PROTOCOL="https"
if [[ "$USE_CLOUDFLARE" == "false" && "$USE_LETSENCRYPT" == "false" ]]; then
  PROTOCOL="http"
fi
GRAFANA_HOST="${GRAFANA_HOST:-grafana.${DOMAIN}}"

echo ""
echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
echo -e "${GREEN}   APM installed successfully!${NC}"
echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
echo ""
echo "Demo App:"
echo "  URL: ${PROTOCOL}://${DEMO_HOST}"
echo "  Routes:"
echo "    /health - Health check"
echo "    /fast   - Fast query (1 query)"
echo "    /slow   - Slow query (N+1 pattern - 51 queries)"
echo "    /fixed  - Optimized query (1 JOIN)"
echo ""
echo "Grafana (Traces):"
echo "  URL: ${PROTOCOL}://${GRAFANA_HOST}"
echo "  Datasource: Tempo (already configured)"
echo "  Explore: ${PROTOCOL}://${GRAFANA_HOST}/explore"
echo ""
echo -e "${CYAN}--- How to test ---${NC}"
echo ""
echo "  # Generate traces (run several times)"
echo "  curl ${PROTOCOL}://${DEMO_HOST}/fast"
echo "  curl ${PROTOCOL}://${DEMO_HOST}/slow"
echo "  curl ${PROTOCOL}://${DEMO_HOST}/fixed"
echo ""
echo "  # Or via port-forward:"
echo "  kubectl port-forward -n demo svc/demo-app 3000:3000"
echo "  curl http://localhost:3000/slow"
echo ""
echo -e "${CYAN}--- What to look at in Grafana ---${NC}"
echo ""
echo "  1. Open Explore > select the 'Tempo' datasource"
echo "  2. Search > Service Name = demo-app"
echo "  3. Compare traces from /fast vs /slow"
echo "     - /fast:  1 query span (~5ms)"
echo "     - /slow:  51 query spans (~200ms+)"
echo "     - /fixed: 1 query span with JOIN (~10ms)"
echo ""
echo "Check the pods:"
echo "  kubectl get pods -n monitoring -l app.kubernetes.io/name=tempo"
echo "  kubectl get pods -n monitoring -l app.kubernetes.io/name=opentelemetry-collector"
echo "  kubectl get pods -n demo"
echo ""
echo "Uninstall:"
echo "  ./cleanup.sh"
echo ""
echo -e "${CYAN}═══════════════════════════════════════════════════${NC}"
echo ""

log_info "Pod status:"
echo ""
echo "--- monitoring ---"
kubectl get pods -n monitoring -l "app.kubernetes.io/name in (tempo, opentelemetry-collector)" 2>/dev/null || true
echo ""
echo "--- demo ---"
kubectl get pods -n demo 2>/dev/null || true
echo ""
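
After setup.sh finishes, the installed pieces can be verified in one pass (a sketch):

  helm list -n monitoring | grep -E 'tempo|otel-collector'
  kubectl get vmrule -n monitoring latency-alerts
  kubectl get configmap -n monitoring grafana-datasource-tempo
  kubectl get ingress -n demo demo-app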

aula-15/tempo-values.yaml Normal file

@@ -0,0 +1,61 @@
# Grafana Tempo - Monolithic mode (single binary)
# Chart: grafana/tempo
# Receives traces from the OpenTelemetry Collector and stores them locally
# Single replica - minimal setup for a workshop on a Hetzner CAX11
replicas: 1

tempo:
  # Storage config - local filesystem
  storage:
    trace:
      backend: local
      local:
        path: /var/tempo/traces
      wal:
        path: /var/tempo/wal
  # Retention (maps to compactor.compaction.block_retention in the tempo config)
  retention: 168h  # 7 days
  # Receive traces via OTLP only (disable the other receivers)
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
  # Metrics generator - generates span metrics for Victoria Metrics
  metricsGenerator:
    enabled: true
    remoteWriteUrl: "http://vmsingle-monitoring-victoria-metrics-k8s-stack.monitoring:8429/api/v1/write"
    processor:
      service_graphs:
        dimensions:
          - http.method
          - http.route
        enable_client_server_prefix: true
        max_items: 10000
      span_metrics:
        dimensions:
          - http.method
          - http.route
          - http.status_code
        enable_target_info: true
  # Resources for the tempo container (under the tempo key for this chart)
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

# Persistence via Hetzner CSI
persistence:
  enabled: true
  size: 10Gi
  storageClassName: hcloud-volumes
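
Note that span metrics are produced twice in this setup: by the collector's spanmetrics connector and by Tempo's metrics-generator. The generator's output lands in Victoria Metrics under its own prefix and can be checked separately (a sketch; the metric names assume Tempo's defaults):

  kubectl port-forward -n monitoring svc/vmsingle-monitoring-victoria-metrics-k8s-stack 8429:8429 &

  # Service-graph edges produced by the metrics-generator
  curl -s 'http://localhost:8429/api/v1/query?query=traces_service_graph_request_total' | head

  # Span metrics from the metrics-generator (distinct from the connector's calls_total)
  curl -s 'http://localhost:8429/api/v1/query?query=traces_spanmetrics_calls_total' | head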