fix(aula-13): reescrever benchmarks eStargz vs GZIP

Benchmarks antigos eram falhos: node hardcoded, imagens diferentes, sem verificação de snapshotter, sem controle de cache, 1 iteração. Novos scripts: - prepare-images.sh: constrói mesma imagem em gzip e estargz - benchmark.sh: múltiplas iterações, detecção de cache hits, verificação de snapshotter, 3 tipos de imagem (nginx/node/postgres) Requer worker node sem cache (node fresh via cluster autoscaler).
2026-03-14 07:48:51 -03:00
parent 7167e6ee11
commit f5cb6f0581
6 changed files with 612 additions and 625 deletions
--- a/aula-13/benchmarks/benchmark.sh
+++ b/aula-13/benchmarks/benchmark.sh
@@ -0,0 +1,485 @@
+#!/bin/bash
+# =============================================================================
+# Benchmark: eStargz vs GZIP — Cold Pull Performance
+# =============================================================================
+#
+# Metodologia:
+#
+#   1. VERIFICAÇÃO — confirma stargz-snapshotter ativo no worker
+#   2. PREPARAÇÃO  — namespace, registry secret, identificação do worker
+#   3. EXECUÇÃO    — para cada imagem × formato × iteração:
+#        a) (Sem limpeza de cache: Talos não expõe crictl/ctr — o worker precisa estar limpo; ver "IMPORTANTE" abaixo)
+#        b) Cria pod pinado no worker
+#        c) Mede tempo de pull (evento Pulling→Pulled) e total (até Ready)
+#        d) Remove pod
+#   4. RELATÓRIO   — médias por imagem/formato, tabela comparativa
+#
+# Controles de validade:
+#   - Mesma imagem base, mesmo Dockerfile, só a compressão muda
+#   - Pod pinado no mesmo worker (elimina variação de rede entre nodes)
+#   - Evento "Pulled" checado para descartar cache hits silenciosos
+#   - Múltiplas iterações com média
+#
+# IMPORTANTE — Cache de imagens:
+#   O benchmark precisa de um worker node SEM as imagens em cache.
+#   Não é possível limpar cache de forma confiável no Talos (sem crictl/ctr).
+#   A forma mais segura é escalar um worker novo via cluster autoscaler:
+#
+#     kubectl apply -f ../test-autoscaler.yaml   # Força scale-up
+#     kubectl get nodes -w                        # Esperar novo node
+#     ./benchmark.sh                              # Rodar no node limpo
+#
+#   O script detecta cache e avisa antes de rodar.
+#
+# Pré-requisitos:
+#   - kubectl configurado (KUBECONFIG)
+#   - Imagens preparadas com ./prepare-images.sh
+#   - stargz-snapshotter ativo nos workers (extensão Talos)
+#   - Worker node SEM cache das imagens de benchmark
+#
+# Uso:
+#   ./benchmark.sh                    # 3 iterações (padrão)
+#   BENCH_ITERATIONS=5 ./benchmark.sh # 5 iterações
+#
+# =============================================================================
+
+set -euo pipefail
+
+# ─────────────────────────────────────────────────────────────
+# Configuração
+# ─────────────────────────────────────────────────────────────
+
+REGISTRY="${REGISTRY:-gitea.kube.quest}"
+ORG="bench"
+NAMESPACE="benchmark"
+ITERATIONS="${BENCH_ITERATIONS:-3}"
+TALOSCONFIG="${TALOSCONFIG:-$(cd "$(dirname "$0")/../.." && pwd)/aula-08/talosconfig}"
+
+WORKER_NODE=""
+WORKER_IP=""
+
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+BOLD='\033[1m'
+NC='\033[0m'
+
+log_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
+log_success() { echo -e "${GREEN}[OK]${NC} $1"; }
+log_warn()    { echo -e "${YELLOW}[WARN]${NC} $1"; }
+log_error()   { echo -e "${RED}[ERRO]${NC} $1"; }
+
+# Timestamp em milissegundos (compatível com macOS + Linux)
+ts_ms() { python3 -c "import time; print(int(time.time() * 1000))"; }
+
+# Converter timestamp ISO 8601 → epoch ms
+iso_to_ms() {
+    python3 -c "
+from datetime import datetime, timezone
+s = '${1}'.rstrip('Z')
+try:
+    dt = datetime.fromisoformat(s).replace(tzinfo=timezone.utc)
+except:
+    dt = datetime.strptime(s, '%Y-%m-%dT%H:%M:%S').replace(tzinfo=timezone.utc)
+print(int(dt.timestamp() * 1000))
+"
+}
+
+# ─────────────────────────────────────────────────────────────
+# 1. PRÉ-FLIGHT
+# ─────────────────────────────────────────────────────────────
+
+preflight() {
+    echo ""
+    echo -e "${CYAN}════════════════════════════════════════════════════════════${NC}"
+    echo -e "${CYAN}  Benchmark: eStargz vs GZIP — Cold Pull Performance${NC}"
+    echo -e "${CYAN}════════════════════════════════════════════════════════════${NC}"
+    echo ""
+    echo "  Registry:   ${REGISTRY}/${ORG}"
+    echo "  Iterações:  ${ITERATIONS}"
+    echo ""
+
+    # Detectar worker
+    WORKER_NODE=$(kubectl get nodes \
+        -l '!node-role.kubernetes.io/control-plane' \
+        -o jsonpath='{.items[0].metadata.name}')
+    WORKER_IP=$(kubectl get node "$WORKER_NODE" \
+        -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')
+
+    if [[ -z "$WORKER_NODE" ]]; then
+        log_error "Nenhum worker node encontrado"
+        exit 1
+    fi
+    log_info "Worker: ${WORKER_NODE} (${WORKER_IP})"
+
+    # Verificar stargz-snapshotter
+    log_info "Verificando stargz-snapshotter..."
+    local snapshotter_ok=false
+
+    if [[ -f "$TALOSCONFIG" ]]; then
+        local snap_conf
+        snap_conf=$(TALOSCONFIG="$TALOSCONFIG" talosctl -n "$WORKER_IP" \
+            read /etc/cri/conf.d/10-stargz-snapshotter.part 2>/dev/null || echo "")
+
+        if echo "$snap_conf" | grep -q "stargz"; then
+            snapshotter_ok=true
+        fi
+
+        if [[ "$snapshotter_ok" == "false" ]]; then
+            local ext
+            ext=$(TALOSCONFIG="$TALOSCONFIG" talosctl -n "$WORKER_IP" \
+                get extensionstatuses 2>/dev/null || echo "")
+            if echo "$ext" | grep -qi "stargz"; then
+                snapshotter_ok=true
+            fi
+        fi
+    fi
+
+    if [[ "$snapshotter_ok" == "true" ]]; then
+        log_success "stargz-snapshotter ativo"
+    else
+        echo ""
+        log_warn "stargz-snapshotter NÃO detectado em ${WORKER_NODE}"
+        log_warn "Sem ele, imagens eStargz são tratadas como gzip — resultados serão idênticos."
+        echo ""
+        echo -n "  Continuar mesmo assim? [s/N]: "
+        read -r choice
+        [[ "$choice" == "s" || "$choice" == "S" ]] || exit 1
+    fi
+
+    # Verificar imagens no registry
+    log_info "Verificando imagens no registry..."
+    for img in nginx node postgres; do
+        for fmt in gzip estargz; do
+            local ref="${REGISTRY}/${ORG}/${img}:${fmt}"
+            if ! docker manifest inspect "$ref" &>/dev/null; then
+                log_error "Imagem não encontrada: ${ref}"
+                log_error "Execute ./prepare-images.sh primeiro"
+                exit 1
+            fi
+        done
+    done
+    log_success "Todas as imagens encontradas"
+
+    # Verificar se o node tem cache (invalida resultados)
+    verify_clean_node
+}
+
+# ─────────────────────────────────────────────────────────────
+# 2. SETUP DO NAMESPACE
+# ─────────────────────────────────────────────────────────────
+
+setup_namespace() {
+    log_info "Preparando namespace ${NAMESPACE}..."
+
+    kubectl delete namespace "$NAMESPACE" --ignore-not-found --wait 2>/dev/null || true
+    kubectl create namespace "$NAMESPACE"
+    kubectl label namespace "$NAMESPACE" \
+        pod-security.kubernetes.io/enforce=privileged --overwrite
+
+    # Registry secret — usar credenciais do docker config
+    local docker_cfg="${HOME}/.docker/config.json"
+    if [[ -f "$docker_cfg" ]]; then
+        kubectl create secret generic bench-registry \
+            --from-file=.dockerconfigjson="$docker_cfg" \
+            --type=kubernetes.io/dockerconfigjson \
+            -n "$NAMESPACE" 2>/dev/null || true
+    else
+        log_warn "~/.docker/config.json não encontrado — pull de imagens privadas pode falhar"
+    fi
+
+    log_success "Namespace pronto"
+}
+
+# ─────────────────────────────────────────────────────────────
+# 3. VERIFICAÇÃO DE CACHE
+# ─────────────────────────────────────────────────────────────
+
+# Check whether the worker already holds any benchmark image in its cache.
+#
+# Lists the node's images once through talosctl and searches that listing for
+# each of the six image references. If any is found, the user is told how to
+# get a fresh node and asked whether to continue. If the listing cannot be
+# obtained, a warning is printed and the run continues unverified.
+#
+# Globals:   TALOSCONFIG, WORKER_IP, WORKER_NODE, REGISTRY, ORG (read)
+# Returns:   0 to continue; exits 1 when the user declines to continue.
+verify_clean_node() {
+    log_info "Verificando cache de imagens no worker..."
+
+    # Fetch the listing a single time. Capturing it (instead of piping
+    # talosctl into `grep -q` per image) also avoids a false "not cached"
+    # when grep exits early and pipefail sees the producer's SIGPIPE.
+    local image_list=""
+    local listing_ok=false
+    if [[ -f "$TALOSCONFIG" ]]; then
+        if image_list=$(TALOSCONFIG="$TALOSCONFIG" talosctl -n "$WORKER_IP" \
+                image list 2>/dev/null); then
+            listing_ok=true
+        fi
+    fi
+
+    if [[ "$listing_ok" != "true" ]]; then
+        # Do not claim a clean node when nothing could be inspected.
+        log_warn "Não foi possível listar as imagens do worker via talosctl — cache não verificado"
+        return 0
+    fi
+
+    local cached_count=0
+    local cached_images=""
+    local img fmt ref
+    for img in nginx node postgres; do
+        for fmt in gzip estargz; do
+            ref="${REGISTRY}/${ORG}/${img}:${fmt}"
+            # -F: the reference contains dots, match it literally.
+            if grep -qF -- "$ref" <<<"$image_list"; then
+                cached_count=$((cached_count + 1))
+                cached_images+="    ${ref}"$'\n'
+            fi
+        done
+    done
+
+    if [[ "$cached_count" -gt 0 ]]; then
+        echo ""
+        log_warn "O worker ${WORKER_NODE} tem ${cached_count} imagens de benchmark em cache:"
+        printf '%s\n' "$cached_images"
+        log_warn "Para resultados válidos, use um worker SEM cache."
+        echo ""
+        echo "  Opções:"
+        echo "    1. Escalar um worker novo (cluster autoscaler cria node limpo):"
+        echo "       kubectl apply -f ../test-autoscaler.yaml"
+        echo "       # Aguardar novo node ficar Ready"
+        echo "       # Re-rodar benchmark (vai usar o novo node)"
+        echo ""
+        echo "    2. Continuar mesmo assim (resultados podem ter cache hits)"
+        echo ""
+        echo -n "  Continuar com cache? [s/N]: "
+        local choice
+        read -r choice
+        [[ "$choice" == "s" || "$choice" == "S" ]] || exit 1
+    else
+        log_success "Worker sem cache de benchmark — resultados serão válidos"
+    fi
+}
+
+# ─────────────────────────────────────────────────────────────
+# 4. RUN A SINGLE TEST
+#    Output (stdout, one line): pull_ms total_ms status
+#      status = no    → cold pull measured
+#               yes   → image was already cached (pull_ms is 0)
+#               error → pod could not be created or never became Ready
+#      pull_ms = -1 when no pull duration could be determined
+# ─────────────────────────────────────────────────────────────
+
+# Create one pod pinned to the worker, time it, and delete it again.
+#
+# Arguments: $1 - image name (nginx, node, postgres)
+#            $2 - format tag (gzip, estargz)
+#            $3 - iteration number (only used in the pod name)
+# Globals:   REGISTRY, ORG, NAMESPACE, WORKER_NODE (read)
+# Outputs:   the result line described above on stdout; diagnostics on stderr,
+#            because callers capture stdout.
+run_single_test() {
+    local image=$1
+    local format=$2
+    local run=$3
+    local image_ref="${REGISTRY}/${ORG}/${image}:${format}"
+    local pod_name="b-${image}-${format}-${run}"
+
+    # Per-image container settings.
+    local env_yaml=""
+    local cmd_yaml=""
+    case $image in
+        postgres)
+            env_yaml="    env:
+    - name: POSTGRES_PASSWORD
+      value: benchtest"
+            ;;
+        *)
+            # nginx and node get a long-running command so the container
+            # does not exit right away.
+            cmd_yaml="    command: [\"sleep\", \"infinity\"]"
+            ;;
+    esac
+
+    local status="no"
+
+    # Wall-clock start.
+    local t_start
+    t_start=$(ts_ms)
+
+    # Create the pod. Failures are tracked explicitly: errexit is not active
+    # inside the command substitution this function is normally called from.
+    cat <<EOF | kubectl apply -f - -n "$NAMESPACE" >/dev/null 2>&1 || status="error"
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${pod_name}
+spec:
+  nodeName: ${WORKER_NODE}
+  restartPolicy: Never
+  terminationGracePeriodSeconds: 0
+  imagePullSecrets:
+  - name: bench-registry
+  containers:
+  - name: app
+    image: ${image_ref}
+${cmd_yaml}
+${env_yaml}
+EOF
+
+    # Wait for Ready (skipped when the pod was never created).
+    if [[ "$status" != "error" ]]; then
+        kubectl wait --for=condition=Ready "pod/${pod_name}" \
+            -n "$NAMESPACE" --timeout=300s >/dev/null 2>&1 || status="error"
+    fi
+
+    local t_ready
+    t_ready=$(ts_ms)
+    local total_ms=$(( t_ready - t_start ))
+
+    local pull_ms=-1
+
+    if [[ "$status" == "error" ]]; then
+        log_error "    pod ${pod_name} não ficou Ready — medição inválida" >&2
+    else
+        sleep 1  # give the event a moment to be recorded
+
+        # Fetch the Pulled event once: "<firstTimestamp>|<message>".
+        local pulled_info ts_pulled pull_msg
+        pulled_info=$(kubectl get events -n "$NAMESPACE" \
+            --field-selector "involvedObject.name=${pod_name},reason=Pulled" \
+            -o jsonpath='{.items[0].firstTimestamp}{"|"}{.items[0].message}' \
+            2>/dev/null || echo "")
+        ts_pulled=${pulled_info%%|*}
+        pull_msg=${pulled_info#*|}
+
+        if grep -qi "already present" <<<"$pull_msg"; then
+            status="yes"
+            pull_ms=0
+            log_warn "    ⚠ CACHE HIT — resultado inválido" >&2
+        else
+            # Pull duration = Pulled.firstTimestamp − Pulling.firstTimestamp.
+            local ts_pulling
+            ts_pulling=$(kubectl get events -n "$NAMESPACE" \
+                --field-selector "involvedObject.name=${pod_name},reason=Pulling" \
+                -o jsonpath='{.items[0].firstTimestamp}' 2>/dev/null || echo "")
+
+            if [[ -n "$ts_pulling" && -n "$ts_pulled" ]]; then
+                local ms_start ms_end
+                # An unparsable timestamp leaves pull_ms at -1 instead of
+                # breaking the arithmetic below.
+                ms_start=$(iso_to_ms "$ts_pulling" 2>/dev/null) || ms_start=""
+                ms_end=$(iso_to_ms "$ts_pulled" 2>/dev/null) || ms_end=""
+                if [[ -n "$ms_start" && -n "$ms_end" ]]; then
+                    pull_ms=$(( ms_end - ms_start ))
+                fi
+            fi
+        fi
+    fi
+
+    # Remove the pod (also after a failure, so nothing is left behind).
+    kubectl delete pod "$pod_name" -n "$NAMESPACE" \
+        --grace-period=0 --force >/dev/null 2>&1 || true
+
+    # Wait until it is gone before the next iteration starts.
+    kubectl wait --for=delete "pod/${pod_name}" \
+        -n "$NAMESPACE" --timeout=30s >/dev/null 2>&1 || true
+
+    echo "${pull_ms} ${total_ms} ${status}"
+}
+
+# ─────────────────────────────────────────────────────────────
+# 5. RELATÓRIO (lê resultados do RESULTS_DIR)
+# ─────────────────────────────────────────────────────────────
+
+# File-backed key/value store under RESULTS_DIR (one file per key), used
+# because bash 3.2 has no associative arrays.
+#   rset KEY VALUE     - overwrite KEY with VALUE
+#   rget KEY [DEFAULT] - print KEY's value; print DEFAULT (0 when omitted)
+#                        if the key's file cannot be read
+#   radd KEY [N]       - add integer N (default 0) to KEY's current value
+rset() {
+    echo "$2" > "${RESULTS_DIR}/$1"
+}
+
+rget() {
+    if ! cat "${RESULTS_DIR}/$1" 2>/dev/null; then
+        echo "${2:-0}"
+    fi
+}
+
+radd() {
+    local current increment
+    current=$(rget "$1" 0)
+    increment="${2:-0}"
+    rset "$1" $(( current + increment ))
+}
+
+print_report() {
+    echo ""
+    echo -e "${CYAN}════════════════════════════════════════════════════════════════════════════${NC}"
+    echo -e "${BOLD}  RESULTADOS — eStargz vs GZIP Cold Pull${NC}"
+    echo -e "${CYAN}════════════════════════════════════════════════════════════════════════════${NC}"
+    echo ""
+    echo "  Worker: ${WORKER_NODE}"
+    echo "  Iterações: ${ITERATIONS}"
+    echo ""
+
+    printf "  %-12s │ %-8s │ %10s │ %10s │ %10s │ %7s\n" \
+        "IMAGEM" "FORMATO" "PULL (ms)" "TOTAL (ms)" "PULL (s)" "CACHE?"
+    echo "  ─────────────┼──────────┼────────────┼────────────┼────────────┼────────"
+
+    for img in nginx node postgres; do
+        for fmt in gzip estargz; do
+            local pull_avg; pull_avg=$(rget "${img}_${fmt}_pull_avg" "?")
+            local total_avg; total_avg=$(rget "${img}_${fmt}_total_avg" "?")
+            local cache_hits; cache_hits=$(rget "${img}_${fmt}_cache" 0)
+            local pull_sec="?"
+
+            if [[ "$pull_avg" != "?" ]] && [[ "$pull_avg" -ge 0 ]] 2>/dev/null; then
+                pull_sec=$(python3 -c "print(f'{${pull_avg}/1000:.1f}')")
+            fi
+
+            printf "  %-12s │ %-8s │ %10s │ %10s │ %10s │ %d/%d\n" \
+                "$img" "$fmt" "$pull_avg" "$total_avg" "$pull_sec" \
+                "$cache_hits" "$ITERATIONS"
+        done
+        echo "  ─────────────┼──────────┼────────────┼────────────┼────────────┼────────"
+    done
+
+    echo ""
+
+    # Comparação resumida
+    echo -e "  ${BOLD}Comparação (tempo médio de pull):${NC}"
+    echo ""
+    for img in nginx node postgres; do
+        local gzip_pull; gzip_pull=$(rget "${img}_gzip_pull_avg" 0)
+        local estargz_pull; estargz_pull=$(rget "${img}_estargz_pull_avg" 0)
+
+        if [[ "$gzip_pull" != "?" ]] && [[ "$estargz_pull" != "?" ]] && [[ "$gzip_pull" -gt 0 ]] && [[ "$estargz_pull" -gt 0 ]]; then
+            local diff_ms=$(( gzip_pull - estargz_pull ))
+            local ratio
+            ratio=$(python3 -c "print(f'{${gzip_pull}/${estargz_pull}:.1f}')" 2>/dev/null || echo "?")
+
+            if [[ "$diff_ms" -gt 0 ]]; then
+                echo -e "  ${img}: eStargz ${GREEN}${ratio}x mais rápido${NC} (${diff_ms}ms menos)"
+            elif [[ "$diff_ms" -lt 0 ]]; then
+                local abs_diff=$(( -diff_ms ))
+                echo -e "  ${img}: eStargz ${RED}${abs_diff}ms mais lento${NC}"
+            else
+                echo "  ${img}: resultados idênticos"
+            fi
+        else
+            echo "  ${img}: dados insuficientes para comparação"
+        fi
+    done
+
+    echo ""
+    echo -e "${CYAN}════════════════════════════════════════════════════════════════════════════${NC}"
+    echo ""
+    echo "  Notas:"
+    echo "    - PULL (ms): tempo entre eventos Pulling→Pulled (resolução ~1s)"
+    echo "    - TOTAL (ms): tempo de parede até pod Ready (inclui pull + startup)"
+    echo "    - CACHE? mostra quantos testes tiveram cache hit (deveria ser 0/N)"
+    echo "    - Se CACHE? > 0, a limpeza de cache falhou e o resultado é inválido"
+    echo ""
+}
+
+# ─────────────────────────────────────────────────────────────
+# MAIN
+# ─────────────────────────────────────────────────────────────
+
+# Scratch directory that accumulates the results (stands in for `declare -A`).
+RESULTS_DIR=$(mktemp -d)
+# Single quotes defer expansion until the trap fires and keep the path
+# quoted; ${RESULTS_DIR:?} refuses to run `rm -rf` on an empty path.
+trap 'rm -rf -- "${RESULTS_DIR:?}"' EXIT
+
+# Reject a non-numeric or zero iteration count before touching the cluster.
+if ! [[ "$ITERATIONS" =~ ^[1-9][0-9]*$ ]]; then
+    log_error "BENCH_ITERATIONS inválido: '${ITERATIONS}' (use um inteiro >= 1)"
+    exit 1
+fi
+
+preflight
+setup_namespace
+
+for img in nginx node postgres; do
+    echo ""
+    echo -e "${CYAN}────────────────────────────────────────────────${NC}"
+    echo -e "  ${BOLD}${img}${NC}"
+    echo -e "${CYAN}────────────────────────────────────────────────${NC}"
+
+    for fmt in gzip estargz; do
+        key="${img}_${fmt}"
+        rset "${key}_pull_sum" 0
+        rset "${key}_total_sum" 0
+        rset "${key}_count" 0
+        rset "${key}_cache" 0
+
+        for (( run = 1; run <= ITERATIONS; run++ )); do
+            echo -n "  ${fmt} #${run}/${ITERATIONS} ... "
+
+            # Result line: "<pull_ms> <total_ms> <status>".
+            result=$(run_single_test "$img" "$fmt" "$run")
+            pull_ms="" total_ms="" cached=""
+            read -r pull_ms total_ms cached <<<"$result" || true
+
+            if [[ "$cached" == "yes" ]]; then
+                radd "${key}_cache" 1
+                echo -e "${YELLOW}cache hit${NC} (total: ${total_ms}ms)"
+            elif ! [[ "$pull_ms" =~ ^[0-9]+$ && "$total_ms" =~ ^[0-9]+$ ]]; then
+                # pull_ms is -1 when no pull duration could be determined;
+                # adding it to the sums would silently skew the averages.
+                echo -e "${RED}medição inválida — descartada${NC} (pull: ${pull_ms}ms  total: ${total_ms}ms)"
+            else
+                radd "${key}_pull_sum" "$pull_ms"
+                radd "${key}_total_sum" "$total_ms"
+                radd "${key}_count" 1
+                echo "pull: ${pull_ms}ms  total: ${total_ms}ms"
+            fi
+        done
+
+        # Averages over the valid (non-cached, measurable) samples only.
+        count=$(rget "${key}_count" 0)
+        if [[ "$count" -gt 0 ]]; then
+            rset "${key}_pull_avg" $(( $(rget "${key}_pull_sum") / count ))
+            rset "${key}_total_avg" $(( $(rget "${key}_total_sum") / count ))
+        else
+            rset "${key}_pull_avg" "?"
+            rset "${key}_total_avg" "?"
+        fi
+    done
+done
+
+print_report
+
+# Cleanup (best effort).
+log_info "Limpando namespace de benchmark..."
+kubectl delete namespace "$NAMESPACE" --ignore-not-found >/dev/null 2>&1 || true
+log_success "Benchmark concluído"