From 4b92838ac32344e7ebe93e0d9e6a19a474a25803 Mon Sep 17 00:00:00 2001 From: ArgoCD Setup Date: Thu, 8 Jan 2026 17:11:28 -0300 Subject: [PATCH] aula-12: Victoria Metrics + Grafana via GitOps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stack completo de observabilidade: - Victoria Metrics (vmsingle, vmagent, vmalert) - Grafana com dashboards built-in - Alertas customizados (PVC, pods, nodes, deployments) - pvc-autoresizer para expansão automática de volumes - Queries PromQL documentadas Instalação via ArgoCD seguindo padrão GitOps da aula-11. --- aula-12/README.md | 446 +++++++++++++++++ aula-12/alerts/vmalert-rules.yaml | 100 ++++ aula-12/argocd/application.yaml | 54 ++ aula-12/cleanup.sh | 167 +++++++ .../gitops/apps/victoria-metrics/Chart.yaml | 11 + .../victoria-metrics/templates/alerts.yaml | 179 +++++++ .../gitops/apps/victoria-metrics/values.yaml | 192 +++++++ aula-12/queries/useful-queries.md | 323 ++++++++++++ aula-12/setup.sh | 467 ++++++++++++++++++ 9 files changed, 1939 insertions(+) create mode 100644 aula-12/README.md create mode 100644 aula-12/alerts/vmalert-rules.yaml create mode 100644 aula-12/argocd/application.yaml create mode 100755 aula-12/cleanup.sh create mode 100644 aula-12/gitops/apps/victoria-metrics/Chart.yaml create mode 100644 aula-12/gitops/apps/victoria-metrics/templates/alerts.yaml create mode 100644 aula-12/gitops/apps/victoria-metrics/values.yaml create mode 100644 aula-12/queries/useful-queries.md create mode 100755 aula-12/setup.sh diff --git a/aula-12/README.md b/aula-12/README.md new file mode 100644 index 0000000..5ec6eca --- /dev/null +++ b/aula-12/README.md @@ -0,0 +1,446 @@ +# Aula 12 - Victoria Metrics (Observabilidade) + +Stack completo de monitoramento com Victoria Metrics, Grafana e alertas, instalado via **GitOps com ArgoCD**. + +## Por que Victoria Metrics? 
+ +### Alternativa ao Prometheus + +Victoria Metrics oferece **compatibilidade total** com Prometheus, mas com vantagens: + +| Feature | Prometheus | Victoria Metrics | +|---------|------------|------------------| +| Consumo de RAM | Alto | ~7x menor | +| Compressão | Básica | ~10x melhor | +| Velocidade | Normal | ~2-3x mais rápido | +| Setup HA | Complexo (Thanos) | Simples | +| API | PromQL | PromQL + extensões | + +### Diferença da Aula-05 + +| Aula-05 (KEDA) | Aula-12 (Observabilidade) | +|----------------|---------------------------| +| Foco: Auto-scaling | Foco: Monitoramento | +| VM básico (sem Grafana) | VM + Grafana completo | +| Queries para KEDA | Queries para operações | +| Ambiente local | Cluster Hetzner + GitOps | + +## Arquitetura + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Victoria Metrics Stack │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ +│ │ vmagent │ │ vmsingle │ │ Grafana │ │ +│ │ (coleta) │──│ (storage) │──│ (visualização) │ │ +│ └─────────────┘ └─────────────┘ └─────────────────────────┘ │ +│ │ │ │ │ +│ ┌──────┴──────┐ ┌─────┴─────┐ ┌─────┴─────┐ │ +│ │ node-export │ │ vmalert │ │ Dashboards│ │ +│ │ kube-state │ │ (alertas) │ │ pré-config│ │ +│ └─────────────┘ └───────────┘ └───────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Componentes + +| Componente | Função | +|------------|--------| +| **vmsingle** | Armazena métricas (modo single-node) | +| **vmagent** | Coleta métricas via scrape | +| **vmalert** | Avalia regras e dispara alertas | +| **Grafana** | Interface de visualização | +| **kube-state-metrics** | Métricas de objetos K8s (pods, deployments) | +| **node-exporter** | Métricas de nodes (CPU, RAM, disco) | + +## Pré-requisitos + +- Cluster Kubernetes Hetzner (aula-08) +- ArgoCD instalado (aula-11) +- GitLab com grupo `factory` (aula-11) + +## Estrutura + +``` 
+aula-12/ +├── README.md # Este arquivo +├── setup.sh # Instalação automatizada +├── cleanup.sh # Remoção via ArgoCD +│ +├── gitops/ # Conteúdo para GitLab +│ └── apps/ +│ └── victoria-metrics/ +│ ├── Chart.yaml # Helm chart wrapper +│ ├── values.yaml # Configurações do stack +│ └── templates/ +│ └── alerts.yaml # VMRule customizado +│ +├── argocd/ +│ └── application.yaml # ArgoCD Application CRD +│ +├── alerts/ +│ └── vmalert-rules.yaml # Referência (aplicada via gitops/) +│ +└── queries/ + └── useful-queries.md # Queries PromQL úteis +``` + +## Instalação (GitOps) + +### 1. Executar Setup + +```bash +cd aula-12 +./setup.sh +``` + +O script: +1. Cria projeto `factory/monitoring` no GitLab +2. Faz push dos manifests GitOps +3. Cria namespace `monitoring` +4. Aplica ArgoCD Application +5. Aguarda sincronização + +### 2. Fluxo GitOps + +``` +┌──────────────────┐ push ┌──────────────────┐ +│ aula-12/ │ ──────────► │ factory/monitoring│ +│ gitops/ │ │ (GitLab) │ +└──────────────────┘ └────────┬─────────┘ + │ + sync │ + ▼ + ┌──────────────────┐ + │ ArgoCD │ + └────────┬─────────┘ + │ + apply │ + ▼ + ┌──────────────────┐ + │ Kubernetes │ + │ ns: monitoring │ + └──────────────────┘ +``` + +### 3. 
Verificar Instalação + +```bash +# Status do ArgoCD Application +kubectl get application monitoring -n argocd + +# Pods rodando +kubectl get pods -n monitoring + +# Serviços +kubectl get svc -n monitoring +``` + +## Configuração de DNS + +Antes de acessar o Grafana, configure o DNS: + +```bash +# Obter IP do Load Balancer +kubectl get svc -n ingress-nginx ingress-nginx-controller -o jsonpath='{.status.loadBalancer.ingress[0].ip}' +``` + +Configure o registro DNS: +- **grafana.kube.quest** → IP do Load Balancer (ex: 49.13.44.102) + +## Acessar Grafana + +### Via Ingress (Recomendado) + +Após configurar o DNS: +- **URL**: https://grafana.kube.quest +- Certificado TLS via Let's Encrypt (automático) + +### Credenciais + +- **Usuário**: admin +- **Senha**: (gerada automaticamente) + +```bash +# Obter senha do Grafana +kubectl get secret monitoring-grafana -n monitoring \ + -o jsonpath='{.data.admin-password}' | base64 -d; echo +``` + +### Via Port-Forward (Alternativa) + +Se não tiver DNS configurado: + +```bash +# Port-forward Grafana +kubectl port-forward -n monitoring svc/monitoring-grafana 3000:80 + +# Acessar +open http://localhost:3000 +``` + +## Dashboards Incluídos + +O chart `victoria-metrics-k8s-stack` já inclui **20+ dashboards** via Grafana sidecar: + +### Dashboards Principais + +| Dashboard | Descrição | +|-----------|-----------| +| **Kubernetes / Views / Global** | Visão geral do cluster | +| **Kubernetes / Views / Namespaces** | Recursos por namespace | +| **Kubernetes / Views / Nodes** | CPU, memória, disco por node | +| **Kubernetes / Views / Pods** | Métricas detalhadas de pods | +| **Node Exporter / Nodes** | Métricas de sistema operacional | +| **Node Exporter / USE Method / Node** | Utilização, Saturação, Erros | +| **VictoriaMetrics / vmagent** | Status da coleta de métricas | +| **VictoriaMetrics / vmsingle** | Status do storage de métricas | + +### Ver Todos os Dashboards + +No Grafana: +1. Menu lateral → **Dashboards** +2. 
Ou acesse: https://grafana.kube.quest/dashboards + +## Alertas Configurados + +| Alerta | Condição | Severidade | +|--------|----------|------------| +| PVCAlmostFull | PVC > 80% | warning | +| PVCFull | PVC > 95% | critical | +| PodCrashLooping | Restarts > 5/hora | warning | +| PodNotReady | Pod não Ready > 5min | warning | +| NodeHighCPU | CPU > 90% por 5min | warning | +| NodeLowMemory | RAM livre < 10% | warning | +| NodeDiskFull | Disco > 85% | critical | + +### Verificar Alertas + +```bash +# Ver alertas ativos +kubectl get vmrule -n monitoring + +# Ver status no VMAlert +kubectl port-forward -n monitoring svc/vmalert 8880:8880 +open http://localhost:8880/alerts +``` + +## Queries PromQL Úteis + +### Uso de Storage + +```promql +# Uso de PVC em porcentagem +kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 + +# PVCs acima de 80% +(kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.8 + +# Espaço disponível por PVC +kubelet_volume_stats_available_bytes +``` + +### CPU e Memória + +```promql +# CPU por pod (cores) +sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, namespace) + +# Memória por namespace +sum(container_memory_working_set_bytes{container!=""}) by (namespace) + +# CPU por node (%) +100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) +``` + +### Pods e Containers + +```promql +# Pods restartando na última hora +sum(increase(kube_pod_container_status_restarts_total[1h])) by (pod, namespace) > 0 + +# Pods não Ready +kube_pod_status_ready{condition="false"} + +# Pods em CrashLoopBackOff +kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} +``` + +### Network + +```promql +# Bytes recebidos por pod +sum(rate(container_network_receive_bytes_total[5m])) by (pod) + +# Bytes enviados por pod +sum(rate(container_network_transmit_bytes_total[5m])) by (pod) +``` + +## Integrações + +### Com KEDA (Aula-05) + +Victoria Metrics pode alimentar KEDA 
para auto-scaling: + +```yaml +triggers: + - type: prometheus + metadata: + serverAddress: http://vmsingle-vm.monitoring:8429 + query: sum(kube_deployment_status_replicas_unavailable{deployment="myapp"}) + threshold: '1' +``` + +### Com Alertmanager + +Para notificações (Slack, Email, PagerDuty): + +```yaml +# values.yaml +vmalert: + alertmanager: + url: http://alertmanager.monitoring:9093 +``` + +## Troubleshooting + +### Grafana não carrega dashboards + +```bash +# Verificar configmap de dashboards +kubectl get configmap -n monitoring | grep dashboard + +# Verificar logs do sidecar +kubectl logs -n monitoring -l app.kubernetes.io/name=grafana -c sc-dashboard +``` + +### Métricas não aparecem + +```bash +# Verificar targets do vmagent +kubectl port-forward -n monitoring svc/vmagent 8429:8429 +open http://localhost:8429/targets + +# Verificar scrape configs +kubectl get configmap -n monitoring vmagent-config -o yaml +``` + +### VMAlert não dispara alertas + +```bash +# Verificar regras carregadas +kubectl port-forward -n monitoring svc/vmalert 8880:8880 +open http://localhost:8880/rules + +# Verificar logs +kubectl logs -n monitoring -l app=vmalert +``` + +### ArgoCD mostra OutOfSync + +```bash +# Ver diff +argocd app diff monitoring + +# Forçar sync +argocd app sync monitoring --prune +``` + +## Auto-Resize de PVC (Opcional) + +Expansão automática de volumes quando atingirem 80% da capacidade. 
+
+### Instalar pvc-autoresizer
+
+```bash
+# Adicionar repo Helm
+helm repo add pvc-autoresizer https://topolvm.github.io/pvc-autoresizer/
+
+# Instalar com Victoria Metrics como fonte de métricas
+helm install pvc-autoresizer pvc-autoresizer/pvc-autoresizer \
+  --namespace pvc-autoresizer \
+  --create-namespace \
+  --set controller.args.prometheusURL=http://vmsingle-monitoring-victoria-metrics-k8s-stack.monitoring:8429
+
+# Habilitar auto-resize no StorageClass
+kubectl annotate storageclass hcloud-volumes resize.topolvm.io/enabled="true"
+```
+
+### Anotar PVCs para Auto-Resize
+
+```bash
+# Anotar um PVC específico
+kubectl annotate pvc <pvc-name> -n <namespace> \
+  resize.topolvm.io/storage_limit="50Gi" \
+  resize.topolvm.io/threshold="20%" \
+  resize.topolvm.io/increase="10Gi"
+
+# Anotar TODOS os PVCs
+kubectl get pvc -A --no-headers | awk '{print $1, $2}' | \
+  xargs -n2 sh -c 'kubectl annotate pvc "$1" -n "$0" \
+    resize.topolvm.io/storage_limit="50Gi" \
+    resize.topolvm.io/threshold="20%" \
+    resize.topolvm.io/increase="10Gi" --overwrite'
+```
+
+### Configuração das Annotations
+
+| Annotation | Valor | Descrição |
+|------------|-------|-----------|
+| `storage_limit` | `10Ti` | Limite máximo (10TB = max Hetzner) |
+| `threshold` | `20%` | Expandir quando free < 20% (usado > 80%) |
+| `increase` | `10Gi` | Quanto aumentar por vez |
+
+### Alerta de Notificação
+
+Quando um PVC é redimensionado, o alerta **PVCAutoResized** é disparado automaticamente (severity: info).
+ +### Verificar Funcionamento + +```bash +# Logs do controller +kubectl logs -n pvc-autoresizer deployment/pvc-autoresizer-controller + +# Ver PVCs com auto-resize habilitado +kubectl get pvc -A -o jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name}: {.metadata.annotations.resize\.topolvm\.io/storage_limit}{"\n"}{end}' +``` + +### Referência + +- [pvc-autoresizer GitHub](https://github.com/topolvm/pvc-autoresizer) + +--- + +## Retenção de Dados + +Configuração padrão: **14 dias** + +Para alterar: + +```yaml +# gitops/apps/victoria-metrics/values.yaml +vmsingle: + retentionPeriod: "30d" +``` + +## Cleanup + +```bash +./cleanup.sh +``` + +O script remove: +1. ArgoCD Application +2. Namespace `monitoring` +3. Projeto GitLab (opcional) + +**Nota**: Dados de métricas serão perdidos! + +## Referências + +- [Victoria Metrics Docs](https://docs.victoriametrics.com/) +- [victoria-metrics-k8s-stack Chart](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack) +- [Grafana Dashboards](https://grafana.com/grafana/dashboards/) +- [PromQL Cheat Sheet](https://promlabs.com/promql-cheat-sheet/) +- [pvc-autoresizer](https://github.com/topolvm/pvc-autoresizer) - Auto-resize de volumes diff --git a/aula-12/alerts/vmalert-rules.yaml b/aula-12/alerts/vmalert-rules.yaml new file mode 100644 index 0000000..890a9cd --- /dev/null +++ b/aula-12/alerts/vmalert-rules.yaml @@ -0,0 +1,100 @@ +# ============================================================================= +# VMRule - Regras de Alerta (Referência) +# ============================================================================= +# +# Este arquivo é apenas referência. 
A versão aplicada está em: +# gitops/apps/victoria-metrics/templates/alerts.yaml +# +# Para aplicar manualmente (sem GitOps): +# kubectl apply -f alerts/vmalert-rules.yaml -n monitoring +# +# ============================================================================= + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: custom-alerts + namespace: monitoring +spec: + groups: + # ------------------------------------------------------------------------- + # Alertas de Storage/PVC + # ------------------------------------------------------------------------- + - name: storage-alerts + rules: + - alert: PVCAlmostFull + expr: | + (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.8 + and + (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) < 0.95 + for: 5m + labels: + severity: warning + annotations: + summary: "PVC {{ $labels.persistentvolumeclaim }} está acima de 80%" + description: "PVC {{ $labels.persistentvolumeclaim }} no namespace {{ $labels.namespace }} está usando {{ $value | humanizePercentage }} do espaço." + + - alert: PVCFull + expr: | + (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.95 + for: 1m + labels: + severity: critical + annotations: + summary: "PVC {{ $labels.persistentvolumeclaim }} está CHEIO (>95%)" + description: "AÇÃO URGENTE NECESSÁRIA!" 
+ + # ------------------------------------------------------------------------- + # Alertas de Pods + # ------------------------------------------------------------------------- + - name: pod-alerts + rules: + - alert: PodCrashLooping + expr: | + sum(increase(kube_pod_container_status_restarts_total[1h])) by (pod, namespace) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Pod {{ $labels.pod }} está em CrashLoop" + + - alert: PodNotReady + expr: | + kube_pod_status_ready{condition="false"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Pod {{ $labels.pod }} não está Ready" + + # ------------------------------------------------------------------------- + # Alertas de Nodes + # ------------------------------------------------------------------------- + - name: node-alerts + rules: + - alert: NodeHighCPU + expr: | + 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{ $labels.instance }} com CPU alta" + + - alert: NodeLowMemory + expr: | + (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{ $labels.instance }} com pouca memória" + + - alert: NodeDiskFull + expr: | + (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15 + for: 5m + labels: + severity: critical + annotations: + summary: "Disco do Node {{ $labels.instance }} quase cheio" diff --git a/aula-12/argocd/application.yaml b/aula-12/argocd/application.yaml new file mode 100644 index 0000000..9661003 --- /dev/null +++ b/aula-12/argocd/application.yaml @@ -0,0 +1,54 @@ +# ============================================================================= +# ArgoCD Application - Victoria Metrics +# ============================================================================= +# +# Este arquivo é referência. 
O setup.sh aplica automaticamente. +# Para aplicar manualmente: +# kubectl apply -f argocd/application.yaml +# +# ============================================================================= + +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: monitoring + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + + source: + # URL do repositório GitLab (ajustar conforme seu domínio) + repoURL: https://git.kube.quest/factory/monitoring.git + targetRevision: HEAD + path: apps/victoria-metrics + + # Helm com arquivo de valores + helm: + valueFiles: + - values.yaml + + destination: + server: https://kubernetes.default.svc + namespace: monitoring + + syncPolicy: + automated: + # Remover recursos que não existem mais no Git + prune: true + # Reverter mudanças manuais + selfHeal: true + + syncOptions: + # Criar namespace se não existir + - CreateNamespace=true + # Usar server-side apply (melhor para CRDs) + - ServerSideApply=true + + retry: + limit: 5 + backoff: + duration: 5s + factor: 2 + maxDuration: 3m diff --git a/aula-12/cleanup.sh b/aula-12/cleanup.sh new file mode 100755 index 0000000..2db887b --- /dev/null +++ b/aula-12/cleanup.sh @@ -0,0 +1,167 @@ +#!/bin/bash +# ============================================================================= +# Aula 12 - Cleanup Victoria Metrics +# ============================================================================= +# +# Remove Victoria Metrics stack via ArgoCD +# +# ============================================================================= + +set -e + +# Cores para output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Funções de log +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e "${GREEN}[OK]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +# Diretório do script +SCRIPT_DIR="$(cd "$(dirname 
"${BASH_SOURCE[0]}")" && pwd)" +ENV_FILE="${SCRIPT_DIR}/.env" + +# Carregar configuração +if [[ -f "$ENV_FILE" ]]; then + source "$ENV_FILE" +fi + +echo "" +echo "==========================================" +echo " Cleanup Victoria Metrics" +echo "==========================================" +echo "" +echo "Este script irá remover:" +echo " - ArgoCD Application 'monitoring'" +echo " - Todos os recursos no namespace 'monitoring'" +echo " - Secret do repositório no ArgoCD" +echo "" +echo "ATENÇÃO: Dados de métricas serão PERDIDOS!" +echo "" +read -p "Continuar? [y/N]: " CONFIRM + +if [[ "$CONFIRM" != "y" && "$CONFIRM" != "Y" ]]; then + log_info "Cancelado pelo usuário" + exit 0 +fi + +# ============================================================================= +# REMOVER ARGOCD APPLICATION +# ============================================================================= + +log_info "Removendo ArgoCD Application..." + +if kubectl get application monitoring -n argocd &> /dev/null; then + # Remover finalizers para permitir deleção + kubectl patch application monitoring -n argocd \ + --type json \ + --patch='[{"op": "remove", "path": "/metadata/finalizers"}]' 2>/dev/null || true + + kubectl delete application monitoring -n argocd --wait=false + log_success "ArgoCD Application removida" +else + log_info "ArgoCD Application já não existe" +fi + +# Aguardar recursos serem deletados pelo ArgoCD +log_info "Aguardando ArgoCD deletar recursos..." +sleep 5 + +# ============================================================================= +# REMOVER NAMESPACE +# ============================================================================= + +log_info "Removendo namespace 'monitoring'..." 
+ +if kubectl get namespace monitoring &> /dev/null; then + # Forçar remoção de recursos se necessário + kubectl delete all --all -n monitoring --timeout=60s 2>/dev/null || true + kubectl delete pvc --all -n monitoring --timeout=60s 2>/dev/null || true + kubectl delete configmap --all -n monitoring --timeout=60s 2>/dev/null || true + kubectl delete secret --all -n monitoring --timeout=60s 2>/dev/null || true + + kubectl delete namespace monitoring --timeout=120s 2>/dev/null || { + log_warn "Timeout deletando namespace, forçando..." + kubectl get namespace monitoring -o json | \ + jq '.spec.finalizers = []' | \ + kubectl replace --raw "/api/v1/namespaces/monitoring/finalize" -f - 2>/dev/null || true + } + log_success "Namespace 'monitoring' removido" +else + log_info "Namespace 'monitoring' já não existe" +fi + +# ============================================================================= +# REMOVER SECRET DO REPOSITÓRIO +# ============================================================================= + +log_info "Removendo secret do repositório..." + +if kubectl get secret factory-monitoring-repo -n argocd &> /dev/null; then + kubectl delete secret factory-monitoring-repo -n argocd + log_success "Secret do repositório removido" +else + log_info "Secret do repositório já não existe" +fi + +# ============================================================================= +# REMOVER PROJETO GITLAB (OPCIONAL) +# ============================================================================= + +if [[ -n "$GITLAB_HOST" && -n "$GITLAB_TOKEN" ]]; then + echo "" + read -p "Remover projeto 'factory/monitoring' do GitLab? [y/N]: " REMOVE_PROJECT + + if [[ "$REMOVE_PROJECT" == "y" || "$REMOVE_PROJECT" == "Y" ]]; then + log_info "Removendo projeto do GitLab..." 
+ + # Obter ID do projeto + PROJECT_RESPONSE=$(curl -s --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" \ + "https://${GITLAB_HOST}/api/v4/projects/factory%2Fmonitoring") + + PROJECT_ID=$(echo "$PROJECT_RESPONSE" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2) + + if [[ -n "$PROJECT_ID" ]]; then + curl -s --request DELETE --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" \ + "https://${GITLAB_HOST}/api/v4/projects/${PROJECT_ID}" + log_success "Projeto removido do GitLab" + else + log_info "Projeto não encontrado no GitLab" + fi + fi +fi + +# ============================================================================= +# LIMPAR ARQUIVO .ENV +# ============================================================================= + +echo "" +read -p "Remover arquivo de configuração local (.env)? [y/N]: " REMOVE_ENV + +if [[ "$REMOVE_ENV" == "y" || "$REMOVE_ENV" == "Y" ]]; then + rm -f "$ENV_FILE" + log_success "Arquivo .env removido" +fi + +# ============================================================================= +# FINALIZAÇÃO +# ============================================================================= + +echo "" +echo "==========================================" +echo " Cleanup Concluído!" 
+echo "==========================================" +echo "" +echo "Recursos removidos:" +echo " - ArgoCD Application 'monitoring'" +echo " - Namespace 'monitoring'" +echo " - Secret do repositório no ArgoCD" +echo "" +echo "Para reinstalar:" +echo " ./setup.sh" +echo "" diff --git a/aula-12/gitops/apps/victoria-metrics/Chart.yaml b/aula-12/gitops/apps/victoria-metrics/Chart.yaml new file mode 100644 index 0000000..301800a --- /dev/null +++ b/aula-12/gitops/apps/victoria-metrics/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +name: monitoring-stack +description: Victoria Metrics stack for Kubernetes monitoring +type: application +version: 1.0.0 +appVersion: "1.0.0" + +dependencies: + - name: victoria-metrics-k8s-stack + version: "0.28.4" + repository: https://victoriametrics.github.io/helm-charts/ diff --git a/aula-12/gitops/apps/victoria-metrics/templates/alerts.yaml b/aula-12/gitops/apps/victoria-metrics/templates/alerts.yaml new file mode 100644 index 0000000..fe77ce1 --- /dev/null +++ b/aula-12/gitops/apps/victoria-metrics/templates/alerts.yaml @@ -0,0 +1,179 @@ +# ============================================================================= +# VMRule - Regras de Alerta Customizadas +# ============================================================================= +# Documentação: https://docs.victoriametrics.com/operator/resources/vmrule/ +# ============================================================================= + +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: custom-alerts + namespace: {{ .Release.Namespace }} +spec: + groups: + # ------------------------------------------------------------------------- + # Alertas de Storage/PVC + # ------------------------------------------------------------------------- + - name: storage-alerts + rules: + - alert: PVCAlmostFull + expr: | + (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.8 + and + (kubelet_volume_stats_used_bytes / 
kubelet_volume_stats_capacity_bytes) < 0.95 + for: 5m + labels: + severity: warning + annotations: + summary: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} está acima de 80%" + description: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} está usando {{ "{{" }} $value | humanizePercentage {{ "}}" }} do espaço." + + - alert: PVCFull + expr: | + (kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.95 + for: 1m + labels: + severity: critical + annotations: + summary: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} está CHEIO (>95%)" + description: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} está usando {{ "{{" }} $value | humanizePercentage {{ "}}" }} do espaço. AÇÃO URGENTE NECESSÁRIA!" + + - alert: PVCNotBound + expr: | + kube_persistentvolumeclaim_status_phase{phase!="Bound"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} não está Bound" + description: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} está no estado {{ "{{" }} $labels.phase {{ "}}" }}." + + # ------------------------------------------------------------------------- + # Alertas de Pods + # ------------------------------------------------------------------------- + - name: pod-alerts + rules: + - alert: PodCrashLooping + expr: | + sum(increase(kube_pod_container_status_restarts_total[1h])) by (pod, namespace) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Pod {{ "{{" }} $labels.pod {{ "}}" }} está em CrashLoop" + description: "Pod {{ "{{" }} $labels.pod {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} reiniciou {{ "{{" }} $value {{ "}}" }} vezes na última hora." 
+ + - alert: PodNotReady + expr: | + kube_pod_status_ready{condition="false"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Pod {{ "{{" }} $labels.pod {{ "}}" }} não está Ready" + description: "Pod {{ "{{" }} $labels.pod {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} não está pronto há mais de 5 minutos." + + - alert: PodPending + expr: | + kube_pod_status_phase{phase="Pending"} == 1 + for: 10m + labels: + severity: warning + annotations: + summary: "Pod {{ "{{" }} $labels.pod {{ "}}" }} está Pending" + description: "Pod {{ "{{" }} $labels.pod {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} está pendente há mais de 10 minutos. Verifique recursos ou imagens." + + - alert: ContainerOOMKilled + expr: | + kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1 + for: 0m + labels: + severity: warning + annotations: + summary: "Container {{ "{{" }} $labels.container {{ "}}" }} foi OOMKilled" + description: "Container {{ "{{" }} $labels.container {{ "}}" }} no pod {{ "{{" }} $labels.pod {{ "}}" }} foi terminado por falta de memória." + + # ------------------------------------------------------------------------- + # Alertas de Nodes + # ------------------------------------------------------------------------- + - name: node-alerts + rules: + - alert: NodeHighCPU + expr: | + 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{ "{{" }} $labels.instance {{ "}}" }} com CPU alta" + description: "Node {{ "{{" }} $labels.instance {{ "}}" }} está usando {{ "{{" }} $value | humanize {{ "}}" }}% de CPU há mais de 5 minutos." 
+ + - alert: NodeLowMemory + expr: | + (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1 + for: 5m + labels: + severity: warning + annotations: + summary: "Node {{ "{{" }} $labels.instance {{ "}}" }} com pouca memória" + description: "Node {{ "{{" }} $labels.instance {{ "}}" }} tem apenas {{ "{{" }} $value | humanizePercentage {{ "}}" }} de memória disponível." + + - alert: NodeDiskFull + expr: | + (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15 + for: 5m + labels: + severity: critical + annotations: + summary: "Disco do Node {{ "{{" }} $labels.instance {{ "}}" }} quase cheio" + description: "Disco {{ "{{" }} $labels.mountpoint {{ "}}" }} no node {{ "{{" }} $labels.instance {{ "}}" }} tem apenas {{ "{{" }} $value | humanizePercentage {{ "}}" }} livre." + + - alert: NodeNotReady + expr: | + kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Node {{ "{{" }} $labels.node {{ "}}" }} não está Ready" + description: "Node {{ "{{" }} $labels.node {{ "}}" }} não está no estado Ready há mais de 5 minutos." + + # ------------------------------------------------------------------------- + # Alertas de Deployments + # ------------------------------------------------------------------------- + - name: deployment-alerts + rules: + - alert: DeploymentReplicasMismatch + expr: | + kube_deployment_spec_replicas != kube_deployment_status_replicas_available + for: 10m + labels: + severity: warning + annotations: + summary: "Deployment {{ "{{" }} $labels.deployment {{ "}}" }} com réplicas inconsistentes" + description: "Deployment {{ "{{" }} $labels.deployment {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} tem {{ "{{" }} $value {{ "}}" }} réplicas faltando." 
+ + - alert: DeploymentUnavailable + expr: | + kube_deployment_status_replicas_unavailable > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Deployment {{ "{{" }} $labels.deployment {{ "}}" }} tem réplicas indisponíveis" + description: "Deployment {{ "{{" }} $labels.deployment {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} tem {{ "{{" }} $value {{ "}}" }} réplicas indisponíveis." + + # ------------------------------------------------------------------------- + # Alertas de Auto-Resize PVC + # ------------------------------------------------------------------------- + - name: autoresize-alerts + rules: + - alert: PVCAutoResized + expr: | + increase(kube_persistentvolumeclaim_resource_requests_storage_bytes[10m]) > 0 + for: 0m + labels: + severity: info + annotations: + summary: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} foi expandido automaticamente" + description: "PVC {{ "{{" }} $labels.persistentvolumeclaim {{ "}}" }} no namespace {{ "{{" }} $labels.namespace {{ "}}" }} foi redimensionado pelo pvc-autoresizer." 
diff --git a/aula-12/gitops/apps/victoria-metrics/values.yaml b/aula-12/gitops/apps/victoria-metrics/values.yaml new file mode 100644 index 0000000..a956e18 --- /dev/null +++ b/aula-12/gitops/apps/victoria-metrics/values.yaml @@ -0,0 +1,192 @@ +# ============================================================================= +# Victoria Metrics K8s Stack - Values +# ============================================================================= +# Documentação: https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack +# ============================================================================= + +victoria-metrics-k8s-stack: + # --------------------------------------------------------------------------- + # VMSingle - Armazenamento de métricas (single-node) + # --------------------------------------------------------------------------- + vmsingle: + enabled: true + spec: + retentionPeriod: "14d" + storage: + storageClassName: "hcloud-volumes" + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + # --------------------------------------------------------------------------- + # VMAgent - Coleta de métricas + # --------------------------------------------------------------------------- + vmagent: + enabled: true + spec: + scrapeInterval: "30s" + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + + # --------------------------------------------------------------------------- + # VMAlert - Sistema de alertas + # --------------------------------------------------------------------------- + vmalert: + enabled: true + spec: + extraArgs: + "notifier.blackhole": "true" # Não envia alertas (sem AlertManager) + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + + # --------------------------------------------------------------------------- + # 
AlertManager - Notificações (desabilitado por padrão) + # --------------------------------------------------------------------------- + alertmanager: + enabled: false + + # --------------------------------------------------------------------------- + # Grafana - Visualização + # --------------------------------------------------------------------------- + grafana: + enabled: true + + # Credenciais + adminUser: admin + # adminPassword é gerado automaticamente se não especificado + + # Recursos + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi + + # Persistência para dashboards e configurações + persistence: + enabled: false # Dashboards vêm do GitOps, não precisa persistir + + # Ingress + ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - grafana.kube.quest + tls: + - secretName: grafana-tls + hosts: + - grafana.kube.quest + + # Sidecar carrega dashboards automaticamente do chart + sidecar: + dashboards: + enabled: true + + # Desabilitar criação de datasource separado (usa o do sidecar) + grafanaDatasource: + enabled: false + + # --------------------------------------------------------------------------- + # Kube State Metrics - Métricas de objetos K8s + # --------------------------------------------------------------------------- + kube-state-metrics: + enabled: true + resources: + requests: + cpu: 10m + memory: 32Mi + limits: + cpu: 100m + memory: 128Mi + + # --------------------------------------------------------------------------- + # Prometheus Node Exporter - Métricas de nodes + # --------------------------------------------------------------------------- + prometheus-node-exporter: + enabled: true + resources: + requests: + cpu: 10m + memory: 32Mi + limits: + cpu: 100m + memory: 64Mi + + # --------------------------------------------------------------------------- + # Prometheus Operator CRDs + # 
--------------------------------------------------------------------------- + prometheus-operator-crds: + enabled: true + + # --------------------------------------------------------------------------- + # VM Operator (gerencia CRDs do Victoria Metrics) + # --------------------------------------------------------------------------- + victoria-metrics-operator: + enabled: true + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + + # --------------------------------------------------------------------------- + # Service Monitors padrão + # --------------------------------------------------------------------------- + defaultRules: + create: true + rules: + alertmanager: false # AlertManager desabilitado + etcd: false # Não temos acesso ao etcd no Talos + configReloaders: true + general: true + k8s: true + kubeApiserver: true + kubeApiserverAvailability: true + kubeApiserverBurnrate: true + kubeApiserverHistogram: true + kubeApiserverSlos: true + kubeControllerManager: false # Não acessível no Talos + kubelet: true + kubeProxy: false # Não acessível no Talos + kubePrometheusGeneral: true + kubePrometheusNodeRecording: true + kubernetesApps: true + kubernetesResources: true + kubernetesStorage: true + kubernetesSystem: true + kubeScheduler: false # Não acessível no Talos + kubeStateMetrics: true + network: true + node: true + nodeExporterAlerting: true + nodeExporterRecording: true + prometheus: true + prometheusOperator: true + vmcluster: false # Usando vmsingle + vmagent: true + vmsingle: true diff --git a/aula-12/queries/useful-queries.md b/aula-12/queries/useful-queries.md new file mode 100644 index 0000000..0cad26f --- /dev/null +++ b/aula-12/queries/useful-queries.md @@ -0,0 +1,323 @@ +# Queries PromQL Úteis + +Queries prontas para uso no Grafana ou diretamente na API do Victoria Metrics. + +## Como usar + +### Via Grafana +1. Acesse Grafana → Explore +2. Selecione datasource "VictoriaMetrics" +3. 
Cole a query no editor + +### Via API +```bash +# Port-forward +kubectl port-forward -n monitoring svc/vmsingle-vm-victoria-metrics-k8s-stack 8429:8429 + +# Query +curl "http://localhost:8429/api/v1/query?query=up" +``` + +--- + +## Storage / PVC + +### Uso de PVC em porcentagem +```promql +kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes * 100 +``` + +### PVCs acima de 80% +```promql +(kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes) > 0.8 +``` + +### Espaço disponível por PVC (bytes) +```promql +kubelet_volume_stats_available_bytes +``` + +### Espaço disponível por PVC (GB) +```promql +kubelet_volume_stats_available_bytes / 1024 / 1024 / 1024 +``` + +### Inodes disponíveis +```promql +kubelet_volume_stats_inodes_free / kubelet_volume_stats_inodes * 100 +``` + +### PVCs que vão encher em 24h (previsão) +```promql +predict_linear(kubelet_volume_stats_available_bytes[6h], 24 * 3600) < 0 +``` + +--- + +## CPU + +### CPU por pod (cores) +```promql +sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, namespace) +``` + +### CPU por namespace (cores) +```promql +sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (namespace) +``` + +### CPU por node (%) +```promql +100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) +``` + +### Top 10 pods por CPU +```promql +topk(10, sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, namespace)) +``` + +### Uso de CPU vs Request +```promql +sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, namespace) +/ +sum(kube_pod_container_resource_requests{resource="cpu"}) by (pod, namespace) +``` + +--- + +## Memória + +### Memória por pod (bytes) +```promql +sum(container_memory_working_set_bytes{container!=""}) by (pod, namespace) +``` + +### Memória por namespace (GB) +```promql +sum(container_memory_working_set_bytes{container!=""}) by (namespace) / 1024 / 1024 / 1024 +``` + +### Memória 
disponível por node (%) +```promql +(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 +``` + +### Top 10 pods por memória +```promql +topk(10, sum(container_memory_working_set_bytes{container!=""}) by (pod, namespace)) +``` + +### Uso de memória vs Limit +```promql +sum(container_memory_working_set_bytes{container!=""}) by (pod, namespace) +/ +sum(kube_pod_container_resource_limits{resource="memory"}) by (pod, namespace) +``` + +--- + +## Pods e Containers + +### Pods restartando na última hora +```promql +sum(increase(kube_pod_container_status_restarts_total[1h])) by (pod, namespace) > 0 +``` + +### Pods não Ready +```promql +kube_pod_status_ready{condition="false"} +``` + +### Pods em CrashLoopBackOff +```promql +kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} +``` + +### Pods pendentes +```promql +kube_pod_status_phase{phase="Pending"} +``` + +### Containers OOMKilled +```promql +kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} +``` + +### Total de pods por namespace +```promql +sum(kube_pod_info) by (namespace) +``` + +### Pods por node +```promql +sum(kube_pod_info) by (node) +``` + +--- + +## Deployments + +### Deployments com réplicas indisponíveis +```promql +kube_deployment_status_replicas_unavailable > 0 +``` + +### Deployments não atualizados +```promql +kube_deployment_status_observed_generation != kube_deployment_metadata_generation +``` + +### Proporção de réplicas disponíveis +```promql +kube_deployment_status_replicas_available / kube_deployment_spec_replicas +``` + +--- + +## Network + +### Bytes recebidos por pod (rate) +```promql +sum(rate(container_network_receive_bytes_total[5m])) by (pod, namespace) +``` + +### Bytes enviados por pod (rate) +```promql +sum(rate(container_network_transmit_bytes_total[5m])) by (pod, namespace) +``` + +### Erros de rede por interface +```promql +sum(rate(node_network_receive_errs_total[5m])) by (instance, device) +``` + +### Conexões TCP por estado 
+```promql +node_netstat_Tcp_CurrEstab +``` + +--- + +## Nodes + +### Nodes não Ready +```promql +kube_node_status_condition{condition="Ready",status="true"} == 0 +``` + +### Pressão de memória +```promql +kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 +``` + +### Pressão de disco +```promql +kube_node_status_condition{condition="DiskPressure",status="true"} == 1 +``` + +### Disco disponível por node (%) +```promql +(node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 +``` + +### Load average (1 min) +```promql +node_load1 +``` + +--- + +## Cluster Overview + +### Total de pods Running +```promql +count(kube_pod_status_phase{phase="Running"}) +``` + +### Total de namespaces +```promql +count(kube_namespace_created) +``` + +### Total de deployments +```promql +count(kube_deployment_created) +``` + +### Total de PVCs +```promql +count(kube_persistentvolumeclaim_info) +``` + +### Idade do cluster (dias) +```promql +(time() - min(kube_namespace_created{namespace="kube-system"})) / 86400 +``` + +--- + +## Victoria Metrics + +### Métricas sendo coletadas (por job) +```promql +count by (job) ({__name__!=""}) +``` + +### Taxa de ingestão +```promql +sum(rate(vm_rows_inserted_total[5m])) +``` + +### Uso de disco do VM +```promql +vm_data_size_bytes +``` + +### Queries por segundo +```promql +sum(rate(vm_http_requests_total{path="/api/v1/query"}[5m])) +``` + +--- + +## Dicas + +### Filtrar por namespace +```promql +# Adicione {namespace="meu-namespace"} a qualquer query +sum(container_memory_working_set_bytes{namespace="gitlab"}) by (pod) +``` + +### Excluir namespaces de sistema +```promql +{namespace!~"kube-system|argocd|monitoring|gitlab"} +``` + +### Agregar por label +```promql +sum by (label_app) (kube_pod_info) +``` + +### Ordenar resultados +```promql +sort_desc(sum(container_memory_working_set_bytes) by (namespace)) +``` + +### Top N +```promql +topk(5, 
sum(rate(container_cpu_usage_seconds_total[5m])) by (pod)) +``` + +### Valor no tempo (offset) +```promql +# Valor de 1 hora atrás +container_memory_working_set_bytes offset 1h +``` + +--- + +## Referências + +- [PromQL Cheat Sheet](https://promlabs.com/promql-cheat-sheet/) +- [Victoria Metrics MetricsQL](https://docs.victoriametrics.com/metricsql/) +- [Grafana Dashboards](https://grafana.com/grafana/dashboards/) diff --git a/aula-12/setup.sh b/aula-12/setup.sh new file mode 100755 index 0000000..f680e5a --- /dev/null +++ b/aula-12/setup.sh @@ -0,0 +1,467 @@ +#!/bin/bash +# ============================================================================= +# Aula 12 - Victoria Metrics (Observabilidade via GitOps) +# ============================================================================= +# +# Este script instala Victoria Metrics stack usando ArgoCD (GitOps): +# 1. Cria projeto 'factory/monitoring' no GitLab +# 2. Push dos manifests GitOps +# 3. Cria ArgoCD Application +# 4. Victoria Metrics + Grafana são sincronizados automaticamente +# +# Pré-requisitos: +# - Cluster Kubernetes (aula-08) +# - ArgoCD instalado (aula-11) +# - GitLab com grupo 'factory' (aula-10/11) +# +# ============================================================================= + +set -e + +# Cores para output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Funções de log +log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } +log_success() { echo -e "${GREEN}[OK]${NC} $1"; } +log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } +log_error() { echo -e "${RED}[ERROR]${NC} $1"; } + +# Diretório do script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENV_FILE="${SCRIPT_DIR}/.env" +GITOPS_DIR="${SCRIPT_DIR}/gitops" + +# ============================================================================= +# VERIFICAR PRÉ-REQUISITOS +# ============================================================================= + +log_info "Verificando 
pré-requisitos..." + +# Verificar kubectl +if ! command -v kubectl &> /dev/null; then + log_error "kubectl não encontrado. Instale com: brew install kubectl" + exit 1 +fi + +# Verificar helm +if ! command -v helm &> /dev/null; then + log_error "helm não encontrado. Instale com: brew install helm" + exit 1 +fi + +# Verificar git +if ! command -v git &> /dev/null; then + log_error "git não encontrado. Instale com: brew install git" + exit 1 +fi + +# Verificar conexão com cluster +if ! kubectl cluster-info &> /dev/null; then + log_error "Não foi possível conectar ao cluster Kubernetes" + log_info "Verifique se KUBECONFIG está configurado corretamente" + exit 1 +fi + +# Verificar se ArgoCD está instalado +if ! kubectl get namespace argocd &> /dev/null; then + log_error "Namespace 'argocd' não encontrado" + log_info "Execute primeiro a aula-11 para instalar o ArgoCD" + exit 1 +fi + +# Verificar se ArgoCD está rodando +if ! kubectl get pods -n argocd -l app.kubernetes.io/name=argocd-server --no-headers 2>/dev/null | grep -q Running; then + log_error "ArgoCD server não está rodando" + log_info "Verifique: kubectl get pods -n argocd" + exit 1 +fi + +log_success "Pré-requisitos verificados" + +# ============================================================================= +# CARREGAR CONFIGURAÇÃO +# ============================================================================= + +# Carregar configuração local (se existir) +if [[ -f "$ENV_FILE" ]]; then + log_info "Carregando configuração local..." + source "$ENV_FILE" +fi + +# Herdar configuração da aula-11 +AULA11_ENV="${SCRIPT_DIR}/../aula-11/.env" +if [[ -f "$AULA11_ENV" ]]; then + log_info "Herdando configuração da aula-11..." + source "$AULA11_ENV" +fi + +# Herdar configuração da aula-10 +AULA10_ENV="${SCRIPT_DIR}/../aula-10/.env" +if [[ -f "$AULA10_ENV" ]]; then + log_info "Herdando configuração da aula-10..." 
+ source "$AULA10_ENV" +fi + +# ============================================================================= +# COLETAR CONFIGURAÇÃO +# ============================================================================= + +echo "" +echo "==========================================" +echo " Configuração Victoria Metrics (GitOps)" +echo "==========================================" +echo "" + +# GitLab Host +if [[ -z "$GITLAB_HOST" ]]; then + read -p "Hostname do GitLab (ex: git.kube.quest): " GITLAB_HOST +fi +log_info "GitLab: https://${GITLAB_HOST}" + +# Extrair domínio base +if [[ -z "$DOMAIN" ]]; then + DOMAIN=$(echo "$GITLAB_HOST" | sed 's/^[^.]*\.//') +fi + +# Grafana Host +if [[ -z "$GRAFANA_HOST" ]]; then + DEFAULT_GRAFANA="grafana.${DOMAIN}" + read -p "Hostname do Grafana [${DEFAULT_GRAFANA}]: " GRAFANA_HOST + GRAFANA_HOST="${GRAFANA_HOST:-$DEFAULT_GRAFANA}" +fi +log_info "Grafana: https://${GRAFANA_HOST}" + +# GitLab Token (para criar projeto via API) +if [[ -z "$GITLAB_TOKEN" ]]; then + echo "" + echo "Token de acesso GitLab (para criar projeto via API):" + echo " 1. Acesse https://${GITLAB_HOST}/-/user_settings/personal_access_tokens" + echo " 2. 
Crie um token com scope 'api'"
+    echo ""
+    read -r -s -p "GitLab Token: " GITLAB_TOKEN; echo ""  # -s: não ecoa o token no terminal; -r: preserva backslashes
+fi
+
+# TLS (herdar da aula-11)
+if [[ -z "$USE_CLOUDFLARE" && -z "$USE_LETSENCRYPT" ]]; then
+    echo ""
+    echo "Configuração de TLS:"
+    echo "  1) CloudFlare (proxy ativo - TLS na borda)"
+    echo "  2) Let's Encrypt (cert-manager)"
+    echo "  3) HTTP apenas (desenvolvimento)"
+    read -r -p "Escolha [1-3]: " TLS_CHOICE
+
+    case $TLS_CHOICE in
+        1) USE_CLOUDFLARE=true; USE_LETSENCRYPT=false ;;
+        2) USE_CLOUDFLARE=false; USE_LETSENCRYPT=true ;;
+        *) USE_CLOUDFLARE=false; USE_LETSENCRYPT=false ;;
+    esac
+fi
+
+# Salvar configuração
+cat > "$ENV_FILE" << EOF
+# Configuração gerada pelo setup.sh
+# $(date)
+GITLAB_HOST=${GITLAB_HOST}
+GRAFANA_HOST=${GRAFANA_HOST}
+DOMAIN=${DOMAIN}
+USE_CLOUDFLARE=${USE_CLOUDFLARE}
+USE_LETSENCRYPT=${USE_LETSENCRYPT}
+GITLAB_TOKEN=${GITLAB_TOKEN}
+EOF
+
+chmod 600 "$ENV_FILE"; log_success "Configuração salva em ${ENV_FILE}"  # arquivo contém o token: restringe permissões
+
+# =============================================================================
+# CRIAR PROJETO NO GITLAB (VIA API)
+# =============================================================================
+
+echo ""
+log_info "=== Criando Projeto no GitLab ==="
+
+# Verificar se grupo factory existe
+log_info "Verificando grupo 'factory'..."
+GROUP_RESPONSE=$(curl -s --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" \
+    "https://${GITLAB_HOST}/api/v4/groups?search=factory")
+
+GROUP_ID=$(echo "$GROUP_RESPONSE" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2)
+
+if [[ -z "$GROUP_ID" ]]; then
+    log_info "Criando grupo 'factory'..."
+ GROUP_CREATE=$(curl -s --request POST --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" \ + --header "Content-Type: application/json" \ + --data '{"name": "factory", "path": "factory", "visibility": "internal"}' \ + "https://${GITLAB_HOST}/api/v4/groups") + GROUP_ID=$(echo "$GROUP_CREATE" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2) + + if [[ -z "$GROUP_ID" ]]; then + log_error "Falha ao criar grupo 'factory'" + log_info "Crie manualmente em https://${GITLAB_HOST}/admin/groups/new" + exit 1 + fi + log_success "Grupo 'factory' criado (ID: ${GROUP_ID})" +else + log_success "Grupo 'factory' já existe (ID: ${GROUP_ID})" +fi + +# Verificar se projeto monitoring existe +log_info "Verificando projeto 'monitoring'..." +PROJECT_RESPONSE=$(curl -s --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" \ + "https://${GITLAB_HOST}/api/v4/projects?search=monitoring") + +PROJECT_EXISTS=$(echo "$PROJECT_RESPONSE" | grep -o '"path_with_namespace":"factory/monitoring"' || true) + +if [[ -z "$PROJECT_EXISTS" ]]; then + log_info "Criando projeto 'monitoring'..." 
+    PROJECT_CREATE=$(curl -s --request POST --header "PRIVATE-TOKEN: ${GITLAB_TOKEN}" \
+        --header "Content-Type: application/json" \
+        --data "{\"name\": \"monitoring\", \"namespace_id\": ${GROUP_ID}, \"visibility\": \"internal\", \"initialize_with_readme\": false}" \
+        "https://${GITLAB_HOST}/api/v4/projects")
+
+    PROJECT_ID=$(echo "$PROJECT_CREATE" | grep -o '"id":[0-9]*' | head -1 | cut -d: -f2)
+
+    if [[ -z "$PROJECT_ID" ]]; then
+        log_error "Falha ao criar projeto 'monitoring'"
+        echo "$PROJECT_CREATE"
+        exit 1
+    fi
+    log_success "Projeto 'monitoring' criado (ID: ${PROJECT_ID})"
+else
+    log_success "Projeto 'monitoring' já existe"
+fi
+
+# =============================================================================
+# PUSH DOS MANIFESTS GITOPS
+# =============================================================================
+
+echo ""
+log_info "=== Push dos Manifests GitOps ==="
+
+# Criar diretório temporário para clone
+TEMP_DIR=$(mktemp -d)
+trap 'rm -rf -- "${TEMP_DIR}"' EXIT
+
+cd "${TEMP_DIR}"
+
+# Configurar git
+git config --global user.email "argocd@${DOMAIN}" 2>/dev/null || true
+git config --global user.name "ArgoCD Setup" 2>/dev/null || true
+
+# Clone do repositório (usando token)
+log_info "Clonando repositório..."
+GIT_URL="https://oauth2:${GITLAB_TOKEN}@${GITLAB_HOST}/factory/monitoring.git"
+
+if ! git clone "${GIT_URL}" monitoring 2>/dev/null; then
+    # Repositório vazio, inicializar
+    mkdir monitoring
+    cd monitoring
+    git init -b main 2>/dev/null || git init  # -b/--initial-branch indisponível em git < 2.28
+    git remote add origin "${GIT_URL}"
+else
+    cd monitoring
+fi
+
+# Copiar arquivos GitOps
+log_info "Copiando manifests GitOps..."
+cp -r "${GITOPS_DIR}/apps" .
+
+# Configurar values.yaml com hostname do Grafana
+log_info "Configurando Grafana hostname..."
+if [[ -f "apps/victoria-metrics/values.yaml" ]]; then
+    sed -i.bak -e "s/GRAFANA_HOST_PLACEHOLDER/${GRAFANA_HOST}/g" -e "s/grafana\.kube\.quest/${GRAFANA_HOST}/g" apps/victoria-metrics/values.yaml  # values.yaml traz 'grafana.kube.quest' hard-coded, não o placeholder
+    rm -f apps/victoria-metrics/values.yaml.bak
+fi
+
+# Commit e push
+git add -A
+if git diff --cached --quiet; then
+    log_info "Nenhuma mudança para commit"
+else
+    git commit -m "feat: Victoria Metrics stack configuration"
+    log_info "Pushing para GitLab..."
+    git push -u origin main 2>/dev/null || git push -u origin master 2>/dev/null || {
+        # Primeiro push em repo vazio
+        git push --set-upstream origin main 2>/dev/null || git push --set-upstream origin master
+    }
+    log_success "Manifests enviados para GitLab"
+fi
+
+cd "${SCRIPT_DIR}"
+
+# =============================================================================
+# CRIAR NAMESPACE MONITORING
+# =============================================================================
+
+echo ""
+log_info "=== Criando Namespace ==="
+
+kubectl create namespace monitoring 2>/dev/null || log_info "Namespace 'monitoring' já existe"
+log_success "Namespace 'monitoring' pronto"
+
+# =============================================================================
+# CONFIGURAR REPOSITÓRIO NO ARGOCD
+# =============================================================================
+
+echo ""
+log_info "=== Configurando Repositório no ArgoCD ==="
+
+# Criar secret para o repositório
+log_info "Criando secret de acesso ao repositório..."
+kubectl apply -f - << EOF +apiVersion: v1 +kind: Secret +metadata: + name: factory-monitoring-repo + namespace: argocd + labels: + argocd.argoproj.io/secret-type: repository +stringData: + type: git + url: https://${GITLAB_HOST}/factory/monitoring.git + username: oauth2 + password: ${GITLAB_TOKEN} +EOF + +log_success "Repositório configurado no ArgoCD" + +# ============================================================================= +# CRIAR ARGOCD APPLICATION +# ============================================================================= + +echo "" +log_info "=== Criando ArgoCD Application ===" + +# Construir anotações de Ingress +INGRESS_ANNOTATIONS="" +if [[ "$USE_LETSENCRYPT" == "true" ]]; then + INGRESS_ANNOTATIONS='cert-manager.io/cluster-issuer: letsencrypt-prod' +fi + +# Aplicar ArgoCD Application +kubectl apply -f - << EOF +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: monitoring + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://${GITLAB_HOST}/factory/monitoring.git + targetRevision: HEAD + path: apps/victoria-metrics + helm: + valueFiles: + - values.yaml + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true + - ServerSideApply=true +EOF + +log_success "ArgoCD Application criada" + +# ============================================================================= +# AGUARDAR SINCRONIZAÇÃO +# ============================================================================= + +echo "" +log_info "=== Aguardando Sincronização ===" + +log_info "ArgoCD está sincronizando Victoria Metrics stack..." +log_info "Isso pode levar alguns minutos na primeira vez..." 
+ +# Aguardar até 5 minutos +for i in {1..60}; do + STATUS=$(kubectl get application monitoring -n argocd -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") + HEALTH=$(kubectl get application monitoring -n argocd -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") + + echo -ne "\r Status: ${STATUS} | Health: ${HEALTH} " + + if [[ "$STATUS" == "Synced" && "$HEALTH" == "Healthy" ]]; then + echo "" + log_success "Victoria Metrics stack sincronizado e saudável!" + break + fi + + if [[ $i -eq 60 ]]; then + echo "" + log_warn "Timeout aguardando sincronização" + log_info "Verifique o status no ArgoCD UI" + fi + + sleep 5 +done + +# ============================================================================= +# OBTER SENHA DO GRAFANA +# ============================================================================= + +echo "" +log_info "=== Credenciais do Grafana ===" + +# Aguardar secret ser criado +for i in {1..30}; do + if kubectl get secret vm-grafana -n monitoring &> /dev/null; then + break + fi + sleep 2 +done + +GRAFANA_PASSWORD=$(kubectl get secret vm-grafana -n monitoring -o jsonpath='{.data.admin-password}' 2>/dev/null | base64 -d 2>/dev/null || echo "") + +if [[ -z "$GRAFANA_PASSWORD" ]]; then + # Tentar nome alternativo do secret + GRAFANA_PASSWORD=$(kubectl get secret -n monitoring -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].data.admin-password}' 2>/dev/null | base64 -d 2>/dev/null || echo "admin") +fi + +# ============================================================================= +# INSTRUÇÕES FINAIS +# ============================================================================= + +echo "" +echo "==========================================" +echo " Instalação Concluída!" 
+echo "==========================================" +echo "" +echo "Victoria Metrics Stack instalado via GitOps" +echo "" +echo "Grafana:" +if [[ "$USE_CLOUDFLARE" == "true" || "$USE_LETSENCRYPT" == "true" ]]; then + echo " URL: https://${GRAFANA_HOST}" +else + echo " URL: http://${GRAFANA_HOST}" +fi +echo " Username: admin" +echo " Password: ${GRAFANA_PASSWORD:-'(ver secret vm-grafana)'}" +echo "" +echo "Acesso via port-forward:" +echo " kubectl port-forward -n monitoring svc/vm-grafana 3000:80" +echo " open http://localhost:3000" +echo "" +echo "ArgoCD Application:" +echo " kubectl get application monitoring -n argocd" +echo "" +echo "Pods:" +echo " kubectl get pods -n monitoring" +echo "" +echo "GitOps Repository:" +echo " https://${GITLAB_HOST}/factory/monitoring" +echo "" +echo "Verificar métricas:" +echo " kubectl port-forward -n monitoring svc/vmsingle-vm 8429:8429" +echo " curl http://localhost:8429/api/v1/query?query=up" +echo "" +echo "=========================================="