workshop/aula-08/cluster-autoscaler.yaml
ArgoCD Setup 6ae82ed183 fix(aula-08): add draining-node-pool with CAX11 to the cluster-autoscaler
The Hetzner autoscaler internally creates a draining-node-pool that
defaulted to cx11 as its server type. Since Hetzner discontinued the
cx11, the type lookup failed and broke the autoscaler's main loop,
preventing any node scale-up.

Added --nodes=0:0:CAX11:nbg1:draining-node-pool and a matching
nodeConfig entry in the cluster-config to resolve the error.
2026-03-14 14:01:12 -03:00

############################################################
# Cluster Autoscaler for Hetzner Cloud + Talos
#
# Pools:
# - worker-pool: CAX21 (4 vCPU, 8 GB) - general workloads
# - gitlab-pool: CAX21 - Gitea and related services
# - build-pool: CAX31 (8 vCPU, 16 GB) - Docker builds, scales 0-1
############################################################
---
apiVersion: v1
kind: Namespace
metadata:
  name: cluster-autoscaler
# Secret is created via install-autoscaler.sh (kubectl create secret)
# to properly handle base64 encoding of cloud-init
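# A sketch of what that script likely runs (illustrative; the flags are
# standard kubectl, but the file names here are assumptions):
#
#   kubectl -n cluster-autoscaler create secret generic hcloud-autoscaler \
#     --from-literal=token="$HCLOUD_TOKEN" \
#     --from-file=cloud-init=cloud-init.b64 \
#     --from-file=cluster-config=cluster-config.b64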
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cluster-autoscaler
  namespace: cluster-autoscaler
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: cluster-autoscaler
rules:
  - apiGroups: [""]
    resources: ["events", "endpoints"]
    verbs: ["create", "patch"]
  - apiGroups: [""]
    resources: ["pods/eviction"]
    verbs: ["create"]
  - apiGroups: [""]
    resources: ["pods/status"]
    verbs: ["update"]
  - apiGroups: [""]
    resources: ["endpoints"]
    resourceNames: ["cluster-autoscaler"]
    verbs: ["get", "update"]
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["watch", "list", "get", "update"]
  - apiGroups: [""]
    resources: ["namespaces", "pods", "services", "replicationcontrollers", "persistentvolumeclaims", "persistentvolumes"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["batch"]
    resources: ["jobs", "cronjobs"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["batch", "extensions"]
    resources: ["jobs"]
    verbs: ["get", "list", "patch", "watch"]
  - apiGroups: ["extensions"]
    resources: ["replicasets", "daemonsets"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["policy"]
    resources: ["poddisruptionbudgets"]
    verbs: ["watch", "list"]
  - apiGroups: ["apps"]
    resources: ["statefulsets", "replicasets", "daemonsets"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses", "csinodes", "csidrivers", "csistoragecapacities"]
    verbs: ["watch", "list", "get"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["create"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    resourceNames: ["cluster-autoscaler"]
    verbs: ["get", "update"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["create", "get", "update", "delete", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cluster-autoscaler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-autoscaler
subjects:
  - kind: ServiceAccount
    name: cluster-autoscaler
    namespace: cluster-autoscaler
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cluster-autoscaler
  namespace: cluster-autoscaler
  labels:
    app: cluster-autoscaler
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    metadata:
      labels:
        app: cluster-autoscaler
    spec:
      serviceAccountName: cluster-autoscaler
      containers:
        - name: cluster-autoscaler
          image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.31.0
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
            runAsNonRoot: true
            runAsUser: 65532
            seccompProfile:
              type: RuntimeDefault
          command:
            - ./cluster-autoscaler
            - --cloud-provider=hetzner
            #
            # NODE POOLS:
            #
            # worker-pool: general workloads (CAX21 = 4 vCPU, 8 GB)
            - --nodes=1:3:CAX21:nbg1:worker-pool
            #
            # gitlab-pool: Gitea and heavier services (CAX21)
            - --nodes=1:2:CAX21:nbg1:gitlab-pool
            #
            # build-pool: Docker builds (CAX31 = 8 vCPU, 16 GB)
            # Scales 0-1 on demand, tainted "dedicated=builds:NoSchedule"
            # (see the toleration sketch after the pool list)
            - --nodes=0:1:CAX31:nbg1:build-pool
            #
            # draining-node-pool: the autoscaler's internal pool for nodes being drained.
            # It must reference a valid server type (CAX11), otherwise the autoscaler breaks.
            - --nodes=0:0:CAX11:nbg1:draining-node-pool
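            #
            # Each --nodes flag uses the hcloud provider's node-group format:
            #   MIN:MAX:INSTANCE_TYPE:LOCATION:POOL_NAME
            #
            # Sketch of what a build pod needs in order to land on build-pool
            # (illustrative; the nodeSelector label is an assumption and must
            # match whatever labels the cluster-config assigns to the pool):
            #
            #   tolerations:
            #     - key: dedicated
            #       operator: Equal
            #       value: builds
            #       effect: NoSchedule
            #   nodeSelector:
            #     pool: build-pool   # hypothetical label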
            #
            # SCALE-DOWN SETTINGS:
            #
            - --scale-down-enabled=true
            - --scale-down-delay-after-add=3m
            - --scale-down-unneeded-time=3m
            - --scale-down-utilization-threshold=0.5
            - --skip-nodes-with-local-storage=false
            - --skip-nodes-with-system-pods=false
            - --balance-similar-node-groups=false
            - --expander=least-waste
            - --v=4
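            # Net effect: a node whose requested CPU/memory stays below 50% of
            # allocatable for 3 minutes becomes a scale-down candidate, and no
            # scale-down runs within 3 minutes of a scale-up. The least-waste
            # expander picks the pool that would leave the least idle capacity
            # after a scale-up.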
          env:
            - name: HCLOUD_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hcloud-autoscaler
                  key: token
            - name: HCLOUD_CLOUD_INIT
              valueFrom:
                secretKeyRef:
                  name: hcloud-autoscaler
                  key: cloud-init
            - name: HCLOUD_CLUSTER_CONFIG
              valueFrom:
                secretKeyRef:
                  name: hcloud-autoscaler
                  key: cluster-config
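            # The cluster-config value is base64-encoded JSON. A sketch of the
            # shape this commit relies on (field names follow the hcloud
            # provider's cluster-config schema; the image ID and labels are
            # illustrative):
            #
            #   {
            #     "imagesForArch": { "arm64": "<talos-arm64-snapshot-id>" },
            #     "nodeConfigs": {
            #       "worker-pool":        { "labels": { "pool": "worker-pool" } },
            #       "gitlab-pool":        { "labels": { "pool": "gitlab-pool" } },
            #       "build-pool":         { "taints": [ { "key": "dedicated", "value": "builds", "effect": "NoSchedule" } ] },
            #       "draining-node-pool": {}
            #     }
            #   }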
            - name: HCLOUD_IMAGE
              value: "${TALOS_IMAGE_ID}"
            - name: HCLOUD_NETWORK
              value: "${NETWORK_NAME}"
            - name: HCLOUD_FIREWALL
              value: "${FIREWALL_NAME}"
            - name: HCLOUD_SSH_KEY
              value: "${SSH_KEY_NAME}"
          resources:
            requests:
              cpu: 100m
              memory: 300Mi
            limits:
              cpu: 500m
              memory: 500Mi