diff --git a/apps/cluster-health-dashboard/Jenkinsfile b/apps/cluster-health-dashboard/Jenkinsfile
new file mode 100644
index 0000000..6344c34
--- /dev/null
+++ b/apps/cluster-health-dashboard/Jenkinsfile
@@ -0,0 +1,839 @@
+pipeline {
+    agent any
+
+    environment {
+        DASHBOARD_NAME = 'cluster-health-dashboard'
+        OUTPUT_DIR = '/tmp/dashboard-reports'
+        PROMETHEUS_URL = 'http://prometheus-server.monitoring.svc.cluster.local'
+        GRAFANA_URL = 'http://grafana.monitoring.svc.cluster.local'
+
+        // Notification
+        TELEGRAM_BOT_TOKEN = credentials('telegram-bot-token')
+        TELEGRAM_CHAT_ID = credentials('telegram-chat-id')
+
+        // Pricing (adjust to your actual costs)
+        CPU_PRICE_PER_HOUR = '0.04'           // $0.04 per vCPU hour
+        MEMORY_PRICE_PER_GB_HOUR = '0.005'    // $0.005 per GB hour
+    }
+
+    triggers {
+        // Run daily at 8 AM on weekdays
+        cron('0 8 * * 1-5')
+    }
+
+    parameters {
+        choice(
+            name: 'REPORT_PERIOD',
+            choices: ['24h', '7d', '30d'],
+            description: 'Time period for metrics'
+        )
+        booleanParam(
+            name: 'SEND_EMAIL',
+            defaultValue: true,
+            description: 'Send report via email'
+        )
+        booleanParam(
+            name: 'SEND_TELEGRAM',
+            defaultValue: true,
+            description: 'Send summary to Telegram'
+        )
+    }
+
+    stages {
+        stage('Initialize') {
+            steps {
+                script {
+                    echo "🚀 Starting Cluster Health Dashboard generation..."
+                    sh """
+                        mkdir -p ${OUTPUT_DIR}
+                        rm -f ${OUTPUT_DIR}/*
+                    """
+
+                    if (params.SEND_TELEGRAM) {
+                        sendTelegramNotification(
+                            "📊 Cluster Health Report\n\n" +
+                            "Generating dashboard for period: ${params.REPORT_PERIOD}"
+                        )
+                    }
+                }
+            }
+        }
+
+        stage('Collect Cluster Info') {
+            steps {
+                script {
+                    echo "📋 Collecting cluster information..."
+
+                    // Get cluster version
+                    env.CLUSTER_VERSION = sh(
+                        script: 'kubectl version --short 2>/dev/null | grep Server | cut -d" " -f3 || echo "unknown"',
+                        returnStdout: true
+                    ).trim()
+
+                    // Get node count
+                    env.NODE_COUNT = sh(
+                        script: 'kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0"',
+                        returnStdout: true
+                    ).trim()
+
+                    // Get namespace count
+                    env.NAMESPACE_COUNT = sh(
+                        script: 'kubectl get namespaces --no-headers 2>/dev/null | wc -l || echo "0"',
+                        returnStdout: true
+                    ).trim()
+
+                    // Get total pod count
+                    env.POD_COUNT = sh(
+                        script: 'kubectl get pods --all-namespaces --no-headers 2>/dev/null | wc -l || echo "0"',
+                        returnStdout: true
+                    ).trim()
+
+                    echo "Cluster version: ${env.CLUSTER_VERSION}"
+                    echo "Nodes: ${env.NODE_COUNT}"
+                    echo "Namespaces: ${env.NAMESPACE_COUNT}"
+                    echo "Pods: ${env.POD_COUNT}"
+                }
+            }
+        }
+
+        stage('Query Prometheus Metrics') {
+            steps {
+                script {
+                    echo "📈 Querying Prometheus for metrics..."
+
+                    def period = params.REPORT_PERIOD
+
+                    // CPU Usage
+                    env.AVG_CPU_USAGE = queryPrometheus(
+                        "avg(rate(container_cpu_usage_seconds_total{container!=''}[5m])) * 100"
+                    )
+
+                    // Memory Usage
+                    env.TOTAL_MEMORY_USAGE_GB = queryPrometheus(
+                        "sum(container_memory_usage_bytes{container!=''}) / 1024 / 1024 / 1024"
+                    )
+
+                    // Network RX
+                    env.NETWORK_RX_MB = queryPrometheus(
+                        "sum(rate(container_network_receive_bytes_total[5m])) / 1024 / 1024"
+                    )
+
+                    // Network TX
+                    env.NETWORK_TX_MB = queryPrometheus(
+                        "sum(rate(container_network_transmit_bytes_total[5m])) / 1024 / 1024"
+                    )
+
+                    // Pod restart count
+                    env.TOTAL_RESTARTS = queryPrometheus(
+                        "sum(kube_pod_container_status_restarts_total)"
+                    )
+
+                    echo "Metrics collected successfully"
+                }
+            }
+        }
+
+        stage('Analyze Node Resources') {
+            steps {
+                script {
+                    echo "💻 Analyzing node resources..."
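+
+                    // Capacity is read from .status.capacity: CPU as whole cores (e.g. "8"),
+                    // memory in kibibytes (e.g. "16384Ki"); memory is converted to GB below.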
+                    try {
+                        // Get CPU values
+                        def cpuValues = sh(
+                            script: """kubectl get nodes -o jsonpath='{range .items[*]}{.status.capacity.cpu}{"\\n"}{end}' 2>/dev/null || echo "0" """,
+                            returnStdout: true
+                        ).trim().split('\n')
+
+                        // Get Memory values
+                        def memValues = sh(
+                            script: """kubectl get nodes -o jsonpath='{range .items[*]}{.status.capacity.memory}{"\\n"}{end}' 2>/dev/null || echo "0Ki" """,
+                            returnStdout: true
+                        ).trim().split('\n')
+
+                        def totalCPU = 0
+                        def totalMemoryGB = 0
+
+                        cpuValues.each { cpu ->
+                            if (cpu?.trim() && cpu != "0") {
+                                totalCPU += cpu.toInteger()
+                            }
+                        }
+
+                        memValues.each { mem ->
+                            if (mem?.trim() && mem != "0Ki") {
+                                def memKi = mem.replaceAll('[^0-9]', '')
+                                if (memKi) {
+                                    def memKiLong = memKi.toLong()
+                                    totalMemoryGB += (memKiLong / 1024 / 1024)
+                                }
+                            }
+                        }
+
+                        env.TOTAL_CPU_CORES = totalCPU.toString()
+                        env.TOTAL_MEMORY_GB = totalMemoryGB.toString()
+
+                        echo "Total CPU cores: ${env.TOTAL_CPU_CORES}"
+                        echo "Total Memory: ${env.TOTAL_MEMORY_GB} GB"
+                    } catch (Exception e) {
+                        echo "⚠️ Failed to analyze node resources: ${e.message}"
+                        env.TOTAL_CPU_CORES = "0"
+                        env.TOTAL_MEMORY_GB = "0"
+                    }
+                }
+            }
+        }
+
+        stage('Analyze Pod Status') {
+            steps {
+                script {
+                    echo "📦 Analyzing pod status across namespaces..."
+
+                    try {
+                        // Get pod phases
+                        def podPhases = sh(
+                            script: """kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{.status.phase}{"\\n"}{end}' 2>/dev/null || echo "" """,
+                            returnStdout: true
+                        ).trim()
+
+                        def running = 0
+                        def pending = 0
+                        def failed = 0
+
+                        if (podPhases) {
+                            podPhases.split('\n').each { phase ->
+                                def p = phase.toLowerCase().trim()
+                                if (p == 'running') running++
+                                else if (p == 'pending') pending++
+                                else if (p == 'failed') failed++
+                            }
+                        }
+
+                        env.PODS_RUNNING = running.toString()
+                        env.PODS_PENDING = pending.toString()
+                        env.PODS_FAILED = failed.toString()
+
+                        // Get namespace stats
+                        def namespaces = sh(
+                            script: """kubectl get namespaces -o jsonpath='{range .items[*]}{.metadata.name}{"\\n"}{end}' 2>/dev/null || echo "" """,
+                            returnStdout: true
+                        ).trim().split('\n')
+
+                        def namespaceStats = [:]
+
+                        namespaces.each { ns ->
+                            if (ns?.trim()) {
+                                def podCount = sh(
+                                    script: """kubectl get pods -n ${ns} --no-headers 2>/dev/null | wc -l || echo "0" """,
+                                    returnStdout: true
+                                ).trim().toInteger()
+
+                                if (podCount > 0) {
+                                    // Rough estimate: assume ~2 containers per pod
+                                    namespaceStats[ns] = [pods: podCount, containers: podCount * 2]
+                                }
+                            }
+                        }
+
+                        // Save namespace stats as string
+                        env.NAMESPACE_STATS = namespaceStats.collect { k, v -> "${k}:${v.pods}:${v.containers}" }.join(',')
+
+                        echo "Pods running: ${env.PODS_RUNNING}"
+                        echo "Pods pending: ${env.PODS_PENDING}"
+                        echo "Pods failed: ${env.PODS_FAILED}"
+                    } catch (Exception e) {
+                        echo "⚠️ Failed to analyze pods: ${e.message}"
+                        env.PODS_RUNNING = "0"
+                        env.PODS_PENDING = "0"
+                        env.PODS_FAILED = "0"
+                        env.NAMESPACE_STATS = ""
+                    }
+                }
+            }
+        }
+
+        stage('Calculate Costs') {
+            steps {
+                script {
+                    echo "💰 Calculating resource costs..."
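+
+                    // Simple cost model: assumes all node capacity is billed 24 h/day for a
+                    // 30-day month at the CPU_PRICE_PER_HOUR / MEMORY_PRICE_PER_GB_HOUR rates above.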
+                    try {
+                        def cpuHours = env.TOTAL_CPU_CORES.toFloat() * 24 * 30  // Monthly
+                        def memoryGBHours = env.TOTAL_MEMORY_GB.toFloat() * 24 * 30
+
+                        def cpuCost = cpuHours * env.CPU_PRICE_PER_HOUR.toFloat()
+                        def memoryCost = memoryGBHours * env.MEMORY_PRICE_PER_GB_HOUR.toFloat()
+                        def totalCost = cpuCost + memoryCost
+
+                        env.MONTHLY_CPU_COST = String.format('%.2f', cpuCost)
+                        env.MONTHLY_MEMORY_COST = String.format('%.2f', memoryCost)
+                        env.MONTHLY_TOTAL_COST = String.format('%.2f', totalCost)
+
+                        echo "Estimated monthly costs:"
+                        echo "  CPU: \$${env.MONTHLY_CPU_COST}"
+                        echo "  Memory: \$${env.MONTHLY_MEMORY_COST}"
+                        echo "  Total: \$${env.MONTHLY_TOTAL_COST}"
+                    } catch (Exception e) {
+                        echo "⚠️ Failed to calculate costs: ${e.message}"
+                        env.MONTHLY_CPU_COST = "0.00"
+                        env.MONTHLY_MEMORY_COST = "0.00"
+                        env.MONTHLY_TOTAL_COST = "0.00"
+                    }
+                }
+            }
+        }
+
+        stage('Check for Issues') {
+            steps {
+                script {
+                    echo "🔍 Checking for potential issues..."
+
+                    def issues = []
+
+                    // High restart count
+                    if (env.TOTAL_RESTARTS.toFloat() > 10) {
+                        issues << "⚠️ High pod restart count: ${env.TOTAL_RESTARTS}"
+                    }
+
+                    // Failed pods
+                    if (env.PODS_FAILED.toInteger() > 0) {
+                        issues << "❌ ${env.PODS_FAILED} pods in Failed state"
+                    }
+
+                    // Pending pods
+                    if (env.PODS_PENDING.toInteger() > 5) {
+                        issues << "⚠️ ${env.PODS_PENDING} pods in Pending state (possible resource constraints)"
+                    }
+
+                    // High CPU usage
+                    if (env.AVG_CPU_USAGE.toFloat() > 80) {
+                        issues << "🔥 High CPU usage: ${env.AVG_CPU_USAGE}%"
+                    }
+
+                    env.ISSUES = issues.size() > 0 ? issues.join('\n') : "✅ No issues detected"
+
+                    if (issues.size() > 0) {
+                        echo "Found ${issues.size()} issues:"
+                        issues.each { echo it }
+                    } else {
+                        echo "✅ No critical issues found"
+                    }
+                }
+            }
+        }
+
+        stage('Generate HTML Dashboard') {
+            steps {
+                script {
+                    echo "🎨 Generating HTML dashboard..."
+
+                    def namespaceTable = generateNamespaceTable(env.NAMESPACE_STATS)
+
+                    def html = generateDashboardHTML(
+                        namespaceTable: namespaceTable
+                    )
+
+                    writeFile file: "${OUTPUT_DIR}/dashboard.html", text: html
+
+                    echo "✅ Dashboard generated: ${OUTPUT_DIR}/dashboard.html"
+                }
+            }
+        }
+
+        stage('Generate JSON Report') {
+            steps {
+                script {
+                    echo "📄 Generating JSON report..."
+
+                    def report = """
+{
+    "generated_at": "${new Date().format('yyyy-MM-dd HH:mm:ss')}",
+    "period": "${params.REPORT_PERIOD}",
+    "cluster": {
+        "version": "${env.CLUSTER_VERSION}",
+        "nodes": ${env.NODE_COUNT},
+        "namespaces": ${env.NAMESPACE_COUNT},
+        "total_pods": ${env.POD_COUNT}
+    },
+    "resources": {
+        "total_cpu_cores": ${env.TOTAL_CPU_CORES},
+        "total_memory_gb": ${env.TOTAL_MEMORY_GB},
+        "avg_cpu_usage_percent": ${env.AVG_CPU_USAGE},
+        "total_memory_usage_gb": ${env.TOTAL_MEMORY_USAGE_GB}
+    },
+    "pods": {
+        "running": ${env.PODS_RUNNING},
+        "pending": ${env.PODS_PENDING},
+        "failed": ${env.PODS_FAILED}
+    },
+    "costs": {
+        "monthly_cpu_usd": ${env.MONTHLY_CPU_COST},
+        "monthly_memory_usd": ${env.MONTHLY_MEMORY_COST},
+        "monthly_total_usd": ${env.MONTHLY_TOTAL_COST}
+    },
+    "issues": "${env.ISSUES.replaceAll('"', '\\\\"').replaceAll('\n', '\\\\n')}"
+}
+                    """
+
+                    writeFile file: "${OUTPUT_DIR}/report.json", text: report
+
+                    echo "✅ JSON report generated"
+                }
+            }
+        }
+    }
+
+    post {
+        success {
+            script {
+                echo "✅ Dashboard generation completed successfully!"
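+
+                // NOTE: archiveArtifacts and publishHTML resolve paths relative to the
+                // workspace, so an absolute OUTPUT_DIR such as /tmp/dashboard-reports may
+                // need to be copied into the workspace first. publishHTML requires the
+                // HTML Publisher plugin.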
+                // Archive artifacts
+                archiveArtifacts artifacts: "${OUTPUT_DIR}/*", fingerprint: true
+
+                // Publish HTML report
+                publishHTML([
+                    allowMissing: false,
+                    alwaysLinkToLastBuild: true,
+                    keepAll: true,
+                    reportDir: OUTPUT_DIR,
+                    reportFiles: 'dashboard.html',
+                    reportName: 'Cluster Health Dashboard',
+                    reportTitles: 'Cluster Health Dashboard'
+                ])
+
+                // Send Telegram summary
+                if (params.SEND_TELEGRAM) {
+                    def message = """
+📊 Cluster Health Report
+
+──────────────────────
+📋 Cluster Info
+Version: ${env.CLUSTER_VERSION}
+Nodes: ${env.NODE_COUNT}
+Namespaces: ${env.NAMESPACE_COUNT}
+Total Pods: ${env.POD_COUNT}
+
+──────────────────────
+💻 Resources
+CPU Cores: ${env.TOTAL_CPU_CORES}
+Memory: ${env.TOTAL_MEMORY_GB} GB
+Avg CPU Usage: ${env.AVG_CPU_USAGE}%
+
+──────────────────────
+📦 Pod Status
+Running: ${env.PODS_RUNNING} ✅
+Pending: ${env.PODS_PENDING} ⏳
+Failed: ${env.PODS_FAILED} ❌
+
+──────────────────────
+💰 Estimated Monthly Costs
+CPU: \$${env.MONTHLY_CPU_COST}
+Memory: \$${env.MONTHLY_MEMORY_COST}
+Total: \$${env.MONTHLY_TOTAL_COST}
+
+──────────────────────
+🔍 Issues
+${env.ISSUES}
+
+View Full Dashboard
+                    """
+
+                    sendTelegramNotification(message)
+                }
+
+                echo "\n🔗 Dashboard URL: ${env.BUILD_URL}Cluster_20Health_20Dashboard/"
+            }
+        }
+
+        failure {
+            script {
+                echo "❌ Dashboard generation failed!"
+
+                if (params.SEND_TELEGRAM) {
+                    sendTelegramNotification(
+                        "❌ Cluster Health Report Failed\n\n" +
+                        "View Console Output"
+                    )
+                }
+            }
+        }
+
+        always {
+            script {
+                echo "🧹 Cleanup completed"
+            }
+        }
+    }
+}
+
+// Helper function to query Prometheus (NO JQ NEEDED!)
+def queryPrometheus(query) {
+    try {
+        def response = sh(
+            script: """
+                curl -s '${PROMETHEUS_URL}/api/v1/query?query=${URLEncoder.encode(query, "UTF-8")}'
+            """,
+            returnStdout: true
+        ).trim()
+
+        // Extract value using grep and sed (no jq!)
+        // The Prometheus timestamp has a fractional part, so match digits and dots.
+        def value = sh(
+            script: """
+                echo '${response}' | grep -oP '"value":\\[[\\d.]+,"\\K[^"]+' || echo "0"
+            """,
+            returnStdout: true
+        ).trim()
+
+        return value ?: "0"
+    } catch (Exception e) {
+        echo "⚠️ Failed to query Prometheus: ${e.message}"
+        return "0"
+    }
+}
+
+// Helper function to send Telegram notification
+def sendTelegramNotification(message) {
+    try {
+        sh """
+            curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+                -d chat_id="${TELEGRAM_CHAT_ID}" \
+                -d parse_mode="HTML" \
+                -d disable_web_page_preview=true \
+                -d text="${message}"
+        """
+    } catch (Exception e) {
+        echo "⚠️ Failed to send Telegram notification: ${e.message}"
+    }
+}
+
+// Helper function to generate namespace table HTML
+def generateNamespaceTable(namespaceStatsStr) {
+    def rows = ""
+
+    if (namespaceStatsStr) {
+        def stats = namespaceStatsStr.split(',')
+        stats.each { stat ->
+            def parts = stat.split(':')
+            if (parts.size() >= 3) {
+                rows += """
+                <tr>
+                    <td>${parts[0]}</td>
+                    <td>${parts[1]}</td>
+                    <td>${parts[2]}</td>
+                </tr>
+                """
+            }
+        }
+    }
+
+    return rows
+}