Cluster Health Dashboard

pipeline { agent any environment { DASHBOARD_NAME = 'cluster-health-dashboard' OUTPUT_DIR = '${WORKSPACE}/dashboard-reports' PROMETHEUS_URL = 'http://k8s-monitoring-kube-promet-prometheus.monitoring.svc.cluster.local:9090' GRAFANA_URL = 'http://grafana.monitoring.svc.cluster.local:80' // Notification TELEGRAM_BOT_TOKEN = credentials('telegram-bot-token') TELEGRAM_CHAT_ID = credentials('telegram-chat-id') // Pricing (adjust to your actual costs) CPU_PRICE_PER_HOUR = '0.04' // $0.04 per vCPU hour MEMORY_PRICE_PER_GB_HOUR = '0.005' // $0.005 per GB hour } triggers { // Run daily at 8 AM on weekdays cron('0 8 * * 1-5') } parameters { choice( name: 'REPORT_PERIOD', choices: ['24h', '7d', '30d'], description: 'Time period for metrics' ) booleanParam( name: 'SEND_EMAIL', defaultValue: true, description: 'Send report via email' ) booleanParam( name: 'SEND_TELEGRAM', defaultValue: true, description: 'Send summary to Telegram' ) } stages { stage('Initialize') { steps { script { echo "🚀 Starting Cluster Health Dashboard generation..." sh """ mkdir -p \${WORKSPACE}/dashboard-reports rm -f \${WORKSPACE}/dashboard-reports/* """ if (params.SEND_TELEGRAM) { sendTelegramNotification( "📊 Cluster Health Report\n\n" + "Generating dashboard for period: ${params.REPORT_PERIOD}" ) } } } } stage('Collect Cluster Info') { steps { script { echo "📋 Collecting cluster information..." // Get cluster version env.CLUSTER_VERSION = sh( script: 'kubectl version --short 2>/dev/null | grep Server | cut -d" " -f3 || echo "unknown"', returnStdout: true ).trim() // Get node count env.NODE_COUNT = sh( script: 'kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0"', returnStdout: true ).trim() // Get namespace count env.NAMESPACE_COUNT = sh( script: 'kubectl get namespaces --no-headers 2>/dev/null | wc -l || echo "0"', returnStdout: true ).trim() // Get total pod count env.POD_COUNT = sh( script: 'kubectl get pods --all-namespaces --no-headers 2>/dev/null | wc -l || echo "0"', returnStdout: true ).trim() echo "Cluster version: ${env.CLUSTER_VERSION}" echo "Nodes: ${env.NODE_COUNT}" echo "Namespaces: ${env.NAMESPACE_COUNT}" echo "Pods: ${env.POD_COUNT}" } } } stage('Query Prometheus Metrics') { steps { script { echo "📈 Querying Prometheus for metrics..." def period = params.REPORT_PERIOD // CPU Usage env.AVG_CPU_USAGE = queryPrometheus( "avg(rate(container_cpu_usage_seconds_total{container!=''}[5m])) * 100" ) // Memory Usage env.TOTAL_MEMORY_USAGE_GB = queryPrometheus( "sum(container_memory_usage_bytes{container!=''}) / 1024 / 1024 / 1024" ) // Network RX env.NETWORK_RX_MB = queryPrometheus( "sum(rate(container_network_receive_bytes_total[5m])) / 1024 / 1024" ) // Network TX env.NETWORK_TX_MB = queryPrometheus( "sum(rate(container_network_transmit_bytes_total[5m])) / 1024 / 1024" ) // Pod restart count env.TOTAL_RESTARTS = queryPrometheus( "sum(kube_pod_container_status_restarts_total)" ) echo "Metrics collected successfully" } } } stage('Analyze Node Resources') { steps { script { echo "💻 Analyzing node resources..." try { // Get CPU values def cpuValues = sh( script: """kubectl get nodes -o jsonpath='{range .items[*]}{.status.capacity.cpu}{"\\n"}{end}' 2>/dev/null || echo "0" """, returnStdout: true ).trim().split('\n') // Get Memory values def memValues = sh( script: """kubectl get nodes -o jsonpath='{range .items[*]}{.status.capacity.memory}{"\\n"}{end}' 2>/dev/null || echo "0Ki" """, returnStdout: true ).trim().split('\n') def totalCPU = 0 def totalMemoryGB = 0 cpuValues.each { cpu -> if (cpu?.trim() && cpu != "0") { totalCPU += cpu.toInteger() } } memValues.each { mem -> if (mem?.trim() && mem != "0Ki") { def memKi = mem.replaceAll('[^0-9]', '') if (memKi) { def memKiLong = memKi.toLong() totalMemoryGB += (memKiLong / 1024 / 1024) } } } env.TOTAL_CPU_CORES = totalCPU.toString() env.TOTAL_MEMORY_GB = totalMemoryGB.toString() echo "Total CPU cores: ${env.TOTAL_CPU_CORES}" echo "Total Memory: ${env.TOTAL_MEMORY_GB} GB" } catch (Exception e) { echo "⚠️ Failed to analyze node resources: ${e.message}" env.TOTAL_CPU_CORES = "0" env.TOTAL_MEMORY_GB = "0" } } } } stage('Analyze Pod Status') { steps { script { echo "📦 Analyzing pod status across namespaces..." try { // Get pod phases def podPhases = sh( script: """kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{.status.phase}{"\\n"}{end}' 2>/dev/null || echo "" """, returnStdout: true ).trim() def running = 0 def pending = 0 def failed = 0 if (podPhases) { podPhases.split('\n').each { phase -> def p = phase.toLowerCase().trim() if (p == 'running') running++ else if (p == 'pending') pending++ else if (p == 'failed') failed++ } } env.PODS_RUNNING = running.toString() env.PODS_PENDING = pending.toString() env.PODS_FAILED = failed.toString() // Get namespace stats def namespaces = sh( script: """kubectl get namespaces -o jsonpath='{range .items[*]}{.metadata.name}{"\\n"}{end}' 2>/dev/null || echo "" """, returnStdout: true ).trim().split('\n') def namespaceStats = [:] namespaces.each { ns -> if (ns?.trim()) { def podCount = sh( script: """kubectl get pods -n ${ns} --no-headers 2>/dev/null | wc -l || echo "0" """, returnStdout: true ).trim().toInteger() if (podCount > 0) { namespaceStats[ns] = [pods: podCount, containers: podCount * 2] } } } // Save namespace stats as string env.NAMESPACE_STATS = namespaceStats.collect { k, v -> "${k}:${v.pods}:${v.containers}" }.join(',') echo "Pods running: ${env.PODS_RUNNING}" echo "Pods pending: ${env.PODS_PENDING}" echo "Pods failed: ${env.PODS_FAILED}" } catch (Exception e) { echo "⚠️ Failed to analyze pods: ${e.message}" env.PODS_RUNNING = "0" env.PODS_PENDING = "0" env.PODS_FAILED = "0" env.NAMESPACE_STATS = "" } } } } stage('Calculate Costs') { steps { script { echo "💰 Calculating resource costs..." try { def cpuHours = env.TOTAL_CPU_CORES.toFloat() * 24 * 30 // Monthly def memoryGBHours = env.TOTAL_MEMORY_GB.toFloat() * 24 * 30 def cpuCost = cpuHours * env.CPU_PRICE_PER_HOUR.toFloat() def memoryCost = memoryGBHours * env.MEMORY_PRICE_PER_GB_HOUR.toFloat() def totalCost = cpuCost + memoryCost env.MONTHLY_CPU_COST = String.format('%.2f', cpuCost) env.MONTHLY_MEMORY_COST = String.format('%.2f', memoryCost) env.MONTHLY_TOTAL_COST = String.format('%.2f', totalCost) echo "Estimated monthly costs:" echo " CPU: \$${env.MONTHLY_CPU_COST}" echo " Memory: \$${env.MONTHLY_MEMORY_COST}" echo " Total: \$${env.MONTHLY_TOTAL_COST}" } catch (Exception e) { echo "⚠️ Failed to calculate costs: ${e.message}" env.MONTHLY_CPU_COST = "0.00" env.MONTHLY_MEMORY_COST = "0.00" env.MONTHLY_TOTAL_COST = "0.00" } } } } stage('Check for Issues') { steps { script { echo "🔍 Checking for potential issues..." def issues = [] // High restart count if (env.TOTAL_RESTARTS.toFloat() > 10) { issues << "⚠️ High pod restart count: ${env.TOTAL_RESTARTS}" } // Failed pods if (env.PODS_FAILED.toInteger() > 0) { issues << "❌ ${env.PODS_FAILED} pods in Failed state" } // Pending pods if (env.PODS_PENDING.toInteger() > 5) { issues << "⚠️ ${env.PODS_PENDING} pods in Pending state (possible resource constraints)" } // High CPU usage if (env.AVG_CPU_USAGE.toFloat() > 80) { issues << "🔥 High CPU usage: ${env.AVG_CPU_USAGE}%" } env.ISSUES = issues.size() > 0 ? issues.join('\n') : "✅ No issues detected" if (issues.size() > 0) { echo "Found ${issues.size()} issues:" issues.each { echo it } } else { echo "✅ No critical issues found" } } } } stage('Generate HTML Dashboard') { steps { script { echo "🎨 Generating HTML dashboard..." def namespaceTable = generateNamespaceTable(env.NAMESPACE_STATS) def html = generateDashboardHTML( namespaceTable: namespaceTable ) writeFile file: "\${WORKSPACE}/dashboard-reports/dashboard.html", text: html echo "✅ Dashboard generated: \${WORKSPACE}/dashboard-reports/dashboard.html" } } } stage('Generate JSON Report') { steps { script { echo "📄 Generating JSON report..." def report = """ { "generated_at": "${new Date().format('yyyy-MM-dd HH:mm:ss')}", "period": "${params.REPORT_PERIOD}", "cluster": { "version": "${env.CLUSTER_VERSION}", "nodes": ${env.NODE_COUNT}, "namespaces": ${env.NAMESPACE_COUNT}, "total_pods": ${env.POD_COUNT} }, "resources": { "total_cpu_cores": ${env.TOTAL_CPU_CORES}, "total_memory_gb": ${env.TOTAL_MEMORY_GB}, "avg_cpu_usage_percent": ${env.AVG_CPU_USAGE}, "total_memory_usage_gb": ${env.TOTAL_MEMORY_USAGE_GB} }, "pods": { "running": ${env.PODS_RUNNING}, "pending": ${env.PODS_PENDING}, "failed": ${env.PODS_FAILED} }, "costs": { "monthly_cpu_usd": ${env.MONTHLY_CPU_COST}, "monthly_memory_usd": ${env.MONTHLY_MEMORY_COST}, "monthly_total_usd": ${env.MONTHLY_TOTAL_COST} }, "issues": "${env.ISSUES.replaceAll('"', '\\"').replaceAll('\n', '\\\\n')}" } """ writeFile file: "\${WORKSPACE}/dashboard-reports/report.json", text: report echo "✅ JSON report generated" } } } } post { success { script { echo "✅ Dashboard generation completed successfully!" // Archive artifacts - FIXED PATH archiveArtifacts artifacts: 'dashboard-reports/*', fingerprint: true // Publish HTML report - FIXED PATH publishHTML([ allowMissing: false, alwaysLinkToLastBuild: true, keepAll: true, reportDir: 'dashboard-reports', reportFiles: 'dashboard.html', reportName: 'Cluster Health Dashboard', reportTitles: 'Cluster Health Dashboard' ]) // Send Telegram summary if (params.SEND_TELEGRAM) { def message = """ 📊 Cluster Health Report ━━━━━━━━━━━━━━━━━━━━━━ 📋 Cluster Info Version: ${env.CLUSTER_VERSION} Nodes: ${env.NODE_COUNT} Namespaces: ${env.NAMESPACE_COUNT} Total Pods: ${env.POD_COUNT} ━━━━━━━━━━━━━━━━━━━━━━ 💻 Resources CPU Cores: ${env.TOTAL_CPU_CORES} Memory: ${env.TOTAL_MEMORY_GB} GB Avg CPU Usage: ${env.AVG_CPU_USAGE}% ━━━━━━━━━━━━━━━━━━━━━━ 📦 Pod Status Running: ${env.PODS_RUNNING} ✅ Pending: ${env.PODS_PENDING} ⏳ Failed: ${env.PODS_FAILED} ❌ ━━━━━━━━━━━━━━━━━━━━━━ 💰 Estimated Monthly Costs CPU: \$${env.MONTHLY_CPU_COST} Memory: \$${env.MONTHLY_MEMORY_COST} Total: \$${env.MONTHLY_TOTAL_COST} ━━━━━━━━━━━━━━━━━━━━━━ 🔍 Issues ${env.ISSUES} View Full Dashboard """ sendTelegramNotification(message) } echo "\n📊 Dashboard URL: ${env.BUILD_URL}Cluster_20Health_20Dashboard/" } } failure { script { echo "❌ Dashboard generation failed!" if (params.SEND_TELEGRAM) { sendTelegramNotification( "❌ Cluster Health Report Failed\n\n" + "View Console Output" ) } } } always { script { echo "🧹 Cleanup completed" } } } } // Helper function to query Prometheus (NO JQ NEEDED!) def queryPrometheus(query) { try { def response = sh( script: """ curl -s '${PROMETHEUS_URL}/api/v1/query?query=${URLEncoder.encode(query, "UTF-8")}' """, returnStdout: true ).trim() // Extract value using grep and sed (no jq!) def value = sh( script: """ echo '${response}' | grep -oP '"value":\\[\\d+,"\\K[^"]+' || echo "0" """, returnStdout: true ).trim() return value ?: "0" } catch (Exception e) { echo "⚠️ Failed to query Prometheus: ${e.message}" return "0" } } // Helper function to send Telegram notification def sendTelegramNotification(message) { try { sh """ curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \ -d chat_id="${TELEGRAM_CHAT_ID}" \ -d parse_mode="HTML" \ -d disable_web_page_preview=true \ -d text="${message}" """ } catch (Exception e) { echo "⚠️ Failed to send Telegram notification: ${e.message}" } } // Helper function to generate namespace table HTML def generateNamespaceTable(namespaceStatsStr) { def rows = "" if (namespaceStatsStr) { def stats = namespaceStatsStr.split(',') stats.each { stat -> def parts = stat.split(':') if (parts.size() >= 3) { rows += """ ${parts[0]} ${parts[1]} ${parts[2]} """ } } } if (!rows) { rows = "No data available" } return rows } // Helper function to generate complete HTML dashboard def generateDashboardHTML(args) { def namespaceTable = args.namespaceTable return """ Cluster Health Dashboard

📋 Cluster Information

Kubernetes Version ${env.CLUSTER_VERSION}

Nodes ${env.NODE_COUNT}

Namespaces ${env.NAMESPACE_COUNT}

Total Pods ${env.POD_COUNT}

💻 Resource Capacity

Total CPU Cores ${env.TOTAL_CPU_CORES}

Total Memory ${env.TOTAL_MEMORY_GB} GB

Avg CPU Usage ${env.AVG_CPU_USAGE}%

📦 Pod Status

Running ${env.PODS_RUNNING}

Pending ${env.PODS_PENDING}

Failed ${env.PODS_FAILED}

Total Restarts ${env.TOTAL_RESTARTS}

💰 Estimated Monthly Costs

CPU Cost \$${env.MONTHLY_CPU_COST}

Memory Cost \$${env.MONTHLY_MEMORY_COST}

Total Cost \$${env.MONTHLY_TOTAL_COST}

💡 Based on: CPU \$${env.CPU_PRICE_PER_HOUR}/core/hour, Memory \$${env.MEMORY_PRICE_PER_GB_HOUR}/GB/hour

🔍 Health Checks & Issues

${env.ISSUES.contains('✅') ? '

' + env.ISSUES + '

' : env.ISSUES.split('\n').collect { "

${it}

" }.join('') }

📊 Resources by Namespace

${namespaceTable}

Namespace	Pods	Containers

Generated by Jenkins CI/CD Pipeline • Build #${env.BUILD_NUMBER}