Cluster Health Dashboard

// Cluster Health Dashboard pipeline.
//
// Collects cluster facts with kubectl, samples Prometheus, estimates monthly
// cost from node capacity, then writes an HTML dashboard and a JSON report
// into OUTPUT_DIR, archives/publishes them and optionally notifies Telegram.
pipeline {
    agent any

    environment {
        DASHBOARD_NAME = 'cluster-health-dashboard'
        // Workspace-relative on purpose: archiveArtifacts only matches paths
        // relative to the workspace, so an absolute /tmp path archives nothing.
        OUTPUT_DIR = 'dashboard-reports'
        PROMETHEUS_URL = 'http://prometheus-server.monitoring.svc.cluster.local'
        GRAFANA_URL = 'http://grafana.monitoring.svc.cluster.local'

        // Notification
        TELEGRAM_BOT_TOKEN = credentials('telegram-bot-token')
        TELEGRAM_CHAT_ID = credentials('telegram-chat-id')

        // Pricing (adjust to your actual costs)
        CPU_PRICE_PER_HOUR = '0.04'          // $0.04 per vCPU hour
        MEMORY_PRICE_PER_GB_HOUR = '0.005'   // $0.005 per GB hour
    }

    triggers {
        // Run daily at 8 AM on weekdays
        cron('0 8 * * 1-5')
    }

    parameters {
        choice(
            name: 'REPORT_PERIOD',
            choices: ['24h', '7d', '30d'],
            description: 'Time period for metrics'
        )
        // NOTE(review): no stage in this pipeline consumes SEND_EMAIL yet.
        booleanParam(
            name: 'SEND_EMAIL',
            defaultValue: true,
            description: 'Send report via email'
        )
        booleanParam(
            name: 'SEND_TELEGRAM',
            defaultValue: true,
            description: 'Send summary to Telegram'
        )
    }

    stages {
        stage('Initialize') {
            steps {
                script {
                    echo "🚀 Starting Cluster Health Dashboard generation..."

                    // Single-quoted so the shell (not Groovy) expands OUTPUT_DIR.
                    sh '''
                        mkdir -p "$OUTPUT_DIR"
                        rm -f "$OUTPUT_DIR"/*
                    '''

                    if (params.SEND_TELEGRAM) {
                        sendTelegramNotification(
                            "📊 Cluster Health Report\n\n" +
                            "Generating dashboard for period: ${params.REPORT_PERIOD}"
                        )
                    }
                }
            }
        }

        stage('Collect Cluster Info') {
            steps {
                script {
                    echo "📋 Collecting cluster information..."

                    // Get cluster version. `kubectl version --short` no longer
                    // exists in current kubectl, so read the JSON output instead.
                    env.CLUSTER_VERSION = sh(
                        script: 'kubectl version -o json 2>/dev/null | jq -r \'.serverVersion.gitVersion // "unknown"\'',
                        returnStdout: true
                    ).trim()

                    // Get node count
                    env.NODE_COUNT = sh(
                        script: 'kubectl get nodes --no-headers | wc -l',
                        returnStdout: true
                    ).trim()

                    // Get namespace count
                    env.NAMESPACE_COUNT = sh(
                        script: 'kubectl get namespaces --no-headers | wc -l',
                        returnStdout: true
                    ).trim()

                    // Get total pod count
                    env.POD_COUNT = sh(
                        script: 'kubectl get pods --all-namespaces --no-headers | wc -l',
                        returnStdout: true
                    ).trim()

                    echo "Cluster version: ${env.CLUSTER_VERSION}"
                    echo "Nodes: ${env.NODE_COUNT}"
                    echo "Namespaces: ${env.NAMESPACE_COUNT}"
                    echo "Pods: ${env.POD_COUNT}"
                }
            }
        }

        stage('Query Prometheus Metrics') {
            steps {
                script {
                    echo "📈 Querying Prometheus for metrics..."

                    // The selected report period is the window for every rate().
                    // params can be empty on the very first (unparameterised) run.
                    def period = params.REPORT_PERIOD ?: '24h'

                    // CPU Usage
                    env.AVG_CPU_USAGE = queryPrometheus(
                        "avg(rate(container_cpu_usage_seconds_total{container!=''}[${period}])) * 100"
                    )

                    // Memory Usage (instantaneous gauge, not windowed)
                    env.TOTAL_MEMORY_USAGE_GB = queryPrometheus(
                        "sum(container_memory_usage_bytes{container!=''}) / 1024 / 1024 / 1024"
                    )

                    // Network RX
                    env.NETWORK_RX_MB = queryPrometheus(
                        "sum(rate(container_network_receive_bytes_total[${period}])) / 1024 / 1024"
                    )

                    // Network TX
                    env.NETWORK_TX_MB = queryPrometheus(
                        "sum(rate(container_network_transmit_bytes_total[${period}])) / 1024 / 1024"
                    )

                    // Pod restart count (cumulative counter)
                    env.TOTAL_RESTARTS = queryPrometheus(
                        "sum(kube_pod_container_status_restarts_total)"
                    )

                    echo "Metrics collected successfully"
                }
            }
        }

        stage('Analyze Node Resources') {
            steps {
                script {
                    echo "💻 Analyzing node resources..."

                    // Same {"nodes": [...]} document the report expects, built
                    // by jq instead of hand-concatenated JSON fragments.
                    sh '''
                        kubectl get nodes -o json | jq '{nodes: .items}' > "$OUTPUT_DIR/node-resources.json"
                    '''

                    // Parse and calculate capacity
                    def nodeData = readJSON file: "${env.OUTPUT_DIR}/node-resources.json"

                    def totalCpuCores = 0.0d
                    def totalMemoryGB = 0.0d

                    for (node in nodeData.nodes) {
                        def capacity = node.status.capacity
                        totalCpuCores += cpuQuantityToCores(capacity.cpu.toString())
                        totalMemoryGB += memoryQuantityToGiB(capacity.memory.toString())
                    }

                    // Whole core counts print as "8", fractional ones as "7.92".
                    env.TOTAL_CPU_CORES = (totalCpuCores % 1 == 0) ?
                        ((long) totalCpuCores).toString() :
                        String.format('%.2f', totalCpuCores)
                    env.TOTAL_MEMORY_GB = String.format('%.2f', totalMemoryGB)

                    echo "Total CPU cores: ${env.TOTAL_CPU_CORES}"
                    echo "Total Memory: ${env.TOTAL_MEMORY_GB} GB"
                }
            }
        }

        stage('Analyze Pod Status') {
            steps {
                script {
                    echo "📦 Analyzing pod status across namespaces..."

                    sh '''
                        kubectl get pods --all-namespaces -o json > "$OUTPUT_DIR/all-pods.json"
                    '''

                    def pods = readJSON file: "${env.OUTPUT_DIR}/all-pods.json"

                    def statusCounts = [
                        running: 0,
                        pending: 0,
                        failed: 0,
                        succeeded: 0,
                        unknown: 0
                    ]

                    def namespaceStats = [:]

                    pods.items.each { pod ->
                        // A pod without a reported phase is counted as unknown.
                        def phase = pod.status?.phase?.toString()?.toLowerCase() ?: 'unknown'
                        def namespace = pod.metadata.namespace

                        statusCounts[phase] = (statusCounts[phase] ?: 0) + 1

                        if (!namespaceStats[namespace]) {
                            namespaceStats[namespace] = [pods: 0, containers: 0]
                        }

                        namespaceStats[namespace].pods++
                        namespaceStats[namespace].containers += pod.spec.containers.size()
                    }

                    env.PODS_RUNNING = statusCounts.running.toString()
                    env.PODS_PENDING = statusCounts.pending.toString()
                    env.PODS_FAILED = statusCounts.failed.toString()

                    // Save namespace stats
                    writeJSON file: "${env.OUTPUT_DIR}/namespace-stats.json", json: namespaceStats

                    echo "Pods running: ${env.PODS_RUNNING}"
                    echo "Pods pending: ${env.PODS_PENDING}"
                    echo "Pods failed: ${env.PODS_FAILED}"
                }
            }
        }

        stage('Calculate Costs') {
            steps {
                script {
                    echo "💰 Calculating resource costs..."

                    def cpuHours = env.TOTAL_CPU_CORES.toFloat() * 24 * 30  // Monthly
                    def memoryGBHours = env.TOTAL_MEMORY_GB.toFloat() * 24 * 30

                    def cpuCost = cpuHours * env.CPU_PRICE_PER_HOUR.toFloat()
                    def memoryCost = memoryGBHours * env.MEMORY_PRICE_PER_GB_HOUR.toFloat()
                    def totalCost = cpuCost + memoryCost

                    env.MONTHLY_CPU_COST = String.format('%.2f', cpuCost)
                    env.MONTHLY_MEMORY_COST = String.format('%.2f', memoryCost)
                    env.MONTHLY_TOTAL_COST = String.format('%.2f', totalCost)

                    echo "Estimated monthly costs:"
                    echo " CPU: \$${env.MONTHLY_CPU_COST}"
                    echo " Memory: \$${env.MONTHLY_MEMORY_COST}"
                    echo " Total: \$${env.MONTHLY_TOTAL_COST}"
                }
            }
        }

        stage('Check for Issues') {
            steps {
                script {
                    echo "🔍 Checking for potential issues..."

                    def issues = []

                    // High restart count. Prometheus samples are floats, so
                    // compare as a double rather than forcing an integer parse.
                    if (env.TOTAL_RESTARTS.toDouble() > 10) {
                        issues << "⚠️ High pod restart count: ${env.TOTAL_RESTARTS}"
                    }

                    // Failed pods
                    if (env.PODS_FAILED.toInteger() > 0) {
                        issues << "❌ ${env.PODS_FAILED} pods in Failed state"
                    }

                    // Pending pods
                    if (env.PODS_PENDING.toInteger() > 5) {
                        issues << "⚠️ ${env.PODS_PENDING} pods in Pending state (possible resource constraints)"
                    }

                    // High CPU usage
                    if (env.AVG_CPU_USAGE.toFloat() > 80) {
                        issues << "🔥 High CPU usage: ${env.AVG_CPU_USAGE}%"
                    }

                    env.ISSUES = issues.join('\n')

                    if (issues.size() > 0) {
                        echo "Found ${issues.size()} issues:"
                        issues.each { echo it }
                    } else {
                        echo "✅ No critical issues found"
                        env.ISSUES = "✅ No issues detected"
                    }
                }
            }
        }

        stage('Generate HTML Dashboard') {
            steps {
                script {
                    echo "🎨 Generating HTML dashboard..."

                    def namespaceStats = readJSON file: "${env.OUTPUT_DIR}/namespace-stats.json"
                    def namespaceTable = generateNamespaceTable(namespaceStats)

                    def html = generateDashboardHTML(
                        namespaceTable: namespaceTable
                    )

                    writeFile file: "${env.OUTPUT_DIR}/dashboard.html", text: html

                    echo "✅ Dashboard generated: ${env.OUTPUT_DIR}/dashboard.html"
                }
            }
        }

        stage('Generate JSON Report') {
            steps {
                script {
                    echo "📄 Generating JSON report..."

                    def report = [
                        generated_at: new Date().format('yyyy-MM-dd HH:mm:ss'),
                        period: params.REPORT_PERIOD,
                        cluster: [
                            version: env.CLUSTER_VERSION,
                            nodes: env.NODE_COUNT.toInteger(),
                            namespaces: env.NAMESPACE_COUNT.toInteger(),
                            total_pods: env.POD_COUNT.toInteger()
                        ],
                        resources: [
                            total_cpu_cores: env.TOTAL_CPU_CORES.toFloat(),
                            total_memory_gb: env.TOTAL_MEMORY_GB.toFloat(),
                            avg_cpu_usage_percent: env.AVG_CPU_USAGE.toFloat(),
                            total_memory_usage_gb: env.TOTAL_MEMORY_USAGE_GB.toFloat()
                        ],
                        pods: [
                            running: env.PODS_RUNNING.toInteger(),
                            pending: env.PODS_PENDING.toInteger(),
                            failed: env.PODS_FAILED.toInteger()
                        ],
                        costs: [
                            monthly_cpu_usd: env.MONTHLY_CPU_COST.toFloat(),
                            monthly_memory_usd: env.MONTHLY_MEMORY_COST.toFloat(),
                            monthly_total_usd: env.MONTHLY_TOTAL_COST.toFloat()
                        ],
                        issues: env.ISSUES
                    ]

                    writeJSON file: "${env.OUTPUT_DIR}/report.json", json: report, pretty: 4

                    echo "✅ JSON report generated"
                }
            }
        }
    }

    post {
        success {
            script {
                echo "✅ Dashboard generation completed successfully!"

                // Archive artifacts
                archiveArtifacts artifacts: "${env.OUTPUT_DIR}/*", fingerprint: true

                // Publish HTML report
                publishHTML([
                    allowMissing: false,
                    alwaysLinkToLastBuild: true,
                    keepAll: true,
                    reportDir: env.OUTPUT_DIR,
                    reportFiles: 'dashboard.html',
                    reportName: 'Cluster Health Dashboard',
                    reportTitles: 'Cluster Health Dashboard'
                ])

                // Send Telegram summary
                if (params.SEND_TELEGRAM) {
                    def divider = '━━━━━━━━━━━━━━━━━━━━━━'
                    def message = [
                        '📊 Cluster Health Report',
                        divider,
                        '📋 Cluster Info',
                        "Version: ${env.CLUSTER_VERSION}",
                        "Nodes: ${env.NODE_COUNT}",
                        "Namespaces: ${env.NAMESPACE_COUNT}",
                        "Total Pods: ${env.POD_COUNT}",
                        divider,
                        '💻 Resources',
                        "CPU Cores: ${env.TOTAL_CPU_CORES}",
                        "Memory: ${env.TOTAL_MEMORY_GB} GB",
                        "Avg CPU Usage: ${env.AVG_CPU_USAGE}%",
                        divider,
                        '📦 Pod Status',
                        "Running: ${env.PODS_RUNNING} ✅",
                        "Pending: ${env.PODS_PENDING} ⏳",
                        "Failed: ${env.PODS_FAILED} ❌",
                        divider,
                        '💰 Estimated Monthly Costs',
                        "CPU: \$${env.MONTHLY_CPU_COST}",
                        "Memory: \$${env.MONTHLY_MEMORY_COST}",
                        "Total: \$${env.MONTHLY_TOTAL_COST}",
                        divider,
                        '🔍 Issues',
                        "${env.ISSUES}",
                        'View Full Dashboard'
                    ].join('\n')

                    sendTelegramNotification(message)
                }

                echo "\n📊 Dashboard URL: ${env.BUILD_URL}Cluster_20Health_20Dashboard/"
            }
        }

        failure {
            script {
                echo "❌ Dashboard generation failed!"

                if (params.SEND_TELEGRAM) {
                    sendTelegramNotification(
                        "❌ Cluster Health Report Failed\n\n" +
                        "View Console Output"
                    )
                }
            }
        }

        always {
            script {
                echo "🧹 Cleanup completed"
            }
        }
    }
}

// Helper function to query Prometheus.
//
// Runs an instant query against PROMETHEUS_URL and returns the first sample's
// value as a string. Best-effort: returns "0" when the result set is empty or
// the request fails (the failure is logged, never rethrown).
def queryPrometheus(query) {
    try {
        def result = ''
        // The query travels through the environment so the shell never
        // re-interprets its quotes or braces; curl does the URL encoding.
        withEnv(["PROM_QUERY=${query}"]) {
            result = sh(
                script: '''
                    set +x
                    response=$(curl -sS --fail -G "$PROMETHEUS_URL/api/v1/query" --data-urlencode "query=$PROM_QUERY")
                    printf '%s' "$response" | jq -r '.data.result[0].value[1] // "0"'
                ''',
                returnStdout: true
            ).trim()
        }
        return result ?: "0"
    } catch (Exception e) {
        echo "⚠️ Failed to query Prometheus: ${e.message}"
        return "0"
    }
}

// Helper function to send Telegram notification.
//
// Best-effort: any failure (including an HTTP error from the Bot API) is
// logged and swallowed so that notifications can never fail the build.
def sendTelegramNotification(message) {
    try {
        // Token, chat id and message are expanded by the shell from the
        // environment, not by Groovy. That keeps the secret out of the
        // generated script text and stops "$", quotes or newlines inside the
        // message from being mangled by shell quoting.
        withEnv(["TELEGRAM_MESSAGE=${message}"]) {
            sh '''
                set +x
                curl -sS --fail -o /dev/null -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
                    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
                    -d parse_mode="HTML" \
                    -d disable_web_page_preview=true \
                    --data-urlencode "text=${TELEGRAM_MESSAGE}"
            '''
        }
    } catch (Exception e) {
        echo "⚠️ Failed to send Telegram notification: ${e.message}"
    }
}

// Converts a Kubernetes CPU quantity ("4", "0.5", "3920m") to cores.
@NonCPS
def cpuQuantityToCores(String quantity) {
    if (quantity.endsWith('m')) {
        return quantity.substring(0, quantity.length() - 1).toDouble() / 1000
    }
    return quantity.toDouble()
}

// Converts a Kubernetes memory quantity ("16384Ki", "16Gi", "17179869184")
// to GiB. An unrecognised suffix is treated as Ki, which is what the node
// capacity field normally reports.
@NonCPS
def memoryQuantityToGiB(String quantity) {
    def bytesPerUnit = [
        ''  : 1d,
        'k' : 1e3d,
        'M' : 1e6d,
        'G' : 1e9d,
        'T' : 1e12d,
        'Ki': 1024d,
        'Mi': 1024d * 1024d,
        'Gi': 1024d * 1024d * 1024d,
        'Ti': 1024d * 1024d * 1024d * 1024d
    ]
    def unit = quantity.replaceAll('[0-9.]', '')
    def amount = quantity.replaceAll('[^0-9.]', '').toDouble()
    def multiplier = bytesPerUnit.containsKey(unit) ? bytesPerUnit[unit] : bytesPerUnit['Ki']
    return amount * multiplier / bytesPerUnit['Gi']
}

// Helper function to generate namespace table HTML.
//
// Takes the namespace -> [pods, containers] map and returns one row of text
// per namespace, ordered by pod count descending. Marked @NonCPS because
// closure-based sort is not reliable under the Pipeline CPS transform; this
// function calls no pipeline steps, so running it natively is safe.
@NonCPS
def generateNamespaceTable(namespaceStats) {
    def rows = ""

    // Sort by pod count descending
    def sortedNamespaces = namespaceStats.sort { -it.value.pods }

    sortedNamespaces.each { namespace, stats ->
        rows += """ ${namespace} ${stats.pods} ${stats.containers} """
    }

    return rows
}

// Helper function to generate complete HTML dashboard
def generateDashboardHTML(args) {
    def namespaceTable = args.namespaceTable
    return """ Cluster Health Dashboard

📋 Cluster Information

Kubernetes Version ${env.CLUSTER_VERSION}

Nodes ${env.NODE_COUNT}

Namespaces ${env.NAMESPACE_COUNT}

Total Pods ${env.POD_COUNT}

💻 Resource Capacity

Total CPU Cores ${env.TOTAL_CPU_CORES}

Total Memory ${env.TOTAL_MEMORY_GB} GB

Avg CPU Usage ${env.AVG_CPU_USAGE}%

📦 Pod Status

Running ${env.PODS_RUNNING}

Pending ${env.PODS_PENDING}

Failed ${env.PODS_FAILED}

Total Restarts ${env.TOTAL_RESTARTS}

💰 Estimated Monthly Costs

CPU Cost \$${env.MONTHLY_CPU_COST}

Memory Cost \$${env.MONTHLY_MEMORY_COST}

Total Cost \$${env.MONTHLY_TOTAL_COST}

💡 Based on: CPU \$${env.CPU_PRICE_PER_HOUR}/core/hour, Memory \$${env.MEMORY_PRICE_PER_GB_HOUR}/GB/hour

🔍 Health Checks & Issues

${env.ISSUES.contains('✅') ? '

' + env.ISSUES + '

' : env.ISSUES.split('\n').collect { "

${it}

" }.join('') }

📊 Resources by Namespace

${namespaceTable}

Namespace	Pods	Containers

Generated by Jenkins CI/CD Pipeline • Build #${env.BUILD_NUMBER}