diff --git a/apps/cluster-health-dashboard/Jenkinsfile b/apps/cluster-health-dashboard/Jenkinsfile new file mode 100644 index 0000000..22d8f60 --- /dev/null +++ b/apps/cluster-health-dashboard/Jenkinsfile @@ -0,0 +1,798 @@ +pipeline { + agent any + + environment { + DASHBOARD_NAME = 'cluster-health-dashboard' + // Keep the output dir inside the workspace; archiveArtifacts and + // publishHTML cannot reach absolute paths like /tmp + OUTPUT_DIR = 'dashboard-reports' + PROMETHEUS_URL = 'http://prometheus-server.monitoring.svc.cluster.local' + GRAFANA_URL = 'http://grafana.monitoring.svc.cluster.local' + + // Notification + TELEGRAM_BOT_TOKEN = credentials('telegram-bot-token') + TELEGRAM_CHAT_ID = credentials('telegram-chat-id') + + // Pricing (adjust to your actual costs) + CPU_PRICE_PER_HOUR = '0.04' // $0.04 per vCPU hour + MEMORY_PRICE_PER_GB_HOUR = '0.005' // $0.005 per GB hour + } + + triggers { + // Run daily at 8 AM on weekdays + cron('0 8 * * 1-5') + } + + parameters { + choice( + name: 'REPORT_PERIOD', + choices: ['24h', '7d', '30d'], + description: 'Time period for metrics' + ) + booleanParam( + name: 'SEND_EMAIL', + defaultValue: true, + description: 'Send report via email' + ) + booleanParam( + name: 'SEND_TELEGRAM', + defaultValue: true, + description: 'Send summary to Telegram' + ) + } + + stages { + stage('Initialize') { + steps { + script { + echo "πŸš€ Starting Cluster Health Dashboard generation..." + sh """ + mkdir -p ${OUTPUT_DIR} + rm -f ${OUTPUT_DIR}/* + """ + + if (params.SEND_TELEGRAM) { + sendTelegramNotification( + "πŸ“Š <b>Cluster Health Report</b>\n\n" + + "Generating dashboard for period: ${params.REPORT_PERIOD}" + ) + } + } + } + } + + stage('Collect Cluster Info') { + steps { + script { + echo "πŸ“‹ Collecting cluster information..." + + // Get cluster version ('kubectl version --short' was removed in + // kubectl 1.28+, so parse the JSON output instead) + env.CLUSTER_VERSION = sh( + script: "kubectl version -o json 2>/dev/null | jq -r '.serverVersion.gitVersion // \"unknown\"'", + returnStdout: true + ).trim() + + // Get node count + env.NODE_COUNT = sh( + script: 'kubectl get nodes --no-headers | wc -l', + returnStdout: true + ).trim() + + // Get namespace count + env.NAMESPACE_COUNT = sh( + script: 'kubectl get namespaces --no-headers | wc -l', + returnStdout: true + ).trim() + + // Get total pod count + env.POD_COUNT = sh( + script: 'kubectl get pods --all-namespaces --no-headers | wc -l', + returnStdout: true + ).trim() + + echo "Cluster version: ${env.CLUSTER_VERSION}" + echo "Nodes: ${env.NODE_COUNT}" + echo "Namespaces: ${env.NAMESPACE_COUNT}" + echo "Pods: ${env.POD_COUNT}" + } + } + } + + stage('Query Prometheus Metrics') { + steps { + script { + echo "πŸ“ˆ Querying Prometheus for metrics..." + + // CPU usage (cluster-wide average, percent) + env.AVG_CPU_USAGE = queryPrometheus( + "avg(rate(container_cpu_usage_seconds_total{container!=''}[5m])) * 100" + ) + + // Memory usage (GB) + env.TOTAL_MEMORY_USAGE_GB = queryPrometheus( + "sum(container_memory_usage_bytes{container!=''}) / 1024 / 1024 / 1024" + ) + + // Network RX (MB/s) + env.NETWORK_RX_MB = queryPrometheus( + "sum(rate(container_network_receive_bytes_total[5m])) / 1024 / 1024" + ) + + // Network TX (MB/s) + env.NETWORK_TX_MB = queryPrometheus( + "sum(rate(container_network_transmit_bytes_total[5m])) / 1024 / 1024" + ) + + // Pod restart count + env.TOTAL_RESTARTS = queryPrometheus( + "sum(kube_pod_container_status_restarts_total)" + ) + + echo "Metrics collected successfully" + } + } + } + + stage('Analyze Node Resources') { + steps { + script { + echo "πŸ’» Analyzing node resources..."
+
+                    // Dump node objects as {"nodes": [...]} in one pass; this replaces
+                    // a fragile heredoc/while loop with a single jq transformation
+                    sh """
+                        kubectl get nodes -o json | jq '{nodes: .items}' > ${OUTPUT_DIR}/node-resources.json
+                    """
+
+                    // Parse and calculate capacity
+                    def nodeData = readJSON file: "${OUTPUT_DIR}/node-resources.json"
+                    def totalCPU = 0
+                    def totalMemoryGB = 0
+
+                    nodeData.nodes.each { node ->
+                        def cpu = node.status.capacity.cpu.toInteger()
+                        // capacity.memory is normally reported in Ki, e.g. "16389864Ki"
+                        def memoryKi = node.status.capacity.memory.replaceAll('[^0-9]', '').toLong()
+                        def memoryGB = memoryKi / 1024 / 1024
+
+                        totalCPU += cpu
+                        totalMemoryGB += memoryGB
+                    }
+
+                    env.TOTAL_CPU_CORES = totalCPU.toString()
+                    env.TOTAL_MEMORY_GB = totalMemoryGB.toString()
+
+                    echo "Total CPU cores: ${env.TOTAL_CPU_CORES}"
+                    echo "Total Memory: ${env.TOTAL_MEMORY_GB} GB"
+                }
+            }
+        }
+
+        stage('Analyze Pod Status') {
+            steps {
+                script {
+                    echo "πŸ“¦ Analyzing pod status across namespaces..."
+
+                    sh """
+                        kubectl get pods --all-namespaces -o json > ${OUTPUT_DIR}/all-pods.json
+                    """
+
+                    def pods = readJSON file: "${OUTPUT_DIR}/all-pods.json"
+
+                    def statusCounts = [
+                        running: 0,
+                        pending: 0,
+                        failed: 0,
+                        succeeded: 0,
+                        unknown: 0
+                    ]
+
+                    def namespaceStats = [:]
+
+                    pods.items.each { pod ->
+                        def phase = pod.status.phase.toLowerCase()
+                        def namespace = pod.metadata.namespace
+
+                        statusCounts[phase] = (statusCounts[phase] ?: 0) + 1
+
+                        if (!namespaceStats[namespace]) {
+                            namespaceStats[namespace] = [pods: 0, containers: 0]
+                        }
+                        namespaceStats[namespace].pods++
+                        namespaceStats[namespace].containers += pod.spec.containers.size()
+                    }
+
+                    env.PODS_RUNNING = statusCounts.running.toString()
+                    env.PODS_PENDING = statusCounts.pending.toString()
+                    env.PODS_FAILED = statusCounts.failed.toString()
+
+                    // Save namespace stats
+                    writeJSON file: "${OUTPUT_DIR}/namespace-stats.json", json: namespaceStats
+
+                    echo "Pods running: ${env.PODS_RUNNING}"
+                    echo "Pods pending: ${env.PODS_PENDING}"
+                    echo "Pods failed: ${env.PODS_FAILED}"
+                }
+            }
+        }
+
+        stage('Calculate Costs') {
+            steps {
+                script {
+                    echo "πŸ’° Calculating resource costs..."
+
+                    def cpuHours = env.TOTAL_CPU_CORES.toFloat() * 24 * 30      // core-hours per month
+                    def memoryGBHours = env.TOTAL_MEMORY_GB.toFloat() * 24 * 30 // GB-hours per month
+
+                    def cpuCost = cpuHours * env.CPU_PRICE_PER_HOUR.toFloat()
+                    def memoryCost = memoryGBHours * env.MEMORY_PRICE_PER_GB_HOUR.toFloat()
+                    def totalCost = cpuCost + memoryCost
+
+                    env.MONTHLY_CPU_COST = String.format('%.2f', cpuCost)
+                    env.MONTHLY_MEMORY_COST = String.format('%.2f', memoryCost)
+                    env.MONTHLY_TOTAL_COST = String.format('%.2f', totalCost)
+
+                    echo "Estimated monthly costs:"
+                    echo "  CPU: \$${env.MONTHLY_CPU_COST}"
+                    echo "  Memory: \$${env.MONTHLY_MEMORY_COST}"
+                    echo "  Total: \$${env.MONTHLY_TOTAL_COST}"
+                }
+            }
+        }
+
+        stage('Check for Issues') {
+            steps {
+                script {
+                    echo "πŸ” Checking for potential issues..."
+
+                    def issues = []
+
+                    // High restart count (Prometheus returns values as strings,
+                    // possibly with a decimal point, so parse as float)
+                    if (env.TOTAL_RESTARTS.toFloat() > 10) {
+                        issues << "⚠️ High pod restart count: ${env.TOTAL_RESTARTS}"
+                    }
+
+                    // Failed pods
+                    if (env.PODS_FAILED.toInteger() > 0) {
+                        issues << "❌ ${env.PODS_FAILED} pods in Failed state"
+                    }
+
+                    // Pending pods
+                    if (env.PODS_PENDING.toInteger() > 5) {
+                        issues << "⚠️ ${env.PODS_PENDING} pods in Pending state (possible resource constraints)"
+                    }
+
+                    // High CPU usage
+                    if (env.AVG_CPU_USAGE.toFloat() > 80) {
+                        issues << "πŸ”₯ High CPU usage: ${env.AVG_CPU_USAGE}%"
+                    }
+
+                    env.ISSUES = issues.join('\n')
+
+                    if (issues.size() > 0) {
+                        echo "Found ${issues.size()} issues:"
+                        issues.each { echo it }
+                    } else {
+                        echo "βœ… No critical issues found"
+                        env.ISSUES = "βœ… No issues detected"
+                    }
+                }
+            }
+        }
+
+        stage('Generate HTML Dashboard') {
+            steps {
+                script {
+                    echo "🎨 Generating HTML dashboard..."
+
+                    def namespaceStats = readJSON file: "${OUTPUT_DIR}/namespace-stats.json"
+                    def namespaceTable = generateNamespaceTable(namespaceStats)
+
+                    def html = generateDashboardHTML(
+                        namespaceTable: namespaceTable
+                    )
+
+                    writeFile file: "${OUTPUT_DIR}/dashboard.html", text: html
+
+                    echo "βœ… Dashboard generated: ${OUTPUT_DIR}/dashboard.html"
+                }
+            }
+        }
+
+        stage('Generate JSON Report') {
+            steps {
+                script {
+                    echo "πŸ“„ Generating JSON report..."
+
+                    def report = [
+                        generated_at: new Date().format('yyyy-MM-dd HH:mm:ss'),
+                        period: params.REPORT_PERIOD,
+                        cluster: [
+                            version: env.CLUSTER_VERSION,
+                            nodes: env.NODE_COUNT.toInteger(),
+                            namespaces: env.NAMESPACE_COUNT.toInteger(),
+                            total_pods: env.POD_COUNT.toInteger()
+                        ],
+                        resources: [
+                            total_cpu_cores: env.TOTAL_CPU_CORES.toFloat(),
+                            total_memory_gb: env.TOTAL_MEMORY_GB.toFloat(),
+                            avg_cpu_usage_percent: env.AVG_CPU_USAGE.toFloat(),
+                            total_memory_usage_gb: env.TOTAL_MEMORY_USAGE_GB.toFloat()
+                        ],
+                        pods: [
+                            running: env.PODS_RUNNING.toInteger(),
+                            pending: env.PODS_PENDING.toInteger(),
+                            failed: env.PODS_FAILED.toInteger()
+                        ],
+                        costs: [
+                            monthly_cpu_usd: env.MONTHLY_CPU_COST.toFloat(),
+                            monthly_memory_usd: env.MONTHLY_MEMORY_COST.toFloat(),
+                            monthly_total_usd: env.MONTHLY_TOTAL_COST.toFloat()
+                        ],
+                        issues: env.ISSUES
+                    ]
+
+                    writeJSON file: "${OUTPUT_DIR}/report.json", json: report, pretty: 4
+
+                    echo "βœ… JSON report generated"
+                }
+            }
+        }
+    }
+
+    post {
+        success {
+            script {
+                echo "βœ… Dashboard generation completed successfully!"
+
+                // Archive artifacts
+                archiveArtifacts artifacts: "${OUTPUT_DIR}/*", fingerprint: true
+
+                // Publish HTML report
+                publishHTML([
+                    allowMissing: false,
+                    alwaysLinkToLastBuild: true,
+                    keepAll: true,
+                    reportDir: OUTPUT_DIR,
+                    reportFiles: 'dashboard.html',
+                    reportName: 'Cluster Health Dashboard',
+                    reportTitles: 'Cluster Health Dashboard'
+                ])
+
+                // Send Telegram summary
+                if (params.SEND_TELEGRAM) {
+                    def message = """
+πŸ“Š <b>Cluster Health Report</b>
+
+━━━━━━━━━━━━━━━━━━━━━━
+πŸ“‹ <b>Cluster Info</b>
+Version: ${env.CLUSTER_VERSION}
+Nodes: ${env.NODE_COUNT}
+Namespaces: ${env.NAMESPACE_COUNT}
+Total Pods: ${env.POD_COUNT}
+
+━━━━━━━━━━━━━━━━━━━━━━
+πŸ’» <b>Resources</b>
+CPU Cores: ${env.TOTAL_CPU_CORES}
+Memory: ${env.TOTAL_MEMORY_GB} GB
+Avg CPU Usage: ${env.AVG_CPU_USAGE}%
+
+━━━━━━━━━━━━━━━━━━━━━━
+πŸ“¦ <b>Pod Status</b>
+Running: ${env.PODS_RUNNING} βœ…
+Pending: ${env.PODS_PENDING} ⏳
+Failed: ${env.PODS_FAILED} ❌
+
+━━━━━━━━━━━━━━━━━━━━━━
+πŸ’° <b>Estimated Monthly Costs</b>
+CPU: \$${env.MONTHLY_CPU_COST}
+Memory: \$${env.MONTHLY_MEMORY_COST}
+Total: \$${env.MONTHLY_TOTAL_COST}
+
+━━━━━━━━━━━━━━━━━━━━━━
+πŸ” <b>Issues</b>
+${env.ISSUES}
+
+<a href="${env.BUILD_URL}Cluster_20Health_20Dashboard/">View Full Dashboard</a>
+                    """
+
+                    sendTelegramNotification(message)
+                }
+
+                echo "\nπŸ“Š Dashboard URL: ${env.BUILD_URL}Cluster_20Health_20Dashboard/"
+            }
+        }
+
+        failure {
+            script {
+                echo "❌ Dashboard generation failed!"
+
+                if (params.SEND_TELEGRAM) {
+                    sendTelegramNotification(
+                        "❌ <b>Cluster Health Report Failed</b>\n\n" +
+                        "<a href=\"${env.BUILD_URL}console\">View Console Output</a>"
+                    )
+                }
+            }
+        }
+
+        always {
+            script {
+                echo "🧹 Cleanup completed"
+            }
+        }
+    }
+}
+
+// Helper function to query Prometheus
+def queryPrometheus(query) {
+    try {
+        def result = sh(
+            script: """
+                curl -s '${PROMETHEUS_URL}/api/v1/query?query=${URLEncoder.encode(query, "UTF-8")}' | \
+                    jq -r '.data.result[0].value[1] // "0"'
+            """,
+            returnStdout: true
+        ).trim()
+
+        return result ?: "0"
+    } catch (Exception e) {
+        echo "⚠️ Failed to query Prometheus: ${e.message}"
+        return "0"
+    }
+}
+
+// Helper function to send a Telegram notification
+def sendTelegramNotification(message) {
+    try {
+        // Pass the message through the environment so neither the secrets nor
+        // the message text are Groovy-interpolated into the shell script (avoids
+        // the Jenkins "insecure interpolation" warning); --data-urlencode handles
+        // newlines and special characters in the message body
+        withEnv(["TG_MESSAGE=${message}"]) {
+            sh '''
+                curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
+                    -d chat_id="${TELEGRAM_CHAT_ID}" \
+                    -d parse_mode="HTML" \
+                    -d disable_web_page_preview=true \
+                    --data-urlencode text="${TG_MESSAGE}"
+            '''
+        }
+    } catch (Exception e) {
+        echo "⚠️ Failed to send Telegram notification: ${e.message}"
+    }
+}
+
+// Helper function to generate the namespace table rows
+def generateNamespaceTable(namespaceStats) {
+    def rows = ""
+
+    // Sort by pod count descending
+    def sortedNamespaces = namespaceStats.sort { -it.value.pods }
+
+    sortedNamespaces.each { namespace, stats ->
+        rows += """
+            <tr>
+                <td>${namespace}</td>
+                <td>${stats.pods}</td>
+                <td>${stats.containers}</td>
+            </tr>
+        """
+    }
+
+    return rows
+}
+
+// Helper function to generate the complete HTML dashboard
+def generateDashboardHTML(args) {
+    def namespaceTable = args.namespaceTable
+
+    return """<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <title>Cluster Health Dashboard</title>
+</head>
+<body>
+
+    <!-- Header -->
+    <div class="header">
+        <h1>☸️ Kubernetes Cluster Health Dashboard</h1>
+        <p>Generated: ${new Date().format('yyyy-MM-dd HH:mm:ss')} | Period: ${params.REPORT_PERIOD}</p>
+    </div>
+
+    <!-- Summary cards -->
+    <div class="grid">
+        <div class="card">
+            <h2>πŸ“‹ Cluster Information</h2>
+            <div class="stat"><span>Kubernetes Version</span> <strong>${env.CLUSTER_VERSION}</strong></div>
+            <div class="stat"><span>Nodes</span> <strong>${env.NODE_COUNT}</strong></div>
+            <div class="stat"><span>Namespaces</span> <strong>${env.NAMESPACE_COUNT}</strong></div>
+            <div class="stat"><span>Total Pods</span> <strong>${env.POD_COUNT}</strong></div>
+        </div>
+
+        <div class="card">
+            <h2>πŸ’» Resource Capacity</h2>
+            <div class="stat"><span>Total CPU Cores</span> <strong>${env.TOTAL_CPU_CORES}</strong></div>
+            <div class="stat"><span>Total Memory</span> <strong>${env.TOTAL_MEMORY_GB} GB</strong></div>
+            <div class="stat"><span>Avg CPU Usage</span> <strong>${env.AVG_CPU_USAGE}%</strong></div>
+        </div>
+
+        <div class="card">
+            <h2>πŸ“¦ Pod Status</h2>
+            <div class="stat"><span>Running</span> <strong>${env.PODS_RUNNING}</strong></div>
+            <div class="stat"><span>Pending</span> <strong>${env.PODS_PENDING}</strong></div>
+            <div class="stat"><span>Failed</span> <strong>${env.PODS_FAILED}</strong></div>
+            <div class="stat"><span>Total Restarts</span> <strong>${env.TOTAL_RESTARTS}</strong></div>
+        </div>
+
+        <div class="card">
+            <h2>πŸ’° Estimated Monthly Costs</h2>
+            <div class="stat"><span>CPU Cost</span> <strong>\$${env.MONTHLY_CPU_COST}</strong></div>
+            <div class="stat"><span>Memory Cost</span> <strong>\$${env.MONTHLY_MEMORY_COST}</strong></div>
+            <div class="stat"><span>Total Cost</span> <strong>\$${env.MONTHLY_TOTAL_COST}</strong></div>
+            <p class="note">πŸ’‘ Based on: CPU \$${env.CPU_PRICE_PER_HOUR}/core/hour, Memory \$${env.MEMORY_PRICE_PER_GB_HOUR}/GB/hour</p>
+        </div>
+    </div>
+
+    <!-- Health checks -->
+    <div class="card">
+        <h2>πŸ” Health Checks & Issues</h2>
+        ${env.ISSUES.contains('βœ…') ?
+            '<div class="issue ok">' + env.ISSUES + '</div>' :
+            env.ISSUES.split('\n').collect { "<div class='issue'>${it}</div>" }.join('')
+        }
+    </div>
+
+    <!-- Namespace breakdown -->
+    <div class="card">
+        <h2>πŸ“Š Resources by Namespace</h2>
+        <table>
+            <thead>
+                <tr>
+                    <th>Namespace</th>
+                    <th>Pods</th>
+                    <th>Containers</th>
+                </tr>
+            </thead>
+            <tbody>
+                ${namespaceTable}
+            </tbody>
+        </table>
+    </div>
+
+    <!-- Footer -->
+    <div class="footer">
+        Generated by Jenkins CI/CD Pipeline β€’ Build #${env.BUILD_NUMBER}
+    </div>
+
+</body>
+</html>
+"""
+}
\ No newline at end of file
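
Before the first scheduled run it may be worth smoke-testing the two external endpoints the pipeline depends on. A minimal sketch, run from any host or pod with cluster DNS; <TOKEN> and <CHAT_ID> are placeholders for the values stored in the 'telegram-bot-token' and 'telegram-chat-id' Jenkins credentials:

    # Should return {"ok":true,...} if the bot token and chat id are valid
    curl -s -X POST "https://api.telegram.org/bot<TOKEN>/sendMessage" \
        -d chat_id="<CHAT_ID>" \
        -d text="Jenkins connectivity test"

    # Should print a single number, mirroring what queryPrometheus() extracts
    curl -s 'http://prometheus-server.monitoring.svc.cluster.local/api/v1/query' \
        --data-urlencode 'query=sum(kube_pod_container_status_restarts_total)' \
        | jq -r '.data.result[0].value[1] // "0"'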