diff --git a/apps/cluster-health-dashboard/Jenkinsfile b/apps/cluster-health-dashboard/Jenkinsfile deleted file mode 100644 index 6344c34..0000000 --- a/apps/cluster-health-dashboard/Jenkinsfile +++ /dev/null @@ -1,839 +0,0 @@ -pipeline { - agent any - - environment { - DASHBOARD_NAME = 'cluster-health-dashboard' - OUTPUT_DIR = '/tmp/dashboard-reports' - PROMETHEUS_URL = 'http://prometheus-server.monitoring.svc.cluster.local' - GRAFANA_URL = 'http://grafana.monitoring.svc.cluster.local' - - // Notification - TELEGRAM_BOT_TOKEN = credentials('telegram-bot-token') - TELEGRAM_CHAT_ID = credentials('telegram-chat-id') - - // Pricing (adjust to your actual costs) - CPU_PRICE_PER_HOUR = '0.04' // $0.04 per vCPU hour - MEMORY_PRICE_PER_GB_HOUR = '0.005' // $0.005 per GB hour - } - - triggers { - // Run daily at 8 AM on weekdays - cron('0 8 * * 1-5') - } - - parameters { - choice( - name: 'REPORT_PERIOD', - choices: ['24h', '7d', '30d'], - description: 'Time period for metrics' - ) - booleanParam( - name: 'SEND_EMAIL', - defaultValue: true, - description: 'Send report via email' - ) - booleanParam( - name: 'SEND_TELEGRAM', - defaultValue: true, - description: 'Send summary to Telegram' - ) - } - - stages { - stage('Initialize') { - steps { - script { - echo "πŸš€ Starting Cluster Health Dashboard generation..." - sh """ - mkdir -p ${OUTPUT_DIR} - rm -f ${OUTPUT_DIR}/* - """ - - if (params.SEND_TELEGRAM) { - sendTelegramNotification( - "πŸ“Š Cluster Health Report\n\n" + - "Generating dashboard for period: ${params.REPORT_PERIOD}" - ) - } - } - } - } - - stage('Collect Cluster Info') { - steps { - script { - echo "πŸ“‹ Collecting cluster information..." - - // Get cluster version - env.CLUSTER_VERSION = sh( - script: 'kubectl version --short 2>/dev/null | grep Server | cut -d" " -f3 || echo "unknown"', - returnStdout: true - ).trim() - - // Get node count - env.NODE_COUNT = sh( - script: 'kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0"', - returnStdout: true - ).trim() - - // Get namespace count - env.NAMESPACE_COUNT = sh( - script: 'kubectl get namespaces --no-headers 2>/dev/null | wc -l || echo "0"', - returnStdout: true - ).trim() - - // Get total pod count - env.POD_COUNT = sh( - script: 'kubectl get pods --all-namespaces --no-headers 2>/dev/null | wc -l || echo "0"', - returnStdout: true - ).trim() - - echo "Cluster version: ${env.CLUSTER_VERSION}" - echo "Nodes: ${env.NODE_COUNT}" - echo "Namespaces: ${env.NAMESPACE_COUNT}" - echo "Pods: ${env.POD_COUNT}" - } - } - } - - stage('Query Prometheus Metrics') { - steps { - script { - echo "πŸ“ˆ Querying Prometheus for metrics..." - - def period = params.REPORT_PERIOD - - // CPU Usage - env.AVG_CPU_USAGE = queryPrometheus( - "avg(rate(container_cpu_usage_seconds_total{container!=''}[5m])) * 100" - ) - - // Memory Usage - env.TOTAL_MEMORY_USAGE_GB = queryPrometheus( - "sum(container_memory_usage_bytes{container!=''}) / 1024 / 1024 / 1024" - ) - - // Network RX - env.NETWORK_RX_MB = queryPrometheus( - "sum(rate(container_network_receive_bytes_total[5m])) / 1024 / 1024" - ) - - // Network TX - env.NETWORK_TX_MB = queryPrometheus( - "sum(rate(container_network_transmit_bytes_total[5m])) / 1024 / 1024" - ) - - // Pod restart count - env.TOTAL_RESTARTS = queryPrometheus( - "sum(kube_pod_container_status_restarts_total)" - ) - - echo "Metrics collected successfully" - } - } - } - - stage('Analyze Node Resources') { - steps { - script { - echo "πŸ’» Analyzing node resources..." - - try { - // Get CPU values - def cpuValues = sh( - script: """kubectl get nodes -o jsonpath='{range .items[*]}{.status.capacity.cpu}{"\\n"}{end}' 2>/dev/null || echo "0" """, - returnStdout: true - ).trim().split('\n') - - // Get Memory values - def memValues = sh( - script: """kubectl get nodes -o jsonpath='{range .items[*]}{.status.capacity.memory}{"\\n"}{end}' 2>/dev/null || echo "0Ki" """, - returnStdout: true - ).trim().split('\n') - - def totalCPU = 0 - def totalMemoryGB = 0 - - cpuValues.each { cpu -> - if (cpu?.trim() && cpu != "0") { - totalCPU += cpu.toInteger() - } - } - - memValues.each { mem -> - if (mem?.trim() && mem != "0Ki") { - def memKi = mem.replaceAll('[^0-9]', '') - if (memKi) { - def memKiLong = memKi.toLong() - totalMemoryGB += (memKiLong / 1024 / 1024) - } - } - } - - env.TOTAL_CPU_CORES = totalCPU.toString() - env.TOTAL_MEMORY_GB = totalMemoryGB.toString() - - echo "Total CPU cores: ${env.TOTAL_CPU_CORES}" - echo "Total Memory: ${env.TOTAL_MEMORY_GB} GB" - } catch (Exception e) { - echo "⚠️ Failed to analyze node resources: ${e.message}" - env.TOTAL_CPU_CORES = "0" - env.TOTAL_MEMORY_GB = "0" - } - } - } - } - - stage('Analyze Pod Status') { - steps { - script { - echo "πŸ“¦ Analyzing pod status across namespaces..." - - try { - // Get pod phases - def podPhases = sh( - script: """kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{.status.phase}{"\\n"}{end}' 2>/dev/null || echo "" """, - returnStdout: true - ).trim() - - def running = 0 - def pending = 0 - def failed = 0 - - if (podPhases) { - podPhases.split('\n').each { phase -> - def p = phase.toLowerCase().trim() - if (p == 'running') running++ - else if (p == 'pending') pending++ - else if (p == 'failed') failed++ - } - } - - env.PODS_RUNNING = running.toString() - env.PODS_PENDING = pending.toString() - env.PODS_FAILED = failed.toString() - - // Get namespace stats - def namespaces = sh( - script: """kubectl get namespaces -o jsonpath='{range .items[*]}{.metadata.name}{"\\n"}{end}' 2>/dev/null || echo "" """, - returnStdout: true - ).trim().split('\n') - - def namespaceStats = [:] - - namespaces.each { ns -> - if (ns?.trim()) { - def podCount = sh( - script: """kubectl get pods -n ${ns} --no-headers 2>/dev/null | wc -l || echo "0" """, - returnStdout: true - ).trim().toInteger() - - if (podCount > 0) { - namespaceStats[ns] = [pods: podCount, containers: podCount * 2] - } - } - } - - // Save namespace stats as string - env.NAMESPACE_STATS = namespaceStats.collect { k, v -> "${k}:${v.pods}:${v.containers}" }.join(',') - - echo "Pods running: ${env.PODS_RUNNING}" - echo "Pods pending: ${env.PODS_PENDING}" - echo "Pods failed: ${env.PODS_FAILED}" - } catch (Exception e) { - echo "⚠️ Failed to analyze pods: ${e.message}" - env.PODS_RUNNING = "0" - env.PODS_PENDING = "0" - env.PODS_FAILED = "0" - env.NAMESPACE_STATS = "" - } - } - } - } - - stage('Calculate Costs') { - steps { - script { - echo "πŸ’° Calculating resource costs..." - - try { - def cpuHours = env.TOTAL_CPU_CORES.toFloat() * 24 * 30 // Monthly - def memoryGBHours = env.TOTAL_MEMORY_GB.toFloat() * 24 * 30 - - def cpuCost = cpuHours * env.CPU_PRICE_PER_HOUR.toFloat() - def memoryCost = memoryGBHours * env.MEMORY_PRICE_PER_GB_HOUR.toFloat() - def totalCost = cpuCost + memoryCost - - env.MONTHLY_CPU_COST = String.format('%.2f', cpuCost) - env.MONTHLY_MEMORY_COST = String.format('%.2f', memoryCost) - env.MONTHLY_TOTAL_COST = String.format('%.2f', totalCost) - - echo "Estimated monthly costs:" - echo " CPU: \$${env.MONTHLY_CPU_COST}" - echo " Memory: \$${env.MONTHLY_MEMORY_COST}" - echo " Total: \$${env.MONTHLY_TOTAL_COST}" - } catch (Exception e) { - echo "⚠️ Failed to calculate costs: ${e.message}" - env.MONTHLY_CPU_COST = "0.00" - env.MONTHLY_MEMORY_COST = "0.00" - env.MONTHLY_TOTAL_COST = "0.00" - } - } - } - } - - stage('Check for Issues') { - steps { - script { - echo "πŸ” Checking for potential issues..." - - def issues = [] - - // High restart count - if (env.TOTAL_RESTARTS.toFloat() > 10) { - issues << "⚠️ High pod restart count: ${env.TOTAL_RESTARTS}" - } - - // Failed pods - if (env.PODS_FAILED.toInteger() > 0) { - issues << "❌ ${env.PODS_FAILED} pods in Failed state" - } - - // Pending pods - if (env.PODS_PENDING.toInteger() > 5) { - issues << "⚠️ ${env.PODS_PENDING} pods in Pending state (possible resource constraints)" - } - - // High CPU usage - if (env.AVG_CPU_USAGE.toFloat() > 80) { - issues << "πŸ”₯ High CPU usage: ${env.AVG_CPU_USAGE}%" - } - - env.ISSUES = issues.size() > 0 ? issues.join('\n') : "βœ… No issues detected" - - if (issues.size() > 0) { - echo "Found ${issues.size()} issues:" - issues.each { echo it } - } else { - echo "βœ… No critical issues found" - } - } - } - } - - stage('Generate HTML Dashboard') { - steps { - script { - echo "🎨 Generating HTML dashboard..." - - def namespaceTable = generateNamespaceTable(env.NAMESPACE_STATS) - - def html = generateDashboardHTML( - namespaceTable: namespaceTable - ) - - writeFile file: "${OUTPUT_DIR}/dashboard.html", text: html - - echo "βœ… Dashboard generated: ${OUTPUT_DIR}/dashboard.html" - } - } - } - - stage('Generate JSON Report') { - steps { - script { - echo "πŸ“„ Generating JSON report..." - - def report = """ -{ - "generated_at": "${new Date().format('yyyy-MM-dd HH:mm:ss')}", - "period": "${params.REPORT_PERIOD}", - "cluster": { - "version": "${env.CLUSTER_VERSION}", - "nodes": ${env.NODE_COUNT}, - "namespaces": ${env.NAMESPACE_COUNT}, - "total_pods": ${env.POD_COUNT} - }, - "resources": { - "total_cpu_cores": ${env.TOTAL_CPU_CORES}, - "total_memory_gb": ${env.TOTAL_MEMORY_GB}, - "avg_cpu_usage_percent": ${env.AVG_CPU_USAGE}, - "total_memory_usage_gb": ${env.TOTAL_MEMORY_USAGE_GB} - }, - "pods": { - "running": ${env.PODS_RUNNING}, - "pending": ${env.PODS_PENDING}, - "failed": ${env.PODS_FAILED} - }, - "costs": { - "monthly_cpu_usd": ${env.MONTHLY_CPU_COST}, - "monthly_memory_usd": ${env.MONTHLY_MEMORY_COST}, - "monthly_total_usd": ${env.MONTHLY_TOTAL_COST} - }, - "issues": "${env.ISSUES.replaceAll('"', '\\"').replaceAll('\n', '\\\\n')}" -} - """ - - writeFile file: "${OUTPUT_DIR}/report.json", text: report - - echo "βœ… JSON report generated" - } - } - } - } - - post { - success { - script { - echo "βœ… Dashboard generation completed successfully!" - - // Archive artifacts - archiveArtifacts artifacts: "${OUTPUT_DIR}/*", fingerprint: true - - // Publish HTML report - publishHTML([ - allowMissing: false, - alwaysLinkToLastBuild: true, - keepAll: true, - reportDir: OUTPUT_DIR, - reportFiles: 'dashboard.html', - reportName: 'Cluster Health Dashboard', - reportTitles: 'Cluster Health Dashboard' - ]) - - // Send Telegram summary - if (params.SEND_TELEGRAM) { - def message = """ -πŸ“Š Cluster Health Report - -━━━━━━━━━━━━━━━━━━━━━━ -πŸ“‹ Cluster Info -Version: ${env.CLUSTER_VERSION} -Nodes: ${env.NODE_COUNT} -Namespaces: ${env.NAMESPACE_COUNT} -Total Pods: ${env.POD_COUNT} - -━━━━━━━━━━━━━━━━━━━━━━ -πŸ’» Resources -CPU Cores: ${env.TOTAL_CPU_CORES} -Memory: ${env.TOTAL_MEMORY_GB} GB -Avg CPU Usage: ${env.AVG_CPU_USAGE}% - -━━━━━━━━━━━━━━━━━━━━━━ -πŸ“¦ Pod Status -Running: ${env.PODS_RUNNING} βœ… -Pending: ${env.PODS_PENDING} ⏳ -Failed: ${env.PODS_FAILED} ❌ - -━━━━━━━━━━━━━━━━━━━━━━ -πŸ’° Estimated Monthly Costs -CPU: \$${env.MONTHLY_CPU_COST} -Memory: \$${env.MONTHLY_MEMORY_COST} -Total: \$${env.MONTHLY_TOTAL_COST} - -━━━━━━━━━━━━━━━━━━━━━━ -πŸ” Issues -${env.ISSUES} - -View Full Dashboard - """ - - sendTelegramNotification(message) - } - - echo "\nπŸ“Š Dashboard URL: ${env.BUILD_URL}Cluster_20Health_20Dashboard/" - } - } - - failure { - script { - echo "❌ Dashboard generation failed!" - - if (params.SEND_TELEGRAM) { - sendTelegramNotification( - "❌ Cluster Health Report Failed\n\n" + - "View Console Output" - ) - } - } - } - - always { - script { - echo "🧹 Cleanup completed" - } - } - } -} - -// Helper function to query Prometheus (NO JQ NEEDED!) -def queryPrometheus(query) { - try { - def response = sh( - script: """ - curl -s '${PROMETHEUS_URL}/api/v1/query?query=${URLEncoder.encode(query, "UTF-8")}' - """, - returnStdout: true - ).trim() - - // Extract value using grep and sed (no jq!) - def value = sh( - script: """ - echo '${response}' | grep -oP '"value":\\[\\d+,"\\K[^"]+' || echo "0" - """, - returnStdout: true - ).trim() - - return value ?: "0" - } catch (Exception e) { - echo "⚠️ Failed to query Prometheus: ${e.message}" - return "0" - } -} - -// Helper function to send Telegram notification -def sendTelegramNotification(message) { - try { - sh """ - curl -s -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \ - -d chat_id="${TELEGRAM_CHAT_ID}" \ - -d parse_mode="HTML" \ - -d disable_web_page_preview=true \ - -d text="${message}" - """ - } catch (Exception e) { - echo "⚠️ Failed to send Telegram notification: ${e.message}" - } -} - -// Helper function to generate namespace table HTML -def generateNamespaceTable(namespaceStatsStr) { - def rows = "" - - if (namespaceStatsStr) { - def stats = namespaceStatsStr.split(',') - stats.each { stat -> - def parts = stat.split(':') - if (parts.size() >= 3) { - rows += """ - - ${parts[0]} - ${parts[1]} - ${parts[2]} - - """ - } - } - } - - if (!rows) { - rows = "No data available" - } - - return rows -} - -// Helper function to generate complete HTML dashboard -def generateDashboardHTML(args) { - def namespaceTable = args.namespaceTable - - return """ - - - - - - Cluster Health Dashboard - - - -
- -
-

☸️ Kubernetes Cluster Health Dashboard

-
Generated: ${new Date().format('yyyy-MM-dd HH:mm:ss')} | Period: ${params.REPORT_PERIOD}
-
- - -
-
-

πŸ“‹ Cluster Information

-
- Kubernetes Version - ${env.CLUSTER_VERSION} -
-
- Nodes - ${env.NODE_COUNT} -
-
- Namespaces - ${env.NAMESPACE_COUNT} -
-
- Total Pods - ${env.POD_COUNT} -
-
- -
-

πŸ’» Resource Capacity

-
- Total CPU Cores - ${env.TOTAL_CPU_CORES} -
-
- Total Memory - ${env.TOTAL_MEMORY_GB} GB -
-
- Avg CPU Usage - ${env.AVG_CPU_USAGE}% -
-
-
-
-
- -
-

πŸ“¦ Pod Status

-
- Running - ${env.PODS_RUNNING} -
-
- Pending - ${env.PODS_PENDING} -
-
- Failed - ${env.PODS_FAILED} -
-
- Total Restarts - ${env.TOTAL_RESTARTS} -
-
- -
-

πŸ’° Estimated Monthly Costs

-
- CPU Cost - \$${env.MONTHLY_CPU_COST} -
-
- Memory Cost - \$${env.MONTHLY_MEMORY_COST} -
-
- Total Cost - \$${env.MONTHLY_TOTAL_COST} -
-
- πŸ’‘ Based on: CPU \$${env.CPU_PRICE_PER_HOUR}/core/hour, Memory \$${env.MEMORY_PRICE_PER_GB_HOUR}/GB/hour -
-
-
- - -
-

πŸ” Health Checks & Issues

- ${env.ISSUES.contains('βœ…') ? - '
' + env.ISSUES + '
' : - env.ISSUES.split('\n').collect { "
${it}
" }.join('') - } -
- - -
-

πŸ“Š Resources by Namespace

- - - - - - - - - - ${namespaceTable} - -
NamespacePodsContainers
-
- - -
- Generated by Jenkins CI/CD Pipeline β€’ Build #${env.BUILD_NUMBER} -
-
- - - """ -} \ No newline at end of file