Files
k3s-gitops/apps/cluster-health-dashboard/Jenkinsfile

839 lines
30 KiB
Groovy

pipeline {
agent any
// Pipeline-wide settings. The stages, the post section and the helper functions
// at the bottom of this file read these values by name (e.g. PROMETHEUS_URL,
// TELEGRAM_BOT_TOKEN, CPU_PRICE_PER_HOUR).
environment {
// NOTE(review): DASHBOARD_NAME is not referenced in the visible part of this file.
DASHBOARD_NAME = 'cluster-health-dashboard'
// NOTE(review): OUTPUT_DIR is not referenced in the visible part of this file;
// the stages spell out "dashboard-reports" themselves.
OUTPUT_DIR = '${WORKSPACE}/dashboard-reports'
// Base URL that queryPrometheus() appends "/api/v1/query?query=..." to.
PROMETHEUS_URL = 'http://k8s-monitoring-kube-promet-prometheus.monitoring.svc.cluster.local'
// NOTE(review): GRAFANA_URL is not referenced in the visible part of this file.
GRAFANA_URL = 'http://grafana.monitoring.svc.cluster.local'
// Telegram notification credentials, resolved from the Jenkins credentials
// with ids 'telegram-bot-token' and 'telegram-chat-id'.
TELEGRAM_BOT_TOKEN = credentials('telegram-bot-token')
TELEGRAM_CHAT_ID = credentials('telegram-chat-id')
// Pricing (adjust to your actual costs). Kept as strings; the
// 'Calculate Costs' stage converts them with toFloat().
CPU_PRICE_PER_HOUR = '0.04' // $0.04 per vCPU hour
MEMORY_PRICE_PER_GB_HOUR = '0.005' // $0.005 per GB hour
}
triggers {
// Run daily at 8 AM on weekdays (minute 0, hour 8, Monday-Friday)
cron('0 8 * * 1-5')
}
parameters {
// REPORT_PERIOD is echoed into the Telegram message, the HTML header and the
// JSON report. NOTE(review): the Prometheus queries in this file use a fixed
// 5m window and do not read this parameter.
choice(
name: 'REPORT_PERIOD',
choices: ['24h', '7d', '30d'],
description: 'Time period for metrics'
)
// NOTE(review): no stage in the visible part of this file reads SEND_EMAIL.
booleanParam(
name: 'SEND_EMAIL',
defaultValue: true,
description: 'Send report via email'
)
// Gates every sendTelegramNotification() call in the stages and post section.
booleanParam(
name: 'SEND_TELEGRAM',
defaultValue: true,
description: 'Send summary to Telegram'
)
}
stages {
stage('Initialize') {
    steps {
        script {
            echo "🚀 Starting Cluster Health Dashboard generation..."
            // Prepare an empty report directory inside the workspace.
            // The path is quoted because Jenkins workspaces inherit the job
            // name, which may contain spaces; unquoted, the shell would split
            // the path into several arguments. The glob stays outside the
            // quotes so it is still expanded. ${WORKSPACE} is expanded by the
            // shell, not by Groovy (single-quoted Groovy string).
            sh '''
                mkdir -p "${WORKSPACE}/dashboard-reports"
                rm -f "${WORKSPACE}/dashboard-reports"/*
            '''
            // Optional "started" notification.
            if (params.SEND_TELEGRAM) {
                sendTelegramNotification(
                    "📊 <b>Cluster Health Report</b>\n\n" +
                    "<i>Generating dashboard for period: ${params.REPORT_PERIOD}</i>"
                )
            }
        }
    }
}
stage('Collect Cluster Info') {
    steps {
        script {
            echo "📋 Collecting cluster information..."
            // Get cluster (server) version.
            // `--short` was removed from newer kubectl releases (short output
            // became the default), so try it first for older clients and fall
            // back to the plain command when the flag is rejected.
            // A shell `|| echo unknown` after the pipeline would never fire
            // because the pipeline's status is that of `cut`, so the fallback
            // to "unknown" is applied in Groovy on an empty result instead.
            def serverVersion = sh(
                script: '(kubectl version --short 2>/dev/null || kubectl version 2>/dev/null) | grep "Server Version" | cut -d" " -f3',
                returnStdout: true
            ).trim()
            env.CLUSTER_VERSION = serverVersion ?: 'unknown'
            // Get node count (wc -l prints 0 when kubectl produces no output)
            env.NODE_COUNT = sh(
                script: 'kubectl get nodes --no-headers 2>/dev/null | wc -l || echo "0"',
                returnStdout: true
            ).trim()
            // Get namespace count
            env.NAMESPACE_COUNT = sh(
                script: 'kubectl get namespaces --no-headers 2>/dev/null | wc -l || echo "0"',
                returnStdout: true
            ).trim()
            // Get total pod count across all namespaces
            env.POD_COUNT = sh(
                script: 'kubectl get pods --all-namespaces --no-headers 2>/dev/null | wc -l || echo "0"',
                returnStdout: true
            ).trim()
            echo "Cluster version: ${env.CLUSTER_VERSION}"
            echo "Nodes: ${env.NODE_COUNT}"
            echo "Namespaces: ${env.NAMESPACE_COUNT}"
            echo "Pods: ${env.POD_COUNT}"
        }
    }
}
stage('Query Prometheus Metrics') {
    steps {
        script {
            echo "📈 Querying Prometheus for metrics..."
            // NOTE(review): the selected report period is read here but none of
            // the queries below use it; every rate() window is fixed at 5m.
            def period = params.REPORT_PERIOD

            // PromQL expressions, one per exported metric.
            def cpuUsageQuery = "avg(rate(container_cpu_usage_seconds_total{container!=''}[5m])) * 100"
            def memoryUsageQuery = "sum(container_memory_usage_bytes{container!=''}) / 1024 / 1024 / 1024"
            def networkRxQuery = "sum(rate(container_network_receive_bytes_total[5m])) / 1024 / 1024"
            def networkTxQuery = "sum(rate(container_network_transmit_bytes_total[5m])) / 1024 / 1024"
            def restartsQuery = "sum(kube_pod_container_status_restarts_total)"

            // Each result is stored as a string in env for the later stages.
            env.AVG_CPU_USAGE = queryPrometheus(cpuUsageQuery)
            env.TOTAL_MEMORY_USAGE_GB = queryPrometheus(memoryUsageQuery)
            env.NETWORK_RX_MB = queryPrometheus(networkRxQuery)
            env.NETWORK_TX_MB = queryPrometheus(networkTxQuery)
            env.TOTAL_RESTARTS = queryPrometheus(restartsQuery)

            echo "Metrics collected successfully"
        }
    }
}
stage('Analyze Node Resources') {
    steps {
        script {
            echo "💻 Analyzing node resources..."
            try {
                // One capacity value per node, newline separated.
                def rawCpu = sh(
                    returnStdout: true,
                    script: """kubectl get nodes -o jsonpath='{range .items[*]}{.status.capacity.cpu}{"\\n"}{end}' 2>/dev/null || echo "0" """
                )
                def nodeCpus = rawCpu.trim().split('\n')

                def rawMem = sh(
                    returnStdout: true,
                    script: """kubectl get nodes -o jsonpath='{range .items[*]}{.status.capacity.memory}{"\\n"}{end}' 2>/dev/null || echo "0Ki" """
                )
                def nodeMems = rawMem.trim().split('\n')

                // Sum CPU cores, skipping blanks and the "0" fallback marker.
                def totalCPU = 0
                for (cpu in nodeCpus) {
                    if (!cpu?.trim() || cpu == "0") {
                        continue
                    }
                    totalCPU += cpu.toInteger()
                }

                // Sum memory: strip everything but digits and treat the number
                // as KiB, converting to GiB. Skips blanks and the "0Ki" marker.
                def totalMemoryGB = 0
                for (mem in nodeMems) {
                    if (!mem?.trim() || mem == "0Ki") {
                        continue
                    }
                    def digitsOnly = mem.replaceAll('[^0-9]', '')
                    if (!digitsOnly) {
                        continue
                    }
                    totalMemoryGB += (digitsOnly.toLong() / 1024 / 1024)
                }

                env.TOTAL_CPU_CORES = totalCPU.toString()
                env.TOTAL_MEMORY_GB = totalMemoryGB.toString()
                echo "Total CPU cores: ${env.TOTAL_CPU_CORES}"
                echo "Total Memory: ${env.TOTAL_MEMORY_GB} GB"
            } catch (Exception e) {
                // Best effort: report zeros rather than failing the build.
                echo "⚠️ Failed to analyze node resources: ${e.message}"
                env.TOTAL_CPU_CORES = "0"
                env.TOTAL_MEMORY_GB = "0"
            }
        }
    }
}
stage('Analyze Pod Status') {
    steps {
        script {
            echo "📦 Analyzing pod status across namespaces..."
            try {
                // A single kubectl call returns one line per pod:
                //   <namespace> TAB <phase> TAB <one "c" per container>
                // This replaces one kubectl call per namespace and yields the
                // real container count instead of an assumed "pods * 2".
                def podLines = sh(
                    script: """kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{.metadata.namespace}{"\\t"}{.status.phase}{"\\t"}{range .spec.containers[*]}{"c"}{end}{"\\n"}{end}' 2>/dev/null || echo "" """,
                    returnStdout: true
                ).trim()
                def running = 0
                def pending = 0
                def failed = 0
                // namespace -> [pods: <count>, containers: <count>], in listing order
                def namespaceStats = [:]
                if (podLines) {
                    for (line in podLines.split('\n')) {
                        def fields = line.split('\t')
                        // Skip blank or malformed lines (need at least namespace + phase).
                        if (fields.size() < 2 || !fields[0].trim()) {
                            continue
                        }
                        def ns = fields[0].trim()
                        def phase = fields[1].trim().toLowerCase()
                        // Third field is absent when a pod lists no containers.
                        def containerCount = fields.size() > 2 ? fields[2].trim().length() : 0

                        if (phase == 'running') {
                            running++
                        } else if (phase == 'pending') {
                            pending++
                        } else if (phase == 'failed') {
                            failed++
                        }

                        def stat = namespaceStats[ns]
                        if (stat == null) {
                            stat = [pods: 0, containers: 0]
                            namespaceStats[ns] = stat
                        }
                        stat.pods = stat.pods + 1
                        stat.containers = stat.containers + containerCount
                    }
                }
                env.PODS_RUNNING = running.toString()
                env.PODS_PENDING = pending.toString()
                env.PODS_FAILED = failed.toString()
                // Save namespace stats as "name:pods:containers,..." for
                // generateNamespaceTable(); only namespaces with pods appear.
                env.NAMESPACE_STATS = namespaceStats.collect { k, v -> "${k}:${v.pods}:${v.containers}" }.join(',')
                echo "Pods running: ${env.PODS_RUNNING}"
                echo "Pods pending: ${env.PODS_PENDING}"
                echo "Pods failed: ${env.PODS_FAILED}"
            } catch (Exception e) {
                // Best effort: report zeros rather than failing the build.
                echo "⚠️ Failed to analyze pods: ${e.message}"
                env.PODS_RUNNING = "0"
                env.PODS_PENDING = "0"
                env.PODS_FAILED = "0"
                env.NAMESPACE_STATS = ""
            }
        }
    }
}
stage('Calculate Costs') {
    steps {
        script {
            echo "💰 Calculating resource costs..."
            try {
                // Capacity figures come from the 'Analyze Node Resources' stage.
                def coreCount = env.TOTAL_CPU_CORES.toFloat()
                def memoryGb = env.TOTAL_MEMORY_GB.toFloat()

                // 24 hours * 30 days approximates one month of usage.
                def monthlyCpuHours = coreCount * 24 * 30
                def monthlyMemoryGbHours = memoryGb * 24 * 30

                def cpuTotal = monthlyCpuHours * env.CPU_PRICE_PER_HOUR.toFloat()
                def memoryTotal = monthlyMemoryGbHours * env.MEMORY_PRICE_PER_GB_HOUR.toFloat()
                def grandTotal = cpuTotal + memoryTotal

                // Stored as two-decimal strings for the reports.
                env.MONTHLY_CPU_COST = String.format('%.2f', cpuTotal)
                env.MONTHLY_MEMORY_COST = String.format('%.2f', memoryTotal)
                env.MONTHLY_TOTAL_COST = String.format('%.2f', grandTotal)

                echo "Estimated monthly costs:"
                echo " CPU: \$${env.MONTHLY_CPU_COST}"
                echo " Memory: \$${env.MONTHLY_MEMORY_COST}"
                echo " Total: \$${env.MONTHLY_TOTAL_COST}"
            } catch (Exception e) {
                // Best effort: fall back to zero costs rather than failing.
                echo "⚠️ Failed to calculate costs: ${e.message}"
                env.MONTHLY_CPU_COST = "0.00"
                env.MONTHLY_MEMORY_COST = "0.00"
                env.MONTHLY_TOTAL_COST = "0.00"
            }
        }
    }
}
stage('Check for Issues') {
    steps {
        script {
            echo "🔍 Checking for potential issues..."
            // Findings are collected in check order and joined with newlines.
            def issues = []

            // More than 10 container restarts in total
            if (env.TOTAL_RESTARTS.toFloat() > 10) {
                issues.add("⚠️ High pod restart count: ${env.TOTAL_RESTARTS}")
            }
            // Any pod in Failed phase
            if (env.PODS_FAILED.toInteger() > 0) {
                issues.add("❌ ${env.PODS_FAILED} pods in Failed state")
            }
            // More than 5 pods stuck in Pending
            if (env.PODS_PENDING.toInteger() > 5) {
                issues.add("⚠️ ${env.PODS_PENDING} pods in Pending state (possible resource constraints)")
            }
            // Average CPU usage above 80
            if (env.AVG_CPU_USAGE.toFloat() > 80) {
                issues.add("🔥 High CPU usage: ${env.AVG_CPU_USAGE}%")
            }

            if (issues.isEmpty()) {
                env.ISSUES = "✅ No issues detected"
                echo "✅ No critical issues found"
            } else {
                env.ISSUES = issues.join('\n')
                echo "Found ${issues.size()} issues:"
                for (issue in issues) {
                    echo issue
                }
            }
        }
    }
}
stage('Generate HTML Dashboard') {
    steps {
        script {
            echo "🎨 Generating HTML dashboard..."
            def namespaceTable = generateNamespaceTable(env.NAMESPACE_STATS)
            def html = generateDashboardHTML(
                namespaceTable: namespaceTable
            )
            // writeFile resolves relative paths against the workspace and does
            // NOT expand shell-style variables. The previous "\${WORKSPACE}/..."
            // value was therefore a literal directory named "${WORKSPACE}", so
            // the file never reached dashboard-reports/ where archiveArtifacts
            // and publishHTML look for it.
            def dashboardPath = 'dashboard-reports/dashboard.html'
            writeFile file: dashboardPath, text: html
            echo "✅ Dashboard generated: ${env.WORKSPACE}/${dashboardPath}"
        }
    }
}
stage('Generate JSON Report') {
    steps {
        script {
            echo "📄 Generating JSON report..."
            // Escape the issues text for embedding in a JSON string literal.
            // String.replace() is literal (no regex), unlike the previous
            // replaceAll('"', '\\"') whose replacement collapsed back to a bare
            // quote and so escaped nothing. Backslashes go first so the escapes
            // added for quotes and newlines are not doubled.
            def issuesJson = env.ISSUES ?: ''
            issuesJson = issuesJson.replace('\\', '\\\\')
            issuesJson = issuesJson.replace('"', '\\"')
            issuesJson = issuesJson.replace('\n', '\\n')
            def report = """
{
"generated_at": "${new Date().format('yyyy-MM-dd HH:mm:ss')}",
"period": "${params.REPORT_PERIOD}",
"cluster": {
"version": "${env.CLUSTER_VERSION}",
"nodes": ${env.NODE_COUNT},
"namespaces": ${env.NAMESPACE_COUNT},
"total_pods": ${env.POD_COUNT}
},
"resources": {
"total_cpu_cores": ${env.TOTAL_CPU_CORES},
"total_memory_gb": ${env.TOTAL_MEMORY_GB},
"avg_cpu_usage_percent": ${env.AVG_CPU_USAGE},
"total_memory_usage_gb": ${env.TOTAL_MEMORY_USAGE_GB}
},
"pods": {
"running": ${env.PODS_RUNNING},
"pending": ${env.PODS_PENDING},
"failed": ${env.PODS_FAILED}
},
"costs": {
"monthly_cpu_usd": ${env.MONTHLY_CPU_COST},
"monthly_memory_usd": ${env.MONTHLY_MEMORY_COST},
"monthly_total_usd": ${env.MONTHLY_TOTAL_COST}
},
"issues": "${issuesJson}"
}
"""
            // writeFile does not expand "\${WORKSPACE}"; a workspace-relative
            // path puts the file where archiveArtifacts expects it.
            writeFile file: 'dashboard-reports/report.json', text: report
            echo "✅ JSON report generated"
        }
    }
}
}
post {
    success {
        script {
            echo "✅ Dashboard generation completed successfully!"

            // Link to the report published by publishHTML below.
            def dashboardUrl = "${env.BUILD_URL}Cluster_20Health_20Dashboard/"

            // Keep everything under dashboard-reports/ with the build.
            archiveArtifacts artifacts: 'dashboard-reports/*', fingerprint: true

            // Expose dashboard.html as a report on the build page.
            publishHTML([
                allowMissing: false,
                alwaysLinkToLastBuild: true,
                keepAll: true,
                reportDir: 'dashboard-reports',
                reportFiles: 'dashboard.html',
                reportName: 'Cluster Health Dashboard',
                reportTitles: 'Cluster Health Dashboard'
            ])

            // Telegram summary (HTML parse mode), only when requested.
            if (params.SEND_TELEGRAM) {
                def summary = """
📊 <b>Cluster Health Report</b>
━━━━━━━━━━━━━━━━━━━━━━
<b>📋 Cluster Info</b>
<b>Version:</b> ${env.CLUSTER_VERSION}
<b>Nodes:</b> ${env.NODE_COUNT}
<b>Namespaces:</b> ${env.NAMESPACE_COUNT}
<b>Total Pods:</b> ${env.POD_COUNT}
━━━━━━━━━━━━━━━━━━━━━━
<b>💻 Resources</b>
<b>CPU Cores:</b> ${env.TOTAL_CPU_CORES}
<b>Memory:</b> ${env.TOTAL_MEMORY_GB} GB
<b>Avg CPU Usage:</b> ${env.AVG_CPU_USAGE}%
━━━━━━━━━━━━━━━━━━━━━━
<b>📦 Pod Status</b>
<b>Running:</b> ${env.PODS_RUNNING} ✅
<b>Pending:</b> ${env.PODS_PENDING} ⏳
<b>Failed:</b> ${env.PODS_FAILED} ❌
━━━━━━━━━━━━━━━━━━━━━━
<b>💰 Estimated Monthly Costs</b>
<b>CPU:</b> \$${env.MONTHLY_CPU_COST}
<b>Memory:</b> \$${env.MONTHLY_MEMORY_COST}
<b>Total:</b> \$${env.MONTHLY_TOTAL_COST}
━━━━━━━━━━━━━━━━━━━━━━
<b>🔍 Issues</b>
${env.ISSUES}
<a href="${dashboardUrl}">View Full Dashboard</a>
"""
                sendTelegramNotification(summary)
            }

            echo "\n📊 Dashboard URL: ${dashboardUrl}"
        }
    }
    failure {
        script {
            echo "❌ Dashboard generation failed!"
            // Point the recipient at the console log of the failed build.
            if (params.SEND_TELEGRAM) {
                def failureNotice = "❌ <b>Cluster Health Report Failed</b>\n\n" +
                    "<a href='${env.BUILD_URL}console'>View Console Output</a>"
                sendTelegramNotification(failureNotice)
            }
        }
    }
    always {
        script {
            // Only logs; no cleanup work is performed here.
            echo "🧹 Cleanup completed"
        }
    }
}
}
// Helper function to query Prometheus (NO JQ NEEDED!)
//
// Runs an instant query against ${PROMETHEUS_URL}/api/v1/query and returns the
// first sample value as a string.
//
// query   - PromQL expression; it is URL-encoded before being sent.
// returns - the sample value as a plain decimal string, or "0" when the request
//           fails, no sample is returned, or the sample is not a finite number
//           (e.g. "NaN", "+Inf"), so callers can safely call toFloat() on it
//           and embed it in JSON.
def queryPrometheus(query) {
    try {
        def encodedQuery = URLEncoder.encode(query, "UTF-8")
        // curl is piped straight into grep so the response body is never
        // spliced into a shell command line (a single quote in the body would
        // break the quoting). The timestamp pattern accepts a fractional part:
        // Prometheus emits values as [<unix_time>,"<value>"] where the time may
        // be e.g. 1435781451.781, which a digits-only pattern does not match.
        // The pipeline's exit status is that of `head`, so an empty result is
        // handled below rather than with a shell `||`.
        def value = sh(
            script: """
                curl -s '${env.PROMETHEUS_URL}/api/v1/query?query=${encodedQuery}' |
                    grep -oP '"value":\\[[0-9.]+,"\\K[^"]+' |
                    head -n 1
            """,
            returnStdout: true
        ).trim()
        if (!value) {
            return "0"
        }
        // Reject NaN / +Inf / -Inf and anything else that is not a number.
        if (!value.matches('-?[0-9]+(\\.[0-9]+)?([eE][+-]?[0-9]+)?')) {
            echo "⚠️ Prometheus returned a non-numeric value '${value}', using 0"
            return "0"
        }
        return value
    } catch (Exception e) {
        echo "⚠️ Failed to query Prometheus: ${e.message}"
        return "0"
    }
}
// Helper function to send Telegram notification
//
// Posts `message` (Telegram HTML parse mode) to the chat identified by the
// TELEGRAM_CHAT_ID environment variable using the bot token in
// TELEGRAM_BOT_TOKEN. Best effort: any failure of the sh step is logged and
// swallowed so a notification problem never fails the build.
//
// message - text to send; may contain newlines, quotes, '$', '%' and '&'.
def sendTelegramNotification(message) {
    try {
        // The message travels through an environment variable and the script
        // is a single-quoted Groovy string, so all expansion happens in the
        // shell at run time:
        //  - '$' in the text (e.g. "$12.34") is no longer treated as a shell
        //    variable, and '"' (e.g. href="...") no longer ends the quoting;
        //  - the bot token is not interpolated into the script by Groovy.
        // --data-urlencode encodes '%', '&', '+' and newlines, which plain -d
        // sends raw in a form-encoded body.
        withEnv(["TELEGRAM_MESSAGE=${message}"]) {
            sh '''
                set +x
                curl -sS -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
                    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
                    --data-urlencode "parse_mode=HTML" \
                    --data-urlencode "disable_web_page_preview=true" \
                    --data-urlencode "text=${TELEGRAM_MESSAGE}"
            '''
        }
    } catch (Exception e) {
        echo "⚠️ Failed to send Telegram notification: ${e.message}"
    }
}
// Helper function to generate namespace table HTML
//
// namespaceStatsStr - comma-separated "name:pods:containers" entries
//                     (may be null or empty).
// returns           - one <tr> per entry that has at least three ':'-separated
//                     fields, or a single "No data available" row when nothing
//                     usable was supplied.
def generateNamespaceTable(namespaceStatsStr) {
    def placeholderRow = "<tr><td colspan='3'>No data available</td></tr>"
    if (!namespaceStatsStr) {
        return placeholderRow
    }

    def html = new StringBuilder()
    for (entry in namespaceStatsStr.split(',')) {
        def fields = entry.split(':')
        // Entries with fewer than three fields are ignored.
        if (fields.size() < 3) {
            continue
        }
        html.append("""
<tr>
<td>${fields[0]}</td>
<td>${fields[1]}</td>
<td>${fields[2]}</td>
</tr>
""".toString())
    }

    return html.length() > 0 ? html.toString() : placeholderRow
}
// Helper function to generate complete HTML dashboard
//
// args    - map of named arguments; only args.namespaceTable is read. It is
//           inserted verbatim inside <tbody>, so it must already be HTML rows
//           (the 'Generate HTML Dashboard' stage passes the result of
//           generateNamespaceTable()).
// returns - the whole page (inline CSS + body) as one interpolated string.
//
// Besides its argument the template reads these pipeline globals:
//   params.REPORT_PERIOD, env.BUILD_NUMBER,
//   env.CLUSTER_VERSION, env.NODE_COUNT, env.NAMESPACE_COUNT, env.POD_COUNT,
//   env.TOTAL_CPU_CORES, env.TOTAL_MEMORY_GB, env.AVG_CPU_USAGE,
//   env.PODS_RUNNING, env.PODS_PENDING, env.PODS_FAILED, env.TOTAL_RESTARTS,
//   env.MONTHLY_CPU_COST, env.MONTHLY_MEMORY_COST, env.MONTHLY_TOTAL_COST,
//   env.CPU_PRICE_PER_HOUR, env.MEMORY_PRICE_PER_GB_HOUR, env.ISSUES.
// env.AVG_CPU_USAGE and env.TOTAL_RESTARTS are converted with toFloat() to pick
// a CSS class (error above 80, warning above 10 respectively), so they must be
// numeric strings. env.AVG_CPU_USAGE is also used as the progress bar width.
// env.ISSUES is rendered as a single "no-issues" box when it contains '✅',
// otherwise as one "issue-item" box per line.
// Values are inserted without HTML escaping.
def generateDashboardHTML(args) {
def namespaceTable = args.namespaceTable
return """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Cluster Health Dashboard</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 20px;
color: #333;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
.header {
background: white;
padding: 30px;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
margin-bottom: 20px;
}
.header h1 {
color: #667eea;
font-size: 36px;
margin-bottom: 10px;
}
.header .timestamp {
color: #666;
font-size: 14px;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 20px;
margin-bottom: 20px;
}
.card {
background: white;
padding: 25px;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.card h2 {
color: #667eea;
font-size: 18px;
margin-bottom: 15px;
border-bottom: 2px solid #f0f0f0;
padding-bottom: 10px;
}
.metric {
display: flex;
justify-content: space-between;
align-items: center;
padding: 10px 0;
border-bottom: 1px solid #f0f0f0;
}
.metric:last-child {
border-bottom: none;
}
.metric-label {
color: #666;
font-size: 14px;
}
.metric-value {
font-size: 24px;
font-weight: bold;
color: #333;
}
.metric-value.success {
color: #10b981;
}
.metric-value.warning {
color: #f59e0b;
}
.metric-value.error {
color: #ef4444;
}
.chart-container {
background: white;
padding: 25px;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
margin-bottom: 20px;
}
table {
width: 100%;
border-collapse: collapse;
}
th, td {
padding: 12px;
text-align: left;
border-bottom: 1px solid #f0f0f0;
}
th {
background: #f8f9fa;
color: #667eea;
font-weight: 600;
}
tr:hover {
background: #f8f9fa;
}
.issues {
background: white;
padding: 25px;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
margin-bottom: 20px;
}
.issues h2 {
color: #ef4444;
margin-bottom: 15px;
}
.issue-item {
padding: 10px;
margin-bottom: 10px;
background: #fef2f2;
border-left: 4px solid #ef4444;
border-radius: 4px;
}
.no-issues {
padding: 10px;
background: #f0fdf4;
border-left: 4px solid #10b981;
border-radius: 4px;
color: #166534;
}
.progress-bar {
width: 100%;
height: 20px;
background: #f0f0f0;
border-radius: 10px;
overflow: hidden;
margin: 10px 0;
}
.progress-fill {
height: 100%;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
transition: width 0.3s ease;
}
</style>
</head>
<body>
<div class="container">
<!-- Header -->
<div class="header">
<h1>☸️ Kubernetes Cluster Health Dashboard</h1>
<div class="timestamp">Generated: ${new Date().format('yyyy-MM-dd HH:mm:ss')} | Period: ${params.REPORT_PERIOD}</div>
</div>
<!-- Cluster Overview -->
<div class="grid">
<div class="card">
<h2>📋 Cluster Information</h2>
<div class="metric">
<span class="metric-label">Kubernetes Version</span>
<span class="metric-value">${env.CLUSTER_VERSION}</span>
</div>
<div class="metric">
<span class="metric-label">Nodes</span>
<span class="metric-value success">${env.NODE_COUNT}</span>
</div>
<div class="metric">
<span class="metric-label">Namespaces</span>
<span class="metric-value">${env.NAMESPACE_COUNT}</span>
</div>
<div class="metric">
<span class="metric-label">Total Pods</span>
<span class="metric-value">${env.POD_COUNT}</span>
</div>
</div>
<div class="card">
<h2>💻 Resource Capacity</h2>
<div class="metric">
<span class="metric-label">Total CPU Cores</span>
<span class="metric-value">${env.TOTAL_CPU_CORES}</span>
</div>
<div class="metric">
<span class="metric-label">Total Memory</span>
<span class="metric-value">${env.TOTAL_MEMORY_GB} GB</span>
</div>
<div class="metric">
<span class="metric-label">Avg CPU Usage</span>
<span class="metric-value ${env.AVG_CPU_USAGE.toFloat() > 80 ? 'error' : 'success'}">${env.AVG_CPU_USAGE}%</span>
</div>
<div class="progress-bar">
<div class="progress-fill" style="width: ${env.AVG_CPU_USAGE}%"></div>
</div>
</div>
<div class="card">
<h2>📦 Pod Status</h2>
<div class="metric">
<span class="metric-label">Running</span>
<span class="metric-value success">${env.PODS_RUNNING}</span>
</div>
<div class="metric">
<span class="metric-label">Pending</span>
<span class="metric-value warning">${env.PODS_PENDING}</span>
</div>
<div class="metric">
<span class="metric-label">Failed</span>
<span class="metric-value error">${env.PODS_FAILED}</span>
</div>
<div class="metric">
<span class="metric-label">Total Restarts</span>
<span class="metric-value ${env.TOTAL_RESTARTS.toFloat() > 10 ? 'warning' : 'success'}">${env.TOTAL_RESTARTS}</span>
</div>
</div>
<div class="card">
<h2>💰 Estimated Monthly Costs</h2>
<div class="metric">
<span class="metric-label">CPU Cost</span>
<span class="metric-value">\$${env.MONTHLY_CPU_COST}</span>
</div>
<div class="metric">
<span class="metric-label">Memory Cost</span>
<span class="metric-value">\$${env.MONTHLY_MEMORY_COST}</span>
</div>
<div class="metric">
<span class="metric-label"><strong>Total Cost</strong></span>
<span class="metric-value">\$${env.MONTHLY_TOTAL_COST}</span>
</div>
<div style="margin-top: 15px; padding: 10px; background: #f0f9ff; border-radius: 4px; font-size: 12px; color: #0369a1;">
💡 Based on: CPU \$${env.CPU_PRICE_PER_HOUR}/core/hour, Memory \$${env.MEMORY_PRICE_PER_GB_HOUR}/GB/hour
</div>
</div>
</div>
<!-- Issues Section -->
<div class="issues">
<h2>🔍 Health Checks & Issues</h2>
${env.ISSUES.contains('✅') ?
'<div class="no-issues">' + env.ISSUES + '</div>' :
env.ISSUES.split('\n').collect { "<div class='issue-item'>${it}</div>" }.join('')
}
</div>
<!-- Namespace Details -->
<div class="chart-container">
<h2>📊 Resources by Namespace</h2>
<table>
<thead>
<tr>
<th>Namespace</th>
<th>Pods</th>
<th>Containers</th>
</tr>
</thead>
<tbody>
${namespaceTable}
</tbody>
</table>
</div>
<!-- Footer -->
<div style="text-align: center; color: white; padding: 20px; font-size: 14px;">
Generated by Jenkins CI/CD Pipeline • Build #${env.BUILD_NUMBER}
</div>
</div>
</body>
</html>
"""
}