Files
k3s-gitops/apps/cluster-health-dashboard/Jenkinsfile

798 lines
28 KiB
Groovy

pipeline {
agent any
environment {
    // Report identity and the scratch directory every stage writes into.
    DASHBOARD_NAME = 'cluster-health-dashboard'
    OUTPUT_DIR = '/tmp/dashboard-reports'

    // In-cluster monitoring endpoints.
    PROMETHEUS_URL = 'http://prometheus-server.monitoring.svc.cluster.local'
    GRAFANA_URL = 'http://grafana.monitoring.svc.cluster.local'

    // Telegram settings, resolved from the Jenkins credentials store by ID.
    TELEGRAM_BOT_TOKEN = credentials('telegram-bot-token')
    TELEGRAM_CHAT_ID = credentials('telegram-chat-id')

    // Unit prices in USD used by the cost estimate (adjust to your actual costs).
    CPU_PRICE_PER_HOUR = '0.04' // per vCPU hour
    MEMORY_PRICE_PER_GB_HOUR = '0.005' // per GB hour
}
triggers {
    // Cron fields: minute 0, hour 8, any day of month, any month, Monday-Friday.
    cron('0 8 * * 1-5')
}
parameters {
    // Reporting window; the first choice is the default for scheduled runs.
    choice(name: 'REPORT_PERIOD', choices: ['24h', '7d', '30d'], description: 'Time period for metrics')

    // Delivery switches, both enabled by default.
    booleanParam(name: 'SEND_EMAIL', defaultValue: true, description: 'Send report via email')
    booleanParam(name: 'SEND_TELEGRAM', defaultValue: true, description: 'Send summary to Telegram')
}
stages {
stage('Initialize') {
    steps {
        script {
            echo "🚀 Starting Cluster Health Dashboard generation..."

            // Create the report directory if needed, then drop files left by earlier runs.
            def prepareOutputDir = [
                '',
                "mkdir -p ${OUTPUT_DIR}",
                "rm -f ${OUTPUT_DIR}/*",
                ''
            ].join('\n')
            sh prepareOutputDir

            // Optional heads-up that a report is being generated.
            if (params.SEND_TELEGRAM) {
                def startNotice = "📊 <b>Cluster Health Report</b>\n\n<i>Generating dashboard for period: ${params.REPORT_PERIOD}</i>"
                sendTelegramNotification(startNotice)
            }
        }
    }
}
stage('Collect Cluster Info') {
    steps {
        script {
            echo "📋 Collecting cluster information..."

            // Server version. The `--short` flag is not available in newer kubectl
            // releases, and with stderr discarded that failure silently produced an
            // empty version. The JSON output is stable across kubectl versions.
            def serverVersion = sh(
                script: "kubectl version -o json 2>/dev/null | jq -r '.serverVersion.gitVersion // empty'",
                returnStdout: true
            ).trim()
            env.CLUSTER_VERSION = serverVersion ?: 'unknown'

            // Counts the lines `kubectl get <resource> --no-headers` prints.
            def countResources = { String resource ->
                sh(
                    script: "kubectl get ${resource} --no-headers | wc -l",
                    returnStdout: true
                ).trim()
            }

            env.NODE_COUNT = countResources('nodes')
            env.NAMESPACE_COUNT = countResources('namespaces')
            env.POD_COUNT = countResources('pods --all-namespaces')

            echo "Cluster version: ${env.CLUSTER_VERSION}"
            echo "Nodes: ${env.NODE_COUNT}"
            echo "Namespaces: ${env.NAMESPACE_COUNT}"
            echo "Pods: ${env.POD_COUNT}"
        }
    }
}
stage('Query Prometheus Metrics') {
    steps {
        script {
            echo "📈 Querying Prometheus for metrics..."

            // REPORT_PERIOD ('24h', '7d', '30d') is a valid PromQL duration, so it is used
            // directly as the rate() window. Previously the value was read but never used
            // and every rate was computed over a fixed 5m window regardless of the
            // selected period. Fall back to the default choice if the parameter is unset.
            def period = params.REPORT_PERIOD ?: '24h'

            // Average per-container CPU rate over the period, scaled by 100.
            env.AVG_CPU_USAGE = queryPrometheus(
                "avg(rate(container_cpu_usage_seconds_total{container!=''}[${period}])) * 100"
            )

            // Current memory usage summed over all containers, in GiB (instant value).
            env.TOTAL_MEMORY_USAGE_GB = queryPrometheus(
                "sum(container_memory_usage_bytes{container!=''}) / 1024 / 1024 / 1024"
            )

            // Network receive rate over the period, in MiB/s.
            env.NETWORK_RX_MB = queryPrometheus(
                "sum(rate(container_network_receive_bytes_total[${period}])) / 1024 / 1024"
            )

            // Network transmit rate over the period, in MiB/s.
            env.NETWORK_TX_MB = queryPrometheus(
                "sum(rate(container_network_transmit_bytes_total[${period}])) / 1024 / 1024"
            )

            // Sum of the restart counters as currently reported (no range applied).
            env.TOTAL_RESTARTS = queryPrometheus(
                "sum(kube_pod_container_status_restarts_total)"
            )

            echo "Metrics collected successfully"
        }
    }
}
stage('Analyze Node Resources') {
    steps {
        script {
            echo "💻 Analyzing node resources..."

            // Write {"nodes": [...]} in one jq pass. The previous shell loop used `read`
            // without -r, which strips the backslashes jq emits for escaped quotes
            // (common in node annotations) and therefore produced invalid JSON.
            // With -n/inputs an empty kubectl output still yields {"nodes": []},
            // matching what the old loop wrote in that case.
            sh "kubectl get nodes -o json | jq -n '{nodes: [inputs | .items[]?]}' > ${OUTPUT_DIR}/node-resources.json"

            // Sum capacity across all nodes.
            def nodeData = readJSON file: "${OUTPUT_DIR}/node-resources.json"
            def totalCPU = 0
            def totalMemoryGB = 0
            nodeData.nodes.each { node ->
                def capacity = node.status.capacity

                // CPU is normally a whole number of cores, but a Kubernetes quantity may
                // also be expressed in millicores ("3920m"); toInteger() would throw on that.
                def cpuQuantity = capacity.cpu.toString()
                def cpu = cpuQuantity.endsWith('m') ?
                    cpuQuantity.substring(0, cpuQuantity.length() - 1).toBigDecimal() / 1000 :
                    cpuQuantity.toBigDecimal()

                // NOTE(review): assumes memory capacity is reported with a Ki suffix
                // (only the digits are kept) — confirm for your nodes.
                def memoryKi = capacity.memory.replaceAll('[^0-9]', '').toLong()
                def memoryGB = memoryKi / 1024 / 1024

                totalCPU += cpu
                totalMemoryGB += memoryGB
            }

            env.TOTAL_CPU_CORES = totalCPU.toString()
            env.TOTAL_MEMORY_GB = totalMemoryGB.toString()
            echo "Total CPU cores: ${env.TOTAL_CPU_CORES}"
            echo "Total Memory: ${env.TOTAL_MEMORY_GB} GB"
        }
    }
}
stage('Analyze Pod Status') {
    steps {
        script {
            echo "📦 Analyzing pod status across namespaces..."

            // Dump every pod in the cluster to a file and load it back as JSON.
            sh "\nkubectl get pods --all-namespaces -o json > ${OUTPUT_DIR}/all-pods.json\n"
            def podList = readJSON file: "${OUTPUT_DIR}/all-pods.json"

            // Tally of pods per lower-cased phase; phases not listed here are added on demand.
            def phaseTotals = [running: 0, pending: 0, failed: 0, succeeded: 0, unknown: 0]
            // namespace -> [pods: <count>, containers: <count>]
            def perNamespace = [:]

            for (pod in podList.items) {
                def phaseKey = pod.status.phase.toLowerCase()
                phaseTotals[phaseKey] = (phaseTotals[phaseKey] ?: 0) + 1

                def ns = pod.metadata.namespace
                if (!perNamespace[ns]) {
                    perNamespace[ns] = [pods: 0, containers: 0]
                }
                def bucket = perNamespace[ns]
                bucket.pods = bucket.pods + 1
                bucket.containers = bucket.containers + pod.spec.containers.size()
            }

            env.PODS_RUNNING = phaseTotals.running.toString()
            env.PODS_PENDING = phaseTotals.pending.toString()
            env.PODS_FAILED = phaseTotals.failed.toString()

            // Persist the per-namespace numbers for the HTML dashboard stage.
            writeJSON file: "${OUTPUT_DIR}/namespace-stats.json", json: perNamespace

            echo "Pods running: ${env.PODS_RUNNING}"
            echo "Pods pending: ${env.PODS_PENDING}"
            echo "Pods failed: ${env.PODS_FAILED}"
        }
    }
}
stage('Calculate Costs') {
    steps {
        script {
            echo "💰 Calculating resource costs..."

            // Unit prices from the pipeline environment.
            def cpuRate = env.CPU_PRICE_PER_HOUR.toFloat()
            def memoryRate = env.MEMORY_PRICE_PER_GB_HOUR.toFloat()

            // Capacity-hours for a 30-day month (24 hours x 30 days).
            def monthlyCoreHours = env.TOTAL_CPU_CORES.toFloat() * 24 * 30
            def monthlyGbHours = env.TOTAL_MEMORY_GB.toFloat() * 24 * 30

            def cpuSpend = monthlyCoreHours * cpuRate
            def memorySpend = monthlyGbHours * memoryRate

            // Publish each figure with two decimals.
            env.MONTHLY_CPU_COST = String.format('%.2f', cpuSpend)
            env.MONTHLY_MEMORY_COST = String.format('%.2f', memorySpend)
            env.MONTHLY_TOTAL_COST = String.format('%.2f', cpuSpend + memorySpend)

            echo "Estimated monthly costs:"
            echo " CPU: \$${env.MONTHLY_CPU_COST}"
            echo " Memory: \$${env.MONTHLY_MEMORY_COST}"
            echo " Total: \$${env.MONTHLY_TOTAL_COST}"
        }
    }
}
stage('Check for Issues') {
    steps {
        script {
            echo "🔍 Checking for potential issues..."

            // Findings are collected in a fixed order: restarts, failed, pending, CPU.
            def findings = []

            // More than 10 restarts in total.
            if (env.TOTAL_RESTARTS.toInteger() > 10) {
                findings.add("⚠️ High pod restart count: ${env.TOTAL_RESTARTS}")
            }
            // Any pod in the Failed phase.
            if (env.PODS_FAILED.toInteger() > 0) {
                findings.add("❌ ${env.PODS_FAILED} pods in Failed state")
            }
            // More than 5 pods stuck in Pending.
            if (env.PODS_PENDING.toInteger() > 5) {
                findings.add("⚠️ ${env.PODS_PENDING} pods in Pending state (possible resource constraints)")
            }
            // Average CPU usage above 80.
            if (env.AVG_CPU_USAGE.toFloat() > 80) {
                findings.add("🔥 High CPU usage: ${env.AVG_CPU_USAGE}%")
            }

            if (findings.size() == 0) {
                echo "✅ No critical issues found"
                env.ISSUES = "✅ No issues detected"
            } else {
                // One finding per line; later stages split on the newline.
                env.ISSUES = findings.join('\n')
                echo "Found ${findings.size()} issues:"
                findings.each { finding -> echo finding }
            }
        }
    }
}
stage('Generate HTML Dashboard') {
    steps {
        script {
            echo "🎨 Generating HTML dashboard..."

            // Per-namespace counts written earlier by the pod status stage.
            def statsByNamespace = readJSON file: "${OUTPUT_DIR}/namespace-stats.json"

            // Render the table rows first, then embed them in the full page.
            def tableRows = generateNamespaceTable(statsByNamespace)
            def page = generateDashboardHTML(namespaceTable: tableRows)

            writeFile file: "${OUTPUT_DIR}/dashboard.html", text: page
            echo "✅ Dashboard generated: ${OUTPUT_DIR}/dashboard.html"
        }
    }
}
stage('Generate JSON Report') {
    steps {
        script {
            echo "📄 Generating JSON report..."

            // Each section is assembled separately from the values earlier stages
            // stored in env, converting the strings back to numbers.
            def clusterSection = [
                version: env.CLUSTER_VERSION,
                nodes: env.NODE_COUNT.toInteger(),
                namespaces: env.NAMESPACE_COUNT.toInteger(),
                total_pods: env.POD_COUNT.toInteger()
            ]
            def resourceSection = [
                total_cpu_cores: env.TOTAL_CPU_CORES.toFloat(),
                total_memory_gb: env.TOTAL_MEMORY_GB.toFloat(),
                avg_cpu_usage_percent: env.AVG_CPU_USAGE.toFloat(),
                total_memory_usage_gb: env.TOTAL_MEMORY_USAGE_GB.toFloat()
            ]
            def podSection = [
                running: env.PODS_RUNNING.toInteger(),
                pending: env.PODS_PENDING.toInteger(),
                failed: env.PODS_FAILED.toInteger()
            ]
            def costSection = [
                monthly_cpu_usd: env.MONTHLY_CPU_COST.toFloat(),
                monthly_memory_usd: env.MONTHLY_MEMORY_COST.toFloat(),
                monthly_total_usd: env.MONTHLY_TOTAL_COST.toFloat()
            ]

            // Top-level document; key order matches the report layout.
            def document = [
                generated_at: new Date().format('yyyy-MM-dd HH:mm:ss'),
                period: params.REPORT_PERIOD,
                cluster: clusterSection,
                resources: resourceSection,
                pods: podSection,
                costs: costSection,
                issues: env.ISSUES
            ]

            writeJSON file: "${OUTPUT_DIR}/report.json", json: document, pretty: 4
            echo "✅ JSON report generated"
        }
    }
}
}
post {
    success {
        script {
            echo "✅ Dashboard generation completed successfully!"

            // archiveArtifacts resolves its pattern relative to the workspace, so the
            // absolute OUTPUT_DIR pattern used before matched nothing. Copy the reports
            // into a workspace directory and archive/publish from there.
            def archiveDir = 'dashboard-reports'
            sh """
                rm -rf '${archiveDir}'
                mkdir -p '${archiveDir}'
                cp -R ${OUTPUT_DIR}/. '${archiveDir}/'
            """

            // Archive artifacts
            archiveArtifacts artifacts: "${archiveDir}/*", fingerprint: true

            // Publish HTML report
            publishHTML([
                allowMissing: false,
                alwaysLinkToLastBuild: true,
                keepAll: true,
                reportDir: archiveDir,
                reportFiles: 'dashboard.html',
                reportName: 'Cluster Health Dashboard',
                reportTitles: 'Cluster Health Dashboard'
            ])

            // Send Telegram summary. Lines are joined with '\n'; the empty first and
            // last entries reproduce the leading and trailing newline of the message.
            if (params.SEND_TELEGRAM) {
                def divider = '━━━━━━━━━━━━━━━━━━━━━━'
                def message = [
                    '',
                    '📊 <b>Cluster Health Report</b>',
                    divider,
                    '<b>📋 Cluster Info</b>',
                    "<b>Version:</b> ${env.CLUSTER_VERSION}",
                    "<b>Nodes:</b> ${env.NODE_COUNT}",
                    "<b>Namespaces:</b> ${env.NAMESPACE_COUNT}",
                    "<b>Total Pods:</b> ${env.POD_COUNT}",
                    divider,
                    '<b>💻 Resources</b>',
                    "<b>CPU Cores:</b> ${env.TOTAL_CPU_CORES}",
                    "<b>Memory:</b> ${env.TOTAL_MEMORY_GB} GB",
                    "<b>Avg CPU Usage:</b> ${env.AVG_CPU_USAGE}%",
                    divider,
                    '<b>📦 Pod Status</b>',
                    "<b>Running:</b> ${env.PODS_RUNNING} ✅",
                    "<b>Pending:</b> ${env.PODS_PENDING} ⏳",
                    "<b>Failed:</b> ${env.PODS_FAILED} ❌",
                    divider,
                    '<b>💰 Estimated Monthly Costs</b>',
                    "<b>CPU:</b> \$${env.MONTHLY_CPU_COST}",
                    "<b>Memory:</b> \$${env.MONTHLY_MEMORY_COST}",
                    "<b>Total:</b> \$${env.MONTHLY_TOTAL_COST}",
                    divider,
                    '<b>🔍 Issues</b>',
                    "${env.ISSUES}",
                    "<a href=\"${env.BUILD_URL}Cluster_20Health_20Dashboard/\">View Full Dashboard</a>",
                    ''
                ].join('\n')
                sendTelegramNotification(message)
            }

            echo "\n📊 Dashboard URL: ${env.BUILD_URL}Cluster_20Health_20Dashboard/"
        }
    }
    failure {
        script {
            echo "❌ Dashboard generation failed!"
            // Point the chat at the console log of the failed build.
            if (params.SEND_TELEGRAM) {
                sendTelegramNotification(
                    "❌ <b>Cluster Health Report Failed</b>\n\n" +
                    "<a href='${env.BUILD_URL}console'>View Console Output</a>"
                )
            }
        }
    }
    always {
        script {
            // Only logs; no files are removed here.
            echo "🧹 Cleanup completed"
        }
    }
}
}
/**
 * Runs an instant query against the Prometheus HTTP API and returns the value
 * of the first result sample.
 *
 * @param query PromQL expression; it is URL-encoded before being placed in the URL.
 * @return the sample value as a numeric string, or "0" when the query returns no
 *         data, the output is not a number (for example "NaN"), or the shell
 *         step fails.
 */
def queryPrometheus(query) {
    try {
        // --max-time keeps an unresponsive Prometheus from stalling the build.
        def result = sh(
            script: """
                curl -s --max-time 60 '${PROMETHEUS_URL}/api/v1/query?query=${URLEncoder.encode(query, "UTF-8")}' | \
                jq -r '.data.result[0].value[1] // "0"'
            """,
            returnStdout: true
        ).trim()

        // Callers parse the value with toFloat()/toInteger(); hand back "0" rather
        // than an empty or non-numeric string that would make them throw.
        if (!result.isNumber()) {
            echo "⚠️ Prometheus returned a non-numeric value '${result}', using 0"
            return "0"
        }
        return result
    } catch (Exception e) {
        echo "⚠️ Failed to query Prometheus: ${e.message}"
        return "0"
    }
}
/**
 * Sends an HTML-formatted message to the configured Telegram chat.
 *
 * Best effort: any failure is logged and swallowed so that a notification
 * problem does not fail the build.
 *
 * @param message message text (Telegram HTML markup); may contain quotes,
 *                dollar signs, percent signs and newlines.
 */
def sendTelegramNotification(message) {
    try {
        // Pass the text through the environment instead of splicing it into the
        // script: inside a double-quoted shell string a '"' in the message ends the
        // quoting and "$12.34" is expanded by the shell. The single-quoted Groovy
        // string also keeps the bot token out of Groovy interpolation; the shell
        // reads it from the environment.
        withEnv(["TELEGRAM_MESSAGE=${message}"]) {
            // --data-urlencode encodes '%', '&', '+' and newlines, which plain -d
            // sends verbatim; --fail turns HTTP errors into a non-zero exit code.
            sh '''
                curl -sS --fail -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" \
                    --data-urlencode "chat_id=${TELEGRAM_CHAT_ID}" \
                    -d parse_mode=HTML \
                    -d disable_web_page_preview=true \
                    --data-urlencode "text=${TELEGRAM_MESSAGE}"
            '''
        }
    } catch (Exception e) {
        echo "⚠️ Failed to send Telegram notification: ${e.message}"
    }
}
/**
 * Renders one HTML table row per namespace, ordered by pod count descending.
 *
 * Marked @NonCPS because it sorts with a closure: closure-based sort() is not
 * reliable in CPS-transformed Pipeline code. The method calls no Pipeline
 * steps, so it can run as plain Groovy.
 *
 * @param namespaceStats map of namespace name to a map with `pods` and
 *                       `containers` counts.
 * @return the concatenated <tr> rows, or an empty string for an empty map.
 */
@NonCPS
def generateNamespaceTable(namespaceStats) {
    def rows = new StringBuilder()
    // Sort by pod count descending
    def sortedNamespaces = namespaceStats.sort { -it.value.pods }
    sortedNamespaces.each { namespace, stats ->
        rows.append('\n<tr>\n')
        rows.append("<td>${namespace}</td>\n")
        rows.append("<td>${stats.pods}</td>\n")
        rows.append("<td>${stats.containers}</td>\n")
        rows.append('</tr>\n')
    }
    return rows.toString()
}
// Helper function to generate complete HTML dashboard.
//
// Builds the whole page as one interpolated string: inline CSS, a header with
// the generation time and params.REPORT_PERIOD, four metric cards (cluster
// information, resource capacity, pod status, estimated monthly costs), the
// issues section, the per-namespace table and a footer with env.BUILD_NUMBER.
//
// Parameters:
//   args - map; only args.namespaceTable is read, and it is inserted as-is
//          inside the <tbody> of the namespace table.
//
// Reads from env: CLUSTER_VERSION, NODE_COUNT, NAMESPACE_COUNT, POD_COUNT,
// TOTAL_CPU_CORES, TOTAL_MEMORY_GB, AVG_CPU_USAGE, PODS_RUNNING, PODS_PENDING,
// PODS_FAILED, TOTAL_RESTARTS, MONTHLY_CPU_COST, MONTHLY_MEMORY_COST,
// MONTHLY_TOTAL_COST, CPU_PRICE_PER_HOUR, MEMORY_PRICE_PER_GB_HOUR, ISSUES,
// BUILD_NUMBER.
//
// Conditional styling:
//   - AVG_CPU_USAGE above 80 gets the 'error' class, otherwise 'success'; the
//     same value is also used as the progress bar width.
//   - TOTAL_RESTARTS above 10 gets the 'warning' class, otherwise 'success'.
//   - If ISSUES contains '✅' it is shown as a single "no-issues" box; otherwise
//     each line of ISSUES becomes its own "issue-item" box.
//
// All values are inserted without HTML escaping.
//
// Returns: the HTML document text.
def generateDashboardHTML(args) {
def namespaceTable = args.namespaceTable
return """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Cluster Health Dashboard</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 20px;
color: #333;
}
.container {
max-width: 1400px;
margin: 0 auto;
}
.header {
background: white;
padding: 30px;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
margin-bottom: 20px;
}
.header h1 {
color: #667eea;
font-size: 36px;
margin-bottom: 10px;
}
.header .timestamp {
color: #666;
font-size: 14px;
}
.grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 20px;
margin-bottom: 20px;
}
.card {
background: white;
padding: 25px;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
}
.card h2 {
color: #667eea;
font-size: 18px;
margin-bottom: 15px;
border-bottom: 2px solid #f0f0f0;
padding-bottom: 10px;
}
.metric {
display: flex;
justify-content: space-between;
align-items: center;
padding: 10px 0;
border-bottom: 1px solid #f0f0f0;
}
.metric:last-child {
border-bottom: none;
}
.metric-label {
color: #666;
font-size: 14px;
}
.metric-value {
font-size: 24px;
font-weight: bold;
color: #333;
}
.metric-value.success {
color: #10b981;
}
.metric-value.warning {
color: #f59e0b;
}
.metric-value.error {
color: #ef4444;
}
.chart-container {
background: white;
padding: 25px;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
margin-bottom: 20px;
}
table {
width: 100%;
border-collapse: collapse;
}
th, td {
padding: 12px;
text-align: left;
border-bottom: 1px solid #f0f0f0;
}
th {
background: #f8f9fa;
color: #667eea;
font-weight: 600;
}
tr:hover {
background: #f8f9fa;
}
.issues {
background: white;
padding: 25px;
border-radius: 10px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
margin-bottom: 20px;
}
.issues h2 {
color: #ef4444;
margin-bottom: 15px;
}
.issue-item {
padding: 10px;
margin-bottom: 10px;
background: #fef2f2;
border-left: 4px solid #ef4444;
border-radius: 4px;
}
.no-issues {
padding: 10px;
background: #f0fdf4;
border-left: 4px solid #10b981;
border-radius: 4px;
color: #166534;
}
.progress-bar {
width: 100%;
height: 20px;
background: #f0f0f0;
border-radius: 10px;
overflow: hidden;
margin: 10px 0;
}
.progress-fill {
height: 100%;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
transition: width 0.3s ease;
}
</style>
</head>
<body>
<div class="container">
<!-- Header -->
<div class="header">
<h1>☸️ Kubernetes Cluster Health Dashboard</h1>
<div class="timestamp">Generated: ${new Date().format('yyyy-MM-dd HH:mm:ss')} | Period: ${params.REPORT_PERIOD}</div>
</div>
<!-- Cluster Overview -->
<div class="grid">
<div class="card">
<h2>📋 Cluster Information</h2>
<div class="metric">
<span class="metric-label">Kubernetes Version</span>
<span class="metric-value">${env.CLUSTER_VERSION}</span>
</div>
<div class="metric">
<span class="metric-label">Nodes</span>
<span class="metric-value success">${env.NODE_COUNT}</span>
</div>
<div class="metric">
<span class="metric-label">Namespaces</span>
<span class="metric-value">${env.NAMESPACE_COUNT}</span>
</div>
<div class="metric">
<span class="metric-label">Total Pods</span>
<span class="metric-value">${env.POD_COUNT}</span>
</div>
</div>
<div class="card">
<h2>💻 Resource Capacity</h2>
<div class="metric">
<span class="metric-label">Total CPU Cores</span>
<span class="metric-value">${env.TOTAL_CPU_CORES}</span>
</div>
<div class="metric">
<span class="metric-label">Total Memory</span>
<span class="metric-value">${env.TOTAL_MEMORY_GB} GB</span>
</div>
<div class="metric">
<span class="metric-label">Avg CPU Usage</span>
<span class="metric-value ${env.AVG_CPU_USAGE.toFloat() > 80 ? 'error' : 'success'}">${env.AVG_CPU_USAGE}%</span>
</div>
<div class="progress-bar">
<div class="progress-fill" style="width: ${env.AVG_CPU_USAGE}%"></div>
</div>
</div>
<div class="card">
<h2>📦 Pod Status</h2>
<div class="metric">
<span class="metric-label">Running</span>
<span class="metric-value success">${env.PODS_RUNNING}</span>
</div>
<div class="metric">
<span class="metric-label">Pending</span>
<span class="metric-value warning">${env.PODS_PENDING}</span>
</div>
<div class="metric">
<span class="metric-label">Failed</span>
<span class="metric-value error">${env.PODS_FAILED}</span>
</div>
<div class="metric">
<span class="metric-label">Total Restarts</span>
<span class="metric-value ${env.TOTAL_RESTARTS.toInteger() > 10 ? 'warning' : 'success'}">${env.TOTAL_RESTARTS}</span>
</div>
</div>
<div class="card">
<h2>💰 Estimated Monthly Costs</h2>
<div class="metric">
<span class="metric-label">CPU Cost</span>
<span class="metric-value">\$${env.MONTHLY_CPU_COST}</span>
</div>
<div class="metric">
<span class="metric-label">Memory Cost</span>
<span class="metric-value">\$${env.MONTHLY_MEMORY_COST}</span>
</div>
<div class="metric">
<span class="metric-label"><strong>Total Cost</strong></span>
<span class="metric-value">\$${env.MONTHLY_TOTAL_COST}</span>
</div>
<div style="margin-top: 15px; padding: 10px; background: #f0f9ff; border-radius: 4px; font-size: 12px; color: #0369a1;">
💡 Based on: CPU \$${env.CPU_PRICE_PER_HOUR}/core/hour, Memory \$${env.MEMORY_PRICE_PER_GB_HOUR}/GB/hour
</div>
</div>
</div>
<!-- Issues Section -->
<div class="issues">
<h2>🔍 Health Checks & Issues</h2>
${env.ISSUES.contains('✅') ?
'<div class="no-issues">' + env.ISSUES + '</div>' :
env.ISSUES.split('\n').collect { "<div class='issue-item'>${it}</div>" }.join('')
}
</div>
<!-- Namespace Details -->
<div class="chart-container">
<h2>📊 Resources by Namespace</h2>
<table>
<thead>
<tr>
<th>Namespace</th>
<th>Pods</th>
<th>Containers</th>
</tr>
</thead>
<tbody>
${namespaceTable}
</tbody>
</table>
</div>
<!-- Footer -->
<div style="text-align: center; color: white; padding: 20px; font-size: 14px;">
Generated by Jenkins CI/CD Pipeline • Build #${env.BUILD_NUMBER}
</div>
</div>
</body>
</html>
"""
}