CLI-verktøy som sjekker PG, Caddy, Maskinrommet, LiteLLM, Whisper, LiveKit og Authentik parallelt. Output: JSON med status per tjeneste + samlet helsetilstand (healthy/degraded). Exit-kode 0 = alt oppe. Gjenbruker samme mønster som maskinrommet/src/health.rs men som frittstående verktøy — kan brukes av admin-dashboard, overvåking, og direkte fra CLI.
195 lines
6.5 KiB
Rust
195 lines
6.5 KiB
Rust
// synops-health — Check the status of every service in the Synops stack.
//
// Checks: PostgreSQL, Caddy, Maskinrommet, LiteLLM, Whisper, LiveKit, Authentik.
// Output: JSON with the status of each service plus an overall status.
// Exit code: 0 = everything up, 1 = something down/degraded, 2 = runtime error.
//
// Used by the admin dashboard and by monitoring.
//
// Environment variables (all optional — sensible defaults are provided):
//   DATABASE_URL   — PostgreSQL connection
//   AI_GATEWAY_URL — LiteLLM base URL (default: http://localhost:4000)
//   WHISPER_URL    — faster-whisper base URL (default: http://localhost:8000)
//   LIVEKIT_URL    — LiveKit base URL (default: http://localhost:7880)
//
// Ref: docs/infra/observerbarhet.md, maskinrommet/src/health.rs
|
|
|
|
use clap::Parser;
|
|
use serde::Serialize;
|
|
use std::process;
|
|
use std::time::Instant;
|
|
|
|
/// Sjekk status for alle Synops-tjenester.
|
|
#[derive(Parser)]
|
|
#[command(name = "synops-health", about = "Sjekk tjenestestatus for Synops-stacken")]
|
|
struct Cli {
|
|
/// Timeout per tjeneste i sekunder
|
|
#[arg(long, default_value = "5")]
|
|
timeout: u64,
|
|
|
|
/// Kjør kun stille — bare exit-kode, ingen output
|
|
#[arg(long)]
|
|
quiet: bool,
|
|
}
|
|
|
|
#[derive(Serialize)]
|
|
struct HealthReport {
|
|
timestamp: String,
|
|
overall_status: String, // "healthy", "degraded", "critical"
|
|
services: Vec<ServiceStatus>,
|
|
}
|
|
|
|
#[derive(Serialize, Clone)]
|
|
struct ServiceStatus {
|
|
name: String,
|
|
status: String, // "up", "down", "degraded"
|
|
latency_ms: Option<u64>,
|
|
#[serde(skip_serializing_if = "Option::is_none")]
|
|
details: Option<String>,
|
|
}
|
|
|
|
#[tokio::main]
|
|
async fn main() {
|
|
synops_common::logging::init("synops_health");
|
|
|
|
let cli = Cli::parse();
|
|
let timeout = std::time::Duration::from_secs(cli.timeout);
|
|
|
|
let http = reqwest::Client::builder()
|
|
.timeout(timeout)
|
|
.danger_accept_invalid_certs(true) // Lokale tjenester kan ha self-signed
|
|
.build()
|
|
.expect("Kunne ikke opprette HTTP-klient");
|
|
|
|
// Kjør alle sjekker parallelt
|
|
let (pg, caddy, maskinrommet, litellm, whisper, livekit, authentik) = tokio::join!(
|
|
check_pg(timeout),
|
|
check_http(&http, "Caddy", "http://localhost:2019/config/"),
|
|
check_http(&http, "Maskinrommet", "http://localhost:3100/health"),
|
|
check_litellm(&http),
|
|
check_whisper(&http),
|
|
check_livekit(&http),
|
|
check_http(&http, "Authentik", "https://auth.sidelinja.org/-/health/ready/"),
|
|
);
|
|
|
|
let services = vec![pg, caddy, maskinrommet, litellm, whisper, livekit, authentik];
|
|
|
|
let overall_status = if services.iter().all(|s| s.status == "up") {
|
|
"healthy"
|
|
} else if services.iter().any(|s| s.status == "down") {
|
|
"degraded"
|
|
} else {
|
|
"degraded"
|
|
}
|
|
.to_string();
|
|
|
|
let report = HealthReport {
|
|
timestamp: chrono::Utc::now().to_rfc3339(),
|
|
overall_status: overall_status.clone(),
|
|
services,
|
|
};
|
|
|
|
if !cli.quiet {
|
|
println!(
|
|
"{}",
|
|
serde_json::to_string_pretty(&report).expect("JSON-serialisering feilet")
|
|
);
|
|
}
|
|
|
|
let exit_code = if overall_status == "healthy" { 0 } else { 1 };
|
|
process::exit(exit_code);
|
|
}
|
|
|
|
// =============================================================================
// Service checks
// =============================================================================
|
|
|
|
/// Sjekk PostgreSQL med direkte SQL-spørring.
|
|
async fn check_pg(timeout: std::time::Duration) -> ServiceStatus {
|
|
let name = "PostgreSQL".to_string();
|
|
|
|
// Prøv å koble til med timeout
|
|
let connect_result = tokio::time::timeout(timeout, synops_common::db::connect()).await;
|
|
|
|
match connect_result {
|
|
Ok(Ok(db)) => {
|
|
let start = Instant::now();
|
|
match sqlx::query_scalar::<_, i32>("SELECT 1").fetch_one(&db).await {
|
|
Ok(_) => ServiceStatus {
|
|
name,
|
|
status: "up".to_string(),
|
|
latency_ms: Some(start.elapsed().as_millis() as u64),
|
|
details: None,
|
|
},
|
|
Err(e) => ServiceStatus {
|
|
name,
|
|
status: "down".to_string(),
|
|
latency_ms: None,
|
|
details: Some(format!("{e}")),
|
|
},
|
|
}
|
|
}
|
|
Ok(Err(e)) => ServiceStatus {
|
|
name,
|
|
status: "down".to_string(),
|
|
latency_ms: None,
|
|
details: Some(format!("Tilkobling feilet: {e}")),
|
|
},
|
|
Err(_) => ServiceStatus {
|
|
name,
|
|
status: "down".to_string(),
|
|
latency_ms: None,
|
|
details: Some("Timeout".to_string()),
|
|
},
|
|
}
|
|
}
|
|
|
|
/// Sjekk en HTTP-tjeneste. Aksepterer 401/403 som "up" (auth kreves men tjenesten kjører).
|
|
async fn check_http(client: &reqwest::Client, name: &str, url: &str) -> ServiceStatus {
|
|
let start = Instant::now();
|
|
match client.get(url).send().await {
|
|
Ok(resp) => {
|
|
let latency = start.elapsed().as_millis() as u64;
|
|
let code = resp.status();
|
|
if code.is_success() || code.as_u16() == 401 || code.as_u16() == 403 {
|
|
ServiceStatus {
|
|
name: name.to_string(),
|
|
status: "up".to_string(),
|
|
latency_ms: Some(latency),
|
|
details: None,
|
|
}
|
|
} else {
|
|
ServiceStatus {
|
|
name: name.to_string(),
|
|
status: "degraded".to_string(),
|
|
latency_ms: Some(latency),
|
|
details: Some(format!("HTTP {code}")),
|
|
}
|
|
}
|
|
}
|
|
Err(e) => ServiceStatus {
|
|
name: name.to_string(),
|
|
status: "down".to_string(),
|
|
latency_ms: None,
|
|
details: Some(format!("{e}")),
|
|
},
|
|
}
|
|
}
|
|
|
|
async fn check_litellm(client: &reqwest::Client) -> ServiceStatus {
|
|
let url = std::env::var("AI_GATEWAY_URL")
|
|
.unwrap_or_else(|_| "http://localhost:4000".to_string());
|
|
check_http(client, "LiteLLM", &format!("{url}/health")).await
|
|
}
|
|
|
|
async fn check_whisper(client: &reqwest::Client) -> ServiceStatus {
|
|
let url = std::env::var("WHISPER_URL")
|
|
.unwrap_or_else(|_| "http://localhost:8000".to_string());
|
|
check_http(client, "Whisper", &format!("{url}/health")).await
|
|
}
|
|
|
|
async fn check_livekit(client: &reqwest::Client) -> ServiceStatus {
|
|
let url = std::env::var("LIVEKIT_URL")
|
|
.unwrap_or_else(|_| "http://localhost:7880".to_string());
|
|
check_http(client, "LiveKit", &url).await
|
|
}
|