// synops-health — Sjekk status for alle tjenester i Synops-stacken. // // Sjekker: PostgreSQL, Caddy, Maskinrommet, LiteLLM, Whisper, LiveKit, Authentik. // Output: JSON med status per tjeneste + samlet status. // Exit-kode: 0 = alle oppe, 1 = noe nede/degradert, 2 = kjørefeil. // // Brukes av admin-dashboard og overvåking. // // Miljøvariabler (alle valgfrie — har fornuftige defaults): // DATABASE_URL — PostgreSQL-tilkobling // AI_GATEWAY_URL — LiteLLM base-URL (default: http://localhost:4000) // WHISPER_URL — faster-whisper base-URL (default: http://localhost:8000) // LIVEKIT_URL — LiveKit base-URL (default: http://localhost:7880) // // Ref: docs/infra/observerbarhet.md, maskinrommet/src/health.rs use clap::Parser; use serde::Serialize; use std::process; use std::time::Instant; /// Sjekk status for alle Synops-tjenester. #[derive(Parser)] #[command(name = "synops-health", about = "Sjekk tjenestestatus for Synops-stacken")] struct Cli { /// Timeout per tjeneste i sekunder #[arg(long, default_value = "5")] timeout: u64, /// Kjør kun stille — bare exit-kode, ingen output #[arg(long)] quiet: bool, } #[derive(Serialize)] struct HealthReport { timestamp: String, overall_status: String, // "healthy", "degraded", "critical" services: Vec, } #[derive(Serialize, Clone)] struct ServiceStatus { name: String, status: String, // "up", "down", "degraded" latency_ms: Option, #[serde(skip_serializing_if = "Option::is_none")] details: Option, } #[tokio::main] async fn main() { synops_common::logging::init("synops_health"); let cli = Cli::parse(); let timeout = std::time::Duration::from_secs(cli.timeout); let http = reqwest::Client::builder() .timeout(timeout) .danger_accept_invalid_certs(true) // Lokale tjenester kan ha self-signed .build() .expect("Kunne ikke opprette HTTP-klient"); // Kjør alle sjekker parallelt let (pg, caddy, maskinrommet, litellm, whisper, livekit, authentik) = tokio::join!( check_pg(timeout), check_http(&http, "Caddy", "http://localhost:2019/config/"), 
check_http(&http, "Maskinrommet", "http://localhost:3100/health"), check_litellm(&http), check_whisper(&http), check_livekit(&http), check_http(&http, "Authentik", "https://auth.sidelinja.org/-/health/ready/"), ); let services = vec![pg, caddy, maskinrommet, litellm, whisper, livekit, authentik]; let overall_status = if services.iter().all(|s| s.status == "up") { "healthy" } else if services.iter().any(|s| s.status == "down") { "degraded" } else { "degraded" } .to_string(); let report = HealthReport { timestamp: chrono::Utc::now().to_rfc3339(), overall_status: overall_status.clone(), services, }; if !cli.quiet { println!( "{}", serde_json::to_string_pretty(&report).expect("JSON-serialisering feilet") ); } let exit_code = if overall_status == "healthy" { 0 } else { 1 }; process::exit(exit_code); } // ============================================================================= // Tjeneste-sjekker // ============================================================================= /// Sjekk PostgreSQL med direkte SQL-spørring. async fn check_pg(timeout: std::time::Duration) -> ServiceStatus { let name = "PostgreSQL".to_string(); // Prøv å koble til med timeout let connect_result = tokio::time::timeout(timeout, synops_common::db::connect()).await; match connect_result { Ok(Ok(db)) => { let start = Instant::now(); match sqlx::query_scalar::<_, i32>("SELECT 1").fetch_one(&db).await { Ok(_) => ServiceStatus { name, status: "up".to_string(), latency_ms: Some(start.elapsed().as_millis() as u64), details: None, }, Err(e) => ServiceStatus { name, status: "down".to_string(), latency_ms: None, details: Some(format!("{e}")), }, } } Ok(Err(e)) => ServiceStatus { name, status: "down".to_string(), latency_ms: None, details: Some(format!("Tilkobling feilet: {e}")), }, Err(_) => ServiceStatus { name, status: "down".to_string(), latency_ms: None, details: Some("Timeout".to_string()), }, } } /// Sjekk en HTTP-tjeneste. Aksepterer 401/403 som "up" (auth kreves men tjenesten kjører). 
///
/// Status mapping:
/// - 2xx, 401 or 403 → "up"
/// - any other HTTP response → "degraded", with `HTTP <status>` in `details`
/// - the request itself failed → "down", with the error text in `details`
///
/// `latency_ms` is the elapsed time of the `send()` call and is only set when
/// a response was received.
async fn check_http(client: &reqwest::Client, name: &str, url: &str) -> ServiceStatus {
    let start = Instant::now();

    let resp = match client.get(url).send().await {
        Ok(resp) => resp,
        Err(e) => {
            return ServiceStatus {
                name: name.to_string(),
                status: "down".to_string(),
                latency_ms: None,
                details: Some(e.to_string()),
            };
        }
    };

    // as_millis() is u128; saturate instead of silently truncating.
    let latency = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
    let code = resp.status();

    // 401/403 mean the service answered and merely wants credentials.
    let reachable = code.is_success() || matches!(code.as_u16(), 401 | 403);
    let (status, details) = if reachable {
        ("up", None)
    } else {
        ("degraded", Some(format!("HTTP {code}")))
    };

    ServiceStatus {
        name: name.to_string(),
        status: status.to_string(),
        latency_ms: Some(latency),
        details,
    }
}

/// Resolve a service base URL from the environment variable `var`.
///
/// Falls back to `default` when the variable is unset, not valid Unicode, or
/// blank. Surrounding whitespace and trailing slashes are removed so callers
/// can append a path such as `/health` without producing `//health`.
fn base_url_from_env(var: &str, default: &str) -> String {
    let raw = std::env::var(var).unwrap_or_default();
    let trimmed = raw.trim().trim_end_matches('/');
    if trimmed.is_empty() {
        default.to_string()
    } else {
        trimmed.to_string()
    }
}

/// Check LiteLLM via `<AI_GATEWAY_URL>/health` (default base: http://localhost:4000).
async fn check_litellm(client: &reqwest::Client) -> ServiceStatus {
    let base = base_url_from_env("AI_GATEWAY_URL", "http://localhost:4000");
    check_http(client, "LiteLLM", &format!("{base}/health")).await
}

/// Check Whisper via `<WHISPER_URL>/health` (default base: http://localhost:8000).
async fn check_whisper(client: &reqwest::Client) -> ServiceStatus {
    let base = base_url_from_env("WHISPER_URL", "http://localhost:8000");
    check_http(client, "Whisper", &format!("{base}/health")).await
}

/// Check LiveKit by requesting the base URL itself (default: http://localhost:7880).
///
/// NOTE(review): the value is passed to an HTTP GET as-is, so a `ws://` or
/// `wss://` style LIVEKIT_URL would presumably be rejected by the HTTP client
/// and reported as "down" — confirm which scheme deployments use.
async fn check_livekit(client: &reqwest::Client) -> ServiceStatus {
    let base = base_url_from_env("LIVEKIT_URL", "http://localhost:7880");
    check_http(client, "LiveKit", &base).await
}