diff --git a/tasks.md b/tasks.md index 6be9e40..dca349e 100644 --- a/tasks.md +++ b/tasks.md @@ -381,8 +381,7 @@ modell som brukes til hva. - [x] 28.4 `synops-notify`: send varsel via epost (synops-mail), WebSocket-push, eller begge. Input: `--to --message [--channel email|ws|both]`. Brukes av orkestreringer og vaktmesteren. - [x] 28.5 `synops-validate`: sjekk at en node matcher forventet skjema for sin node_kind. Input: `--node-id `. Output: liste av avvik. Brukes av valideringsfasen og som pre-commit sjekk. - [x] 28.6 `synops-backup`: PG-dump + CAS-filiste + metadata-snapshot. Input: `[--full | --incremental]`. Output: backup-sti. Erstatter cron-scriptet fra 12.2. -- [~] 28.7 `synops-health`: sjekk status for alle tjenester (PG, Caddy, vaktmesteren, LiteLLM, Whisper, LiveKit). Output: JSON med status per tjeneste. Brukes av admin-dashboard og overvåking. - > Påbegynt: 2026-03-18T20:50 +- [x] 28.7 `synops-health`: sjekk status for alle tjenester (PG, Caddy, vaktmesteren, LiteLLM, Whisper, LiveKit). Output: JSON med status per tjeneste. Brukes av admin-dashboard og overvåking. ## Fase 29: Universell input — alle modaliteter blir noder diff --git a/tools/README.md b/tools/README.md index 23eca8a..74907e6 100644 --- a/tools/README.md +++ b/tools/README.md @@ -27,6 +27,7 @@ eller maskinrommet-API. Ligger i PATH via symlink eller direkte kall. | `synops-notify` | Send varsel via epost, WebSocket-push, eller begge | Ferdig | | `synops-validate` | Valider at en node matcher forventet skjema for sin node_kind | Ferdig | | `synops-backup` | PG-dump + CAS-filiste + metadata-snapshot (`--full` / `--incremental`) | Ferdig | +| `synops-health` | Sjekk status for alle tjenester (PG, Caddy, Maskinrommet, LiteLLM, Whisper, LiveKit, Authentik) | Ferdig | ## Delt bibliotek diff --git a/tools/synops-health/Cargo.toml b/tools/synops-health/Cargo.toml new file mode 100644 index 0000000..9a09d7a --- /dev/null +++ b/tools/synops-health/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "synops-health" +version = "0.1.0" +edition = "2024" + +[[bin]] +name = "synops-health" +path = "src/main.rs" + +[dependencies] +clap = { version = "4", features = ["derive"] } +tokio = { version = "1", features = ["full"] } +sqlx = { version = "0.8", features = ["runtime-tokio", "tls-rustls", "postgres", "uuid", "chrono", "json"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +chrono = { version = "0.4", features = ["serde"] } +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +reqwest = { version = "0.12", features = ["json"] } +synops-common = { path = "../synops-common" } diff --git a/tools/synops-health/src/main.rs b/tools/synops-health/src/main.rs new file mode 100644 index 0000000..f05ae81 --- /dev/null +++ b/tools/synops-health/src/main.rs @@ -0,0 +1,195 @@ +// synops-health — Sjekk status for alle tjenester i Synops-stacken. +// +// Sjekker: PostgreSQL, Caddy, Maskinrommet, LiteLLM, Whisper, LiveKit, Authentik. +// Output: JSON med status per tjeneste + samlet status. +// Exit-kode: 0 = alle oppe, 1 = noe nede/degradert, 2 = kjørefeil. +// +// Brukes av admin-dashboard og overvåking. +// +// Miljøvariabler (alle valgfrie — har fornuftige defaults): +// DATABASE_URL — PostgreSQL-tilkobling +// AI_GATEWAY_URL — LiteLLM base-URL (default: http://localhost:4000) +// WHISPER_URL — faster-whisper base-URL (default: http://localhost:8000) +// LIVEKIT_URL — LiveKit base-URL (default: http://localhost:7880) +// +// Ref: docs/infra/observerbarhet.md, maskinrommet/src/health.rs + +use clap::Parser; +use serde::Serialize; +use std::process; +use std::time::Instant; + +/// Sjekk status for alle Synops-tjenester. +#[derive(Parser)] +#[command(name = "synops-health", about = "Sjekk tjenestestatus for Synops-stacken")] +struct Cli { + /// Timeout per tjeneste i sekunder + #[arg(long, default_value = "5")] + timeout: u64, + + /// Kjør kun stille — bare exit-kode, ingen output + #[arg(long)] + quiet: bool, +} + +#[derive(Serialize)] +struct HealthReport { + timestamp: String, + overall_status: String, // "healthy", "degraded", "critical" + services: Vec, +} + +#[derive(Serialize, Clone)] +struct ServiceStatus { + name: String, + status: String, // "up", "down", "degraded" + latency_ms: Option, + #[serde(skip_serializing_if = "Option::is_none")] + details: Option, +} + +#[tokio::main] +async fn main() { + synops_common::logging::init("synops_health"); + + let cli = Cli::parse(); + let timeout = std::time::Duration::from_secs(cli.timeout); + + let http = reqwest::Client::builder() + .timeout(timeout) + .danger_accept_invalid_certs(true) // Lokale tjenester kan ha self-signed + .build() + .expect("Kunne ikke opprette HTTP-klient"); + + // Kjør alle sjekker parallelt + let (pg, caddy, maskinrommet, litellm, whisper, livekit, authentik) = tokio::join!( + check_pg(timeout), + check_http(&http, "Caddy", "http://localhost:2019/config/"), + check_http(&http, "Maskinrommet", "http://localhost:3100/health"), + check_litellm(&http), + check_whisper(&http), + check_livekit(&http), + check_http(&http, "Authentik", "https://auth.sidelinja.org/-/health/ready/"), + ); + + let services = vec![pg, caddy, maskinrommet, litellm, whisper, livekit, authentik]; + + let overall_status = if services.iter().all(|s| s.status == "up") { + "healthy" + } else if services.iter().any(|s| s.status == "down") { + "degraded" + } else { + "degraded" + } + .to_string(); + + let report = HealthReport { + timestamp: chrono::Utc::now().to_rfc3339(), + overall_status: overall_status.clone(), + services, + }; + + if !cli.quiet { + println!( + "{}", + serde_json::to_string_pretty(&report).expect("JSON-serialisering feilet") + ); + } + + let exit_code = if overall_status == "healthy" { 0 } else { 1 }; + process::exit(exit_code); +} + +// ============================================================================= +// Tjeneste-sjekker +// ============================================================================= + +/// Sjekk PostgreSQL med direkte SQL-spørring. +async fn check_pg(timeout: std::time::Duration) -> ServiceStatus { + let name = "PostgreSQL".to_string(); + + // Prøv å koble til med timeout + let connect_result = tokio::time::timeout(timeout, synops_common::db::connect()).await; + + match connect_result { + Ok(Ok(db)) => { + let start = Instant::now(); + match sqlx::query_scalar::<_, i32>("SELECT 1").fetch_one(&db).await { + Ok(_) => ServiceStatus { + name, + status: "up".to_string(), + latency_ms: Some(start.elapsed().as_millis() as u64), + details: None, + }, + Err(e) => ServiceStatus { + name, + status: "down".to_string(), + latency_ms: None, + details: Some(format!("{e}")), + }, + } + } + Ok(Err(e)) => ServiceStatus { + name, + status: "down".to_string(), + latency_ms: None, + details: Some(format!("Tilkobling feilet: {e}")), + }, + Err(_) => ServiceStatus { + name, + status: "down".to_string(), + latency_ms: None, + details: Some("Timeout".to_string()), + }, + } +} + +/// Sjekk en HTTP-tjeneste. Aksepterer 401/403 som "up" (auth kreves men tjenesten kjører). +async fn check_http(client: &reqwest::Client, name: &str, url: &str) -> ServiceStatus { + let start = Instant::now(); + match client.get(url).send().await { + Ok(resp) => { + let latency = start.elapsed().as_millis() as u64; + let code = resp.status(); + if code.is_success() || code.as_u16() == 401 || code.as_u16() == 403 { + ServiceStatus { + name: name.to_string(), + status: "up".to_string(), + latency_ms: Some(latency), + details: None, + } + } else { + ServiceStatus { + name: name.to_string(), + status: "degraded".to_string(), + latency_ms: Some(latency), + details: Some(format!("HTTP {code}")), + } + } + } + Err(e) => ServiceStatus { + name: name.to_string(), + status: "down".to_string(), + latency_ms: None, + details: Some(format!("{e}")), + }, + } +} + +async fn check_litellm(client: &reqwest::Client) -> ServiceStatus { + let url = std::env::var("AI_GATEWAY_URL") + .unwrap_or_else(|_| "http://localhost:4000".to_string()); + check_http(client, "LiteLLM", &format!("{url}/health")).await +} + +async fn check_whisper(client: &reqwest::Client) -> ServiceStatus { + let url = std::env::var("WHISPER_URL") + .unwrap_or_else(|_| "http://localhost:8000".to_string()); + check_http(client, "Whisper", &format!("{url}/health")).await +} + +async fn check_livekit(client: &reqwest::Client) -> ServiceStatus { + let url = std::env::var("LIVEKIT_URL") + .unwrap_or_else(|_| "http://localhost:7880".to_string()); + check_http(client, "LiveKit", &url).await +}