synops/maskinrommet/src/health.rs
vegard 1e34c3c67a Valider fase 15–16: sikkerhet, konsistens og atomisk toggle
Tre fikser funnet under validering:

1. SIKKERHET: Admin-endepunkter manglet autorisasjonssjekk.
   Alle /admin/*-endepunkter brukte kun AuthUser (autentisert),
   ikke admin-rolle. Ny AdminUser-extractor sjekker owner/admin-edge
   til samling — returnerer 403 Forbidden for ikke-admins.
   Berører: maintenance, jobs, resources, health, ai, usage.

2. Race condition i toggle_effect: les-modifiser-skriv uten transaksjon
   på active_effects JSON. Erstattet med atomisk PG jsonb-operasjon.

3. Manglende updated_by i set_gain, set_mute, set_mixer_role, toggle_effect.
   Nå spores hvem som endret mixer-tilstanden.
2026-03-18 15:39:30 +00:00

488 lines
16 KiB
Rust

// Serverhelse-dashboard — tjeneste-status, metrikker, backup-status, logg-tilgang.
//
// Sjekker alle tjenester i stacken (PG, Caddy, Authentik, LiteLLM,
// Whisper, LiveKit) og samler system-metrikker (CPU, minne, disk).
//
// Ref: docs/concepts/adminpanelet.md § 4 "Serverhelse", oppgave 15.6
use axum::{extract::{Query, State}, http::StatusCode, Json};
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use crate::auth::AdminUser;
use crate::AppState;
// =============================================================================
// Typer
// =============================================================================
/// Result of a single service health probe.
#[derive(Serialize)]
pub struct ServiceStatus {
    /// Human-readable service name, e.g. "PostgreSQL".
    pub name: String,
    pub status: String, // "up", "down", "degraded"
    /// Probe round-trip time; None when the service is down.
    pub latency_ms: Option<u64>,
    /// Error text or HTTP status detail when not fully up.
    pub details: Option<String>,
}
/// Host-level metrics, sampled from /proc (Linux-specific).
#[derive(Serialize)]
pub struct SystemMetrics {
    /// Approximation derived from load average / core count, not /proc/stat deltas.
    pub cpu_usage_percent: f32,
    pub cpu_cores: usize,
    /// 1-, 5- and 15-minute load averages.
    pub load_avg: [f32; 3],
    pub memory_total_bytes: u64,
    pub memory_used_bytes: u64,
    pub memory_available_bytes: u64,
    pub memory_usage_percent: f32,
    /// Disk usage for the CAS root (falls back to "/").
    pub disk: crate::resources::DiskStatus,
    pub uptime_seconds: u64,
}
/// Status of one backup source (currently only the PostgreSQL dump).
#[derive(Serialize)]
pub struct BackupInfo {
    pub backup_type: String,
    /// RFC 3339 timestamp of the newest backup file, if any was found.
    pub last_success: Option<String>,
    /// Filesystem path of the newest backup file.
    pub path: Option<String>,
    pub status: String, // "ok", "missing", "stale"
}
/// One parsed log line from journalctl or docker logs.
#[derive(Serialize)]
pub struct LogEntry {
    /// Raw timestamp prefix from the log line; empty if none could be split off.
    pub timestamp: String,
    pub service: String,
    /// Heuristic level: "error", "warn", "info" or "debug" (keyword match).
    pub level: String,
    pub message: String,
}
/// Complete payload for GET /admin/health.
#[derive(Serialize)]
pub struct HealthDashboard {
    pub services: Vec<ServiceStatus>,
    pub metrics: SystemMetrics,
    pub backups: Vec<BackupInfo>,
    pub pg_stats: PgStats,
}
/// PostgreSQL server statistics collected via catalog queries.
#[derive(Serialize)]
pub struct PgStats {
    pub active_connections: i64,
    pub max_connections: i64,
    pub database_size_bytes: i64,
    /// Active queries, excluding the stat queries themselves.
    pub active_queries: i64,
}
/// Query parameters for GET /admin/health/logs.
#[derive(Deserialize)]
pub struct LogsQuery {
    /// Specific service to tail; omitted means "sample all known services".
    pub service: Option<String>,
    /// Requested line count; defaults to 50, capped at 200 by the handler.
    pub lines: Option<usize>,
}
/// Response payload for GET /admin/health/logs.
#[derive(Serialize)]
pub struct LogsResponse {
    pub entries: Vec<LogEntry>,
}
// =============================================================================
// Tjeneste-sjekker
// =============================================================================
/// Probe PostgreSQL by round-tripping a trivial query and timing it.
async fn check_pg(db: &PgPool) -> ServiceStatus {
    let started = std::time::Instant::now();
    let probe = sqlx::query_scalar::<_, i32>("SELECT 1").fetch_one(db).await;
    match probe {
        Ok(_) => ServiceStatus {
            name: "PostgreSQL".to_string(),
            status: "up".to_string(),
            latency_ms: Some(started.elapsed().as_millis() as u64),
            details: None,
        },
        // Any error (connection, auth, timeout) marks the database down.
        Err(err) => ServiceStatus {
            name: "PostgreSQL".to_string(),
            status: "down".to_string(),
            latency_ms: None,
            details: Some(err.to_string()),
        },
    }
}
/// Probe an HTTP service with a 5-second timeout.
///
/// Returns "up" for 2xx as well as 401/403 (the service is alive, we just
/// aren't authenticated), "degraded" for any other HTTP status, and "down"
/// on transport errors.
async fn check_http_service(name: &str, url: &str) -> ServiceStatus {
    // Build the client once and reuse it for every probe: this enables
    // connection pooling and removes the per-call `.build().unwrap()`.
    static CLIENT: std::sync::OnceLock<reqwest::Client> = std::sync::OnceLock::new();
    let client = CLIENT.get_or_init(|| {
        reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(5))
            .build()
            // Only a broken TLS backend could make this fail; that is a
            // startup-level bug, not a recoverable probe error.
            .expect("default reqwest client with timeout must be buildable")
    });
    let start = std::time::Instant::now();
    match client.get(url).send().await {
        Ok(resp) => {
            let latency = start.elapsed().as_millis() as u64;
            let status_code = resp.status();
            if status_code.is_success() || status_code.as_u16() == 401 || status_code.as_u16() == 403 {
                // 401/403 means the service is running, only auth is missing.
                ServiceStatus {
                    name: name.to_string(),
                    status: "up".to_string(),
                    latency_ms: Some(latency),
                    details: None,
                }
            } else {
                ServiceStatus {
                    name: name.to_string(),
                    status: "degraded".to_string(),
                    latency_ms: Some(latency),
                    details: Some(format!("HTTP {}", status_code)),
                }
            }
        }
        Err(e) => ServiceStatus {
            name: name.to_string(),
            status: "down".to_string(),
            latency_ms: None,
            details: Some(format!("{e}")),
        },
    }
}
/// Probe the local Caddy admin API as a liveness check.
async fn check_caddy() -> ServiceStatus {
    // Caddy runs locally; its admin API (default port 2019) serves /config/ when healthy.
    check_http_service("Caddy", "http://localhost:2019/config/").await
}
/// Probe Authentik's readiness endpoint (reached via Caddy).
///
/// The URL is deployment-specific, so it is now overridable via the
/// AUTHENTIK_HEALTH_URL env var — consistent with the LiteLLM/Whisper/LiveKit
/// checks. The previous hard-coded URL remains the default.
async fn check_authentik() -> ServiceStatus {
    let url = std::env::var("AUTHENTIK_HEALTH_URL")
        .unwrap_or_else(|_| "https://auth.sidelinja.org/-/health/ready/".to_string());
    check_http_service("Authentik", &url).await
}
/// Probe the LiteLLM AI gateway's /health endpoint.
async fn check_litellm() -> ServiceStatus {
    // Base URL is configurable; the default matches the local setup.
    let base = match std::env::var("AI_GATEWAY_URL") {
        Ok(value) => value,
        Err(_) => "http://localhost:4000".to_string(),
    };
    let health_url = format!("{base}/health");
    check_http_service("LiteLLM", &health_url).await
}
/// Probe the Whisper transcription service's /health endpoint.
async fn check_whisper() -> ServiceStatus {
    // Base URL is configurable; the default matches the local setup.
    let base = match std::env::var("WHISPER_URL") {
        Ok(value) => value,
        Err(_) => "http://localhost:8000".to_string(),
    };
    let health_url = format!("{base}/health");
    check_http_service("Whisper", &health_url).await
}
/// Probe LiveKit at its base URL (no dedicated health path is used).
async fn check_livekit() -> ServiceStatus {
    let endpoint = match std::env::var("LIVEKIT_URL") {
        Ok(value) => value,
        Err(_) => "http://localhost:7880".to_string(),
    };
    check_http_service("LiveKit", &endpoint).await
}
// =============================================================================
// System-metrikker
// =============================================================================
/// Approximate instantaneous CPU usage in percent.
///
/// No /proc/stat sampling window is taken; instead the 1-minute load
/// average is divided by the core count and capped at 100 %.
fn read_cpu_usage() -> f32 {
    let one_minute_load = read_load_avg()[0];
    let estimate = one_minute_load / num_cpus() as f32 * 100.0;
    estimate.min(100.0)
}
/// Number of logical CPUs; falls back to 1 when the platform can't report it.
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}
/// Read the 1/5/15-minute load averages from /proc/loadavg.
///
/// Any field that is missing or unparseable becomes 0.0 (e.g. on non-Linux
/// hosts where the file does not exist).
fn read_load_avg() -> [f32; 3] {
    let raw = std::fs::read_to_string("/proc/loadavg").unwrap_or_default();
    // Only the first three whitespace-separated fields are load averages.
    let mut fields = raw
        .split_whitespace()
        .take(3)
        .filter_map(|tok| tok.parse::<f32>().ok());
    let mut next = || fields.next().unwrap_or(0.0);
    [next(), next(), next()]
}
/// Read memory totals from /proc/meminfo.
///
/// Returns (total, used, available) in bytes, where used is derived as
/// total - available. All values are 0 when the file is unreadable.
fn read_memory_info() -> (u64, u64, u64) {
    let meminfo = std::fs::read_to_string("/proc/meminfo").unwrap_or_default();
    let mut total: u64 = 0;
    let mut available: u64 = 0;
    for row in meminfo.lines() {
        if let Some((key, rest)) = row.split_once(':') {
            match key {
                // /proc/meminfo reports kB; convert to bytes.
                "MemTotal" => total = parse_meminfo_kb(rest) * 1024,
                "MemAvailable" => available = parse_meminfo_kb(rest) * 1024,
                _ => {}
            }
        }
    }
    (total, total.saturating_sub(available), available)
}

/// Parse the numeric kB value from a /proc/meminfo field like "  16315336 kB".
fn parse_meminfo_kb(s: &str) -> u64 {
    s.split_whitespace()
        .next()
        .map_or(0, |tok| tok.parse().unwrap_or(0))
}
/// System uptime in whole seconds from /proc/uptime; 0 when unreadable.
fn read_uptime() -> u64 {
    let text = std::fs::read_to_string("/proc/uptime").unwrap_or_default();
    // First field is uptime as a fractional second count.
    match text.split_whitespace().next().and_then(|tok| tok.parse::<f64>().ok()) {
        Some(secs) => secs as u64,
        None => 0,
    }
}
/// Gather all system metrics (CPU, load, memory, disk, uptime) in one snapshot.
fn collect_metrics() -> SystemMetrics {
    let cores = num_cpus();
    let load = read_load_avg();
    let (mem_total, mem_used, mem_available) = read_memory_info();
    // Guard against a zero total (e.g. /proc/meminfo unreadable).
    let mem_percent = match mem_total {
        0 => 0.0,
        total => (mem_used as f64 / total as f64 * 100.0) as f32,
    };
    // Disk status for the CAS root; fall back to "/" and finally to zeros
    // so the dashboard never fails just because a mount is missing.
    let cas_root = std::env::var("CAS_ROOT")
        .unwrap_or_else(|_| "/srv/synops/media/cas".to_string());
    let disk = crate::resources::check_disk_usage(&cas_root)
        .or_else(|_| crate::resources::check_disk_usage("/"))
        .unwrap_or(crate::resources::DiskStatus {
            mount_point: "/".to_string(),
            total_bytes: 0,
            used_bytes: 0,
            available_bytes: 0,
            usage_percent: 0.0,
            alert_level: None,
        });
    SystemMetrics {
        cpu_usage_percent: read_cpu_usage(),
        cpu_cores: cores,
        load_avg: load,
        memory_total_bytes: mem_total,
        memory_used_bytes: mem_used,
        memory_available_bytes: mem_available,
        memory_usage_percent: mem_percent,
        disk,
        uptime_seconds: read_uptime(),
    }
}
// =============================================================================
// PG-statistikk
// =============================================================================
/// Collect PostgreSQL statistics via catalog queries.
///
/// Every stat is best-effort: a failed query falls back to a default value
/// instead of failing the whole dashboard.
async fn collect_pg_stats(db: &PgPool) -> PgStats {
    // Small helper: run a scalar query and substitute `fallback` on error.
    async fn scalar_or(db: &PgPool, sql: &str, fallback: i64) -> i64 {
        sqlx::query_scalar::<_, i64>(sql)
            .fetch_one(db)
            .await
            .unwrap_or(fallback)
    }
    PgStats {
        active_connections: scalar_or(
            db,
            "SELECT count(*) FROM pg_stat_activity WHERE state = 'active'",
            0,
        )
        .await,
        max_connections: scalar_or(
            db,
            "SELECT setting::bigint FROM pg_settings WHERE name = 'max_connections'",
            100,
        )
        .await,
        database_size_bytes: scalar_or(
            db,
            "SELECT pg_database_size(current_database())",
            0,
        )
        .await,
        // Excludes the stat queries themselves from the count.
        active_queries: scalar_or(
            db,
            "SELECT count(*) FROM pg_stat_activity WHERE state = 'active' AND query NOT LIKE '%pg_stat_activity%'",
            0,
        )
        .await,
    }
}
// =============================================================================
// Backup-status
// =============================================================================
/// Check backup freshness. Currently only the PostgreSQL dump directory.
///
/// Status is "missing" when no *.dump file exists, "ok" when the newest one
/// is younger than 25 hours (daily cron plus one hour of slack), "stale"
/// otherwise.
fn check_backups() -> Vec<BackupInfo> {
    let pg_dir = "/srv/synops/backup/pg";
    let mut pg_backup = BackupInfo {
        backup_type: "PostgreSQL dump".to_string(),
        last_success: None,
        path: None,
        status: "missing".to_string(),
    };
    // Scan the directory for the most recently modified *.dump file.
    // The first entry encountered wins ties on mtime (strict `>` comparison).
    let newest = std::fs::read_dir(pg_dir)
        .into_iter()
        .flatten()
        .flatten()
        .filter(|entry| entry.file_name().to_string_lossy().ends_with(".dump"))
        .filter_map(|entry| {
            let modified = entry.metadata().ok()?.modified().ok()?;
            Some((modified, entry.path()))
        })
        .fold(
            None::<(std::time::SystemTime, std::path::PathBuf)>,
            |best, (modified, path)| match best {
                Some((best_time, best_path)) if modified <= best_time => {
                    Some((best_time, best_path))
                }
                _ => Some((modified, path)),
            },
        );
    if let Some((modified, path)) = newest {
        pg_backup.last_success =
            Some(chrono::DateTime::<chrono::Utc>::from(modified).to_rfc3339());
        pg_backup.path = Some(path.to_string_lossy().to_string());
        // elapsed() can fail on clock skew; default (0) then reads as fresh.
        let age = modified.elapsed().unwrap_or_default();
        pg_backup.status = if age.as_secs() < 86400 + 3600 {
            "ok".to_string()
        } else {
            "stale".to_string()
        };
    }
    vec![pg_backup]
}
// =============================================================================
// Logg-tilgang
// =============================================================================
/// Tail recent log lines for one service, newest first.
///
/// Uses `journalctl` for systemd units and `docker logs` for containers.
/// The service name is matched against a fixed whitelist before it is
/// interpolated into the shell command, so no untrusted text reaches bash;
/// unknown names return an empty Vec.
fn read_service_logs(service: &str, max_lines: usize) -> Vec<LogEntry> {
    let cmd = match service {
        "maskinrommet" | "caddy" | "sveltekit" => {
            format!("journalctl -u {service} --no-pager -n {max_lines} --output=short-iso 2>/dev/null")
        }
        "postgres" | "authentik" | "litellm" | "whisper" | "livekit" => {
            let container = match service {
                "postgres" => "sidelinja-postgres-1",
                "authentik" => "sidelinja-authentik-server-1",
                "litellm" => "sidelinja-ai-gateway-1",
                "whisper" => "sidelinja-faster-whisper-1",
                "livekit" => "sidelinja-livekit-1",
                _ => return Vec::new(),
            };
            format!("docker logs --tail {max_lines} --timestamps {container} 2>&1")
        }
        _ => return Vec::new(),
    };
    let output = std::process::Command::new("bash")
        .arg("-c")
        .arg(&cmd)
        .output();
    match output {
        Ok(out) => {
            let text = String::from_utf8_lossy(&out.stdout);
            text.lines()
                .rev() // newest first
                .take(max_lines)
                .map(|line| {
                    // Both journalctl short-iso and docker --timestamps put a
                    // fixed-width timestamp at the front of the line.
                    // BUGFIX: `line.len() > 24` alone is not enough — slicing
                    // at byte 24 panics when that offset lands inside a
                    // multi-byte UTF-8 character, which container logs can
                    // contain. Guard with is_char_boundary and otherwise
                    // treat the whole line as the message.
                    let (ts, msg) = if line.len() > 24 && line.is_char_boundary(24) {
                        (line[..24].trim().to_string(), line[24..].trim().to_string())
                    } else {
                        (String::new(), line.to_string())
                    };
                    // Crude level classification from common log markers.
                    let level = if msg.contains("ERROR") || msg.contains("error") || msg.contains("ERR") {
                        "error"
                    } else if msg.contains("WARN") || msg.contains("warn") {
                        "warn"
                    } else if msg.contains("INFO") || msg.contains("info") {
                        "info"
                    } else {
                        "debug"
                    };
                    LogEntry {
                        timestamp: ts,
                        service: service.to_string(),
                        level: level.to_string(),
                        message: msg,
                    }
                })
                .collect()
        }
        // Failing to spawn bash yields an empty log view (best-effort).
        Err(_) => Vec::new(),
    }
}
// =============================================================================
// API-handlers
// =============================================================================
/// GET /admin/health — complete server-health dashboard.
///
/// Admin-only (enforced by the AdminUser extractor). Probes every service
/// in the stack concurrently, then attaches system metrics, backup status
/// and PostgreSQL statistics.
pub async fn health_dashboard(
    State(state): State<AppState>,
    _admin: AdminUser,
) -> Result<Json<HealthDashboard>, (StatusCode, Json<crate::intentions::ErrorResponse>)> {
    // None of the probes depend on one another, so run them all at once.
    let checks = tokio::join!(
        check_pg(&state.db),
        check_caddy(),
        check_authentik(),
        check_litellm(),
        check_whisper(),
        check_livekit(),
    );
    let services = vec![checks.0, checks.1, checks.2, checks.3, checks.4, checks.5];
    // NOTE(review): collect_metrics/check_backups do blocking /proc and
    // filesystem reads inside this async handler — presumably fast enough
    // here; confirm, or move to spawn_blocking if latency shows up.
    let pg_stats = collect_pg_stats(&state.db).await;
    Ok(Json(HealthDashboard {
        services,
        metrics: collect_metrics(),
        backups: check_backups(),
        pg_stats,
    }))
}
/// GET /admin/health/logs?service=maskinrommet&lines=50
pub async fn health_logs(
_admin: AdminUser,
Query(params): Query<LogsQuery>,
) -> Json<LogsResponse> {
let max_lines = params.lines.unwrap_or(50).min(200);
let entries = if let Some(service) = &params.service {
read_service_logs(service, max_lines)
} else {
// Alle tjenester, siste linjer fra hver
let services = ["maskinrommet", "caddy", "postgres", "authentik", "litellm", "whisper", "livekit"];
let per_service = (max_lines / services.len()).max(10);
let mut all = Vec::new();
for svc in &services {
all.extend(read_service_logs(svc, per_service));
}
// Sorter etter timestamp (nyeste først)
all.sort_by(|a, b| b.timestamp.cmp(&a.timestamp));
all.truncate(max_lines);
all
};
Json(LogsResponse { entries })
}