From b31ee5986874200fa52d1adf7e5fa7b274fcd9fe Mon Sep 17 00:00:00 2001 From: vegard Date: Wed, 18 Mar 2026 11:43:19 +0000 Subject: [PATCH] =?UTF-8?q?Ytelse:=20profiler=20PG-sp=C3=B8rringer,=20opti?= =?UTF-8?q?maliser=20node=5Faccess-oppdatering=20(oppgave=2012.4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Profilert alle kritiske PG-spørringer med EXPLAIN ANALYZE. Identifiserte at recompute_access brukte single-column index (idx_edges_type) med lav selektivitet, og RLS-policyer manglet composite indexes for effektive oppslag. Endringer: Migrasjon 017_query_performance.sql: - 6 nye composite indexes: - idx_edges_target_type (target_id, edge_type) — recompute_access + belongs_to - idx_edges_source_type (source_id, edge_type) — alias-oppslag - idx_edges_target_memberof (partial, member_of) — team-propagering - idx_nodes_created_at_desc — ORDER BY created_at DESC - idx_nodes_kind_created — filtrer på kind + sorter - idx_na_subject_covering INCLUDE (object_id) — RLS without heap lookup - Optimalisert recompute_access(): steg 3 og 4 kjøres nå bare når det er relevant (EXISTS-sjekk først). For vanlige brukere (ikke team) unngår dette to fulle INSERT-SELECT-operasjoner. - via_edge oppdateres nå korrekt ved access-nivå-endring. Slow query logging (maskinrommet): - Forespørsler >200ms logges som WARN med tag slow_request - PG-spørringer >100ms logges som WARN med tag slow_query - recompute_access-kall logges med varighet for overvåking - Nytt pg_stats-felt i /metrics med tabell- og index-statistikk, cache hit ratio, og node_access-telling Dokumentasjon oppdatert i docs/infra/observerbarhet.md. 
--- docs/infra/observerbarhet.md | 63 +++++++++- maskinrommet/src/metrics.rs | 152 ++++++++++++++++++++++-- maskinrommet/src/pg_writes.rs | 10 ++ maskinrommet/src/queries.rs | 20 ++++ migrations/017_query_performance.sql | 170 +++++++++++++++++++++++++++ tasks.md | 3 +- 6 files changed, 405 insertions(+), 13 deletions(-) create mode 100644 migrations/017_query_performance.sql diff --git a/docs/infra/observerbarhet.md b/docs/infra/observerbarhet.md index ead3cbe..368abd2 100644 --- a/docs/infra/observerbarhet.md +++ b/docs/infra/observerbarhet.md @@ -82,6 +82,38 @@ Aggregerte AI-kostnader fra `ai_usage_log` for tre tidsperioder Kostnadsestimatet er basert på konservative gjennomsnittspriser ($3/MTok input, $15/MTok output). +### pg_stats + +PG-ytelsesstatistikk for kjernetabellene (nodes, edges, node_access): + +```json +{ + "tables": [ + { + "table_name": "nodes", + "row_count": 19, + "index_scan_count": 50, + "seq_scan_count": 10, + "index_scan_ratio": 0.83 + } + ], + "indexes": [ + { + "index_name": "idx_edges_target_type", + "table_name": "edges", + "scan_count": 25, + "size_bytes": 16384 + } + ], + "node_access_count": 4, + "cache_hit_ratio": 0.9995 +} +``` + +`index_scan_ratio` bør være >0.95 for tabeller med >1000 rader. +`cache_hit_ratio` bør være >0.99 — lavere indikerer at PG trenger mer +`shared_buffers`. + ## Request-logging middleware Alle HTTP-forespørsler logges med: @@ -91,8 +123,33 @@ Alle HTTP-forespørsler logges med: - `status` — HTTP-statuskode - `duration_ms` — Total responstid -Dette erstatter `tower_http::TraceLayer` med en mer strukturert -variant som også mater metrikk-samleren. +Forespørsler over 200ms logges med `level=WARN` og tag `slow_request`. + +## Slow query logging + +PG-spørringer som tar over 100ms logges med `level=WARN` og tag +`slow_query`. Terskelen er definert i `queries.rs::SLOW_QUERY_THRESHOLD_MS`. 
+ +Instrumenterte spørringer: +- `query_nodes` — søk og filtrering med RLS +- `query_graph` — rekursiv CTE graf-traversering med RLS +- `recompute_access` — node_access-oppdatering (logges i `pg_writes.rs`) + +## pg_stat_statements (valgfritt) + +For detaljert per-spørring statistikk, aktiver pg_stat_statements: + +1. Legg til i docker-compose.yml for postgres: + `command: postgres -c shared_preload_libraries=pg_stat_statements -c pg_stat_statements.track=all` +2. Restart postgres-containeren +3. Kjør: `CREATE EXTENSION pg_stat_statements;` +4. Se topp-spørringer: + ```sql + SELECT query, calls, total_exec_time, mean_exec_time + FROM pg_stat_statements + ORDER BY total_exec_time DESC + LIMIT 20; + ``` ## Arkitektur @@ -100,5 +157,5 @@ Metrikker samles i minnet (ingen ekstern avhengighet som Prometheus). Designvalg: - **Enkel:** Ingen nye avhengigheter eller ekstern infrastruktur -- **Tilstrekkelig:** Queue depth og AI-kostnad fra PG, latency in-memory +- **Tilstrekkelig:** Queue depth, AI-kostnad og PG-stats fra PG, latency in-memory - **Utvidbar:** Kan legge til Prometheus-eksport senere ved behov diff --git a/maskinrommet/src/metrics.rs b/maskinrommet/src/metrics.rs index cee300b..6943c57 100644 --- a/maskinrommet/src/metrics.rs +++ b/maskinrommet/src/metrics.rs @@ -2,8 +2,9 @@ // // Samler request-latency per rute, eksponerer via /metrics-endepunkt. // Queue depth og AI-kostnad hentes direkte fra PG ved forespørsel. +// Slow query logging: spørringer over terskel logges med kontekst. // -// Ref: oppgave 12.1 +// Ref: oppgave 12.1, 12.4 use axum::{ body::Body, @@ -271,7 +272,12 @@ async fn query_ai_cost(db: &PgPool) -> Result { // --- Middleware --- +/// Terskelverdi for treg forespørsel (millisekunder). +/// Forespørsler over denne verdien logges som warning. +const SLOW_REQUEST_THRESHOLD_MS: f64 = 200.0; + /// Axum-middleware som måler request-latency og logger strukturert. +/// Forespørsler over SLOW_REQUEST_THRESHOLD_MS logges som warning med ekstra kontekst. 
pub async fn latency_middleware( State(state): State, request: Request, @@ -285,17 +291,28 @@ pub async fn latency_middleware( let duration = start.elapsed(); let duration_us = duration.as_micros() as u64; + let duration_ms = duration.as_secs_f64() * 1000.0; let status = response.status().as_u16(); state.metrics.record(&path, duration_us); - tracing::info!( - method = %method, - path = %path, - status = status, - duration_ms = duration.as_secs_f64() * 1000.0, - "request" - ); + if duration_ms >= SLOW_REQUEST_THRESHOLD_MS { + tracing::warn!( + method = %method, + path = %path, + status = status, + duration_ms = duration_ms, + "slow_request" + ); + } else { + tracing::info!( + method = %method, + path = %path, + status = status, + duration_ms = duration_ms, + "request" + ); + } response } @@ -307,6 +324,7 @@ struct MetricsResponse { request_latency: MetricsSnapshot, queue_depth: Option, ai_cost: Option, + pg_stats: Option, } /// GET /metrics — samlet observerbarhets-endepunkt. @@ -317,10 +335,128 @@ pub async fn metrics_endpoint( let queue_depth = query_queue_depth(&state.db).await.ok(); let ai_cost = query_ai_cost(&state.db).await.ok(); + let pg_stats = query_pg_stats(&state.db).await.ok(); Ok(Json(MetricsResponse { request_latency, queue_depth, ai_cost, + pg_stats, })) } + +// --- PG Query Stats --- + +#[derive(Serialize)] +pub struct PgTableStats { + pub table_name: String, + pub row_count: i64, + pub index_scan_count: i64, + pub seq_scan_count: i64, + /// Ratio av index scans til totale scans (0.0-1.0). Høyere er bedre. 
+ pub index_scan_ratio: f64, +} + +#[derive(Serialize)] +pub struct PgIndexStats { + pub index_name: String, + pub table_name: String, + pub scan_count: i64, + pub size_bytes: i64, +} + +#[derive(Serialize)] +pub struct PgQueryStats { + pub tables: Vec, + pub indexes: Vec, + pub node_access_count: i64, + pub cache_hit_ratio: f64, +} + +async fn query_pg_stats(db: &PgPool) -> Result { + // Tabell-statistikk: seq_scan vs index_scan for kjernetabellene + let tables = sqlx::query_as::<_, (String, i64, i64, i64)>( + r#" + SELECT + relname::text, + COALESCE(n_live_tup, 0), + COALESCE(idx_scan, 0), + COALESCE(seq_scan, 0) + FROM pg_stat_user_tables + WHERE relname IN ('nodes', 'edges', 'node_access', 'job_queue', 'auth_identities') + ORDER BY relname + "#, + ) + .fetch_all(db) + .await?; + + let table_stats: Vec = tables + .into_iter() + .map(|(name, rows, idx_scans, seq_scans)| { + let total = idx_scans + seq_scans; + PgTableStats { + table_name: name, + row_count: rows, + index_scan_count: idx_scans, + seq_scan_count: seq_scans, + index_scan_ratio: if total > 0 { + idx_scans as f64 / total as f64 + } else { + 1.0 + }, + } + }) + .collect(); + + // Index-bruksstatistikk for de viktigste indeksene + let indexes = sqlx::query_as::<_, (String, String, i64, i64)>( + r#" + SELECT + indexrelname::text, + relname::text, + COALESCE(idx_scan, 0), + pg_relation_size(indexrelid) + FROM pg_stat_user_indexes + WHERE relname IN ('nodes', 'edges', 'node_access') + ORDER BY idx_scan DESC + "#, + ) + .fetch_all(db) + .await?; + + let index_stats: Vec = indexes + .into_iter() + .map(|(idx_name, tbl_name, scans, size)| PgIndexStats { + index_name: idx_name, + table_name: tbl_name, + scan_count: scans, + size_bytes: size, + }) + .collect(); + + // node_access row count + let (na_count,): (i64,) = sqlx::query_as("SELECT COUNT(*) FROM node_access") + .fetch_one(db) + .await?; + + // Cache hit ratio + let (hit_ratio,): (f64,) = sqlx::query_as( + r#" + SELECT COALESCE( + 
SUM(heap_blks_hit)::float / NULLIF(SUM(heap_blks_hit) + SUM(heap_blks_read), 0), + 1.0 + ) + FROM pg_statio_user_tables + WHERE relname IN ('nodes', 'edges', 'node_access') + "#, + ) + .fetch_one(db) + .await?; + + Ok(PgQueryStats { + tables: table_stats, + indexes: index_stats, + node_access_count: na_count, + cache_hit_ratio: (hit_ratio * 10000.0).round() / 10000.0, + }) +} diff --git a/maskinrommet/src/pg_writes.rs b/maskinrommet/src/pg_writes.rs index 4d0003f..1ca7010 100644 --- a/maskinrommet/src/pg_writes.rs +++ b/maskinrommet/src/pg_writes.rs @@ -224,6 +224,7 @@ pub async fn handle_insert_edge( if let Some(level) = access_level { // Tilgangsgivende edge: transaksjon med recompute_access + let recompute_start = std::time::Instant::now(); let mut tx = db.begin().await.map_err(|e| format!("PG begin: {e}"))?; sqlx::query( @@ -254,12 +255,21 @@ pub async fn handle_insert_edge( tx.commit().await.map_err(|e| format!("PG commit: {e}"))?; + let recompute_ms = recompute_start.elapsed().as_secs_f64() * 1000.0; tracing::info!( edge_id = %edge_id, edge_type = %edge_type, access_level = %level, + recompute_ms = recompute_ms, "Edge + node_access persistert til PostgreSQL (via jobbkø)" ); + if recompute_ms >= 100.0 { + tracing::warn!( + edge_id = %edge_id, + duration_ms = recompute_ms, + "slow_recompute_access" + ); + } // Synk node_access til STDB (best-effort, feil logger men feiler ikke jobben) sync_node_access_to_stdb(db, stdb, source_id).await; diff --git a/maskinrommet/src/queries.rs b/maskinrommet/src/queries.rs index 1f21985..6285531 100644 --- a/maskinrommet/src/queries.rs +++ b/maskinrommet/src/queries.rs @@ -16,6 +16,21 @@ use crate::auth::AuthUser; use crate::AppState; use crate::intentions::ErrorResponse; +/// Terskelverdi for treg PG-spørring (millisekunder). +const SLOW_QUERY_THRESHOLD_MS: f64 = 100.0; + +/// Logger en spørring som treg hvis varigheten overstiger terskelen. 
+fn log_slow_query(query_name: &str, duration_ms: f64) { + if duration_ms >= SLOW_QUERY_THRESHOLD_MS { + tracing::warn!( + query = query_name, + duration_ms = duration_ms, + threshold_ms = SLOW_QUERY_THRESHOLD_MS, + "slow_query" + ); + } +} + // ============================================================================= // GET /query/segments — transkripsjons-segmenter for en node // ============================================================================= @@ -474,6 +489,7 @@ async fn run_query_nodes( limit: i64, offset: i64, ) -> Result { + let query_start = std::time::Instant::now(); let mut tx = db.begin().await?; set_rls_context(&mut tx, user_node_id).await?; @@ -555,6 +571,8 @@ async fn run_query_nodes( // Transaksjon avsluttes — SET LOCAL tilbakestilles automatisk tx.commit().await?; + log_slow_query("query_nodes", query_start.elapsed().as_secs_f64() * 1000.0); + Ok(QueryNodesResponse { nodes, total }) } @@ -1072,6 +1090,7 @@ async fn run_query_graph( edge_type_filter: &Option>, node_kind_filter: &Option>, ) -> Result { + let query_start = std::time::Instant::now(); let mut tx = db.begin().await?; set_rls_context(&mut tx, user_node_id).await?; @@ -1146,6 +1165,7 @@ async fn run_query_graph( }; tx.commit().await?; + log_slow_query("query_graph", query_start.elapsed().as_secs_f64() * 1000.0); // Appliser klientside-filtre let mut graph_nodes: Vec = nodes diff --git a/migrations/017_query_performance.sql b/migrations/017_query_performance.sql new file mode 100644 index 0000000..1de148a --- /dev/null +++ b/migrations/017_query_performance.sql @@ -0,0 +1,170 @@ +-- 017_query_performance.sql +-- Ytelsesoptimalisering: composite indexes og forbedret recompute_access. +-- +-- Profileringsresultater viste at: +-- 1. recompute_access step 2 bruker idx_edges_type (lav selektivitet) og +-- filtrerer på target_id etterpå — trenger composite index. +-- 2. 
RLS edge_select policy gjør IN-sjekk mot node_access for source_id
+-- og target_id — composite index på edges (target_id, edge_type) hjelper
+-- joinmønsteret i recompute_access og belongs_to-oppslag.
+-- 3. Alias-oppslag (source_id + edge_type + system) mangler composite index.
+-- 4. nodes-oppslag sortert på created_at DESC mangler index.
+-- 5. recompute_access kjører steg 3 og 4 uansett, selv om de sjelden
+-- treffer — optimaliser med betingede sjekker.
+--
+-- Ref: oppgave 12.4, docs/retninger/datalaget.md
+
+BEGIN;
+
+-- =============================================================================
+-- Composite indexes på edges
+-- =============================================================================
+
+-- Dekker recompute_access steg 2 (belongs_to-oppslag mot target)
+-- og generelle "hent barn av node"-spørringer (query_board, etc.)
+CREATE INDEX IF NOT EXISTS idx_edges_target_type
+    ON edges (target_id, edge_type);
+
+-- Dekker alias-oppslag og "finn utgående edges av type"-spørringer
+CREATE INDEX IF NOT EXISTS idx_edges_source_type
+    ON edges (source_id, edge_type);
+
+-- Partial index for member_of-oppslag i recompute_access steg 3.
+-- Begrenset til member_of-edges — liten og billig å vedlikeholde.
+CREATE INDEX IF NOT EXISTS idx_edges_target_memberof
+    ON edges (target_id)
+    WHERE edge_type = 'member_of';
+
+-- =============================================================================
+-- Composite index på nodes for sortering
+-- =============================================================================
+
+-- query_nodes bruker ORDER BY created_at DESC med LIMIT/OFFSET.
+-- Med RLS kreves sekvensielt scan, men for ikke-RLS-spørringer
+-- (superuser context) gir dette stor gevinst.
+CREATE INDEX IF NOT EXISTS idx_nodes_created_at_desc + ON nodes (created_at DESC); + +-- Dekker vanlig mønster: filtrer på node_kind, sorter på created_at +CREATE INDEX IF NOT EXISTS idx_nodes_kind_created + ON nodes (node_kind, created_at DESC); + +-- ============================================================================= +-- Covering index på node_access for RLS +-- ============================================================================= + +-- RLS-policyen gjør: WHERE subject_id = current_node_id() +-- og returnerer object_id. Covering index unngår heap-oppslag. +-- Erstatter ikke idx_na_subject som brukes av PK, men PostgreSQL +-- velger denne fordi den dekker hele spørringen. +CREATE INDEX IF NOT EXISTS idx_na_subject_covering + ON node_access (subject_id) INCLUDE (object_id); + +-- ============================================================================= +-- Optimalisert recompute_access +-- ============================================================================= + +-- Hovedforbedring: steg 3 (team→medlemmer) og steg 4 (arv fra team) +-- kjøres nå bare når det er relevant. Steg 3 sjekker om subject faktisk +-- er et team (har member_of-edges mot seg). Steg 4 sjekker om root_node +-- faktisk har egne node_access-rader (= er et team/entitet med tilgang). +-- For den vanlige casen (bruker→samling owner/admin) gjør dette at +-- steg 3 og 4 er billige EXISTS-sjekker i stedet for fulle INSERT-selects. 
+ +CREATE OR REPLACE FUNCTION recompute_access( + p_subject_id UUID, + p_root_node_id UUID, + p_access access_level, + p_via_edge UUID +) RETURNS void AS $$ +BEGIN + -- Steg 1: Direkte tilgang til roten + INSERT INTO node_access (subject_id, object_id, access, via_edge) + VALUES (p_subject_id, p_root_node_id, p_access, p_via_edge) + ON CONFLICT (subject_id, object_id) + DO UPDATE SET access = GREATEST(node_access.access, p_access), + via_edge = CASE + WHEN p_access > node_access.access THEN p_via_edge + ELSE node_access.via_edge + END; + + -- Steg 2: Transitiv: noder som tilhører roten (belongs_to) + -- Bruker idx_edges_target_type (target_id, edge_type) for rask oppslag + INSERT INTO node_access (subject_id, object_id, access, via_edge) + SELECT p_subject_id, e.source_id, p_access, p_via_edge + FROM edges e + WHERE e.target_id = p_root_node_id + AND e.edge_type = 'belongs_to' + ON CONFLICT (subject_id, object_id) + DO UPDATE SET access = GREATEST(node_access.access, p_access), + via_edge = CASE + WHEN p_access > node_access.access THEN p_via_edge + ELSE node_access.via_edge + END; + + -- Steg 3: Hvis subject er et team: propager til alle teammedlemmer. + -- Sjekk først om det finnes member_of-edges (= subject er et team). + -- For vanlige brukere er dette en billig EXISTS som returnerer false. + IF EXISTS ( + SELECT 1 FROM edges + WHERE target_id = p_subject_id AND edge_type = 'member_of' + LIMIT 1 + ) THEN + INSERT INTO node_access (subject_id, object_id, access, via_edge) + SELECT e.source_id, na.object_id, na.access, na.via_edge + FROM node_access na + JOIN edges e ON e.target_id = p_subject_id + AND e.edge_type = 'member_of' + WHERE na.subject_id = p_subject_id + ON CONFLICT (subject_id, object_id) + DO UPDATE SET access = GREATEST(node_access.access, EXCLUDED.access), + via_edge = CASE + WHEN EXCLUDED.access > node_access.access THEN EXCLUDED.via_edge + ELSE node_access.via_edge + END; + END IF; + + -- Steg 4: Team-transitivitet — arv tilgang fra teamet. 
+ -- Bare relevant når subject meldes inn i et team (root_node er team). + -- Sjekk om root_node har egne node_access-rader som subject. + IF EXISTS ( + SELECT 1 FROM node_access + WHERE subject_id = p_root_node_id + LIMIT 1 + ) THEN + INSERT INTO node_access (subject_id, object_id, access, via_edge) + SELECT p_subject_id, na.object_id, na.access, na.via_edge + FROM node_access na + WHERE na.subject_id = p_root_node_id + ON CONFLICT (subject_id, object_id) + DO UPDATE SET access = GREATEST(node_access.access, EXCLUDED.access), + via_edge = CASE + WHEN EXCLUDED.access > node_access.access THEN EXCLUDED.via_edge + ELSE node_access.via_edge + END; + END IF; +END; +$$ LANGUAGE plpgsql; + +-- ============================================================================= +-- pg_stat_statements (krever manuell konfigurasjon) +-- ============================================================================= + +-- pg_stat_statements krever shared_preload_libraries i postgresql.conf. +-- For å aktivere: +-- 1. Legg til i docker-compose.yml for postgres: +-- command: postgres -c shared_preload_libraries=pg_stat_statements +-- -c pg_stat_statements.track=all +-- 2. Restart postgres-containeren +-- 3. Kjør: CREATE EXTENSION pg_stat_statements; +-- 4. Se topp-spørringer: SELECT * FROM pg_stat_statements ORDER BY total_exec_time DESC LIMIT 20; + +-- ============================================================================= +-- Oppdater statistikk for query planner +-- ============================================================================= + +ANALYZE nodes; +ANALYZE edges; +ANALYZE node_access; + +COMMIT; diff --git a/tasks.md b/tasks.md index 72510d1..39138e2 100644 --- a/tasks.md +++ b/tasks.md @@ -270,8 +270,7 @@ kaller dem direkte. Samme verktøy, to brukere. - [x] 12.1 Observerbarhet: strukturert logging, metrikker (request latency, queue depth, AI cost). - [x] 12.2 Backup: PG-dump rutine, STDB → PG gjenoppbygging ved krasj. 
- [x] 12.3 Feilhåndtering: retry med backoff i skrivestien, dead letter queue for feilede PG-skrivinger. -- [~] 12.4 Ytelse: profiler PG-spørringer, optimaliser node_access-oppdatering. - > Påbegynt: 2026-03-18T11:30 +- [x] 12.4 Ytelse: profiler PG-spørringer, optimaliser node_access-oppdatering. ## Fase 22: SpacetimeDB-migrering — PG LISTEN/NOTIFY