diff --git a/tasks.md b/tasks.md index 4db251c..295c23b 100644 --- a/tasks.md +++ b/tasks.md @@ -335,8 +335,7 @@ Readability, og oppretter innholdsnode med AI-beriking. Brukes av @bot i chat ("les denne artikkelen"), orkestreringer, og fremtidig browser-extension. - [x] 25.1 `synops-clip` CLI: hent URL, parse med Readability (mozilla/readability via JS eller Rust-port), returner ren tekst + metadata (tittel, forfatter, dato, ingress). Fallback til headless browser (Playwright) for JS-rendrede sider. Detekter betalingsmur (kort/avkuttet innhold, "logg inn for å lese", kjente paywall-mønstre) — returner `"paywall": true` og tilgjengelig innhold (ingress/utdrag). Output: JSON med `title`, `author`, `date`, `content`, `url`, `paywall`. -- [~] 25.2 Node-opprettelse: `synops-clip --write` oppretter `content`-node med artikkelinnhold, `metadata.source_url`, og `tagged`-edge "clipped". AI-oppsummering via LiteLLM. `mentions`-edges til gjenkjente entiteter i kunnskapsgrafen. - > Påbegynt: 2026-03-18T18:25 +- [x] 25.2 Node-opprettelse: `synops-clip --write` oppretter `content`-node med artikkelinnhold, `metadata.source_url`, og `tagged`-edge "clipped". AI-oppsummering via LiteLLM. `mentions`-edges til gjenkjente entiteter i kunnskapsgrafen. - [ ] 25.3 @bot-integrasjon: bruker limer inn URL i chat → boten gjenkjenner URL, kaller `synops-clip`, presenterer oppsummering i chatten, oppretter node i bakgrunnen. Ved paywall: "Denne artikkelen er bak betalingsmur. Jeg fikk med tittel og ingress — lim inn innholdet om du vil dele resten." - [ ] 25.4 Orkestrering-støtte: `synops-clip` tilgjengelig som verktøy i orkestreringer. F.eks. "Clip alle URL-er som deles i #Redaksjonen og oppsummer dem". 
diff --git a/tools/synops-clip/Cargo.toml b/tools/synops-clip/Cargo.toml index 75d34d6..a1c774a 100644 --- a/tools/synops-clip/Cargo.toml +++ b/tools/synops-clip/Cargo.toml @@ -12,8 +12,10 @@ clap = { version = "4", features = ["derive"] } tokio = { version = "1", features = ["full"] } serde = { version = "1", features = ["derive"] } serde_json = "1" -reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } regex = "1" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } +uuid = { version = "1", features = ["v7", "serde"] } +sqlx = { version = "0.8", features = ["runtime-tokio", "tls-rustls", "postgres", "uuid", "json"] } synops-common = { path = "../synops-common" } diff --git a/tools/synops-clip/src/main.rs b/tools/synops-clip/src/main.rs index 3140f82..482ad7a 100644 --- a/tools/synops-clip/src/main.rs +++ b/tools/synops-clip/src/main.rs @@ -3,14 +3,23 @@ // Input: --url // Output: JSON til stdout med title, author, date, content, url, paywall // +// Med --write: opprett content-node i PG med artikkelinnhold, +// tagged-edge "clipped", AI-oppsummering via LiteLLM, og +// mentions-edges til gjenkjente entiteter i kunnskapsgrafen. +// // Strategi: // 1. Hent HTML med reqwest (vanlig HTTP-klient) // 2. Parse med Mozilla Readability via Node.js-hjelpeskript // 3. Hvis Readability feiler eller innholdet er for kort, prøv Playwright (headless browser) // 4. Detekter betalingsmur basert på innholdslengde og kjente mønstre +// 5. 
(--write) Opprett node, edges, AI-oppsummering // // Miljøvariabler: // SYNOPS_CLIP_SCRIPTS — Sti til scripts/-mappen (default: ved siden av binæren) +// DATABASE_URL — PostgreSQL-tilkobling (påkrevd med --write) +// AI_GATEWAY_URL — LiteLLM gateway (default: http://localhost:4000) +// LITELLM_MASTER_KEY — API-nøkkel for LiteLLM +// AI_CLIP_MODEL — Modellalias for oppsummering (default: sidelinja/rutine) // RUST_LOG — Loggnivå (default: synops_clip=info) // // Ref: docs/retninger/unix_filosofi.md @@ -21,6 +30,7 @@ use serde::{Deserialize, Serialize}; use std::path::PathBuf; use std::process; use tracing::{debug, info, warn}; +use uuid::Uuid; /// Hent og parse webartikler til ren tekst + metadata. #[derive(Parser)] @@ -38,6 +48,14 @@ struct Cli { #[arg(long)] playwright: bool, + /// Skriv resultat til database som content-node med edges + #[arg(long)] + write: bool, + + /// Bruker-ID som clipper (påkrevd med --write) + #[arg(long)] + created_by: Option, + /// Jobb-payload som JSON (for maskinrommet/jobbkø) #[arg(long)] payload_json: Option, @@ -65,6 +83,9 @@ struct JobPayload { playwright: bool, #[serde(default = "default_timeout")] timeout: u64, + #[serde(default)] + write: bool, + created_by: Option, } fn default_timeout() -> u64 { @@ -72,7 +93,7 @@ fn default_timeout() -> u64 { } /// Endelig output-format. 
-#[derive(Serialize)] +#[derive(Serialize, Clone)] struct ClipOutput { url: String, title: Option, @@ -84,6 +105,101 @@ struct ClipOutput { source: String, } +// --- LLM request/response (OpenAI-kompatibel, samme mønster som synops-suggest-edges) --- + +#[derive(Serialize)] +struct ChatRequest { + model: String, + messages: Vec, + temperature: f32, + response_format: ResponseFormat, +} + +#[derive(Serialize)] +struct ResponseFormat { + r#type: String, +} + +#[derive(Serialize)] +struct ChatMessage { + role: String, + content: String, +} + +#[derive(Deserialize)] +struct ChatResponse { + choices: Vec, + #[serde(default)] + usage: Option, + #[serde(default)] + model: Option, +} + +#[derive(Deserialize, Clone)] +struct UsageInfo { + #[serde(default)] + prompt_tokens: i64, + #[serde(default)] + completion_tokens: i64, +} + +#[derive(Deserialize)] +struct Choice { + message: MessageContent, +} + +#[derive(Deserialize)] +struct MessageContent { + content: Option, +} + +/// AI-analysens output: oppsummering + entiteter. +#[derive(Deserialize, Debug)] +struct AiAnalysis { + #[serde(default)] + summary: String, + #[serde(default)] + topics: Vec, + #[serde(default)] + mentions: Vec, +} + +#[derive(Deserialize, Debug)] +struct MentionSuggestion { + name: String, + #[serde(default = "default_entity_type")] + entity_type: String, +} + +fn default_entity_type() -> String { + "person".to_string() +} + +#[derive(sqlx::FromRow)] +struct TopicRow { + id: Uuid, + title: String, +} + +const CLIP_SYSTEM_PROMPT: &str = r#"Du er en innholdsanalysator for en norsk redaksjonsplattform. Du skal: + +1. Lage en kort oppsummering (2–4 setninger) av artikkelen på norsk. +2. Ekstrahere topics (emner/temaer). Bruk korte, presise norske termer. Maks 5. +3. Ekstrahere mentions (navngitte entiteter: personer, organisasjoner, steder). 
+ +Returner KUN et JSON-objekt: +{ + "summary": "Kort oppsummering av artikkelen.", + "topics": ["emne1", "emne2"], + "mentions": [{"name": "Navn", "entity_type": "person"}] +} + +Regler: +- Oppsummeringen skal fange artikkelens kjerne og være nyttig uten å lese originalen. +- Bruk eksisterende topics fra listen nedenfor der det passer. +- entity_type er en av: "person", "organisasjon", "sted", "konsept". +- Returner tomme lister hvis teksten ikke har meningsfullt innhold."#; + #[tokio::main] async fn main() { synops_common::logging::init("synops_clip"); @@ -91,21 +207,46 @@ async fn main() { let cli = Cli::parse(); // Støtt payload-json fra jobbkø - let (url, use_playwright, timeout) = if let Some(ref payload) = cli.payload_json { - match serde_json::from_str::(payload) { - Ok(p) => (p.url, p.playwright, p.timeout), - Err(e) => { - eprintln!("Ugyldig payload-json: {e}"); - process::exit(1); + let (url, use_playwright, timeout, write, created_by) = + if let Some(ref payload) = cli.payload_json { + match serde_json::from_str::(payload) { + Ok(p) => (p.url, p.playwright, p.timeout, p.write, p.created_by), + Err(e) => { + eprintln!("Ugyldig payload-json: {e}"); + process::exit(1); + } } - } - } else { - (cli.url.clone(), cli.playwright, cli.timeout) - }; + } else { + ( + cli.url.clone(), + cli.playwright, + cli.timeout, + cli.write, + cli.created_by, + ) + }; + + // Valider at created_by er satt ved --write + if write && created_by.is_none() { + eprintln!("Feil: --created-by er påkrevd sammen med --write"); + process::exit(1); + } match run(&url, use_playwright, timeout).await { Ok(output) => { - println!("{}", serde_json::to_string_pretty(&output).unwrap()); + if write { + match write_to_db(&output, created_by.unwrap()).await { + Ok(result) => { + println!("{}", serde_json::to_string_pretty(&result).unwrap()); + } + Err(e) => { + eprintln!("Feil ved skriving til database: {e}"); + process::exit(1); + } + } + } else { + println!("{}", 
serde_json::to_string_pretty(&output).unwrap()); + } } Err(e) => { eprintln!("Feil: {e}"); @@ -114,6 +255,407 @@ async fn main() { } } +/// Skriv clip-resultat til databasen: content-node, tagged-edge, AI-analyse, mentions-edges. +async fn write_to_db( + output: &ClipOutput, + created_by: Uuid, +) -> Result { + let db = synops_common::db::connect().await?; + + let node_id = Uuid::now_v7(); + + // 1. AI-analyse: oppsummering + entiteter + let (analysis, ai_usage, ai_model) = match call_llm_analysis(output, &db).await { + Ok(result) => result, + Err(e) => { + warn!("AI-analyse feilet, fortsetter uten: {e}"); + ( + AiAnalysis { + summary: String::new(), + topics: vec![], + mentions: vec![], + }, + None, + None, + ) + } + }; + + // 2. Opprett content-node + let metadata = serde_json::json!({ + "source_url": output.url, + "original_author": output.author, + "original_date": output.date, + "paywall": output.paywall, + "clip_source": output.source, + }); + + // Bruk AI-oppsummering som innhold om den finnes, ellers bruk excerpt + let node_content = if !analysis.summary.is_empty() { + format!("{}\n\n---\n\n{}", analysis.summary, output.content) + } else { + output.content.clone() + }; + + let title = output + .title + .clone() + .unwrap_or_else(|| format!("Clip: {}", truncate_str(&output.url, 60))); + + sqlx::query( + r#" + INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by) + VALUES ($1, 'content', $2, $3, 'hidden'::visibility, $4, $5) + "#, + ) + .bind(node_id) + .bind(&title) + .bind(&node_content) + .bind(&metadata) + .bind(created_by) + .execute(&db) + .await + .map_err(|e| format!("PG insert node feilet: {e}"))?; + + info!(node_id = %node_id, title = %title, "Content-node opprettet"); + + // 3. 
Opprett tagged-edge "clipped" + let tagged_edge_id = Uuid::now_v7(); + sqlx::query( + r#" + INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by) + VALUES ($1, $2, $2, 'tagged', $3, false, $4) + ON CONFLICT (source_id, target_id, edge_type) DO NOTHING + "#, + ) + .bind(tagged_edge_id) + .bind(node_id) + .bind(serde_json::json!({"tag": "clipped"})) + .bind(created_by) + .execute(&db) + .await + .map_err(|e| format!("PG insert tagged-edge feilet: {e}"))?; + + info!(node_id = %node_id, "Tagged-edge 'clipped' opprettet"); + + // 4. Opprett mentions-edges til gjenkjente entiteter + let mut topics_created = 0u32; + let mut edges_created = 0u32; + + // Hent eksisterende topic-noder for matching + let existing_topics = sqlx::query_as::<_, TopicRow>( + "SELECT id, title FROM nodes WHERE node_kind = 'topic' ORDER BY created_at DESC LIMIT 200", + ) + .fetch_all(&db) + .await + .map_err(|e| format!("PG-feil ved henting av topics: {e}"))?; + + // Prosesser topics fra AI-analyse + for topic_name in &analysis.topics { + let topic_name = topic_name.trim(); + if topic_name.is_empty() { + continue; + } + + let existing = existing_topics + .iter() + .find(|t| t.title.to_lowercase() == topic_name.to_lowercase()); + + let topic_id = if let Some(t) = existing { + t.id + } else { + let new_id = Uuid::now_v7(); + create_topic_node(&db, new_id, topic_name, created_by).await?; + topics_created += 1; + new_id + }; + + if create_mentions_edge(&db, node_id, topic_id, created_by).await? 
{ + edges_created += 1; + } + } + + // Prosesser mentions (navngitte entiteter) + for mention in &analysis.mentions { + let name = mention.name.trim(); + if name.is_empty() { + continue; + } + + let existing_entity = sqlx::query_scalar::<_, Uuid>( + "SELECT id FROM nodes WHERE node_kind = 'topic' AND LOWER(title) = LOWER($1) LIMIT 1", + ) + .bind(name) + .fetch_optional(&db) + .await + .map_err(|e| format!("PG-feil ved entitet-søk: {e}"))?; + + let entity_id = if let Some(id) = existing_entity { + id + } else { + let new_id = Uuid::now_v7(); + create_entity_node(&db, new_id, name, &mention.entity_type, created_by).await?; + topics_created += 1; + new_id + }; + + if create_mentions_edge(&db, node_id, entity_id, created_by).await? { + edges_created += 1; + } + } + + info!( + node_id = %node_id, + topics_created, + edges_created, + "Clip skrevet til database" + ); + + // 5. Logg AI-ressursforbruk + let tokens_in = ai_usage.as_ref().map(|u| u.prompt_tokens).unwrap_or(0); + let tokens_out = ai_usage.as_ref().map(|u| u.completion_tokens).unwrap_or(0); + let model_id = ai_model.unwrap_or_else(|| "unknown".to_string()); + + if tokens_in > 0 || tokens_out > 0 { + if let Err(e) = sqlx::query( + "INSERT INTO resource_usage_log (target_node_id, triggered_by, resource_type, detail) + VALUES ($1, $2, $3, $4)", + ) + .bind(node_id) + .bind(Some(created_by)) + .bind("ai") + .bind(serde_json::json!({ + "model_id": model_id, + "tokens_in": tokens_in, + "tokens_out": tokens_out, + "job_type": "synops_clip" + })) + .execute(&db) + .await + { + warn!(error = %e, "Kunne ikke logge AI-ressursforbruk"); + } + } + + Ok(serde_json::json!({ + "status": "completed", + "node_id": node_id.to_string(), + "url": output.url, + "title": title, + "summary": analysis.summary, + "content_length": output.content.len(), + "paywall": output.paywall, + "topics_created": topics_created, + "edges_created": edges_created, + "model": model_id, + "tokens_in": tokens_in, + "tokens_out": tokens_out, + })) +} + 
+/// Kall LiteLLM for oppsummering og entitetsekstraksjon. +async fn call_llm_analysis( + output: &ClipOutput, + db: &sqlx::PgPool, +) -> Result<(AiAnalysis, Option, Option), String> { + let gateway_url = + std::env::var("AI_GATEWAY_URL").unwrap_or_else(|_| "http://localhost:4000".to_string()); + let api_key = std::env::var("LITELLM_MASTER_KEY").unwrap_or_default(); + let model = + std::env::var("AI_CLIP_MODEL").unwrap_or_else(|_| "sidelinja/rutine".to_string()); + + // Hent eksisterende topics for kontekst + let existing_topics = sqlx::query_scalar::<_, String>( + "SELECT title FROM nodes WHERE node_kind = 'topic' ORDER BY created_at DESC LIMIT 100", + ) + .fetch_all(db) + .await + .map_err(|e| format!("PG-feil ved henting av topics: {e}"))?; + + let topic_context = if existing_topics.is_empty() { + String::new() + } else { + format!("\n\nEksisterende topics: {}", existing_topics.join(", ")) + }; + + // Begrens innholdslengde for LLM (maks ~4000 tegn) + let content_for_llm = truncate_str(&output.content, 4000); + + let user_content = format!( + "Artikkel:\nTittel: {}\nForfatter: {}\n\n{}{topic_context}", + output.title.as_deref().unwrap_or("(ukjent)"), + output.author.as_deref().unwrap_or("(ukjent)"), + content_for_llm, + ); + + info!("Sender til LLM for oppsummering og entitetsekstraksjon"); + + let request = ChatRequest { + model, + messages: vec![ + ChatMessage { + role: "system".to_string(), + content: CLIP_SYSTEM_PROMPT.to_string(), + }, + ChatMessage { + role: "user".to_string(), + content: user_content, + }, + ], + temperature: 0.2, + response_format: ResponseFormat { + r#type: "json_object".to_string(), + }, + }; + + let client = reqwest::Client::new(); + let url = format!("{gateway_url}/v1/chat/completions"); + + let resp = client + .post(&url) + .header("Authorization", format!("Bearer {api_key}")) + .header("Content-Type", "application/json") + .json(&request) + .timeout(std::time::Duration::from_secs(60)) + .send() + .await + .map_err(|e| 
format!("LiteLLM-kall feilet: {e}"))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + return Err(format!("LiteLLM returnerte {status}: {body}")); + } + + let chat_resp: ChatResponse = resp + .json() + .await + .map_err(|e| format!("Kunne ikke parse LiteLLM-respons: {e}"))?; + + let content = chat_resp + .choices + .first() + .and_then(|c| c.message.content.as_deref()) + .ok_or("LiteLLM returnerte ingen content")?; + + let analysis: AiAnalysis = serde_json::from_str(content) + .map_err(|e| format!("Kunne ikke parse LLM JSON: {e}. Rå output: {content}"))?; + + info!( + summary_len = analysis.summary.len(), + topics = analysis.topics.len(), + mentions = analysis.mentions.len(), + "LLM-analyse mottatt" + ); + + Ok((analysis, chat_resp.usage, chat_resp.model)) +} + +/// Opprett en topic-node. +async fn create_topic_node( + db: &sqlx::PgPool, + id: Uuid, + title: &str, + created_by: Uuid, +) -> Result<(), String> { + let metadata = serde_json::json!({"ai_generated": true}); + + sqlx::query( + r#" + INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by) + VALUES ($1, 'topic', $2, '', 'discoverable', $3, $4) + ON CONFLICT (id) DO NOTHING + "#, + ) + .bind(id) + .bind(title) + .bind(&metadata) + .bind(created_by) + .execute(db) + .await + .map_err(|e| format!("PG insert topic feilet: {e}"))?; + + info!(topic_id = %id, title = %title, "Ny topic-node opprettet"); + Ok(()) +} + +/// Opprett en entitet-node (person, organisasjon, sted, konsept). 
+async fn create_entity_node( + db: &sqlx::PgPool, + id: Uuid, + name: &str, + entity_type: &str, + created_by: Uuid, +) -> Result<(), String> { + let metadata = serde_json::json!({ + "ai_generated": true, + "entity_type": entity_type + }); + + sqlx::query( + r#" + INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by) + VALUES ($1, 'topic', $2, '', 'discoverable', $3, $4) + ON CONFLICT (id) DO NOTHING + "#, + ) + .bind(id) + .bind(name) + .bind(&metadata) + .bind(created_by) + .execute(db) + .await + .map_err(|e| format!("PG insert entity feilet: {e}"))?; + + info!(entity_id = %id, name = %name, entity_type = %entity_type, "Ny entitet-node opprettet"); + Ok(()) +} + +/// Opprett en mentions-edge. Returnerer true hvis ny edge ble opprettet. +async fn create_mentions_edge( + db: &sqlx::PgPool, + source_id: Uuid, + target_id: Uuid, + created_by: Uuid, +) -> Result { + let edge_id = Uuid::now_v7(); + let metadata = serde_json::json!({"origin": "ai_clip"}); + + let result = sqlx::query( + r#" + INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by) + VALUES ($1, $2, $3, 'mentions', $4, false, $5) + ON CONFLICT (source_id, target_id, edge_type) DO NOTHING + "#, + ) + .bind(edge_id) + .bind(source_id) + .bind(target_id) + .bind(&metadata) + .bind(created_by) + .execute(db) + .await + .map_err(|e| format!("PG insert mentions-edge feilet: {e}"))?; + + let created = result.rows_affected() > 0; + if created { + info!(source = %source_id, target = %target_id, "Mentions-edge opprettet"); + } + Ok(created) +} + +/// Avkort streng til maks lengde uten å bryte midt i et ord. 
/// Returns a prefix of `s` that is at most `max_len` **bytes** long, cut back to the last
/// space inside that window so a word is not split.
///
/// * If `s` already fits, it is returned unchanged.
/// * If the window contains a space, the result ends just before the last one (so it can
///   be empty when that space is the very first byte).
/// * If the window contains no space, the window itself is returned.
///
/// The cut never lands inside a multi-byte UTF-8 character: when `max_len` is not a
/// character boundary the window is shrunk to the nearest boundary below it. Slicing at
/// the raw byte offset would panic on text such as Norwegian "æ", "ø" or "å".
fn truncate_str(s: &str, max_len: usize) -> &str {
    if s.len() <= max_len {
        return s;
    }

    // Back off to a valid char boundary; index 0 is always one, so this terminates.
    let mut end = max_len;
    while !s.is_char_boundary(end) {
        end -= 1;
    }

    let window = &s[..end];
    match window.rfind(' ') {
        // A space is a single ASCII byte, so `pos` is itself a char boundary.
        Some(pos) => &window[..pos],
        None => window,
    }
}