// synops-clip — Fetch and parse web articles into plain text + metadata.
//
// Input:  --url
// Output: JSON on stdout with title, author, date, content, url, paywall
//
// With --write: create a content node in PG with the article body,
// a "clipped" tagged-edge, an AI summary via LiteLLM, and
// mentions-edges to recognized entities in the knowledge graph.
//
// Strategy:
// 1. Fetch HTML with reqwest (plain HTTP client)
// 2. Parse with Mozilla Readability via a Node.js helper script
// 3. If Readability fails or the content is too short, try Playwright (headless browser)
// 4. Detect paywalls based on content length and known patterns
// 5. (--write) Create node, edges, AI summary
//
// Environment variables:
//   SYNOPS_CLIP_SCRIPTS — Path to the scripts/ directory (default: next to the binary)
//   DATABASE_URL        — PostgreSQL connection (required with --write)
//   AI_GATEWAY_URL      — LiteLLM gateway (default: http://localhost:4000)
//   LITELLM_MASTER_KEY  — API key for LiteLLM
//   AI_CLIP_MODEL       — Model alias for summarization (default: synops/low)
//   RUST_LOG            — Log level (default: synops_clip=info)
//
// Ref: docs/retninger/unix_filosofi.md

use clap::Parser;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::process;
use tracing::{debug, info, warn};
use uuid::Uuid;

// NOTE: the `///` doc comments on Cli and its fields double as clap's runtime
// help text, so they are deliberately left in Norwegian (user-facing).
/// Hent og parse webartikler til ren tekst + metadata.
#[derive(Parser)]
#[command(name = "synops-clip", about = "Hent og parse webartikler")]
struct Cli {
    /// URL som skal hentes og parses
    #[arg(long)]
    url: String,
    /// Timeout i sekunder for HTTP-forespørsler
    #[arg(long, default_value = "30")]
    timeout: u64,
    /// Tving bruk av Playwright (headless browser) i stedet for vanlig HTTP
    #[arg(long)]
    playwright: bool,
    /// Skriv resultat til database som content-node med edges
    #[arg(long)]
    write: bool,
    /// Bruker-ID som clipper (påkrevd med --write)
    #[arg(long)]
    created_by: Option<Uuid>,
    /// Jobb-payload som JSON (for maskinrommet/jobbkø)
    #[arg(long)]
    payload_json: Option<String>,
}

/// Result of Readability parsing (produced by the Node.js helper scripts).
#[derive(Deserialize)]
#[allow(dead_code)]
struct ReadabilityResult {
    title: Option<String>,
    author: Option<String>,
    content: Option<String>,
    excerpt: Option<String>,
    #[serde(rename = "siteName")]
    site_name: Option<String>,
    length: Option<i64>,
    // Set by the script when extraction failed; checked by run().
    error: Option<String>,
}

/// Payload from the job queue (alternative to individual CLI flags).
#[derive(Deserialize)]
struct JobPayload {
    url: String,
    #[serde(default)]
    playwright: bool,
    #[serde(default = "default_timeout")]
    timeout: u64,
    #[serde(default)]
    write: bool,
    created_by: Option<Uuid>,
}

// Default HTTP timeout in seconds (mirrors the CLI default).
fn default_timeout() -> u64 {
    30
}

/// Final output format printed as JSON to stdout.
#[derive(Serialize, Clone)]
struct ClipOutput {
    url: String,
    title: Option<String>,
    author: Option<String>,
    date: Option<String>,
    content: String,
    excerpt: Option<String>,
    paywall: bool,
    // Which pipeline produced the content: "readability" or "playwright".
    source: String,
}

// --- LLM request/response (OpenAI-compatible, same pattern as synops-suggest-edges) ---

#[derive(Serialize)]
struct ChatRequest {
    model: String,
    messages: Vec<ChatMessage>,
    temperature: f32,
    response_format: ResponseFormat,
}

#[derive(Serialize)]
struct ResponseFormat {
    r#type: String,
}

#[derive(Serialize)]
struct ChatMessage {
    role: String,
    content: String,
}

#[derive(Deserialize)]
struct ChatResponse {
    choices: Vec<Choice>,
    #[serde(default)]
    usage: Option<UsageInfo>,
    #[serde(default)]
    model: Option<String>,
}

#[derive(Deserialize, Clone)]
struct UsageInfo {
    #[serde(default)]
    prompt_tokens: i64,
    #[serde(default)]
    completion_tokens: i64,
}

#[derive(Deserialize)]
struct Choice {
    message: MessageContent,
}

#[derive(Deserialize)]
struct MessageContent {
    content: Option<String>,
}

/// Output of the AI analysis: summary + topics + named entities.
#[derive(Deserialize, Debug)]
struct AiAnalysis {
    #[serde(default)]
    summary: String,
    #[serde(default)]
    topics: Vec<String>,
    #[serde(default)]
    mentions: Vec<MentionSuggestion>,
}

#[derive(Deserialize, Debug)]
struct MentionSuggestion {
    name: String,
    #[serde(default = "default_entity_type")]
    entity_type: String,
}

fn default_entity_type() -> String {
    "person".to_string()
}

/// Row shape for the existing-topics lookup in write_to_db().
#[derive(sqlx::FromRow)]
struct TopicRow {
    id: Uuid,
    title: String,
}

// System prompt sent to the LLM. Runtime string — kept verbatim in Norwegian.
const CLIP_SYSTEM_PROMPT: &str = r#"Du er en innholdsanalysator for en norsk redaksjonsplattform.

Du skal:
1. Lage en kort oppsummering (2–4 setninger) av artikkelen på norsk.
2. Ekstrahere topics (emner/temaer). Bruk korte, presise norske termer. Maks 5.
3. Ekstrahere mentions (navngitte entiteter: personer, organisasjoner, steder).

Returner KUN et JSON-objekt:
{
  "summary": "Kort oppsummering av artikkelen.",
  "topics": ["emne1", "emne2"],
  "mentions": [{"name": "Navn", "entity_type": "person"}]
}

Regler:
- Oppsummeringen skal fange artikkelens kjerne og være nyttig uten å lese originalen.
- Bruk eksisterende topics fra listen nedenfor der det passer.
- entity_type er en av: "person", "organisasjon", "sted", "konsept".
- Returner tomme lister hvis teksten ikke har meningsfullt innhold."#;

#[tokio::main]
async fn main() {
    synops_common::logging::init("synops_clip");
    let cli = Cli::parse();

    // Support --payload-json from the job queue; when present it overrides
    // the individual flags entirely.
    let (url, use_playwright, timeout, write, created_by) = if let Some(ref payload) = cli.payload_json {
        match serde_json::from_str::<JobPayload>(payload) {
            Ok(p) => (p.url, p.playwright, p.timeout, p.write, p.created_by),
            Err(e) => {
                eprintln!("Ugyldig payload-json: {e}");
                process::exit(1);
            }
        }
    } else {
        (
            cli.url.clone(),
            cli.playwright,
            cli.timeout,
            cli.write,
            cli.created_by,
        )
    };

    // --write needs an owner for the created node/edges.
    if write && created_by.is_none() {
        eprintln!("Feil: --created-by er påkrevd sammen med --write");
        process::exit(1);
    }

    match run(&url, use_playwright, timeout).await {
        Ok(output) => {
            if write {
                // unwrap() is safe: validated above.
                match write_to_db(&output, created_by.unwrap()).await {
                    Ok(result) => {
                        println!("{}", serde_json::to_string_pretty(&result).unwrap());
                    }
                    Err(e) => {
                        eprintln!("Feil ved skriving til database: {e}");
                        process::exit(1);
                    }
                }
            } else {
                println!("{}", serde_json::to_string_pretty(&output).unwrap());
            }
        }
        Err(e) => {
            eprintln!("Feil: {e}");
            process::exit(1);
        }
    }
}

/// Write the clip result to the database: content node, tagged-edge,
/// AI analysis, and mentions-edges.
///
/// Returns a JSON status object (node id, counts, token usage) on success.
/// The AI analysis is best-effort: on failure the clip is still stored,
/// just without summary/topics/mentions.
async fn write_to_db(
    output: &ClipOutput,
    created_by: Uuid,
) -> Result<serde_json::Value, String> {
    let db = synops_common::db::connect().await?;
    let node_id = Uuid::now_v7();

    // 1. AI analysis: summary + entities. Non-fatal — fall back to empty analysis.
    let (analysis, ai_usage, ai_model) = match call_llm_analysis(output, &db).await {
        Ok(result) => result,
        Err(e) => {
            warn!("AI-analyse feilet, fortsetter uten: {e}");
            (
                AiAnalysis {
                    summary: String::new(),
                    topics: vec![],
                    mentions: vec![],
                },
                None,
                None,
            )
        }
    };

    // 2. Create the content node.
    let metadata = serde_json::json!({
        "source_url": output.url,
        "original_author": output.author,
        "original_date": output.date,
        "paywall": output.paywall,
        "clip_source": output.source,
    });

    // Prepend the AI summary to the article content when one was produced;
    // otherwise store the content as-is.
    let node_content = if !analysis.summary.is_empty() {
        format!("{}\n\n---\n\n{}", analysis.summary, output.content)
    } else {
        output.content.clone()
    };

    let title = output
        .title
        .clone()
        .unwrap_or_else(|| format!("Clip: {}", truncate_str(&output.url, 60)));

    sqlx::query(
        r#"
        INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
        VALUES ($1, 'content', $2, $3, 'hidden'::visibility, $4, $5)
        "#,
    )
    .bind(node_id)
    .bind(&title)
    .bind(&node_content)
    .bind(&metadata)
    .bind(created_by)
    .execute(&db)
    .await
    .map_err(|e| format!("PG insert node feilet: {e}"))?;

    info!(node_id = %node_id, title = %title, "Content-node opprettet");

    // 3. Create the "clipped" tagged-edge. Note: $2 is bound once and used for
    // both source_id and target_id — a deliberate self-edge carrying the tag.
    let tagged_edge_id = Uuid::now_v7();
    sqlx::query(
        r#"
        INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by)
        VALUES ($1, $2, $2, 'tagged', $3, false, $4)
        ON CONFLICT (source_id, target_id, edge_type) DO NOTHING
        "#,
    )
    .bind(tagged_edge_id)
    .bind(node_id)
    .bind(serde_json::json!({"tag": "clipped"}))
    .bind(created_by)
    .execute(&db)
    .await
    .map_err(|e| format!("PG insert tagged-edge feilet: {e}"))?;

    info!(node_id = %node_id, "Tagged-edge 'clipped' opprettet");

    // 4. Create mentions-edges to recognized entities.
    let mut topics_created = 0u32;
    let mut edges_created = 0u32;

    // Fetch existing topic nodes for case-insensitive matching.
    let existing_topics = sqlx::query_as::<_, TopicRow>(
        "SELECT id, title FROM nodes WHERE node_kind = 'topic' ORDER BY created_at DESC LIMIT 200",
    )
    .fetch_all(&db)
    .await
    .map_err(|e| format!("PG-feil ved henting av topics: {e}"))?;

    // Topics from the AI analysis: reuse an existing topic node when the
    // title matches case-insensitively, otherwise create a new one.
    for topic_name in &analysis.topics {
        let topic_name = topic_name.trim();
        if topic_name.is_empty() {
            continue;
        }
        let existing = existing_topics
            .iter()
            .find(|t| t.title.to_lowercase() == topic_name.to_lowercase());
        let topic_id = if let Some(t) = existing {
            t.id
        } else {
            let new_id = Uuid::now_v7();
            create_topic_node(&db, new_id, topic_name, created_by).await?;
            topics_created += 1;
            new_id
        };
        if create_mentions_edge(&db, node_id, topic_id, created_by).await? {
            edges_created += 1;
        }
    }

    // Mentions (named entities): looked up directly in the DB rather than the
    // cached list, since new nodes may have been created above.
    for mention in &analysis.mentions {
        let name = mention.name.trim();
        if name.is_empty() {
            continue;
        }
        let existing_entity = sqlx::query_scalar::<_, Uuid>(
            "SELECT id FROM nodes WHERE node_kind = 'topic' AND LOWER(title) = LOWER($1) LIMIT 1",
        )
        .bind(name)
        .fetch_optional(&db)
        .await
        .map_err(|e| format!("PG-feil ved entitet-søk: {e}"))?;
        let entity_id = if let Some(id) = existing_entity {
            id
        } else {
            let new_id = Uuid::now_v7();
            create_entity_node(&db, new_id, name, &mention.entity_type, created_by).await?;
            // Entities are counted together with topics in topics_created.
            topics_created += 1;
            new_id
        };
        if create_mentions_edge(&db, node_id, entity_id, created_by).await? {
            edges_created += 1;
        }
    }

    info!(
        node_id = %node_id,
        topics_created,
        edges_created,
        "Clip skrevet til database"
    );

    // 5. Log AI resource usage (best effort — a failure here only warns).
    let tokens_in = ai_usage.as_ref().map(|u| u.prompt_tokens).unwrap_or(0);
    let tokens_out = ai_usage.as_ref().map(|u| u.completion_tokens).unwrap_or(0);
    let model_id = ai_model.unwrap_or_else(|| "unknown".to_string());
    if tokens_in > 0 || tokens_out > 0 {
        if let Err(e) = sqlx::query(
            "INSERT INTO resource_usage_log (target_node_id, triggered_by, resource_type, detail) VALUES ($1, $2, $3, $4)",
        )
        .bind(node_id)
        .bind(Some(created_by))
        .bind("ai")
        .bind(serde_json::json!({
            "model_id": model_id,
            "tokens_in": tokens_in,
            "tokens_out": tokens_out,
            "job_type": "synops_clip"
        }))
        .execute(&db)
        .await
        {
            warn!(error = %e, "Kunne ikke logge AI-ressursforbruk");
        }
    }

    Ok(serde_json::json!({
        "status": "completed",
        "node_id": node_id.to_string(),
        "url": output.url,
        "title": title,
        "summary": analysis.summary,
        "content_length": output.content.len(),
        "paywall": output.paywall,
        "topics_created": topics_created,
        "edges_created": edges_created,
        "model": model_id,
        "tokens_in": tokens_in,
        "tokens_out": tokens_out,
    }))
}

/// Call LiteLLM for summarization and entity extraction.
async fn call_llm_analysis(
    output: &ClipOutput,
    db: &sqlx::PgPool,
) -> Result<(AiAnalysis, Option<UsageInfo>, Option<String>), String> {
    // Gateway configuration via environment, with local-dev defaults.
    let gateway_url =
        std::env::var("AI_GATEWAY_URL").unwrap_or_else(|_| "http://localhost:4000".to_string());
    let api_key = std::env::var("LITELLM_MASTER_KEY").unwrap_or_default();
    let model = std::env::var("AI_CLIP_MODEL").unwrap_or_else(|_| "synops/low".to_string());

    // Fetch existing topics so the model can reuse them instead of inventing
    // near-duplicate names.
    let existing_topics = sqlx::query_scalar::<_, String>(
        "SELECT title FROM nodes WHERE node_kind = 'topic' ORDER BY created_at DESC LIMIT 100",
    )
    .fetch_all(db)
    .await
    .map_err(|e| format!("PG-feil ved henting av topics: {e}"))?;

    let topic_context = if existing_topics.is_empty() {
        String::new()
    } else {
        format!("\n\nEksisterende topics: {}", existing_topics.join(", "))
    };

    // Cap the article length sent to the LLM (max ~4000 chars).
    let content_for_llm = truncate_str(&output.content, 4000);
    let user_content = format!(
        "Artikkel:\nTittel: {}\nForfatter: {}\n\n{}{topic_context}",
        output.title.as_deref().unwrap_or("(ukjent)"),
        output.author.as_deref().unwrap_or("(ukjent)"),
        content_for_llm,
    );

    info!("Sender til LLM for oppsummering og entitetsekstraksjon");

    let request = ChatRequest {
        model,
        messages: vec![
            ChatMessage {
                role: "system".to_string(),
                content: CLIP_SYSTEM_PROMPT.to_string(),
            },
            ChatMessage {
                role: "user".to_string(),
                content: user_content,
            },
        ],
        // Low temperature: we want deterministic extraction, not creativity.
        temperature: 0.2,
        // Ask the gateway for guaranteed-JSON output.
        response_format: ResponseFormat {
            r#type: "json_object".to_string(),
        },
    };

    let client = reqwest::Client::new();
    let url = format!("{gateway_url}/v1/chat/completions");
    let resp = client
        .post(&url)
        .header("Authorization", format!("Bearer {api_key}"))
        .header("Content-Type", "application/json")
        .json(&request)
        .timeout(std::time::Duration::from_secs(60))
        .send()
        .await
        .map_err(|e| format!("LiteLLM-kall feilet: {e}"))?;

    if !resp.status().is_success() {
        let status = resp.status();
        let body = resp.text().await.unwrap_or_default();
        return Err(format!("LiteLLM returnerte {status}: {body}"));
    }

    let chat_resp: ChatResponse = resp
        .json()
        .await
        .map_err(|e| format!("Kunne ikke parse LiteLLM-respons: {e}"))?;

    let content = chat_resp
        .choices
        .first()
        .and_then(|c| c.message.content.as_deref())
        .ok_or("LiteLLM returnerte ingen content")?;

    // The model was instructed to return a bare JSON object; parse it into
    // the AiAnalysis shape (all fields default to empty via serde).
    let analysis: AiAnalysis = serde_json::from_str(content)
        .map_err(|e| format!("Kunne ikke parse LLM JSON: {e}. Rå output: {content}"))?;

    info!(
        summary_len = analysis.summary.len(),
        topics = analysis.topics.len(),
        mentions = analysis.mentions.len(),
        "LLM-analyse mottatt"
    );

    Ok((analysis, chat_resp.usage, chat_resp.model))
}

/// Create a topic node. Idempotent on id (ON CONFLICT DO NOTHING).
async fn create_topic_node(
    db: &sqlx::PgPool,
    id: Uuid,
    title: &str,
    created_by: Uuid,
) -> Result<(), String> {
    let metadata = serde_json::json!({"ai_generated": true});
    sqlx::query(
        r#"
        INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
        VALUES ($1, 'topic', $2, '', 'discoverable', $3, $4)
        ON CONFLICT (id) DO NOTHING
        "#,
    )
    .bind(id)
    .bind(title)
    .bind(&metadata)
    .bind(created_by)
    .execute(db)
    .await
    .map_err(|e| format!("PG insert topic feilet: {e}"))?;
    info!(topic_id = %id, title = %title, "Ny topic-node opprettet");
    Ok(())
}

/// Create an entity node (person, organization, place, concept).
/// Stored with node_kind 'topic'; the entity type lives in metadata.
async fn create_entity_node(
    db: &sqlx::PgPool,
    id: Uuid,
    name: &str,
    entity_type: &str,
    created_by: Uuid,
) -> Result<(), String> {
    let metadata = serde_json::json!({
        "ai_generated": true,
        "entity_type": entity_type
    });
    sqlx::query(
        r#"
        INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
        VALUES ($1, 'topic', $2, '', 'discoverable', $3, $4)
        ON CONFLICT (id) DO NOTHING
        "#,
    )
    .bind(id)
    .bind(name)
    .bind(&metadata)
    .bind(created_by)
    .execute(db)
    .await
    .map_err(|e| format!("PG insert entity feilet: {e}"))?;
    info!(entity_id = %id, name = %name, entity_type = %entity_type, "Ny entitet-node opprettet");
    Ok(())
}

/// Create a mentions-edge. Returns true if a new edge was created.
async fn create_mentions_edge( db: &sqlx::PgPool, source_id: Uuid, target_id: Uuid, created_by: Uuid, ) -> Result { let edge_id = Uuid::now_v7(); let metadata = serde_json::json!({"origin": "ai_clip"}); let result = sqlx::query( r#" INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by) VALUES ($1, $2, $3, 'mentions', $4, false, $5) ON CONFLICT (source_id, target_id, edge_type) DO NOTHING "#, ) .bind(edge_id) .bind(source_id) .bind(target_id) .bind(&metadata) .bind(created_by) .execute(db) .await .map_err(|e| format!("PG insert mentions-edge feilet: {e}"))?; let created = result.rows_affected() > 0; if created { info!(source = %source_id, target = %target_id, "Mentions-edge opprettet"); } Ok(created) } /// Avkort streng til maks lengde uten å bryte midt i et ord. fn truncate_str(s: &str, max_len: usize) -> &str { if s.len() <= max_len { return s; } // Finn siste mellomrom før grensen match s[..max_len].rfind(' ') { Some(pos) => &s[..pos], None => &s[..max_len], } } async fn run(url: &str, use_playwright: bool, timeout: u64) -> Result { let scripts_dir = find_scripts_dir()?; info!(url, "Starter clip"); // Steg 1: Hent HTML let html = if use_playwright { info!("Bruker Playwright for å hente side"); fetch_with_playwright(&scripts_dir, url, timeout).await? } else { match fetch_html(url, timeout).await { Ok(html) => html, Err(e) => { warn!("HTTP-henting feilet: {e}, prøver Playwright"); fetch_with_playwright(&scripts_dir, url, timeout).await? 
} } }; // Steg 2: Parse med Readability let parsed = parse_with_readability(&scripts_dir, url, &html).await?; // Steg 3: Hvis Readability feilet, prøv Playwright (om vi ikke allerede brukte det) let (parsed, source) = if parsed.error.is_some() && !use_playwright { warn!("Readability feilet, prøver Playwright-fallback"); let result = run_playwright_readability(&scripts_dir, url, timeout).await?; (result, "playwright") } else if parsed.error.is_some() { return Err(format!( "Readability feilet: {}", parsed.error.unwrap_or_default() )); } else { (parsed, "readability") }; let content = parsed.content.unwrap_or_default(); let content = clean_text(&content); // Steg 4: Forsøk å finne dato fra HTML let date = extract_date(&html); // Steg 5: Detekter betalingsmur let paywall = detect_paywall(&content, &html); if paywall { info!("Betalingsmur detektert"); } Ok(ClipOutput { url: url.to_string(), title: parsed.title, author: parsed.author, date, content, excerpt: parsed.excerpt, paywall, source: source.to_string(), }) } /// Finn scripts/-mappen relativt til binæren eller via miljøvariabel. fn find_scripts_dir() -> Result { if let Ok(dir) = std::env::var("SYNOPS_CLIP_SCRIPTS") { let p = PathBuf::from(dir); if p.exists() { return Ok(p); } } // Prøv relativt til binæren: ../scripts/ (når installert) eller // fra kilde: tools/synops-clip/scripts/ let exe = std::env::current_exe().map_err(|e| format!("Fant ikke exe-sti: {e}"))?; let exe_dir = exe.parent().ok_or("Ingen parent-mappe for exe")?; // I utvikling: target/release/ → ../../scripts/ for candidate in &[ exe_dir.join("../../scripts"), exe_dir.join("../scripts"), exe_dir.join("scripts"), PathBuf::from("/home/vegard/synops/tools/synops-clip/scripts"), ] { let resolved = candidate .canonicalize() .unwrap_or_else(|_| candidate.clone()); if resolved.join("readability.mjs").exists() { debug!(?resolved, "Fant scripts-mappe"); return Ok(resolved); } } Err("Fant ikke scripts/-mappen. 
Sett SYNOPS_CLIP_SCRIPTS.".to_string()) } /// Hent HTML med reqwest. async fn fetch_html(url: &str, timeout_secs: u64) -> Result { let client = reqwest::Client::builder() .timeout(std::time::Duration::from_secs(timeout_secs)) .user_agent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") .redirect(reqwest::redirect::Policy::limited(10)) .build() .map_err(|e| format!("Kunne ikke opprette HTTP-klient: {e}"))?; let resp = client .get(url) .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") .header("Accept-Language", "nb-NO,nb;q=0.9,no;q=0.8,en;q=0.5") .send() .await .map_err(|e| format!("HTTP-forespørsel feilet: {e}"))?; let status = resp.status(); if !status.is_success() { return Err(format!("HTTP {status}")); } resp.text() .await .map_err(|e| format!("Kunne ikke lese respons-body: {e}")) } /// Hent ferdigrendret HTML via Playwright (for JS-tunge sider). async fn fetch_with_playwright( scripts_dir: &PathBuf, url: &str, timeout: u64, ) -> Result { let output = tokio::process::Command::new("node") .arg("--input-type=module") .arg("-e") .arg(format!( r#" import {{ chromium }} from 'playwright'; const browser = await chromium.launch({{ headless: true }}); const ctx = await browser.newContext({{ userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' }}); const page = await ctx.newPage(); await page.goto({url}, {{ waitUntil: 'networkidle', timeout: {timeout} }}); const html = await page.content(); process.stdout.write(html); await browser.close(); "#, url = serde_json::to_string(url).unwrap(), timeout = timeout * 1000, )) .env( "NODE_PATH", scripts_dir.join("node_modules").to_string_lossy().to_string(), ) .output() .await .map_err(|e| format!("Playwright-prosess feilet: {e}"))?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); return Err(format!("Playwright feilet: {stderr}")); } 
String::from_utf8(output.stdout).map_err(|e| format!("Ugyldig UTF-8 fra Playwright: {e}")) } /// Parse HTML med Readability via Node.js-hjelpeskript. async fn parse_with_readability( scripts_dir: &PathBuf, url: &str, html: &str, ) -> Result { let script = scripts_dir.join("readability.mjs"); let mut child = tokio::process::Command::new("node") .arg(&script) .arg("--url") .arg(url) .env( "NODE_PATH", scripts_dir.join("node_modules").to_string_lossy().to_string(), ) .stdin(std::process::Stdio::piped()) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn() .map_err(|e| format!("Kunne ikke starte readability.mjs: {e}"))?; // Skriv HTML til stdin if let Some(mut stdin) = child.stdin.take() { use tokio::io::AsyncWriteExt; stdin .write_all(html.as_bytes()) .await .map_err(|e| format!("Kunne ikke skrive til stdin: {e}"))?; drop(stdin); } let output = child .wait_with_output() .await .map_err(|e| format!("Readability-prosess feilet: {e}"))?; let stdout = String::from_utf8_lossy(&output.stdout); debug!(stdout = %stdout, "Readability-output"); serde_json::from_str::(&stdout) .map_err(|e| format!("Kunne ikke parse Readability-output: {e} — raw: {stdout}")) } /// Kjør Playwright + Readability i ett steg. 
async fn run_playwright_readability( scripts_dir: &PathBuf, url: &str, timeout: u64, ) -> Result { let script = scripts_dir.join("playwright.mjs"); let output = tokio::process::Command::new("node") .arg(&script) .arg("--url") .arg(url) .arg("--timeout") .arg((timeout * 1000).to_string()) .env( "NODE_PATH", scripts_dir.join("node_modules").to_string_lossy().to_string(), ) .output() .await .map_err(|e| format!("Playwright-prosess feilet: {e}"))?; let stdout = String::from_utf8_lossy(&output.stdout); if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); warn!(stderr = %stderr, "Playwright feilet"); } serde_json::from_str::(&stdout) .map_err(|e| format!("Kunne ikke parse Playwright-output: {e} — raw: {stdout}")) } /// Forsøk å ekstrahere publiseringsdato fra HTML-metadata. fn extract_date(html: &str) -> Option { // Vanlige meta-tagger for publiseringsdato let patterns = [ r#"property="article:published_time"\s+content="([^"]+)""#, r#"content="([^"]+)"\s+property="article:published_time""#, r#"name="date"\s+content="([^"]+)""#, r#"content="([^"]+)"\s+name="date""#, r#"name="pubdate"\s+content="([^"]+)""#, r#"content="([^"]+)"\s+name="pubdate""#, r#"name="publishdate"\s+content="([^"]+)""#, r#"itemprop="datePublished"\s+content="([^"]+)""#, r#"content="([^"]+)"\s+itemprop="datePublished""#, r#"name="DC\.date"\s+content="([^"]+)""#, r#""datePublished"\s*:\s*"([^"]+)""#, ]; for pat in &patterns { if let Ok(re) = Regex::new(pat) { if let Some(caps) = re.captures(html) { if let Some(date_str) = caps.get(1) { let d = date_str.as_str().to_string(); // Normaliser til YYYY-MM-DD om mulig if d.len() >= 10 { return Some(d[..10].to_string()); } return Some(d); } } } } None } /// Rens opp tekst fra Readability (fjern overflødige mellomrom/linjeskift). 
fn clean_text(text: &str) -> String { // Fjern ledende/etterfølgende whitespace per linje, kollapser blanke linjer let lines: Vec<&str> = text.lines().map(|l| l.trim()).collect(); let mut result = Vec::new(); let mut prev_empty = false; for line in lines { if line.is_empty() { if !prev_empty { result.push(""); prev_empty = true; } } else { result.push(line); prev_empty = false; } } result.join("\n").trim().to_string() } /// Detekter betalingsmur basert på innhold og HTML-mønstre. fn detect_paywall(content: &str, html: &str) -> bool { let content_lower = content.to_lowercase(); let html_lower = html.to_lowercase(); // 1. Veldig kort innhold (under 200 tegn) tyder på betalingsmur let trimmed_len = content.trim().len(); if trimmed_len > 0 && trimmed_len < 200 { debug!(trimmed_len, "Kort innhold — mulig betalingsmur"); return true; } // 2. Kjente betalingsmur-mønstre i innholdet let paywall_content_patterns = [ "logg inn for å lese", "log in to read", "sign in to continue", "subscribe to continue", "abonner for å lese", "kun for abonnenter", "for subscribers only", "du må være innlogget", "denne artikkelen er for abonnenter", "kjøp tilgang", "buy access", "unlock this article", "lås opp artikkelen", "registrer deg for å lese", "create an account to read", "meld deg inn", "bli abonnent", "allerede abonnent", "already a subscriber", ]; for pattern in &paywall_content_patterns { if content_lower.contains(pattern) { debug!(pattern, "Betalingsmur-mønster funnet i innhold"); return true; } } // 3. 
Kjente betalingsmur-klasser/elementer i HTML let paywall_html_patterns = [ "class=\"paywall", "class=\"pw-", "id=\"paywall", "data-paywall", "class=\"subscriber-only", "class=\"premium-content", "class=\"locked-content", "class=\"article-paywall", "class=\"metered-", "class=\"piano-", "id=\"piano-", "class=\"tp-modal", "data-piano", "class=\"ns-paywall", "class=\"nettavisen-paywall", "class=\"schibsted-paywall", "class=\"amedia-paywall", ]; for pattern in &paywall_html_patterns { if html_lower.contains(pattern) { debug!(pattern, "Betalingsmur-mønster funnet i HTML"); return true; } } // 4. Sjekk for meta-tagger som indikerer betalt innhold if html_lower.contains(r#"content="locked""#) || html_lower.contains(r#"content="premium""#) || html_lower.contains("isAccessibleForFree") && html_lower.contains(r#""false""#) { debug!("Betalingsmur-meta-tagg funnet"); return true; } false }