synops/tools/synops-clip/src/main.rs
vegard 69997c6e87 Rydd sidelinja-referanser: generiske AI-nivåer + synops.no-domener
AI-aliaser: sidelinja/rutine → synops/low, sidelinja/resonering →
synops/high. Fire nivåer i LiteLLM: low/medium/high/extreme.
Oppdatert i: LiteLLM config, PG ai_job_routing, all Rust-kode
(maskinrommet + 5 CLI-verktøy).

Domener: sidelinja.org → synops.no i fallback-URLer, health-sjekker,
LiveKit WSS, bandwidth-logger, docs/erfaringer, docs/setup,
reference/server-state, .env.example.

Docker container-navn (sidelinja-*) beholdes — styrt av
COMPOSE_PROJECT_NAME i /srv/synops/.env, endres separat ved behov.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-19 17:22:22 +00:00

1034 lines
31 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// synops-clip — Hent og parse webartikler til ren tekst + metadata.
//
// Input: --url <url>
// Output: JSON til stdout med title, author, date, content, url, paywall
//
// Med --write: opprett content-node i PG med artikkelinnhold,
// tagged-edge "clipped", AI-oppsummering via LiteLLM, og
// mentions-edges til gjenkjente entiteter i kunnskapsgrafen.
//
// Strategi:
// 1. Hent HTML med reqwest (vanlig HTTP-klient)
// 2. Parse med Mozilla Readability via Node.js-hjelpeskript
// 3. Hvis Readability feiler eller innholdet er for kort, prøv Playwright (headless browser)
// 4. Detekter betalingsmur basert på innholdslengde og kjente mønstre
// 5. (--write) Opprett node, edges, AI-oppsummering
//
// Miljøvariabler:
// SYNOPS_CLIP_SCRIPTS — Sti til scripts/-mappen (default: ved siden av binæren)
// DATABASE_URL — PostgreSQL-tilkobling (påkrevd med --write)
// AI_GATEWAY_URL — LiteLLM gateway (default: http://localhost:4000)
// LITELLM_MASTER_KEY — API-nøkkel for LiteLLM
// AI_CLIP_MODEL — Modellalias for oppsummering (default: synops/low)
// RUST_LOG — Loggnivå (default: synops_clip=info)
//
// Ref: docs/retninger/unix_filosofi.md
use clap::Parser;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::process;
use tracing::{debug, info, warn};
use uuid::Uuid;
/// Fetch and parse web articles into plain text + metadata.
#[derive(Parser)]
#[command(name = "synops-clip", about = "Hent og parse webartikler")]
struct Cli {
    /// URL to fetch and parse.
    #[arg(long)]
    url: String,
    /// Timeout in seconds for HTTP requests.
    #[arg(long, default_value = "30")]
    timeout: u64,
    /// Force Playwright (headless browser) instead of plain HTTP.
    #[arg(long)]
    playwright: bool,
    /// Write the result to the database as a content node with edges.
    #[arg(long)]
    write: bool,
    /// User id doing the clipping (required with --write).
    #[arg(long)]
    created_by: Option<Uuid>,
    /// Job payload as JSON (for maskinrommet / the job queue); overrides the flags above.
    #[arg(long)]
    payload_json: Option<String>,
}
/// Result from Readability parsing (the Node.js helper scripts).
///
/// Field names mirror the JSON emitted by `readability.mjs` /
/// `playwright.mjs`; `error` being `Some` means the parse is unusable.
#[derive(Deserialize)]
#[allow(dead_code)]
struct ReadabilityResult {
    // Article title, if detected.
    title: Option<String>,
    // Byline / author, if detected.
    author: Option<String>,
    // Extracted article body text.
    content: Option<String>,
    // Short excerpt/lead, if available.
    excerpt: Option<String>,
    // Site name; "siteName" in the script's JSON output.
    #[serde(rename = "siteName")]
    site_name: Option<String>,
    // Content length as reported by Readability (unused here, kept for parity).
    length: Option<usize>,
    // Error message when parsing failed.
    error: Option<String>,
}
/// Job payload from the job queue (passed via `--payload-json`).
///
/// Mirrors the CLI flags so queued jobs and direct invocations behave alike.
#[derive(Deserialize)]
struct JobPayload {
    url: String,
    #[serde(default)]
    playwright: bool,
    #[serde(default = "default_timeout")]
    timeout: u64,
    #[serde(default)]
    write: bool,
    created_by: Option<Uuid>,
}
/// Serde default for `JobPayload::timeout`; matches the CLI default of 30 s.
fn default_timeout() -> u64 {
    30
}
/// Final output format, serialized as pretty JSON to stdout
/// (and used as input for the `--write` database path).
#[derive(Serialize, Clone)]
struct ClipOutput {
    // The requested URL.
    url: String,
    title: Option<String>,
    author: Option<String>,
    // Publication date, normalized to a YYYY-MM-DD prefix when possible.
    date: Option<String>,
    // Cleaned article text (see `clean_text`).
    content: String,
    excerpt: Option<String>,
    // True when a paywall was detected heuristically.
    paywall: bool,
    // Which pipeline produced the result: "readability" or "playwright".
    source: String,
}
// --- LLM request/response (OpenAI-compatible; same pattern as synops-suggest-edges) ---

/// Chat-completions request body sent to the LiteLLM gateway.
#[derive(Serialize)]
struct ChatRequest {
    model: String,
    messages: Vec<ChatMessage>,
    temperature: f32,
    // Forces JSON output: {"type": "json_object"}.
    response_format: ResponseFormat,
}
/// OpenAI `response_format` object; `type` is a Rust keyword, hence `r#type`.
#[derive(Serialize)]
struct ResponseFormat {
    r#type: String,
}
/// One chat message ("system" or "user" role in this tool).
#[derive(Serialize)]
struct ChatMessage {
    role: String,
    content: String,
}
/// Chat-completions response; only the fields this tool reads.
#[derive(Deserialize)]
struct ChatResponse {
    choices: Vec<Choice>,
    // Token accounting, when the gateway reports it.
    #[serde(default)]
    usage: Option<UsageInfo>,
    // Model id actually used, when reported.
    #[serde(default)]
    model: Option<String>,
}
/// Token usage as reported by the gateway; fields default to 0 when absent.
#[derive(Deserialize, Clone)]
struct UsageInfo {
    #[serde(default)]
    prompt_tokens: i64,
    #[serde(default)]
    completion_tokens: i64,
}
/// One completion choice; only the message content is used.
#[derive(Deserialize)]
struct Choice {
    message: MessageContent,
}
/// Message payload; `content` may be absent in some responses.
#[derive(Deserialize)]
struct MessageContent {
    content: Option<String>,
}
/// The AI analysis output: summary + topics + named-entity mentions.
///
/// All fields default to empty so partially valid LLM JSON still parses.
#[derive(Deserialize, Debug)]
struct AiAnalysis {
    #[serde(default)]
    summary: String,
    #[serde(default)]
    topics: Vec<String>,
    #[serde(default)]
    mentions: Vec<MentionSuggestion>,
}
/// A named entity suggested by the LLM.
#[derive(Deserialize, Debug)]
struct MentionSuggestion {
    name: String,
    // One of "person", "organisasjon", "sted", "konsept" per the prompt.
    #[serde(default = "default_entity_type")]
    entity_type: String,
}
/// Serde default for `MentionSuggestion::entity_type`.
fn default_entity_type() -> String {
    "person".to_string()
}
/// Row projection for topic-node lookups (id + title).
#[derive(sqlx::FromRow)]
struct TopicRow {
    id: Uuid,
    title: String,
}
// Fix: "(24 setninger)" was a garbled "(2-4 setninger)" — a "kort
// oppsummering" (short summary) of 24 sentences contradicts itself; the
// range dash had been lost (the file carries an ambiguous-Unicode warning).
/// System prompt for the LLM analysis: short summary + topics + named
/// entities, returned as a single JSON object matching `AiAnalysis`.
const CLIP_SYSTEM_PROMPT: &str = r#"Du er en innholdsanalysator for en norsk redaksjonsplattform. Du skal:
1. Lage en kort oppsummering (2-4 setninger) av artikkelen på norsk.
2. Ekstrahere topics (emner/temaer). Bruk korte, presise norske termer. Maks 5.
3. Ekstrahere mentions (navngitte entiteter: personer, organisasjoner, steder).
Returner KUN et JSON-objekt:
{
"summary": "Kort oppsummering av artikkelen.",
"topics": ["emne1", "emne2"],
"mentions": [{"name": "Navn", "entity_type": "person"}]
}
Regler:
- Oppsummeringen skal fange artikkelens kjerne og være nyttig uten å lese originalen.
- Bruk eksisterende topics fra listen nedenfor der det passer.
- entity_type er en av: "person", "organisasjon", "sted", "konsept".
- Returner tomme lister hvis teksten ikke har meningsfullt innhold."#;
#[tokio::main]
async fn main() {
    synops_common::logging::init("synops_clip");
    let cli = Cli::parse();

    // Effective parameters: decoded from --payload-json (job queue) when
    // present, otherwise taken straight from the CLI flags.
    let (url, use_playwright, timeout, write, created_by) = match cli.payload_json {
        Some(ref raw) => match serde_json::from_str::<JobPayload>(raw) {
            Ok(job) => (job.url, job.playwright, job.timeout, job.write, job.created_by),
            Err(e) => {
                eprintln!("Ugyldig payload-json: {e}");
                process::exit(1);
            }
        },
        None => (
            cli.url.clone(),
            cli.playwright,
            cli.timeout,
            cli.write,
            cli.created_by,
        ),
    };

    // --write needs an owner for the nodes/edges it creates.
    if write && created_by.is_none() {
        eprintln!("Feil: --created-by er påkrevd sammen med --write");
        process::exit(1);
    }

    // Fetch + parse; any failure here is fatal.
    let output = match run(&url, use_playwright, timeout).await {
        Ok(output) => output,
        Err(e) => {
            eprintln!("Feil: {e}");
            process::exit(1);
        }
    };

    // Without --write, just print the clip result as JSON.
    if !write {
        println!("{}", serde_json::to_string_pretty(&output).unwrap());
        return;
    }

    // With --write, persist to the database and print the write summary.
    match write_to_db(&output, created_by.unwrap()).await {
        Ok(result) => println!("{}", serde_json::to_string_pretty(&result).unwrap()),
        Err(e) => {
            eprintln!("Feil ved skriving til database: {e}");
            process::exit(1);
        }
    }
}
/// Write the clip result to the database: content node, "clipped" tagged
/// edge, AI analysis, and mentions edges. Returns a JSON summary of what
/// was created.
///
/// A failed AI analysis is logged and skipped (the article is still
/// stored); database errors are fatal and returned as strings.
async fn write_to_db(
    output: &ClipOutput,
    created_by: Uuid,
) -> Result<serde_json::Value, String> {
    let db = synops_common::db::connect().await?;
    let node_id = Uuid::now_v7();
    // 1. AI analysis: summary + entities. Non-fatal — on failure we proceed
    //    with an empty analysis so the clip itself is not lost.
    let (analysis, ai_usage, ai_model) = match call_llm_analysis(output, &db).await {
        Ok(result) => result,
        Err(e) => {
            warn!("AI-analyse feilet, fortsetter uten: {e}");
            (
                AiAnalysis {
                    summary: String::new(),
                    topics: vec![],
                    mentions: vec![],
                },
                None,
                None,
            )
        }
    };
    // 2. Create the content node.
    let metadata = serde_json::json!({
        "source_url": output.url,
        "original_author": output.author,
        "original_date": output.date,
        "paywall": output.paywall,
        "clip_source": output.source,
    });
    // Prepend the AI summary to the article content when we have one.
    let node_content = if !analysis.summary.is_empty() {
        format!("{}\n\n---\n\n{}", analysis.summary, output.content)
    } else {
        output.content.clone()
    };
    // Fall back to a truncated URL as title when the article had none.
    let title = output
        .title
        .clone()
        .unwrap_or_else(|| format!("Clip: {}", truncate_str(&output.url, 60)));
    sqlx::query(
        r#"
INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
VALUES ($1, 'content', $2, $3, 'hidden'::visibility, $4, $5)
"#,
    )
    .bind(node_id)
    .bind(&title)
    .bind(&node_content)
    .bind(&metadata)
    .bind(created_by)
    .execute(&db)
    .await
    .map_err(|e| format!("PG insert node feilet: {e}"))?;
    info!(node_id = %node_id, title = %title, "Content-node opprettet");
    // 3. Create the "clipped" tagged edge. Note: a self-edge ($2 is both
    //    source and target); duplicates are ignored via ON CONFLICT.
    let tagged_edge_id = Uuid::now_v7();
    sqlx::query(
        r#"
INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by)
VALUES ($1, $2, $2, 'tagged', $3, false, $4)
ON CONFLICT (source_id, target_id, edge_type) DO NOTHING
"#,
    )
    .bind(tagged_edge_id)
    .bind(node_id)
    .bind(serde_json::json!({"tag": "clipped"}))
    .bind(created_by)
    .execute(&db)
    .await
    .map_err(|e| format!("PG insert tagged-edge feilet: {e}"))?;
    info!(node_id = %node_id, "Tagged-edge 'clipped' opprettet");
    // 4. Create mentions-edges to recognized entities.
    let mut topics_created = 0u32;
    let mut edges_created = 0u32;
    // Fetch the 200 most recent topic nodes for case-insensitive matching.
    let existing_topics = sqlx::query_as::<_, TopicRow>(
        "SELECT id, title FROM nodes WHERE node_kind = 'topic' ORDER BY created_at DESC LIMIT 200",
    )
    .fetch_all(&db)
    .await
    .map_err(|e| format!("PG-feil ved henting av topics: {e}"))?;
    // Topics from the AI analysis: reuse a matching node or create a new one.
    for topic_name in &analysis.topics {
        let topic_name = topic_name.trim();
        if topic_name.is_empty() {
            continue;
        }
        let existing = existing_topics
            .iter()
            .find(|t| t.title.to_lowercase() == topic_name.to_lowercase());
        let topic_id = if let Some(t) = existing {
            t.id
        } else {
            let new_id = Uuid::now_v7();
            create_topic_node(&db, new_id, topic_name, created_by).await?;
            topics_created += 1;
            new_id
        };
        if create_mentions_edge(&db, node_id, topic_id, created_by).await? {
            edges_created += 1;
        }
    }
    // Mentions (named entities). These are looked up directly in the DB
    // (not against the 200-row cache above), so older nodes also match.
    for mention in &analysis.mentions {
        let name = mention.name.trim();
        if name.is_empty() {
            continue;
        }
        let existing_entity = sqlx::query_scalar::<_, Uuid>(
            "SELECT id FROM nodes WHERE node_kind = 'topic' AND LOWER(title) = LOWER($1) LIMIT 1",
        )
        .bind(name)
        .fetch_optional(&db)
        .await
        .map_err(|e| format!("PG-feil ved entitet-søk: {e}"))?;
        let entity_id = if let Some(id) = existing_entity {
            id
        } else {
            let new_id = Uuid::now_v7();
            create_entity_node(&db, new_id, name, &mention.entity_type, created_by).await?;
            // Entities share the topics_created counter with topics.
            topics_created += 1;
            new_id
        };
        if create_mentions_edge(&db, node_id, entity_id, created_by).await? {
            edges_created += 1;
        }
    }
    info!(
        node_id = %node_id,
        topics_created,
        edges_created,
        "Clip skrevet til database"
    );
    // 5. Log AI resource usage; best-effort (a failed insert only warns).
    let tokens_in = ai_usage.as_ref().map(|u| u.prompt_tokens).unwrap_or(0);
    let tokens_out = ai_usage.as_ref().map(|u| u.completion_tokens).unwrap_or(0);
    let model_id = ai_model.unwrap_or_else(|| "unknown".to_string());
    if tokens_in > 0 || tokens_out > 0 {
        if let Err(e) = sqlx::query(
            "INSERT INTO resource_usage_log (target_node_id, triggered_by, resource_type, detail)
VALUES ($1, $2, $3, $4)",
        )
        .bind(node_id)
        .bind(Some(created_by))
        .bind("ai")
        .bind(serde_json::json!({
            "model_id": model_id,
            "tokens_in": tokens_in,
            "tokens_out": tokens_out,
            "job_type": "synops_clip"
        }))
        .execute(&db)
        .await
        {
            warn!(error = %e, "Kunne ikke logge AI-ressursforbruk");
        }
    }
    Ok(serde_json::json!({
        "status": "completed",
        "node_id": node_id.to_string(),
        "url": output.url,
        "title": title,
        "summary": analysis.summary,
        "content_length": output.content.len(),
        "paywall": output.paywall,
        "topics_created": topics_created,
        "edges_created": edges_created,
        "model": model_id,
        "tokens_in": tokens_in,
        "tokens_out": tokens_out,
    }))
}
/// Call LiteLLM for summary and entity extraction.
///
/// Returns the parsed analysis plus token usage and the model id actually
/// used (for the resource-usage log). Errors — DB, HTTP, and JSON parsing —
/// are returned as strings.
async fn call_llm_analysis(
    output: &ClipOutput,
    db: &sqlx::PgPool,
) -> Result<(AiAnalysis, Option<UsageInfo>, Option<String>), String> {
    // Gateway configuration from the environment (see the file header).
    let gateway_url =
        std::env::var("AI_GATEWAY_URL").unwrap_or_else(|_| "http://localhost:4000".to_string());
    let api_key = std::env::var("LITELLM_MASTER_KEY").unwrap_or_default();
    let model =
        std::env::var("AI_CLIP_MODEL").unwrap_or_else(|_| "synops/low".to_string());
    // Recent topics are passed as context so the model reuses existing terms.
    let existing_topics = sqlx::query_scalar::<_, String>(
        "SELECT title FROM nodes WHERE node_kind = 'topic' ORDER BY created_at DESC LIMIT 100",
    )
    .fetch_all(db)
    .await
    .map_err(|e| format!("PG-feil ved henting av topics: {e}"))?;
    let topic_context = if existing_topics.is_empty() {
        String::new()
    } else {
        format!("\n\nEksisterende topics: {}", existing_topics.join(", "))
    };
    // Cap the article length sent to the LLM (max ~4000 chars, cut on a
    // word boundary by truncate_str).
    let content_for_llm = truncate_str(&output.content, 4000);
    let user_content = format!(
        "Artikkel:\nTittel: {}\nForfatter: {}\n\n{}{topic_context}",
        output.title.as_deref().unwrap_or("(ukjent)"),
        output.author.as_deref().unwrap_or("(ukjent)"),
        content_for_llm,
    );
    info!("Sender til LLM for oppsummering og entitetsekstraksjon");
    let request = ChatRequest {
        model,
        messages: vec![
            ChatMessage {
                role: "system".to_string(),
                content: CLIP_SYSTEM_PROMPT.to_string(),
            },
            ChatMessage {
                role: "user".to_string(),
                content: user_content,
            },
        ],
        // Low temperature for deterministic extraction; JSON mode enforced.
        temperature: 0.2,
        response_format: ResponseFormat {
            r#type: "json_object".to_string(),
        },
    };
    let client = reqwest::Client::new();
    let url = format!("{gateway_url}/v1/chat/completions");
    let resp = client
        .post(&url)
        .header("Authorization", format!("Bearer {api_key}"))
        .header("Content-Type", "application/json")
        .json(&request)
        // Per-request timeout, independent of the article-fetch timeout.
        .timeout(std::time::Duration::from_secs(60))
        .send()
        .await
        .map_err(|e| format!("LiteLLM-kall feilet: {e}"))?;
    if !resp.status().is_success() {
        let status = resp.status();
        let body = resp.text().await.unwrap_or_default();
        return Err(format!("LiteLLM returnerte {status}: {body}"));
    }
    let chat_resp: ChatResponse = resp
        .json()
        .await
        .map_err(|e| format!("Kunne ikke parse LiteLLM-respons: {e}"))?;
    let content = chat_resp
        .choices
        .first()
        .and_then(|c| c.message.content.as_deref())
        .ok_or("LiteLLM returnerte ingen content")?;
    // The model is instructed to return pure JSON; the raw output is included
    // in the error to aid debugging when it does not.
    let analysis: AiAnalysis = serde_json::from_str(content)
        .map_err(|e| format!("Kunne ikke parse LLM JSON: {e}. Rå output: {content}"))?;
    info!(
        summary_len = analysis.summary.len(),
        topics = analysis.topics.len(),
        mentions = analysis.mentions.len(),
        "LLM-analyse mottatt"
    );
    Ok((analysis, chat_resp.usage, chat_resp.model))
}
/// Create a topic node.
///
/// Marked `ai_generated` in metadata and made 'discoverable'. Idempotent on
/// id via ON CONFLICT DO NOTHING.
async fn create_topic_node(
    db: &sqlx::PgPool,
    id: Uuid,
    title: &str,
    created_by: Uuid,
) -> Result<(), String> {
    let metadata = serde_json::json!({"ai_generated": true});
    sqlx::query(
        r#"
INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
VALUES ($1, 'topic', $2, '', 'discoverable', $3, $4)
ON CONFLICT (id) DO NOTHING
"#,
    )
    .bind(id)
    .bind(title)
    .bind(&metadata)
    .bind(created_by)
    .execute(db)
    .await
    .map_err(|e| format!("PG insert topic feilet: {e}"))?;
    info!(topic_id = %id, title = %title, "Ny topic-node opprettet");
    Ok(())
}
/// Create an entity node (person, organization, place, concept).
///
/// Stored with node_kind 'topic' like plain topics; the entity kind is
/// only recorded in metadata.entity_type.
async fn create_entity_node(
    db: &sqlx::PgPool,
    id: Uuid,
    name: &str,
    entity_type: &str,
    created_by: Uuid,
) -> Result<(), String> {
    let metadata = serde_json::json!({
        "ai_generated": true,
        "entity_type": entity_type
    });
    sqlx::query(
        r#"
INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
VALUES ($1, 'topic', $2, '', 'discoverable', $3, $4)
ON CONFLICT (id) DO NOTHING
"#,
    )
    .bind(id)
    .bind(name)
    .bind(&metadata)
    .bind(created_by)
    .execute(db)
    .await
    .map_err(|e| format!("PG insert entity feilet: {e}"))?;
    info!(entity_id = %id, name = %name, entity_type = %entity_type, "Ny entitet-node opprettet");
    Ok(())
}
/// Create a mentions edge. Returns true when a new edge was created
/// (false when the (source, target, 'mentions') edge already existed,
/// detected via rows_affected after ON CONFLICT DO NOTHING).
async fn create_mentions_edge(
    db: &sqlx::PgPool,
    source_id: Uuid,
    target_id: Uuid,
    created_by: Uuid,
) -> Result<bool, String> {
    let edge_id = Uuid::now_v7();
    let metadata = serde_json::json!({"origin": "ai_clip"});
    let result = sqlx::query(
        r#"
INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by)
VALUES ($1, $2, $3, 'mentions', $4, false, $5)
ON CONFLICT (source_id, target_id, edge_type) DO NOTHING
"#,
    )
    .bind(edge_id)
    .bind(source_id)
    .bind(target_id)
    .bind(&metadata)
    .bind(created_by)
    .execute(db)
    .await
    .map_err(|e| format!("PG insert mentions-edge feilet: {e}"))?;
    let created = result.rows_affected() > 0;
    if created {
        info!(source = %source_id, target = %target_id, "Mentions-edge opprettet");
    }
    Ok(created)
}
/// Truncate a string to at most `max_len` bytes without splitting a word.
///
/// Returns the input unchanged when it already fits. Otherwise cuts at the
/// last space before the limit, or at the limit itself when there is none.
///
/// Bug fix: the old `&s[..max_len]` panicked whenever byte `max_len` fell
/// inside a multi-byte UTF-8 character — guaranteed to happen on Norwegian
/// text (æ/ø/å). The cut point is now floored to a char boundary first.
fn truncate_str(s: &str, max_len: usize) -> &str {
    if s.len() <= max_len {
        return s;
    }
    // Walk back to a valid char boundary so slicing cannot panic.
    let mut cut = max_len;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    // Prefer the last space before the limit to avoid mid-word cuts.
    match s[..cut].rfind(' ') {
        Some(pos) => &s[..pos],
        None => &s[..cut],
    }
}
/// The fetch + parse pipeline for a single URL.
///
/// Steps: fetch HTML (plain HTTP with Playwright fallback, or Playwright
/// directly), parse with Readability, retry via Playwright when parsing
/// failed, then extract the publication date and detect a paywall.
async fn run(url: &str, use_playwright: bool, timeout: u64) -> Result<ClipOutput, String> {
    let scripts_dir = find_scripts_dir()?;
    info!(url, "Starter clip");
    // Step 1: fetch HTML. A plain-HTTP failure falls back to Playwright.
    let html = if use_playwright {
        info!("Bruker Playwright for å hente side");
        fetch_with_playwright(&scripts_dir, url, timeout).await?
    } else {
        match fetch_html(url, timeout).await {
            Ok(html) => html,
            Err(e) => {
                warn!("HTTP-henting feilet: {e}, prøver Playwright");
                fetch_with_playwright(&scripts_dir, url, timeout).await?
            }
        }
    };
    // Step 2: parse with Readability.
    let parsed = parse_with_readability(&scripts_dir, url, &html).await?;
    // Step 3: on Readability failure, retry via Playwright+Readability —
    // unless Playwright was already forced, in which case the error is final.
    let (parsed, source) = if parsed.error.is_some() && !use_playwright {
        warn!("Readability feilet, prøver Playwright-fallback");
        let result = run_playwright_readability(&scripts_dir, url, timeout).await?;
        (result, "playwright")
    } else if parsed.error.is_some() {
        return Err(format!(
            "Readability feilet: {}",
            parsed.error.unwrap_or_default()
        ));
    } else {
        (parsed, "readability")
    };
    let content = parsed.content.unwrap_or_default();
    let content = clean_text(&content);
    // Step 4: publication date from meta tags. NOTE(review): this uses the
    // step-1 HTML even when the Playwright fallback re-fetched the page —
    // confirm that is intended.
    let date = extract_date(&html);
    // Step 5: paywall detection from content length + known markers.
    let paywall = detect_paywall(&content, &html);
    if paywall {
        info!("Betalingsmur detektert");
    }
    Ok(ClipOutput {
        url: url.to_string(),
        title: parsed.title,
        author: parsed.author,
        date,
        content,
        excerpt: parsed.excerpt,
        paywall,
        source: source.to_string(),
    })
}
/// Locate the scripts/ directory via environment variable or relative to
/// the binary; a directory qualifies when it contains readability.mjs.
fn find_scripts_dir() -> Result<PathBuf, String> {
    // An explicit override wins, as long as the path exists.
    if let Ok(dir) = std::env::var("SYNOPS_CLIP_SCRIPTS") {
        let path = PathBuf::from(dir);
        if path.exists() {
            return Ok(path);
        }
    }
    // Otherwise probe locations relative to the executable:
    // target/release/ in development, an install prefix, or the source tree.
    let exe = std::env::current_exe().map_err(|e| format!("Fant ikke exe-sti: {e}"))?;
    let exe_dir = exe.parent().ok_or("Ingen parent-mappe for exe")?;
    let candidates = [
        exe_dir.join("../../scripts"),
        exe_dir.join("../scripts"),
        exe_dir.join("scripts"),
        PathBuf::from("/home/vegard/synops/tools/synops-clip/scripts"),
    ];
    for candidate in candidates {
        // Canonicalize when possible; fall back to the raw candidate path.
        let resolved = candidate.canonicalize().unwrap_or(candidate);
        if resolved.join("readability.mjs").exists() {
            debug!(?resolved, "Fant scripts-mappe");
            return Ok(resolved);
        }
    }
    Err("Fant ikke scripts/-mappen. Sett SYNOPS_CLIP_SCRIPTS.".to_string())
}
/// Fetch HTML with reqwest.
///
/// Sends a desktop-browser user agent and Norwegian-first Accept-Language
/// headers, and follows up to 10 redirects. Non-2xx statuses and transport
/// errors are returned as strings.
async fn fetch_html(url: &str, timeout_secs: u64) -> Result<String, String> {
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(timeout_secs))
        .user_agent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
        .redirect(reqwest::redirect::Policy::limited(10))
        .build()
        .map_err(|e| format!("Kunne ikke opprette HTTP-klient: {e}"))?;
    let resp = client
        .get(url)
        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        .header("Accept-Language", "nb-NO,nb;q=0.9,no;q=0.8,en;q=0.5")
        .send()
        .await
        .map_err(|e| format!("HTTP-forespørsel feilet: {e}"))?;
    let status = resp.status();
    if !status.is_success() {
        return Err(format!("HTTP {status}"));
    }
    resp.text()
        .await
        .map_err(|e| format!("Kunne ikke lese respons-body: {e}"))
}
/// Fetch fully rendered HTML via Playwright (for JS-heavy pages).
///
/// Runs an inline Node.js module that opens the URL in headless Chromium,
/// waits for network idle, and writes the rendered DOM to stdout. The URL
/// is injected JSON-encoded (safe quoting); the timeout is converted to ms.
async fn fetch_with_playwright(
    scripts_dir: &PathBuf,
    url: &str,
    timeout: u64,
) -> Result<String, String> {
    let output = tokio::process::Command::new("node")
        .arg("--input-type=module")
        .arg("-e")
        .arg(format!(
            r#"
import {{ chromium }} from 'playwright';
const browser = await chromium.launch({{ headless: true }});
const ctx = await browser.newContext({{
userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}});
const page = await ctx.newPage();
await page.goto({url}, {{ waitUntil: 'networkidle', timeout: {timeout} }});
const html = await page.content();
process.stdout.write(html);
await browser.close();
"#,
            // serde_json::to_string produces a quoted, escaped JS string literal.
            url = serde_json::to_string(url).unwrap(),
            timeout = timeout * 1000,
        ))
        // Resolve 'playwright' from the scripts directory's node_modules.
        .env(
            "NODE_PATH",
            scripts_dir.join("node_modules").to_string_lossy().to_string(),
        )
        .output()
        .await
        .map_err(|e| format!("Playwright-prosess feilet: {e}"))?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(format!("Playwright feilet: {stderr}"));
    }
    String::from_utf8(output.stdout).map_err(|e| format!("Ugyldig UTF-8 fra Playwright: {e}"))
}
/// Parse HTML with Readability via the Node.js helper script.
///
/// The HTML is streamed over stdin (it can be far too large for an argv
/// argument); the script's JSON result is read from stdout.
async fn parse_with_readability(
    scripts_dir: &PathBuf,
    url: &str,
    html: &str,
) -> Result<ReadabilityResult, String> {
    let script = scripts_dir.join("readability.mjs");
    let mut child = tokio::process::Command::new("node")
        .arg(&script)
        .arg("--url")
        .arg(url)
        .env(
            "NODE_PATH",
            scripts_dir.join("node_modules").to_string_lossy().to_string(),
        )
        .stdin(std::process::Stdio::piped())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
        .map_err(|e| format!("Kunne ikke starte readability.mjs: {e}"))?;
    // Write the HTML to the child's stdin; dropping the handle closes the
    // pipe so the script sees EOF and starts parsing.
    if let Some(mut stdin) = child.stdin.take() {
        use tokio::io::AsyncWriteExt;
        stdin
            .write_all(html.as_bytes())
            .await
            .map_err(|e| format!("Kunne ikke skrive til stdin: {e}"))?;
        drop(stdin);
    }
    let output = child
        .wait_with_output()
        .await
        .map_err(|e| format!("Readability-prosess feilet: {e}"))?;
    let stdout = String::from_utf8_lossy(&output.stdout);
    debug!(stdout = %stdout, "Readability-output");
    serde_json::from_str::<ReadabilityResult>(&stdout)
        .map_err(|e| format!("Kunne ikke parse Readability-output: {e} — raw: {stdout}"))
}
/// Run Playwright + Readability in one step (the playwright.mjs script).
///
/// A non-zero exit only logs stderr — the script may still have produced a
/// parseable JSON result (possibly with its `error` field set) on stdout.
async fn run_playwright_readability(
    scripts_dir: &PathBuf,
    url: &str,
    timeout: u64,
) -> Result<ReadabilityResult, String> {
    let script = scripts_dir.join("playwright.mjs");
    let output = tokio::process::Command::new("node")
        .arg(&script)
        .arg("--url")
        .arg(url)
        .arg("--timeout")
        // The script expects milliseconds.
        .arg((timeout * 1000).to_string())
        .env(
            "NODE_PATH",
            scripts_dir.join("node_modules").to_string_lossy().to_string(),
        )
        .output()
        .await
        .map_err(|e| format!("Playwright-prosess feilet: {e}"))?;
    let stdout = String::from_utf8_lossy(&output.stdout);
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        warn!(stderr = %stderr, "Playwright feilet");
    }
    serde_json::from_str::<ReadabilityResult>(&stdout)
        .map_err(|e| format!("Kunne ikke parse Playwright-output: {e} — raw: {stdout}"))
}
/// Try to extract the publication date from HTML metadata.
///
/// Scans common meta tags and JSON-LD (both attribute orders) and, when a
/// match is found, normalizes it to a YYYY-MM-DD prefix where possible.
///
/// Bug fix: the old code byte-sliced `d[..10]`, which panics if byte 10
/// falls inside a multi-byte UTF-8 character in a malformed date string.
/// `str::get(..10)` returns `None` in that case (and when the string is
/// shorter than 10 bytes), so we fall back to returning the full value.
fn extract_date(html: &str) -> Option<String> {
    // Common meta-tag patterns for publication dates.
    let patterns = [
        r#"property="article:published_time"\s+content="([^"]+)""#,
        r#"content="([^"]+)"\s+property="article:published_time""#,
        r#"name="date"\s+content="([^"]+)""#,
        r#"content="([^"]+)"\s+name="date""#,
        r#"name="pubdate"\s+content="([^"]+)""#,
        r#"content="([^"]+)"\s+name="pubdate""#,
        r#"name="publishdate"\s+content="([^"]+)""#,
        r#"itemprop="datePublished"\s+content="([^"]+)""#,
        r#"content="([^"]+)"\s+itemprop="datePublished""#,
        r#"name="DC\.date"\s+content="([^"]+)""#,
        r#""datePublished"\s*:\s*"([^"]+)""#,
    ];
    for pat in &patterns {
        if let Ok(re) = Regex::new(pat) {
            if let Some(caps) = re.captures(html) {
                if let Some(date_str) = caps.get(1) {
                    let d = date_str.as_str();
                    // Normalize to a YYYY-MM-DD prefix when safely sliceable.
                    if let Some(prefix) = d.get(..10) {
                        return Some(prefix.to_string());
                    }
                    return Some(d.to_string());
                }
            }
        }
    }
    None
}
/// Clean up text from Readability: trim each line, collapse runs of blank
/// lines into a single blank line, and drop leading/trailing blank lines.
fn clean_text(text: &str) -> String {
    let mut cleaned = String::with_capacity(text.len());
    // True once at least one non-empty line has been emitted.
    let mut wrote_any = false;
    // True while a blank separator is owed before the next non-empty line.
    let mut blank_pending = false;
    for raw_line in text.lines() {
        let line = raw_line.trim();
        if line.is_empty() {
            // Only remember blanks that follow real content, so leading
            // blank lines vanish and runs collapse to one.
            blank_pending = wrote_any;
        } else {
            if wrote_any {
                cleaned.push('\n');
                if blank_pending {
                    cleaned.push('\n');
                }
            }
            cleaned.push_str(line);
            wrote_any = true;
            blank_pending = false;
        }
    }
    // Trailing blanks were never emitted, so no final trim is needed.
    cleaned
}
/// Detect a paywall based on extracted content and raw HTML patterns.
///
/// Heuristics, checked in order: suspiciously short content, known paywall
/// phrases in the content (Norwegian + English), known paywall CSS
/// classes/ids in the HTML, and meta/JSON-LD markers for locked content.
///
/// Bug fix: the schema.org check searched for "isAccessibleForFree"
/// (mixed case) inside `html_lower`, which is lowercased — it could never
/// match, silently disabling that heuristic. The needle is now lowercase,
/// and the `|| … &&` grouping is made explicit with parentheses.
fn detect_paywall(content: &str, html: &str) -> bool {
    let content_lower = content.to_lowercase();
    let html_lower = html.to_lowercase();
    // 1. Very short content (under 200 chars) suggests a truncated teaser.
    let trimmed_len = content.trim().len();
    if trimmed_len > 0 && trimmed_len < 200 {
        debug!(trimmed_len, "Kort innhold — mulig betalingsmur");
        return true;
    }
    // 2. Known paywall phrases in the content.
    let paywall_content_patterns = [
        "logg inn for å lese",
        "log in to read",
        "sign in to continue",
        "subscribe to continue",
        "abonner for å lese",
        "kun for abonnenter",
        "for subscribers only",
        "du må være innlogget",
        "denne artikkelen er for abonnenter",
        "kjøp tilgang",
        "buy access",
        "unlock this article",
        "lås opp artikkelen",
        "registrer deg for å lese",
        "create an account to read",
        "meld deg inn",
        "bli abonnent",
        "allerede abonnent",
        "already a subscriber",
    ];
    for pattern in &paywall_content_patterns {
        if content_lower.contains(pattern) {
            debug!(pattern, "Betalingsmur-mønster funnet i innhold");
            return true;
        }
    }
    // 3. Known paywall classes/elements in the HTML (incl. Piano/tp-modal
    // and Norwegian publisher-specific markers).
    let paywall_html_patterns = [
        "class=\"paywall",
        "class=\"pw-",
        "id=\"paywall",
        "data-paywall",
        "class=\"subscriber-only",
        "class=\"premium-content",
        "class=\"locked-content",
        "class=\"article-paywall",
        "class=\"metered-",
        "class=\"piano-",
        "id=\"piano-",
        "class=\"tp-modal",
        "data-piano",
        "class=\"ns-paywall",
        "class=\"nettavisen-paywall",
        "class=\"schibsted-paywall",
        "class=\"amedia-paywall",
    ];
    for pattern in &paywall_html_patterns {
        if html_lower.contains(pattern) {
            debug!(pattern, "Betalingsmur-mønster funnet i HTML");
            return true;
        }
    }
    // 4. Meta tags / JSON-LD indicating paid content. The lowercase needle
    // matches schema.org "isAccessibleForFree" in the lowercased HTML.
    if html_lower.contains(r#"content="locked""#)
        || html_lower.contains(r#"content="premium""#)
        || (html_lower.contains("isaccessibleforfree")
            && html_lower.contains(r#""false""#))
    {
        debug!("Betalingsmur-meta-tagg funnet");
        return true;
    }
    false
}