AI-aliaser: sidelinja/rutine → synops/low, sidelinja/resonering → synops/high. Fire nivåer i LiteLLM: low/medium/high/extreme. Oppdatert i: LiteLLM config, PG ai_job_routing, all Rust-kode (maskinrommet + 5 CLI-verktøy). Domener: sidelinja.org → synops.no i fallback-URLer, health-sjekker, LiveKit WSS, bandwidth-logger, docs/erfaringer, docs/setup, reference/server-state, .env.example. Docker container-navn (sidelinja-*) beholdes — styrt av COMPOSE_PROJECT_NAME i /srv/synops/.env, endres separat ved behov. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1034 lines
31 KiB
Rust
1034 lines
31 KiB
Rust
// synops-clip — Hent og parse webartikler til ren tekst + metadata.
|
||
//
|
||
// Input: --url <url>
|
||
// Output: JSON til stdout med title, author, date, content, url, paywall
|
||
//
|
||
// Med --write: opprett content-node i PG med artikkelinnhold,
|
||
// tagged-edge "clipped", AI-oppsummering via LiteLLM, og
|
||
// mentions-edges til gjenkjente entiteter i kunnskapsgrafen.
|
||
//
|
||
// Strategi:
|
||
// 1. Hent HTML med reqwest (vanlig HTTP-klient)
|
||
// 2. Parse med Mozilla Readability via Node.js-hjelpeskript
|
||
// 3. Hvis Readability feiler eller innholdet er for kort, prøv Playwright (headless browser)
|
||
// 4. Detekter betalingsmur basert på innholdslengde og kjente mønstre
|
||
// 5. (--write) Opprett node, edges, AI-oppsummering
|
||
//
|
||
// Miljøvariabler:
|
||
// SYNOPS_CLIP_SCRIPTS — Sti til scripts/-mappen (default: ved siden av binæren)
|
||
// DATABASE_URL — PostgreSQL-tilkobling (påkrevd med --write)
|
||
// AI_GATEWAY_URL — LiteLLM gateway (default: http://localhost:4000)
|
||
// LITELLM_MASTER_KEY — API-nøkkel for LiteLLM
|
||
// AI_CLIP_MODEL — Modellalias for oppsummering (default: synops/low)
|
||
// RUST_LOG — Loggnivå (default: synops_clip=info)
|
||
//
|
||
// Ref: docs/retninger/unix_filosofi.md
|
||
|
||
use clap::Parser;
|
||
use regex::Regex;
|
||
use serde::{Deserialize, Serialize};
|
||
use std::path::PathBuf;
|
||
use std::process;
|
||
use tracing::{debug, info, warn};
|
||
use uuid::Uuid;
|
||
|
||
// CLI argument definition for synops-clip. NOTE: the `///` doc comments on
// the fields below double as clap's --help text at runtime, so their
// Norwegian wording is deliberately left unchanged.
/// Hent og parse webartikler til ren tekst + metadata.
#[derive(Parser)]
#[command(name = "synops-clip", about = "Hent og parse webartikler")]
struct Cli {
    /// URL som skal hentes og parses
    #[arg(long)]
    url: String,

    /// Timeout i sekunder for HTTP-forespørsler
    #[arg(long, default_value = "30")]
    timeout: u64,

    /// Tving bruk av Playwright (headless browser) i stedet for vanlig HTTP
    #[arg(long)]
    playwright: bool,

    /// Skriv resultat til database som content-node med edges
    #[arg(long)]
    write: bool,

    /// Bruker-ID som clipper (påkrevd med --write)
    #[arg(long)]
    created_by: Option<Uuid>,

    /// Jobb-payload som JSON (for maskinrommet/jobbkø)
    #[arg(long)]
    payload_json: Option<String>,
}
|
||
|
||
/// Result of Readability parsing, deserialized from the JSON the Node.js
/// helper scripts print on stdout.
///
/// All fields are optional: the scripts emit whatever they could extract,
/// and set `error` when parsing failed. Fields not read by this binary are
/// kept for schema completeness (hence `#[allow(dead_code)]`).
#[derive(Deserialize)]
#[allow(dead_code)]
struct ReadabilityResult {
    title: Option<String>,
    author: Option<String>,
    // Extracted article body text.
    content: Option<String>,
    excerpt: Option<String>,
    // The helper emits camelCase; rename keeps Rust naming conventions.
    #[serde(rename = "siteName")]
    site_name: Option<String>,
    length: Option<usize>,
    // Error message from the helper script when parsing failed.
    error: Option<String>,
}
|
||
|
||
/// Payload from the job queue (`--payload-json`). Mirrors the CLI flags so
/// the job runner ("maskinrommet") can drive the tool programmatically;
/// when present it replaces the individual CLI flags entirely (see `main`).
#[derive(Deserialize)]
struct JobPayload {
    url: String,
    #[serde(default)]
    playwright: bool,
    // Defaults to 30 s when omitted — matches the CLI default.
    #[serde(default = "default_timeout")]
    timeout: u64,
    #[serde(default)]
    write: bool,
    created_by: Option<Uuid>,
}
|
||
|
||
/// Serde default for `JobPayload::timeout` — 30 seconds, matching the CLI
/// `--timeout` default.
fn default_timeout() -> u64 {
    30
}
|
||
|
||
/// Final output format, serialized as pretty-printed JSON to stdout.
#[derive(Serialize, Clone)]
struct ClipOutput {
    url: String,
    title: Option<String>,
    author: Option<String>,
    // Publication date, normalized to YYYY-MM-DD when possible (see extract_date).
    date: Option<String>,
    // Cleaned article body text (see clean_text).
    content: String,
    excerpt: Option<String>,
    // True when a paywall was detected (see detect_paywall).
    paywall: bool,
    // Which parse path produced the result: "readability" or "playwright".
    source: String,
}
|
||
|
||
// --- LLM request/response (OpenAI-kompatibel, samme mønster som synops-suggest-edges) ---
|
||
|
||
/// OpenAI-compatible chat completion request sent to the LiteLLM gateway
/// (same pattern as synops-suggest-edges).
#[derive(Serialize)]
struct ChatRequest {
    model: String,
    messages: Vec<ChatMessage>,
    temperature: f32,
    // Always sent as {"type": "json_object"} to force JSON output.
    response_format: ResponseFormat,
}
|
||
|
||
/// OpenAI `response_format` object; the raw identifier `r#type` serializes
/// as the JSON key "type".
#[derive(Serialize)]
struct ResponseFormat {
    r#type: String,
}
|
||
|
||
/// One chat message in the OpenAI wire format ("system"/"user" roles here).
#[derive(Serialize)]
struct ChatMessage {
    role: String,
    content: String,
}
|
||
|
||
/// OpenAI-compatible chat completion response from LiteLLM.
///
/// `usage` and `model` default to `None` so responses that omit them still
/// deserialize.
#[derive(Deserialize)]
struct ChatResponse {
    choices: Vec<Choice>,
    #[serde(default)]
    usage: Option<UsageInfo>,
    #[serde(default)]
    model: Option<String>,
}
|
||
|
||
/// Token usage reported by the gateway; used for resource-usage logging.
/// Fields default to 0 when absent.
#[derive(Deserialize, Clone)]
struct UsageInfo {
    #[serde(default)]
    prompt_tokens: i64,
    #[serde(default)]
    completion_tokens: i64,
}
|
||
|
||
/// One completion choice; only the first is read (see call_llm_analysis).
#[derive(Deserialize)]
struct Choice {
    message: MessageContent,
}
|
||
|
||
/// The assistant message body; `content` may be absent on some responses.
#[derive(Deserialize)]
struct MessageContent {
    content: Option<String>,
}
|
||
|
||
/// The AI analysis output: summary + entities, parsed from the model's
/// JSON reply. Every field defaults so a partially valid reply still
/// deserializes instead of failing the whole clip.
#[derive(Deserialize, Debug)]
struct AiAnalysis {
    #[serde(default)]
    summary: String,
    #[serde(default)]
    topics: Vec<String>,
    #[serde(default)]
    mentions: Vec<MentionSuggestion>,
}
|
||
|
||
/// A named entity suggested by the model. `entity_type` falls back to
/// "person" when the model omits it (see default_entity_type).
#[derive(Deserialize, Debug)]
struct MentionSuggestion {
    name: String,
    #[serde(default = "default_entity_type")]
    entity_type: String,
}
|
||
|
||
/// Serde default for `MentionSuggestion::entity_type`.
fn default_entity_type() -> String {
    "person".to_string()
}
|
||
|
||
/// Row projection for existing topic nodes (id + title), used for
/// case-insensitive matching against AI-suggested topics.
#[derive(sqlx::FromRow)]
struct TopicRow {
    id: Uuid,
    title: String,
}
|
||
|
||
const CLIP_SYSTEM_PROMPT: &str = r#"Du er en innholdsanalysator for en norsk redaksjonsplattform. Du skal:
|
||
|
||
1. Lage en kort oppsummering (2–4 setninger) av artikkelen på norsk.
|
||
2. Ekstrahere topics (emner/temaer). Bruk korte, presise norske termer. Maks 5.
|
||
3. Ekstrahere mentions (navngitte entiteter: personer, organisasjoner, steder).
|
||
|
||
Returner KUN et JSON-objekt:
|
||
{
|
||
"summary": "Kort oppsummering av artikkelen.",
|
||
"topics": ["emne1", "emne2"],
|
||
"mentions": [{"name": "Navn", "entity_type": "person"}]
|
||
}
|
||
|
||
Regler:
|
||
- Oppsummeringen skal fange artikkelens kjerne og være nyttig uten å lese originalen.
|
||
- Bruk eksisterende topics fra listen nedenfor der det passer.
|
||
- entity_type er en av: "person", "organisasjon", "sted", "konsept".
|
||
- Returner tomme lister hvis teksten ikke har meningsfullt innhold."#;
|
||
|
||
/// Entry point: resolve effective parameters (CLI flags or `--payload-json`
/// from the job queue), run the clip, and either print the result or
/// persist it to the database. All failures print to stderr and exit 1.
#[tokio::main]
async fn main() {
    synops_common::logging::init("synops_clip");

    let cli = Cli::parse();

    // Support --payload-json from the job queue: when present it replaces
    // the individual CLI flags entirely.
    let (url, use_playwright, timeout, write, created_by) =
        if let Some(ref payload) = cli.payload_json {
            match serde_json::from_str::<JobPayload>(payload) {
                Ok(p) => (p.url, p.playwright, p.timeout, p.write, p.created_by),
                Err(e) => {
                    eprintln!("Ugyldig payload-json: {e}");
                    process::exit(1);
                }
            }
        } else {
            (
                cli.url.clone(),
                cli.playwright,
                cli.timeout,
                cli.write,
                cli.created_by,
            )
        };

    // --write requires a creator id; validate before doing any network work.
    if write && created_by.is_none() {
        eprintln!("Feil: --created-by er påkrevd sammen med --write");
        process::exit(1);
    }

    match run(&url, use_playwright, timeout).await {
        Ok(output) => {
            if write {
                // Persist to PG; on success the DB summary JSON is printed
                // instead of the raw clip output. unwrap() is safe: the
                // None case was rejected above.
                match write_to_db(&output, created_by.unwrap()).await {
                    Ok(result) => {
                        println!("{}", serde_json::to_string_pretty(&result).unwrap());
                    }
                    Err(e) => {
                        eprintln!("Feil ved skriving til database: {e}");
                        process::exit(1);
                    }
                }
            } else {
                println!("{}", serde_json::to_string_pretty(&output).unwrap());
            }
        }
        Err(e) => {
            eprintln!("Feil: {e}");
            process::exit(1);
        }
    }
}
|
||
|
||
/// Write the clip result to the database: content node, "clipped"
/// tagged-edge, AI analysis, and mentions-edges to recognized entities.
///
/// The AI analysis is best-effort — on failure the clip is stored without
/// a summary. Database errors abort and are returned as strings. Returns a
/// JSON summary of what was created.
async fn write_to_db(
    output: &ClipOutput,
    created_by: Uuid,
) -> Result<serde_json::Value, String> {
    let db = synops_common::db::connect().await?;

    let node_id = Uuid::now_v7();

    // 1. AI analysis: summary + entities. Fall back to an empty analysis
    //    so the clip itself is still persisted even if the LLM call fails.
    let (analysis, ai_usage, ai_model) = match call_llm_analysis(output, &db).await {
        Ok(result) => result,
        Err(e) => {
            warn!("AI-analyse feilet, fortsetter uten: {e}");
            (
                AiAnalysis {
                    summary: String::new(),
                    topics: vec![],
                    mentions: vec![],
                },
                None,
                None,
            )
        }
    };

    // 2. Create the content node. Provenance (source URL, original
    //    author/date, paywall flag, fetch path) goes into metadata.
    let metadata = serde_json::json!({
        "source_url": output.url,
        "original_author": output.author,
        "original_date": output.date,
        "paywall": output.paywall,
        "clip_source": output.source,
    });

    // Prepend the AI summary to the article text when one was produced.
    let node_content = if !analysis.summary.is_empty() {
        format!("{}\n\n---\n\n{}", analysis.summary, output.content)
    } else {
        output.content.clone()
    };

    // Fall back to a truncated URL when the article had no title.
    let title = output
        .title
        .clone()
        .unwrap_or_else(|| format!("Clip: {}", truncate_str(&output.url, 60)));

    // Node starts out 'hidden' — the clipping user decides visibility later.
    sqlx::query(
        r#"
        INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
        VALUES ($1, 'content', $2, $3, 'hidden'::visibility, $4, $5)
        "#,
    )
    .bind(node_id)
    .bind(&title)
    .bind(&node_content)
    .bind(&metadata)
    .bind(created_by)
    .execute(&db)
    .await
    .map_err(|e| format!("PG insert node feilet: {e}"))?;

    info!(node_id = %node_id, title = %title, "Content-node opprettet");

    // 3. Create the "clipped" tagged-edge. Note this is a self-edge:
    //    source_id and target_id both bind $2 (the new node).
    let tagged_edge_id = Uuid::now_v7();
    sqlx::query(
        r#"
        INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by)
        VALUES ($1, $2, $2, 'tagged', $3, false, $4)
        ON CONFLICT (source_id, target_id, edge_type) DO NOTHING
        "#,
    )
    .bind(tagged_edge_id)
    .bind(node_id)
    .bind(serde_json::json!({"tag": "clipped"}))
    .bind(created_by)
    .execute(&db)
    .await
    .map_err(|e| format!("PG insert tagged-edge feilet: {e}"))?;

    info!(node_id = %node_id, "Tagged-edge 'clipped' opprettet");

    // 4. Create mentions-edges to recognized entities.
    //    NOTE: topics_created also counts entity nodes created in the
    //    mentions loop below — it is a "nodes created" counter.
    let mut topics_created = 0u32;
    let mut edges_created = 0u32;

    // Fetch recent topic nodes for case-insensitive title matching.
    let existing_topics = sqlx::query_as::<_, TopicRow>(
        "SELECT id, title FROM nodes WHERE node_kind = 'topic' ORDER BY created_at DESC LIMIT 200",
    )
    .fetch_all(&db)
    .await
    .map_err(|e| format!("PG-feil ved henting av topics: {e}"))?;

    // Process topics from the AI analysis: reuse an existing topic node
    // when the title matches, otherwise create a new one.
    for topic_name in &analysis.topics {
        let topic_name = topic_name.trim();
        if topic_name.is_empty() {
            continue;
        }

        let existing = existing_topics
            .iter()
            .find(|t| t.title.to_lowercase() == topic_name.to_lowercase());

        let topic_id = if let Some(t) = existing {
            t.id
        } else {
            let new_id = Uuid::now_v7();
            create_topic_node(&db, new_id, topic_name, created_by).await?;
            topics_created += 1;
            new_id
        };

        if create_mentions_edge(&db, node_id, topic_id, created_by).await? {
            edges_created += 1;
        }
    }

    // Process mentions (named entities). Unlike the topics loop, the
    // existence check queries PG directly instead of the cached list.
    for mention in &analysis.mentions {
        let name = mention.name.trim();
        if name.is_empty() {
            continue;
        }

        let existing_entity = sqlx::query_scalar::<_, Uuid>(
            "SELECT id FROM nodes WHERE node_kind = 'topic' AND LOWER(title) = LOWER($1) LIMIT 1",
        )
        .bind(name)
        .fetch_optional(&db)
        .await
        .map_err(|e| format!("PG-feil ved entitet-søk: {e}"))?;

        let entity_id = if let Some(id) = existing_entity {
            id
        } else {
            let new_id = Uuid::now_v7();
            create_entity_node(&db, new_id, name, &mention.entity_type, created_by).await?;
            topics_created += 1;
            new_id
        };

        if create_mentions_edge(&db, node_id, entity_id, created_by).await? {
            edges_created += 1;
        }
    }

    info!(
        node_id = %node_id,
        topics_created,
        edges_created,
        "Clip skrevet til database"
    );

    // 5. Log AI resource usage — only when the gateway reported any
    //    tokens. Logging failure is non-fatal (warn only).
    let tokens_in = ai_usage.as_ref().map(|u| u.prompt_tokens).unwrap_or(0);
    let tokens_out = ai_usage.as_ref().map(|u| u.completion_tokens).unwrap_or(0);
    let model_id = ai_model.unwrap_or_else(|| "unknown".to_string());

    if tokens_in > 0 || tokens_out > 0 {
        if let Err(e) = sqlx::query(
            "INSERT INTO resource_usage_log (target_node_id, triggered_by, resource_type, detail)
             VALUES ($1, $2, $3, $4)",
        )
        .bind(node_id)
        .bind(Some(created_by))
        .bind("ai")
        .bind(serde_json::json!({
            "model_id": model_id,
            "tokens_in": tokens_in,
            "tokens_out": tokens_out,
            "job_type": "synops_clip"
        }))
        .execute(&db)
        .await
        {
            warn!(error = %e, "Kunne ikke logge AI-ressursforbruk");
        }
    }

    Ok(serde_json::json!({
        "status": "completed",
        "node_id": node_id.to_string(),
        "url": output.url,
        "title": title,
        "summary": analysis.summary,
        "content_length": output.content.len(),
        "paywall": output.paywall,
        "topics_created": topics_created,
        "edges_created": edges_created,
        "model": model_id,
        "tokens_in": tokens_in,
        "tokens_out": tokens_out,
    }))
}
|
||
|
||
/// Call LiteLLM for summary and entity extraction.
///
/// Reads AI_GATEWAY_URL (default http://localhost:4000), LITELLM_MASTER_KEY
/// and AI_CLIP_MODEL (default "synops/low") from the environment. Existing
/// topic titles are fetched from PG and appended to the prompt so the model
/// can reuse them. Returns the parsed analysis plus the token usage and
/// model id reported by the gateway.
async fn call_llm_analysis(
    output: &ClipOutput,
    db: &sqlx::PgPool,
) -> Result<(AiAnalysis, Option<UsageInfo>, Option<String>), String> {
    let gateway_url =
        std::env::var("AI_GATEWAY_URL").unwrap_or_else(|_| "http://localhost:4000".to_string());
    let api_key = std::env::var("LITELLM_MASTER_KEY").unwrap_or_default();
    let model =
        std::env::var("AI_CLIP_MODEL").unwrap_or_else(|_| "synops/low".to_string());

    // Fetch existing topics as context for the prompt.
    let existing_topics = sqlx::query_scalar::<_, String>(
        "SELECT title FROM nodes WHERE node_kind = 'topic' ORDER BY created_at DESC LIMIT 100",
    )
    .fetch_all(db)
    .await
    .map_err(|e| format!("PG-feil ved henting av topics: {e}"))?;

    let topic_context = if existing_topics.is_empty() {
        String::new()
    } else {
        format!("\n\nEksisterende topics: {}", existing_topics.join(", "))
    };

    // Cap content length sent to the LLM (~4000 chars, word-boundary cut).
    let content_for_llm = truncate_str(&output.content, 4000);

    let user_content = format!(
        "Artikkel:\nTittel: {}\nForfatter: {}\n\n{}{topic_context}",
        output.title.as_deref().unwrap_or("(ukjent)"),
        output.author.as_deref().unwrap_or("(ukjent)"),
        content_for_llm,
    );

    info!("Sender til LLM for oppsummering og entitetsekstraksjon");

    // Low temperature + json_object response format for deterministic,
    // machine-parseable output.
    let request = ChatRequest {
        model,
        messages: vec![
            ChatMessage {
                role: "system".to_string(),
                content: CLIP_SYSTEM_PROMPT.to_string(),
            },
            ChatMessage {
                role: "user".to_string(),
                content: user_content,
            },
        ],
        temperature: 0.2,
        response_format: ResponseFormat {
            r#type: "json_object".to_string(),
        },
    };

    let client = reqwest::Client::new();
    let url = format!("{gateway_url}/v1/chat/completions");

    let resp = client
        .post(&url)
        .header("Authorization", format!("Bearer {api_key}"))
        .header("Content-Type", "application/json")
        .json(&request)
        .timeout(std::time::Duration::from_secs(60))
        .send()
        .await
        .map_err(|e| format!("LiteLLM-kall feilet: {e}"))?;

    if !resp.status().is_success() {
        let status = resp.status();
        let body = resp.text().await.unwrap_or_default();
        return Err(format!("LiteLLM returnerte {status}: {body}"));
    }

    let chat_resp: ChatResponse = resp
        .json()
        .await
        .map_err(|e| format!("Kunne ikke parse LiteLLM-respons: {e}"))?;

    // Only the first choice is used.
    let content = chat_resp
        .choices
        .first()
        .and_then(|c| c.message.content.as_deref())
        .ok_or("LiteLLM returnerte ingen content")?;

    // The raw model output is included in the error to ease debugging of
    // malformed JSON replies.
    let analysis: AiAnalysis = serde_json::from_str(content)
        .map_err(|e| format!("Kunne ikke parse LLM JSON: {e}. Rå output: {content}"))?;

    info!(
        summary_len = analysis.summary.len(),
        topics = analysis.topics.len(),
        mentions = analysis.mentions.len(),
        "LLM-analyse mottatt"
    );

    Ok((analysis, chat_resp.usage, chat_resp.model))
}
|
||
|
||
/// Create a topic node with the given id and title.
///
/// The node is marked `ai_generated` in metadata and inserted with
/// 'discoverable' visibility; a duplicate id is silently ignored
/// (ON CONFLICT DO NOTHING).
async fn create_topic_node(
    db: &sqlx::PgPool,
    id: Uuid,
    title: &str,
    created_by: Uuid,
) -> Result<(), String> {
    let metadata = serde_json::json!({"ai_generated": true});

    sqlx::query(
        r#"
        INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
        VALUES ($1, 'topic', $2, '', 'discoverable', $3, $4)
        ON CONFLICT (id) DO NOTHING
        "#,
    )
    .bind(id)
    .bind(title)
    .bind(&metadata)
    .bind(created_by)
    .execute(db)
    .await
    .map_err(|e| format!("PG insert topic feilet: {e}"))?;

    info!(topic_id = %id, title = %title, "Ny topic-node opprettet");
    Ok(())
}
|
||
|
||
/// Create an entity node (person, organization, place, concept).
///
/// Stored with node_kind 'topic' like plain topics — the distinction
/// lives in `metadata.entity_type`. Marked `ai_generated`; a duplicate id
/// is silently ignored (ON CONFLICT DO NOTHING).
async fn create_entity_node(
    db: &sqlx::PgPool,
    id: Uuid,
    name: &str,
    entity_type: &str,
    created_by: Uuid,
) -> Result<(), String> {
    let metadata = serde_json::json!({
        "ai_generated": true,
        "entity_type": entity_type
    });

    sqlx::query(
        r#"
        INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
        VALUES ($1, 'topic', $2, '', 'discoverable', $3, $4)
        ON CONFLICT (id) DO NOTHING
        "#,
    )
    .bind(id)
    .bind(name)
    .bind(&metadata)
    .bind(created_by)
    .execute(db)
    .await
    .map_err(|e| format!("PG insert entity feilet: {e}"))?;

    info!(entity_id = %id, name = %name, entity_type = %entity_type, "Ny entitet-node opprettet");
    Ok(())
}
|
||
|
||
/// Create a mentions-edge from `source_id` to `target_id`.
///
/// Idempotent: ON CONFLICT DO NOTHING means re-running for an existing
/// (source, target, 'mentions') triple is a no-op. Returns `true` only
/// when a new edge row was actually inserted (rows_affected > 0).
async fn create_mentions_edge(
    db: &sqlx::PgPool,
    source_id: Uuid,
    target_id: Uuid,
    created_by: Uuid,
) -> Result<bool, String> {
    let edge_id = Uuid::now_v7();
    // Origin marker lets downstream tooling distinguish AI-created edges.
    let metadata = serde_json::json!({"origin": "ai_clip"});

    let result = sqlx::query(
        r#"
        INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by)
        VALUES ($1, $2, $3, 'mentions', $4, false, $5)
        ON CONFLICT (source_id, target_id, edge_type) DO NOTHING
        "#,
    )
    .bind(edge_id)
    .bind(source_id)
    .bind(target_id)
    .bind(&metadata)
    .bind(created_by)
    .execute(db)
    .await
    .map_err(|e| format!("PG insert mentions-edge feilet: {e}"))?;

    let created = result.rows_affected() > 0;
    if created {
        info!(source = %source_id, target = %target_id, "Mentions-edge opprettet");
    }
    Ok(created)
}
|
||
|
||
/// Truncate `s` to at most `max_len` bytes without splitting a word or a
/// UTF-8 character.
///
/// Returns `s` unchanged when it already fits. Otherwise cuts at the last
/// space before the limit; with no space, cuts at the largest char
/// boundary at or below `max_len`.
///
/// The original sliced at the raw byte index (`&s[..max_len]`), which
/// panics when that index falls inside a multi-byte character — a real
/// hazard for Norwegian text (æ/ø/å) in titles and URLs.
fn truncate_str(s: &str, max_len: usize) -> &str {
    if s.len() <= max_len {
        return s;
    }
    // Walk the cut point back to a valid char boundary before slicing.
    let mut cut = max_len;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    // Prefer the last space before the boundary to avoid mid-word cuts.
    match s[..cut].rfind(' ') {
        Some(pos) => &s[..pos],
        None => &s[..cut],
    }
}
|
||
|
||
/// Orchestrate one clip: fetch HTML, parse with Readability, fall back to
/// Playwright when needed, then extract the date and detect paywalls.
async fn run(url: &str, use_playwright: bool, timeout: u64) -> Result<ClipOutput, String> {
    let scripts_dir = find_scripts_dir()?;
    info!(url, "Starter clip");

    // Step 1: fetch HTML — forced Playwright, or plain HTTP with a
    // Playwright fallback when the HTTP fetch fails.
    let html = if use_playwright {
        info!("Bruker Playwright for å hente side");
        fetch_with_playwright(&scripts_dir, url, timeout).await?
    } else {
        match fetch_html(url, timeout).await {
            Ok(html) => html,
            Err(e) => {
                warn!("HTTP-henting feilet: {e}, prøver Playwright");
                fetch_with_playwright(&scripts_dir, url, timeout).await?
            }
        }
    };

    // Step 2: parse with Readability (Node.js helper).
    let parsed = parse_with_readability(&scripts_dir, url, &html).await?;

    // Step 3: if Readability reported an error, retry via the combined
    // Playwright+Readability path — unless Playwright was already used.
    // NOTE(review): the file header also promises a fallback on
    // too-short content, but only `parsed.error` triggers it here.
    let (parsed, source) = if parsed.error.is_some() && !use_playwright {
        warn!("Readability feilet, prøver Playwright-fallback");
        let result = run_playwright_readability(&scripts_dir, url, timeout).await?;
        (result, "playwright")
    } else if parsed.error.is_some() {
        return Err(format!(
            "Readability feilet: {}",
            parsed.error.unwrap_or_default()
        ));
    } else {
        (parsed, "readability")
    };

    let content = parsed.content.unwrap_or_default();
    let content = clean_text(&content);

    // Step 4: try to extract a publication date from the raw HTML.
    let date = extract_date(&html);

    // Step 5: paywall detection on cleaned content + raw HTML.
    let paywall = detect_paywall(&content, &html);

    if paywall {
        info!("Betalingsmur detektert");
    }

    Ok(ClipOutput {
        url: url.to_string(),
        title: parsed.title,
        author: parsed.author,
        date,
        content,
        excerpt: parsed.excerpt,
        paywall,
        source: source.to_string(),
    })
}
|
||
|
||
/// Finn scripts/-mappen relativt til binæren eller via miljøvariabel.
|
||
fn find_scripts_dir() -> Result<PathBuf, String> {
|
||
if let Ok(dir) = std::env::var("SYNOPS_CLIP_SCRIPTS") {
|
||
let p = PathBuf::from(dir);
|
||
if p.exists() {
|
||
return Ok(p);
|
||
}
|
||
}
|
||
|
||
// Prøv relativt til binæren: ../scripts/ (når installert) eller
|
||
// fra kilde: tools/synops-clip/scripts/
|
||
let exe = std::env::current_exe().map_err(|e| format!("Fant ikke exe-sti: {e}"))?;
|
||
let exe_dir = exe.parent().ok_or("Ingen parent-mappe for exe")?;
|
||
|
||
// I utvikling: target/release/ → ../../scripts/
|
||
for candidate in &[
|
||
exe_dir.join("../../scripts"),
|
||
exe_dir.join("../scripts"),
|
||
exe_dir.join("scripts"),
|
||
PathBuf::from("/home/vegard/synops/tools/synops-clip/scripts"),
|
||
] {
|
||
let resolved = candidate
|
||
.canonicalize()
|
||
.unwrap_or_else(|_| candidate.clone());
|
||
if resolved.join("readability.mjs").exists() {
|
||
debug!(?resolved, "Fant scripts-mappe");
|
||
return Ok(resolved);
|
||
}
|
||
}
|
||
|
||
Err("Fant ikke scripts/-mappen. Sett SYNOPS_CLIP_SCRIPTS.".to_string())
|
||
}
|
||
|
||
/// Fetch HTML over plain HTTP with reqwest.
///
/// Uses a desktop Chrome user-agent string and Norwegian-first
/// Accept-Language headers, follows up to 10 redirects, and treats any
/// non-2xx status as an error.
async fn fetch_html(url: &str, timeout_secs: u64) -> Result<String, String> {
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(timeout_secs))
        .user_agent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
        .redirect(reqwest::redirect::Policy::limited(10))
        .build()
        .map_err(|e| format!("Kunne ikke opprette HTTP-klient: {e}"))?;

    let resp = client
        .get(url)
        .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        .header("Accept-Language", "nb-NO,nb;q=0.9,no;q=0.8,en;q=0.5")
        .send()
        .await
        .map_err(|e| format!("HTTP-forespørsel feilet: {e}"))?;

    let status = resp.status();
    if !status.is_success() {
        return Err(format!("HTTP {status}"));
    }

    resp.text()
        .await
        .map_err(|e| format!("Kunne ikke lese respons-body: {e}"))
}
|
||
|
||
/// Fetch fully rendered HTML via Playwright (for JS-heavy pages).
///
/// Runs an inline ES-module script with `node -e`; the URL is injected as
/// a JSON-escaped string literal and the timeout is converted to
/// milliseconds. NODE_PATH points at the scripts dir's node_modules so
/// `playwright` resolves. Rendered HTML is read from the child's stdout.
async fn fetch_with_playwright(
    scripts_dir: &PathBuf,
    url: &str,
    timeout: u64,
) -> Result<String, String> {
    let output = tokio::process::Command::new("node")
        .arg("--input-type=module")
        .arg("-e")
        .arg(format!(
            r#"
import {{ chromium }} from 'playwright';
const browser = await chromium.launch({{ headless: true }});
const ctx = await browser.newContext({{
  userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}});
const page = await ctx.newPage();
await page.goto({url}, {{ waitUntil: 'networkidle', timeout: {timeout} }});
const html = await page.content();
process.stdout.write(html);
await browser.close();
"#,
            // serde_json::to_string produces a quoted, escaped JS string
            // literal — prevents script injection via the URL.
            url = serde_json::to_string(url).unwrap(),
            timeout = timeout * 1000,
        ))
        .env(
            "NODE_PATH",
            scripts_dir.join("node_modules").to_string_lossy().to_string(),
        )
        .output()
        .await
        .map_err(|e| format!("Playwright-prosess feilet: {e}"))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(format!("Playwright feilet: {stderr}"));
    }

    String::from_utf8(output.stdout).map_err(|e| format!("Ugyldig UTF-8 fra Playwright: {e}"))
}
|
||
|
||
/// Parse HTML with Mozilla Readability via the readability.mjs helper.
///
/// The HTML is piped to the script on stdin and the result read as JSON
/// from stdout. NOTE(review): the child's exit status is not checked — a
/// crashed script surfaces as a JSON parse error on whatever stdout holds.
async fn parse_with_readability(
    scripts_dir: &PathBuf,
    url: &str,
    html: &str,
) -> Result<ReadabilityResult, String> {
    let script = scripts_dir.join("readability.mjs");

    let mut child = tokio::process::Command::new("node")
        .arg(&script)
        .arg("--url")
        .arg(url)
        .env(
            "NODE_PATH",
            scripts_dir.join("node_modules").to_string_lossy().to_string(),
        )
        .stdin(std::process::Stdio::piped())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped())
        .spawn()
        .map_err(|e| format!("Kunne ikke starte readability.mjs: {e}"))?;

    // Write the HTML to the child's stdin, then drop the handle so the
    // script sees EOF and starts parsing.
    if let Some(mut stdin) = child.stdin.take() {
        use tokio::io::AsyncWriteExt;
        stdin
            .write_all(html.as_bytes())
            .await
            .map_err(|e| format!("Kunne ikke skrive til stdin: {e}"))?;
        drop(stdin);
    }

    let output = child
        .wait_with_output()
        .await
        .map_err(|e| format!("Readability-prosess feilet: {e}"))?;

    let stdout = String::from_utf8_lossy(&output.stdout);
    debug!(stdout = %stdout, "Readability-output");

    serde_json::from_str::<ReadabilityResult>(&stdout)
        .map_err(|e| format!("Kunne ikke parse Readability-output: {e} — raw: {stdout}"))
}
|
||
|
||
/// Run Playwright + Readability in one step via the playwright.mjs helper.
///
/// The timeout is passed to the script in milliseconds. A nonzero exit
/// status is only logged (warn); stdout is parsed as JSON regardless, so
/// the script's own `error` field is the primary failure channel.
async fn run_playwright_readability(
    scripts_dir: &PathBuf,
    url: &str,
    timeout: u64,
) -> Result<ReadabilityResult, String> {
    let script = scripts_dir.join("playwright.mjs");

    let output = tokio::process::Command::new("node")
        .arg(&script)
        .arg("--url")
        .arg(url)
        .arg("--timeout")
        .arg((timeout * 1000).to_string())
        .env(
            "NODE_PATH",
            scripts_dir.join("node_modules").to_string_lossy().to_string(),
        )
        .output()
        .await
        .map_err(|e| format!("Playwright-prosess feilet: {e}"))?;

    let stdout = String::from_utf8_lossy(&output.stdout);

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        warn!(stderr = %stderr, "Playwright feilet");
    }

    serde_json::from_str::<ReadabilityResult>(&stdout)
        .map_err(|e| format!("Kunne ikke parse Playwright-output: {e} — raw: {stdout}"))
}
|
||
|
||
/// Forsøk å ekstrahere publiseringsdato fra HTML-metadata.
|
||
fn extract_date(html: &str) -> Option<String> {
|
||
// Vanlige meta-tagger for publiseringsdato
|
||
let patterns = [
|
||
r#"property="article:published_time"\s+content="([^"]+)""#,
|
||
r#"content="([^"]+)"\s+property="article:published_time""#,
|
||
r#"name="date"\s+content="([^"]+)""#,
|
||
r#"content="([^"]+)"\s+name="date""#,
|
||
r#"name="pubdate"\s+content="([^"]+)""#,
|
||
r#"content="([^"]+)"\s+name="pubdate""#,
|
||
r#"name="publishdate"\s+content="([^"]+)""#,
|
||
r#"itemprop="datePublished"\s+content="([^"]+)""#,
|
||
r#"content="([^"]+)"\s+itemprop="datePublished""#,
|
||
r#"name="DC\.date"\s+content="([^"]+)""#,
|
||
r#""datePublished"\s*:\s*"([^"]+)""#,
|
||
];
|
||
|
||
for pat in &patterns {
|
||
if let Ok(re) = Regex::new(pat) {
|
||
if let Some(caps) = re.captures(html) {
|
||
if let Some(date_str) = caps.get(1) {
|
||
let d = date_str.as_str().to_string();
|
||
// Normaliser til YYYY-MM-DD om mulig
|
||
if d.len() >= 10 {
|
||
return Some(d[..10].to_string());
|
||
}
|
||
return Some(d);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
None
|
||
}
|
||
|
||
/// Normalize text from Readability: trim every line, collapse runs of
/// blank lines to a single separator, and trim the final result.
fn clean_text(text: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();

    for line in text.lines().map(str::trim) {
        if line.is_empty() {
            // Keep at most one blank line between paragraphs.
            if kept.last().map_or(true, |prev| !prev.is_empty()) {
                kept.push("");
            }
        } else {
            kept.push(line);
        }
    }

    kept.join("\n").trim().to_string()
}
|
||
|
||
/// Detekter betalingsmur basert på innhold og HTML-mønstre.
|
||
fn detect_paywall(content: &str, html: &str) -> bool {
|
||
let content_lower = content.to_lowercase();
|
||
let html_lower = html.to_lowercase();
|
||
|
||
// 1. Veldig kort innhold (under 200 tegn) tyder på betalingsmur
|
||
let trimmed_len = content.trim().len();
|
||
if trimmed_len > 0 && trimmed_len < 200 {
|
||
debug!(trimmed_len, "Kort innhold — mulig betalingsmur");
|
||
return true;
|
||
}
|
||
|
||
// 2. Kjente betalingsmur-mønstre i innholdet
|
||
let paywall_content_patterns = [
|
||
"logg inn for å lese",
|
||
"log in to read",
|
||
"sign in to continue",
|
||
"subscribe to continue",
|
||
"abonner for å lese",
|
||
"kun for abonnenter",
|
||
"for subscribers only",
|
||
"du må være innlogget",
|
||
"denne artikkelen er for abonnenter",
|
||
"kjøp tilgang",
|
||
"buy access",
|
||
"unlock this article",
|
||
"lås opp artikkelen",
|
||
"registrer deg for å lese",
|
||
"create an account to read",
|
||
"meld deg inn",
|
||
"bli abonnent",
|
||
"allerede abonnent",
|
||
"already a subscriber",
|
||
];
|
||
|
||
for pattern in &paywall_content_patterns {
|
||
if content_lower.contains(pattern) {
|
||
debug!(pattern, "Betalingsmur-mønster funnet i innhold");
|
||
return true;
|
||
}
|
||
}
|
||
|
||
// 3. Kjente betalingsmur-klasser/elementer i HTML
|
||
let paywall_html_patterns = [
|
||
"class=\"paywall",
|
||
"class=\"pw-",
|
||
"id=\"paywall",
|
||
"data-paywall",
|
||
"class=\"subscriber-only",
|
||
"class=\"premium-content",
|
||
"class=\"locked-content",
|
||
"class=\"article-paywall",
|
||
"class=\"metered-",
|
||
"class=\"piano-",
|
||
"id=\"piano-",
|
||
"class=\"tp-modal",
|
||
"data-piano",
|
||
"class=\"ns-paywall",
|
||
"class=\"nettavisen-paywall",
|
||
"class=\"schibsted-paywall",
|
||
"class=\"amedia-paywall",
|
||
];
|
||
|
||
for pattern in &paywall_html_patterns {
|
||
if html_lower.contains(pattern) {
|
||
debug!(pattern, "Betalingsmur-mønster funnet i HTML");
|
||
return true;
|
||
}
|
||
}
|
||
|
||
// 4. Sjekk for meta-tagger som indikerer betalt innhold
|
||
if html_lower.contains(r#"content="locked""#)
|
||
|| html_lower.contains(r#"content="premium""#)
|
||
|| html_lower.contains("isAccessibleForFree")
|
||
&& html_lower.contains(r#""false""#)
|
||
{
|
||
debug!("Betalingsmur-meta-tagg funnet");
|
||
return true;
|
||
}
|
||
|
||
false
|
||
}
|