From 1d8ebf259b6b844ef0559f58b0190ae1c9d1390c Mon Sep 17 00:00:00 2001 From: vegard Date: Wed, 18 Mar 2026 21:07:00 +0000 Subject: [PATCH] =?UTF-8?q?Skjermklipp-input:=20paste=20screenshot=20i=20c?= =?UTF-8?q?hat=20=E2=86=92=20CAS=20=E2=86=92=20media-node=20(oppgave=2029.?= =?UTF-8?q?1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Frontend: - ChatInput: paste-handler detekterer bilder fra clipboard (ClipboardEvent), laster opp til CAS via uploadMedia med metadata_extra { source: "screenshot" } - Chat-side: viser bildenoder inline med AI-beskrivelse når tilgjengelig - api.ts: uploadMedia støtter nå metadata_extra for ekstra node-metadata Backend (maskinrommet): - upload_media: nytt metadata_extra multipart-felt som merges inn i media-nodens metadata (f.eks. source, description) - describe_image: ny jobbtype — enqueues automatisk for screenshot-uploads, kaller synops-ai med --image for AI-beskrivelse av bildet - Beskrivelsen lagres tilbake i media-nodens metadata.description synops-ai: - Nytt --image flag for multimodal LLM-kall (vision) via LiteLLM - Sender bilde som base64 data-URL i OpenAI-kompatibelt format - Brukes av describe_image-jobben for bildebeskrivelse --- frontend/src/lib/api.ts | 3 + frontend/src/lib/components/ChatInput.svelte | 140 ++++++++++++++----- frontend/src/routes/chat/[id]/+page.svelte | 42 +++++- maskinrommet/src/describe_image.rs | 108 ++++++++++++++ maskinrommet/src/intentions.rs | 65 ++++++++- maskinrommet/src/jobs.rs | 3 + maskinrommet/src/main.rs | 1 + tasks.md | 3 +- tools/synops-ai/Cargo.lock | 1 + tools/synops-ai/Cargo.toml | 1 + tools/synops-ai/src/main.rs | 86 +++++++++++- 11 files changed, 403 insertions(+), 50 deletions(-) create mode 100644 maskinrommet/src/describe_image.rs diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 1a52027..8f1e9e0 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -239,6 +239,8 @@ export interface UploadMediaRequest { 
source_id?: string; visibility?: string; title?: string; + /** Ekstra metadata som merges inn i media-nodens metadata (f.eks. { source: "screenshot" }) */ + metadata_extra?: Record; } export interface UploadMediaResponse { @@ -258,6 +260,7 @@ export async function uploadMedia( if (data.source_id) form.append('source_id', data.source_id); if (data.visibility) form.append('visibility', data.visibility); if (data.title) form.append('title', data.title); + if (data.metadata_extra) form.append('metadata_extra', JSON.stringify(data.metadata_extra)); const res = await fetch(`${BASE_URL}/intentions/upload_media`, { method: 'POST', diff --git a/frontend/src/lib/components/ChatInput.svelte b/frontend/src/lib/components/ChatInput.svelte index f455afa..3c37768 100644 --- a/frontend/src/lib/components/ChatInput.svelte +++ b/frontend/src/lib/components/ChatInput.svelte @@ -1,5 +1,6 @@ -
-
- + {#if error} +

{error}

+ {/if} +
+ - {#if error} -

{error}

- {/if} + onerror={(msg) => { error = msg; }} + /> +
- { error = msg; }} - /> - diff --git a/frontend/src/routes/chat/[id]/+page.svelte b/frontend/src/routes/chat/[id]/+page.svelte index 1b2539f..a4008dd 100644 --- a/frontend/src/routes/chat/[id]/+page.svelte +++ b/frontend/src/routes/chat/[id]/+page.svelte @@ -154,6 +154,31 @@ } catch { return false; } } + /** Check if this node is an image media node */ + function isImageNode(node: Node): boolean { + if (node.nodeKind !== 'media') return false; + try { + const meta = JSON.parse(node.metadata ?? '{}'); + return typeof meta.mime === 'string' && meta.mime.startsWith('image/'); + } catch { return false; } + } + + /** Get CAS image URL for a media node */ + function imageSrc(node: Node): string { + try { + const meta = JSON.parse(node.metadata ?? '{}'); + return casUrl(meta.cas_hash); + } catch { return ''; } + } + + /** Get AI-generated description for an image node */ + function imageDescription(node: Node): string | undefined { + try { + const meta = JSON.parse(node.metadata ?? '{}'); + return meta.description; + } catch { return undefined; } + } + /** Get CAS audio URL for a media node */ function audioSrc(node: Node): string { try { @@ -234,9 +259,10 @@ {#each messages as msg (msg.id)} {@const own = isOwnMessage(msg)} {@const audio = isAudioNode(msg)} + {@const image = isImageNode(msg)} {@const bot = isAgentMessage(msg)}
-
+
{#if !own}

{#if bot}🤖 {/if}{senderName(msg)} @@ -258,12 +284,24 @@ accessToken={hasSegments(msg) ? accessToken : undefined} compact /> + {:else if image} +

+ {imageDescription(msg) + {#if imageDescription(msg)} +

{imageDescription(msg)}

+ {/if} +
{:else}

{msg.content || ''}

{/if} -

+

{formatTime(msg)}

diff --git a/maskinrommet/src/describe_image.rs b/maskinrommet/src/describe_image.rs new file mode 100644 index 0000000..b924096 --- /dev/null +++ b/maskinrommet/src/describe_image.rs @@ -0,0 +1,108 @@ +// describe_image — AI-beskrivelse av bilder via synops-ai CLI. +// +// Jobbtype: "describe_image" +// Payload: { "media_node_id", "cas_hash", "mime" } +// +// Flyten: +// 1. Sender bildefilen (CAS-sti) til synops-ai prompt --image +// 2. synops-ai sender multimodal melding til LLM via LiteLLM +// 3. Oppdaterer media-nodens metadata med description-feltet +// +// Ref: docs/retninger/unix_filosofi.md, oppgave 29.1 + +use sqlx::PgPool; +use uuid::Uuid; + +use crate::cas::CasStore; +use crate::jobs::JobRow; + +/// Håndterer describe_image-jobb. +/// +/// Sender bildet til synops-ai for beskrivelse via vision-modell, +/// og lagrer resultatet i media-nodens metadata. +pub async fn handle_describe_image( + job: &JobRow, + db: &PgPool, + cas: &CasStore, +) -> Result { + let media_node_id: Uuid = job.payload["media_node_id"] + .as_str() + .and_then(|s| s.parse().ok()) + .ok_or("Mangler gyldig 'media_node_id' i payload")?; + + let cas_hash = job.payload["cas_hash"] + .as_str() + .ok_or("Mangler 'cas_hash' i payload")?; + + let mime = job.payload["mime"] + .as_str() + .unwrap_or("image/png"); + + // Verifiser at bildet finnes i CAS + let cas_path = cas.path_for(cas_hash); + if !cas_path.exists() { + return Err(format!("Bilde ikke funnet i CAS: {cas_hash}")); + } + + let bin = std::env::var("SYNOPS_AI_BIN") + .unwrap_or_else(|_| "synops-ai".to_string()); + + tracing::info!( + media_node_id = %media_node_id, + cas_hash, + "Sender bilde til AI for beskrivelse" + ); + + let output = tokio::process::Command::new(&bin) + .arg("prompt") + .arg("--prompt") + .arg("Beskriv dette bildet kort og konsist på norsk. Fokuser på innholdet, ikke formatet. 
Maks 2-3 setninger.") + .arg("--image") + .arg(cas_path.to_str().unwrap_or("")) + .arg("--image-mime") + .arg(mime) + .arg("--job-type") + .arg("describe_image") + .arg("--temperature") + .arg("0.3") + .output() + .await + .map_err(|e| format!("Kunne ikke starte {bin}: {e}"))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!("synops-ai feilet: {stderr}")); + } + + let description = String::from_utf8_lossy(&output.stdout).trim().to_string(); + + if description.is_empty() { + return Err("synops-ai returnerte tom beskrivelse".into()); + } + + tracing::info!( + media_node_id = %media_node_id, + description_len = description.len(), + "Bildebeskrivelse mottatt fra AI" + ); + + // Oppdater media-nodens metadata med description + sqlx::query( + r#"UPDATE nodes + SET metadata = metadata || jsonb_build_object('description', $2::text), + updated_at = now() + WHERE id = $1"#, + ) + .bind(media_node_id) + .bind(&description) + .execute(db) + .await + .map_err(|e| format!("Kunne ikke oppdatere media-node med beskrivelse: {e}"))?; + + // PG NOTIFY-trigger på nodes-tabellen sender sanntidsoppdatering automatisk. + + Ok(serde_json::json!({ + "media_node_id": media_node_id.to_string(), + "description": description, + })) +} diff --git a/maskinrommet/src/intentions.rs b/maskinrommet/src/intentions.rs index f4a05bf..3ea4c56 100644 --- a/maskinrommet/src/intentions.rs +++ b/maskinrommet/src/intentions.rs @@ -2154,6 +2154,8 @@ pub struct UploadMediaResponse { /// - `source_id` (valgfritt): Node-ID å koble media til via `has_media`-edge. /// - `visibility` (valgfritt): Synlighet for media-noden. Default: "hidden". /// - `title` (valgfritt): Tittel for media-noden (default: filnavn). +/// - `metadata_extra` (valgfritt): JSON-objekt med ekstra metadata som merges +/// inn i media-nodens metadata (f.eks. `{"source":"screenshot"}`). 
/// /// Ref: docs/primitiver/nodes.md (media), docs/retninger/universell_input.md pub async fn upload_media( @@ -2167,6 +2169,7 @@ pub async fn upload_media( let mut source_id: Option = None; let mut visibility = "hidden".to_string(); let mut title: Option = None; + let mut metadata_extra: Option = None; // -- Parse multipart-felter -- while let Some(field) = multipart.next_field().await.map_err(|e| { @@ -2219,6 +2222,18 @@ pub async fn upload_media( })?; title = Some(text); } + "metadata_extra" => { + let text = field.text().await.map_err(|e| { + bad_request(&format!("Kunne ikke lese metadata_extra: {e}")) + })?; + let parsed: serde_json::Value = serde_json::from_str(&text).map_err(|e| { + bad_request(&format!("Ugyldig JSON i metadata_extra: {e}")) + })?; + if !parsed.is_object() { + return Err(bad_request("metadata_extra må være et JSON-objekt")); + } + metadata_extra = Some(parsed); + } _ => { // Ignorer ukjente felter } @@ -2249,12 +2264,21 @@ pub async fn upload_media( let mime = content_type.unwrap_or_else(|| "application/octet-stream".to_string()); let node_title = title.unwrap_or_else(|| file_name.unwrap_or_default()); - let metadata = serde_json::json!({ + let mut metadata = serde_json::json!({ "cas_hash": cas_result.hash, "mime": mime, "size_bytes": cas_result.size, }); + // Merge ekstra metadata fra klienten (f.eks. 
source: "screenshot") + if let Some(extra) = &metadata_extra { + if let Some(obj) = extra.as_object() { + for (k, v) in obj { + metadata[k] = v.clone(); + } + } + } + // Skriv media-node til PG sqlx::query( r#"INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by) @@ -2339,6 +2363,45 @@ pub async fn upload_media( } } + // -- Enqueue AI-beskrivelse for skjermklipp -- + let is_screenshot = metadata_extra + .as_ref() + .and_then(|v| v.get("source")) + .and_then(|v| v.as_str()) + == Some("screenshot"); + let is_image = mime.starts_with("image/"); + + if is_screenshot && is_image { + let payload = serde_json::json!({ + "media_node_id": media_node_id, + "cas_hash": cas_result.hash, + "mime": mime, + }); + + let collection_id = if let Some(src_id) = source_id { + find_collection_for_node(&state.db, src_id).await.ok().flatten() + } else { + None + }; + + match crate::jobs::enqueue(&state.db, "describe_image", payload, collection_id, 5).await { + Ok(job_id) => { + tracing::info!( + job_id = %job_id, + media_node_id = %media_node_id, + "describe_image-jobb opprettet for skjermklipp" + ); + } + Err(e) => { + tracing::error!( + media_node_id = %media_node_id, + error = %e, + "Kunne ikke opprette describe_image-jobb" + ); + } + } + } + // -- Enqueue transkripsjons-jobb for lydfiler -- if is_audio_mime(&mime) { let payload = serde_json::json!({ diff --git a/maskinrommet/src/jobs.rs b/maskinrommet/src/jobs.rs index f12a45c..dc5c0e5 100644 --- a/maskinrommet/src/jobs.rs +++ b/maskinrommet/src/jobs.rs @@ -223,6 +223,9 @@ async fn dispatch( "clip_url" => { clip::handle_clip_url(job, db).await } + "describe_image" => { + crate::describe_image::handle_describe_image(job, db, cas).await + } // Orchestration: trigger-evaluering har lagt jobben i kø. // Kompilatoren parser scriptet og validerer det. // Utførelse av kompilert script kommer i oppgave 24.5. 
diff --git a/maskinrommet/src/main.rs b/maskinrommet/src/main.rs index 58888e9..3bd770a 100644 --- a/maskinrommet/src/main.rs +++ b/maskinrommet/src/main.rs @@ -10,6 +10,7 @@ pub mod cas; pub mod cli_dispatch; pub mod clip; mod custom_domain; +pub mod describe_image; mod intentions; pub mod jobs; pub mod livekit; diff --git a/tasks.md b/tasks.md index c718847..b52315e 100644 --- a/tasks.md +++ b/tasks.md @@ -390,8 +390,7 @@ alle relevante modaliteter. Alt ender som noder. Modaliteten er transport, noden er det som lever videre. ### Skjermklipp -- [~] 29.1 Skjermklipp-input: paste screenshot fra clipboard i chat/editor → upload til CAS → media-node. Frontend detekterer bilde-paste (ClipboardEvent). Valgfri AI-beskrivelse via synops-ai ("beskriv dette bildet"). Metadata: `{ "source": "screenshot", "description": "..." }`. - > Påbegynt: 2026-03-18T20:55 +- [x] 29.1 Skjermklipp-input: paste screenshot fra clipboard i chat/editor → upload til CAS → media-node. Frontend detekterer bilde-paste (ClipboardEvent). Valgfri AI-beskrivelse via synops-ai ("beskriv dette bildet"). Metadata: `{ "source": "screenshot", "description": "..." }`. ### RSS/Feed-abonnement - [ ] 29.2 `synops-feed` CLI: abonner på RSS/Atom-feed. Input: `--url --collection-id [--interval 30m]`. Poller feed, oppretter `content`-node for nye entries med `metadata.source_url` og `tagged`-edge "feed". AI-oppsummering valgfritt. Paywall-deteksjon gjenbrukt fra synops-clip. 
diff --git a/tools/synops-ai/Cargo.lock b/tools/synops-ai/Cargo.lock index db67712..cc27ef9 100644 --- a/tools/synops-ai/Cargo.lock +++ b/tools/synops-ai/Cargo.lock @@ -1909,6 +1909,7 @@ dependencies = [ name = "synops-ai" version = "0.1.0" dependencies = [ + "base64", "chrono", "clap", "reqwest", diff --git a/tools/synops-ai/Cargo.toml b/tools/synops-ai/Cargo.toml index ec33b98..efd2d7c 100644 --- a/tools/synops-ai/Cargo.toml +++ b/tools/synops-ai/Cargo.toml @@ -18,4 +18,5 @@ chrono = { version = "0.4", features = ["serde"] } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } +base64 = "0.22" synops-common = { path = "../synops-common" } diff --git a/tools/synops-ai/src/main.rs b/tools/synops-ai/src/main.rs index 7218f96..03a1ace 100644 --- a/tools/synops-ai/src/main.rs +++ b/tools/synops-ai/src/main.rs @@ -44,6 +44,15 @@ struct PromptArgs { #[arg(long)] system: Option, + /// Bilde-fil å inkludere som visuell kontekst (multimodal/vision). + /// Bildet sendes som base64 data-URL til LLM. + #[arg(long)] + image: Option, + + /// MIME-type for bildet (default: image/png) + #[arg(long, default_value = "image/png")] + image_mime: String, + /// Job-type for modelloppslag og logging (default: "simple_prompt") #[arg(long, default_value = "simple_prompt")] job_type: String, @@ -104,7 +113,40 @@ struct ChatRequest { #[derive(Serialize, Clone)] struct ChatMessage { role: String, - content: String, + content: MessageContent, +} + +/// Meldingsinnhold: enten en enkel streng eller multimodal (tekst + bilde). 
+/// OpenAI-kompatibelt format: string | array<{type, text?, image_url?}> +#[derive(Clone)] +enum MessageContent { + Text(String), + Multimodal(Vec), +} + +#[derive(Serialize, Clone)] +#[serde(tag = "type")] +enum ContentPart { + #[serde(rename = "text")] + Text { text: String }, + #[serde(rename = "image_url")] + ImageUrl { image_url: ImageUrlDetail }, +} + +#[derive(Serialize, Clone)] +struct ImageUrlDetail { + url: String, + #[serde(skip_serializing_if = "Option::is_none")] + detail: Option, +} + +impl Serialize for MessageContent { + fn serialize(&self, serializer: S) -> Result { + match self { + MessageContent::Text(s) => serializer.serialize_str(s), + MessageContent::Multimodal(parts) => parts.serialize(serializer), + } + } } #[derive(Deserialize)] @@ -126,11 +168,11 @@ struct UsageInfo { #[derive(Deserialize)] struct Choice { - message: MessageContent, + message: ResponseMessage, } #[derive(Deserialize)] -struct MessageContent { +struct ResponseMessage { content: Option, } @@ -191,12 +233,42 @@ async fn run_prompt(args: PromptArgs) -> Result<(), String> { if let Some(ref sys) = args.system { messages.push(ChatMessage { role: "system".to_string(), - content: sys.clone(), + content: MessageContent::Text(sys.clone()), }); } + + // Multimodal melding hvis --image er oppgitt + let user_content = if let Some(ref image_path) = args.image { + let image_data = std::fs::read(image_path) + .map_err(|e| format!("Kunne ikke lese bildefil '{}': {e}", image_path))?; + + use base64::Engine; + let b64 = base64::engine::general_purpose::STANDARD.encode(&image_data); + let data_url = format!("data:{};base64,{}", args.image_mime, b64); + + tracing::info!( + image_path, + image_size = image_data.len(), + mime = %args.image_mime, + "Inkluderer bilde i multimodal melding" + ); + + MessageContent::Multimodal(vec![ + ContentPart::Text { text: args.prompt.clone() }, + ContentPart::ImageUrl { + image_url: ImageUrlDetail { + url: data_url, + detail: Some("auto".to_string()), + }, + }, 
+ ]) + } else { + MessageContent::Text(args.prompt.clone()) + }; + messages.push(ChatMessage { role: "user".to_string(), - content: args.prompt.clone(), + content: user_content, }); let (content, usage, actual_model) = @@ -711,11 +783,11 @@ async fn run_script(args: ScriptArgs) -> Result<(), String> { let messages = vec![ ChatMessage { role: "system".to_string(), - content: system_prompt, + content: MessageContent::Text(system_prompt), }, ChatMessage { role: "user".to_string(), - content: user_content, + content: MessageContent::Text(user_content), }, ];