Screenshot input: paste a screenshot in chat → CAS → media node (task 29.1)

Frontend:
- ChatInput: paste handler detects images on the clipboard (ClipboardEvent)
  and uploads them to CAS via uploadMedia with metadata_extra { source: "screenshot" }
- Chat page: renders image nodes inline, with the AI description when available
- api.ts: uploadMedia now accepts metadata_extra for additional node metadata

Backend (maskinrommet):
- upload_media: new metadata_extra multipart field that is merged into the
  media node's metadata (e.g. source, description)
- describe_image: new job type, enqueued automatically for screenshot uploads;
  calls synops-ai with --image to get an AI description of the image
- The description is written back into the media node's metadata.description

synops-ai:
- New --image flag for multimodal LLM calls (vision) via LiteLLM
- Sends the image as a base64 data URL in the OpenAI-compatible format
- Used by the describe_image job to generate image descriptions
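For reference, a minimal sketch of what the multimodal user message looks like on the wire. This is not code from the commit; it assumes the standard OpenAI chat-completions schema that LiteLLM proxies, and needs only the serde_json crate:

use serde_json::json;

fn main() {
    // A --image call produces a content array that mixes a text part with an
    // image_url part whose URL is the base64 data URL of the screenshot.
    let user_message = json!({
        "role": "user",
        "content": [
            { "type": "text", "text": "Beskriv dette bildet kort og konsist på norsk." },
            {
                "type": "image_url",
                "image_url": {
                    "url": "data:image/png;base64,iVBORw0KGgo...", // truncated example payload
                    "detail": "auto"
                }
            }
        ]
    });
    println!("{user_message}");
}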
vegard 2026-03-18 21:07:00 +00:00
parent 09ed49d7d7
commit 1d8ebf259b
11 changed files with 403 additions and 50 deletions

@@ -239,6 +239,8 @@ export interface UploadMediaRequest {
source_id?: string;
visibility?: string;
title?: string;
/** Extra metadata merged into the media node's metadata (e.g. { source: "screenshot" }) */
metadata_extra?: Record<string, unknown>;
}
export interface UploadMediaResponse {
@@ -258,6 +260,7 @@ export async function uploadMedia(
if (data.source_id) form.append('source_id', data.source_id);
if (data.visibility) form.append('visibility', data.visibility);
if (data.title) form.append('title', data.title);
if (data.metadata_extra) form.append('metadata_extra', JSON.stringify(data.metadata_extra));
const res = await fetch(`${BASE_URL}/intentions/upload_media`, {
method: 'POST',

@@ -1,5 +1,6 @@
<script lang="ts">
import VoiceRecorder from './VoiceRecorder.svelte';
import { uploadMedia, casUrl } from '$lib/api';
interface Props {
onsubmit: (content: string) => Promise<void>;
@@ -14,9 +15,13 @@
let content = $state('');
let submitting = $state(false);
let error = $state('');
let uploading = $state(0);
let textareaEl: HTMLTextAreaElement | undefined = $state();
const isEmpty = $derived(!content.trim());
/** Preview for pasted image before upload completes */
let imagePreview: { url: string; name: string } | undefined = $state();
const isEmpty = $derived(!content.trim() && uploading === 0);
async function handleSubmit() {
if (isEmpty || submitting || disabled) return;
@@ -49,16 +54,74 @@
el.style.height = 'auto';
el.style.height = Math.min(el.scrollHeight, 160) + 'px';
}
/** Handle paste event — detect images from clipboard */
function handlePaste(e: ClipboardEvent) {
const items = e.clipboardData?.items;
if (!items) return;
for (const item of items) {
if (item.type.startsWith('image/')) {
e.preventDefault();
const file = item.getAsFile();
if (file) handleImageUpload(file);
return;
}
}
}
/** Upload a pasted/dropped image as a screenshot media node */
async function handleImageUpload(file: File) {
if (!accessToken) {
error = 'Ikke innlogget — kan ikke laste opp bilder';
return;
}
uploading++;
// Show preview
const previewUrl = URL.createObjectURL(file);
imagePreview = { url: previewUrl, name: file.name || 'Skjermklipp' };
try {
await uploadMedia(accessToken, {
file,
source_id: contextId,
title: file.name || 'Skjermklipp',
visibility: 'hidden',
metadata_extra: { source: 'screenshot' },
});
} catch (e) {
error = e instanceof Error ? e.message : 'Feil ved bildeopplasting';
} finally {
uploading--;
if (imagePreview) {
URL.revokeObjectURL(imagePreview.url);
imagePreview = undefined;
}
}
}
</script>
<div class="flex items-end gap-2">
<div class="flex flex-col gap-2">
{#if imagePreview}
<div class="flex items-center gap-2 rounded-lg border border-blue-200 bg-blue-50 px-3 py-2">
<img src={imagePreview.url} alt="Forhåndsvisning" class="h-12 w-12 rounded object-cover" />
<span class="text-xs text-blue-700">Laster opp {imagePreview.name}</span>
</div>
{/if}
{#if uploading > 0 && !imagePreview}
<p class="text-xs text-blue-600">Laster opp bilde…</p>
{/if}
<div class="flex items-end gap-2">
<div class="relative flex-1">
<textarea
bind:this={textareaEl}
bind:value={content}
onkeydown={handleKeydown}
oninput={autoResize}
placeholder="Skriv en melding…"
onpaste={handlePaste}
placeholder="Skriv en melding… (lim inn bilde med Ctrl+V)"
disabled={disabled || submitting}
rows={1}
class="w-full resize-none rounded-xl border border-gray-200 bg-gray-50 px-4 py-2.5 text-sm text-gray-900 placeholder:text-gray-400 focus:border-blue-300 focus:bg-white focus:outline-none focus:ring-1 focus:ring-blue-300"
@@ -90,4 +153,5 @@
</svg>
{/if}
</button>
</div>
</div>

@@ -154,6 +154,31 @@
} catch { return false; }
}
/** Check if this node is an image media node */
function isImageNode(node: Node): boolean {
if (node.nodeKind !== 'media') return false;
try {
const meta = JSON.parse(node.metadata ?? '{}');
return typeof meta.mime === 'string' && meta.mime.startsWith('image/');
} catch { return false; }
}
/** Get CAS image URL for a media node */
function imageSrc(node: Node): string {
try {
const meta = JSON.parse(node.metadata ?? '{}');
return casUrl(meta.cas_hash);
} catch { return ''; }
}
/** Get AI-generated description for an image node */
function imageDescription(node: Node): string | undefined {
try {
const meta = JSON.parse(node.metadata ?? '{}');
return meta.description;
} catch { return undefined; }
}
/** Get CAS audio URL for a media node */
function audioSrc(node: Node): string {
try {
@@ -234,9 +259,10 @@
{#each messages as msg (msg.id)}
{@const own = isOwnMessage(msg)}
{@const audio = isAudioNode(msg)}
{@const image = isImageNode(msg)}
{@const bot = isAgentMessage(msg)}
<div class="flex {own ? 'justify-end' : 'justify-start'}">
<div class="max-w-[75%] {own ? (audio ? 'bg-blue-50 border border-blue-200 text-gray-900' : 'bg-blue-600 text-white') : bot ? 'bg-amber-50 border border-amber-200 text-gray-900' : 'bg-white border border-gray-200 text-gray-900'} rounded-2xl px-4 py-2 shadow-sm">
<div class="max-w-[75%] {own ? (audio || image ? 'bg-blue-50 border border-blue-200 text-gray-900' : 'bg-blue-600 text-white') : bot ? 'bg-amber-50 border border-amber-200 text-gray-900' : 'bg-white border border-gray-200 text-gray-900'} rounded-2xl px-4 py-2 shadow-sm">
{#if !own}
<p class="mb-0.5 text-xs font-medium {bot ? 'text-amber-700' : 'text-blue-600'}">
{#if bot}<span title="AI-agent">🤖 </span>{/if}{senderName(msg)}
@@ -258,12 +284,24 @@
accessToken={hasSegments(msg) ? accessToken : undefined}
compact
/>
{:else if image}
<div class="my-1">
<img
src={imageSrc(msg)}
alt={imageDescription(msg) || msg.title || 'Skjermklipp'}
class="max-w-full rounded-lg"
loading="lazy"
/>
{#if imageDescription(msg)}
<p class="mt-1 text-xs text-gray-500 italic">{imageDescription(msg)}</p>
{/if}
</div>
{:else}
<p class="text-sm whitespace-pre-wrap break-words">
{msg.content || ''}
</p>
{/if}
<p class="mt-1 text-right text-[10px] {own && !audio ? 'text-blue-200' : 'text-gray-400'}">
<p class="mt-1 text-right text-[10px] {own && !audio && !image ? 'text-blue-200' : 'text-gray-400'}">
{formatTime(msg)}
</p>
</div>

@@ -0,0 +1,108 @@
// describe_image: AI description of images via the synops-ai CLI.
//
// Job type: "describe_image"
// Payload: { "media_node_id", "cas_hash", "mime" }
//
// The flow:
// 1. Send the image file (its CAS path) to `synops-ai prompt --image`
// 2. synops-ai sends a multimodal message to the LLM via LiteLLM
// 3. Update the media node's metadata with the description field
//
// Ref: docs/retninger/unix_filosofi.md, task 29.1
use sqlx::PgPool;
use uuid::Uuid;
use crate::cas::CasStore;
use crate::jobs::JobRow;
/// Handles a describe_image job.
///
/// Sends the image to synops-ai for a description via a vision model
/// and stores the result in the media node's metadata.
pub async fn handle_describe_image(
job: &JobRow,
db: &PgPool,
cas: &CasStore,
) -> Result<serde_json::Value, String> {
let media_node_id: Uuid = job.payload["media_node_id"]
.as_str()
.and_then(|s| s.parse().ok())
.ok_or("Mangler gyldig 'media_node_id' i payload")?;
let cas_hash = job.payload["cas_hash"]
.as_str()
.ok_or("Mangler 'cas_hash' i payload")?;
let mime = job.payload["mime"]
.as_str()
.unwrap_or("image/png");
// Verify that the image exists in CAS
let cas_path = cas.path_for(cas_hash);
if !cas_path.exists() {
return Err(format!("Bilde ikke funnet i CAS: {cas_hash}"));
}
let bin = std::env::var("SYNOPS_AI_BIN")
.unwrap_or_else(|_| "synops-ai".to_string());
tracing::info!(
media_node_id = %media_node_id,
cas_hash,
"Sender bilde til AI for beskrivelse"
);
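// Shell out to the synops-ai CLI: prompt, image path, and MIME type go in
// as flags, and the description comes back on stdout (cf. docs/retninger/unix_filosofi.md).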
let output = tokio::process::Command::new(&bin)
.arg("prompt")
.arg("--prompt")
.arg("Beskriv dette bildet kort og konsist på norsk. Fokuser på innholdet, ikke formatet. Maks 2-3 setninger.")
.arg("--image")
.arg(cas_path.to_str().unwrap_or(""))
.arg("--image-mime")
.arg(mime)
.arg("--job-type")
.arg("describe_image")
.arg("--temperature")
.arg("0.3")
.output()
.await
.map_err(|e| format!("Kunne ikke starte {bin}: {e}"))?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(format!("synops-ai feilet: {stderr}"));
}
let description = String::from_utf8_lossy(&output.stdout).trim().to_string();
if description.is_empty() {
return Err("synops-ai returnerte tom beskrivelse".into());
}
tracing::info!(
media_node_id = %media_node_id,
description_len = description.len(),
"Bildebeskrivelse mottatt fra AI"
);
// Update the media node's metadata with the description field
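// jsonb's || gives the right-hand operand's keys precedence, so re-running
// the job overwrites any earlier description rather than failing.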
sqlx::query(
r#"UPDATE nodes
SET metadata = metadata || jsonb_build_object('description', $2::text),
updated_at = now()
WHERE id = $1"#,
)
.bind(media_node_id)
.bind(&description)
.execute(db)
.await
.map_err(|e| format!("Kunne ikke oppdatere media-node med beskrivelse: {e}"))?;
// The PG NOTIFY trigger on the nodes table pushes the realtime update automatically.
Ok(serde_json::json!({
"media_node_id": media_node_id.to_string(),
"description": description,
}))
}

@@ -2154,6 +2154,8 @@ pub struct UploadMediaResponse {
/// - `source_id` (optional): node ID to link the media to via a `has_media` edge.
/// - `visibility` (optional): visibility of the media node. Default: "hidden".
/// - `title` (optional): title for the media node (default: the file name).
/// - `metadata_extra` (optional): JSON object with extra metadata that is merged
///   into the media node's metadata (e.g. `{"source":"screenshot"}`).
///
/// Ref: docs/primitiver/nodes.md (media), docs/retninger/universell_input.md
pub async fn upload_media(
@@ -2167,6 +2169,7 @@ pub async fn upload_media(
let mut source_id: Option<Uuid> = None;
let mut visibility = "hidden".to_string();
let mut title: Option<String> = None;
let mut metadata_extra: Option<serde_json::Value> = None;
// -- Parse multipart fields --
while let Some(field) = multipart.next_field().await.map_err(|e| {
@@ -2219,6 +2222,18 @@ pub async fn upload_media(
})?;
title = Some(text);
}
"metadata_extra" => {
let text = field.text().await.map_err(|e| {
bad_request(&format!("Kunne ikke lese metadata_extra: {e}"))
})?;
let parsed: serde_json::Value = serde_json::from_str(&text).map_err(|e| {
bad_request(&format!("Ugyldig JSON i metadata_extra: {e}"))
})?;
if !parsed.is_object() {
return Err(bad_request("metadata_extra må være et JSON-objekt"));
}
metadata_extra = Some(parsed);
}
_ => {
// Ignore unknown fields
}
@@ -2249,12 +2264,21 @@ pub async fn upload_media(
let mime = content_type.unwrap_or_else(|| "application/octet-stream".to_string());
let node_title = title.unwrap_or_else(|| file_name.unwrap_or_default());
let metadata = serde_json::json!({
let mut metadata = serde_json::json!({
"cas_hash": cas_result.hash,
"mime": mime,
"size_bytes": cas_result.size,
});
// Merge extra metadata from the client (e.g. source: "screenshot")
if let Some(extra) = &metadata_extra {
if let Some(obj) = extra.as_object() {
for (k, v) in obj {
metadata[k] = v.clone();
}
}
}
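// Note that colliding keys from metadata_extra overwrite the base fields above.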
// Write the media node to PG
sqlx::query(
r#"INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
@@ -2339,6 +2363,45 @@ pub async fn upload_media(
}
}
// -- Enqueue AI description for screenshots --
let is_screenshot = metadata_extra
.as_ref()
.and_then(|v| v.get("source"))
.and_then(|v| v.as_str())
== Some("screenshot");
let is_image = mime.starts_with("image/");
if is_screenshot && is_image {
let payload = serde_json::json!({
"media_node_id": media_node_id,
"cas_hash": cas_result.hash,
"mime": mime,
});
let collection_id = if let Some(src_id) = source_id {
find_collection_for_node(&state.db, src_id).await.ok().flatten()
} else {
None
};
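// A failed enqueue is only logged; the upload itself still succeeds.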
match crate::jobs::enqueue(&state.db, "describe_image", payload, collection_id, 5).await {
Ok(job_id) => {
tracing::info!(
job_id = %job_id,
media_node_id = %media_node_id,
"describe_image-jobb opprettet for skjermklipp"
);
}
Err(e) => {
tracing::error!(
media_node_id = %media_node_id,
error = %e,
"Kunne ikke opprette describe_image-jobb"
);
}
}
}
// -- Enqueue transcription job for audio files --
if is_audio_mime(&mime) {
let payload = serde_json::json!({

@@ -223,6 +223,9 @@ async fn dispatch(
"clip_url" => {
clip::handle_clip_url(job, db).await
}
"describe_image" => {
crate::describe_image::handle_describe_image(job, db, cas).await
}
// Orchestration: trigger evaluation has already queued the job.
// The compiler parses the script and validates it.
// Execution of compiled scripts arrives in task 24.5.

@@ -10,6 +10,7 @@ pub mod cas;
pub mod cli_dispatch;
pub mod clip;
mod custom_domain;
pub mod describe_image;
mod intentions;
pub mod jobs;
pub mod livekit;

@@ -390,8 +390,7 @@ all relevant modalities. Everything ends up as nodes. The modality is transport,
the node is what lives on.
### Screenshots
- [~] 29.1 Screenshot input: paste a screenshot from the clipboard in chat/editor → upload to CAS → media node. The frontend detects image paste (ClipboardEvent). Optional AI description via synops-ai ("beskriv dette bildet"). Metadata: `{ "source": "screenshot", "description": "..." }`.
> Started: 2026-03-18T20:55
- [x] 29.1 Screenshot input: paste a screenshot from the clipboard in chat/editor → upload to CAS → media node. The frontend detects image paste (ClipboardEvent). Optional AI description via synops-ai ("beskriv dette bildet"). Metadata: `{ "source": "screenshot", "description": "..." }`.
### RSS/feed subscriptions
- [ ] 29.2 `synops-feed` CLI: subscribe to an RSS/Atom feed. Input: `--url <feed-url> --collection-id <uuid> [--interval 30m]`. Polls the feed and creates a `content` node for new entries with `metadata.source_url` and a `tagged` edge "feed". Optional AI summary. Paywall detection reused from synops-clip.

@@ -1909,6 +1909,7 @@ dependencies = [
name = "synops-ai"
version = "0.1.0"
dependencies = [
"base64",
"chrono",
"clap",
"reqwest",

@@ -18,4 +18,5 @@ chrono = { version = "0.4", features = ["serde"] }
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
base64 = "0.22"
synops-common = { path = "../synops-common" }

@@ -44,6 +44,15 @@ struct PromptArgs {
#[arg(long)]
system: Option<String>,
/// Image file to include as visual context (multimodal/vision).
/// The image is sent to the LLM as a base64 data URL.
#[arg(long)]
image: Option<String>,
/// MIME type for the image (default: image/png)
#[arg(long, default_value = "image/png")]
image_mime: String,
/// Job type for model lookup and logging (default: "simple_prompt")
#[arg(long, default_value = "simple_prompt")]
job_type: String,
@@ -104,7 +113,40 @@ struct ChatRequest {
#[derive(Serialize, Clone)]
struct ChatMessage {
role: String,
content: String,
content: MessageContent,
}
/// Message content: either a plain string or multimodal (text + image).
/// OpenAI-compatible format: string | array<{type, text?, image_url?}>
#[derive(Clone)]
enum MessageContent {
Text(String),
Multimodal(Vec<ContentPart>),
}
#[derive(Serialize, Clone)]
#[serde(tag = "type")]
enum ContentPart {
#[serde(rename = "text")]
Text { text: String },
#[serde(rename = "image_url")]
ImageUrl { image_url: ImageUrlDetail },
}
#[derive(Serialize, Clone)]
struct ImageUrlDetail {
url: String,
#[serde(skip_serializing_if = "Option::is_none")]
detail: Option<String>,
}
impl Serialize for MessageContent {
fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
match self {
MessageContent::Text(s) => serializer.serialize_str(s),
MessageContent::Multimodal(parts) => parts.serialize(serializer),
}
}
}
#[derive(Deserialize)]
@@ -126,11 +168,11 @@ struct UsageInfo {
#[derive(Deserialize)]
struct Choice {
message: MessageContent,
message: ResponseMessage,
}
#[derive(Deserialize)]
struct MessageContent {
struct ResponseMessage {
content: Option<String>,
}
@@ -191,12 +233,42 @@ async fn run_prompt(args: PromptArgs) -> Result<(), String> {
if let Some(ref sys) = args.system {
messages.push(ChatMessage {
role: "system".to_string(),
content: sys.clone(),
content: MessageContent::Text(sys.clone()),
});
}
// Build a multimodal message if --image was given
let user_content = if let Some(ref image_path) = args.image {
let image_data = std::fs::read(image_path)
.map_err(|e| format!("Kunne ikke lese bildefil '{}': {e}", image_path))?;
use base64::Engine;
let b64 = base64::engine::general_purpose::STANDARD.encode(&image_data);
let data_url = format!("data:{};base64,{}", args.image_mime, b64);
tracing::info!(
image_path,
image_size = image_data.len(),
mime = %args.image_mime,
"Inkluderer bilde i multimodal melding"
);
MessageContent::Multimodal(vec![
ContentPart::Text { text: args.prompt.clone() },
ContentPart::ImageUrl {
image_url: ImageUrlDetail {
url: data_url,
detail: Some("auto".to_string()),
},
},
])
} else {
MessageContent::Text(args.prompt.clone())
};
messages.push(ChatMessage {
role: "user".to_string(),
content: args.prompt.clone(),
content: user_content,
});
let (content, usage, actual_model) =
@@ -711,11 +783,11 @@ async fn run_script(args: ScriptArgs) -> Result<(), String> {
let messages = vec![
ChatMessage {
role: "system".to_string(),
content: system_prompt,
content: MessageContent::Text(system_prompt),
},
ChatMessage {
role: "user".to_string(),
content: user_content,
content: MessageContent::Text(user_content),
},
];