diff --git a/maskinrommet/src/intentions.rs b/maskinrommet/src/intentions.rs index 5fa99da..e2741bd 100644 --- a/maskinrommet/src/intentions.rs +++ b/maskinrommet/src/intentions.rs @@ -2227,3 +2227,95 @@ pub async fn summarize( Ok(Json(SummarizeResponse { job_id })) } + +// ============================================================================= +// POST /intentions/generate_tts — tekst-til-tale via ElevenLabs +// ============================================================================= + +#[derive(Deserialize)] +pub struct GenerateTtsRequest { + /// Teksten som skal leses opp (maks 5000 tegn). + pub text: String, + /// Valgfri ElevenLabs voice_id. Faller tilbake på node-preferanse eller default. + pub voice_id: Option, + /// Valgfri kilde-node som TTS-lyden knyttes til med has_media-edge. + /// Hvis noden har metadata.voice_preference, brukes den som fallback for voice_id. + pub source_node_id: Option, + /// Språk (default: "no"). + pub language: Option, +} + +#[derive(Serialize)] +pub struct GenerateTtsResponse { + pub job_id: Uuid, +} + +/// POST /intentions/generate_tts +/// +/// Legger en `tts_generate`-jobb i køen. +/// Lydfilen opprettes asynkront som en ny content-node i CAS. 
+pub async fn generate_tts( + State(state): State, + user: AuthUser, + Json(req): Json, +) -> Result, (StatusCode, Json)> { + if req.text.is_empty() { + return Err(bad_request("Tekst kan ikke være tom")); + } + if req.text.len() > 5000 { + return Err(bad_request("Tekst for lang (maks 5000 tegn)")); + } + + // Hvis source_node_id er oppgitt, verifiser at noden finnes + if let Some(source_id) = req.source_node_id { + let exists: bool = sqlx::query_scalar::<_, bool>( + "SELECT EXISTS(SELECT 1 FROM nodes WHERE id = $1)", + ) + .bind(source_id) + .fetch_one(&state.db) + .await + .map_err(|e| { + tracing::error!(error = %e, "PG-feil ved node-sjekk"); + internal_error("Databasefeil") + })?; + + if !exists { + return Err(bad_request("Kildenode finnes ikke")); + } + } + + let mut payload = serde_json::json!({ + "text": req.text, + "requested_by": user.node_id.to_string(), + "language": req.language.as_deref().unwrap_or("no"), + }); + + if let Some(ref vid) = req.voice_id { + payload["voice_id"] = serde_json::Value::String(vid.clone()); + } + if let Some(source_id) = req.source_node_id { + payload["source_node_id"] = serde_json::Value::String(source_id.to_string()); + } + + let job_id = crate::jobs::enqueue( + &state.db, + "tts_generate", + payload, + None, + 5, // Middels prioritet + ) + .await + .map_err(|e| { + tracing::error!(error = %e, "Kunne ikke legge TTS-jobb i kø"); + internal_error("Kunne ikke starte TTS-generering") + })?; + + tracing::info!( + job_id = %job_id, + text_len = req.text.len(), + user = %user.node_id, + "TTS-jobb lagt i kø" + ); + + Ok(Json(GenerateTtsResponse { job_id })) +} diff --git a/maskinrommet/src/jobs.rs b/maskinrommet/src/jobs.rs index eff7e7c..c553279 100644 --- a/maskinrommet/src/jobs.rs +++ b/maskinrommet/src/jobs.rs @@ -14,6 +14,7 @@ use crate::cas::CasStore; use crate::stdb::StdbClient; use crate::summarize; use crate::transcribe; +use crate::tts; /// Rad fra job_queue-tabellen. 
#[derive(sqlx::FromRow, Debug)] @@ -163,6 +164,9 @@ async fn dispatch( "summarize_communication" => { summarize::handle_summarize_communication(job, db, stdb).await } + "tts_generate" => { + tts::handle_tts_job(job, db, stdb, cas).await + } other => Err(format!("Ukjent jobbtype: {other}")), } } diff --git a/maskinrommet/src/main.rs b/maskinrommet/src/main.rs index 03b77e6..9feb30c 100644 --- a/maskinrommet/src/main.rs +++ b/maskinrommet/src/main.rs @@ -9,6 +9,7 @@ mod serving; mod stdb; pub mod summarize; pub mod transcribe; +pub mod tts; mod warmup; use axum::{extract::State, http::StatusCode, routing::{get, post}, Json, Router}; @@ -151,6 +152,7 @@ async fn main() { .route("/intentions/retranscribe", post(intentions::retranscribe)) .route("/intentions/resolve_retranscription", post(intentions::resolve_retranscription)) .route("/intentions/summarize", post(intentions::summarize)) + .route("/intentions/generate_tts", post(intentions::generate_tts)) .route("/query/aliases", get(queries::query_aliases)) .route("/query/graph", get(queries::query_graph)) .route("/query/transcription_versions", get(queries::query_transcription_versions)) diff --git a/maskinrommet/src/tts.rs b/maskinrommet/src/tts.rs new file mode 100644 index 0000000..323d478 --- /dev/null +++ b/maskinrommet/src/tts.rs @@ -0,0 +1,315 @@ +// TTS-pipeline — tekst til lyd via ElevenLabs. +// +// Jobbtype: "tts_generate" +// Payload: { +// "text": "", +// "voice_id": "", (valgfritt, bruker default) +// "language": "no", (valgfritt) +// "source_node_id": "", (noden teksten tilhører) +// "requested_by": "" +// } +// +// Flyten: +// 1. Hent tekst og voice-preferanse fra payload +// 2. Sjekk mottaker-preferanse i source-nodens metadata (voice_preference) +// 3. Kall ElevenLabs API +// 4. Lagre lyd i CAS +// 5. Opprett media-node med has_media-edge til kilde +// 6. 
Logg ressursforbruk +// +// Ref: docs/features/ressursforbruk.md, docs/proposals/ghost_host_tts.md + +use sqlx::PgPool; +use uuid::Uuid; + +use crate::cas::CasStore; +use crate::jobs::JobRow; +use crate::stdb::StdbClient; + +/// Maks tekst-lengde for TTS (ElevenLabs grense er 5000 tegn per kall). +const MAX_TEXT_LENGTH: usize = 5000; + +/// Håndterer tts_generate-jobb. +pub async fn handle_tts_job( + job: &JobRow, + db: &PgPool, + stdb: &StdbClient, + cas: &CasStore, +) -> Result { + let text = job.payload["text"] + .as_str() + .ok_or("Mangler 'text' i payload")? + .to_string(); + + if text.is_empty() { + return Err("Tom tekst — ingenting å generere".to_string()); + } + if text.len() > MAX_TEXT_LENGTH { + return Err(format!( + "Tekst for lang: {} tegn (maks {})", + text.len(), + MAX_TEXT_LENGTH + )); + } + + let source_node_id: Option = job.payload["source_node_id"] + .as_str() + .and_then(|s| s.parse().ok()); + + let requested_by: Uuid = job.payload["requested_by"] + .as_str() + .and_then(|s| s.parse().ok()) + .ok_or("Mangler gyldig 'requested_by' i payload")?; + + // Bestem voice_id: payload > source-node metadata > env default + let voice_id = resolve_voice_id(job, db, source_node_id).await?; + + let language = job.payload["language"] + .as_str() + .unwrap_or("no") + .to_string(); + + tracing::info!( + text_len = text.len(), + voice_id = %voice_id, + language = %language, + "Starter TTS-generering" + ); + + // 1. Kall ElevenLabs API + let audio_bytes = call_elevenlabs(&text, &voice_id).await?; + + tracing::info!( + audio_size = audio_bytes.len(), + "Mottok lyd fra ElevenLabs" + ); + + // 2. Lagre i CAS + let cas_result = cas + .store(&audio_bytes) + .await + .map_err(|e| format!("CAS-lagring feilet: {e}"))?; + + tracing::info!( + cas_hash = %cas_result.hash, + size = cas_result.size, + dedup = cas_result.already_existed, + "Lyd lagret i CAS" + ); + + // 3. 
Opprett media-node for lydfilen + let media_node_id = Uuid::now_v7(); + let title = format!("TTS: {}", truncate(&text, 60)); + let metadata = serde_json::json!({ + "cas_hash": cas_result.hash, + "mime": "audio/mpeg", + "size_bytes": cas_result.size, + "tts": { + "provider": "elevenlabs", + "voice_id": voice_id, + "language": language, + "characters": text.len(), + } + }); + let metadata_str = metadata.to_string(); + + // STDB først (sanntid) + stdb.create_node( + &media_node_id.to_string(), + "content", + &title, + "", + "hidden", + &metadata_str, + &requested_by.to_string(), + ) + .await + .map_err(|e| format!("STDB create_node feilet: {e}"))?; + + // PG (persistering) + sqlx::query( + r#" + INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by) + VALUES ($1, 'content', $2, '', 'hidden'::visibility, $3, $4) + "#, + ) + .bind(media_node_id) + .bind(&title) + .bind(&metadata) + .bind(requested_by) + .execute(db) + .await + .map_err(|e| format!("PG insert media-node feilet: {e}"))?; + + // 4. Opprett has_media-edge fra kilde-node til TTS-noden (hvis kilde finnes) + if let Some(source_id) = source_node_id { + let edge_id = Uuid::now_v7(); + let edge_meta = serde_json::json!({ + "media_type": "tts_audio", + "generated_at": chrono::Utc::now().to_rfc3339() + }); + let empty_meta = edge_meta.to_string(); + + stdb.create_edge( + &edge_id.to_string(), + &source_id.to_string(), + &media_node_id.to_string(), + "has_media", + &empty_meta, + false, + &requested_by.to_string(), + ) + .await + .map_err(|e| format!("STDB create_edge (has_media) feilet: {e}"))?; + + sqlx::query( + r#" + INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by) + VALUES ($1, $2, $3, 'has_media', $4, false, $5) + "#, + ) + .bind(edge_id) + .bind(source_id) + .bind(media_node_id) + .bind(&edge_meta) + .bind(requested_by) + .execute(db) + .await + .map_err(|e| format!("PG insert has_media-edge feilet: {e}"))?; + } + + // 5. 
Logg ressursforbruk + sqlx::query( + r#" + INSERT INTO resource_usage_log (target_node_id, triggered_by, resource_type, detail) + VALUES ($1, $2, 'tts', $3) + "#, + ) + .bind(media_node_id) + .bind(requested_by) + .bind(serde_json::json!({ + "provider": "elevenlabs", + "characters": text.len(), + "voice_id": voice_id + })) + .execute(db) + .await + .map_err(|e| format!("Ressurslogging feilet: {e}"))?; + + Ok(serde_json::json!({ + "status": "completed", + "media_node_id": media_node_id.to_string(), + "cas_hash": cas_result.hash, + "characters": text.len(), + "audio_size_bytes": cas_result.size, + "voice_id": voice_id, + })) +} + +/// Bestem voice_id: payload-verdi > source-node metadata.voice_preference > env default. +/// "Mottaker-preferanse i metadata" betyr at en node kan ha +/// metadata.voice_preference.voice_id som overstyrer default. +async fn resolve_voice_id( + job: &JobRow, + db: &PgPool, + source_node_id: Option, +) -> Result { + // 1. Eksplisitt i payload — alltid høyest prioritet + if let Some(vid) = job.payload["voice_id"].as_str() { + if !vid.is_empty() { + return Ok(vid.to_string()); + } + } + + // 2. Sjekk source-nodens metadata.voice_preference.voice_id + if let Some(node_id) = source_node_id { + let meta: Option = sqlx::query_scalar( + "SELECT metadata FROM nodes WHERE id = $1", + ) + .bind(node_id) + .fetch_optional(db) + .await + .map_err(|e| format!("PG-feil ved henting av voice_preference: {e}"))? + .flatten(); + + if let Some(meta) = meta { + if let Some(vid) = meta + .get("voice_preference") + .and_then(|vp| vp.get("voice_id")) + .and_then(|v| v.as_str()) + { + if !vid.is_empty() { + tracing::info!( + node_id = %node_id, + voice_id = %vid, + "Bruker mottaker-preferanse fra node-metadata" + ); + return Ok(vid.to_string()); + } + } + } + } + + // 3. 
Miljøvariabel-default + let default = std::env::var("ELEVENLABS_DEFAULT_VOICE") + .unwrap_or_else(|_| "21m00Tcm4TlvDq8ikWAM".to_string()); // Rachel (ElevenLabs premade) + + Ok(default) +} + +/// Kall ElevenLabs text-to-speech API. +/// Returnerer rå MP3-bytes. +async fn call_elevenlabs(text: &str, voice_id: &str) -> Result, String> { + let api_key = std::env::var("ELEVENLABS_API_KEY") + .map_err(|_| "ELEVENLABS_API_KEY er ikke satt".to_string())?; + + let model_id = std::env::var("ELEVENLABS_MODEL") + .unwrap_or_else(|_| "eleven_multilingual_v2".to_string()); + + let url = format!( + "https://api.elevenlabs.io/v1/text-to-speech/{voice_id}" + ); + + let payload = serde_json::json!({ + "text": text, + "model_id": model_id, + "voice_settings": { + "stability": 0.5, + "similarity_boost": 0.75, + "style": 0.0, + "use_speaker_boost": true + } + }); + + let client = reqwest::Client::new(); + let resp = client + .post(&url) + .header("xi-api-key", &api_key) + .header("Accept", "audio/mpeg") + .json(&payload) + .timeout(std::time::Duration::from_secs(30)) + .send() + .await + .map_err(|e| format!("ElevenLabs HTTP-feil: {e}"))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + return Err(format!("ElevenLabs returnerte {status}: {body}")); + } + + resp.bytes() + .await + .map(|b| b.to_vec()) + .map_err(|e| format!("Kunne ikke lese ElevenLabs-respons: {e}")) +} + +/// Forkorter en streng til maks `max_len` tegn med "..." suffix. 
/// Shortens `s` to at most `max_len` characters.
///
/// Strings that already fit are returned unchanged. Longer strings are cut to
/// `max_len - 3` characters and suffixed with `"..."`. When `max_len` is
/// smaller than the suffix itself there is no room for it, so the first
/// `max_len` characters are returned without a suffix.
///
/// Lengths are measured in characters (Unicode scalar values), never bytes, so
/// the cut always falls on a character boundary and multi-byte text is not
/// marked as truncated when it was not shortened.
fn truncate(s: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    // If there is no character at index `max_len`, the string already fits.
    if s.char_indices().nth(max_len).is_none() {
        return s.to_string();
    }

    // No room for the suffix: plain cut (also avoids `max_len - 3` underflow).
    let ellipsis_len = ELLIPSIS.len();
    if max_len < ellipsis_len {
        return s.chars().take(max_len).collect();
    }

    // Byte offset of the first character that must be dropped.
    let end = s
        .char_indices()
        .nth(max_len - ellipsis_len)
        .map_or(s.len(), |(idx, _)| idx);
    format!("{}{}", &s[..end], ELLIPSIS)
}
RUST_LOG=maskinrommet=debug,tower_http=debug EOF diff --git a/tasks.md b/tasks.md index 7f2d4bb..0f69c0e 100644 --- a/tasks.md +++ b/tasks.md @@ -119,8 +119,7 @@ Uavhengige faser kan fortsatt plukkes. - [x] 10.1 LiteLLM oppsett: Docker-container, API-nøkler, modell-routing. Ref: `docs/infra/ai_gateway.md`. - [x] 10.2 AI-foreslåtte edges: maskinrommet sender innhold til LLM → foreslår mentions, topics. - [x] 10.3 Oppsummering: kommunikasjonsnode → AI-generert sammendrag som ny node. -- [~] 10.4 TTS: tekst → lyd via ElevenLabs. Mottaker-preferanse i metadata. - > Påbegynt: 2026-03-17T23:32 +- [x] 10.4 TTS: tekst → lyd via ElevenLabs. Mottaker-preferanse i metadata. ## Fase 11: Produksjons-pipeline