// TTS-pipeline — tekst til lyd via ElevenLabs. // // Jobbtype: "tts_generate" // Payload: { // "text": "", // "voice_id": "", (valgfritt, bruker default) // "language": "no", (valgfritt) // "source_node_id": "", (noden teksten tilhører) // "requested_by": "" // } // // Flyten: // 1. Hent tekst og voice-preferanse fra payload // 2. Sjekk mottaker-preferanse i source-nodens metadata (voice_preference) // 3. Kall ElevenLabs API // 4. Lagre lyd i CAS // 5. Opprett media-node med has_media-edge til kilde // 6. Logg ressursforbruk // // Ref: docs/features/ressursforbruk.md, docs/proposals/ghost_host_tts.md use sqlx::PgPool; use uuid::Uuid; use crate::cas::CasStore; use crate::jobs::JobRow; use crate::stdb::StdbClient; /// Maks tekst-lengde for TTS (ElevenLabs grense er 5000 tegn per kall). const MAX_TEXT_LENGTH: usize = 5000; /// Håndterer tts_generate-jobb. pub async fn handle_tts_job( job: &JobRow, db: &PgPool, stdb: &StdbClient, cas: &CasStore, ) -> Result { let text = job.payload["text"] .as_str() .ok_or("Mangler 'text' i payload")? .to_string(); if text.is_empty() { return Err("Tom tekst — ingenting å generere".to_string()); } if text.len() > MAX_TEXT_LENGTH { return Err(format!( "Tekst for lang: {} tegn (maks {})", text.len(), MAX_TEXT_LENGTH )); } let source_node_id: Option = job.payload["source_node_id"] .as_str() .and_then(|s| s.parse().ok()); let requested_by: Uuid = job.payload["requested_by"] .as_str() .and_then(|s| s.parse().ok()) .ok_or("Mangler gyldig 'requested_by' i payload")?; // Bestem voice_id: payload > source-node metadata > env default let voice_id = resolve_voice_id(job, db, source_node_id).await?; let language = job.payload["language"] .as_str() .unwrap_or("no") .to_string(); tracing::info!( text_len = text.len(), voice_id = %voice_id, language = %language, "Starter TTS-generering" ); // 1. Kall ElevenLabs API let audio_bytes = call_elevenlabs(&text, &voice_id).await?; tracing::info!( audio_size = audio_bytes.len(), "Mottok lyd fra ElevenLabs" ); // 2. Lagre i CAS let cas_result = cas .store(&audio_bytes) .await .map_err(|e| format!("CAS-lagring feilet: {e}"))?; tracing::info!( cas_hash = %cas_result.hash, size = cas_result.size, dedup = cas_result.already_existed, "Lyd lagret i CAS" ); // 3. Opprett media-node for lydfilen let media_node_id = Uuid::now_v7(); let title = format!("TTS: {}", truncate(&text, 60)); let metadata = serde_json::json!({ "cas_hash": cas_result.hash, "mime": "audio/mpeg", "size_bytes": cas_result.size, "tts": { "provider": "elevenlabs", "voice_id": voice_id, "language": language, "characters": text.len(), } }); let metadata_str = metadata.to_string(); // STDB først (sanntid) stdb.create_node( &media_node_id.to_string(), "content", &title, "", "hidden", &metadata_str, &requested_by.to_string(), ) .await .map_err(|e| format!("STDB create_node feilet: {e}"))?; // PG (persistering) sqlx::query( r#" INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by) VALUES ($1, 'content', $2, '', 'hidden'::visibility, $3, $4) "#, ) .bind(media_node_id) .bind(&title) .bind(&metadata) .bind(requested_by) .execute(db) .await .map_err(|e| format!("PG insert media-node feilet: {e}"))?; // 4. Opprett has_media-edge fra kilde-node til TTS-noden (hvis kilde finnes) if let Some(source_id) = source_node_id { let edge_id = Uuid::now_v7(); let edge_meta = serde_json::json!({ "media_type": "tts_audio", "generated_at": chrono::Utc::now().to_rfc3339() }); let empty_meta = edge_meta.to_string(); stdb.create_edge( &edge_id.to_string(), &source_id.to_string(), &media_node_id.to_string(), "has_media", &empty_meta, false, &requested_by.to_string(), ) .await .map_err(|e| format!("STDB create_edge (has_media) feilet: {e}"))?; sqlx::query( r#" INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by) VALUES ($1, $2, $3, 'has_media', $4, false, $5) "#, ) .bind(edge_id) .bind(source_id) .bind(media_node_id) .bind(&edge_meta) .bind(requested_by) .execute(db) .await .map_err(|e| format!("PG insert has_media-edge feilet: {e}"))?; } // 5. Logg ressursforbruk sqlx::query( r#" INSERT INTO resource_usage_log (target_node_id, triggered_by, resource_type, detail) VALUES ($1, $2, 'tts', $3) "#, ) .bind(media_node_id) .bind(requested_by) .bind(serde_json::json!({ "provider": "elevenlabs", "characters": text.len(), "voice_id": voice_id })) .execute(db) .await .map_err(|e| format!("Ressurslogging feilet: {e}"))?; Ok(serde_json::json!({ "status": "completed", "media_node_id": media_node_id.to_string(), "cas_hash": cas_result.hash, "characters": text.len(), "audio_size_bytes": cas_result.size, "voice_id": voice_id, })) } /// Bestem voice_id: payload-verdi > source-node metadata.voice_preference > env default. /// "Mottaker-preferanse i metadata" betyr at en node kan ha /// metadata.voice_preference.voice_id som overstyrer default. async fn resolve_voice_id( job: &JobRow, db: &PgPool, source_node_id: Option, ) -> Result { // 1. Eksplisitt i payload — alltid høyest prioritet if let Some(vid) = job.payload["voice_id"].as_str() { if !vid.is_empty() { return Ok(vid.to_string()); } } // 2. Sjekk source-nodens metadata.voice_preference.voice_id if let Some(node_id) = source_node_id { let meta: Option = sqlx::query_scalar( "SELECT metadata FROM nodes WHERE id = $1", ) .bind(node_id) .fetch_optional(db) .await .map_err(|e| format!("PG-feil ved henting av voice_preference: {e}"))? .flatten(); if let Some(meta) = meta { if let Some(vid) = meta .get("voice_preference") .and_then(|vp| vp.get("voice_id")) .and_then(|v| v.as_str()) { if !vid.is_empty() { tracing::info!( node_id = %node_id, voice_id = %vid, "Bruker mottaker-preferanse fra node-metadata" ); return Ok(vid.to_string()); } } } } // 3. Miljøvariabel-default let default = std::env::var("ELEVENLABS_DEFAULT_VOICE") .unwrap_or_else(|_| "21m00Tcm4TlvDq8ikWAM".to_string()); // Rachel (ElevenLabs premade) Ok(default) } /// Kall ElevenLabs text-to-speech API. /// Returnerer rå MP3-bytes. async fn call_elevenlabs(text: &str, voice_id: &str) -> Result, String> { let api_key = std::env::var("ELEVENLABS_API_KEY") .map_err(|_| "ELEVENLABS_API_KEY er ikke satt".to_string())?; let model_id = std::env::var("ELEVENLABS_MODEL") .unwrap_or_else(|_| "eleven_multilingual_v2".to_string()); let url = format!( "https://api.elevenlabs.io/v1/text-to-speech/{voice_id}" ); let payload = serde_json::json!({ "text": text, "model_id": model_id, "voice_settings": { "stability": 0.5, "similarity_boost": 0.75, "style": 0.0, "use_speaker_boost": true } }); let client = reqwest::Client::new(); let resp = client .post(&url) .header("xi-api-key", &api_key) .header("Accept", "audio/mpeg") .json(&payload) .timeout(std::time::Duration::from_secs(30)) .send() .await .map_err(|e| format!("ElevenLabs HTTP-feil: {e}"))?; if !resp.status().is_success() { let status = resp.status(); let body = resp.text().await.unwrap_or_default(); return Err(format!("ElevenLabs returnerte {status}: {body}")); } resp.bytes() .await .map(|b| b.to_vec()) .map_err(|e| format!("Kunne ikke lese ElevenLabs-respons: {e}")) } /// Forkorter en streng til maks `max_len` tegn med "..." suffix. fn truncate(s: &str, max_len: usize) -> String { if s.len() <= max_len { s.to_string() } else { let end = s.char_indices().nth(max_len - 3).map(|(i, _)| i).unwrap_or(s.len()); format!("{}...", &s[..end]) } }