TTS-pipeline: tekst → lyd via ElevenLabs (oppgave 10.4)
Ny jobbtype `tts_generate` som kaller ElevenLabs text-to-speech API, lagrer MP3-lyd i CAS, og oppretter media-node med has_media-edge. Voice-preferanse løses i tre lag: eksplisitt i payload → nodens metadata.voice_preference → ELEVENLABS_DEFAULT_VOICE env. Dette er "mottaker-preferanse i metadata" — en node kan sette voice_preference i sin metadata for å styre hvilken stemme som brukes. Ny migrasjon 009: resource_usage_log-tabell for sporing av ressursforbruk (TTS, AI, Whisper, CAS). Ref: docs/features/ressursforbruk.md Endringer: - maskinrommet/src/tts.rs: TTS-handler med ElevenLabs-integrasjon - maskinrommet/src/intentions.rs: POST /intentions/generate_tts - maskinrommet/src/jobs.rs: Dispatcher for tts_generate - migrations/009_resource_usage_and_tts.sql: resource_usage_log - scripts/maskinrommet-env.sh: ELEVENLABS_* env-variabler Krever: ELEVENLABS_API_KEY i /srv/synops/.env (placeholder lagt til) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1803619622
commit
e95b7d6663
7 changed files with 438 additions and 2 deletions
|
|
@ -2227,3 +2227,95 @@ pub async fn summarize(
|
|||
|
||||
Ok(Json(SummarizeResponse { job_id }))
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// POST /intentions/generate_tts — tekst-til-tale via ElevenLabs
|
||||
// =============================================================================
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub struct GenerateTtsRequest {
|
||||
/// Teksten som skal leses opp (maks 5000 tegn).
|
||||
pub text: String,
|
||||
/// Valgfri ElevenLabs voice_id. Faller tilbake på node-preferanse eller default.
|
||||
pub voice_id: Option<String>,
|
||||
/// Valgfri kilde-node som TTS-lyden knyttes til med has_media-edge.
|
||||
/// Hvis noden har metadata.voice_preference, brukes den som fallback for voice_id.
|
||||
pub source_node_id: Option<Uuid>,
|
||||
/// Språk (default: "no").
|
||||
pub language: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct GenerateTtsResponse {
|
||||
pub job_id: Uuid,
|
||||
}
|
||||
|
||||
/// POST /intentions/generate_tts
|
||||
///
|
||||
/// Legger en `tts_generate`-jobb i køen.
|
||||
/// Lydfilen opprettes asynkront som en ny content-node i CAS.
|
||||
pub async fn generate_tts(
|
||||
State(state): State<AppState>,
|
||||
user: AuthUser,
|
||||
Json(req): Json<GenerateTtsRequest>,
|
||||
) -> Result<Json<GenerateTtsResponse>, (StatusCode, Json<ErrorResponse>)> {
|
||||
if req.text.is_empty() {
|
||||
return Err(bad_request("Tekst kan ikke være tom"));
|
||||
}
|
||||
if req.text.len() > 5000 {
|
||||
return Err(bad_request("Tekst for lang (maks 5000 tegn)"));
|
||||
}
|
||||
|
||||
// Hvis source_node_id er oppgitt, verifiser at noden finnes
|
||||
if let Some(source_id) = req.source_node_id {
|
||||
let exists: bool = sqlx::query_scalar::<_, bool>(
|
||||
"SELECT EXISTS(SELECT 1 FROM nodes WHERE id = $1)",
|
||||
)
|
||||
.bind(source_id)
|
||||
.fetch_one(&state.db)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::error!(error = %e, "PG-feil ved node-sjekk");
|
||||
internal_error("Databasefeil")
|
||||
})?;
|
||||
|
||||
if !exists {
|
||||
return Err(bad_request("Kildenode finnes ikke"));
|
||||
}
|
||||
}
|
||||
|
||||
let mut payload = serde_json::json!({
|
||||
"text": req.text,
|
||||
"requested_by": user.node_id.to_string(),
|
||||
"language": req.language.as_deref().unwrap_or("no"),
|
||||
});
|
||||
|
||||
if let Some(ref vid) = req.voice_id {
|
||||
payload["voice_id"] = serde_json::Value::String(vid.clone());
|
||||
}
|
||||
if let Some(source_id) = req.source_node_id {
|
||||
payload["source_node_id"] = serde_json::Value::String(source_id.to_string());
|
||||
}
|
||||
|
||||
let job_id = crate::jobs::enqueue(
|
||||
&state.db,
|
||||
"tts_generate",
|
||||
payload,
|
||||
None,
|
||||
5, // Middels prioritet
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::error!(error = %e, "Kunne ikke legge TTS-jobb i kø");
|
||||
internal_error("Kunne ikke starte TTS-generering")
|
||||
})?;
|
||||
|
||||
tracing::info!(
|
||||
job_id = %job_id,
|
||||
text_len = req.text.len(),
|
||||
user = %user.node_id,
|
||||
"TTS-jobb lagt i kø"
|
||||
);
|
||||
|
||||
Ok(Json(GenerateTtsResponse { job_id }))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ use crate::cas::CasStore;
|
|||
use crate::stdb::StdbClient;
|
||||
use crate::summarize;
|
||||
use crate::transcribe;
|
||||
use crate::tts;
|
||||
|
||||
/// Rad fra job_queue-tabellen.
|
||||
#[derive(sqlx::FromRow, Debug)]
|
||||
|
|
@ -163,6 +164,9 @@ async fn dispatch(
|
|||
"summarize_communication" => {
|
||||
summarize::handle_summarize_communication(job, db, stdb).await
|
||||
}
|
||||
"tts_generate" => {
|
||||
tts::handle_tts_job(job, db, stdb, cas).await
|
||||
}
|
||||
other => Err(format!("Ukjent jobbtype: {other}")),
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ mod serving;
|
|||
mod stdb;
|
||||
pub mod summarize;
|
||||
pub mod transcribe;
|
||||
pub mod tts;
|
||||
mod warmup;
|
||||
|
||||
use axum::{extract::State, http::StatusCode, routing::{get, post}, Json, Router};
|
||||
|
|
@ -151,6 +152,7 @@ async fn main() {
|
|||
.route("/intentions/retranscribe", post(intentions::retranscribe))
|
||||
.route("/intentions/resolve_retranscription", post(intentions::resolve_retranscription))
|
||||
.route("/intentions/summarize", post(intentions::summarize))
|
||||
.route("/intentions/generate_tts", post(intentions::generate_tts))
|
||||
.route("/query/aliases", get(queries::query_aliases))
|
||||
.route("/query/graph", get(queries::query_graph))
|
||||
.route("/query/transcription_versions", get(queries::query_transcription_versions))
|
||||
|
|
|
|||
315
maskinrommet/src/tts.rs
Normal file
315
maskinrommet/src/tts.rs
Normal file
|
|
@ -0,0 +1,315 @@
|
|||
// TTS-pipeline — tekst til lyd via ElevenLabs.
|
||||
//
|
||||
// Jobbtype: "tts_generate"
|
||||
// Payload: {
|
||||
// "text": "<tekst som skal leses opp>",
|
||||
// "voice_id": "<elevenlabs voice_id>", (valgfritt; faller tilbake på node-preferanse, deretter default)
|
||||
// "language": "no", (valgfritt)
|
||||
// "source_node_id": "<uuid>", (valgfritt, noden teksten tilhører)
|
||||
// "requested_by": "<uuid>"
|
||||
// }
|
||||
//
|
||||
// Flyten:
|
||||
// 1. Hent tekst og voice-preferanse fra payload
|
||||
// 2. Sjekk mottaker-preferanse i source-nodens metadata (voice_preference.voice_id)
|
||||
// 3. Kall ElevenLabs API
|
||||
// 4. Lagre lyd i CAS
|
||||
// 5. Opprett media-node med has_media-edge til kilde
|
||||
// 6. Logg ressursforbruk
|
||||
//
|
||||
// Ref: docs/features/ressursforbruk.md, docs/proposals/ghost_host_tts.md
|
||||
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::cas::CasStore;
|
||||
use crate::jobs::JobRow;
|
||||
use crate::stdb::StdbClient;
|
||||
|
||||
/// Maks tekst-lengde for TTS (ElevenLabs grense er 5000 tegn per kall).
|
||||
const MAX_TEXT_LENGTH: usize = 5000;
|
||||
|
||||
/// Håndterer tts_generate-jobb.
|
||||
pub async fn handle_tts_job(
|
||||
job: &JobRow,
|
||||
db: &PgPool,
|
||||
stdb: &StdbClient,
|
||||
cas: &CasStore,
|
||||
) -> Result<serde_json::Value, String> {
|
||||
let text = job.payload["text"]
|
||||
.as_str()
|
||||
.ok_or("Mangler 'text' i payload")?
|
||||
.to_string();
|
||||
|
||||
if text.is_empty() {
|
||||
return Err("Tom tekst — ingenting å generere".to_string());
|
||||
}
|
||||
if text.len() > MAX_TEXT_LENGTH {
|
||||
return Err(format!(
|
||||
"Tekst for lang: {} tegn (maks {})",
|
||||
text.len(),
|
||||
MAX_TEXT_LENGTH
|
||||
));
|
||||
}
|
||||
|
||||
let source_node_id: Option<Uuid> = job.payload["source_node_id"]
|
||||
.as_str()
|
||||
.and_then(|s| s.parse().ok());
|
||||
|
||||
let requested_by: Uuid = job.payload["requested_by"]
|
||||
.as_str()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.ok_or("Mangler gyldig 'requested_by' i payload")?;
|
||||
|
||||
// Bestem voice_id: payload > source-node metadata > env default
|
||||
let voice_id = resolve_voice_id(job, db, source_node_id).await?;
|
||||
|
||||
let language = job.payload["language"]
|
||||
.as_str()
|
||||
.unwrap_or("no")
|
||||
.to_string();
|
||||
|
||||
tracing::info!(
|
||||
text_len = text.len(),
|
||||
voice_id = %voice_id,
|
||||
language = %language,
|
||||
"Starter TTS-generering"
|
||||
);
|
||||
|
||||
// 1. Kall ElevenLabs API
|
||||
let audio_bytes = call_elevenlabs(&text, &voice_id).await?;
|
||||
|
||||
tracing::info!(
|
||||
audio_size = audio_bytes.len(),
|
||||
"Mottok lyd fra ElevenLabs"
|
||||
);
|
||||
|
||||
// 2. Lagre i CAS
|
||||
let cas_result = cas
|
||||
.store(&audio_bytes)
|
||||
.await
|
||||
.map_err(|e| format!("CAS-lagring feilet: {e}"))?;
|
||||
|
||||
tracing::info!(
|
||||
cas_hash = %cas_result.hash,
|
||||
size = cas_result.size,
|
||||
dedup = cas_result.already_existed,
|
||||
"Lyd lagret i CAS"
|
||||
);
|
||||
|
||||
// 3. Opprett media-node for lydfilen
|
||||
let media_node_id = Uuid::now_v7();
|
||||
let title = format!("TTS: {}", truncate(&text, 60));
|
||||
let metadata = serde_json::json!({
|
||||
"cas_hash": cas_result.hash,
|
||||
"mime": "audio/mpeg",
|
||||
"size_bytes": cas_result.size,
|
||||
"tts": {
|
||||
"provider": "elevenlabs",
|
||||
"voice_id": voice_id,
|
||||
"language": language,
|
||||
"characters": text.len(),
|
||||
}
|
||||
});
|
||||
let metadata_str = metadata.to_string();
|
||||
|
||||
// STDB først (sanntid)
|
||||
stdb.create_node(
|
||||
&media_node_id.to_string(),
|
||||
"content",
|
||||
&title,
|
||||
"",
|
||||
"hidden",
|
||||
&metadata_str,
|
||||
&requested_by.to_string(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| format!("STDB create_node feilet: {e}"))?;
|
||||
|
||||
// PG (persistering)
|
||||
sqlx::query(
|
||||
r#"
|
||||
INSERT INTO nodes (id, node_kind, title, content, visibility, metadata, created_by)
|
||||
VALUES ($1, 'content', $2, '', 'hidden'::visibility, $3, $4)
|
||||
"#,
|
||||
)
|
||||
.bind(media_node_id)
|
||||
.bind(&title)
|
||||
.bind(&metadata)
|
||||
.bind(requested_by)
|
||||
.execute(db)
|
||||
.await
|
||||
.map_err(|e| format!("PG insert media-node feilet: {e}"))?;
|
||||
|
||||
// 4. Opprett has_media-edge fra kilde-node til TTS-noden (hvis kilde finnes)
|
||||
if let Some(source_id) = source_node_id {
|
||||
let edge_id = Uuid::now_v7();
|
||||
let edge_meta = serde_json::json!({
|
||||
"media_type": "tts_audio",
|
||||
"generated_at": chrono::Utc::now().to_rfc3339()
|
||||
});
|
||||
let empty_meta = edge_meta.to_string();
|
||||
|
||||
stdb.create_edge(
|
||||
&edge_id.to_string(),
|
||||
&source_id.to_string(),
|
||||
&media_node_id.to_string(),
|
||||
"has_media",
|
||||
&empty_meta,
|
||||
false,
|
||||
&requested_by.to_string(),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| format!("STDB create_edge (has_media) feilet: {e}"))?;
|
||||
|
||||
sqlx::query(
|
||||
r#"
|
||||
INSERT INTO edges (id, source_id, target_id, edge_type, metadata, system, created_by)
|
||||
VALUES ($1, $2, $3, 'has_media', $4, false, $5)
|
||||
"#,
|
||||
)
|
||||
.bind(edge_id)
|
||||
.bind(source_id)
|
||||
.bind(media_node_id)
|
||||
.bind(&edge_meta)
|
||||
.bind(requested_by)
|
||||
.execute(db)
|
||||
.await
|
||||
.map_err(|e| format!("PG insert has_media-edge feilet: {e}"))?;
|
||||
}
|
||||
|
||||
// 5. Logg ressursforbruk
|
||||
sqlx::query(
|
||||
r#"
|
||||
INSERT INTO resource_usage_log (target_node_id, triggered_by, resource_type, detail)
|
||||
VALUES ($1, $2, 'tts', $3)
|
||||
"#,
|
||||
)
|
||||
.bind(media_node_id)
|
||||
.bind(requested_by)
|
||||
.bind(serde_json::json!({
|
||||
"provider": "elevenlabs",
|
||||
"characters": text.len(),
|
||||
"voice_id": voice_id
|
||||
}))
|
||||
.execute(db)
|
||||
.await
|
||||
.map_err(|e| format!("Ressurslogging feilet: {e}"))?;
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"status": "completed",
|
||||
"media_node_id": media_node_id.to_string(),
|
||||
"cas_hash": cas_result.hash,
|
||||
"characters": text.len(),
|
||||
"audio_size_bytes": cas_result.size,
|
||||
"voice_id": voice_id,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Bestem voice_id: payload-verdi > source-node metadata.voice_preference > env default.
|
||||
/// "Mottaker-preferanse i metadata" betyr at en node kan ha
|
||||
/// metadata.voice_preference.voice_id som overstyrer default.
|
||||
async fn resolve_voice_id(
|
||||
job: &JobRow,
|
||||
db: &PgPool,
|
||||
source_node_id: Option<Uuid>,
|
||||
) -> Result<String, String> {
|
||||
// 1. Eksplisitt i payload — alltid høyest prioritet
|
||||
if let Some(vid) = job.payload["voice_id"].as_str() {
|
||||
if !vid.is_empty() {
|
||||
return Ok(vid.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Sjekk source-nodens metadata.voice_preference.voice_id
|
||||
if let Some(node_id) = source_node_id {
|
||||
let meta: Option<serde_json::Value> = sqlx::query_scalar(
|
||||
"SELECT metadata FROM nodes WHERE id = $1",
|
||||
)
|
||||
.bind(node_id)
|
||||
.fetch_optional(db)
|
||||
.await
|
||||
.map_err(|e| format!("PG-feil ved henting av voice_preference: {e}"))?
|
||||
.flatten();
|
||||
|
||||
if let Some(meta) = meta {
|
||||
if let Some(vid) = meta
|
||||
.get("voice_preference")
|
||||
.and_then(|vp| vp.get("voice_id"))
|
||||
.and_then(|v| v.as_str())
|
||||
{
|
||||
if !vid.is_empty() {
|
||||
tracing::info!(
|
||||
node_id = %node_id,
|
||||
voice_id = %vid,
|
||||
"Bruker mottaker-preferanse fra node-metadata"
|
||||
);
|
||||
return Ok(vid.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Miljøvariabel-default
|
||||
let default = std::env::var("ELEVENLABS_DEFAULT_VOICE")
|
||||
.unwrap_or_else(|_| "21m00Tcm4TlvDq8ikWAM".to_string()); // Rachel (ElevenLabs premade)
|
||||
|
||||
Ok(default)
|
||||
}
|
||||
|
||||
/// Kall ElevenLabs text-to-speech API.
|
||||
/// Returnerer rå MP3-bytes.
|
||||
async fn call_elevenlabs(text: &str, voice_id: &str) -> Result<Vec<u8>, String> {
|
||||
let api_key = std::env::var("ELEVENLABS_API_KEY")
|
||||
.map_err(|_| "ELEVENLABS_API_KEY er ikke satt".to_string())?;
|
||||
|
||||
let model_id = std::env::var("ELEVENLABS_MODEL")
|
||||
.unwrap_or_else(|_| "eleven_multilingual_v2".to_string());
|
||||
|
||||
let url = format!(
|
||||
"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
|
||||
);
|
||||
|
||||
let payload = serde_json::json!({
|
||||
"text": text,
|
||||
"model_id": model_id,
|
||||
"voice_settings": {
|
||||
"stability": 0.5,
|
||||
"similarity_boost": 0.75,
|
||||
"style": 0.0,
|
||||
"use_speaker_boost": true
|
||||
}
|
||||
});
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let resp = client
|
||||
.post(&url)
|
||||
.header("xi-api-key", &api_key)
|
||||
.header("Accept", "audio/mpeg")
|
||||
.json(&payload)
|
||||
.timeout(std::time::Duration::from_secs(30))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| format!("ElevenLabs HTTP-feil: {e}"))?;
|
||||
|
||||
if !resp.status().is_success() {
|
||||
let status = resp.status();
|
||||
let body = resp.text().await.unwrap_or_default();
|
||||
return Err(format!("ElevenLabs returnerte {status}: {body}"));
|
||||
}
|
||||
|
||||
resp.bytes()
|
||||
.await
|
||||
.map(|b| b.to_vec())
|
||||
.map_err(|e| format!("Kunne ikke lese ElevenLabs-respons: {e}"))
|
||||
}
|
||||
|
||||
/// Shortens a string to at most `max_len` characters, ending in "..." when it was cut.
///
/// Lengths are measured in characters (Unicode scalar values), not bytes, so
/// multi-byte text such as "æøå" is never split inside a character and is
/// returned unchanged when it already fits.
///
/// When `max_len` is smaller than the ellipsis itself (fewer than 3), the
/// string is cut to `max_len` characters without an ellipsis.
fn truncate(s: &str, max_len: usize) -> String {
    const ELLIPSIS: &str = "...";

    // Byte offset of the first character beyond the limit. If there is none,
    // the string already fits.
    let overflow_at = match s.char_indices().nth(max_len) {
        Some((idx, _)) => idx,
        None => return s.to_string(),
    };

    // No room for the ellipsis: hard cut at the limit.
    if max_len < ELLIPSIS.len() {
        return s[..overflow_at].to_string();
    }

    // Keep `max_len - 3` characters so the result, ellipsis included, is
    // exactly `max_len` characters long.
    let keep_chars = max_len - ELLIPSIS.len();
    let cut = s
        .char_indices()
        .nth(keep_chars)
        .map_or(s.len(), |(idx, _)| idx);

    format!("{}{}", &s[..cut], ELLIPSIS)
}
|
||||
21
migrations/009_resource_usage_and_tts.sql
Normal file
21
migrations/009_resource_usage_and_tts.sql
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
-- Migration 009: resource_usage_log + TTS-støtte
|
||||
--
|
||||
-- 1. Oppretter resource_usage_log for sporing av ressursforbruk (AI, Whisper, TTS, CAS, etc.)
|
||||
-- 2. Legger grunnlaget for TTS-jobber (voice_preference i node-metadata)
|
||||
--
|
||||
-- Ref: docs/features/ressursforbruk.md
|
||||
|
||||
-- Log of resource usage, one row per logged operation.
CREATE TABLE resource_usage_log (
    id              UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Required node reference; the TTS handler stores the generated media node here.
    target_node_id  UUID        NOT NULL REFERENCES nodes(id),
    -- Optional node reference; the TTS handler stores the job's requested_by here.
    triggered_by    UUID        REFERENCES nodes(id),
    -- Optional node reference; not set by the TTS handler.
    collection_id   UUID        REFERENCES nodes(id),
    -- Free-text resource kind; the TTS handler writes 'tts'.
    resource_type   TEXT        NOT NULL,
    -- Resource-specific details as JSON.
    detail          JSONB       NOT NULL,
    created_at      TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Lookup indexes on each node reference, plus a (type, time) index.
CREATE INDEX idx_resource_usage_target     ON resource_usage_log(target_node_id);
CREATE INDEX idx_resource_usage_triggered  ON resource_usage_log(triggered_by);
CREATE INDEX idx_resource_usage_collection ON resource_usage_log(collection_id);
CREATE INDEX idx_resource_usage_type_time  ON resource_usage_log(resource_type, created_at);
|
||||
|
|
@ -29,6 +29,9 @@ LITELLM_MASTER_KEY=$(read_env LITELLM_MASTER_KEY)
|
|||
LIVEKIT_URL=http://${LIVEKIT_IP:-localhost}:7880
|
||||
LIVEKIT_API_KEY=$(read_env LIVEKIT_API_KEY)
|
||||
LIVEKIT_API_SECRET=$(read_env LIVEKIT_API_SECRET)
|
||||
ELEVENLABS_API_KEY=$(read_env ELEVENLABS_API_KEY)
|
||||
ELEVENLABS_DEFAULT_VOICE=$(read_env ELEVENLABS_DEFAULT_VOICE)
|
||||
ELEVENLABS_MODEL=$(read_env ELEVENLABS_MODEL)
|
||||
PROJECT_DIR=/home/vegard/synops
|
||||
RUST_LOG=maskinrommet=debug,tower_http=debug
|
||||
EOF
|
||||
|
|
|
|||
3
tasks.md
3
tasks.md
|
|
@ -119,8 +119,7 @@ Uavhengige faser kan fortsatt plukkes.
|
|||
- [x] 10.1 LiteLLM oppsett: Docker-container, API-nøkler, modell-routing. Ref: `docs/infra/ai_gateway.md`.
|
||||
- [x] 10.2 AI-foreslåtte edges: maskinrommet sender innhold til LLM → foreslår mentions, topics.
|
||||
- [x] 10.3 Oppsummering: kommunikasjonsnode → AI-generert sammendrag som ny node.
|
||||
- [~] 10.4 TTS: tekst → lyd via ElevenLabs. Mottaker-preferanse i metadata.
|
||||
> Påbegynt: 2026-03-17T23:32
|
||||
- [x] 10.4 TTS: tekst → lyd via ElevenLabs. Mottaker-preferanse i metadata.
|
||||
|
||||
## Fase 11: Produksjons-pipeline
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue