From 0bfad1eb8a14c2e32bcb9672d55b51c7738b983f Mon Sep 17 00:00:00 2001
From: vegard <vnotnes@pm.me>
Date: Thu, 19 Mar 2026 18:12:27 +0000
Subject: [PATCH] Implementer retry med backoff og token-budsjett i
 synops-agent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Retry med exponential backoff for retryable API-feil (429, 500, 502, 503)
  med konfigurerbar --max-retries (default: 3) og Retry-After-støtte
- --max-cost flagg for token-budsjett (USD), stopper og rapporterer
  gjenstående arbeid ved budsjettgrense (exit code 2)
- Konfigurerbar --max-tokens per provider (erstatter hardkodet 4096/8192)
- Sanntids kostnadsregnskap per modell med cost_per_million_tokens-tabell
- Detaljert token/kostnad-rapport ved avslutning

Ref: docs/proposals/agent_harness.md §3 (selvovervåking)
---
 tools/synops-agent/src/main.rs     |  76 ++++++++++--
 tools/synops-agent/src/provider.rs | 190 +++++++++++++++++++++++++++--
 2 files changed, 249 insertions(+), 17 deletions(-)
diff --git a/tools/synops-agent/src/main.rs b/tools/synops-agent/src/main.rs
index 73b5c63..b0055b1 100644
--- a/tools/synops-agent/src/main.rs
+++ b/tools/synops-agent/src/main.rs
@@ -14,7 +14,10 @@ mod tools;
 
 use clap::Parser;
 use context::{CompactionConfig, CompactionLevel, check_compaction_level, compact_messages};
-use provider::{ApiKeys, CompletionResponse, Message, TokenUsage, create_provider};
+use provider::{
+    ApiKeys, Message, RetryConfig, TokenUsage,
+    calculate_cost, complete_with_retry, create_provider,
+};
 use std::collections::HashMap;
 use std::path::PathBuf;
 
@@ -48,6 +51,18 @@ struct Cli {
     /// Spawn Claude Code i stedet (bruker abonnement)
     #[arg(long)]
     claude: bool,
+
+    /// Maks kostnad i USD (f.eks. 0.50). Stopper og rapporterer ved grense.
+    #[arg(long)]
+    max_cost: Option<f64>,
+
+    /// Maks output-tokens per LLM-kall (overstyrer provider-default)
+    #[arg(long)]
+    max_tokens: Option<u32>,
+
+    /// Maks antall retries ved API-feil (default: 3)
+    #[arg(long, default_value = "3")]
+    max_retries: u32,
 }
 
 #[tokio::main]
@@ -67,7 +82,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     }
 
     let api_keys = ApiKeys::from_env();
-    let provider = create_provider(&cli.model, &api_keys)?;
+    let provider = create_provider(&cli.model, &api_keys, cli.max_tokens)?;
 
     tracing::info!(
         model = provider.model_id(),
@@ -98,7 +113,15 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // Token accounting
     let mut total_usage: HashMap<String, TokenUsage> = HashMap::new();
+    let mut total_cost: f64 = 0.0;
     let mut iteration = 0;
+    let mut budget_exhausted = false;
+
+    // Retry config
+    let retry_config = RetryConfig {
+        max_retries: cli.max_retries,
+        ..Default::default()
+    };
 
     // Context compaction config
     let compaction_config = CompactionConfig {
@@ -107,7 +130,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     };
     tracing::info!(
         context_window = compaction_config.context_window,
-        "ACC konfigurert"
+        max_cost = cli.max_cost.map(|c| format!("${:.2}", c)).as_deref().unwrap_or("unlimited"),
+        "Agent konfigurert"
     );
 
     // === Agent loop ===
@@ -118,27 +142,49 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             break;
         }
 
-        // Call LLM
-        let response: CompletionResponse = provider.complete(&messages, &tool_defs).await?;
+        // Budget check before calling LLM
+        if let Some(max_cost) = cli.max_cost {
+            if total_cost >= max_cost {
+                budget_exhausted = true;
+                tracing::warn!(
+                    cost = format!("${:.4}", total_cost),
+                    budget = format!("${:.2}", max_cost),
+                    "Budsjettgrense nådd — stopper"
+                );
+                break;
+            }
+        }
 
-        // Accumulate token usage
+        // Call LLM with retry
+        let response = complete_with_retry(
+            provider.as_ref(),
+            &messages,
+            &tool_defs,
+            &retry_config,
+        ).await?;
+
+        // Accumulate token usage and cost
         let entry = total_usage
             .entry(response.model.clone())
             .or_insert_with(TokenUsage::default);
         entry.input_tokens += response.usage.input_tokens;
         entry.output_tokens += response.usage.output_tokens;
 
+        let call_cost = calculate_cost(&response.model, &response.usage);
+        total_cost += call_cost;
+
         if cli.verbose {
             tracing::info!(
                 iteration,
                 input = response.usage.input_tokens,
                 output = response.usage.output_tokens,
+                call_cost = format!("${:.4}", call_cost),
+                total_cost = format!("${:.4}", total_cost),
                 "LLM-kall"
             );
         }
 
         // === Adaptive Context Compaction ===
-        // Use prompt_tokens from the API response as calibration anchor
         let level = check_compaction_level(response.usage.input_tokens, &compaction_config);
         if level != CompactionLevel::None {
             let ratio = response.usage.input_tokens as f64 / compaction_config.context_window as f64;
@@ -212,16 +258,28 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let mut total_in = 0u64;
     let mut total_out = 0u64;
     for (model, usage) in &total_usage {
+        let model_cost = calculate_cost(model, usage);
         eprintln!(
-            "  {}: {} inn / {} ut",
-            model, usage.input_tokens, usage.output_tokens
+            "  {}: {} inn / {} ut (${:.4})",
+            model, usage.input_tokens, usage.output_tokens, model_cost
         );
         total_in += usage.input_tokens;
         total_out += usage.output_tokens;
     }
     eprintln!("  Totalt: {} inn / {} ut", total_in, total_out);
+    eprintln!("  Kostnad: ${:.4}", total_cost);
+    if let Some(max_cost) = cli.max_cost {
+        eprintln!("  Budsjett: ${:.2} ({:.0}% brukt)", max_cost, (total_cost / max_cost) * 100.0);
+    }
     eprintln!("  Iterasjoner: {}", iteration);
 
+    if budget_exhausted {
+        eprintln!("\n⚠ Budsjettgrense nådd. Oppgaven er ikke fullført.");
+        eprintln!("  Gjenstående arbeid bør fortsettes med høyere --max-cost");
+        eprintln!("  eller manuelt. Kontekst kan gjenopprettes fra meldingsloggen.");
+        std::process::exit(2);
+    }
+
     Ok(())
 }
 
diff --git a/tools/synops-agent/src/provider.rs b/tools/synops-agent/src/provider.rs
index 52d2ebd..f85ad36 100644
--- a/tools/synops-agent/src/provider.rs
+++ b/tools/synops-agent/src/provider.rs
@@ -96,6 +96,38 @@ pub enum ProviderError {
     Parse(String),
     #[error("No API key configured for {provider}")]
     NoApiKey { provider: String },
+    #[allow(dead_code)]
+    #[error("Budget exhausted: {0}")]
+    BudgetExhausted(String),
+}
+
+impl ProviderError {
+    /// Whether this error is retryable (transient API failures).
+    pub fn is_retryable(&self) -> bool {
+        match self {
+            ProviderError::Api { status, .. } => matches!(status, 429 | 500 | 502 | 503),
+            ProviderError::Http(e) => {
+                // Retry on connection/timeout errors
+                e.is_timeout() || e.is_connect()
+            }
+            _ => false,
+        }
+    }
+
+    /// Extract Retry-After hint (seconds) from error body if present.
+    pub fn retry_after_hint(&self) -> Option<u64> {
+        if let ProviderError::Api { status: 429, body } = self {
+            // Some APIs include retry_after in JSON error body
+            if let Ok(v) = serde_json::from_str::<serde_json::Value>(body) {
+                if let Some(secs) = v["error"]["retry_after"].as_u64()
+                    .or_else(|| v["retry_after"].as_u64())
+                {
+                    return Some(secs);
+                }
+            }
+        }
+        None
+    }
 }
 
 // ============================================================================
@@ -108,6 +140,7 @@ pub struct OpenAiCompatible {
     api_key: String,
     model: String,
     provider_name: String,
+    max_tokens: u32,
 }
 
 impl OpenAiCompatible {
@@ -123,9 +156,15 @@ impl OpenAiCompatible {
             api_key: api_key.into(),
             model: model.into(),
             provider_name: provider_name.into(),
+            max_tokens: 4096,
         }
     }
 
+    pub fn with_max_tokens(mut self, max_tokens: u32) -> Self {
+        self.max_tokens = max_tokens;
+        self
+    }
+
     /// OpenRouter
     pub fn openrouter(api_key: impl Into<String>, model: impl Into<String>) -> Self {
         Self::new(
@@ -177,7 +216,7 @@ impl LlmProvider for OpenAiCompatible {
         let mut body = serde_json::json!({
             "model": self.model,
             "messages": messages,
-            "max_tokens": 4096,
+            "max_tokens": self.max_tokens,
         });
 
         if !tools.is_empty() {
@@ -251,6 +290,7 @@ pub struct Anthropic {
     client: reqwest::Client,
     api_key: String,
     model: String,
+    max_tokens: u32,
 }
 
 impl Anthropic {
@@ -259,8 +299,14 @@ impl Anthropic {
             client: reqwest::Client::new(),
             api_key: api_key.into(),
             model: model.into(),
+            max_tokens: 8192,
         }
     }
+
+    pub fn with_max_tokens(mut self, max_tokens: u32) -> Self {
+        self.max_tokens = max_tokens;
+        self
+    }
 }
 
 #[async_trait::async_trait]
@@ -327,7 +373,7 @@ impl LlmProvider for Anthropic {
 
         let mut body = serde_json::json!({
             "model": self.model,
-            "max_tokens": 8192,
+            "max_tokens": self.max_tokens,
             "messages": anthropic_messages,
         });
 
@@ -429,6 +475,7 @@ pub struct Gemini {
     client: reqwest::Client,
     api_key: String,
     model: String,
+    max_tokens: u32,
 }
 
 impl Gemini {
@@ -437,8 +484,14 @@ impl Gemini {
             client: reqwest::Client::new(),
             api_key: api_key.into(),
             model: model.into(),
+            max_tokens: 8192,
         }
     }
+
+    pub fn with_max_tokens(mut self, max_tokens: u32) -> Self {
+        self.max_tokens = max_tokens;
+        self
+    }
 }
 
 #[async_trait::async_trait]
@@ -517,6 +570,9 @@ impl LlmProvider for Gemini {
 
         let mut body = serde_json::json!({
             "contents": contents,
+            "generationConfig": {
+                "maxOutputTokens": self.max_tokens,
+            },
         });
 
         if !system.is_empty() {
@@ -633,6 +689,7 @@ impl LlmProvider for Gemini {
 pub fn create_provider(
     model_spec: &str,
     api_keys: &ApiKeys,
+    max_tokens: Option<u32>,
 ) -> Result<Box<dyn LlmProvider>, ProviderError> {
     let (provider, model) = if let Some(idx) = model_spec.find('/') {
         (&model_spec[..idx], &model_spec[idx + 1..])
@@ -644,35 +701,152 @@ pub fn create_provider(
         "openrouter" => {
             let key = api_keys.openrouter.as_deref()
                 .ok_or_else(|| ProviderError::NoApiKey { provider: "openrouter".into() })?;
-            Ok(Box::new(OpenAiCompatible::openrouter(key, model)))
+            let mut p = OpenAiCompatible::openrouter(key, model);
+            if let Some(mt) = max_tokens { p = p.with_max_tokens(mt); }
+            Ok(Box::new(p))
         }
         "anthropic" => {
             let key = api_keys.anthropic.as_deref()
                 .ok_or_else(|| ProviderError::NoApiKey { provider: "anthropic".into() })?;
-            Ok(Box::new(Anthropic::new(key, model)))
+            let mut p = Anthropic::new(key, model);
+            if let Some(mt) = max_tokens { p = p.with_max_tokens(mt); }
+            Ok(Box::new(p))
         }
         "gemini" | "google" => {
             let key = api_keys.gemini.as_deref()
                 .ok_or_else(|| ProviderError::NoApiKey { provider: "gemini".into() })?;
-            Ok(Box::new(Gemini::new(key, model)))
+            let mut p = Gemini::new(key, model);
+            if let Some(mt) = max_tokens { p = p.with_max_tokens(mt); }
+            Ok(Box::new(p))
         }
         "xai" | "grok" => {
             let key = api_keys.xai.as_deref()
                 .ok_or_else(|| ProviderError::NoApiKey { provider: "xai".into() })?;
-            Ok(Box::new(OpenAiCompatible::xai(key, model)))
+            let mut p = OpenAiCompatible::xai(key, model);
+            if let Some(mt) = max_tokens { p = p.with_max_tokens(mt); }
+            Ok(Box::new(p))
         }
         "openai" => {
             let key = api_keys.openai.as_deref()
                 .ok_or_else(|| ProviderError::NoApiKey { provider: "openai".into() })?;
-            Ok(Box::new(OpenAiCompatible::openai(key, model)))
+            let mut p = OpenAiCompatible::openai(key, model);
+            if let Some(mt) = max_tokens { p = p.with_max_tokens(mt); }
+            Ok(Box::new(p))
         }
         "ollama" | "local" => {
-            Ok(Box::new(OpenAiCompatible::ollama(model)))
+            let mut p = OpenAiCompatible::ollama(model);
+            if let Some(mt) = max_tokens { p = p.with_max_tokens(mt); }
+            Ok(Box::new(p))
         }
         _ => Err(ProviderError::Parse(format!("Unknown provider: {}", provider))),
     }
 }
 
+// ============================================================================
+// Cost estimation
+// ============================================================================
+
+/// Cost per million tokens (input, output) for known models.
+/// Returns (0.0, 0.0) for unknown models.
+pub fn cost_per_million_tokens(model: &str) -> (f64, f64) {
+    let m = model.to_lowercase();
+
+    // Anthropic
+    if m.contains("opus") { return (15.0, 75.0); }
+    if m.contains("sonnet") { return (3.0, 15.0); }
+    if m.contains("haiku") { return (0.25, 1.25); }
+
+    // Gemini
+    if m.contains("gemini") && m.contains("flash") { return (0.075, 0.30); }
+    if m.contains("gemini") && m.contains("pro") { return (1.25, 5.0); }
+
+    // Grok
+    if m.contains("grok-3") && m.contains("mini") { return (0.30, 0.50); }
+    if m.contains("grok-3") { return (3.0, 15.0); }
+    if m.contains("grok") { return (2.0, 10.0); }
+
+    // OpenAI
+    if m.contains("gpt-4o-mini") { return (0.15, 0.60); }
+    if m.contains("gpt-4o") { return (2.50, 10.0); }
+    if m.contains("o1") || m.contains("o3") { return (10.0, 40.0); }
+
+    // Local / unknown — free
+    (0.0, 0.0)
+}
+
+/// Calculate cost from token usage and model.
+pub fn calculate_cost(model: &str, usage: &TokenUsage) -> f64 {
+    let (input_cpm, output_cpm) = cost_per_million_tokens(model);
+    (usage.input_tokens as f64 * input_cpm + usage.output_tokens as f64 * output_cpm) / 1_000_000.0
+}
+
+// ============================================================================
+// Retry with exponential backoff
+// ============================================================================
+
+/// Retry configuration for API calls.
+pub struct RetryConfig {
+    /// Maximum number of retries (default: 3).
+    pub max_retries: u32,
+    /// Base delay in milliseconds (default: 1000).
+    pub base_delay_ms: u64,
+    /// Maximum delay in milliseconds (default: 30000).
+    pub max_delay_ms: u64,
+}
+
+impl Default for RetryConfig {
+    fn default() -> Self {
+        Self {
+            max_retries: 3,
+            base_delay_ms: 1_000,
+            max_delay_ms: 30_000,
+        }
+    }
+}
+
+/// Call provider.complete() with retry + exponential backoff on retryable errors.
+pub async fn complete_with_retry(
+    provider: &dyn LlmProvider,
+    messages: &[Message],
+    tools: &[ToolDef],
+    config: &RetryConfig,
+) -> Result<CompletionResponse, ProviderError> {
+    let mut last_err = None;
+
+    for attempt in 0..=config.max_retries {
+        match provider.complete(messages, tools).await {
+            Ok(resp) => return Ok(resp),
+            Err(e) => {
+                if !e.is_retryable() || attempt == config.max_retries {
+                    return Err(e);
+                }
+
+                // Calculate delay: exponential backoff with jitter
+                let base = config.base_delay_ms * 2u64.pow(attempt);
+                let delay = if let Some(retry_after) = e.retry_after_hint() {
+                    // Respect Retry-After header from 429
+                    retry_after * 1000
+                } else {
+                    base.min(config.max_delay_ms)
+                };
+
+                tracing::warn!(
+                    attempt = attempt + 1,
+                    max_retries = config.max_retries,
+                    delay_ms = delay,
+                    error = %e,
+                    "Retryable API error, backing off"
+                );
+
+                tokio::time::sleep(std::time::Duration::from_millis(delay)).await;
+                last_err = Some(e);
+            }
+        }
+    }
+
+    Err(last_err.unwrap())
+}
+
 /// API keys loaded from environment
 #[derive(Debug, Clone, Default)]
 pub struct ApiKeys {