synops/scripts/benchmark-models.sh
vegard 67bc564f92 Benchmark: lås til git-ref, advar ved dirty tree
Alle modeller testes mot nøyaktig samme kildekode.
Ref og tidsstempel logges i CSV-header.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-18 15:15:36 +00:00

88 lines
3 KiB
Bash
Executable file
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Benchmark every model × effort level combination, one run at a time.
# Intended to be run while the server is quiet (no task runner, no users).
#
# Usage:
#   ./scripts/benchmark-models.sh
#
# Environment:
#   BENCHMARK_REF  optional git ref recorded in the output; defaults to HEAD.
#
# Output:
#   Per-run model output and results.csv are written to a benchmark-*
#   directory under /tmp; the CSV is then copied into the repository as
#   docs/erfaringer/benchmark_<YYYYMMDD>.csv.
#   NOTE(review): an earlier version of this header named
#   docs/erfaringer/modell_benchmark.md as the destination; this script does
#   not write that file.
#
# Strict mode: abort on command failure, on use of an unset variable, and on
# failure of any stage of a pipeline.
set -euo pipefail
# Repository root: the parent of the directory that contains this script.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"

# Pin the benchmark to one git ref so that every model is tested against
# exactly the same source. Taken from the environment when set, otherwise
# HEAD at launch time.
BENCHMARK_REF="${BENCHMARK_REF:-$(git -C "$ROOT" rev-parse HEAD)}"

# The ref is written into the CSV header, so it must name a real commit;
# refuse to produce results labelled with a ref that does not resolve.
if ! git -C "$ROOT" rev-parse --verify --quiet "${BENCHMARK_REF}^{commit}" >/dev/null; then
  echo "FEIL: BENCHMARK_REF '$BENCHMARK_REF' er ikke en gyldig git-ref i $ROOT." >&2
  exit 1
fi

echo "=== Benchmark kjører mot git ref: $BENCHMARK_REF ==="
echo "=== Tidspunkt: $(date -Iseconds) ==="

# Compare the working tree with BENCHMARK_REF itself rather than with HEAD.
# This catches both uncommitted changes and the case where BENCHMARK_REF was
# supplied via the environment but a different commit is checked out.
# Only a warning (on stderr): the run continues either way.
if ! git -C "$ROOT" diff --quiet "$BENCHMARK_REF" -- 2>/dev/null; then
  echo "ADVARSEL: working tree avviker fra ref $BENCHMARK_REF (ucommittede endringer eller annen HEAD)." >&2
  echo "Resultatene kan avvike fra ref $BENCHMARK_REF." >&2
fi
# Source files the models are asked to review, relative to the repository
# root. The variable names mark them as the simple and the complex case.
SIMPLE_FILE="tools/synops-transcribe/src/main.rs"
COMPLEX_FILE="tools/synops-audio/src/main.rs"

# printf template for the review prompt; %s is replaced by the file path.
PROMPT_BASE="Les %s nøye. Gi en vurdering: 1) Hva gjør verktøyet? 2) Er det bugs? 3) Hva kan forbedres? Maks 300 ord."

# Short model name -> model identifier handed to the CLI via --model.
declare -A MODELS=()
MODELS[haiku]="claude-haiku-4-5-20251001"
MODELS[sonnet]="claude-sonnet-4-6"
MODELS[opus]="claude-opus-4-6"

# Effort level -> text placed in front of the prompt. An empty value means
# the base prompt is used unchanged.
declare -A EFFORT_PREFIX=()
EFFORT_PREFIX[low]="Svar kort og overfladisk. Ikke les grundig."
EFFORT_PREFIX[medium]=""
EFFORT_PREFIX[high]="Vær ekstremt grundig. Les hver linje nøye. Tenk dypt på edge cases."
# Per-run output directory under /tmp. mktemp -d creates it atomically with a
# random suffix, so two runs started in the same minute no longer share (and
# overwrite) one directory, and a pre-existing directory with a guessable
# name is never silently reused.
OUTDIR="$(mktemp -d "/tmp/benchmark-$(date +%Y%m%d-%H%M)-XXXXXX")"

# Start the CSV: two '#' metadata lines (git ref and start time) followed by
# the column header. One row per benchmark run is appended later.
{
  echo "# git_ref: $BENCHMARK_REF"
  echo "# tidspunkt: $(date -Iseconds)"
  echo "modell,effort,fil,tid_sek,ord,bugs_nøkkelord"
} > "$OUTDIR/results.csv"
# Run every file × model × effort combination sequentially. For each run the
# CLI output is saved to its own file under $OUTDIR and one row is appended
# to $OUTDIR/results.csv: model, effort, file label, elapsed seconds, word
# count, and the number of output lines that mention a bug-related keyword.

# Run from the repository root so the relative source paths named in the
# prompt resolve. Loop-invariant, so it is done once up front.
cd "$ROOT"

for file in "$SIMPLE_FILE" "$COMPLEX_FILE"; do
  # Build the label from the whole path, not the basename: both benchmark
  # files are called main.rs, so a basename label would be "main" for each,
  # making the second file overwrite the first file's output files and
  # leaving the CSV rows indistinguishable.
  file_label=${file%.rs}
  file_label=${file_label//\//_}

  # PROMPT_BASE is deliberately the format string (it carries the %s).
  # shellcheck disable=SC2059
  printf -v base_prompt "$PROMPT_BASE" "$file"

  # Explicit lists give a fixed run order; iterating the keys of an
  # associative array would not.
  for model_name in haiku sonnet opus; do
    for effort in low medium high; do
      prefix="${EFFORT_PREFIX[$effort]}"
      if [[ -n "$prefix" ]]; then
        prompt="$prefix $base_prompt"
      else
        prompt="$base_prompt"
      fi

      outfile="$OUTDIR/${file_label}_${model_name}_${effort}.txt"
      echo "[$(date +%H:%M:%S)] $file_label / $model_name / $effort ..."

      # A failing run must not abort the whole benchmark, but it should not
      # pass unnoticed either: stderr is captured into the same output file,
      # so an error message would otherwise be scored like a real answer.
      start_time=$(date +%s)
      status=0
      claude -p --model "${MODELS[$model_name]}" --output-format text \
        --dangerously-skip-permissions "$prompt" > "$outfile" 2>&1 || status=$?
      end_time=$(date +%s)
      elapsed=$((end_time - start_time))

      if (( status != 0 )); then
        echo "ADVARSEL: claude avsluttet med kode $status ($outfile)" >&2
      fi

      # Some wc implementations pad the number with blanks; strip them so the
      # CSV field is a bare integer.
      words=$(wc -w < "$outfile" || echo 0)
      words=${words//[[:space:]]/}

      # grep -c prints the count even when it is 0, but then exits 1. Adding
      # `|| echo 0` would emit a second "0" on a new line and break the CSV
      # row, so only the exit status is neutralised here. The default covers
      # the case where grep printed nothing at all (e.g. unreadable file).
      bugs=$(grep -ciE 'bug|feil|race|overflow|crash|problem|sårbar' "$outfile" || true)
      bugs=${bugs:-0}

      echo "$model_name,$effort,$file_label,$elapsed,$words,$bugs" >> "$OUTDIR/results.csv"
      echo "${elapsed}s, ${words} ord, ${bugs} bug-nøkkelord"
    done
  done
done
# Print the collected results as an aligned table and keep a copy of the CSV
# inside the repository.
echo ""
echo "=== RESULTATER ==="
# column is not installed everywhere; fall back to the raw CSV so that a
# missing formatter cannot abort the script (set -e) before the copy below.
if command -v column >/dev/null 2>&1; then
  column -t -s',' "$OUTDIR/results.csv"
else
  cat "$OUTDIR/results.csv"
fi
echo ""
echo "Rå resultater: $OUTDIR/"
echo "CSV: $OUTDIR/results.csv"

# Copy the CSV into the repo for reference. The dated name is computed once
# so the file written and the path reported cannot disagree when the run
# crosses midnight, and the target directory is created if it is missing.
csv_copy="docs/erfaringer/benchmark_$(date +%Y%m%d).csv"
mkdir -p "$ROOT/docs/erfaringer"
cp "$OUTDIR/results.csv" "$ROOT/$csv_copy"
echo "Kopiert til $csv_copy"