Alle modeller testes mot nøyaktig samme kildekode. Ref og tidsstempel logges i CSV-header. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
88 lines
3 KiB
Bash
Executable file
88 lines
3 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# Benchmark every model × effort level sequentially.
# Run this while the server is idle (no task runner, no users).
#
# Usage:
#   ./scripts/benchmark-models.sh
#
# Environment:
#   BENCHMARK_REF  git ref recorded in the results; defaults to the commit
#                  that HEAD points at when the script starts.
#
# Raw model output is written to a /tmp/benchmark-* directory; the results
# CSV is copied to docs/erfaringer/benchmark_<YYYYMMDD>.csv at the end.

set -euo pipefail

# Repository root: the parent of the directory this script lives in.
ROOT="$(cd "$(dirname "$0")/.." && pwd)"

# Pin the run to one git ref so every model is evaluated against exactly the
# same source code. Set it via the environment, or HEAD at start-up is used.
BENCHMARK_REF="${BENCHMARK_REF:-$(git -C "$ROOT" rev-parse HEAD)}"
echo "=== Benchmark kjører mot git ref: $BENCHMARK_REF ==="
echo "=== Tidspunkt: $(date -Iseconds) ==="

# Warn (do not abort) when the files on disk differ from the recorded ref.
# The comparison is made against BENCHMARK_REF rather than HEAD, because the
# ref can be overridden and it is the value written to the CSV header.
# git's own stderr is silenced, so an unresolvable ref also ends up here.
# Untracked files are not detected by `git diff`.
if ! git -C "$ROOT" diff --quiet "$BENCHMARK_REF" -- 2>/dev/null; then
  echo "ADVARSEL: working tree har ucommittede endringer." >&2
  echo "Resultatene kan avvike fra ref $BENCHMARK_REF." >&2
fi

# Source files (relative to the repository root) each model is asked to review.
SIMPLE_FILE="tools/synops-transcribe/src/main.rs"
COMPLEX_FILE="tools/synops-audio/src/main.rs"

# printf template for the review prompt; %s is replaced by the file path.
PROMPT_BASE="Les %s nøye. Gi en vurdering: 1) Hva gjør verktøyet? 2) Er det bugs? 3) Hva kan forbedres? Maks 300 ord."

# Short model name -> model identifier passed to `claude --model`.
declare -A MODELS=(
  [haiku]="claude-haiku-4-5-20251001"
  [sonnet]="claude-sonnet-4-6"
  [opus]="claude-opus-4-6"
)

# Effort level -> instruction placed in front of the prompt.
# "Effort" is steered purely through prompt wording; an empty value means the
# base prompt is sent unchanged.
declare -A EFFORT_PREFIX=(
  [low]="Svar kort og overfladisk. Ikke les grundig."
  [medium]=""
  [high]="Vær ekstremt grundig. Les hver linje nøye. Tenk dypt på edge cases."
)

# Per-run output directory. mktemp appends a random suffix, so two runs that
# start within the same minute never share a directory (a shared directory
# would let the second run truncate the first run's results.csv), and the
# path is not predictable by other users of /tmp.
OUTDIR="$(mktemp -d "/tmp/benchmark-$(date +%Y%m%d-%H%M)-XXXXXX")"

# CSV preamble: two "#" metadata lines (git ref, start time) followed by the
# column header. Data rows are appended later, one per benchmark run.
{
  echo "# git_ref: $BENCHMARK_REF"
  echo "# tidspunkt: $(date -Iseconds)"
  echo "modell,effort,fil,tid_sek,ord,bugs_nøkkelord"
} > "$OUTDIR/results.csv"

#######################################
# Derive a label that is unique per benchmarked source file.
# A plain `basename … .rs` gives "main" for every tools/<name>/src/main.rs,
# which made the second file overwrite the first file's output and left the
# CSV rows indistinguishable. The tools/ prefix and /src/main suffix are
# stripped when present; any remaining "/" becomes "_".
# Arguments: $1 - source path relative to the repository root
# Outputs:   the label on stdout (e.g. "synops-audio")
#######################################
label_for() {
  local path=${1%.rs}
  path=${path#tools/}
  path=${path%/src/main}
  printf '%s\n' "${path//\//_}"
}

# Run from the repository root so the relative path named in the prompt
# resolves for the model. Done once; it does not change between iterations.
cd "$ROOT"

# One run per (file, model, effort) combination, strictly sequential.
for file in "$SIMPLE_FILE" "$COMPLEX_FILE"; do
  file_label=$(label_for "$file")
  # shellcheck disable=SC2059 — PROMPT_BASE is a printf template by design.
  base_prompt=$(printf "$PROMPT_BASE" "$file")

  for model_name in haiku sonnet opus; do
    for effort in low medium high; do
      # Prepend the effort instruction only when one is defined.
      prefix="${EFFORT_PREFIX[$effort]}"
      prompt="${prefix:+$prefix }$base_prompt"

      outfile="$OUTDIR/${file_label}_${model_name}_${effort}.txt"
      echo "[$(date +%H:%M:%S)] $file_label / $model_name / $effort ..."

      # NOTE(review): --dangerously-skip-permissions is kept from the original
      # invocation; confirm it is acceptable on the host this runs on.
      # A failing run must not abort the whole benchmark, so the exit status
      # is captured instead of tripping `set -e`. stdout and stderr both land
      # in the output file.
      status=0
      start_time=$(date +%s)
      claude -p --model "${MODELS[$model_name]}" --output-format text \
        --dangerously-skip-permissions "$prompt" > "$outfile" 2>&1 || status=$?
      end_time=$(date +%s)
      if (( status != 0 )); then
        echo "  ADVARSEL: claude avsluttet med status $status (se $outfile)" >&2
      fi

      elapsed=$((end_time - start_time))

      # Word count of the response; some wc implementations pad the number
      # with spaces, which would leak into the CSV.
      words=$(wc -w < "$outfile" || echo 0)
      words=${words//[[:space:]]/}

      # Number of LINES mentioning a bug-related keyword (case-insensitive).
      # grep -c prints "0" itself when nothing matches but exits 1, so the
      # fallback must not print a second "0" (that produced "0\n0" and a
      # broken CSV row). An unreadable file yields empty output -> 0.
      bugs=$(grep -ciE 'bug|feil|race|overflow|crash|problem|sårbar' "$outfile" || true)
      bugs=${bugs:-0}

      echo "$model_name,$effort,$file_label,$elapsed,$words,$bugs" >> "$OUTDIR/results.csv"
      echo "  → ${elapsed}s, ${words} ord, ${bugs} bug-nøkkelord"
    done
  done
done

# Print the collected results as an aligned table, then the file locations.
echo ""
echo "=== RESULTATER ==="
# Pretty-printing is best effort: if `column` is missing or fails, fall back
# to the raw CSV instead of letting `set -e` abort before the copy below.
column -t -s',' "$OUTDIR/results.csv" || cat "$OUTDIR/results.csv"
echo ""
echo "Rå resultater: $OUTDIR/"
echo "CSV: $OUTDIR/results.csv"

# Copy the CSV into the repository for reference. The date is evaluated once
# so the copy and the message agree even if the run crosses midnight.
# The file name carries only the date: a later run on the same day
# overwrites the earlier copy.
dest_rel="docs/erfaringer/benchmark_$(date +%Y%m%d).csv"
mkdir -p "$ROOT/docs/erfaringer"
cp "$OUTDIR/results.csv" "$ROOT/$dest_rel"
echo "Kopiert til $dest_rel"