Fullfør oppgave 6.1: CAS-lagring med SHA-256 og deduplisering

Implementerer content-addressable store som Rust-modul i maskinrommet:

- CasStore med SHA-256 hashing (sha2 + hex crates)
- Katalogstruktur: {root}/{hash[0..2]}/{hash[2..4]}/{hash}
- Atomisk skriving via temp-fil + rename (hindrer korrupt read)
- Innebygd deduplisering — identisk hash = ingen skriving
- Konfigurerbar rot via CAS_ROOT env (default: /srv/synops/media/cas)
- 5 enhetstester: store, dedup, exists, ulike filer, path-struktur

Neste steg: 6.2 (upload-endepunkt) bruker CasStore for filmottak.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
vegard 2026-03-17 16:31:23 +01:00
parent c85f0d4c92
commit b7a6f8e7ef
5 changed files with 231 additions and 3 deletions

View file

@ -953,10 +953,12 @@ version = "0.1.0"
dependencies = [ dependencies = [
"axum", "axum",
"chrono", "chrono",
"hex",
"jsonwebtoken", "jsonwebtoken",
"reqwest", "reqwest",
"serde", "serde",
"serde_json", "serde_json",
"sha2",
"sqlx", "sqlx",
"tokio", "tokio",
"tower-http", "tower-http",

View file

@ -16,3 +16,5 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
tower-http = { version = "0.6", features = ["cors", "trace"] } tower-http = { version = "0.6", features = ["cors", "trace"] }
jsonwebtoken = "9" jsonwebtoken = "9"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] }
sha2 = "0.10"
hex = "0.4"

214
maskinrommet/src/cas.rs Normal file
View file

@ -0,0 +1,214 @@
//! Content-Addressable Store (CAS)
//!
//! Lagrer binærfiler identifisert av SHA-256 hash.
//! Katalogstruktur: `{root}/{hash[0..2]}/{hash[2..4]}/{hash}`
//! Deduplisering er innebygd — identisk innhold gir identisk hash.
use sha2::{Digest, Sha256};
use std::path::{Path, PathBuf};
use tokio::io::AsyncWriteExt;
/// Content-addressable store backed by the local filesystem.
///
/// Holds only the root directory, so cloning is cheap; every clone refers to
/// the same on-disk store.
#[derive(Debug, Clone)]
pub struct CasStore {
    /// Directory under which all blobs (and the temp directory) live.
    root: PathBuf,
}
/// Outcome of a CAS write operation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StoreResult {
    /// SHA-256 hex digest of the content (64 characters).
    pub hash: String,
    /// Length of the content in bytes.
    pub size: u64,
    /// `true` if a file for this hash was already present, i.e. nothing was
    /// written (deduplicated).
    pub already_existed: bool,
}
impl CasStore {
    /// Creates a store rooted at `root`, creating the directory (and any
    /// missing parents) if it does not exist yet.
    ///
    /// # Errors
    ///
    /// Returns the I/O error from creating the root directory.
    pub async fn new(root: impl Into<PathBuf>) -> std::io::Result<Self> {
        let root = root.into();
        tokio::fs::create_dir_all(&root).await?;
        Ok(Self { root })
    }

    /// Stores `data` under its SHA-256 digest and returns the digest plus metadata.
    ///
    /// If a file for the digest is already present nothing is written and
    /// `already_existed` is `true`. Otherwise the bytes are written to a
    /// uniquely named temp file, synced, and renamed into place, so a
    /// partially written file is never visible at the final path.
    ///
    /// Concurrent calls with identical content are safe: each writer uses its
    /// own temp file and the renames target the same final path with the same
    /// bytes. In that case more than one caller may see `already_existed: false`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error from creating directories, writing, syncing or
    /// renaming. The temp file is removed (best effort) on failure.
    pub async fn store(&self, data: &[u8]) -> std::io::Result<StoreResult> {
        let hash = Self::sha256(data);
        let size = data.len() as u64;
        let path = self.path_for(&hash);

        // Same semantics as `Path::exists` (metadata lookup succeeded), but
        // without blocking the async executor thread.
        if tokio::fs::metadata(&path).await.is_ok() {
            return Ok(StoreResult {
                hash,
                size,
                already_existed: true,
            });
        }

        // Create the two prefix directories for the final location.
        if let Some(parent) = path.parent() {
            tokio::fs::create_dir_all(parent).await?;
        }

        let tmp_path = self.tmp_path(&hash);
        if let Some(parent) = tmp_path.parent() {
            tokio::fs::create_dir_all(parent).await?;
        }

        if let Err(err) = Self::write_synced(&tmp_path, data).await {
            // Do not leave a half-written temp file behind.
            let _ = tokio::fs::remove_file(&tmp_path).await;
            return Err(err);
        }

        // Publish the fully written file under its final name in one step.
        if let Err(err) = tokio::fs::rename(&tmp_path, &path).await {
            let _ = tokio::fs::remove_file(&tmp_path).await;
            return Err(err);
        }

        tracing::info!(hash = %hash, size = size, "Lagret ny fil i CAS");
        Ok(StoreResult {
            hash,
            size,
            already_existed: false,
        })
    }

    /// Returns `true` if a file for `hash` is present in the store.
    ///
    /// Anything that is not a 64-character hex digest is rejected without
    /// touching the filesystem: `store` only ever creates entries named by
    /// such digests, and this keeps strings like `../..` or absolute paths
    /// from being probed on disk.
    pub fn exists(&self, hash: &str) -> bool {
        Self::is_valid_hash(hash) && self.path_for(hash).exists()
    }

    /// Full file path for a given hash.
    ///
    /// Layout: `{root}/{hash[0..2]}/{hash[2..4]}/{hash}`. Inputs shorter than
    /// four bytes yield shorter (possibly empty) prefix components.
    ///
    /// This performs no validation: `hash` is joined onto the root verbatim,
    /// so callers handling untrusted input must validate it first.
    pub fn path_for(&self, hash: &str) -> PathBuf {
        let (prefix1, rest) = Self::split_prefix(hash);
        let (prefix2, _) = Self::split_prefix(rest);
        self.root.join(prefix1).join(prefix2).join(hash)
    }

    /// Computes the lowercase SHA-256 hex digest of `data`.
    pub fn sha256(data: &[u8]) -> String {
        let mut hasher = Sha256::new();
        hasher.update(data);
        hex::encode(hasher.finalize())
    }

    /// Root directory of the store.
    pub fn root(&self) -> &Path {
        &self.root
    }

    /// Temp file path used while writing, located in `{root}/tmp`.
    ///
    /// The name includes the process id and a per-process counter so that
    /// concurrent writers of the same content never share a temp file.
    fn tmp_path(&self, hash: &str) -> PathBuf {
        static NEXT_TMP_ID: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
        let seq = NEXT_TMP_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        let pid = std::process::id();
        self.root
            .join("tmp")
            .join(format!("{hash}.{pid}.{seq}.tmp"))
    }

    /// Writes `data` to a new file at `path` and syncs it before returning.
    async fn write_synced(path: &Path, data: &[u8]) -> std::io::Result<()> {
        let mut file = tokio::fs::File::create(path).await?;
        file.write_all(data).await?;
        file.flush().await?;
        // Make sure the bytes have reached the disk before the file is published.
        file.sync_all().await?;
        Ok(())
    }

    /// Returns `true` if `hash` has the shape of a SHA-256 hex digest.
    fn is_valid_hash(hash: &str) -> bool {
        hash.len() == 64 && hash.bytes().all(|b| b.is_ascii_hexdigit())
    }

    /// Splits off at most the first two bytes of `s`.
    ///
    /// Backs off to the nearest earlier char boundary so that non-ASCII input
    /// cannot cause a panic; for ASCII input this is exactly `split_at(2)`
    /// clamped to the string length.
    fn split_prefix(s: &str) -> (&str, &str) {
        let mut end = s.len().min(2);
        // Index 0 is always a char boundary, so this terminates.
        while !s.is_char_boundary(end) {
            end -= 1;
        }
        s.split_at(end)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Base directory for on-disk test fixtures, inside the crate's `target` dir.
    fn test_root() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("target").join("test-cas")
    }

    /// Builds a store in a per-test directory, wiping leftovers from earlier runs.
    /// Returns the store together with its root so the test can clean up.
    async fn fresh_store(name: &str) -> (CasStore, PathBuf) {
        let root = test_root().join(name);
        let _ = tokio::fs::remove_dir_all(&root).await;
        let cas = CasStore::new(&root).await.unwrap();
        (cas, root)
    }

    /// Best-effort removal of a test's directory.
    async fn cleanup(root: &Path) {
        let _ = tokio::fs::remove_dir_all(root).await;
    }

    #[tokio::test]
    async fn store_and_retrieve() {
        let (cas, root) = fresh_store("store_and_retrieve").await;

        let data = b"hello world";
        let result = cas.store(data).await.unwrap();

        // SHA-256 of "hello world"
        assert_eq!(
            result.hash,
            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
        );
        assert_eq!(result.size, 11);
        assert!(!result.already_existed);

        // The file must be present at the expected path with the same content.
        let path = cas.path_for(&result.hash);
        assert!(path.exists());
        let content = tokio::fs::read(&path).await.unwrap();
        assert_eq!(content, data);

        // Directory layout: b9/4d/{full hash}. Compared component-wise so the
        // check does not depend on the platform's path separator.
        assert!(path.ends_with(format!("b9/4d/{}", result.hash)));

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn deduplication() {
        let (cas, root) = fresh_store("deduplication").await;

        let data = b"duplicate content";
        let first = cas.store(data).await.unwrap();
        assert!(!first.already_existed);

        let second = cas.store(data).await.unwrap();
        assert!(second.already_existed);
        assert_eq!(first.hash, second.hash);

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn exists_check() {
        let (cas, root) = fresh_store("exists_check").await;

        assert!(!cas.exists("nonexistent_hash"));
        let result = cas.store(b"test").await.unwrap();
        assert!(cas.exists(&result.hash));

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn different_content_different_hash() {
        let (cas, root) = fresh_store("different_content").await;

        let a = cas.store(b"content A").await.unwrap();
        let b = cas.store(b"content B").await.unwrap();
        assert_ne!(a.hash, b.hash);
        assert!(cas.exists(&a.hash));
        assert!(cas.exists(&b.hash));

        cleanup(&root).await;
    }

    #[test]
    fn path_structure() {
        let cas = CasStore {
            root: PathBuf::from("/srv/synops/media/cas"),
        };
        let hash = "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9";
        let path = cas.path_for(hash);
        assert_eq!(
            path,
            PathBuf::from("/srv/synops/media/cas/b9/4d/b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9")
        );
    }
}

View file

@ -1,4 +1,5 @@
mod auth; mod auth;
pub mod cas;
mod intentions; mod intentions;
mod queries; mod queries;
mod stdb; mod stdb;
@ -12,6 +13,7 @@ use tower_http::trace::TraceLayer;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter};
use auth::{AuthUser, JwksKeys}; use auth::{AuthUser, JwksKeys};
use cas::CasStore;
use stdb::StdbClient; use stdb::StdbClient;
#[derive(Clone)] #[derive(Clone)]
@ -19,6 +21,7 @@ pub struct AppState {
pub db: PgPool, pub db: PgPool,
pub jwks: JwksKeys, pub jwks: JwksKeys,
pub stdb: StdbClient, pub stdb: StdbClient,
pub cas: CasStore,
} }
#[derive(Serialize)] #[derive(Serialize)]
@ -108,7 +111,15 @@ async fn main() {
} }
} }
let state = AppState { db, jwks, stdb }; // CAS — content-addressable store for binærfiler
let cas_root = std::env::var("CAS_ROOT")
.unwrap_or_else(|_| "/srv/synops/media/cas".to_string());
let cas = CasStore::new(&cas_root)
.await
.expect("Kunne ikke opprette CAS-katalog");
tracing::info!(root = %cas_root, "CAS initialisert");
let state = AppState { db, jwks, stdb, cas };
// Ruter: /health er offentlig, /me krever gyldig JWT // Ruter: /health er offentlig, /me krever gyldig JWT
let app = Router::new() let app = Router::new()

View file

@ -86,8 +86,7 @@ Uavhengige faser kan fortsatt plukkes.
## Fase 6: CAS og mediefiler ## Fase 6: CAS og mediefiler
- [~] 6.1 CAS-lagring: filsystem med content-addressable hashing (SHA-256). Katalogstruktur med hash-prefix. Deduplisering. - [x] 6.1 CAS-lagring: filsystem med content-addressable hashing (SHA-256). Katalogstruktur med hash-prefix. Deduplisering.
> Påbegynt: 2026-03-17T16:27
- [ ] 6.2 Upload-endepunkt: `POST /intentions/upload_media` → hash fil, lagre i CAS, opprett media-node med `has_media`-edge. - [ ] 6.2 Upload-endepunkt: `POST /intentions/upload_media` → hash fil, lagre i CAS, opprett media-node med `has_media`-edge.
- [ ] 6.3 Serving: `GET /cas/{hash}` → stream fil fra disk. Caddy kan serve direkte for ytelse. - [ ] 6.3 Serving: `GET /cas/{hash}` → stream fil fra disk. Caddy kan serve direkte for ytelse.
- [ ] 6.4 Bilder i TipTap: drag-and-drop/paste → upload → CAS-node → inline i `metadata.document` via `node_id`. - [ ] 6.4 Bilder i TipTap: drag-and-drop/paste → upload → CAS-node → inline i `metadata.document` via `node_id`.