From b7a6f8e7ef6b91419947a2eb8882458721e47744 Mon Sep 17 00:00:00 2001 From: vegard Date: Tue, 17 Mar 2026 16:31:23 +0100 Subject: [PATCH] =?UTF-8?q?Fullf=C3=B8r=20oppgave=206.1:=20CAS-lagring=20m?= =?UTF-8?q?ed=20SHA-256=20og=20deduplisering?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implementerer content-addressable store som Rust-modul i maskinrommet: - CasStore med SHA-256 hashing (sha2 + hex crates) - Katalogstruktur: {root}/{hash[0..2]}/{hash[2..4]}/{hash} - Atomisk skriving via temp-fil + rename (hindrer korrupt read) - Innebygd deduplisering — identisk hash = ingen skriving - Konfigurerbar rot via CAS_ROOT env (default: /srv/synops/media/cas) - 5 enhetstester: store, dedup, exists, ulike filer, path-struktur Neste steg: 6.2 (upload-endepunkt) bruker CasStore for filmottak. Co-Authored-By: Claude Opus 4.6 --- maskinrommet/Cargo.lock | 2 + maskinrommet/Cargo.toml | 2 + maskinrommet/src/cas.rs | 214 +++++++++++++++++++++++++++++++++++++++ maskinrommet/src/main.rs | 13 ++- tasks.md | 3 +- 5 files changed, 231 insertions(+), 3 deletions(-) create mode 100644 maskinrommet/src/cas.rs diff --git a/maskinrommet/Cargo.lock b/maskinrommet/Cargo.lock index b77a93c..aea54b5 100644 --- a/maskinrommet/Cargo.lock +++ b/maskinrommet/Cargo.lock @@ -953,10 +953,12 @@ version = "0.1.0" dependencies = [ "axum", "chrono", + "hex", "jsonwebtoken", "reqwest", "serde", "serde_json", + "sha2", "sqlx", "tokio", "tower-http", diff --git a/maskinrommet/Cargo.toml b/maskinrommet/Cargo.toml index 6ed4612..30308cd 100644 --- a/maskinrommet/Cargo.toml +++ b/maskinrommet/Cargo.toml @@ -16,3 +16,5 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } tower-http = { version = "0.6", features = ["cors", "trace"] } jsonwebtoken = "9" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } +sha2 = "0.10" +hex = "0.4" diff --git a/maskinrommet/src/cas.rs b/maskinrommet/src/cas.rs new file 
mode 100644
index 0000000..a2461ce
--- /dev/null
+++ b/maskinrommet/src/cas.rs
@@ -0,0 +1,271 @@
+//! Content-Addressable Store (CAS)
+//!
+//! Stores binary files identified by their SHA-256 hash.
+//! Directory layout: `{root}/{hash[0..2]}/{hash[2..4]}/{hash}`
+//! Deduplication is built in — identical content yields an identical hash.
+
+use sha2::{Digest, Sha256};
+use std::path::{Path, PathBuf};
+use tokio::io::AsyncWriteExt;
+
+/// CAS storage with a filesystem backend.
+#[derive(Debug, Clone)]
+pub struct CasStore {
+    root: PathBuf,
+}
+
+/// Result of a CAS write operation.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct StoreResult {
+    /// SHA-256 hex digest (64 characters).
+    pub hash: String,
+    /// Size of the content in bytes.
+    pub size: u64,
+    /// `true` if the file already existed (deduplicated).
+    pub already_existed: bool,
+}
+
+impl CasStore {
+    /// Create a new CAS store rooted at `root`.
+    ///
+    /// Creates the root directory (and missing parents) if it does not exist.
+    ///
+    /// # Errors
+    /// Returns the I/O error if the directory cannot be created.
+    pub async fn new(root: impl Into<PathBuf>) -> std::io::Result<Self> {
+        let root = root.into();
+        tokio::fs::create_dir_all(&root).await?;
+        Ok(Self { root })
+    }
+
+    /// Store `data` in the CAS and return its hash and metadata.
+    ///
+    /// If a file with the same hash already exists, nothing is written
+    /// (deduplication) and `already_existed` is `true`.
+    ///
+    /// # Errors
+    /// Returns the I/O error from creating directories, writing, syncing or
+    /// renaming. On a failed write the temp file is removed (best effort).
+    pub async fn store(&self, data: &[u8]) -> std::io::Result<StoreResult> {
+        let hash = Self::sha256(data);
+        let size = data.len() as u64;
+        let path = self.path_for(&hash);
+
+        // Same answer as `Path::exists`, but via tokio so the lookup does
+        // not block the async worker thread.
+        if tokio::fs::metadata(&path).await.is_ok() {
+            return Ok(StoreResult {
+                hash,
+                size,
+                already_existed: true,
+            });
+        }
+
+        // Create the prefix directories.
+        if let Some(parent) = path.parent() {
+            tokio::fs::create_dir_all(parent).await?;
+        }
+
+        // Write to a temp file, then rename it into place, so other
+        // processes never read a half-written file at the final path.
+        let tmp_path = self.tmp_path(&hash);
+        if let Some(parent) = tmp_path.parent() {
+            tokio::fs::create_dir_all(parent).await?;
+        }
+
+        if let Err(err) = Self::write_and_publish(&tmp_path, &path, data).await {
+            // Best effort: do not leave a partial temp file behind.
+            let _ = tokio::fs::remove_file(&tmp_path).await;
+            return Err(err);
+        }
+
+        tracing::info!(hash = %hash, size = size, "Lagret ny fil i CAS");
+
+        Ok(StoreResult {
+            hash,
+            size,
+            already_existed: false,
+        })
+    }
+
+    /// Check whether `hash` is present in the CAS.
+    ///
+    /// Returns `false` for anything that is not a well-formed digest (see
+    /// [`CasStore::is_valid_hash`]), so malformed input can neither panic
+    /// nor probe paths outside the store.
+    pub fn exists(&self, hash: &str) -> bool {
+        Self::is_valid_hash(hash) && self.path_for(hash).exists()
+    }
+
+    /// Return `true` if `hash` has the shape produced by [`CasStore::sha256`]:
+    /// exactly 64 lowercase hexadecimal characters.
+    pub fn is_valid_hash(hash: &str) -> bool {
+        hash.len() == 64 && hash.bytes().all(|b| matches!(b, b'0'..=b'9' | b'a'..=b'f'))
+    }
+
+    /// Full file path for a given hash.
+    /// Layout: `{root}/{hash[0..2]}/{hash[2..4]}/{hash}`
+    ///
+    /// The input is not validated. Callers passing untrusted input should
+    /// check [`CasStore::is_valid_hash`] first, because path separators or
+    /// `..` in `hash` end up in the returned path.
+    pub fn path_for(&self, hash: &str) -> PathBuf {
+        // `get` instead of `split_at`: splitting at byte 2 panics when that
+        // offset falls inside a multi-byte character.
+        let prefix1 = hash.get(..2).unwrap_or(hash);
+        let rest = &hash[prefix1.len()..];
+        let prefix2 = rest.get(..2).unwrap_or(rest);
+        self.root.join(prefix1).join(prefix2).join(hash)
+    }
+
+    /// Compute the SHA-256 hex digest of `data`.
+    pub fn sha256(data: &[u8]) -> String {
+        hex::encode(Sha256::digest(data))
+    }
+
+    /// Root directory of the CAS.
+    pub fn root(&self) -> &Path {
+        &self.root
+    }
+
+    /// Temp file path used while writing.
+    ///
+    /// The process id and a per-process counter make the name unique per
+    /// call, so concurrent stores of identical content never share (and
+    /// truncate) one temp file.
+    fn tmp_path(&self, hash: &str) -> PathBuf {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+
+        static NEXT_TMP_ID: AtomicUsize = AtomicUsize::new(0);
+        let seq = NEXT_TMP_ID.fetch_add(1, Ordering::Relaxed);
+        let pid = std::process::id();
+        self.root.join("tmp").join(format!("{hash}.{pid}.{seq}.tmp"))
+    }
+
+    /// Write `data` to `tmp_path`, sync it to disk and rename it to `path`.
+    async fn write_and_publish(tmp_path: &Path, path: &Path, data: &[u8]) -> std::io::Result<()> {
+        let mut file = tokio::fs::File::create(tmp_path).await?;
+        file.write_all(data).await?;
+        file.flush().await?;
+        // Sync the contents to disk before the rename makes them visible.
+        file.sync_all().await?;
+        drop(file);
+
+        // NOTE(review): the parent directory is not fsynced, so the rename
+        // itself may not be durable across a power loss — confirm whether
+        // that matters here.
+        tokio::fs::rename(tmp_path, path).await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::path::PathBuf;
+
+    fn test_root() -> PathBuf {
+        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("target").join("test-cas")
+    }
+
+    #[tokio::test]
+    async fn store_and_retrieve() {
+        let root = test_root().join("store_and_retrieve");
+        let _ = tokio::fs::remove_dir_all(&root).await;
+
+        let cas = CasStore::new(&root).await.unwrap();
+        let data = b"hello world";
+        let result = cas.store(data).await.unwrap();
+
+        // SHA-256 of "hello world"
+        assert_eq!(
+            result.hash,
+            "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9"
+        );
+        assert_eq!(result.size, 11);
+        assert!(!result.already_existed);
+
+        // The file must exist at the expected path
+        let path = cas.path_for(&result.hash);
+        assert!(path.exists());
+        let content = tokio::fs::read(&path).await.unwrap();
+        assert_eq!(content, data);
+
+        // Directory layout: b9/4d/{full hash}
+        assert!(path.ends_with(format!("b9/4d/{}", result.hash)));
+
+        // No temp file is left behind after a successful store
+        let mut leftovers = tokio::fs::read_dir(root.join("tmp")).await.unwrap();
+        assert!(leftovers.next_entry().await.unwrap().is_none());
+
+        // Cleanup
+        let _ = tokio::fs::remove_dir_all(&root).await;
+    }
+
+    #[tokio::test]
+    async fn deduplication() {
+        let root = test_root().join("deduplication");
+        let _ = tokio::fs::remove_dir_all(&root).await;
+
+        let cas = CasStore::new(&root).await.unwrap();
+        let data = b"duplicate content";
+
+        let first = cas.store(data).await.unwrap();
+        assert!(!first.already_existed);
+
+        let second = cas.store(data).await.unwrap();
+        assert!(second.already_existed);
+        assert_eq!(first.hash, second.hash);
+
+        let _ = tokio::fs::remove_dir_all(&root).await;
+    }
+
+    #[tokio::test]
+    async fn exists_check() {
+        let root = test_root().join("exists_check");
+        let _ = tokio::fs::remove_dir_all(&root).await;
+
+        let cas = CasStore::new(&root).await.unwrap();
+
+        assert!(!cas.exists("nonexistent_hash"));
+
+        // Malformed or hostile input is rejected without panicking
+        assert!(!cas.exists(""));
+        assert!(!cas.exists(".."));
+        assert!(!cas.exists("a\u{e9}"));
+
+        let result = cas.store(b"test").await.unwrap();
+        assert!(cas.exists(&result.hash));
+
+        let _ = tokio::fs::remove_dir_all(&root).await;
+    }
+
+    #[tokio::test]
+    async fn different_content_different_hash() {
+        let root = test_root().join("different_content");
+        let _ = tokio::fs::remove_dir_all(&root).await;
+
+        let cas = CasStore::new(&root).await.unwrap();
+
+        let a = cas.store(b"content A").await.unwrap();
+        let b = cas.store(b"content B").await.unwrap();
+
+        assert_ne!(a.hash, b.hash);
+        assert!(cas.exists(&a.hash));
+        assert!(cas.exists(&b.hash));
+
+        let _ = tokio::fs::remove_dir_all(&root).await;
+    }
+
+    #[test]
+    fn path_structure() {
+        let cas = CasStore {
+            root: PathBuf::from("/srv/synops/media/cas"),
+        };
+        let hash = "b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9";
+        let path = cas.path_for(hash);
+        assert_eq!(
+            path,
+            PathBuf::from("/srv/synops/media/cas/b9/4d/b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9")
+        );
+    }
+}
diff --git a/maskinrommet/src/main.rs b/maskinrommet/src/main.rs
index eda0685..61fc5ee 100644
--- a/maskinrommet/src/main.rs
+++ b/maskinrommet/src/main.rs
@@ -1,4 +1,5 @@
 mod auth;
+pub mod cas;
 mod intentions;
 mod queries;
 mod stdb;
@@ -12,6 +13,7 @@ use tower_http::trace::TraceLayer;
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter};
 
 use auth::{AuthUser, JwksKeys};
+use cas::CasStore;
 use stdb::StdbClient;
 
 #[derive(Clone)]
@@ -19,6 +21,7 @@ pub struct AppState {
     pub db: PgPool,
     pub jwks: JwksKeys,
     pub stdb: StdbClient,
+    pub cas: CasStore,
 }
 
 #[derive(Serialize)]
@@ -108,7 +111,15 @@ async fn main() {
         }
     }
 
-    let state = AppState { db, jwks, stdb };
+    // CAS — content-addressable store for binary files
+    let cas_root = std::env::var("CAS_ROOT")
+        .unwrap_or_else(|_| "/srv/synops/media/cas".to_string());
+    let cas = CasStore::new(&cas_root)
+        .await
+        .expect("Kunne ikke opprette CAS-katalog");
+    tracing::info!(root = %cas_root, "CAS initialisert");
+
+    let state = AppState { db, jwks, stdb, cas };
 
     // Ruter: /health er offentlig, /me krever gyldig JWT
     let app =
Router::new() diff --git a/tasks.md b/tasks.md index 6e6fdeb..ba10060 100644 --- a/tasks.md +++ b/tasks.md @@ -86,8 +86,7 @@ Uavhengige faser kan fortsatt plukkes. ## Fase 6: CAS og mediefiler -- [~] 6.1 CAS-lagring: filsystem med content-addressable hashing (SHA-256). Katalogstruktur med hash-prefix. Deduplisering. - > Påbegynt: 2026-03-17T16:27 +- [x] 6.1 CAS-lagring: filsystem med content-addressable hashing (SHA-256). Katalogstruktur med hash-prefix. Deduplisering. - [ ] 6.2 Upload-endepunkt: `POST /intentions/upload_media` → hash fil, lagre i CAS, opprett media-node med `has_media`-edge. - [ ] 6.3 Serving: `GET /cas/{hash}` → stream fil fra disk. Caddy kan serve direkte for ytelse. - [ ] 6.4 Bilder i TipTap: drag-and-drop/paste → upload → CAS-node → inline i `metadata.document` via `node_id`.