From 5dead5dac95e56053578523c02c9cdf59efda13b Mon Sep 17 00:00:00 2001 From: vegard Date: Wed, 18 Mar 2026 18:24:22 +0000 Subject: [PATCH] synops-clip: CLI for web article extraction (oppgave 25.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nytt verktøy som henter og parser webartikler til ren tekst + metadata. Bruker Mozilla Readability (via Node.js) for artikkelekstraksjon, med Playwright som fallback for JS-rendrede sider. Arkitektur: - Rust CLI (clap, reqwest) håndterer HTTP-henting, paywall-deteksjon, JSON-output - Node.js-hjelpeskript (readability.mjs) bruker @mozilla/readability + jsdom - Playwright-script (playwright.mjs) for headless browser-fallback - Støtter --payload-json for maskinrommet/jobbkø-integrasjon Paywall-deteksjon basert på: - Kort innhold (<200 tegn) - Norske/engelske paywall-fraser i innholdet - CSS-klasser/HTML-attributter (piano, schibsted, amedia, etc.) - Schema.org isAccessibleForFree meta-tagg Output: JSON med title, author, date, content, url, paywall, excerpt, source --- tasks.md | 3 +- tools/README.md | 1 + tools/synops-clip/.gitignore | 2 + tools/synops-clip/Cargo.toml | 19 + tools/synops-clip/scripts/package-lock.json | 810 ++++++++++++++++++++ tools/synops-clip/scripts/package.json | 11 + tools/synops-clip/scripts/playwright.mjs | 63 ++ tools/synops-clip/scripts/readability.mjs | 43 ++ tools/synops-clip/src/main.rs | 505 ++++++++++++ 9 files changed, 1455 insertions(+), 2 deletions(-) create mode 100644 tools/synops-clip/.gitignore create mode 100644 tools/synops-clip/Cargo.toml create mode 100644 tools/synops-clip/scripts/package-lock.json create mode 100644 tools/synops-clip/scripts/package.json create mode 100644 tools/synops-clip/scripts/playwright.mjs create mode 100644 tools/synops-clip/scripts/readability.mjs create mode 100644 tools/synops-clip/src/main.rs diff --git a/tasks.md b/tasks.md index bdfda2f..315b3cd 100644 --- a/tasks.md +++ b/tasks.md @@ -334,8 +334,7 @@ Ref: 
`docs/proposals/web_clipper.md`. CLI-verktøy som henter URL, parser med Readability, og oppretter innholdsnode med AI-beriking. Brukes av @bot i chat ("les denne artikkelen"), orkestreringer, og fremtidig browser-extension. -- [~] 25.1 `synops-clip` CLI: hent URL, parse med Readability (mozilla/readability via JS eller Rust-port), returner ren tekst + metadata (tittel, forfatter, dato, ingress). Fallback til headless browser (Playwright) for JS-rendrede sider. Detekter betalingsmur (kort/avkuttet innhold, "logg inn for å lese", kjente paywall-mønstre) — returner `"paywall": true` og tilgjengelig innhold (ingress/utdrag). Output: JSON med `title`, `author`, `date`, `content`, `url`, `paywall`. - > Påbegynt: 2026-03-18T18:15 +- [x] 25.1 `synops-clip` CLI: hent URL, parse med Readability (mozilla/readability via JS eller Rust-port), returner ren tekst + metadata (tittel, forfatter, dato, ingress). Fallback til headless browser (Playwright) for JS-rendrede sider. Detekter betalingsmur (kort/avkuttet innhold, "logg inn for å lese", kjente paywall-mønstre) — returner `"paywall": true` og tilgjengelig innhold (ingress/utdrag). Output: JSON med `title`, `author`, `date`, `content`, `url`, `paywall`. - [ ] 25.2 Node-opprettelse: `synops-clip --write` oppretter `content`-node med artikkelinnhold, `metadata.source_url`, og `tagged`-edge "clipped". AI-oppsummering via LiteLLM. `mentions`-edges til gjenkjente entiteter i kunnskapsgrafen. - [ ] 25.3 @bot-integrasjon: bruker limer inn URL i chat → boten gjenkjenner URL, kaller `synops-clip`, presenterer oppsummering i chatten, oppretter node i bakgrunnen. Ved paywall: "Denne artikkelen er bak betalingsmur. Jeg fikk med tittel og ingress — lim inn innholdet om du vil dele resten." - [ ] 25.4 Orkestrering-støtte: `synops-clip` tilgjengelig som verktøy i orkestreringer. F.eks. "Clip alle URL-er som deles i #Redaksjonen og oppsummer dem". 
diff --git a/tools/README.md b/tools/README.md index 73677f8..9bf70d1 100644 --- a/tools/README.md +++ b/tools/README.md @@ -22,6 +22,7 @@ eller maskinrommet-API. Ligger i PATH via symlink eller direkte kall. | `synops-feature-status` | Sjekk feature-status: spec, oppgaver, commits, feedback | Ferdig | | `synops-node` | Hent/vis en node med edges (UUID, --depth, --format json/md) | Ferdig | | `synops-ai` | AI-assistert generering av orkestreringsscript fra fritekst | Ferdig | +| `synops-clip` | Hent og parse webartikler (Readability + Playwright-fallback, paywall-deteksjon) | Ferdig | ## Delt bibliotek diff --git a/tools/synops-clip/.gitignore b/tools/synops-clip/.gitignore new file mode 100644 index 0000000..b8877dc --- /dev/null +++ b/tools/synops-clip/.gitignore @@ -0,0 +1,2 @@ +target/ +scripts/node_modules/ diff --git a/tools/synops-clip/Cargo.toml b/tools/synops-clip/Cargo.toml new file mode 100644 index 0000000..75d34d6 --- /dev/null +++ b/tools/synops-clip/Cargo.toml @@ -0,0 +1,19 @@ +[package] +name = "synops-clip" +version = "0.1.0" +edition = "2024" + +[[bin]] +name = "synops-clip" +path = "src/main.rs" + +[dependencies] +clap = { version = "4", features = ["derive"] } +tokio = { version = "1", features = ["full"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } +regex = "1" +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +synops-common = { path = "../synops-common" } diff --git a/tools/synops-clip/scripts/package-lock.json b/tools/synops-clip/scripts/package-lock.json new file mode 100644 index 0000000..94a4fa9 --- /dev/null +++ b/tools/synops-clip/scripts/package-lock.json @@ -0,0 +1,810 @@ +{ + "name": "synops-clip-scripts", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "synops-clip-scripts", + "version": "0.1.0", + "dependencies": { + 
"@mozilla/readability": "^0.5.0", + "jsdom": "^25.0.0", + "playwright": "^1.58.2" + } + }, + "node_modules/@asamuzakjp/css-color": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-3.2.0.tgz", + "integrity": "sha512-K1A6z8tS3XsmCMM86xoWdn7Fkdn9m6RSVtocUrJYIwZnFVkng/PvkEoWtOWmP+Scc6saYWHWZYbndEEXxl24jw==", + "license": "MIT", + "dependencies": { + "@csstools/css-calc": "^2.1.3", + "@csstools/css-color-parser": "^3.0.9", + "@csstools/css-parser-algorithms": "^3.0.4", + "@csstools/css-tokenizer": "^3.0.3", + "lru-cache": "^10.4.3" + } + }, + "node_modules/@csstools/color-helpers": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/@csstools/color-helpers/-/color-helpers-5.1.0.tgz", + "integrity": "sha512-S11EXWJyy0Mz5SYvRmY8nJYTFFd1LCNV+7cXyAgQtOOuzb4EsgfqDufL+9esx72/eLhsRdGZwaldu/h+E4t4BA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "engines": { + "node": ">=18" + } + }, + "node_modules/@csstools/css-calc": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/@csstools/css-calc/-/css-calc-2.1.4.tgz", + "integrity": "sha512-3N8oaj+0juUw/1H3YwmDDJXCgTB1gKU6Hc/bB502u9zR0q2vd786XJH9QfrKIEgFlZmhZiq6epXl4rHqhzsIgQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^3.0.5", + "@csstools/css-tokenizer": "^3.0.4" + } + }, + "node_modules/@csstools/css-color-parser": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/@csstools/css-color-parser/-/css-color-parser-3.1.0.tgz", + "integrity": 
"sha512-nbtKwh3a6xNVIp/VRuXV64yTKnb1IjTAEEh3irzS+HkKjAOYLTGNb9pmVNntZ8iVBHcWDA2Dof0QtPgFI1BaTA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "dependencies": { + "@csstools/color-helpers": "^5.1.0", + "@csstools/css-calc": "^2.1.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^3.0.5", + "@csstools/css-tokenizer": "^3.0.4" + } + }, + "node_modules/@csstools/css-parser-algorithms": { + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/@csstools/css-parser-algorithms/-/css-parser-algorithms-3.0.5.tgz", + "integrity": "sha512-DaDeUkXZKjdGhgYaHNJTV9pV7Y9B3b644jCLs9Upc3VeNGg6LWARAT6O+Q+/COo+2gg/bM5rhpMAtf70WqfBdQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@csstools/css-tokenizer": "^3.0.4" + } + }, + "node_modules/@csstools/css-tokenizer": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@csstools/css-tokenizer/-/css-tokenizer-3.0.4.tgz", + "integrity": "sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=18" + } + }, + "node_modules/@mozilla/readability": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.5.0.tgz", + "integrity": "sha512-Z+CZ3QaosfFaTqvhQsIktyGrjFjSC0Fa4EMph4mqKnWhmyoGICsV/8QK+8HpXut6zV7zwfWwqDmEjtk1Qf6EgQ==", + "license": "Apache-2.0", + "engines": { + "node": 
">=14.0.0" + } + }, + "node_modules/agent-base": { + "version": "7.1.4", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz", + "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==", + "license": "MIT", + "engines": { + "node": ">= 14" + } + }, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==", + "license": "MIT" + }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "license": "MIT", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/cssstyle": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.6.0.tgz", + "integrity": "sha512-2z+rWdzbbSZv6/rhtvzvqeZQHrBaqgogqt85sqFNbabZOuFbCVFb8kPeEtZjiKkbrm395irpNKiYeFeLiQnFPg==", + "license": "MIT", + "dependencies": { + "@asamuzakjp/css-color": "^3.2.0", + "rrweb-cssom": "^0.8.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/cssstyle/node_modules/rrweb-cssom": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.8.0.tgz", + "integrity": 
"sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw==", + "license": "MIT" + }, + "node_modules/data-urls": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz", + "integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==", + "license": "MIT", + "dependencies": { + "whatwg-mimetype": "^4.0.0", + "whatwg-url": "^14.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/decimal.js": { + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.6.0.tgz", + "integrity": "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==", + "license": "MIT" + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/entities": { + "version": "6.0.1", + "resolved": 
"https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/form-data": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.5.tgz", + "integrity": 
"sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w==", + "license": "MIT", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "license": "MIT", + "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + 
"license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "license": "MIT", + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/html-encoding-sniffer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", + "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==", + "license": "MIT", + "dependencies": { + 
"whatwg-encoding": "^3.1.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/http-proxy-agent": { + "version": "7.0.2", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", + "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.0", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "license": "MIT", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==", + "license": "MIT" + }, + "node_modules/jsdom": { + "version": "25.0.1", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-25.0.1.tgz", + "integrity": "sha512-8i7LzZj7BF8uplX+ZyOlIz86V6TAsSs+np6m1kpW9u0JWi4z/1t+FzcK1aek+ybTnAC4KhBL4uXCNT0wcUIeCw==", + "license": "MIT", + "dependencies": { + "cssstyle": "^4.1.0", + "data-urls": "^5.0.0", + "decimal.js": "^10.4.3", + "form-data": "^4.0.0", + "html-encoding-sniffer": "^4.0.0", + 
"http-proxy-agent": "^7.0.2", + "https-proxy-agent": "^7.0.5", + "is-potential-custom-element-name": "^1.0.1", + "nwsapi": "^2.2.12", + "parse5": "^7.1.2", + "rrweb-cssom": "^0.7.1", + "saxes": "^6.0.0", + "symbol-tree": "^3.2.4", + "tough-cookie": "^5.0.0", + "w3c-xmlserializer": "^5.0.0", + "webidl-conversions": "^7.0.0", + "whatwg-encoding": "^3.1.1", + "whatwg-mimetype": "^4.0.0", + "whatwg-url": "^14.0.0", + "ws": "^8.18.0", + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "canvas": "^2.11.2" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==", + "license": "ISC" + }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "license": "MIT", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + 
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "license": "MIT" + }, + "node_modules/nwsapi": { + "version": "2.2.23", + "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.23.tgz", + "integrity": "sha512-7wfH4sLbt4M0gCDzGE6vzQBo0bfTKjU7Sfpqy/7gs1qBfYz2vEJH6vXcBKpO3+6Yu1telwd0t9HpyOoLEQQbIQ==", + "license": "MIT" + }, + "node_modules/parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "license": "MIT", + "dependencies": { + "entities": "^6.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, + "node_modules/playwright": { + "version": "1.58.2", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.2.tgz", + "integrity": "sha512-vA30H8Nvkq/cPBnNw4Q8TWz1EJyqgpuinBcHET0YVJVFldr8JDNiU9LaWAE1KqSkRYazuaBhTpB5ZzShOezQ6A==", + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.58.2" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.58.2", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.2.tgz", + "integrity": "sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/rrweb-cssom": { + "version": "0.7.1", + "resolved": 
"https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.7.1.tgz", + "integrity": "sha512-TrEMa7JGdVm0UThDJSx7ddw5nVm3UJS9o9CCIZ72B1vSyEZoziDqBYP3XIoi/12lKrJR8rE3jeFHMok2F/Mnsg==", + "license": "MIT" + }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "license": "MIT" + }, + "node_modules/saxes": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", + "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", + "license": "ISC", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=v12.22.7" + } + }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==", + "license": "MIT" + }, + "node_modules/tldts": { + "version": "6.1.86", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-6.1.86.tgz", + "integrity": "sha512-WMi/OQ2axVTf/ykqCQgXiIct+mSQDFdH2fkwhPwgEwvJ1kSzZRiinb0zF2Xb8u4+OqPChmyI6MEu4EezNJz+FQ==", + "license": "MIT", + "dependencies": { + "tldts-core": "^6.1.86" + }, + "bin": { + "tldts": "bin/cli.js" + } + }, + "node_modules/tldts-core": { + "version": "6.1.86", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.86.tgz", + "integrity": "sha512-Je6p7pkk+KMzMv2XXKmAE3McmolOQFdxkKw0R8EYNr7sELW46JqnNeTX8ybPiQgvg1ymCoF8LXs5fzFaZvJPTA==", + "license": "MIT" + }, + "node_modules/tough-cookie": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-5.1.2.tgz", + "integrity": "sha512-FVDYdxtnj0G6Qm/DhNPSb8Ju59ULcup3tuJxkFb5K8Bv2pUXILbf0xZWU8PX8Ov19OXljbUyveOFwRMwkXzO+A==", + "license": "BSD-3-Clause", + 
"dependencies": { + "tldts": "^6.1.32" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/tr46": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.1.1.tgz", + "integrity": "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==", + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/w3c-xmlserializer": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", + "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==", + "license": "MIT", + "dependencies": { + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "deprecated": "Use @exodus/bytes instead for a more spec-conformant and faster implementation", + "license": "MIT", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-url": { + "version": "14.2.0", + 
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", + "integrity": "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==", + "license": "MIT", + "dependencies": { + "tr46": "^5.1.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/ws": { + "version": "8.19.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.19.0.tgz", + "integrity": "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg==", + "license": "MIT", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, + "node_modules/xml-name-validator": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", + "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==", + "license": "Apache-2.0", + "engines": { + "node": ">=18" + } + }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==", + "license": "MIT" + } + } +} diff --git a/tools/synops-clip/scripts/package.json b/tools/synops-clip/scripts/package.json new file mode 100644 index 0000000..4e7184a --- /dev/null +++ b/tools/synops-clip/scripts/package.json @@ -0,0 +1,11 @@ +{ + "name": "synops-clip-scripts", + "version": "0.1.0", + "private": true, + "type": "module", + "dependencies": { + "@mozilla/readability": "^0.5.0", + "jsdom": "^25.0.0", + "playwright": "^1.58.2" + } +} diff --git a/tools/synops-clip/scripts/playwright.mjs b/tools/synops-clip/scripts/playwright.mjs new file mode 100644 index 0000000..4bff7fa --- 
/dev/null
+++ b/tools/synops-clip/scripts/playwright.mjs
@@ -0,0 +1,82 @@
+#!/usr/bin/env node
+// playwright.mjs — Fetch a JS-rendered page with headless Chromium (Playwright),
+// then extract the article with Mozilla Readability.
+//
+// Args:   --url URL      (required) page to load
+//         --timeout MS   (optional, default 30000) navigation timeout; 0 disables it
+// Output: JSON on stdout, same shape as readability.mjs:
+//           { title, author, content, excerpt, siteName, length }
+//         or { error } when navigation or parsing fails.
+// Exit:   0 = success, 1 = missing --url, navigation error or parse failure.
+//
+// Requires: npx playwright install chromium (one-time setup)
+
+import { Readability } from "@mozilla/readability";
+import { JSDOM } from "jsdom";
+import { chromium } from "playwright";
+
+const DEFAULT_TIMEOUT_MS = 30000;
+
+const args = process.argv.slice(2);
+
+// Return the value that follows flag `name`, or `fallback` when the flag is
+// absent or is the last argument and therefore has no value.
+function getArg(name, fallback) {
+  const idx = args.indexOf(name);
+  if (idx === -1 || idx + 1 >= args.length) return fallback;
+  return args[idx + 1];
+}
+
+const url = getArg("--url", null);
+
+// parseInt() yields NaN for a non-numeric value; fall back to the default
+// rather than handing NaN (or a negative number) to page.goto().
+const parsedTimeout = Number.parseInt(getArg("--timeout", ""), 10);
+const timeout =
+  Number.isInteger(parsedTimeout) && parsedTimeout >= 0
+    ? parsedTimeout
+    : DEFAULT_TIMEOUT_MS;
+
+if (!url) {
+  process.stderr.write("--url is required\n");
+  process.exit(1);
+}
+
+let browser;
+try {
+  browser = await chromium.launch({ headless: true });
+  const context = await browser.newContext({
+    userAgent:
+      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+  });
+  const page = await context.newPage();
+  await page.goto(url, { waitUntil: "networkidle", timeout });
+
+  // Parse the rendered DOM, not the raw HTTP response.
+  const html = await page.content();
+  const dom = new JSDOM(html, { url });
+  const article = new Readability(dom.window.document).parse();
+
+  if (article) {
+    process.stdout.write(
+      JSON.stringify({
+        title: article.title || null,
+        author: article.byline || null,
+        content: article.textContent || "",
+        excerpt: article.excerpt || null,
+        siteName: article.siteName || null,
+        length: article.length || 0,
+      })
+    );
+  } else {
+    process.stdout.write(JSON.stringify({ error: "readability_failed" }));
+    process.exitCode = 1;
+  }
+} catch (e) {
+  process.stdout.write(JSON.stringify({ error: e?.message ?? String(e) }));
+  process.exitCode = 1;
+} finally {
+  // Failure paths set process.exitCode instead of calling process.exit(), which
+  // would terminate the process before this cleanup could run.
+  if (browser) await browser.close();
+}
diff --git a/tools/synops-clip/scripts/readability.mjs
#!/usr/bin/env node
// readability.mjs — Parse HTML with Mozilla Readability.
//
// Input:  HTML on stdin
// Args:   --url <url> (used by Readability/jsdom for relative URL resolution;
//         falls back to https://example.com when the flag or its value is missing)
// Output: JSON on stdout with title, author, content, excerpt, siteName, length.
//         On failure `{ "error": "<message>" }` is written instead.
//
// Exit code 0 = success, 1 = parse failure or error.

import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import { readFileSync } from "node:fs";

const FALLBACK_URL = "https://example.com";

const args = process.argv.slice(2);
const urlIdx = args.indexOf("--url");
const url = urlIdx !== -1 && urlIdx + 1 < args.length ? args[urlIdx + 1] : FALLBACK_URL;

// Write a JSON error object to stdout and mark the process as failed.
// Setting `process.exitCode` (rather than calling `process.exit()`) lets Node
// finish writing stdout before the process ends.
function fail(message) {
  process.stdout.write(JSON.stringify({ error: message }));
  process.exitCode = 1;
}

try {
  // Read inside the try block so that a stdin failure is reported as JSON,
  // like every other error, instead of as an uncaught stack trace.
  const html = readFileSync("/dev/stdin", "utf-8");

  const dom = new JSDOM(html, { url });
  const article = new Readability(dom.window.document).parse();

  if (article) {
    process.stdout.write(
      JSON.stringify({
        title: article.title || null,
        author: article.byline || null,
        content: article.textContent || "",
        excerpt: article.excerpt || null,
        siteName: article.siteName || null,
        length: article.length || 0,
      })
    );
  } else {
    fail("readability_failed");
  }
} catch (e) {
  fail(e instanceof Error ? e.message : String(e));
}

// ---------------------------------------------------------------------------
// Patch boundary — the next file in this patch starts here:
//   diff --git a/tools/synops-clip/src/main.rs b/tools/synops-clip/src/main.rs
//   new file mode 100644, index 0000000..3140f82, @@ -0,0 +1,505 @@
// ---------------------------------------------------------------------------
// synops-clip — Fetch and parse web articles into plain text + metadata.
//
// Input:  --url
// Output: JSON on stdout with title, author, date, content, url, paywall
//
// Strategy:
//   1. Fetch the HTML with reqwest (plain HTTP client)
//   2. Parse with Mozilla Readability via the Node.js helper script
//   3.
Hvis Readability feiler eller innholdet er for kort, prøv Playwright (headless browser) +// 4. Detekter betalingsmur basert på innholdslengde og kjente mønstre +// +// Miljøvariabler: +// SYNOPS_CLIP_SCRIPTS — Sti til scripts/-mappen (default: ved siden av binæren) +// RUST_LOG — Loggnivå (default: synops_clip=info) +// +// Ref: docs/retninger/unix_filosofi.md + +use clap::Parser; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; +use std::process; +use tracing::{debug, info, warn}; + +/// Hent og parse webartikler til ren tekst + metadata. +#[derive(Parser)] +#[command(name = "synops-clip", about = "Hent og parse webartikler")] +struct Cli { + /// URL som skal hentes og parses + #[arg(long)] + url: String, + + /// Timeout i sekunder for HTTP-forespørsler + #[arg(long, default_value = "30")] + timeout: u64, + + /// Tving bruk av Playwright (headless browser) i stedet for vanlig HTTP + #[arg(long)] + playwright: bool, + + /// Jobb-payload som JSON (for maskinrommet/jobbkø) + #[arg(long)] + payload_json: Option, +} + +/// Resultat fra Readability-parsing (Node.js-skriptene). +#[derive(Deserialize)] +#[allow(dead_code)] +struct ReadabilityResult { + title: Option, + author: Option, + content: Option, + excerpt: Option, + #[serde(rename = "siteName")] + site_name: Option, + length: Option, + error: Option, +} + +/// Payload fra jobbkø. +#[derive(Deserialize)] +struct JobPayload { + url: String, + #[serde(default)] + playwright: bool, + #[serde(default = "default_timeout")] + timeout: u64, +} + +fn default_timeout() -> u64 { + 30 +} + +/// Endelig output-format. 
+#[derive(Serialize)] +struct ClipOutput { + url: String, + title: Option, + author: Option, + date: Option, + content: String, + excerpt: Option, + paywall: bool, + source: String, +} + +#[tokio::main] +async fn main() { + synops_common::logging::init("synops_clip"); + + let cli = Cli::parse(); + + // Støtt payload-json fra jobbkø + let (url, use_playwright, timeout) = if let Some(ref payload) = cli.payload_json { + match serde_json::from_str::(payload) { + Ok(p) => (p.url, p.playwright, p.timeout), + Err(e) => { + eprintln!("Ugyldig payload-json: {e}"); + process::exit(1); + } + } + } else { + (cli.url.clone(), cli.playwright, cli.timeout) + }; + + match run(&url, use_playwright, timeout).await { + Ok(output) => { + println!("{}", serde_json::to_string_pretty(&output).unwrap()); + } + Err(e) => { + eprintln!("Feil: {e}"); + process::exit(1); + } + } +} + +async fn run(url: &str, use_playwright: bool, timeout: u64) -> Result { + let scripts_dir = find_scripts_dir()?; + info!(url, "Starter clip"); + + // Steg 1: Hent HTML + let html = if use_playwright { + info!("Bruker Playwright for å hente side"); + fetch_with_playwright(&scripts_dir, url, timeout).await? + } else { + match fetch_html(url, timeout).await { + Ok(html) => html, + Err(e) => { + warn!("HTTP-henting feilet: {e}, prøver Playwright"); + fetch_with_playwright(&scripts_dir, url, timeout).await? 
+ } + } + }; + + // Steg 2: Parse med Readability + let parsed = parse_with_readability(&scripts_dir, url, &html).await?; + + // Steg 3: Hvis Readability feilet, prøv Playwright (om vi ikke allerede brukte det) + let (parsed, source) = if parsed.error.is_some() && !use_playwright { + warn!("Readability feilet, prøver Playwright-fallback"); + let result = run_playwright_readability(&scripts_dir, url, timeout).await?; + (result, "playwright") + } else if parsed.error.is_some() { + return Err(format!( + "Readability feilet: {}", + parsed.error.unwrap_or_default() + )); + } else { + (parsed, "readability") + }; + + let content = parsed.content.unwrap_or_default(); + let content = clean_text(&content); + + // Steg 4: Forsøk å finne dato fra HTML + let date = extract_date(&html); + + // Steg 5: Detekter betalingsmur + let paywall = detect_paywall(&content, &html); + + if paywall { + info!("Betalingsmur detektert"); + } + + Ok(ClipOutput { + url: url.to_string(), + title: parsed.title, + author: parsed.author, + date, + content, + excerpt: parsed.excerpt, + paywall, + source: source.to_string(), + }) +} + +/// Finn scripts/-mappen relativt til binæren eller via miljøvariabel. 
+fn find_scripts_dir() -> Result { + if let Ok(dir) = std::env::var("SYNOPS_CLIP_SCRIPTS") { + let p = PathBuf::from(dir); + if p.exists() { + return Ok(p); + } + } + + // Prøv relativt til binæren: ../scripts/ (når installert) eller + // fra kilde: tools/synops-clip/scripts/ + let exe = std::env::current_exe().map_err(|e| format!("Fant ikke exe-sti: {e}"))?; + let exe_dir = exe.parent().ok_or("Ingen parent-mappe for exe")?; + + // I utvikling: target/release/ → ../../scripts/ + for candidate in &[ + exe_dir.join("../../scripts"), + exe_dir.join("../scripts"), + exe_dir.join("scripts"), + PathBuf::from("/home/vegard/synops/tools/synops-clip/scripts"), + ] { + let resolved = candidate + .canonicalize() + .unwrap_or_else(|_| candidate.clone()); + if resolved.join("readability.mjs").exists() { + debug!(?resolved, "Fant scripts-mappe"); + return Ok(resolved); + } + } + + Err("Fant ikke scripts/-mappen. Sett SYNOPS_CLIP_SCRIPTS.".to_string()) +} + +/// Hent HTML med reqwest. +async fn fetch_html(url: &str, timeout_secs: u64) -> Result { + let client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(timeout_secs)) + .user_agent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36") + .redirect(reqwest::redirect::Policy::limited(10)) + .build() + .map_err(|e| format!("Kunne ikke opprette HTTP-klient: {e}"))?; + + let resp = client + .get(url) + .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + .header("Accept-Language", "nb-NO,nb;q=0.9,no;q=0.8,en;q=0.5") + .send() + .await + .map_err(|e| format!("HTTP-forespørsel feilet: {e}"))?; + + let status = resp.status(); + if !status.is_success() { + return Err(format!("HTTP {status}")); + } + + resp.text() + .await + .map_err(|e| format!("Kunne ikke lese respons-body: {e}")) +} + +/// Hent ferdigrendret HTML via Playwright (for JS-tunge sider). 
+async fn fetch_with_playwright( + scripts_dir: &PathBuf, + url: &str, + timeout: u64, +) -> Result { + // Playwright-skriptet returnerer Readability-output direkte, + // men her trenger vi bare HTML. For ren HTML-henting bruker vi + // en enkel variant. Merk: vi kaller heller run_playwright_readability + // og får ferdig parsert resultat. + // + // For fallback-scenarioet der vi allerede har HTML men Readability feilet, + // bruker vi run_playwright_readability direkte. + // + // For scenarioet der HTTP-henting feilet, trenger vi Playwright for å hente + // selve HTML-en. Vi bruker playwright.mjs som returnerer Readability-output + // direkte, men vi trenger HTML for paywall-deteksjon. La oss hente HTML + // separat med en enkel Playwright-kommando. + + let output = tokio::process::Command::new("node") + .arg("--input-type=module") + .arg("-e") + .arg(format!( + r#" +import {{ chromium }} from 'playwright'; +const browser = await chromium.launch({{ headless: true }}); +const ctx = await browser.newContext({{ + userAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' +}}); +const page = await ctx.newPage(); +await page.goto({url}, {{ waitUntil: 'networkidle', timeout: {timeout} }}); +const html = await page.content(); +process.stdout.write(html); +await browser.close(); +"#, + url = serde_json::to_string(url).unwrap(), + timeout = timeout * 1000, + )) + .env( + "NODE_PATH", + scripts_dir.join("node_modules").to_string_lossy().to_string(), + ) + .output() + .await + .map_err(|e| format!("Playwright-prosess feilet: {e}"))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!("Playwright feilet: {stderr}")); + } + + String::from_utf8(output.stdout).map_err(|e| format!("Ugyldig UTF-8 fra Playwright: {e}")) +} + +/// Parse HTML med Readability via Node.js-hjelpeskript. 
+async fn parse_with_readability( + scripts_dir: &PathBuf, + url: &str, + html: &str, +) -> Result { + let script = scripts_dir.join("readability.mjs"); + + let mut child = tokio::process::Command::new("node") + .arg(&script) + .arg("--url") + .arg(url) + .env( + "NODE_PATH", + scripts_dir.join("node_modules").to_string_lossy().to_string(), + ) + .stdin(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()) + .stderr(std::process::Stdio::piped()) + .spawn() + .map_err(|e| format!("Kunne ikke starte readability.mjs: {e}"))?; + + // Skriv HTML til stdin + if let Some(mut stdin) = child.stdin.take() { + use tokio::io::AsyncWriteExt; + stdin + .write_all(html.as_bytes()) + .await + .map_err(|e| format!("Kunne ikke skrive til stdin: {e}"))?; + drop(stdin); + } + + let output = child + .wait_with_output() + .await + .map_err(|e| format!("Readability-prosess feilet: {e}"))?; + + let stdout = String::from_utf8_lossy(&output.stdout); + debug!(stdout = %stdout, "Readability-output"); + + serde_json::from_str::(&stdout) + .map_err(|e| format!("Kunne ikke parse Readability-output: {e} — raw: {stdout}")) +} + +/// Kjør Playwright + Readability i ett steg. 
+async fn run_playwright_readability( + scripts_dir: &PathBuf, + url: &str, + timeout: u64, +) -> Result { + let script = scripts_dir.join("playwright.mjs"); + + let output = tokio::process::Command::new("node") + .arg(&script) + .arg("--url") + .arg(url) + .arg("--timeout") + .arg((timeout * 1000).to_string()) + .env( + "NODE_PATH", + scripts_dir.join("node_modules").to_string_lossy().to_string(), + ) + .output() + .await + .map_err(|e| format!("Playwright-prosess feilet: {e}"))?; + + let stdout = String::from_utf8_lossy(&output.stdout); + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + warn!(stderr = %stderr, "Playwright feilet"); + } + + serde_json::from_str::(&stdout) + .map_err(|e| format!("Kunne ikke parse Playwright-output: {e} — raw: {stdout}")) +} + +/// Forsøk å ekstrahere publiseringsdato fra HTML-metadata. +fn extract_date(html: &str) -> Option { + // Vanlige meta-tagger for publiseringsdato + let patterns = [ + r#"property="article:published_time"\s+content="([^"]+)""#, + r#"content="([^"]+)"\s+property="article:published_time""#, + r#"name="date"\s+content="([^"]+)""#, + r#"content="([^"]+)"\s+name="date""#, + r#"name="pubdate"\s+content="([^"]+)""#, + r#"content="([^"]+)"\s+name="pubdate""#, + r#"name="publishdate"\s+content="([^"]+)""#, + r#"itemprop="datePublished"\s+content="([^"]+)""#, + r#"content="([^"]+)"\s+itemprop="datePublished""#, + r#"name="DC\.date"\s+content="([^"]+)""#, + r#""datePublished"\s*:\s*"([^"]+)""#, + ]; + + for pat in &patterns { + if let Ok(re) = Regex::new(pat) { + if let Some(caps) = re.captures(html) { + if let Some(date_str) = caps.get(1) { + let d = date_str.as_str().to_string(); + // Normaliser til YYYY-MM-DD om mulig + if d.len() >= 10 { + return Some(d[..10].to_string()); + } + return Some(d); + } + } + } + } + None +} + +/// Rens opp tekst fra Readability (fjern overflødige mellomrom/linjeskift). 
+fn clean_text(text: &str) -> String { + // Fjern ledende/etterfølgende whitespace per linje, kollapser blanke linjer + let lines: Vec<&str> = text.lines().map(|l| l.trim()).collect(); + + let mut result = Vec::new(); + let mut prev_empty = false; + for line in lines { + if line.is_empty() { + if !prev_empty { + result.push(""); + prev_empty = true; + } + } else { + result.push(line); + prev_empty = false; + } + } + + result.join("\n").trim().to_string() +} + +/// Detekter betalingsmur basert på innhold og HTML-mønstre. +fn detect_paywall(content: &str, html: &str) -> bool { + let content_lower = content.to_lowercase(); + let html_lower = html.to_lowercase(); + + // 1. Veldig kort innhold (under 200 tegn) tyder på betalingsmur + let trimmed_len = content.trim().len(); + if trimmed_len > 0 && trimmed_len < 200 { + debug!(trimmed_len, "Kort innhold — mulig betalingsmur"); + return true; + } + + // 2. Kjente betalingsmur-mønstre i innholdet + let paywall_content_patterns = [ + "logg inn for å lese", + "log in to read", + "sign in to continue", + "subscribe to continue", + "abonner for å lese", + "kun for abonnenter", + "for subscribers only", + "du må være innlogget", + "denne artikkelen er for abonnenter", + "kjøp tilgang", + "buy access", + "unlock this article", + "lås opp artikkelen", + "registrer deg for å lese", + "create an account to read", + "meld deg inn", + "bli abonnent", + "allerede abonnent", + "already a subscriber", + ]; + + for pattern in &paywall_content_patterns { + if content_lower.contains(pattern) { + debug!(pattern, "Betalingsmur-mønster funnet i innhold"); + return true; + } + } + + // 3. 
Kjente betalingsmur-klasser/elementer i HTML + let paywall_html_patterns = [ + "class=\"paywall", + "class=\"pw-", + "id=\"paywall", + "data-paywall", + "class=\"subscriber-only", + "class=\"premium-content", + "class=\"locked-content", + "class=\"article-paywall", + "class=\"metered-", + "class=\"piano-", + "id=\"piano-", + "class=\"tp-modal", + "data-piano", + "class=\"ns-paywall", + "class=\"nettavisen-paywall", + "class=\"schibsted-paywall", + "class=\"amedia-paywall", + ]; + + for pattern in &paywall_html_patterns { + if html_lower.contains(pattern) { + debug!(pattern, "Betalingsmur-mønster funnet i HTML"); + return true; + } + } + + // 4. Sjekk for meta-tagger som indikerer betalt innhold + if html_lower.contains(r#"content="locked""#) + || html_lower.contains(r#"content="premium""#) + || html_lower.contains("isAccessibleForFree") + && html_lower.contains(r#""false""#) + { + debug!("Betalingsmur-meta-tagg funnet"); + return true; + } + + false +}