Nytt verktøy som henter og parser webartikler til ren tekst + metadata. Bruker Mozilla Readability (via Node.js) for artikkelekstraksjon, med Playwright som fallback for JS-rendrede sider.

Arkitektur:
- Rust CLI (clap, reqwest) håndterer HTTP-henting, paywall-deteksjon, JSON-output
- Node.js-hjelpeskript (readability.mjs) bruker @mozilla/readability + jsdom
- Playwright-script (playwright.mjs) for headless browser-fallback
- Støtter --payload-json for maskinrommet/jobbkø-integrasjon

Paywall-deteksjon basert på:
- Kort innhold (<200 tegn)
- Norske/engelske paywall-fraser i innholdet
- CSS-klasser/HTML-attributter (piano, schibsted, amedia, etc.)
- Schema.org isAccessibleForFree meta-tagg

Output: JSON med title, author, date, content, url, paywall, excerpt, source
43 lines
1.3 KiB
JavaScript
43 lines
1.3 KiB
JavaScript
#!/usr/bin/env node
|
|
// readability.mjs — Parse HTML with Mozilla Readability.
|
|
//
|
|
// Input: HTML on stdin
|
|
// Args:   --url <url> (optional; document URL given to jsdom for relative URL resolution, defaults to https://example.com)
|
|
// Output: JSON on stdout with title, author, content, excerpt, siteName, length
|
|
//
|
|
// Exit code 0 = success, 1 = parse failure or error.
|
|
|
|
import { Readability } from "@mozilla/readability";
|
|
import { JSDOM } from "jsdom";
|
|
import { readFileSync } from "node:fs";
|
|
|
|
// Document URL used when --url is absent or has no value.
const DEFAULT_URL = "https://example.com";

/**
 * Pick the document URL from the command-line arguments.
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {string} The token following `--url`, or DEFAULT_URL when the flag
 *   is missing or is the last argument.
 */
function resolveUrl(argv) {
  const flagIdx = argv.indexOf("--url");
  if (flagIdx === -1) {
    return DEFAULT_URL;
  }
  return argv[flagIdx + 1] ?? DEFAULT_URL;
}

/**
 * Serialize a payload as JSON and write it to stdout.
 *
 * @param {object} payload - JSON-serializable result or error object.
 */
function emit(payload) {
  process.stdout.write(JSON.stringify(payload));
}

/**
 * Read HTML from stdin, run Readability over it and print the result as JSON.
 *
 * On success the JSON has title, author, content, excerpt, siteName and length.
 * On failure the JSON has a single `error` field and the exit code is set to 1.
 */
function main() {
  try {
    // File descriptor 0 is stdin; unlike the "/dev/stdin" path it does not
    // depend on that device node existing. Reading inside the try block means
    // an I/O error is reported as JSON like every other failure.
    const html = readFileSync(0, "utf-8");

    const url = resolveUrl(process.argv.slice(2));
    const dom = new JSDOM(html, { url });
    const article = new Readability(dom.window.document).parse();

    if (!article) {
      emit({ error: "readability_failed" });
      // Set the exit code instead of calling process.exit() so the write
      // above is not cut short by a forced exit.
      process.exitCode = 1;
      return;
    }

    emit({
      title: article.title || null,
      author: article.byline || null,
      content: article.textContent || "",
      excerpt: article.excerpt || null,
      siteName: article.siteName || null,
      length: article.length || 0,
    });
  } catch (e) {
    // A thrown non-Error value has no .message; stringify it so the error
    // field is never dropped from the JSON output.
    emit({ error: e instanceof Error ? e.message : String(e) });
    process.exitCode = 1;
  }
}

main();
|