Nytt verktøy som henter og parser webartikler til ren tekst + metadata. Bruker Mozilla Readability (via Node.js) for artikkelekstraksjon, med Playwright som fallback for JS-rendrede sider.

Arkitektur:
- Rust CLI (clap, reqwest) håndterer HTTP-henting, paywall-deteksjon, JSON-output
- Node.js-hjelpeskript (readability.mjs) bruker @mozilla/readability + jsdom
- Playwright-skript (playwright.mjs) for headless browser-fallback
- Støtter --payload-json for maskinrommet/jobbkø-integrasjon

Paywall-deteksjon basert på:
- Kort innhold (<200 tegn)
- Norske/engelske paywall-fraser i innholdet
- CSS-klasser/HTML-attributter (piano, schibsted, amedia, etc.)
- Schema.org isAccessibleForFree meta-tagg

Output: JSON med title, author, date, content, url, paywall, excerpt, source
63 lines
1.8 KiB
JavaScript
63 lines
1.8 KiB
JavaScript
#!/usr/bin/env node
|
|
// playwright.mjs — Fetch a JS-rendered page with Playwright, then extract with Readability.
|
|
//
|
|
// Args: --url <url> (required)
|
|
// --timeout <ms> (optional, default 30000)
|
|
// Output: JSON on stdout (same shape as readability.mjs)
|
|
//
|
|
// Requires: npx playwright install chromium (one-time setup)
|
|
|
|
import { Readability } from "@mozilla/readability";
|
|
import { JSDOM } from "jsdom";
|
|
import { chromium } from "playwright";
|
|
|
|
// Command-line arguments with the node binary and script path stripped off.
const args = process.argv.slice(2);

/**
 * Look up the value that follows a named flag in `args`.
 *
 * @param {string} name - Flag to search for, e.g. "--url".
 * @param {*} fallback - Returned when the flag is absent or nothing follows it.
 * @returns {*} The argument right after the first occurrence of `name`,
 *   otherwise `fallback`.
 */
function getArg(name, fallback) {
  const idx = args.indexOf(name);
  if (idx === -1) return fallback;
  // A flag passed as the very last argument has no value after it; without
  // this the lookup would yield `undefined` instead of the caller's fallback.
  return args[idx + 1] ?? fallback;
}
|
|
|
|
// Address of the page to fetch; mandatory.
const url = getArg("--url", null);
// Navigation timeout in milliseconds (defaults to 30000 when the flag is absent).
const timeout = Number.parseInt(getArg("--timeout", "30000"), 10);

if (!url) {
  process.stderr.write("--url is required\n");
  process.exit(1);
}

// parseInt yields NaN for a missing or non-numeric value. Reject that (and
// negative numbers) here instead of handing a bogus timeout to the browser.
if (Number.isNaN(timeout) || timeout < 0) {
  process.stderr.write("--timeout must be a non-negative integer (milliseconds)\n");
  process.exit(1);
}
|
|
|
|
// Main flow: render the page in headless Chromium, run Readability over the
// rendered HTML and print a single JSON document on stdout.
//
// Failures are reported as {"error": ...} on stdout with exit code 1. The exit
// code is set through process.exitCode rather than process.exit(): exit()
// terminates immediately, which would skip the `finally` clean-up below and
// can cut off stdout output that has not been flushed yet.
let browser;
try {
  browser = await chromium.launch({ headless: true });
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  });
  const page = await context.newPage();
  await page.goto(url, { waitUntil: "networkidle", timeout });

  // Serialize the rendered DOM and re-parse it with jsdom so Readability can
  // work on it; passing `url` gives the document its base URL.
  const html = await page.content();
  const dom = new JSDOM(html, { url });
  const article = new Readability(dom.window.document).parse();

  if (article) {
    process.stdout.write(
      JSON.stringify({
        title: article.title || null,
        author: article.byline || null,
        content: article.textContent || "",
        excerpt: article.excerpt || null,
        siteName: article.siteName || null,
        length: article.length || 0,
      })
    );
  } else {
    // parse() returned nothing usable.
    process.stdout.write(JSON.stringify({ error: "readability_failed" }));
    process.exitCode = 1;
  }
} catch (e) {
  // A thrown value is not guaranteed to be an Error; fall back to its string
  // form so the "error" field is never dropped from the JSON.
  process.stdout.write(JSON.stringify({ error: e?.message ?? String(e) }));
  process.exitCode = 1;
} finally {
  // Runs on every path now that nothing above calls process.exit().
  if (browser) await browser.close();
}
|