#!/usr/bin/env node
// Command-line page extractor: loads one http(s) URL in a CloakBrowser
// context, waits for the page to settle, and prints a JSON summary
// (title, visible text, meta description, HTTP status) on stdout.
//
// Inputs:
//   argv[2]          target URL (required)
//   WAIT_TIME        milliseconds to wait after navigation (capped at MAX_WAIT_MS)
//   SCREENSHOT_PATH  when set, a viewport screenshot is written to this path
//   SAVE_HTML        "true" to also dump the page HTML to disk
//   HEADLESS         "false" to run with a visible browser window
//   USER_AGENT       optional user-agent override
//
// Failures are reported as a single JSON line on stderr with exit code 1.

import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";

const DEFAULT_WAIT_MS = 5000;
const MAX_WAIT_MS = 20000;
const NAV_TIMEOUT_MS = 30000;
const EXTRA_CHALLENGE_WAIT_MS = 8000;
const SCREENSHOT_TIMEOUT_MS = 10000;
const CONTENT_LIMIT = 12000;

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Writes a JSON error payload to stderr and terminates the process with
 * exit code 1. This function never returns.
 *
 * @param {string} message - Short description of the failure.
 * @param {string} [details] - Extra context, included only when truthy.
 */
function fail(message, details) {
  const payload = { error: message };
  if (details) payload.details = details;
  process.stderr.write(`${JSON.stringify(payload)}\n`);
  process.exit(1);
}

/**
 * Produces a printable description for any thrown value, including values
 * that are not Error instances.
 *
 * @param {unknown} error - The caught value.
 * @returns {string} The error's message, or its string form.
 */
function describeError(error) {
  return error?.message ?? String(error);
}

/**
 * Parses the post-navigation wait time in milliseconds.
 *
 * @param {string | undefined} raw - Raw value, typically from WAIT_TIME.
 * @returns {number} DEFAULT_WAIT_MS when the value is missing, non-numeric
 *   or negative; otherwise the parsed value capped at MAX_WAIT_MS.
 */
function parseWaitTime(raw) {
  // A missing or empty value parses to NaN and falls through to the default.
  const value = Number.parseInt(raw ?? "", 10);
  if (!Number.isFinite(value) || value < 0) return DEFAULT_WAIT_MS;
  return Math.min(value, MAX_WAIT_MS);
}

/**
 * Validates the target URL given on the command line. Exits the process
 * via fail() when the URL is missing, malformed, or not http(s).
 *
 * @param {string | undefined} rawUrl - The URL argument as typed.
 * @returns {string} The URL as serialized by the URL parser.
 */
function parseTarget(rawUrl) {
  if (!rawUrl) {
    fail("Missing URL. Usage: node extract.js <url>");
  }

  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch (error) {
    fail("Invalid URL.", describeError(error));
  }

  if (!["http:", "https:"].includes(parsed.protocol)) {
    fail("Only http and https URLs are allowed.");
  }
  return parsed.toString();
}

/**
 * Creates the directory that will contain filePath, including any missing
 * ancestors. Does nothing for an empty path.
 *
 * @param {string} filePath - Path of a file about to be written.
 */
function ensureParentDir(filePath) {
  if (!filePath) return;
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
}

/**
 * Chooses where the HTML dump is written.
 *
 * With a screenshot path, the HTML file sits next to the screenshot with
 * the extension swapped for ".html". Only a real extension on the final
 * path segment is replaced; when there is none, ".html" is appended, so the
 * dump can never land on the screenshot file itself and a dot inside a
 * directory name is never mistaken for an extension.
 *
 * Without a screenshot path, a timestamped file next to this script is used.
 *
 * @param {string} screenshotPath - Screenshot destination, or "" when unset.
 * @returns {string} Destination path for the HTML dump.
 */
function deriveHtmlPath(screenshotPath) {
  if (!screenshotPath) {
    return path.resolve(__dirname, `page-${Date.now()}.html`);
  }
  const extension = path.extname(screenshotPath);
  const stem = extension
    ? screenshotPath.slice(0, -extension.length)
    : screenshotPath;
  return `${stem}.html`;
}

/**
 * Resolves after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Checks whether the page currently looks like an anti-bot interstitial,
 * based on known phrases in the visible text or an iframe whose src
 * contains "challenge" or "cloudflare".
 *
 * @param {object} page - Page object exposing evaluate().
 * @returns {Promise<boolean>} true when a challenge marker is found; false
 *   otherwise, including when the evaluation itself throws.
 */
async function detectChallenge(page) {
  try {
    return await page.evaluate(() => {
      const text = (document.body?.innerText || "").toLowerCase();
      return (
        text.includes("checking your browser") ||
        text.includes("just a moment") ||
        text.includes("verify you are human") ||
        text.includes("press and hold") ||
        document.querySelector('iframe[src*="challenge"]') !== null ||
        document.querySelector('iframe[src*="cloudflare"]') !== null
      );
    });
  } catch {
    // Deliberate best-effort: if the page cannot be evaluated, report
    // "no challenge" rather than aborting the scrape.
    return false;
  }
}

/**
 * Dynamically imports the cloakbrowser package, exiting via fail() with an
 * install hint when the import fails.
 *
 * @returns {Promise<object>} The imported module namespace.
 */
async function loadCloakBrowser() {
  try {
    return await import("cloakbrowser");
  } catch (error) {
    fail(
      "CloakBrowser is not installed for this skill. Run pnpm install in this skill's scripts directory first.",
      describeError(error)
    );
  }
}

/**
 * Runs fn while console.log and console.error are redirected to stderr, so
 * anything logged during the call stays out of the JSON written to stdout.
 * The original console methods are restored afterwards, even if fn throws.
 *
 * @template T
 * @param {() => T | Promise<T>} fn - Work to run with redirected logging.
 * @returns {Promise<T>} Whatever fn resolves to.
 */
async function runWithStderrLogs(fn) {
  const originalLog = console.log;
  const originalError = console.error;
  const toStderr = (...args) => process.stderr.write(`${args.join(" ")}\n`);
  console.log = toStderr;
  console.error = toStderr;
  try {
    return await fn();
  } finally {
    console.log = originalLog;
    console.error = originalError;
  }
}

/**
 * Closes the browser context, ignoring any error: by the time this runs the
 * outcome of the scrape (result or failure) has already been decided.
 *
 * @param {object | undefined} context - Browser context, if one was launched.
 */
async function closeQuietly(context) {
  if (!context) return;
  try {
    await context.close();
  } catch {
    // Close errors must not mask the primary outcome.
  }
}

/**
 * Entry point: validates input, drives the browser, and prints the result
 * JSON to stdout. Any failure while scraping is reported through fail().
 *
 * @returns {Promise<void>}
 */
async function main() {
  const requestedUrl = parseTarget(process.argv[2]);
  const waitTime = parseWaitTime(process.env.WAIT_TIME);
  const screenshotPath = process.env.SCREENSHOT_PATH || "";
  const saveHtml = process.env.SAVE_HTML === "true";
  const headless = process.env.HEADLESS !== "false";
  const userAgent = process.env.USER_AGENT || undefined;
  const startedAt = Date.now();

  const { ensureBinary, launchContext } = await loadCloakBrowser();

  let context;
  let failure = null;
  try {
    await runWithStderrLogs(() => ensureBinary());
    context = await runWithStderrLogs(() =>
      launchContext({
        headless,
        userAgent,
        locale: "en-US",
        viewport: { width: 1440, height: 900 },
        humanize: true,
      })
    );

    const page = await context.newPage();
    const response = await page.goto(requestedUrl, {
      waitUntil: "domcontentloaded",
      timeout: NAV_TIMEOUT_MS,
    });
    await sleep(waitTime);

    // Give an interstitial one extra grace period, then re-check so the
    // reported flag reflects the state the content was extracted from.
    let challengeDetected = await detectChallenge(page);
    if (challengeDetected) {
      await sleep(EXTRA_CHALLENGE_WAIT_MS);
      challengeDetected = await detectChallenge(page);
    }

    const extracted = await page.evaluate((contentLimit) => {
      const bodyText = document.body?.innerText || "";
      return {
        finalUrl: window.location.href,
        title: document.title || "",
        content: bodyText.slice(0, contentLimit),
        metaDescription:
          document.querySelector('meta[name="description"]')?.content ||
          document.querySelector('meta[property="og:description"]')?.content ||
          "",
      };
    }, CONTENT_LIMIT);

    const result = {
      requestedUrl,
      finalUrl: extracted.finalUrl,
      title: extracted.title,
      content: extracted.content,
      metaDescription: extracted.metaDescription,
      status: response ? response.status() : null,
      challengeDetected,
      // Kept as a fixed-precision string to preserve the output format.
      elapsedSeconds: ((Date.now() - startedAt) / 1000).toFixed(2),
    };

    if (screenshotPath) {
      ensureParentDir(screenshotPath);
      await page.screenshot({
        path: screenshotPath,
        fullPage: false,
        timeout: SCREENSHOT_TIMEOUT_MS,
      });
      result.screenshot = screenshotPath;
    }

    if (saveHtml) {
      const htmlTarget = deriveHtmlPath(screenshotPath);
      ensureParentDir(htmlTarget);
      fs.writeFileSync(htmlTarget, await page.content());
      result.htmlFile = htmlTarget;
    }

    process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
  } catch (error) {
    failure = error;
  } finally {
    // Always release the browser; fail() exits the process, so it must
    // only be called after cleanup has finished.
    await closeQuietly(context);
  }

  if (failure !== null) {
    fail("Scrape failed.", describeError(failure));
  }
}

main().catch((error) => fail("Scrape failed.", describeError(error)));