Files
ai-coding-skills/skills/web-automation/cursor/scripts/extract.js
T
stefano 251148c3ff
check / check (ubuntu-latest) (push) Successful in 2m5s
check / check (macos-latest) (push) Has been cancelled
check-online / check-online (ubuntu-latest) (push) Successful in 1m53s
Perform code optimization and document cleanup (#1)
## Summary
- add repository-wide quality tooling and verification scaffolding, including CI workflows, pnpm workspace setup, ESLint/Prettier/markdown checks, and generated-output verification helpers
- reorganize skill sources and generation flow by introducing canonical `_source` variants, generator/manifests, reusable helper abstractions, and shared web-automation/browser utilities
- clean up and expand documentation so the root README flows into docs and skill docs, with clearer development, reviewer, installer, and workflow guidance

## Notable changes
- docs flow and consistency cleanup across `README.md`, `docs/README.md`, and related docs
- new scripts for `check`, docs verification, generated-file verification, shell portability, and safe directory replacement
- refactors in Atlassian and web-automation skill runtimes to reduce duplication and centralize reusable code
- changelog, development documentation, and CI surface updates

## Test Plan
- [ ] `pnpm run check`
- [ ] review generated/manifests and skill sync outputs
- [ ] smoke-check docs flow from `README.md` to `docs/README.md` to skill docs

## Notes
- this branch currently includes tracked `skills/web-automation/shared/node_modules` content that should be reviewed carefully as potentially noisy/accidental committed artifacts

Co-authored-by: Stefano Fiorini <stefano.fiorini@firsthorizon.com>
Reviewed-on: #1
2026-05-04 04:41:34 +00:00

190 lines
5.3 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
// ⚠️ GENERATED FILE do not edit directly. Edit the canonical source in skills/web-automation/shared/ and run `pnpm run sync:pi`.
import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
// Delay (ms) used after navigation when WAIT_TIME is missing or unusable.
const DEFAULT_WAIT_MS = 5 * 1000;
// Ceiling (ms) that any requested wait is clamped to.
const MAX_WAIT_MS = 20 * 1000;
// Timeout (ms) handed to page.goto().
const NAV_TIMEOUT_MS = 30 * 1000;
// Extra delay (ms) before re-checking a page that still looks like a challenge.
const EXTRA_CHALLENGE_WAIT_MS = 8 * 1000;
// Number of leading characters of the body text kept in the result.
const CONTENT_LIMIT = 12000;

// ES modules do not define __filename/__dirname, so derive both from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Report a fatal error as a single line of JSON on stderr, then terminate
 * the process with exit code 1.
 *
 * @param {string} message - Short description, emitted under the `error` key.
 * @param {*} [details] - Extra context; emitted under `details` only when truthy.
 */
function fail(message, details) {
  const report = details ? { error: message, details } : { error: message };
  process.stderr.write(`${JSON.stringify(report)}\n`);
  process.exit(1);
}
/**
 * Turn the raw WAIT_TIME setting into a wait duration in milliseconds.
 *
 * A missing/empty value, a value that does not parse as a base-10 integer,
 * or a negative value yields DEFAULT_WAIT_MS. Anything else is capped at
 * MAX_WAIT_MS.
 *
 * @param {string|undefined} raw - Raw setting text.
 * @returns {number} Wait duration in milliseconds.
 */
function parseWaitTime(raw) {
  const text = raw || `${DEFAULT_WAIT_MS}`;
  const parsed = Number.parseInt(text, 10);
  const usable = Number.isFinite(parsed) && parsed >= 0;
  return usable ? Math.min(parsed, MAX_WAIT_MS) : DEFAULT_WAIT_MS;
}
/**
 * Validate the URL given on the command line and return its normalized form.
 *
 * Calls fail() when the argument is missing, cannot be parsed by the URL
 * constructor, or uses a protocol other than http/https.
 *
 * @param {string|undefined} rawUrl - URL text as supplied by the user.
 * @returns {string} The parsed URL serialized back to a string.
 */
function parseTarget(rawUrl) {
  if (!rawUrl) fail("Missing URL. Usage: node extract.js <URL>");

  let target;
  try {
    target = new URL(rawUrl);
  } catch (parseError) {
    fail("Invalid URL.", parseError.message);
  }

  const isWebProtocol = target.protocol === "http:" || target.protocol === "https:";
  if (!isWebProtocol) fail("Only http and https URLs are allowed.");

  return target.toString();
}
/**
 * Create the directory that will contain `filePath`, including any missing
 * ancestors. Does nothing when `filePath` is empty.
 *
 * @param {string} filePath - Path of a file about to be written.
 */
function ensureParentDir(filePath) {
  if (filePath) {
    const parentDir = path.dirname(filePath);
    fs.mkdirSync(parentDir, { recursive: true });
  }
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Check whether the loaded page looks like an anti-bot interstitial.
 *
 * The check runs inside the page: it looks for a handful of known phrases in
 * the lower-cased body text and for iframes whose `src` mentions "challenge"
 * or "cloudflare". Any error raised while evaluating is treated as
 * "no challenge".
 *
 * @param {object} page - Page object exposing `evaluate(fn)`.
 * @returns {Promise<boolean>} True when a challenge marker was found.
 */
async function detectChallenge(page) {
  try {
    // The callback executes in the page, so it must not reference anything
    // from this module's scope.
    return await page.evaluate(() => {
      const bodyText = (document.body?.innerText || "").toLowerCase();
      const phrases = [
        "checking your browser",
        "just a moment",
        "verify you are human",
        "press and hold",
      ];
      if (phrases.some((phrase) => bodyText.includes(phrase))) {
        return true;
      }
      const frameSelectors = ['iframe[src*="challenge"]', 'iframe[src*="cloudflare"]'];
      return frameSelectors.some((selector) => document.querySelector(selector) !== null);
    });
  } catch {
    return false;
  }
}
/**
 * Dynamically import the `cloakbrowser` package.
 *
 * When the import fails, fail() is called with installation instructions and
 * the underlying error message.
 *
 * @returns {Promise<object>} The imported module namespace.
 */
async function loadCloakBrowser() {
  let cloakModule;
  try {
    cloakModule = await import("cloakbrowser");
  } catch (importError) {
    fail(
      "CloakBrowser is not installed for this skill. Run pnpm install in this skill's scripts directory first.",
      importError.message
    );
  }
  return cloakModule;
}
/**
 * Run `fn` while console output is diverted to stderr, so nothing it logs
 * can end up on stdout next to the JSON result.
 *
 * `console.log` and `console.error` are redirected, and so are
 * `console.info` and `console.debug`: in Node.js those two also write to
 * stdout and are separate properties, so replacing `console.log` alone does
 * not cover them. Arguments are joined with single spaces and terminated
 * with a newline. The original methods are restored whether `fn` resolves
 * or throws.
 *
 * @param {() => (Promise<*>|*)} fn - Work to run while output is diverted.
 * @returns {Promise<*>} Whatever `fn` resolves to.
 */
async function runWithStderrLogs(fn) {
  const redirectedMethods = ["log", "info", "debug", "error"];
  const originals = new Map(redirectedMethods.map((method) => [method, console[method]]));
  const writeToStderr = (...args) => process.stderr.write(`${args.join(" ")}\n`);

  for (const method of redirectedMethods) {
    console[method] = writeToStderr;
  }
  try {
    return await fn();
  } finally {
    for (const [method, original] of originals) {
      console[method] = original;
    }
  }
}
/**
 * Work out where the HTML snapshot goes when it is saved next to a screenshot.
 *
 * Only the extension of the final path segment is swapped for `.html`;
 * dots in directory names are left alone. If that would produce the
 * screenshot path itself (the screenshot already ends in `.html`), `.html`
 * is appended instead so the two files never share a path.
 *
 * @param {string} screenshotPath - Non-empty screenshot file path.
 * @returns {string} Path for the HTML snapshot.
 */
function deriveHtmlTarget(screenshotPath) {
  // `base` must be cleared, otherwise path.format() ignores `name`/`ext`.
  const candidate = path.format({ ...path.parse(screenshotPath), base: undefined, ext: ".html" });
  return candidate === screenshotPath ? `${screenshotPath}.html` : candidate;
}

/**
 * Script entry point: load the URL from argv[2] in a CloakBrowser context,
 * extract title/text/meta description, and print the result as JSON on stdout.
 *
 * Environment variables read:
 * - WAIT_TIME: delay in ms after navigation (parsed by parseWaitTime).
 * - SCREENSHOT_PATH: when non-empty, a viewport screenshot is written there.
 * - SAVE_HTML: when exactly "true", the page HTML is written next to the
 *   screenshot, or into this script's directory when no screenshot is taken.
 * - HEADLESS: headless unless set to exactly "false".
 * - USER_AGENT: optional user-agent passed to launchContext.
 *
 * Any error raised while scraping is reported through fail() after the
 * browser context has been closed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const requestedUrl = parseTarget(process.argv[2]);
  const waitTime = parseWaitTime(process.env.WAIT_TIME);
  const screenshotPath = process.env.SCREENSHOT_PATH || "";
  const saveHtml = process.env.SAVE_HTML === "true";
  const headless = process.env.HEADLESS !== "false";
  const userAgent = process.env.USER_AGENT || undefined;
  const startedAt = Date.now();
  const { ensureBinary, launchContext } = await loadCloakBrowser();

  let context;
  let failed = false;
  let failure;
  try {
    await runWithStderrLogs(() => ensureBinary());
    context = await runWithStderrLogs(() =>
      launchContext({
        headless,
        userAgent,
        locale: "en-US",
        viewport: { width: 1440, height: 900 },
        humanize: true,
      })
    );
    const page = await context.newPage();
    const response = await page.goto(requestedUrl, {
      waitUntil: "domcontentloaded",
      timeout: NAV_TIMEOUT_MS,
    });

    await sleep(waitTime);
    let challengeDetected = await detectChallenge(page);
    if (challengeDetected) {
      // Wait a while longer, then record whether the challenge is still showing.
      await sleep(EXTRA_CHALLENGE_WAIT_MS);
      challengeDetected = await detectChallenge(page);
    }

    // Runs inside the page; the limit is passed as an argument because the
    // callback cannot see this module's scope.
    const extracted = await page.evaluate((contentLimit) => {
      const bodyText = document.body?.innerText || "";
      return {
        finalUrl: window.location.href,
        title: document.title || "",
        content: bodyText.slice(0, contentLimit),
        metaDescription:
          document.querySelector('meta[name="description"]')?.content ||
          document.querySelector('meta[property="og:description"]')?.content ||
          "",
      };
    }, CONTENT_LIMIT);

    const result = {
      requestedUrl,
      finalUrl: extracted.finalUrl,
      title: extracted.title,
      content: extracted.content,
      metaDescription: extracted.metaDescription,
      status: response ? response.status() : null,
      challengeDetected,
      elapsedSeconds: ((Date.now() - startedAt) / 1000).toFixed(2),
    };

    if (screenshotPath) {
      ensureParentDir(screenshotPath);
      await page.screenshot({ path: screenshotPath, fullPage: false, timeout: 10000 });
      result.screenshot = screenshotPath;
    }

    if (saveHtml) {
      const htmlTarget = screenshotPath
        ? deriveHtmlTarget(screenshotPath)
        : path.resolve(__dirname, `page-${Date.now()}.html`);
      ensureParentDir(htmlTarget);
      fs.writeFileSync(htmlTarget, await page.content());
      result.htmlFile = htmlTarget;
    }

    process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
  } catch (error) {
    // Defer reporting: fail() exits the process, and the context must be
    // closed first.
    failed = true;
    failure = error;
  } finally {
    if (context) {
      try {
        await context.close();
      } catch {
        // A close error must not mask the primary failure, nor turn an
        // already-printed result into a reported failure.
      }
    }
  }

  if (failed) {
    fail("Scrape failed.", failure instanceof Error ? failure.message : String(failure));
  }
}
// Run the scraper. Scrape errors are caught inside main() and reported via
// fail(), which writes JSON to stderr and exits with status 1.
main();