feat(installer): improve cursor and opencode skill handling

This commit is contained in:
Stefano Fiorini
2026-04-24 02:20:06 -05:00
parent d62899308a
commit 193cd45db8
30 changed files with 3832 additions and 64 deletions
+112
View File
@@ -0,0 +1,112 @@
---
name: web-automation
description: Browse and scrape web pages using Playwright-compatible CloakBrowser. Use when automating web workflows, extracting rendered page content, handling authenticated sessions, or running multi-step browser flows.
---
# Web Automation with CloakBrowser (Cursor)
Automated web browsing and scraping using Playwright-compatible CloakBrowser with two execution paths:
- one-shot extraction via `extract.js`
- broader stateful automation via `auth.ts`, `browse.ts`, `flow.ts`, `scan-local-app.ts`, and `scrape.ts`
## Requirements
- Node.js 20+
- pnpm
- Network access to download the CloakBrowser binary on first use
## First-Time Setup
Repo-local install:
```bash
cd .cursor/skills/web-automation/scripts
pnpm install
npx cloakbrowser install
pnpm approve-builds
pnpm rebuild better-sqlite3 esbuild
```
Global install:
```bash
cd ~/.cursor/skills/web-automation/scripts
pnpm install
npx cloakbrowser install
pnpm approve-builds
pnpm rebuild better-sqlite3 esbuild
```
## Updating CloakBrowser
Run from the installed `scripts/` directory:
```bash
pnpm up cloakbrowser playwright-core
npx cloakbrowser install
pnpm approve-builds
pnpm rebuild better-sqlite3 esbuild
```
## Prerequisite Check (MANDATORY)
Before running automation, verify CloakBrowser and Playwright Core are installed and wired correctly.
```bash
cd .cursor/skills/web-automation/scripts || cd ~/.cursor/skills/web-automation/scripts
node check-install.js
```
If the check fails, stop and return:
"Missing dependency/config: web-automation requires `cloakbrowser` and `playwright-core` with CloakBrowser-based scripts. Run setup in this skill, then retry."
If runtime fails with missing native bindings for `better-sqlite3` or `esbuild`, run:
```bash
cd .cursor/skills/web-automation/scripts || cd ~/.cursor/skills/web-automation/scripts
pnpm approve-builds
pnpm rebuild better-sqlite3 esbuild
```
## When To Use Which Command
- Use `node extract.js "<URL>"` for a one-shot rendered fetch with JSON output.
- Use `npx tsx scrape.ts ...` when you need markdown extraction, Readability cleanup, or selector-based scraping.
- Use `npx tsx browse.ts ...`, `auth.ts`, or `flow.ts` when the task needs login handling, persistent sessions, clicks, typing, screenshots, or multi-step navigation.
- Use `npx tsx scan-local-app.ts` when you need a configurable local-app smoke pass driven by `SCAN_*` and `CLOAKBROWSER_*` environment variables.
## Quick Reference
- Install check: `node check-install.js`
- One-shot JSON extract: `node extract.js "https://example.com"`
- Browse page: `npx tsx browse.ts --url "https://example.com"`
- Scrape markdown: `npx tsx scrape.ts --url "https://example.com" --mode main --output page.md`
- Authenticate: `npx tsx auth.ts --url "https://example.com/login"`
- Natural-language flow: `npx tsx flow.ts --instruction 'go to https://example.com then click on "Login" then type "user@example.com" in #email then press enter'`
- Local app smoke scan: `SCAN_BASE_URL=http://localhost:3000 SCAN_ROUTES=/,/dashboard npx tsx scan-local-app.ts`
## Local App Smoke Scan
`scan-local-app.ts` is intentionally generic. Configure it with environment variables instead of editing the file:
- `SCAN_BASE_URL`
- `SCAN_LOGIN_PATH`
- `SCAN_USERNAME`
- `SCAN_PASSWORD`
- `SCAN_USERNAME_SELECTOR`
- `SCAN_PASSWORD_SELECTOR`
- `SCAN_SUBMIT_SELECTOR`
- `SCAN_ROUTES`
- `SCAN_REPORT_PATH`
- `SCAN_HEADLESS`
If `SCAN_USERNAME` or `SCAN_PASSWORD` are omitted, the script falls back to `CLOAKBROWSER_USERNAME` and `CLOAKBROWSER_PASSWORD`.
## Notes
- Sessions persist in CloakBrowser profile storage.
- Use `--wait` for dynamic pages.
- Use `--mode selector --selector "..."` for targeted extraction.
- `extract.js` keeps a bounded stealth/rendered fetch path without needing a long-lived automation session.
@@ -0,0 +1,575 @@
#!/usr/bin/env npx tsx
/**
* Authentication handler for web automation
* Supports generic form login and Microsoft SSO (MSAL)
*
* Usage:
* npx tsx auth.ts --url "https://example.com/login" --type form
* npx tsx auth.ts --url "https://example.com" --type msal
* npx tsx auth.ts --url "https://example.com" --type auto
*/
import { getPage, launchBrowser } from './browse.js';
import parseArgs from 'minimist';
import type { Page, BrowserContext } from 'playwright-core';
import { createInterface } from 'readline';
// Types
type AuthType = 'auto' | 'form' | 'msal';
interface AuthOptions {
url: string;
authType: AuthType;
credentials?: {
username: string;
password: string;
};
headless?: boolean;
timeout?: number;
}
interface AuthResult {
success: boolean;
finalUrl: string;
authType: AuthType;
message: string;
}
// Get credentials from environment or options
function getCredentials(options?: {
username?: string;
password?: string;
}): { username: string; password: string } | null {
const username = options?.username || process.env.CLOAKBROWSER_USERNAME;
const password = options?.password || process.env.CLOAKBROWSER_PASSWORD;
if (!username || !password) {
return null;
}
return { username, password };
}
// Prompt user for input (for MFA or credentials)
async function promptUser(question: string, hidden = false): Promise<string> {
const rl = createInterface({
input: process.stdin,
output: process.stdout,
});
return new Promise((resolve) => {
if (hidden) {
process.stdout.write(question);
// Note: This is a simple implementation. For production, use a proper hidden input library
}
rl.question(question, (answer) => {
rl.close();
resolve(answer);
});
});
}
// Detect authentication type from page
async function detectAuthType(page: Page): Promise<AuthType> {
const url = page.url();
// Check for Microsoft login
if (
url.includes('login.microsoftonline.com') ||
url.includes('login.live.com') ||
url.includes('login.windows.net')
) {
return 'msal';
}
// Check for common form login patterns
const hasLoginForm = await page.evaluate(() => {
const passwordField = document.querySelector(
'input[type="password"], input[name*="password"], input[id*="password"]'
);
const usernameField = document.querySelector(
'input[type="email"], input[type="text"][name*="user"], input[type="text"][name*="email"], input[id*="user"], input[id*="email"]'
);
return !!(passwordField && usernameField);
});
if (hasLoginForm) {
return 'form';
}
return 'auto';
}
// Handle generic form login
async function handleFormLogin(
page: Page,
credentials: { username: string; password: string },
timeout: number
): Promise<boolean> {
console.log('Attempting form login...');
// Find and fill username/email field
const usernameSelectors = [
'input[type="email"]',
'input[name*="user" i]',
'input[name*="email" i]',
'input[id*="user" i]',
'input[id*="email" i]',
'input[autocomplete="username"]',
'input[type="text"]:first-of-type',
];
let usernameField = null;
for (const selector of usernameSelectors) {
usernameField = await page.$(selector);
if (usernameField) break;
}
if (!usernameField) {
console.error('Could not find username/email field');
return false;
}
await usernameField.fill(credentials.username);
console.log('Filled username field');
// Find and fill password field
const passwordSelectors = [
'input[type="password"]',
'input[name*="password" i]',
'input[id*="password" i]',
'input[autocomplete="current-password"]',
];
let passwordField = null;
for (const selector of passwordSelectors) {
passwordField = await page.$(selector);
if (passwordField) break;
}
if (!passwordField) {
console.error('Could not find password field');
return false;
}
await passwordField.fill(credentials.password);
console.log('Filled password field');
// Check for "Remember me" checkbox and check it
const rememberCheckbox = await page.$(
'input[type="checkbox"][name*="remember" i], input[type="checkbox"][id*="remember" i]'
);
if (rememberCheckbox) {
await rememberCheckbox.check();
console.log('Checked "Remember me" checkbox');
}
// Find and click submit button
const submitSelectors = [
'button[type="submit"]',
'input[type="submit"]',
'button:has-text("Sign in")',
'button:has-text("Log in")',
'button:has-text("Login")',
'button:has-text("Submit")',
'[role="button"]:has-text("Sign in")',
];
let submitButton = null;
for (const selector of submitSelectors) {
submitButton = await page.$(selector);
if (submitButton) break;
}
if (!submitButton) {
// Try pressing Enter as fallback
await passwordField.press('Enter');
} else {
await submitButton.click();
}
console.log('Submitted login form');
// Wait for navigation or error
try {
await page.waitForNavigation({ timeout, waitUntil: 'domcontentloaded' });
return true;
} catch {
// Check if we're still on login page with error
const errorMessages = await page.$$eval(
'.error, .alert-danger, [role="alert"], .login-error',
(els) => els.map((el) => el.textContent?.trim()).filter(Boolean)
);
if (errorMessages.length > 0) {
console.error('Login error:', errorMessages.join(', '));
return false;
}
return true; // Might have succeeded without navigation
}
}
// Handle Microsoft SSO login
async function handleMsalLogin(
page: Page,
credentials: { username: string; password: string },
timeout: number
): Promise<boolean> {
console.log('Attempting Microsoft SSO login...');
const currentUrl = page.url();
// If not already on Microsoft login, wait for redirect
if (!currentUrl.includes('login.microsoftonline.com')) {
try {
await page.waitForURL('**/login.microsoftonline.com/**', { timeout: 10000 });
} catch {
console.log('Not redirected to Microsoft login');
return false;
}
}
// Wait for email input
const emailInput = await page.waitForSelector(
'input[type="email"], input[name="loginfmt"]',
{ timeout }
);
if (!emailInput) {
console.error('Could not find email input on Microsoft login');
return false;
}
// Fill email and submit
await emailInput.fill(credentials.username);
console.log('Filled email field');
const nextButton = await page.$('input[type="submit"], button[type="submit"]');
if (nextButton) {
await nextButton.click();
} else {
await emailInput.press('Enter');
}
// Wait for password page
try {
await page.waitForSelector(
'input[type="password"], input[name="passwd"]',
{ timeout }
);
} catch {
// Might be using passwordless auth or different flow
console.log('Password field not found - might be using different auth flow');
return false;
}
// Fill password
const passwordInput = await page.$('input[type="password"], input[name="passwd"]');
if (!passwordInput) {
console.error('Could not find password input');
return false;
}
await passwordInput.fill(credentials.password);
console.log('Filled password field');
// Submit
const signInButton = await page.$('input[type="submit"], button[type="submit"]');
if (signInButton) {
await signInButton.click();
} else {
await passwordInput.press('Enter');
}
// Handle "Stay signed in?" prompt
try {
const staySignedInButton = await page.waitForSelector(
'input[value="Yes"], button:has-text("Yes")',
{ timeout: 5000 }
);
if (staySignedInButton) {
await staySignedInButton.click();
console.log('Clicked "Stay signed in" button');
}
} catch {
// Prompt might not appear
}
// Check for Conditional Access Policy error
const caError = await page.$('text=Conditional Access policy');
if (caError) {
console.error('Blocked by Conditional Access Policy');
// Take screenshot for debugging
await page.screenshot({ path: 'ca-policy-error.png' });
console.log('Screenshot saved: ca-policy-error.png');
return false;
}
// Wait for redirect away from Microsoft login
try {
await page.waitForURL(
(url) => !url.href.includes('login.microsoftonline.com'),
{ timeout }
);
return true;
} catch {
return false;
}
}
// Check if user is already authenticated
async function isAuthenticated(page: Page, targetUrl: string): Promise<boolean> {
const currentUrl = page.url();
// If we're on the target URL (not a login page), we're likely authenticated
if (currentUrl.startsWith(targetUrl)) {
// Check for common login page indicators
const isLoginPage = await page.evaluate(() => {
const loginIndicators = [
'input[type="password"]',
'form[action*="login"]',
'form[action*="signin"]',
'.login-form',
'#login',
];
return loginIndicators.some((sel) => document.querySelector(sel) !== null);
});
return !isLoginPage;
}
return false;
}
// Main authentication function
export async function authenticate(options: AuthOptions): Promise<AuthResult> {
const browser = await launchBrowser({ headless: options.headless ?? true });
const page = await browser.newPage();
const timeout = options.timeout ?? 30000;
try {
// Navigate to URL
console.log(`Navigating to: ${options.url}`);
await page.goto(options.url, { timeout: 60000, waitUntil: 'domcontentloaded' });
// Check if already authenticated
if (await isAuthenticated(page, options.url)) {
return {
success: true,
finalUrl: page.url(),
authType: 'auto',
message: 'Already authenticated (session persisted from profile)',
};
}
// Get credentials
const credentials = options.credentials
? options.credentials
: getCredentials();
if (!credentials) {
// No credentials - open interactive browser
console.log('\nNo credentials provided. Opening browser for manual login...');
console.log('Please complete the login process manually.');
console.log('The session will be saved to your profile.');
// Switch to headed mode for manual login
await browser.close();
const interactiveBrowser = await launchBrowser({ headless: false });
const interactivePage = await interactiveBrowser.newPage();
await interactivePage.goto(options.url);
await promptUser('\nPress Enter when you have completed login...');
const finalUrl = interactivePage.url();
await interactiveBrowser.close();
return {
success: true,
finalUrl,
authType: 'auto',
message: 'Manual login completed - session saved to profile',
};
}
// Detect auth type if auto
let authType = options.authType;
if (authType === 'auto') {
authType = await detectAuthType(page);
console.log(`Detected auth type: ${authType}`);
}
// Handle authentication based on type
let success = false;
switch (authType) {
case 'msal':
success = await handleMsalLogin(page, credentials, timeout);
break;
case 'form':
default:
success = await handleFormLogin(page, credentials, timeout);
break;
}
const finalUrl = page.url();
return {
success,
finalUrl,
authType,
message: success
? `Authentication successful - session saved to profile`
: 'Authentication failed',
};
} finally {
await browser.close();
}
}
// Navigate to authenticated page (handles auth if needed)
export async function navigateAuthenticated(
url: string,
options?: {
credentials?: { username: string; password: string };
headless?: boolean;
}
): Promise<{ page: Page; browser: BrowserContext }> {
const { page, browser } = await getPage({ headless: options?.headless ?? true });
await page.goto(url, { timeout: 60000, waitUntil: 'domcontentloaded' });
// Check if we need to authenticate
if (!(await isAuthenticated(page, url))) {
console.log('Session expired or not authenticated. Attempting login...');
// Get credentials
const credentials = options?.credentials ?? getCredentials();
if (!credentials) {
throw new Error(
'Authentication required but no credentials provided. ' +
'Set CLOAKBROWSER_USERNAME and CLOAKBROWSER_PASSWORD environment variables.'
);
}
// Detect and handle auth
const authType = await detectAuthType(page);
let success = false;
if (authType === 'msal') {
success = await handleMsalLogin(page, credentials, 30000);
} else {
success = await handleFormLogin(page, credentials, 30000);
}
if (!success) {
await browser.close();
throw new Error('Authentication failed');
}
// Navigate back to original URL if we were redirected
if (!page.url().startsWith(url)) {
await page.goto(url, { timeout: 60000, waitUntil: 'domcontentloaded' });
}
}
return { page, browser };
}
// CLI entry point
async function main() {
const args = parseArgs(process.argv.slice(2), {
string: ['url', 'type', 'username', 'password'],
boolean: ['headless', 'help'],
default: {
type: 'auto',
headless: false, // Default to headed for auth so user can see/interact
},
alias: {
u: 'url',
t: 'type',
h: 'help',
},
});
if (args.help || !args.url) {
console.log(`
Web Authentication Handler
Usage:
npx tsx auth.ts --url <url> [options]
Options:
-u, --url <url> URL to authenticate (required)
-t, --type <type> Auth type: auto, form, or msal (default: auto)
--username <user> Username/email (or set CLOAKBROWSER_USERNAME env var)
--password <pass> Password (or set CLOAKBROWSER_PASSWORD env var)
--headless <bool> Run in headless mode (default: false for auth)
-h, --help Show this help message
Auth Types:
auto Auto-detect authentication type
form Generic username/password form
msal Microsoft SSO (login.microsoftonline.com)
Environment Variables:
CLOAKBROWSER_USERNAME Default username/email for authentication
CLOAKBROWSER_PASSWORD Default password for authentication
Examples:
# Interactive login (no credentials, opens browser)
npx tsx auth.ts --url "https://example.com/login"
# Form login with credentials
npx tsx auth.ts --url "https://example.com/login" --type form \\
--username "user@example.com" --password "secret"
# Microsoft SSO login
CLOAKBROWSER_USERNAME=user@company.com CLOAKBROWSER_PASSWORD=secret \\
npx tsx auth.ts --url "https://internal.company.com" --type msal
Notes:
- Session is saved to ~/.cloakbrowser-profile/ for persistence
- After successful auth, subsequent browses will be authenticated
- Use --headless false if you need to handle MFA manually
`);
process.exit(args.help ? 0 : 1);
}
const authType = args.type as AuthType;
if (!['auto', 'form', 'msal'].includes(authType)) {
console.error(`Invalid auth type: ${authType}. Must be auto, form, or msal.`);
process.exit(1);
}
try {
const result = await authenticate({
url: args.url,
authType,
credentials:
args.username && args.password
? { username: args.username, password: args.password }
: undefined,
headless: args.headless,
});
console.log(`\nAuthentication result:`);
console.log(` Success: ${result.success}`);
console.log(` Auth type: ${result.authType}`);
console.log(` Final URL: ${result.finalUrl}`);
console.log(` Message: ${result.message}`);
process.exit(result.success ? 0 : 1);
} catch (error) {
console.error('Error:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
// Run if executed directly
const isMainModule = process.argv[1]?.includes('auth.ts');
if (isMainModule) {
main();
}
@@ -0,0 +1,188 @@
#!/usr/bin/env npx tsx
/**
* Browser launcher using CloakBrowser with persistent profile
*
* Usage:
* npx tsx browse.ts --url "https://example.com"
* npx tsx browse.ts --url "https://example.com" --screenshot --output page.png
* npx tsx browse.ts --url "https://example.com" --headless false --wait 5000
*/
import { launchPersistentContext } from 'cloakbrowser';
import { homedir } from 'os';
import { join } from 'path';
import { existsSync, mkdirSync } from 'fs';
import parseArgs from 'minimist';
import type { Page, BrowserContext } from 'playwright-core';
interface BrowseOptions {
url: string;
headless?: boolean;
screenshot?: boolean;
output?: string;
wait?: number;
timeout?: number;
interactive?: boolean;
}
interface BrowseResult {
title: string;
url: string;
screenshotPath?: string;
}
function sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}
const getProfilePath = (): string => {
const customPath = process.env.CLOAKBROWSER_PROFILE_PATH;
if (customPath) return customPath;
const profileDir = join(homedir(), '.cloakbrowser-profile');
if (!existsSync(profileDir)) {
mkdirSync(profileDir, { recursive: true });
}
return profileDir;
};
export async function launchBrowser(options: {
headless?: boolean;
}): Promise<BrowserContext> {
const profilePath = getProfilePath();
const envHeadless = process.env.CLOAKBROWSER_HEADLESS;
const headless = options.headless ?? (envHeadless ? envHeadless === 'true' : true);
console.log(`Using profile: ${profilePath}`);
console.log(`Headless mode: ${headless}`);
const context = await launchPersistentContext({
userDataDir: profilePath,
headless,
humanize: true,
});
return context;
}
export async function browse(options: BrowseOptions): Promise<BrowseResult> {
const browser = await launchBrowser({ headless: options.headless });
const page = browser.pages()[0] || await browser.newPage();
try {
console.log(`Navigating to: ${options.url}`);
await page.goto(options.url, {
timeout: options.timeout ?? 60000,
waitUntil: 'domcontentloaded',
});
if (options.wait) {
console.log(`Waiting ${options.wait}ms...`);
await sleep(options.wait);
}
const result: BrowseResult = {
title: await page.title(),
url: page.url(),
};
console.log(`Page title: ${result.title}`);
console.log(`Final URL: ${result.url}`);
if (options.screenshot) {
const outputPath = options.output ?? 'screenshot.png';
await page.screenshot({ path: outputPath, fullPage: true });
result.screenshotPath = outputPath;
console.log(`Screenshot saved: ${outputPath}`);
}
if (options.interactive) {
console.log('\nInteractive mode - browser will stay open.');
console.log('Press Ctrl+C to close.');
await new Promise(() => {});
}
return result;
} finally {
if (!options.interactive) {
await browser.close();
}
}
}
export async function getPage(options?: {
headless?: boolean;
}): Promise<{ page: Page; browser: BrowserContext }> {
const browser = await launchBrowser({ headless: options?.headless });
const page = browser.pages()[0] || await browser.newPage();
return { page, browser };
}
async function main() {
const args = parseArgs(process.argv.slice(2), {
string: ['url', 'output'],
boolean: ['screenshot', 'headless', 'interactive', 'help'],
default: {
headless: true,
screenshot: false,
interactive: false,
},
alias: {
u: 'url',
o: 'output',
s: 'screenshot',
h: 'help',
i: 'interactive',
},
});
if (args.help || !args.url) {
console.log(`
Web Browser with CloakBrowser
Usage:
npx tsx browse.ts --url <url> [options]
Options:
-u, --url <url> URL to navigate to (required)
-s, --screenshot Take a screenshot of the page
-o, --output <path> Output path for screenshot (default: screenshot.png)
--headless <bool> Run in headless mode (default: true)
--wait <ms> Wait time after page load in milliseconds
--timeout <ms> Navigation timeout (default: 60000)
-i, --interactive Keep browser open for manual interaction
-h, --help Show this help message
Examples:
npx tsx browse.ts --url "https://example.com"
npx tsx browse.ts --url "https://example.com" --screenshot --output page.png
npx tsx browse.ts --url "https://example.com" --headless false --interactive
Environment Variables:
CLOAKBROWSER_PROFILE_PATH Custom profile directory (default: ~/.cloakbrowser-profile/)
CLOAKBROWSER_HEADLESS Default headless mode (true/false)
`);
process.exit(args.help ? 0 : 1);
}
try {
await browse({
url: args.url,
headless: args.headless,
screenshot: args.screenshot,
output: args.output,
wait: args.wait ? parseInt(args.wait, 10) : undefined,
timeout: args.timeout ? parseInt(args.timeout, 10) : undefined,
interactive: args.interactive,
});
} catch (error) {
console.error('Error:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
const isMainModule = process.argv[1]?.includes('browse.ts');
if (isMainModule) {
main();
}
@@ -0,0 +1,40 @@
#!/usr/bin/env node
import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
function fail(message, details) {
const payload = { error: message };
if (details) payload.details = details;
process.stderr.write(`${JSON.stringify(payload)}\n`);
process.exit(1);
}
async function main() {
try {
await import("cloakbrowser");
await import("playwright-core");
} catch (error) {
fail(
"Missing dependency/config: web-automation requires cloakbrowser and playwright-core.",
error instanceof Error ? error.message : String(error)
);
}
const browsePath = path.join(__dirname, "browse.ts");
const browseSource = fs.readFileSync(browsePath, "utf8");
if (!/launchPersistentContext/.test(browseSource) || !/from ['"]cloakbrowser['"]/.test(browseSource)) {
fail("browse.ts is not configured for CloakBrowser.");
}
process.stdout.write("OK: cloakbrowser + playwright-core installed\n");
process.stdout.write("OK: CloakBrowser integration detected in browse.ts\n");
}
main().catch((error) => {
fail("Install check failed.", error instanceof Error ? error.message : String(error));
});
+188
View File
@@ -0,0 +1,188 @@
#!/usr/bin/env node
import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
const DEFAULT_WAIT_MS = 5000;
const MAX_WAIT_MS = 20000;
const NAV_TIMEOUT_MS = 30000;
const EXTRA_CHALLENGE_WAIT_MS = 8000;
const CONTENT_LIMIT = 12000;
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
function fail(message, details) {
const payload = { error: message };
if (details) payload.details = details;
process.stderr.write(`${JSON.stringify(payload)}\n`);
process.exit(1);
}
function parseWaitTime(raw) {
const value = Number.parseInt(raw || `${DEFAULT_WAIT_MS}`, 10);
if (!Number.isFinite(value) || value < 0) return DEFAULT_WAIT_MS;
return Math.min(value, MAX_WAIT_MS);
}
function parseTarget(rawUrl) {
if (!rawUrl) {
fail("Missing URL. Usage: node extract.js <URL>");
}
let parsed;
try {
parsed = new URL(rawUrl);
} catch (error) {
fail("Invalid URL.", error.message);
}
if (!["http:", "https:"].includes(parsed.protocol)) {
fail("Only http and https URLs are allowed.");
}
return parsed.toString();
}
function ensureParentDir(filePath) {
if (!filePath) return;
fs.mkdirSync(path.dirname(filePath), { recursive: true });
}
function sleep(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
async function detectChallenge(page) {
try {
return await page.evaluate(() => {
const text = (document.body?.innerText || "").toLowerCase();
return (
text.includes("checking your browser") ||
text.includes("just a moment") ||
text.includes("verify you are human") ||
text.includes("press and hold") ||
document.querySelector('iframe[src*="challenge"]') !== null ||
document.querySelector('iframe[src*="cloudflare"]') !== null
);
});
} catch {
return false;
}
}
async function loadCloakBrowser() {
try {
return await import("cloakbrowser");
} catch (error) {
fail(
"CloakBrowser is not installed for this skill. Run pnpm install in this skill's scripts directory first.",
error.message
);
}
}
async function runWithStderrLogs(fn) {
const originalLog = console.log;
const originalError = console.error;
console.log = (...args) => process.stderr.write(`${args.join(" ")}\n`);
console.error = (...args) => process.stderr.write(`${args.join(" ")}\n`);
try {
return await fn();
} finally {
console.log = originalLog;
console.error = originalError;
}
}
async function main() {
const requestedUrl = parseTarget(process.argv[2]);
const waitTime = parseWaitTime(process.env.WAIT_TIME);
const screenshotPath = process.env.SCREENSHOT_PATH || "";
const saveHtml = process.env.SAVE_HTML === "true";
const headless = process.env.HEADLESS !== "false";
const userAgent = process.env.USER_AGENT || undefined;
const startedAt = Date.now();
const { ensureBinary, launchContext } = await loadCloakBrowser();
let context;
try {
await runWithStderrLogs(() => ensureBinary());
context = await runWithStderrLogs(() => launchContext({
headless,
userAgent,
locale: "en-US",
viewport: { width: 1440, height: 900 },
humanize: true,
}));
const page = await context.newPage();
const response = await page.goto(requestedUrl, {
waitUntil: "domcontentloaded",
timeout: NAV_TIMEOUT_MS
});
await sleep(waitTime);
let challengeDetected = await detectChallenge(page);
if (challengeDetected) {
await sleep(EXTRA_CHALLENGE_WAIT_MS);
challengeDetected = await detectChallenge(page);
}
const extracted = await page.evaluate((contentLimit) => {
const bodyText = document.body?.innerText || "";
return {
finalUrl: window.location.href,
title: document.title || "",
content: bodyText.slice(0, contentLimit),
metaDescription:
document.querySelector('meta[name="description"]')?.content ||
document.querySelector('meta[property="og:description"]')?.content ||
""
};
}, CONTENT_LIMIT);
const result = {
requestedUrl,
finalUrl: extracted.finalUrl,
title: extracted.title,
content: extracted.content,
metaDescription: extracted.metaDescription,
status: response ? response.status() : null,
challengeDetected,
elapsedSeconds: ((Date.now() - startedAt) / 1000).toFixed(2)
};
if (screenshotPath) {
ensureParentDir(screenshotPath);
await page.screenshot({ path: screenshotPath, fullPage: false, timeout: 10000 });
result.screenshot = screenshotPath;
}
if (saveHtml) {
const htmlTarget = screenshotPath
? screenshotPath.replace(/\.[^.]+$/, ".html")
: path.resolve(__dirname, `page-${Date.now()}.html`);
ensureParentDir(htmlTarget);
fs.writeFileSync(htmlTarget, await page.content());
result.htmlFile = htmlTarget;
}
process.stdout.write(`${JSON.stringify(result, null, 2)}\n`);
await context.close();
} catch (error) {
if (context) {
try {
await context.close();
} catch {
// Ignore close errors after the primary failure.
}
}
fail("Scrape failed.", error.message);
}
}
main();
@@ -0,0 +1,329 @@
#!/usr/bin/env npx tsx
import parseArgs from 'minimist';
import type { Page } from 'playwright-core';
import { launchBrowser } from './browse';
type Step =
| { action: 'goto'; url: string }
| { action: 'click'; selector?: string; text?: string; role?: string; name?: string }
| { action: 'type'; selector?: string; text: string }
| { action: 'press'; key: string; selector?: string }
| { action: 'wait'; ms: number }
| { action: 'screenshot'; path: string }
| { action: 'extract'; selector: string; count?: number };
function normalizeNavigationUrl(rawUrl: string): string {
let parsed: URL;
try {
parsed = new URL(rawUrl);
} catch {
throw new Error(`Invalid navigation URL: ${rawUrl}`);
}
if (!['http:', 'https:'].includes(parsed.protocol)) {
throw new Error(`Only http and https URLs are allowed in flow steps: ${rawUrl}`);
}
return parsed.toString();
}
function normalizeKey(k: string): string {
if (!k) return 'Enter';
const lower = k.toLowerCase();
if (lower === 'enter' || lower === 'return') return 'Enter';
if (lower === 'tab') return 'Tab';
if (lower === 'escape' || lower === 'esc') return 'Escape';
return k;
}
function splitInstructions(instruction: string): string[] {
return instruction
.split(/\bthen\b|;/gi)
.map((s) => s.trim())
.filter(Boolean);
}
function parseInstruction(instruction: string): Step[] {
const parts = splitInstructions(instruction);
const steps: Step[] = [];
for (const p of parts) {
// go to https://...
const goto = p.match(/^(?:go to|open|navigate to)\s+(https?:\/\/\S+)/i);
if (goto) {
steps.push({ action: 'goto', url: normalizeNavigationUrl(goto[1]) });
continue;
}
// click on "text" or click #selector or click button "name"
const clickRole = p.match(/^click\s+(button|link|textbox|img|image|tab)\s+"([^"]+)"$/i);
if (clickRole) {
const role = clickRole[1].toLowerCase() === 'image' ? 'img' : clickRole[1].toLowerCase();
steps.push({ action: 'click', role, name: clickRole[2] });
continue;
}
const clickText = p.match(/^click(?: on)?\s+"([^"]+)"/i);
if (clickText) {
steps.push({ action: 'click', text: clickText[1] });
continue;
}
const clickSelector = p.match(/^click(?: on)?\s+(#[\w-]+|\.[\w-]+|[a-z]+\[[^\]]+\])/i);
if (clickSelector) {
steps.push({ action: 'click', selector: clickSelector[1] });
continue;
}
// type "text" [in selector]
const typeInto = p.match(/^type\s+"([^"]+)"\s+in\s+(.+)$/i);
if (typeInto) {
steps.push({ action: 'type', text: typeInto[1], selector: typeInto[2].trim() });
continue;
}
const typeOnly = p.match(/^type\s+"([^"]+)"$/i);
if (typeOnly) {
steps.push({ action: 'type', text: typeOnly[1] });
continue;
}
// press enter [in selector]
const pressIn = p.match(/^press\s+(\w+)\s+in\s+(.+)$/i);
if (pressIn) {
steps.push({ action: 'press', key: normalizeKey(pressIn[1]), selector: pressIn[2].trim() });
continue;
}
const pressOnly = p.match(/^press\s+(\w+)$/i);
if (pressOnly) {
steps.push({ action: 'press', key: normalizeKey(pressOnly[1]) });
continue;
}
// wait 2s / wait 500ms
const waitS = p.match(/^wait\s+(\d+)\s*s(?:ec(?:onds?)?)?$/i);
if (waitS) {
steps.push({ action: 'wait', ms: parseInt(waitS[1], 10) * 1000 });
continue;
}
const waitMs = p.match(/^wait\s+(\d+)\s*ms$/i);
if (waitMs) {
steps.push({ action: 'wait', ms: parseInt(waitMs[1], 10) });
continue;
}
// screenshot path
const shot = p.match(/^screenshot(?: to)?\s+(.+)$/i);
if (shot) {
steps.push({ action: 'screenshot', path: shot[1].trim() });
continue;
}
throw new Error(`Could not parse step: "${p}"`);
}
return steps;
}
function validateSteps(steps: Step[]): Step[] {
return steps.map((step) =>
step.action === 'goto'
? {
...step,
url: normalizeNavigationUrl(step.url),
}
: step
);
}
function escapeRegExp(value: string): string {
return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
function isLikelyLoginText(text: string): boolean {
return /(login|accedi|sign\s*in|entra)/i.test(text);
}
async function clickByText(page: Page, text: string): Promise<boolean> {
const patterns = [new RegExp(`^${escapeRegExp(text)}$`, 'i'), new RegExp(escapeRegExp(text), 'i')];
for (const pattern of patterns) {
const targets = [
page.getByRole('button', { name: pattern }).first(),
page.getByRole('link', { name: pattern }).first(),
page.getByText(pattern).first(),
];
for (const target of targets) {
if (await target.count()) {
try {
await target.click({ timeout: 8000 });
return true;
} catch {
// keep trying next candidate
}
}
}
}
return false;
}
async function fallbackLoginNavigation(page: Page, requestedText: string): Promise<boolean> {
if (!isLikelyLoginText(requestedText)) return false;
const current = new URL(page.url());
const candidateLinks = await page.evaluate(() => {
const loginTerms = ['login', 'accedi', 'sign in', 'entra'];
const anchors = Array.from(document.querySelectorAll('a[href], a[onclick], button[onclick]')) as Array<HTMLAnchorElement | HTMLButtonElement>;
return anchors
.map((el) => {
const text = (el.textContent || '').trim().toLowerCase();
const href = (el as HTMLAnchorElement).getAttribute('href') || '';
return { text, href };
})
.filter((x) => x.text && loginTerms.some((t) => x.text.includes(t)))
.map((x) => x.href)
.filter(Boolean);
});
// Prefer real URLs (not javascript:)
const realCandidate = candidateLinks.find((h) => /login|account\/login/i.test(h) && !h.startsWith('javascript:'));
if (realCandidate) {
const target = new URL(realCandidate, page.url()).toString();
await page.goto(target, { waitUntil: 'domcontentloaded', timeout: 60000 });
return true;
}
// Site-specific fallback for Corriere
if (/corriere\.it$/i.test(current.hostname) || /\.corriere\.it$/i.test(current.hostname)) {
await page.goto('https://www.corriere.it/account/login', {
waitUntil: 'domcontentloaded',
timeout: 60000,
});
return true;
}
return false;
}
async function typeInBestTarget(page: Page, text: string, selector?: string) {
if (selector) {
await page.locator(selector).first().click({ timeout: 10000 });
await page.locator(selector).first().fill(text);
return;
}
const loc = page.locator('input[name="q"], input[type="search"], input[type="text"], textarea').first();
await loc.click({ timeout: 10000 });
await loc.fill(text);
}
async function pressOnTarget(page: Page, key: string, selector?: string) {
if (selector) {
await page.locator(selector).first().press(key);
return;
}
await page.keyboard.press(key);
}
async function runSteps(page: Page, steps: Step[]) {
for (const step of steps) {
switch (step.action) {
case 'goto':
await page.goto(normalizeNavigationUrl(step.url), {
waitUntil: 'domcontentloaded',
timeout: 60000,
});
break;
case 'click':
if (step.selector) {
await page.locator(step.selector).first().click({ timeout: 15000 });
} else if (step.role && step.name) {
await page.getByRole(step.role as any, { name: new RegExp(escapeRegExp(step.name), 'i') }).first().click({ timeout: 15000 });
} else if (step.text) {
const clicked = await clickByText(page, step.text);
if (!clicked) {
const recovered = await fallbackLoginNavigation(page, step.text);
if (!recovered) {
throw new Error(`Could not click target text: ${step.text}`);
}
}
} else {
throw new Error('click step missing selector/text/role');
}
try {
await page.waitForLoadState('domcontentloaded', { timeout: 10000 });
} catch {
// no navigation is fine
}
break;
case 'type':
await typeInBestTarget(page, step.text, step.selector);
break;
case 'press':
await pressOnTarget(page, step.key, step.selector);
break;
case 'wait':
await page.waitForTimeout(step.ms);
break;
case 'screenshot':
await page.screenshot({ path: step.path, fullPage: true });
break;
case 'extract': {
const items = await page.locator(step.selector).allTextContents();
const out = items.slice(0, step.count ?? items.length).map((t) => t.trim()).filter(Boolean);
console.log(JSON.stringify(out, null, 2));
break;
}
default:
throw new Error('Unknown step');
}
}
}
async function main() {
const args = parseArgs(process.argv.slice(2), {
string: ['instruction', 'steps'],
boolean: ['headless', 'help'],
default: { headless: true },
alias: { i: 'instruction', s: 'steps', h: 'help' },
});
if (args.help || (!args.instruction && !args.steps)) {
console.log(`
General Web Flow Runner (CloakBrowser)
Usage:
npx tsx flow.ts --instruction "go to https://example.com then type \"hello\" then press enter"
npx tsx flow.ts --steps '[{"action":"goto","url":"https://example.com"}]'
Supported natural steps:
- go to/open/navigate to <url>
- click on "Text"
- click <css-selector>
- type "text"
- type "text" in <css-selector>
- press <key>
- press <key> in <css-selector>
- wait <N>s | wait <N>ms
- screenshot <path>
`);
process.exit(args.help ? 0 : 1);
}
const steps = validateSteps(args.steps ? JSON.parse(args.steps) : parseInstruction(args.instruction));
const browser = await launchBrowser({ headless: args.headless });
const page = await browser.newPage();
try {
await runSteps(page, steps);
console.log('Flow complete. Final URL:', page.url());
} finally {
await browser.close();
}
}
main().catch((e) => {
console.error('Error:', e instanceof Error ? e.message : e);
process.exit(1);
});
@@ -0,0 +1,36 @@
{
"name": "web-automation-scripts",
"version": "1.0.0",
"description": "Web browsing and scraping scripts using CloakBrowser",
"type": "module",
"scripts": {
"check-install": "node check-install.js",
"extract": "node extract.js",
"browse": "tsx browse.ts",
"auth": "tsx auth.ts",
"flow": "tsx flow.ts",
"scrape": "tsx scrape.ts",
"typecheck": "tsc --noEmit -p tsconfig.json",
"lint": "pnpm run typecheck && node --check check-install.js && node --check extract.js",
"fetch-browser": "npx cloakbrowser install"
},
"dependencies": {
"@mozilla/readability": "^0.5.0",
"better-sqlite3": "^12.6.2",
"cloakbrowser": "^0.3.22",
"jsdom": "^24.0.0",
"minimist": "^1.2.8",
"playwright-core": "^1.59.1",
"turndown": "^7.1.2",
"turndown-plugin-gfm": "^1.0.2"
},
"devDependencies": {
"@types/jsdom": "^21.1.6",
"@types/minimist": "^1.2.5",
"@types/turndown": "^5.0.4",
"esbuild": "0.27.0",
"tsx": "^4.7.0",
"typescript": "^5.3.0"
},
"packageManager": "pnpm@10.18.1+sha512.77a884a165cbba2d8d1c19e3b4880eee6d2fcabd0d879121e282196b80042351d5eb3ca0935fa599da1dc51265cc68816ad2bddd2a2de5ea9fdf92adbec7cd34"
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,174 @@
#!/usr/bin/env npx tsx
import { mkdirSync, writeFileSync } from 'fs';
import { dirname, resolve } from 'path';
import { getPage } from './browse.js';
type NavResult = {
requestedUrl: string;
url: string;
status: number | null;
title: string;
error?: string;
};
type RouteCheck = {
route: string;
result: NavResult;
heading: string | null;
};
const DEFAULT_BASE_URL = 'http://localhost:3000';
const DEFAULT_REPORT_PATH = resolve(process.cwd(), 'scan-local-app.md');
function env(name: string): string | undefined {
const value = process.env[name]?.trim();
return value ? value : undefined;
}
function getRoutes(baseUrl: string): string[] {
const routeList = env('SCAN_ROUTES');
if (routeList) {
return routeList
.split(',')
.map((route) => route.trim())
.filter(Boolean)
.map((route) => new URL(route, baseUrl).toString());
}
return [baseUrl];
}
async function gotoWithStatus(page: any, url: string): Promise<NavResult> {
const response = await page
.goto(url, { waitUntil: 'domcontentloaded', timeout: 60000 })
.catch((error: unknown) => ({ error }));
if (response?.error) {
return {
requestedUrl: url,
url: page.url(),
status: null,
title: await page.title().catch(() => ''),
error: String(response.error),
};
}
return {
requestedUrl: url,
url: page.url(),
status: response ? response.status() : null,
title: await page.title().catch(() => ''),
};
}
async function textOrNull(page: any, selector: string): Promise<string | null> {
const locator = page.locator(selector).first();
try {
if ((await locator.count()) === 0) return null;
const value = await locator.textContent();
return value ? value.trim().replace(/\s+/g, ' ') : null;
} catch {
return null;
}
}
async function loginIfConfigured(page: any, baseUrl: string, lines: string[]) {
const loginPath = env('SCAN_LOGIN_PATH');
const username = env('SCAN_USERNAME') ?? env('CLOAKBROWSER_USERNAME');
const password = env('SCAN_PASSWORD') ?? env('CLOAKBROWSER_PASSWORD');
const usernameSelector = env('SCAN_USERNAME_SELECTOR') ?? 'input[type="email"], input[name="email"]';
const passwordSelector = env('SCAN_PASSWORD_SELECTOR') ?? 'input[type="password"], input[name="password"]';
const submitSelector = env('SCAN_SUBMIT_SELECTOR') ?? 'button[type="submit"], input[type="submit"]';
if (!loginPath) {
lines.push('## Login');
lines.push('- Skipped: set `SCAN_LOGIN_PATH` to enable login smoke checks.');
lines.push('');
return;
}
const loginUrl = new URL(loginPath, baseUrl).toString();
lines.push('## Login');
lines.push(`- Login URL: ${loginUrl}`);
await gotoWithStatus(page, loginUrl);
if (!username || !password) {
lines.push('- Skipped: set `SCAN_USERNAME`/`SCAN_PASSWORD` or `CLOAKBROWSER_USERNAME`/`CLOAKBROWSER_PASSWORD`.');
lines.push('');
return;
}
await page.locator(usernameSelector).first().fill(username);
await page.locator(passwordSelector).first().fill(password);
await page.locator(submitSelector).first().click();
await page.waitForTimeout(2500);
lines.push(`- After submit URL: ${page.url()}`);
lines.push(`- Cookie count: ${(await page.context().cookies()).length}`);
lines.push('');
}
async function checkRoutes(page: any, baseUrl: string, lines: string[]) {
const routes = getRoutes(baseUrl);
const routeChecks: RouteCheck[] = [];
for (const url of routes) {
const result = await gotoWithStatus(page, url);
const heading = await textOrNull(page, 'h1');
routeChecks.push({
route: url,
result,
heading,
});
}
lines.push('## Route Checks');
for (const check of routeChecks) {
const relativeUrl = check.route.startsWith(baseUrl) ? check.route.slice(baseUrl.length) || '/' : check.route;
const finalPath = check.result.url.startsWith(baseUrl)
? check.result.url.slice(baseUrl.length) || '/'
: check.result.url;
const suffix = check.heading ? `, h1="${check.heading}"` : '';
const errorSuffix = check.result.error ? `, error="${check.result.error}"` : '';
lines.push(
`- ${relativeUrl} → status ${check.result.status ?? 'ERR'} (final ${finalPath})${suffix}${errorSuffix}`
);
}
lines.push('');
}
async function main() {
const baseUrl = env('SCAN_BASE_URL') ?? DEFAULT_BASE_URL;
const reportPath = resolve(env('SCAN_REPORT_PATH') ?? DEFAULT_REPORT_PATH);
const headless = (env('SCAN_HEADLESS') ?? env('CLOAKBROWSER_HEADLESS') ?? 'true') === 'true';
const { page, browser } = await getPage({ headless });
const lines: string[] = [];
lines.push('# Web Automation Scan (local)');
lines.push('');
lines.push(`- Base URL: ${baseUrl}`);
lines.push(`- Timestamp: ${new Date().toISOString()}`);
lines.push(`- Headless: ${headless}`);
lines.push(`- Report Path: ${reportPath}`);
lines.push('');
try {
await loginIfConfigured(page, baseUrl, lines);
await checkRoutes(page, baseUrl, lines);
lines.push('## Notes');
lines.push('- This generic smoke helper records route availability and top-level headings for a local app.');
lines.push('- Configure login and route coverage with `SCAN_*` environment variables.');
} finally {
await browser.close();
}
mkdirSync(dirname(reportPath), { recursive: true });
writeFileSync(reportPath, `${lines.join('\n')}\n`, 'utf-8');
console.log(`Report written to ${reportPath}`);
}
main().catch((error) => {
console.error(error);
process.exitCode = 1;
});
@@ -0,0 +1,351 @@
#!/usr/bin/env npx tsx
/**
* Web scraper that extracts content to markdown
*
* Usage:
* npx tsx scrape.ts --url "https://example.com" --mode main
* npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
* npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
*/
import TurndownService from 'turndown';
import * as turndownPluginGfm from 'turndown-plugin-gfm';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { writeFileSync } from 'fs';
import parseArgs from 'minimist';
import { getPage } from './browse.js';
// Types
type ScrapeMode = 'main' | 'full' | 'selector';
interface ScrapeOptions {
url: string;
mode: ScrapeMode;
selector?: string;
output?: string;
includeLinks?: boolean;
includeTables?: boolean;
includeImages?: boolean;
headless?: boolean;
wait?: number;
}
interface ScrapeResult {
title: string;
url: string;
markdown: string;
byline?: string;
excerpt?: string;
}
// Configure Turndown for markdown conversion
function createTurndownService(options: {
includeLinks?: boolean;
includeTables?: boolean;
includeImages?: boolean;
}): TurndownService {
const turndown = new TurndownService({
headingStyle: 'atx',
hr: '---',
bulletListMarker: '-',
codeBlockStyle: 'fenced',
fence: '```',
emDelimiter: '*',
strongDelimiter: '**',
linkStyle: 'inlined',
});
// Add GFM support (tables, strikethrough, task lists)
turndown.use(turndownPluginGfm.gfm);
// Custom rule for code blocks with language detection
turndown.addRule('codeBlockWithLanguage', {
filter: (node) => {
return (
node.nodeName === 'PRE' &&
node.firstChild?.nodeName === 'CODE'
);
},
replacement: (_content, node) => {
const codeNode = node.firstChild as HTMLElement;
const className = codeNode.getAttribute('class') || '';
const langMatch = className.match(/language-(\w+)/);
const lang = langMatch ? langMatch[1] : '';
const code = codeNode.textContent || '';
return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
},
});
// Remove images if not included
if (!options.includeImages) {
turndown.addRule('removeImages', {
filter: 'img',
replacement: () => '',
});
}
// Remove links but keep text if not included
if (!options.includeLinks) {
turndown.addRule('removeLinks', {
filter: 'a',
replacement: (content) => content,
});
}
// Remove script, style, nav, footer, aside elements
turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
return turndown;
}
// Extract main content using Readability
function extractMainContent(html: string, url: string): {
content: string;
title: string;
byline?: string;
excerpt?: string;
} {
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
throw new Error('Could not extract main content from page');
}
return {
content: article.content,
title: article.title,
byline: article.byline || undefined,
excerpt: article.excerpt || undefined,
};
}
// Scrape a URL and return markdown
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
const { page, browser } = await getPage({ headless: options.headless ?? true });
try {
// Navigate to URL
console.log(`Navigating to: ${options.url}`);
await page.goto(options.url, {
timeout: 60000,
waitUntil: 'domcontentloaded',
});
// Wait if specified
if (options.wait) {
console.log(`Waiting ${options.wait}ms for dynamic content...`);
await page.waitForTimeout(options.wait);
}
const pageTitle = await page.title();
const pageUrl = page.url();
let html: string;
let title = pageTitle;
let byline: string | undefined;
let excerpt: string | undefined;
// Get HTML based on mode
switch (options.mode) {
case 'main': {
// Get full page HTML and extract with Readability
const fullHtml = await page.content();
const extracted = extractMainContent(fullHtml, pageUrl);
html = extracted.content;
title = extracted.title || pageTitle;
byline = extracted.byline;
excerpt = extracted.excerpt;
break;
}
case 'selector': {
if (!options.selector) {
throw new Error('Selector mode requires --selector option');
}
const element = await page.$(options.selector);
if (!element) {
throw new Error(`Selector not found: ${options.selector}`);
}
html = await element.innerHTML();
break;
}
case 'full':
default: {
// Get body content, excluding common non-content elements
html = await page.evaluate(() => {
// Remove common non-content elements
const selectorsToRemove = [
'script', 'style', 'noscript', 'iframe',
'nav', 'header', 'footer', '.cookie-banner',
'.advertisement', '.ads', '#ads', '.social-share',
'.comments', '#comments', '.sidebar'
];
selectorsToRemove.forEach(selector => {
document.querySelectorAll(selector).forEach(el => el.remove());
});
return document.body.innerHTML;
});
break;
}
}
// Convert to markdown
const turndown = createTurndownService({
includeLinks: options.includeLinks ?? true,
includeTables: options.includeTables ?? true,
includeImages: options.includeImages ?? false,
});
let markdown = turndown.turndown(html);
// Add title as H1 if not already present
if (!markdown.startsWith('# ')) {
markdown = `# ${title}\n\n${markdown}`;
}
// Add metadata header
const metadataLines = [
`<!-- Scraped from: ${pageUrl} -->`,
byline ? `<!-- Author: ${byline} -->` : null,
excerpt ? `<!-- Excerpt: ${excerpt} -->` : null,
`<!-- Scraped at: ${new Date().toISOString()} -->`,
'',
].filter(Boolean);
markdown = metadataLines.join('\n') + '\n' + markdown;
// Clean up excessive whitespace
markdown = markdown
.replace(/\n{4,}/g, '\n\n\n')
.replace(/[ \t]+$/gm, '')
.trim();
const result: ScrapeResult = {
title,
url: pageUrl,
markdown,
byline,
excerpt,
};
// Save to file if output specified
if (options.output) {
writeFileSync(options.output, markdown, 'utf-8');
console.log(`Markdown saved to: ${options.output}`);
}
return result;
} finally {
await browser.close();
}
}
// CLI entry point
async function main() {
const args = parseArgs(process.argv.slice(2), {
string: ['url', 'mode', 'selector', 'output'],
boolean: ['headless', 'links', 'tables', 'images', 'help'],
default: {
mode: 'main',
headless: true,
links: true,
tables: true,
images: false,
},
alias: {
u: 'url',
m: 'mode',
s: 'selector',
o: 'output',
h: 'help',
},
});
if (args.help || !args.url) {
console.log(`
Web Scraper - Extract content to Markdown
Usage:
npx tsx scrape.ts --url <url> [options]
Options:
-u, --url <url> URL to scrape (required)
-m, --mode <mode> Scrape mode: main, full, or selector (default: main)
-s, --selector <sel> CSS selector for selector mode
-o, --output <path> Output file path for markdown
--headless <bool> Run in headless mode (default: true)
--wait <ms> Wait time for dynamic content
--links Include links in output (default: true)
--tables Include tables in output (default: true)
--images Include images in output (default: false)
-h, --help Show this help message
Scrape Modes:
main Extract main article content using Readability (best for articles)
full Full page content with common elements removed
selector Extract specific element by CSS selector
Examples:
npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md
Output Format:
- GitHub Flavored Markdown (tables, strikethrough, task lists)
- Proper heading hierarchy
- Code blocks with language detection
- Metadata comments at top (source URL, date)
`);
process.exit(args.help ? 0 : 1);
}
const mode = args.mode as ScrapeMode;
if (!['main', 'full', 'selector'].includes(mode)) {
console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
process.exit(1);
}
try {
const result = await scrape({
url: args.url,
mode,
selector: args.selector,
output: args.output,
includeLinks: args.links,
includeTables: args.tables,
includeImages: args.images,
headless: args.headless,
wait: args.wait ? parseInt(args.wait, 10) : undefined,
});
// Print result summary
console.log(`\nScrape complete:`);
console.log(` Title: ${result.title}`);
console.log(` URL: ${result.url}`);
if (result.byline) console.log(` Author: ${result.byline}`);
console.log(` Markdown length: ${result.markdown.length} chars`);
// Print markdown if not saved to file
if (!args.output) {
console.log('\n--- Markdown Output ---\n');
console.log(result.markdown);
}
} catch (error) {
console.error('Error:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
// Run if executed directly
const isMainModule = process.argv[1]?.includes('scrape.ts');
if (isMainModule) {
main();
}
@@ -0,0 +1,36 @@
import { launchPersistentContext } from 'cloakbrowser';
import { homedir } from 'os';
import { join } from 'path';
import { mkdirSync, existsSync } from 'fs';
async function test() {
const profilePath = join(homedir(), '.cloakbrowser-profile');
if (!existsSync(profilePath)) {
mkdirSync(profilePath, { recursive: true });
}
console.log('Profile path:', profilePath);
console.log('Launching CloakBrowser with full options...');
const browser = await launchPersistentContext({
headless: true,
userDataDir: profilePath,
humanize: true,
});
console.log('Browser launched');
const page = browser.pages()[0] || await browser.newPage();
console.log('Page created');
await page.goto('https://github.com', { timeout: 30000 });
console.log('Navigated to:', page.url());
console.log('Title:', await page.title());
await page.screenshot({ path: '/tmp/github-test.png' });
console.log('Screenshot saved');
await browser.close();
console.log('Done');
}
test().catch(console.error);
@@ -0,0 +1,23 @@
import { launch } from 'cloakbrowser';
async function test() {
console.log('Launching CloakBrowser with minimal config...');
const browser = await launch({
headless: true,
humanize: true,
});
console.log('Browser launched');
const page = await browser.newPage();
console.log('Page created');
await page.goto('https://example.com', { timeout: 30000 });
console.log('Navigated to:', page.url());
console.log('Title:', await page.title());
await browser.close();
console.log('Done');
}
test().catch(console.error);
@@ -0,0 +1,33 @@
import { launchPersistentContext } from 'cloakbrowser';
import { homedir } from 'os';
import { join } from 'path';
import { mkdirSync, existsSync } from 'fs';
async function test() {
const profilePath = join(homedir(), '.cloakbrowser-profile');
if (!existsSync(profilePath)) {
mkdirSync(profilePath, { recursive: true });
}
console.log('Profile path:', profilePath);
console.log('Launching with persistent userDataDir...');
const browser = await launchPersistentContext({
headless: true,
userDataDir: profilePath,
humanize: true,
});
console.log('Browser launched');
const page = browser.pages()[0] || await browser.newPage();
console.log('Page created');
await page.goto('https://example.com', { timeout: 30000 });
console.log('Navigated to:', page.url());
console.log('Title:', await page.title());
await browser.close();
console.log('Done');
}
test().catch(console.error);
@@ -0,0 +1,16 @@
{
"compilerOptions": {
"target": "ES2022",
"module": "ESNext",
"moduleResolution": "bundler",
"esModuleInterop": true,
"allowSyntheticDefaultImports": true,
"strict": true,
"skipLibCheck": true,
"resolveJsonModule": true,
"outDir": "./dist",
"rootDir": "."
},
"include": ["*.ts"],
"exclude": ["node_modules", "dist"]
}
@@ -0,0 +1,8 @@
declare module 'turndown-plugin-gfm' {
import TurndownService from 'turndown';
export function gfm(turndownService: TurndownService): void;
export function strikethrough(turndownService: TurndownService): void;
export function tables(turndownService: TurndownService): void;
export function taskListItems(turndownService: TurndownService): void;
}