Files
Luke 658562ae35 Add web-automation skill
- Browse and scrape web pages using Playwright with Camoufox anti-detection browser
- Supports automated web workflows, authenticated sessions, and bot protection bypass
- Includes scripts for browse, scrape, auth, and local app scanning
- Updated README with skill documentation and system library requirements
2026-02-11 18:46:59 +00:00

352 lines
9.7 KiB
TypeScript

#!/usr/bin/env npx tsx
/**
* Web scraper that extracts content to markdown
*
* Usage:
* npx tsx scrape.ts --url "https://example.com" --mode main
* npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
* npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
*/
import TurndownService from 'turndown';
import * as turndownPluginGfm from 'turndown-plugin-gfm';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { writeFileSync } from 'fs';
import parseArgs from 'minimist';
import { getPage } from './browse.js';
// Types

/** Which part of the page to extract (see CLI help for details). */
type ScrapeMode = 'main' | 'full' | 'selector';

/** Options accepted by scrape(). */
interface ScrapeOptions {
  /** URL to navigate to. */
  url: string;
  /** Extraction strategy: 'main' (Readability), 'full' (body minus boilerplate), or 'selector'. */
  mode: ScrapeMode;
  /** CSS selector — required when mode === 'selector'. */
  selector?: string;
  /** File path; when set, the markdown is also written to disk. */
  output?: string;
  /** Keep hyperlinks in the markdown (scrape() defaults this to true). */
  includeLinks?: boolean;
  /** Keep tables in the markdown (scrape() defaults this to true). */
  includeTables?: boolean;
  /** Keep images in the markdown (scrape() defaults this to false). */
  includeImages?: boolean;
  /** Run the browser headless (scrape() defaults this to true). */
  headless?: boolean;
  /** Extra milliseconds to wait after load for dynamic content. */
  wait?: number;
}

/** Result returned by scrape(). */
interface ScrapeResult {
  /** Page title (Readability's title in 'main' mode, else document title). */
  title: string;
  /** Final URL after any redirects. */
  url: string;
  /** Converted markdown, including the metadata comment header. */
  markdown: string;
  /** Author byline — only populated in 'main' mode via Readability. */
  byline?: string;
  /** Article excerpt — only populated in 'main' mode via Readability. */
  excerpt?: string;
}
/**
 * Build a TurndownService configured for GitHub Flavored Markdown output.
 *
 * @param options.includeLinks  keep <a> hrefs; when false, links are replaced by their text
 * @param options.includeTables keep <table> elements; when false, tables are dropped entirely
 * @param options.includeImages keep <img> elements; when false, images are dropped entirely
 * @returns a configured TurndownService instance
 */
function createTurndownService(options: {
  includeLinks?: boolean;
  includeTables?: boolean;
  includeImages?: boolean;
}): TurndownService {
  const turndown = new TurndownService({
    headingStyle: 'atx',
    hr: '---',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced',
    fence: '```',
    emDelimiter: '*',
    strongDelimiter: '**',
    linkStyle: 'inlined',
  });
  // Add GFM support (tables, strikethrough, task lists)
  turndown.use(turndownPluginGfm.gfm);
  // Custom rule for code blocks with language detection
  turndown.addRule('codeBlockWithLanguage', {
    filter: (node) => {
      return (
        node.nodeName === 'PRE' &&
        node.firstChild?.nodeName === 'CODE'
      );
    },
    replacement: (_content, node) => {
      const codeNode = node.firstChild as HTMLElement;
      // Common convention: class="language-xyz" on the inner <code> element.
      const className = codeNode.getAttribute('class') || '';
      const langMatch = className.match(/language-(\w+)/);
      const lang = langMatch ? langMatch[1] : '';
      const code = codeNode.textContent || '';
      return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
    },
  });
  // Remove images if not included
  if (!options.includeImages) {
    turndown.addRule('removeImages', {
      filter: 'img',
      replacement: () => '',
    });
  }
  // Remove links but keep text if not included
  if (!options.includeLinks) {
    turndown.addRule('removeLinks', {
      filter: 'a',
      replacement: (content) => content,
    });
  }
  // Fix: includeTables was previously accepted but never used — the GFM plugin
  // always converted tables regardless. Drop <table> elements entirely when
  // tables are not requested, mirroring the image handling above.
  if (!options.includeTables) {
    turndown.remove('table');
  }
  // Remove script, style, nav, footer, aside elements
  turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
  return turndown;
}
// Extract main content using Readability
function extractMainContent(html: string, url: string): {
content: string;
title: string;
byline?: string;
excerpt?: string;
} {
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
throw new Error('Could not extract main content from page');
}
return {
content: article.content,
title: article.title,
byline: article.byline || undefined,
excerpt: article.excerpt || undefined,
};
}
// Scrape a URL and return markdown
/**
 * Navigate to a URL, extract content per `options.mode`, and convert it to
 * GFM markdown with a metadata comment header prepended.
 *
 * Modes:
 *  - 'main'     — Readability extraction of the article body
 *  - 'selector' — innerHTML of the first element matching `options.selector`
 *  - 'full'     — page body with common boilerplate elements removed
 *
 * The browser is always closed before returning (see `finally`).
 *
 * @param options scrape configuration; see ScrapeOptions
 * @returns the scrape result; also writes the markdown to `options.output` if set
 * @throws Error on navigation failure, missing selector (selector mode),
 *         or failed Readability extraction (main mode)
 */
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
  // getPage comes from ./browse.js — presumably launches the anti-detection
  // browser described in the file header; verify against that module.
  const { page, browser } = await getPage({ headless: options.headless ?? true });
  try {
    // Navigate to URL
    console.log(`Navigating to: ${options.url}`);
    await page.goto(options.url, {
      timeout: 60000,
      waitUntil: 'domcontentloaded',
    });
    // Wait if specified
    if (options.wait) {
      console.log(`Waiting ${options.wait}ms for dynamic content...`);
      await page.waitForTimeout(options.wait);
    }
    const pageTitle = await page.title();
    // page.url() reflects the final URL after redirects.
    const pageUrl = page.url();
    let html: string;
    let title = pageTitle;
    let byline: string | undefined;
    let excerpt: string | undefined;
    // Get HTML based on mode
    switch (options.mode) {
      case 'main': {
        // Get full page HTML and extract with Readability
        const fullHtml = await page.content();
        const extracted = extractMainContent(fullHtml, pageUrl);
        html = extracted.content;
        // Fall back to the document title when Readability finds none.
        title = extracted.title || pageTitle;
        byline = extracted.byline;
        excerpt = extracted.excerpt;
        break;
      }
      case 'selector': {
        if (!options.selector) {
          throw new Error('Selector mode requires --selector option');
        }
        // Only the first match is used.
        const element = await page.$(options.selector);
        if (!element) {
          throw new Error(`Selector not found: ${options.selector}`);
        }
        html = await element.innerHTML();
        break;
      }
      case 'full':
      default: {
        // Get body content, excluding common non-content elements
        // NOTE: this mutates the live DOM in the page; acceptable because
        // nothing else reads the page after this point.
        html = await page.evaluate(() => {
          // Remove common non-content elements
          const selectorsToRemove = [
            'script', 'style', 'noscript', 'iframe',
            'nav', 'header', 'footer', '.cookie-banner',
            '.advertisement', '.ads', '#ads', '.social-share',
            '.comments', '#comments', '.sidebar'
          ];
          selectorsToRemove.forEach(selector => {
            document.querySelectorAll(selector).forEach(el => el.remove());
          });
          return document.body.innerHTML;
        });
        break;
      }
    }
    // Convert to markdown
    const turndown = createTurndownService({
      includeLinks: options.includeLinks ?? true,
      includeTables: options.includeTables ?? true,
      includeImages: options.includeImages ?? false,
    });
    let markdown = turndown.turndown(html);
    // Add title as H1 if not already present
    if (!markdown.startsWith('# ')) {
      markdown = `# ${title}\n\n${markdown}`;
    }
    // Add metadata header (HTML comments, invisible in rendered markdown)
    const metadataLines = [
      `<!-- Scraped from: ${pageUrl} -->`,
      byline ? `<!-- Author: ${byline} -->` : null,
      excerpt ? `<!-- Excerpt: ${excerpt} -->` : null,
      `<!-- Scraped at: ${new Date().toISOString()} -->`,
      '',
    ].filter(Boolean);
    markdown = metadataLines.join('\n') + '\n' + markdown;
    // Clean up excessive whitespace: cap blank runs, strip trailing spaces
    markdown = markdown
      .replace(/\n{4,}/g, '\n\n\n')
      .replace(/[ \t]+$/gm, '')
      .trim();
    const result: ScrapeResult = {
      title,
      url: pageUrl,
      markdown,
      byline,
      excerpt,
    };
    // Save to file if output specified
    if (options.output) {
      writeFileSync(options.output, markdown, 'utf-8');
      console.log(`Markdown saved to: ${options.output}`);
    }
    return result;
  } finally {
    // Always release the browser, even on navigation/extraction errors.
    await browser.close();
  }
}
// CLI entry point
/**
 * Parse CLI flags, validate them, run scrape(), and print a summary.
 * Exits non-zero on missing --url, invalid --mode, invalid --wait, or
 * scrape failure.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2), {
    // 'wait' is declared as a string so minimist hands it over verbatim;
    // it is parsed and validated with parseInt below.
    string: ['url', 'mode', 'selector', 'output', 'wait'],
    boolean: ['headless', 'links', 'tables', 'images', 'help'],
    default: {
      mode: 'main',
      headless: true,
      links: true,
      tables: true,
      images: false,
    },
    alias: {
      u: 'url',
      m: 'mode',
      s: 'selector',
      o: 'output',
      h: 'help',
    },
  });
  if (args.help || !args.url) {
    // Fix: the help previously documented "--headless <bool>", but minimist
    // boolean flags do not take a value argument — the working form is the
    // --no-<flag> negation, which is what is documented now.
    console.log(`
Web Scraper - Extract content to Markdown

Usage:
  npx tsx scrape.ts --url <url> [options]

Options:
  -u, --url <url>       URL to scrape (required)
  -m, --mode <mode>     Scrape mode: main, full, or selector (default: main)
  -s, --selector <sel>  CSS selector for selector mode
  -o, --output <path>   Output file path for markdown
  --no-headless         Show the browser window (headless is the default)
  --wait <ms>           Wait time for dynamic content
  --links               Include links in output (default: true; --no-links to strip)
  --tables              Include tables in output (default: true; --no-tables to strip)
  --images              Include images in output (default: false)
  -h, --help            Show this help message

Scrape Modes:
  main      Extract main article content using Readability (best for articles)
  full      Full page content with common elements removed
  selector  Extract specific element by CSS selector

Examples:
  npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
  npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
  npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
  npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md

Output Format:
  - GitHub Flavored Markdown (tables, strikethrough, task lists)
  - Proper heading hierarchy
  - Code blocks with language detection
  - Metadata comments at top (source URL, date)
`);
    // Help requested → success; missing required --url → usage error.
    process.exit(args.help ? 0 : 1);
  }
  const mode = args.mode as ScrapeMode;
  if (!['main', 'full', 'selector'].includes(mode)) {
    console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
    process.exit(1);
  }
  // Fix: validate --wait up front so a non-numeric value fails fast instead
  // of passing NaN into page.waitForTimeout.
  const wait = args.wait ? parseInt(args.wait, 10) : undefined;
  if (wait !== undefined && Number.isNaN(wait)) {
    console.error(`Invalid --wait value: ${args.wait}. Must be milliseconds.`);
    process.exit(1);
  }
  try {
    const result = await scrape({
      url: args.url,
      mode,
      selector: args.selector,
      output: args.output,
      includeLinks: args.links,
      includeTables: args.tables,
      includeImages: args.images,
      headless: args.headless,
      wait,
    });
    // Print result summary
    console.log(`\nScrape complete:`);
    console.log(`  Title: ${result.title}`);
    console.log(`  URL: ${result.url}`);
    if (result.byline) console.log(`  Author: ${result.byline}`);
    console.log(`  Markdown length: ${result.markdown.length} chars`);
    // Print markdown if not saved to file
    if (!args.output) {
      console.log('\n--- Markdown Output ---\n');
      console.log(result.markdown);
    }
  } catch (error) {
    console.error('Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  }
}
// Run the CLI only when this file is the entry script, not when imported
// as a library (scrape is exported above for programmatic use).
const invokedScript = process.argv[1] ?? '';
if (invokedScript.includes('scrape.ts')) {
  void main();
}