Files
ai-coding-skills/skills/web-automation/cursor/scripts/scrape.ts
T
2026-05-03 21:09:22 -05:00

353 lines
9.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env npx tsx
// ⚠️ GENERATED FILE do not edit directly. Edit the canonical source in skills/web-automation/shared/ and run `pnpm run sync:pi`.
/**
* Web scraper that extracts content to markdown
*
* Usage:
* npx tsx scrape.ts --url "https://example.com" --mode main
* npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
* npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
*/
import TurndownService from 'turndown';
import * as turndownPluginGfm from 'turndown-plugin-gfm';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { writeFileSync } from 'fs';
import parseArgs from 'minimist';
import { getPage } from './browse.js';
// Types
type ScrapeMode = 'main' | 'full' | 'selector';
interface ScrapeOptions {
url: string;
mode: ScrapeMode;
selector?: string;
output?: string;
includeLinks?: boolean;
includeTables?: boolean;
includeImages?: boolean;
headless?: boolean;
wait?: number;
}
interface ScrapeResult {
title: string;
url: string;
markdown: string;
byline?: string;
excerpt?: string;
}
// Configure Turndown for markdown conversion
function createTurndownService(options: {
includeLinks?: boolean;
includeTables?: boolean;
includeImages?: boolean;
}): TurndownService {
const turndown = new TurndownService({
headingStyle: 'atx',
hr: '---',
bulletListMarker: '-',
codeBlockStyle: 'fenced',
fence: '```',
emDelimiter: '*',
strongDelimiter: '**',
linkStyle: 'inlined',
});
// Add GFM support (tables, strikethrough, task lists)
turndown.use(turndownPluginGfm.gfm);
// Custom rule for code blocks with language detection
turndown.addRule('codeBlockWithLanguage', {
filter: (node) => {
return (
node.nodeName === 'PRE' &&
node.firstChild?.nodeName === 'CODE'
);
},
replacement: (_content, node) => {
const codeNode = node.firstChild as HTMLElement;
const className = codeNode.getAttribute('class') || '';
const langMatch = className.match(/language-(\w+)/);
const lang = langMatch ? langMatch[1] : '';
const code = codeNode.textContent || '';
return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
},
});
// Remove images if not included
if (!options.includeImages) {
turndown.addRule('removeImages', {
filter: 'img',
replacement: () => '',
});
}
// Remove links but keep text if not included
if (!options.includeLinks) {
turndown.addRule('removeLinks', {
filter: 'a',
replacement: (content) => content,
});
}
// Remove script, style, nav, footer, aside elements
turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
return turndown;
}
// Extract main content using Readability
function extractMainContent(html: string, url: string): {
content: string;
title: string;
byline?: string;
excerpt?: string;
} {
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
throw new Error('Could not extract main content from page');
}
return {
content: article.content,
title: article.title,
byline: article.byline || undefined,
excerpt: article.excerpt || undefined,
};
}
// Scrape a URL and return markdown
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
const { page, browser } = await getPage({ headless: options.headless ?? true });
try {
// Navigate to URL
console.log(`Navigating to: ${options.url}`);
await page.goto(options.url, {
timeout: 60000,
waitUntil: 'domcontentloaded',
});
// Wait if specified
if (options.wait) {
console.log(`Waiting ${options.wait}ms for dynamic content...`);
await page.waitForTimeout(options.wait);
}
const pageTitle = await page.title();
const pageUrl = page.url();
let html: string;
let title = pageTitle;
let byline: string | undefined;
let excerpt: string | undefined;
// Get HTML based on mode
switch (options.mode) {
case 'main': {
// Get full page HTML and extract with Readability
const fullHtml = await page.content();
const extracted = extractMainContent(fullHtml, pageUrl);
html = extracted.content;
title = extracted.title || pageTitle;
byline = extracted.byline;
excerpt = extracted.excerpt;
break;
}
case 'selector': {
if (!options.selector) {
throw new Error('Selector mode requires --selector option');
}
const element = await page.$(options.selector);
if (!element) {
throw new Error(`Selector not found: ${options.selector}`);
}
html = await element.innerHTML();
break;
}
case 'full':
default: {
// Get body content, excluding common non-content elements
html = await page.evaluate(() => {
// Remove common non-content elements
const selectorsToRemove = [
'script', 'style', 'noscript', 'iframe',
'nav', 'header', 'footer', '.cookie-banner',
'.advertisement', '.ads', '#ads', '.social-share',
'.comments', '#comments', '.sidebar'
];
selectorsToRemove.forEach(selector => {
document.querySelectorAll(selector).forEach(el => el.remove());
});
return document.body.innerHTML;
});
break;
}
}
// Convert to markdown
const turndown = createTurndownService({
includeLinks: options.includeLinks ?? true,
includeTables: options.includeTables ?? true,
includeImages: options.includeImages ?? false,
});
let markdown = turndown.turndown(html);
// Add title as H1 if not already present
if (!markdown.startsWith('# ')) {
markdown = `# ${title}\n\n${markdown}`;
}
// Add metadata header
const metadataLines = [
`<!-- Scraped from: ${pageUrl} -->`,
byline ? `<!-- Author: ${byline} -->` : null,
excerpt ? `<!-- Excerpt: ${excerpt} -->` : null,
`<!-- Scraped at: ${new Date().toISOString()} -->`,
'',
].filter(Boolean);
markdown = metadataLines.join('\n') + '\n' + markdown;
// Clean up excessive whitespace
markdown = markdown
.replace(/\n{4,}/g, '\n\n\n')
.replace(/[ \t]+$/gm, '')
.trim();
const result: ScrapeResult = {
title,
url: pageUrl,
markdown,
byline,
excerpt,
};
// Save to file if output specified
if (options.output) {
writeFileSync(options.output, markdown, 'utf-8');
console.log(`Markdown saved to: ${options.output}`);
}
return result;
} finally {
await browser.close();
}
}
// CLI entry point
async function main() {
const args = parseArgs(process.argv.slice(2), {
string: ['url', 'mode', 'selector', 'output'],
boolean: ['headless', 'links', 'tables', 'images', 'help'],
default: {
mode: 'main',
headless: true,
links: true,
tables: true,
images: false,
},
alias: {
u: 'url',
m: 'mode',
s: 'selector',
o: 'output',
h: 'help',
},
});
if (args.help || !args.url) {
console.log(`
Web Scraper - Extract content to Markdown
Usage:
npx tsx scrape.ts --url <url> [options]
Options:
-u, --url <url> URL to scrape (required)
-m, --mode <mode> Scrape mode: main, full, or selector (default: main)
-s, --selector <sel> CSS selector for selector mode
-o, --output <path> Output file path for markdown
--headless <bool> Run in headless mode (default: true)
--wait <ms> Wait time for dynamic content
--links Include links in output (default: true)
--tables Include tables in output (default: true)
--images Include images in output (default: false)
-h, --help Show this help message
Scrape Modes:
main Extract main article content using Readability (best for articles)
full Full page content with common elements removed
selector Extract specific element by CSS selector
Examples:
npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md
Output Format:
- GitHub Flavored Markdown (tables, strikethrough, task lists)
- Proper heading hierarchy
- Code blocks with language detection
- Metadata comments at top (source URL, date)
`);
process.exit(args.help ? 0 : 1);
}
const mode = args.mode as ScrapeMode;
if (!['main', 'full', 'selector'].includes(mode)) {
console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
process.exit(1);
}
try {
const result = await scrape({
url: args.url,
mode,
selector: args.selector,
output: args.output,
includeLinks: args.links,
includeTables: args.tables,
includeImages: args.images,
headless: args.headless,
wait: args.wait ? parseInt(args.wait, 10) : undefined,
});
// Print result summary
console.log(`\nScrape complete:`);
console.log(` Title: ${result.title}`);
console.log(` URL: ${result.url}`);
if (result.byline) console.log(` Author: ${result.byline}`);
console.log(` Markdown length: ${result.markdown.length} chars`);
// Print markdown if not saved to file
if (!args.output) {
console.log('\n--- Markdown Output ---\n');
console.log(result.markdown);
}
} catch (error) {
console.error('Error:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
// Run if executed directly
const isMainModule = process.argv[1]?.includes('scrape.ts');
if (isMainModule) {
main();
}