Add web-automation skill variants and inline prerequisite checks
This commit is contained in:
351
skills/web-automation/opencode/scripts/scrape.ts
Normal file
351
skills/web-automation/opencode/scripts/scrape.ts
Normal file
@@ -0,0 +1,351 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
|
||||
/**
|
||||
* Web scraper that extracts content to markdown
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scrape.ts --url "https://example.com" --mode main
|
||||
* npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
|
||||
* npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
|
||||
*/
|
||||
|
||||
import TurndownService from 'turndown';
|
||||
import * as turndownPluginGfm from 'turndown-plugin-gfm';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import { JSDOM } from 'jsdom';
|
||||
import { writeFileSync } from 'fs';
|
||||
import parseArgs from 'minimist';
|
||||
import { getPage } from './browse.js';
|
||||
|
||||
// Types

// Content-extraction strategy:
//   'main'     - Readability article extraction (best for articles/docs)
//   'full'     - whole <body> with common boilerplate elements removed
//   'selector' - a single element picked by CSS selector
type ScrapeMode = 'main' | 'full' | 'selector';

/** Options accepted by {@link scrape}. */
interface ScrapeOptions {
  /** URL to navigate to and scrape. */
  url: string;
  /** Extraction strategy; see {@link ScrapeMode}. */
  mode: ScrapeMode;
  /** CSS selector — required when `mode` is 'selector'. */
  selector?: string;
  /** File path to write the markdown to; when omitted the CLI prints it to stdout. */
  output?: string;
  /** Keep hyperlinks in the markdown output (scrape() defaults this to true). */
  includeLinks?: boolean;
  /** Keep tables in the markdown output (scrape() defaults this to true). */
  includeTables?: boolean;
  /** Keep images in the markdown output (scrape() defaults this to false). */
  includeImages?: boolean;
  /** Launch the browser headless (scrape() defaults this to true). */
  headless?: boolean;
  /** Extra milliseconds to wait after navigation for dynamic content. */
  wait?: number;
}
|
||||
|
||||
/** Result returned by {@link scrape}. */
interface ScrapeResult {
  /** Page title (Readability title in 'main' mode, else document title). */
  title: string;
  /** Final URL after any redirects. */
  url: string;
  /** The converted markdown, including the metadata comment header. */
  markdown: string;
  /** Article author — only populated in 'main' mode, when Readability finds one. */
  byline?: string;
  /** Article excerpt — only populated in 'main' mode, when Readability finds one. */
  excerpt?: string;
}
|
||||
|
||||
// Configure Turndown for markdown conversion
|
||||
function createTurndownService(options: {
|
||||
includeLinks?: boolean;
|
||||
includeTables?: boolean;
|
||||
includeImages?: boolean;
|
||||
}): TurndownService {
|
||||
const turndown = new TurndownService({
|
||||
headingStyle: 'atx',
|
||||
hr: '---',
|
||||
bulletListMarker: '-',
|
||||
codeBlockStyle: 'fenced',
|
||||
fence: '```',
|
||||
emDelimiter: '*',
|
||||
strongDelimiter: '**',
|
||||
linkStyle: 'inlined',
|
||||
});
|
||||
|
||||
// Add GFM support (tables, strikethrough, task lists)
|
||||
turndown.use(turndownPluginGfm.gfm);
|
||||
|
||||
// Custom rule for code blocks with language detection
|
||||
turndown.addRule('codeBlockWithLanguage', {
|
||||
filter: (node) => {
|
||||
return (
|
||||
node.nodeName === 'PRE' &&
|
||||
node.firstChild?.nodeName === 'CODE'
|
||||
);
|
||||
},
|
||||
replacement: (_content, node) => {
|
||||
const codeNode = node.firstChild as HTMLElement;
|
||||
const className = codeNode.getAttribute('class') || '';
|
||||
const langMatch = className.match(/language-(\w+)/);
|
||||
const lang = langMatch ? langMatch[1] : '';
|
||||
const code = codeNode.textContent || '';
|
||||
return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
|
||||
},
|
||||
});
|
||||
|
||||
// Remove images if not included
|
||||
if (!options.includeImages) {
|
||||
turndown.addRule('removeImages', {
|
||||
filter: 'img',
|
||||
replacement: () => '',
|
||||
});
|
||||
}
|
||||
|
||||
// Remove links but keep text if not included
|
||||
if (!options.includeLinks) {
|
||||
turndown.addRule('removeLinks', {
|
||||
filter: 'a',
|
||||
replacement: (content) => content,
|
||||
});
|
||||
}
|
||||
|
||||
// Remove script, style, nav, footer, aside elements
|
||||
turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
|
||||
|
||||
return turndown;
|
||||
}
|
||||
|
||||
// Extract main content using Readability
|
||||
function extractMainContent(html: string, url: string): {
|
||||
content: string;
|
||||
title: string;
|
||||
byline?: string;
|
||||
excerpt?: string;
|
||||
} {
|
||||
const dom = new JSDOM(html, { url });
|
||||
const reader = new Readability(dom.window.document);
|
||||
const article = reader.parse();
|
||||
|
||||
if (!article) {
|
||||
throw new Error('Could not extract main content from page');
|
||||
}
|
||||
|
||||
return {
|
||||
content: article.content,
|
||||
title: article.title,
|
||||
byline: article.byline || undefined,
|
||||
excerpt: article.excerpt || undefined,
|
||||
};
|
||||
}
|
||||
|
||||
// Scrape a URL and return markdown
/**
 * Navigate to `options.url`, extract HTML according to `options.mode`,
 * convert it to GitHub-Flavored Markdown, and optionally write it to
 * `options.output`. The browser is always closed, even on failure.
 *
 * @param options See {@link ScrapeOptions}.
 * @returns Title, final URL, markdown, and (in 'main' mode) byline/excerpt.
 * @throws Error when 'selector' mode lacks a selector or the selector matches
 *   nothing, or when Readability fails in 'main' mode; navigation errors from
 *   the underlying page API also propagate.
 */
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
  // getPage is a project-local helper (./browse.js); presumably returns a
  // Playwright-style { page, browser } pair — confirm against browse.ts.
  const { page, browser } = await getPage({ headless: options.headless ?? true });

  try {
    // Navigate to URL
    console.log(`Navigating to: ${options.url}`);
    await page.goto(options.url, {
      timeout: 60000,
      // 'domcontentloaded' avoids hanging on pages that never reach 'load'.
      waitUntil: 'domcontentloaded',
    });

    // Optional extra wait for JS-rendered content that appears after
    // DOMContentLoaded.
    if (options.wait) {
      console.log(`Waiting ${options.wait}ms for dynamic content...`);
      await page.waitForTimeout(options.wait);
    }

    const pageTitle = await page.title();
    // page.url() reflects redirects, so the recorded URL is the final one.
    const pageUrl = page.url();

    let html: string;
    let title = pageTitle;
    let byline: string | undefined;  // only populated in 'main' mode
    let excerpt: string | undefined; // only populated in 'main' mode

    // Get HTML based on mode
    switch (options.mode) {
      case 'main': {
        // Get full page HTML and extract the article with Readability.
        const fullHtml = await page.content();
        const extracted = extractMainContent(fullHtml, pageUrl);
        html = extracted.content;
        // Prefer Readability's title; fall back to the document title.
        title = extracted.title || pageTitle;
        byline = extracted.byline;
        excerpt = extracted.excerpt;
        break;
      }

      case 'selector': {
        if (!options.selector) {
          throw new Error('Selector mode requires --selector option');
        }
        const element = await page.$(options.selector);
        if (!element) {
          throw new Error(`Selector not found: ${options.selector}`);
        }
        html = await element.innerHTML();
        break;
      }

      case 'full':
      default: {
        // Get body content, excluding common non-content elements.
        html = await page.evaluate(() => {
          // Runs inside the page context: mutates the live DOM, then
          // serializes what remains of <body>.
          const selectorsToRemove = [
            'script', 'style', 'noscript', 'iframe',
            'nav', 'header', 'footer', '.cookie-banner',
            '.advertisement', '.ads', '#ads', '.social-share',
            '.comments', '#comments', '.sidebar'
          ];

          selectorsToRemove.forEach(selector => {
            document.querySelectorAll(selector).forEach(el => el.remove());
          });

          return document.body.innerHTML;
        });
        break;
      }
    }

    // Convert to markdown
    const turndown = createTurndownService({
      includeLinks: options.includeLinks ?? true,
      includeTables: options.includeTables ?? true,
      includeImages: options.includeImages ?? false,
    });

    let markdown = turndown.turndown(html);

    // Add title as H1 if not already present
    if (!markdown.startsWith('# ')) {
      markdown = `# ${title}\n\n${markdown}`;
    }

    // Add metadata header — HTML comments survive most markdown renderers
    // without being displayed.
    const metadataLines = [
      `<!-- Scraped from: ${pageUrl} -->`,
      byline ? `<!-- Author: ${byline} -->` : null,
      excerpt ? `<!-- Excerpt: ${excerpt} -->` : null,
      `<!-- Scraped at: ${new Date().toISOString()} -->`,
      '',
    ].filter(Boolean);

    markdown = metadataLines.join('\n') + '\n' + markdown;

    // Clean up excessive whitespace: cap blank-line runs at two and strip
    // trailing spaces/tabs on every line.
    markdown = markdown
      .replace(/\n{4,}/g, '\n\n\n')
      .replace(/[ \t]+$/gm, '')
      .trim();

    const result: ScrapeResult = {
      title,
      url: pageUrl,
      markdown,
      byline,
      excerpt,
    };

    // Save to file if output specified
    if (options.output) {
      writeFileSync(options.output, markdown, 'utf-8');
      console.log(`Markdown saved to: ${options.output}`);
    }

    return result;
  } finally {
    // Always release the browser, including when any step above threw.
    await browser.close();
  }
}
|
||||
|
||||
// CLI entry point
|
||||
async function main() {
|
||||
const args = parseArgs(process.argv.slice(2), {
|
||||
string: ['url', 'mode', 'selector', 'output'],
|
||||
boolean: ['headless', 'links', 'tables', 'images', 'help'],
|
||||
default: {
|
||||
mode: 'main',
|
||||
headless: true,
|
||||
links: true,
|
||||
tables: true,
|
||||
images: false,
|
||||
},
|
||||
alias: {
|
||||
u: 'url',
|
||||
m: 'mode',
|
||||
s: 'selector',
|
||||
o: 'output',
|
||||
h: 'help',
|
||||
},
|
||||
});
|
||||
|
||||
if (args.help || !args.url) {
|
||||
console.log(`
|
||||
Web Scraper - Extract content to Markdown
|
||||
|
||||
Usage:
|
||||
npx tsx scrape.ts --url <url> [options]
|
||||
|
||||
Options:
|
||||
-u, --url <url> URL to scrape (required)
|
||||
-m, --mode <mode> Scrape mode: main, full, or selector (default: main)
|
||||
-s, --selector <sel> CSS selector for selector mode
|
||||
-o, --output <path> Output file path for markdown
|
||||
--headless <bool> Run in headless mode (default: true)
|
||||
--wait <ms> Wait time for dynamic content
|
||||
--links Include links in output (default: true)
|
||||
--tables Include tables in output (default: true)
|
||||
--images Include images in output (default: false)
|
||||
-h, --help Show this help message
|
||||
|
||||
Scrape Modes:
|
||||
main Extract main article content using Readability (best for articles)
|
||||
full Full page content with common elements removed
|
||||
selector Extract specific element by CSS selector
|
||||
|
||||
Examples:
|
||||
npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
|
||||
npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
|
||||
npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
|
||||
npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md
|
||||
|
||||
Output Format:
|
||||
- GitHub Flavored Markdown (tables, strikethrough, task lists)
|
||||
- Proper heading hierarchy
|
||||
- Code blocks with language detection
|
||||
- Metadata comments at top (source URL, date)
|
||||
`);
|
||||
process.exit(args.help ? 0 : 1);
|
||||
}
|
||||
|
||||
const mode = args.mode as ScrapeMode;
|
||||
if (!['main', 'full', 'selector'].includes(mode)) {
|
||||
console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await scrape({
|
||||
url: args.url,
|
||||
mode,
|
||||
selector: args.selector,
|
||||
output: args.output,
|
||||
includeLinks: args.links,
|
||||
includeTables: args.tables,
|
||||
includeImages: args.images,
|
||||
headless: args.headless,
|
||||
wait: args.wait ? parseInt(args.wait, 10) : undefined,
|
||||
});
|
||||
|
||||
// Print result summary
|
||||
console.log(`\nScrape complete:`);
|
||||
console.log(` Title: ${result.title}`);
|
||||
console.log(` URL: ${result.url}`);
|
||||
if (result.byline) console.log(` Author: ${result.byline}`);
|
||||
console.log(` Markdown length: ${result.markdown.length} chars`);
|
||||
|
||||
// Print markdown if not saved to file
|
||||
if (!args.output) {
|
||||
console.log('\n--- Markdown Output ---\n');
|
||||
console.log(result.markdown);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Run if executed directly
|
||||
const isMainModule = process.argv[1]?.includes('scrape.ts');
|
||||
if (isMainModule) {
|
||||
main();
|
||||
}
|
||||
Reference in New Issue
Block a user