Add web-automation skill variants and inline prerequisite checks
This commit is contained in:
351
skills/web-automation/opencode/scripts/scrape.ts
Normal file
351
skills/web-automation/opencode/scripts/scrape.ts
Normal file
@@ -0,0 +1,351 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
|
||||
/**
|
||||
* Web scraper that extracts content to markdown
|
||||
*
|
||||
* Usage:
|
||||
* npx tsx scrape.ts --url "https://example.com" --mode main
|
||||
* npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
|
||||
* npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
|
||||
*/
|
||||
|
||||
import TurndownService from 'turndown';
|
||||
import * as turndownPluginGfm from 'turndown-plugin-gfm';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import { JSDOM } from 'jsdom';
|
||||
import { writeFileSync } from 'fs';
|
||||
import parseArgs from 'minimist';
|
||||
import { getPage } from './browse.js';
|
||||
|
||||
// Types

// Content-extraction strategy:
//   'main'     - Readability article extraction (best for articles/docs)
//   'full'     - whole <body> with common boilerplate elements removed
//   'selector' - a single element picked by CSS selector
type ScrapeMode = 'main' | 'full' | 'selector';

/** Options accepted by {@link scrape}. */
interface ScrapeOptions {
  /** URL to navigate to and scrape. */
  url: string;
  /** Extraction strategy; see {@link ScrapeMode}. */
  mode: ScrapeMode;
  /** CSS selector — required when `mode` is 'selector'. */
  selector?: string;
  /** File path to write the markdown to; when omitted the CLI prints it to stdout. */
  output?: string;
  /** Keep hyperlinks in the markdown output (scrape() defaults this to true). */
  includeLinks?: boolean;
  /** Keep tables in the markdown output (scrape() defaults this to true). */
  includeTables?: boolean;
  /** Keep images in the markdown output (scrape() defaults this to false). */
  includeImages?: boolean;
  /** Launch the browser headless (scrape() defaults this to true). */
  headless?: boolean;
  /** Extra milliseconds to wait after navigation for dynamic content. */
  wait?: number;
}
|
||||
|
||||
/** Result returned by {@link scrape}. */
interface ScrapeResult {
  /** Page title (Readability title in 'main' mode, else document title). */
  title: string;
  /** Final URL after any redirects. */
  url: string;
  /** The converted markdown, including the metadata comment header. */
  markdown: string;
  /** Article author — only populated in 'main' mode, when Readability finds one. */
  byline?: string;
  /** Article excerpt — only populated in 'main' mode, when Readability finds one. */
  excerpt?: string;
}
|
||||
|
||||
// Configure Turndown for markdown conversion
|
||||
function createTurndownService(options: {
|
||||
includeLinks?: boolean;
|
||||
includeTables?: boolean;
|
||||
includeImages?: boolean;
|
||||
}): TurndownService {
|
||||
const turndown = new TurndownService({
|
||||
headingStyle: 'atx',
|
||||
hr: '---',
|
||||
bulletListMarker: '-',
|
||||
codeBlockStyle: 'fenced',
|
||||
fence: '```',
|
||||
emDelimiter: '*',
|
||||
strongDelimiter: '**',
|
||||
linkStyle: 'inlined',
|
||||
});
|
||||
|
||||
// Add GFM support (tables, strikethrough, task lists)
|
||||
turndown.use(turndownPluginGfm.gfm);
|
||||
|
||||
// Custom rule for code blocks with language detection
|
||||
turndown.addRule('codeBlockWithLanguage', {
|
||||
filter: (node) => {
|
||||
return (
|
||||
node.nodeName === 'PRE' &&
|
||||
node.firstChild?.nodeName === 'CODE'
|
||||
);
|
||||
},
|
||||
replacement: (_content, node) => {
|
||||
const codeNode = node.firstChild as HTMLElement;
|
||||
const className = codeNode.getAttribute('class') || '';
|
||||
const langMatch = className.match(/language-(\w+)/);
|
||||
const lang = langMatch ? langMatch[1] : '';
|
||||
const code = codeNode.textContent || '';
|
||||
return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
|
||||
},
|
||||
});
|
||||
|
||||
// Remove images if not included
|
||||
if (!options.includeImages) {
|
||||
turndown.addRule('removeImages', {
|
||||
filter: 'img',
|
||||
replacement: () => '',
|
||||
});
|
||||
}
|
||||
|
||||
// Remove links but keep text if not included
|
||||
if (!options.includeLinks) {
|
||||
turndown.addRule('removeLinks', {
|
||||
filter: 'a',
|
||||
replacement: (content) => content,
|
||||
});
|
||||
}
|
||||
|
||||
// Remove script, style, nav, footer, aside elements
|
||||
turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
|
||||
|
||||
return turndown;
|
||||
}
|
||||
|
||||
// Extract main content using Readability
|
||||
function extractMainContent(html: string, url: string): {
|
||||
content: string;
|
||||
title: string;
|
||||
byline?: string;
|
||||
excerpt?: string;
|
||||
} {
|
||||
const dom = new JSDOM(html, { url });
|
||||
const reader = new Readability(dom.window.document);
|
||||
const article = reader.parse();
|
||||
|
||||
if (!article) {
|
||||
throw new Error('Could not extract main content from page');
|
||||
}
|
||||
|
||||
return {
|
||||
content: article.content,
|
||||
title: article.title,
|
||||
byline: article.byline || undefined,
|
||||
excerpt: article.excerpt || undefined,
|
||||
};
|
||||
}
|
||||
|
||||
// Scrape a URL and return markdown
/**
 * Navigate to `options.url`, extract HTML according to `options.mode`,
 * convert it to GitHub-Flavored Markdown, and optionally write it to
 * `options.output`. The browser is always closed, even on failure.
 *
 * @param options See {@link ScrapeOptions}.
 * @returns Title, final URL, markdown, and (in 'main' mode) byline/excerpt.
 * @throws Error when 'selector' mode lacks a selector or the selector matches
 *   nothing, or when Readability fails in 'main' mode; navigation errors from
 *   the underlying page API also propagate.
 */
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
  // getPage is a project-local helper (./browse.js); presumably returns a
  // Playwright-style { page, browser } pair — confirm against browse.ts.
  const { page, browser } = await getPage({ headless: options.headless ?? true });

  try {
    // Navigate to URL
    console.log(`Navigating to: ${options.url}`);
    await page.goto(options.url, {
      timeout: 60000,
      // 'domcontentloaded' avoids hanging on pages that never reach 'load'.
      waitUntil: 'domcontentloaded',
    });

    // Optional extra wait for JS-rendered content that appears after
    // DOMContentLoaded.
    if (options.wait) {
      console.log(`Waiting ${options.wait}ms for dynamic content...`);
      await page.waitForTimeout(options.wait);
    }

    const pageTitle = await page.title();
    // page.url() reflects redirects, so the recorded URL is the final one.
    const pageUrl = page.url();

    let html: string;
    let title = pageTitle;
    let byline: string | undefined;  // only populated in 'main' mode
    let excerpt: string | undefined; // only populated in 'main' mode

    // Get HTML based on mode
    switch (options.mode) {
      case 'main': {
        // Get full page HTML and extract the article with Readability.
        const fullHtml = await page.content();
        const extracted = extractMainContent(fullHtml, pageUrl);
        html = extracted.content;
        // Prefer Readability's title; fall back to the document title.
        title = extracted.title || pageTitle;
        byline = extracted.byline;
        excerpt = extracted.excerpt;
        break;
      }

      case 'selector': {
        if (!options.selector) {
          throw new Error('Selector mode requires --selector option');
        }
        const element = await page.$(options.selector);
        if (!element) {
          throw new Error(`Selector not found: ${options.selector}`);
        }
        html = await element.innerHTML();
        break;
      }

      case 'full':
      default: {
        // Get body content, excluding common non-content elements.
        html = await page.evaluate(() => {
          // Runs inside the page context: mutates the live DOM, then
          // serializes what remains of <body>.
          const selectorsToRemove = [
            'script', 'style', 'noscript', 'iframe',
            'nav', 'header', 'footer', '.cookie-banner',
            '.advertisement', '.ads', '#ads', '.social-share',
            '.comments', '#comments', '.sidebar'
          ];

          selectorsToRemove.forEach(selector => {
            document.querySelectorAll(selector).forEach(el => el.remove());
          });

          return document.body.innerHTML;
        });
        break;
      }
    }

    // Convert to markdown
    const turndown = createTurndownService({
      includeLinks: options.includeLinks ?? true,
      includeTables: options.includeTables ?? true,
      includeImages: options.includeImages ?? false,
    });

    let markdown = turndown.turndown(html);

    // Add title as H1 if not already present
    if (!markdown.startsWith('# ')) {
      markdown = `# ${title}\n\n${markdown}`;
    }

    // Add metadata header — HTML comments survive most markdown renderers
    // without being displayed.
    const metadataLines = [
      `<!-- Scraped from: ${pageUrl} -->`,
      byline ? `<!-- Author: ${byline} -->` : null,
      excerpt ? `<!-- Excerpt: ${excerpt} -->` : null,
      `<!-- Scraped at: ${new Date().toISOString()} -->`,
      '',
    ].filter(Boolean);

    markdown = metadataLines.join('\n') + '\n' + markdown;

    // Clean up excessive whitespace: cap blank-line runs at two and strip
    // trailing spaces/tabs on every line.
    markdown = markdown
      .replace(/\n{4,}/g, '\n\n\n')
      .replace(/[ \t]+$/gm, '')
      .trim();

    const result: ScrapeResult = {
      title,
      url: pageUrl,
      markdown,
      byline,
      excerpt,
    };

    // Save to file if output specified
    if (options.output) {
      writeFileSync(options.output, markdown, 'utf-8');
      console.log(`Markdown saved to: ${options.output}`);
    }

    return result;
  } finally {
    // Always release the browser, including when any step above threw.
    await browser.close();
  }
}
|
||||
|
||||
// CLI entry point
|
||||
async function main() {
|
||||
const args = parseArgs(process.argv.slice(2), {
|
||||
string: ['url', 'mode', 'selector', 'output'],
|
||||
boolean: ['headless', 'links', 'tables', 'images', 'help'],
|
||||
default: {
|
||||
mode: 'main',
|
||||
headless: true,
|
||||
links: true,
|
||||
tables: true,
|
||||
images: false,
|
||||
},
|
||||
alias: {
|
||||
u: 'url',
|
||||
m: 'mode',
|
||||
s: 'selector',
|
||||
o: 'output',
|
||||
h: 'help',
|
||||
},
|
||||
});
|
||||
|
||||
if (args.help || !args.url) {
|
||||
console.log(`
|
||||
Web Scraper - Extract content to Markdown
|
||||
|
||||
Usage:
|
||||
npx tsx scrape.ts --url <url> [options]
|
||||
|
||||
Options:
|
||||
-u, --url <url> URL to scrape (required)
|
||||
-m, --mode <mode> Scrape mode: main, full, or selector (default: main)
|
||||
-s, --selector <sel> CSS selector for selector mode
|
||||
-o, --output <path> Output file path for markdown
|
||||
--headless <bool> Run in headless mode (default: true)
|
||||
--wait <ms> Wait time for dynamic content
|
||||
--links Include links in output (default: true)
|
||||
--tables Include tables in output (default: true)
|
||||
--images Include images in output (default: false)
|
||||
-h, --help Show this help message
|
||||
|
||||
Scrape Modes:
|
||||
main Extract main article content using Readability (best for articles)
|
||||
full Full page content with common elements removed
|
||||
selector Extract specific element by CSS selector
|
||||
|
||||
Examples:
|
||||
npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
|
||||
npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
|
||||
npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
|
||||
npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md
|
||||
|
||||
Output Format:
|
||||
- GitHub Flavored Markdown (tables, strikethrough, task lists)
|
||||
- Proper heading hierarchy
|
||||
- Code blocks with language detection
|
||||
- Metadata comments at top (source URL, date)
|
||||
`);
|
||||
process.exit(args.help ? 0 : 1);
|
||||
}
|
||||
|
||||
const mode = args.mode as ScrapeMode;
|
||||
if (!['main', 'full', 'selector'].includes(mode)) {
|
||||
console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await scrape({
|
||||
url: args.url,
|
||||
mode,
|
||||
selector: args.selector,
|
||||
output: args.output,
|
||||
includeLinks: args.links,
|
||||
includeTables: args.tables,
|
||||
includeImages: args.images,
|
||||
headless: args.headless,
|
||||
wait: args.wait ? parseInt(args.wait, 10) : undefined,
|
||||
});
|
||||
|
||||
// Print result summary
|
||||
console.log(`\nScrape complete:`);
|
||||
console.log(` Title: ${result.title}`);
|
||||
console.log(` URL: ${result.url}`);
|
||||
if (result.byline) console.log(` Author: ${result.byline}`);
|
||||
console.log(` Markdown length: ${result.markdown.length} chars`);
|
||||
|
||||
// Print markdown if not saved to file
|
||||
if (!args.output) {
|
||||
console.log('\n--- Markdown Output ---\n');
|
||||
console.log(result.markdown);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error:', error instanceof Error ? error.message : error);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Run if executed directly
|
||||
const isMainModule = process.argv[1]?.includes('scrape.ts');
|
||||
if (isMainModule) {
|
||||
main();
|
||||
}
|
||||
Reference in New Issue
Block a user