Files
Luke 658562ae35 Add web-automation skill
- Browse and scrape web pages using Playwright with Camoufox anti-detection browser
- Supports automated web workflows, authenticated sessions, and bot protection bypass
- Includes scripts for browse, scrape, auth, and local app scanning
- Updated README with skill documentation and system library requirements
2026-02-11 18:46:59 +00:00

352 lines
9.7 KiB
TypeScript

#!/usr/bin/env npx tsx
/**
* Web scraper that extracts content to markdown
*
* Usage:
* npx tsx scrape.ts --url "https://example.com" --mode main
* npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
* npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
*/
import TurndownService from 'turndown';
import * as turndownPluginGfm from 'turndown-plugin-gfm';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { writeFileSync } from 'fs';
import parseArgs from 'minimist';
import { getPage } from './browse.js';
// Types

/** Which part of the page to extract (see CLI help for details). */
type ScrapeMode = 'main' | 'full' | 'selector';

/** Options accepted by scrape(). */
interface ScrapeOptions {
  /** URL to navigate to. */
  url: string;
  /** Extraction strategy: 'main' (Readability), 'full' (body minus boilerplate), or 'selector'. */
  mode: ScrapeMode;
  /** CSS selector — required when mode === 'selector'. */
  selector?: string;
  /** File path; when set, the markdown is also written to disk. */
  output?: string;
  /** Keep hyperlinks in the markdown (scrape() defaults this to true). */
  includeLinks?: boolean;
  /** Keep tables in the markdown (scrape() defaults this to true). */
  includeTables?: boolean;
  /** Keep images in the markdown (scrape() defaults this to false). */
  includeImages?: boolean;
  /** Run the browser headless (scrape() defaults this to true). */
  headless?: boolean;
  /** Extra milliseconds to wait after load for dynamic content. */
  wait?: number;
}

/** Result returned by scrape(). */
interface ScrapeResult {
  /** Page title (Readability's title in 'main' mode, else document title). */
  title: string;
  /** Final URL after any redirects. */
  url: string;
  /** Converted markdown, including the metadata comment header. */
  markdown: string;
  /** Author byline — only populated in 'main' mode via Readability. */
  byline?: string;
  /** Article excerpt — only populated in 'main' mode via Readability. */
  excerpt?: string;
}
/**
 * Build a TurndownService configured for GitHub Flavored Markdown output.
 *
 * @param options.includeLinks  keep <a> hrefs; when false, links are replaced by their text
 * @param options.includeTables keep <table> elements; when false, tables are dropped entirely
 * @param options.includeImages keep <img> elements; when false, images are dropped entirely
 * @returns a configured TurndownService instance
 */
function createTurndownService(options: {
  includeLinks?: boolean;
  includeTables?: boolean;
  includeImages?: boolean;
}): TurndownService {
  const turndown = new TurndownService({
    headingStyle: 'atx',
    hr: '---',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced',
    fence: '```',
    emDelimiter: '*',
    strongDelimiter: '**',
    linkStyle: 'inlined',
  });
  // Add GFM support (tables, strikethrough, task lists)
  turndown.use(turndownPluginGfm.gfm);
  // Custom rule for code blocks with language detection
  turndown.addRule('codeBlockWithLanguage', {
    filter: (node) => {
      return (
        node.nodeName === 'PRE' &&
        node.firstChild?.nodeName === 'CODE'
      );
    },
    replacement: (_content, node) => {
      const codeNode = node.firstChild as HTMLElement;
      // Common convention: class="language-xyz" on the inner <code> element.
      const className = codeNode.getAttribute('class') || '';
      const langMatch = className.match(/language-(\w+)/);
      const lang = langMatch ? langMatch[1] : '';
      const code = codeNode.textContent || '';
      return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
    },
  });
  // Remove images if not included
  if (!options.includeImages) {
    turndown.addRule('removeImages', {
      filter: 'img',
      replacement: () => '',
    });
  }
  // Remove links but keep text if not included
  if (!options.includeLinks) {
    turndown.addRule('removeLinks', {
      filter: 'a',
      replacement: (content) => content,
    });
  }
  // Fix: includeTables was previously accepted but never used — the GFM plugin
  // always converted tables regardless. Drop <table> elements entirely when
  // tables are not requested, mirroring the image handling above.
  if (!options.includeTables) {
    turndown.remove('table');
  }
  // Remove script, style, nav, footer, aside elements
  turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
  return turndown;
}
// Extract main content using Readability
function extractMainContent(html: string, url: string): {
content: string;
title: string;
byline?: string;
excerpt?: string;
} {
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
throw new Error('Could not extract main content from page');
}
return {
content: article.content,
title: article.title,
byline: article.byline || undefined,
excerpt: article.excerpt || undefined,
};
}
// Scrape a URL and return markdown
/**
 * Navigate to a URL, extract content per `options.mode`, and convert it to
 * GFM markdown with a metadata comment header prepended.
 *
 * Modes:
 *  - 'main'     — Readability extraction of the article body
 *  - 'selector' — innerHTML of the first element matching `options.selector`
 *  - 'full'     — page body with common boilerplate elements removed
 *
 * The browser is always closed before returning (see `finally`).
 *
 * @param options scrape configuration; see ScrapeOptions
 * @returns the scrape result; also writes the markdown to `options.output` if set
 * @throws Error on navigation failure, missing selector (selector mode),
 *         or failed Readability extraction (main mode)
 */
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
  // getPage comes from ./browse.js — presumably launches the anti-detection
  // browser described in the file header; verify against that module.
  const { page, browser } = await getPage({ headless: options.headless ?? true });
  try {
    // Navigate to URL
    console.log(`Navigating to: ${options.url}`);
    await page.goto(options.url, {
      timeout: 60000,
      waitUntil: 'domcontentloaded',
    });
    // Wait if specified
    if (options.wait) {
      console.log(`Waiting ${options.wait}ms for dynamic content...`);
      await page.waitForTimeout(options.wait);
    }
    const pageTitle = await page.title();
    // page.url() reflects the final URL after redirects.
    const pageUrl = page.url();
    let html: string;
    let title = pageTitle;
    let byline: string | undefined;
    let excerpt: string | undefined;
    // Get HTML based on mode
    switch (options.mode) {
      case 'main': {
        // Get full page HTML and extract with Readability
        const fullHtml = await page.content();
        const extracted = extractMainContent(fullHtml, pageUrl);
        html = extracted.content;
        // Fall back to the document title when Readability finds none.
        title = extracted.title || pageTitle;
        byline = extracted.byline;
        excerpt = extracted.excerpt;
        break;
      }
      case 'selector': {
        if (!options.selector) {
          throw new Error('Selector mode requires --selector option');
        }
        // Only the first match is used.
        const element = await page.$(options.selector);
        if (!element) {
          throw new Error(`Selector not found: ${options.selector}`);
        }
        html = await element.innerHTML();
        break;
      }
      case 'full':
      default: {
        // Get body content, excluding common non-content elements
        // NOTE: this mutates the live DOM in the page; acceptable because
        // nothing else reads the page after this point.
        html = await page.evaluate(() => {
          // Remove common non-content elements
          const selectorsToRemove = [
            'script', 'style', 'noscript', 'iframe',
            'nav', 'header', 'footer', '.cookie-banner',
            '.advertisement', '.ads', '#ads', '.social-share',
            '.comments', '#comments', '.sidebar'
          ];
          selectorsToRemove.forEach(selector => {
            document.querySelectorAll(selector).forEach(el => el.remove());
          });
          return document.body.innerHTML;
        });
        break;
      }
    }
    // Convert to markdown
    const turndown = createTurndownService({
      includeLinks: options.includeLinks ?? true,
      includeTables: options.includeTables ?? true,
      includeImages: options.includeImages ?? false,
    });
    let markdown = turndown.turndown(html);
    // Add title as H1 if not already present
    if (!markdown.startsWith('# ')) {
      markdown = `# ${title}\n\n${markdown}`;
    }
    // Add metadata header (HTML comments, invisible in rendered markdown)
    const metadataLines = [
      `<!-- Scraped from: ${pageUrl} -->`,
      byline ? `<!-- Author: ${byline} -->` : null,
      excerpt ? `<!-- Excerpt: ${excerpt} -->` : null,
      `<!-- Scraped at: ${new Date().toISOString()} -->`,
      '',
    ].filter(Boolean);
    markdown = metadataLines.join('\n') + '\n' + markdown;
    // Clean up excessive whitespace: cap blank runs, strip trailing spaces
    markdown = markdown
      .replace(/\n{4,}/g, '\n\n\n')
      .replace(/[ \t]+$/gm, '')
      .trim();
    const result: ScrapeResult = {
      title,
      url: pageUrl,
      markdown,
      byline,
      excerpt,
    };
    // Save to file if output specified
    if (options.output) {
      writeFileSync(options.output, markdown, 'utf-8');
      console.log(`Markdown saved to: ${options.output}`);
    }
    return result;
  } finally {
    // Always release the browser, even on navigation/extraction errors.
    await browser.close();
  }
}
// CLI entry point
/**
 * Parse CLI flags, validate them, run scrape(), and print a summary.
 * Exits non-zero on missing --url, invalid --mode, invalid --wait, or
 * scrape failure.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2), {
    // 'wait' is declared as a string so minimist hands it over verbatim;
    // it is parsed and validated with parseInt below.
    string: ['url', 'mode', 'selector', 'output', 'wait'],
    boolean: ['headless', 'links', 'tables', 'images', 'help'],
    default: {
      mode: 'main',
      headless: true,
      links: true,
      tables: true,
      images: false,
    },
    alias: {
      u: 'url',
      m: 'mode',
      s: 'selector',
      o: 'output',
      h: 'help',
    },
  });
  if (args.help || !args.url) {
    // Fix: the help previously documented "--headless <bool>", but minimist
    // boolean flags do not take a value argument — the working form is the
    // --no-<flag> negation, which is what is documented now.
    console.log(`
Web Scraper - Extract content to Markdown

Usage:
  npx tsx scrape.ts --url <url> [options]

Options:
  -u, --url <url>       URL to scrape (required)
  -m, --mode <mode>     Scrape mode: main, full, or selector (default: main)
  -s, --selector <sel>  CSS selector for selector mode
  -o, --output <path>   Output file path for markdown
  --no-headless         Show the browser window (headless is the default)
  --wait <ms>           Wait time for dynamic content
  --links               Include links in output (default: true; --no-links to strip)
  --tables              Include tables in output (default: true; --no-tables to strip)
  --images              Include images in output (default: false)
  -h, --help            Show this help message

Scrape Modes:
  main      Extract main article content using Readability (best for articles)
  full      Full page content with common elements removed
  selector  Extract specific element by CSS selector

Examples:
  npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
  npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
  npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
  npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md

Output Format:
  - GitHub Flavored Markdown (tables, strikethrough, task lists)
  - Proper heading hierarchy
  - Code blocks with language detection
  - Metadata comments at top (source URL, date)
`);
    // Help requested → success; missing required --url → usage error.
    process.exit(args.help ? 0 : 1);
  }
  const mode = args.mode as ScrapeMode;
  if (!['main', 'full', 'selector'].includes(mode)) {
    console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
    process.exit(1);
  }
  // Fix: validate --wait up front so a non-numeric value fails fast instead
  // of passing NaN into page.waitForTimeout.
  const wait = args.wait ? parseInt(args.wait, 10) : undefined;
  if (wait !== undefined && Number.isNaN(wait)) {
    console.error(`Invalid --wait value: ${args.wait}. Must be milliseconds.`);
    process.exit(1);
  }
  try {
    const result = await scrape({
      url: args.url,
      mode,
      selector: args.selector,
      output: args.output,
      includeLinks: args.links,
      includeTables: args.tables,
      includeImages: args.images,
      headless: args.headless,
      wait,
    });
    // Print result summary
    console.log(`\nScrape complete:`);
    console.log(`  Title: ${result.title}`);
    console.log(`  URL: ${result.url}`);
    if (result.byline) console.log(`  Author: ${result.byline}`);
    console.log(`  Markdown length: ${result.markdown.length} chars`);
    // Print markdown if not saved to file
    if (!args.output) {
      console.log('\n--- Markdown Output ---\n');
      console.log(result.markdown);
    }
  } catch (error) {
    console.error('Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  }
}
// Run the CLI only when this file is the entry script, not when imported
// as a library (scrape is exported above for programmatic use).
const invokedScript = process.argv[1] ?? '';
if (invokedScript.includes('scrape.ts')) {
  void main();
}