#!/usr/bin/env npx tsx
/**
 * Web scraper that extracts content to markdown
 *
 * Usage:
 *   npx tsx scrape.ts --url "https://example.com" --mode main
 *   npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
 *   npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
 */

import TurndownService from 'turndown';
import * as turndownPluginGfm from 'turndown-plugin-gfm';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { writeFileSync } from 'fs';
import parseArgs from 'minimist';
import { getPage } from './browse.js';

// Types

/** The scrape modes accepted by `--mode`; kept as a value so the CLI can validate against it. */
const SCRAPE_MODES = ['main', 'full', 'selector'] as const;

type ScrapeMode = (typeof SCRAPE_MODES)[number];

interface ScrapeOptions {
  url: string;
  mode: ScrapeMode;
  selector?: string;
  output?: string;
  includeLinks?: boolean;
  includeTables?: boolean;
  includeImages?: boolean;
  headless?: boolean;
  wait?: number;
}

interface ScrapeResult {
  title: string;
  url: string;
  markdown: string;
  byline?: string;
  excerpt?: string;
}

/** Type guard narrowing an arbitrary CLI value to a supported scrape mode. */
function isScrapeMode(value: unknown): value is ScrapeMode {
  return (
    typeof value === 'string' &&
    (SCRAPE_MODES as readonly string[]).includes(value)
  );
}

/**
 * Build a Turndown instance configured for GitHub Flavored Markdown output.
 *
 * @param options.includeLinks  When falsy, `<a>` elements are replaced by their text content.
 * @param options.includeTables When explicitly `false`, `<table>` elements are dropped.
 *                              `undefined` keeps tables (the previous behaviour).
 * @param options.includeImages When falsy, `<img>` elements are dropped.
 * @returns The configured Turndown service.
 */
function createTurndownService(options: {
  includeLinks?: boolean;
  includeTables?: boolean;
  includeImages?: boolean;
}): TurndownService {
  const turndown = new TurndownService({
    headingStyle: 'atx',
    hr: '---',
    bulletListMarker: '-',
    codeBlockStyle: 'fenced',
    fence: '```',
    emDelimiter: '*',
    strongDelimiter: '**',
    linkStyle: 'inlined',
  });

  // Add GFM support (tables, strikethrough, task lists)
  turndown.use(turndownPluginGfm.gfm);

  // Custom rule for code blocks with language detection
  turndown.addRule('codeBlockWithLanguage', {
    filter: (node) =>
      node.nodeName === 'PRE' && node.firstChild?.nodeName === 'CODE',
    replacement: (_content, node) => {
      // The filter above guarantees the first child is the <code> element.
      const codeNode = node.firstChild as HTMLElement;
      const className = codeNode.getAttribute('class') || '';
      const lang = className.match(/language-(\w+)/)?.[1] ?? '';
      // Drop one trailing newline so the closing fence is not preceded by an
      // empty line (same normalisation Turndown's built-in fenced rule applies).
      const code = (codeNode.textContent || '').replace(/\n$/, '');
      return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
    },
  });

  // Remove images if not included
  if (!options.includeImages) {
    turndown.addRule('removeImages', {
      filter: 'img',
      replacement: () => '',
    });
  }

  // Remove tables if explicitly excluded. Registered after the GFM plugin so
  // this rule is consulted before the plugin's own table handling.
  if (options.includeTables === false) {
    turndown.addRule('removeTables', {
      filter: 'table',
      replacement: () => '',
    });
  }

  // Remove links but keep text if not included
  if (!options.includeLinks) {
    turndown.addRule('removeLinks', {
      filter: 'a',
      replacement: (content) => content,
    });
  }

  // Remove script, style, nav, footer, aside elements
  turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);

  return turndown;
}

/**
 * Extract the main article content of a page using Readability.
 *
 * @param html Full HTML of the page.
 * @param url  URL handed to JSDOM as the document URL.
 * @returns The article HTML and title, plus byline/excerpt when Readability reports them.
 * @throws Error when Readability cannot parse an article out of the document.
 */
function extractMainContent(
  html: string,
  url: string
): {
  content: string;
  title: string;
  byline?: string;
  excerpt?: string;
} {
  const dom = new JSDOM(html, { url });
  const article = new Readability(dom.window.document).parse();

  if (!article) {
    throw new Error('Could not extract main content from page');
  }

  return {
    // `?? ''` keeps this well-typed if the library declares these fields nullable.
    content: article.content ?? '',
    title: article.title ?? '',
    byline: article.byline || undefined,
    excerpt: article.excerpt || undefined,
  };
}

/**
 * Render one metadata entry as a single-line HTML comment.
 * Whitespace runs are collapsed and any `-->` in the value is neutralised so
 * the value cannot terminate the comment early.
 */
function toHtmlComment(label: string, value: string): string {
  const safeValue = value
    .replace(/\s+/g, ' ')
    .trim()
    .replace(/-->/g, '--&gt;');
  return `<!-- ${label}: ${safeValue} -->`;
}

/**
 * Build the metadata header placed at the top of the markdown: source URL,
 * optional author and excerpt, and the scrape timestamp (ISO 8601), followed
 * by one blank line.
 */
function buildMetadataHeader(meta: {
  url: string;
  byline?: string;
  excerpt?: string;
}): string {
  const lines = [toHtmlComment('Source', meta.url)];
  if (meta.byline) {
    lines.push(toHtmlComment('Author', meta.byline));
  }
  if (meta.excerpt) {
    lines.push(toHtmlComment('Excerpt', meta.excerpt));
  }
  lines.push(toHtmlComment('Scraped', new Date().toISOString()));
  return `${lines.join('\n')}\n\n`;
}

/**
 * Scrape a URL and return its content as markdown.
 *
 * Opens a page via `getPage`, navigates to `options.url`, optionally waits
 * `options.wait` milliseconds, selects HTML according to `options.mode`,
 * converts it to markdown, and writes it to `options.output` when given.
 * The browser is always closed before returning or throwing.
 *
 * @param options Scrape configuration; see `ScrapeOptions`.
 * @returns Title, final page URL, markdown and (in `main` mode) byline/excerpt.
 * @throws Error in `selector` mode when no selector is given or nothing matches,
 *         and in `main` mode when no article can be extracted.
 */
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
  // NOTE(review): `page`/`browser` look like Playwright objects — confirm in ./browse.js.
  const { page, browser } = await getPage({
    headless: options.headless ?? true,
  });

  try {
    // Navigate to URL
    console.log(`Navigating to: ${options.url}`);
    await page.goto(options.url, {
      timeout: 60000,
      waitUntil: 'domcontentloaded',
    });

    // Wait if specified
    if (options.wait) {
      console.log(`Waiting ${options.wait}ms for dynamic content...`);
      await page.waitForTimeout(options.wait);
    }

    const pageTitle = await page.title();
    const pageUrl = page.url();

    let html: string;
    let title = pageTitle;
    let byline: string | undefined;
    let excerpt: string | undefined;

    // Get HTML based on mode
    switch (options.mode) {
      case 'main': {
        // Get full page HTML and extract with Readability
        const fullHtml = await page.content();
        const extracted = extractMainContent(fullHtml, pageUrl);
        html = extracted.content;
        title = extracted.title || pageTitle;
        byline = extracted.byline;
        excerpt = extracted.excerpt;
        break;
      }

      case 'selector': {
        if (!options.selector) {
          throw new Error('Selector mode requires --selector option');
        }
        const element = await page.$(options.selector);
        if (!element) {
          throw new Error(`Selector not found: ${options.selector}`);
        }
        html = await element.innerHTML();
        break;
      }

      case 'full':
      default: {
        // Get body content, excluding common non-content elements.
        // The callback is handed to page.evaluate and mutates the loaded
        // document (removes the matched elements) before reading the body.
        html = await page.evaluate(() => {
          const selectorsToRemove = [
            'script',
            'style',
            'noscript',
            'iframe',
            'nav',
            'header',
            'footer',
            '.cookie-banner',
            '.advertisement',
            '.ads',
            '#ads',
            '.social-share',
            '.comments',
            '#comments',
            '.sidebar',
          ];
          selectorsToRemove.forEach((selector) => {
            document.querySelectorAll(selector).forEach((el) => el.remove());
          });
          return document.body.innerHTML;
        });
        break;
      }
    }

    // Convert to markdown
    const turndown = createTurndownService({
      includeLinks: options.includeLinks ?? true,
      includeTables: options.includeTables ?? true,
      includeImages: options.includeImages ?? false,
    });

    let markdown = turndown.turndown(html);

    // Add title as H1 if not already present
    if (!markdown.startsWith('# ')) {
      markdown = `# ${title}\n\n${markdown}`;
    }

    // Add metadata header
    markdown = buildMetadataHeader({ url: pageUrl, byline, excerpt }) + markdown;

    // Clean up excessive whitespace
    markdown = markdown
      .replace(/\n{4,}/g, '\n\n\n')
      .replace(/[ \t]+$/gm, '')
      .trim();

    const result: ScrapeResult = {
      title,
      url: pageUrl,
      markdown,
      byline,
      excerpt,
    };

    // Save to file if output specified
    if (options.output) {
      writeFileSync(options.output, markdown, 'utf-8');
      console.log(`Markdown saved to: ${options.output}`);
    }

    return result;
  } finally {
    await browser.close();
  }
}

/**
 * CLI entry point: parses arguments, prints help when requested or when
 * `--url` is missing, runs the scrape, and prints a summary (plus the
 * markdown itself when no output file was given). Exits with status 1 on
 * invalid input or scrape failure.
 */
async function main() {
  const args = parseArgs(process.argv.slice(2), {
    string: ['url', 'mode', 'selector', 'output'],
    boolean: ['headless', 'links', 'tables', 'images', 'help'],
    default: {
      mode: 'main',
      headless: true,
      links: true,
      tables: true,
      images: false,
    },
    alias: {
      u: 'url',
      m: 'mode',
      s: 'selector',
      o: 'output',
      h: 'help',
    },
  });

  if (args.help || !args.url) {
    console.log(`
Web Scraper - Extract content to Markdown

Usage:
  npx tsx scrape.ts --url <url> [options]

Options:
  -u, --url <url>        URL to scrape (required)
  -m, --mode <mode>      Scrape mode: main, full, or selector (default: main)
  -s, --selector <css>   CSS selector for selector mode
  -o, --output <file>    Output file path for markdown
  --headless             Run in headless mode (default: true)
  --wait <ms>            Wait time for dynamic content
  --links                Include links in output (default: true)
  --tables               Include tables in output (default: true)
  --images               Include images in output (default: false)
  -h, --help             Show this help message

Scrape Modes:
  main      Extract main article content using Readability (best for articles)
  full      Full page content with common elements removed
  selector  Extract specific element by CSS selector

Examples:
  npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
  npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
  npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
  npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md

Output Format:
  - GitHub Flavored Markdown (tables, strikethrough, task lists)
  - Proper heading hierarchy
  - Code blocks with language detection
  - Metadata comments at top (source URL, date)
`);
    // Explicit --help is a success; a missing --url is a usage error.
    process.exit(args.help ? 0 : 1);
  }

  const mode: unknown = args.mode;
  if (!isScrapeMode(mode)) {
    console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
    process.exit(1);
  }

  try {
    const result = await scrape({
      url: args.url,
      mode,
      selector: args.selector,
      output: args.output,
      includeLinks: args.links,
      includeTables: args.tables,
      includeImages: args.images,
      headless: args.headless,
      // A non-numeric value parses to NaN, which scrape() treats as "no wait".
      wait: args.wait ? parseInt(args.wait, 10) : undefined,
    });

    // Print result summary
    console.log(`\nScrape complete:`);
    console.log(`  Title: ${result.title}`);
    console.log(`  URL: ${result.url}`);
    if (result.byline) console.log(`  Author: ${result.byline}`);
    console.log(`  Markdown length: ${result.markdown.length} chars`);

    // Print markdown if not saved to file
    if (!args.output) {
      console.log('\n--- Markdown Output ---\n');
      console.log(result.markdown);
    }
  } catch (error) {
    console.error('Error:', error instanceof Error ? error.message : error);
    process.exit(1);
  }
}

// Run if executed directly
const isMainModule = process.argv[1]?.includes('scrape.ts');
if (isMainModule) {
  void main();
}