352 lines
9.7 KiB
TypeScript
352 lines
9.7 KiB
TypeScript
#!/usr/bin/env npx tsx
/**
 * Web scraper that extracts page content to Markdown.
 *
 * Usage:
 *   npx tsx scrape.ts --url "https://example.com" --mode main
 *   npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
 *   npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
 */
|
|
|
|
import TurndownService from 'turndown';
|
|
import * as turndownPluginGfm from 'turndown-plugin-gfm';
|
|
import { Readability } from '@mozilla/readability';
|
|
import { JSDOM } from 'jsdom';
|
|
import { writeFileSync } from 'fs';
|
|
import parseArgs from 'minimist';
|
|
import { getPage } from './browse.js';
|
|
|
|
// Types
|
|
type ScrapeMode = 'main' | 'full' | 'selector';
|
|
|
|
interface ScrapeOptions {
|
|
url: string;
|
|
mode: ScrapeMode;
|
|
selector?: string;
|
|
output?: string;
|
|
includeLinks?: boolean;
|
|
includeTables?: boolean;
|
|
includeImages?: boolean;
|
|
headless?: boolean;
|
|
wait?: number;
|
|
}
|
|
|
|
interface ScrapeResult {
|
|
title: string;
|
|
url: string;
|
|
markdown: string;
|
|
byline?: string;
|
|
excerpt?: string;
|
|
}
|
|
|
|
// Configure Turndown for markdown conversion
|
|
function createTurndownService(options: {
|
|
includeLinks?: boolean;
|
|
includeTables?: boolean;
|
|
includeImages?: boolean;
|
|
}): TurndownService {
|
|
const turndown = new TurndownService({
|
|
headingStyle: 'atx',
|
|
hr: '---',
|
|
bulletListMarker: '-',
|
|
codeBlockStyle: 'fenced',
|
|
fence: '```',
|
|
emDelimiter: '*',
|
|
strongDelimiter: '**',
|
|
linkStyle: 'inlined',
|
|
});
|
|
|
|
// Add GFM support (tables, strikethrough, task lists)
|
|
turndown.use(turndownPluginGfm.gfm);
|
|
|
|
// Custom rule for code blocks with language detection
|
|
turndown.addRule('codeBlockWithLanguage', {
|
|
filter: (node) => {
|
|
return (
|
|
node.nodeName === 'PRE' &&
|
|
node.firstChild?.nodeName === 'CODE'
|
|
);
|
|
},
|
|
replacement: (_content, node) => {
|
|
const codeNode = node.firstChild as HTMLElement;
|
|
const className = codeNode.getAttribute('class') || '';
|
|
const langMatch = className.match(/language-(\w+)/);
|
|
const lang = langMatch ? langMatch[1] : '';
|
|
const code = codeNode.textContent || '';
|
|
return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
|
|
},
|
|
});
|
|
|
|
// Remove images if not included
|
|
if (!options.includeImages) {
|
|
turndown.addRule('removeImages', {
|
|
filter: 'img',
|
|
replacement: () => '',
|
|
});
|
|
}
|
|
|
|
// Remove links but keep text if not included
|
|
if (!options.includeLinks) {
|
|
turndown.addRule('removeLinks', {
|
|
filter: 'a',
|
|
replacement: (content) => content,
|
|
});
|
|
}
|
|
|
|
// Remove script, style, nav, footer, aside elements
|
|
turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
|
|
|
|
return turndown;
|
|
}
|
|
|
|
// Extract main content using Readability
|
|
function extractMainContent(html: string, url: string): {
|
|
content: string;
|
|
title: string;
|
|
byline?: string;
|
|
excerpt?: string;
|
|
} {
|
|
const dom = new JSDOM(html, { url });
|
|
const reader = new Readability(dom.window.document);
|
|
const article = reader.parse();
|
|
|
|
if (!article) {
|
|
throw new Error('Could not extract main content from page');
|
|
}
|
|
|
|
return {
|
|
content: article.content,
|
|
title: article.title,
|
|
byline: article.byline || undefined,
|
|
excerpt: article.excerpt || undefined,
|
|
};
|
|
}
|
|
|
|
// Scrape a URL and return markdown
|
|
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
|
|
const { page, browser } = await getPage({ headless: options.headless ?? true });
|
|
|
|
try {
|
|
// Navigate to URL
|
|
console.log(`Navigating to: ${options.url}`);
|
|
await page.goto(options.url, {
|
|
timeout: 60000,
|
|
waitUntil: 'domcontentloaded',
|
|
});
|
|
|
|
// Wait if specified
|
|
if (options.wait) {
|
|
console.log(`Waiting ${options.wait}ms for dynamic content...`);
|
|
await page.waitForTimeout(options.wait);
|
|
}
|
|
|
|
const pageTitle = await page.title();
|
|
const pageUrl = page.url();
|
|
|
|
let html: string;
|
|
let title = pageTitle;
|
|
let byline: string | undefined;
|
|
let excerpt: string | undefined;
|
|
|
|
// Get HTML based on mode
|
|
switch (options.mode) {
|
|
case 'main': {
|
|
// Get full page HTML and extract with Readability
|
|
const fullHtml = await page.content();
|
|
const extracted = extractMainContent(fullHtml, pageUrl);
|
|
html = extracted.content;
|
|
title = extracted.title || pageTitle;
|
|
byline = extracted.byline;
|
|
excerpt = extracted.excerpt;
|
|
break;
|
|
}
|
|
|
|
case 'selector': {
|
|
if (!options.selector) {
|
|
throw new Error('Selector mode requires --selector option');
|
|
}
|
|
const element = await page.$(options.selector);
|
|
if (!element) {
|
|
throw new Error(`Selector not found: ${options.selector}`);
|
|
}
|
|
html = await element.innerHTML();
|
|
break;
|
|
}
|
|
|
|
case 'full':
|
|
default: {
|
|
// Get body content, excluding common non-content elements
|
|
html = await page.evaluate(() => {
|
|
// Remove common non-content elements
|
|
const selectorsToRemove = [
|
|
'script', 'style', 'noscript', 'iframe',
|
|
'nav', 'header', 'footer', '.cookie-banner',
|
|
'.advertisement', '.ads', '#ads', '.social-share',
|
|
'.comments', '#comments', '.sidebar'
|
|
];
|
|
|
|
selectorsToRemove.forEach(selector => {
|
|
document.querySelectorAll(selector).forEach(el => el.remove());
|
|
});
|
|
|
|
return document.body.innerHTML;
|
|
});
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Convert to markdown
|
|
const turndown = createTurndownService({
|
|
includeLinks: options.includeLinks ?? true,
|
|
includeTables: options.includeTables ?? true,
|
|
includeImages: options.includeImages ?? false,
|
|
});
|
|
|
|
let markdown = turndown.turndown(html);
|
|
|
|
// Add title as H1 if not already present
|
|
if (!markdown.startsWith('# ')) {
|
|
markdown = `# ${title}\n\n${markdown}`;
|
|
}
|
|
|
|
// Add metadata header
|
|
const metadataLines = [
|
|
`<!-- Scraped from: ${pageUrl} -->`,
|
|
byline ? `<!-- Author: ${byline} -->` : null,
|
|
excerpt ? `<!-- Excerpt: ${excerpt} -->` : null,
|
|
`<!-- Scraped at: ${new Date().toISOString()} -->`,
|
|
'',
|
|
].filter(Boolean);
|
|
|
|
markdown = metadataLines.join('\n') + '\n' + markdown;
|
|
|
|
// Clean up excessive whitespace
|
|
markdown = markdown
|
|
.replace(/\n{4,}/g, '\n\n\n')
|
|
.replace(/[ \t]+$/gm, '')
|
|
.trim();
|
|
|
|
const result: ScrapeResult = {
|
|
title,
|
|
url: pageUrl,
|
|
markdown,
|
|
byline,
|
|
excerpt,
|
|
};
|
|
|
|
// Save to file if output specified
|
|
if (options.output) {
|
|
writeFileSync(options.output, markdown, 'utf-8');
|
|
console.log(`Markdown saved to: ${options.output}`);
|
|
}
|
|
|
|
return result;
|
|
} finally {
|
|
await browser.close();
|
|
}
|
|
}
|
|
|
|
// CLI entry point
|
|
async function main() {
|
|
const args = parseArgs(process.argv.slice(2), {
|
|
string: ['url', 'mode', 'selector', 'output'],
|
|
boolean: ['headless', 'links', 'tables', 'images', 'help'],
|
|
default: {
|
|
mode: 'main',
|
|
headless: true,
|
|
links: true,
|
|
tables: true,
|
|
images: false,
|
|
},
|
|
alias: {
|
|
u: 'url',
|
|
m: 'mode',
|
|
s: 'selector',
|
|
o: 'output',
|
|
h: 'help',
|
|
},
|
|
});
|
|
|
|
if (args.help || !args.url) {
|
|
console.log(`
|
|
Web Scraper - Extract content to Markdown
|
|
|
|
Usage:
|
|
npx tsx scrape.ts --url <url> [options]
|
|
|
|
Options:
|
|
-u, --url <url> URL to scrape (required)
|
|
-m, --mode <mode> Scrape mode: main, full, or selector (default: main)
|
|
-s, --selector <sel> CSS selector for selector mode
|
|
-o, --output <path> Output file path for markdown
|
|
--headless <bool> Run in headless mode (default: true)
|
|
--wait <ms> Wait time for dynamic content
|
|
--links Include links in output (default: true)
|
|
--tables Include tables in output (default: true)
|
|
--images Include images in output (default: false)
|
|
-h, --help Show this help message
|
|
|
|
Scrape Modes:
|
|
main Extract main article content using Readability (best for articles)
|
|
full Full page content with common elements removed
|
|
selector Extract specific element by CSS selector
|
|
|
|
Examples:
|
|
npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
|
|
npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
|
|
npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
|
|
npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md
|
|
|
|
Output Format:
|
|
- GitHub Flavored Markdown (tables, strikethrough, task lists)
|
|
- Proper heading hierarchy
|
|
- Code blocks with language detection
|
|
- Metadata comments at top (source URL, date)
|
|
`);
|
|
process.exit(args.help ? 0 : 1);
|
|
}
|
|
|
|
const mode = args.mode as ScrapeMode;
|
|
if (!['main', 'full', 'selector'].includes(mode)) {
|
|
console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
|
|
process.exit(1);
|
|
}
|
|
|
|
try {
|
|
const result = await scrape({
|
|
url: args.url,
|
|
mode,
|
|
selector: args.selector,
|
|
output: args.output,
|
|
includeLinks: args.links,
|
|
includeTables: args.tables,
|
|
includeImages: args.images,
|
|
headless: args.headless,
|
|
wait: args.wait ? parseInt(args.wait, 10) : undefined,
|
|
});
|
|
|
|
// Print result summary
|
|
console.log(`\nScrape complete:`);
|
|
console.log(` Title: ${result.title}`);
|
|
console.log(` URL: ${result.url}`);
|
|
if (result.byline) console.log(` Author: ${result.byline}`);
|
|
console.log(` Markdown length: ${result.markdown.length} chars`);
|
|
|
|
// Print markdown if not saved to file
|
|
if (!args.output) {
|
|
console.log('\n--- Markdown Output ---\n');
|
|
console.log(result.markdown);
|
|
}
|
|
} catch (error) {
|
|
console.error('Error:', error instanceof Error ? error.message : error);
|
|
process.exit(1);
|
|
}
|
|
}
|
|
|
|
// Run if executed directly
|
|
const isMainModule = process.argv[1]?.includes('scrape.ts');
|
|
if (isMainModule) {
|
|
main();
|
|
}
|