Files
ai-coding-skills/skills/web-automation/pi/scripts/scrape.ts
T
stefano 251148c3ff
check / check (ubuntu-latest) (push) Successful in 2m5s
check / check (macos-latest) (push) Has been cancelled
check-online / check-online (ubuntu-latest) (push) Successful in 1m53s
Perform code optimization and document cleanup (#1)
## Summary
- add repository-wide quality tooling and verification scaffolding, including CI workflows, pnpm workspace setup, ESLint/Prettier/markdown checks, and generated-output verification helpers
- reorganize skill sources and generation flow by introducing canonical `_source` variants, generator/manifests, reusable helper abstractions, and shared web-automation/browser utilities
- clean up and expand documentation so the root README flows into docs and skill docs, with clearer development, reviewer, installer, and workflow guidance

## Notable changes
- docs flow and consistency cleanup across `README.md`, `docs/README.md`, and related docs
- new scripts for `check`, docs verification, generated-file verification, shell portability, and safe directory replacement
- refactors in Atlassian and web-automation skill runtimes to reduce duplication and centralize reusable code
- changelog, development documentation, and CI surface updates

## Test Plan
- [ ] `pnpm run check`
- [ ] review generated/manifests and skill sync outputs
- [ ] smoke-check docs flow from `README.md` to `docs/README.md` to skill docs

## Notes
- this branch currently includes tracked `skills/web-automation/shared/node_modules` content that should be reviewed carefully as potentially noisy/accidental committed artifacts

Co-authored-by: Stefano Fiorini <stefano.fiorini@firsthorizon.com>
Reviewed-on: #1
2026-05-04 04:41:34 +00:00

353 lines
9.9 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env npx tsx
// ⚠️ GENERATED FILE do not edit directly. Edit the canonical source in skills/web-automation/shared/ and run `pnpm run sync:pi`.
/**
* Web scraper that extracts content to markdown
*
* Usage:
* npx tsx scrape.ts --url "https://example.com" --mode main
* npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
* npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".content"
*/
import TurndownService from 'turndown';
import * as turndownPluginGfm from 'turndown-plugin-gfm';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { writeFileSync } from 'fs';
import parseArgs from 'minimist';
import { getPage } from './browse.js';
// Types
type ScrapeMode = 'main' | 'full' | 'selector';
interface ScrapeOptions {
url: string;
mode: ScrapeMode;
selector?: string;
output?: string;
includeLinks?: boolean;
includeTables?: boolean;
includeImages?: boolean;
headless?: boolean;
wait?: number;
}
interface ScrapeResult {
title: string;
url: string;
markdown: string;
byline?: string;
excerpt?: string;
}
// Configure Turndown for markdown conversion
function createTurndownService(options: {
includeLinks?: boolean;
includeTables?: boolean;
includeImages?: boolean;
}): TurndownService {
const turndown = new TurndownService({
headingStyle: 'atx',
hr: '---',
bulletListMarker: '-',
codeBlockStyle: 'fenced',
fence: '```',
emDelimiter: '*',
strongDelimiter: '**',
linkStyle: 'inlined',
});
// Add GFM support (tables, strikethrough, task lists)
turndown.use(turndownPluginGfm.gfm);
// Custom rule for code blocks with language detection
turndown.addRule('codeBlockWithLanguage', {
filter: (node) => {
return (
node.nodeName === 'PRE' &&
node.firstChild?.nodeName === 'CODE'
);
},
replacement: (_content, node) => {
const codeNode = node.firstChild as HTMLElement;
const className = codeNode.getAttribute('class') || '';
const langMatch = className.match(/language-(\w+)/);
const lang = langMatch ? langMatch[1] : '';
const code = codeNode.textContent || '';
return `\n\n\`\`\`${lang}\n${code}\n\`\`\`\n\n`;
},
});
// Remove images if not included
if (!options.includeImages) {
turndown.addRule('removeImages', {
filter: 'img',
replacement: () => '',
});
}
// Remove links but keep text if not included
if (!options.includeLinks) {
turndown.addRule('removeLinks', {
filter: 'a',
replacement: (content) => content,
});
}
// Remove script, style, nav, footer, aside elements
turndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);
return turndown;
}
// Extract main content using Readability
function extractMainContent(html: string, url: string): {
content: string;
title: string;
byline?: string;
excerpt?: string;
} {
const dom = new JSDOM(html, { url });
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article) {
throw new Error('Could not extract main content from page');
}
return {
content: article.content,
title: article.title,
byline: article.byline || undefined,
excerpt: article.excerpt || undefined,
};
}
// Scrape a URL and return markdown
export async function scrape(options: ScrapeOptions): Promise<ScrapeResult> {
const { page, browser } = await getPage({ headless: options.headless ?? true });
try {
// Navigate to URL
console.log(`Navigating to: ${options.url}`);
await page.goto(options.url, {
timeout: 60000,
waitUntil: 'domcontentloaded',
});
// Wait if specified
if (options.wait) {
console.log(`Waiting ${options.wait}ms for dynamic content...`);
await page.waitForTimeout(options.wait);
}
const pageTitle = await page.title();
const pageUrl = page.url();
let html: string;
let title = pageTitle;
let byline: string | undefined;
let excerpt: string | undefined;
// Get HTML based on mode
switch (options.mode) {
case 'main': {
// Get full page HTML and extract with Readability
const fullHtml = await page.content();
const extracted = extractMainContent(fullHtml, pageUrl);
html = extracted.content;
title = extracted.title || pageTitle;
byline = extracted.byline;
excerpt = extracted.excerpt;
break;
}
case 'selector': {
if (!options.selector) {
throw new Error('Selector mode requires --selector option');
}
const element = await page.$(options.selector);
if (!element) {
throw new Error(`Selector not found: ${options.selector}`);
}
html = await element.innerHTML();
break;
}
case 'full':
default: {
// Get body content, excluding common non-content elements
html = await page.evaluate(() => {
// Remove common non-content elements
const selectorsToRemove = [
'script', 'style', 'noscript', 'iframe',
'nav', 'header', 'footer', '.cookie-banner',
'.advertisement', '.ads', '#ads', '.social-share',
'.comments', '#comments', '.sidebar'
];
selectorsToRemove.forEach(selector => {
document.querySelectorAll(selector).forEach(el => el.remove());
});
return document.body.innerHTML;
});
break;
}
}
// Convert to markdown
const turndown = createTurndownService({
includeLinks: options.includeLinks ?? true,
includeTables: options.includeTables ?? true,
includeImages: options.includeImages ?? false,
});
let markdown = turndown.turndown(html);
// Add title as H1 if not already present
if (!markdown.startsWith('# ')) {
markdown = `# ${title}\n\n${markdown}`;
}
// Add metadata header
const metadataLines = [
`<!-- Scraped from: ${pageUrl} -->`,
byline ? `<!-- Author: ${byline} -->` : null,
excerpt ? `<!-- Excerpt: ${excerpt} -->` : null,
`<!-- Scraped at: ${new Date().toISOString()} -->`,
'',
].filter(Boolean);
markdown = metadataLines.join('\n') + '\n' + markdown;
// Clean up excessive whitespace
markdown = markdown
.replace(/\n{4,}/g, '\n\n\n')
.replace(/[ \t]+$/gm, '')
.trim();
const result: ScrapeResult = {
title,
url: pageUrl,
markdown,
byline,
excerpt,
};
// Save to file if output specified
if (options.output) {
writeFileSync(options.output, markdown, 'utf-8');
console.log(`Markdown saved to: ${options.output}`);
}
return result;
} finally {
await browser.close();
}
}
// CLI entry point
async function main() {
const args = parseArgs(process.argv.slice(2), {
string: ['url', 'mode', 'selector', 'output'],
boolean: ['headless', 'links', 'tables', 'images', 'help'],
default: {
mode: 'main',
headless: true,
links: true,
tables: true,
images: false,
},
alias: {
u: 'url',
m: 'mode',
s: 'selector',
o: 'output',
h: 'help',
},
});
if (args.help || !args.url) {
console.log(`
Web Scraper - Extract content to Markdown
Usage:
npx tsx scrape.ts --url <url> [options]
Options:
-u, --url <url> URL to scrape (required)
-m, --mode <mode> Scrape mode: main, full, or selector (default: main)
-s, --selector <sel> CSS selector for selector mode
-o, --output <path> Output file path for markdown
--headless <bool> Run in headless mode (default: true)
--wait <ms> Wait time for dynamic content
--links Include links in output (default: true)
--tables Include tables in output (default: true)
--images Include images in output (default: false)
-h, --help Show this help message
Scrape Modes:
main Extract main article content using Readability (best for articles)
full Full page content with common elements removed
selector Extract specific element by CSS selector
Examples:
npx tsx scrape.ts --url "https://docs.example.com/guide" --mode main
npx tsx scrape.ts --url "https://example.com" --mode full --output page.md
npx tsx scrape.ts --url "https://example.com" --mode selector --selector ".api-docs"
npx tsx scrape.ts --url "https://example.com" --mode main --no-links --output clean.md
Output Format:
- GitHub Flavored Markdown (tables, strikethrough, task lists)
- Proper heading hierarchy
- Code blocks with language detection
- Metadata comments at top (source URL, date)
`);
process.exit(args.help ? 0 : 1);
}
const mode = args.mode as ScrapeMode;
if (!['main', 'full', 'selector'].includes(mode)) {
console.error(`Invalid mode: ${mode}. Must be main, full, or selector.`);
process.exit(1);
}
try {
const result = await scrape({
url: args.url,
mode,
selector: args.selector,
output: args.output,
includeLinks: args.links,
includeTables: args.tables,
includeImages: args.images,
headless: args.headless,
wait: args.wait ? parseInt(args.wait, 10) : undefined,
});
// Print result summary
console.log(`\nScrape complete:`);
console.log(` Title: ${result.title}`);
console.log(` URL: ${result.url}`);
if (result.byline) console.log(` Author: ${result.byline}`);
console.log(` Markdown length: ${result.markdown.length} chars`);
// Print markdown if not saved to file
if (!args.output) {
console.log('\n--- Markdown Output ---\n');
console.log(result.markdown);
}
} catch (error) {
console.error('Error:', error instanceof Error ? error.message : error);
process.exit(1);
}
}
// Run if executed directly
const isMainModule = process.argv[1]?.includes('scrape.ts');
if (isMainModule) {
main();
}