390 lines
12 KiB
JavaScript
390 lines
12 KiB
JavaScript
/**
 * Pokémon Play! Resources Scraper
 *
 * Downloads official tournament rules, resources, and documentation from pokemon.com
 * - PDFs: Downloads directly
 * - Videos: Saves video URLs to text files
 * - Web pages: Extracts and saves text content
 *
 * Usage:
 *   node code/utils/scrape-pokemon-resources.js
 *   npm run scrape:pokemon
 *
 * Output: docs/projects/pokemon-professor/Pokemon Rules & Resources/
 */
|
|
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import fs from 'fs';
|
|
import path from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import https from 'https';
|
|
import http from 'http';
|
|
|
|
// Register the stealth plugin before any browser is launched, to avoid bot detection
puppeteer.use(StealthPlugin());
|
|
|
|
// ES modules do not define __filename/__dirname, so derive them from import.meta.url
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Configuration

// Page that lists every rules/resources link the scraper looks for
const BASE_URL =
  'https://www.pokemon.com/us/play-pokemon/about/tournaments-rules-and-resources';

// Destination folder, resolved relative to this file (not the current working directory)
const OUTPUT_DIR = path.resolve(
  __dirname,
  '../../docs/projects/pokemon-professor/Pokemon Rules & Resources'
);
|
|
|
|
// Target resource names from the page.
// Each entry is compared case-insensitively against the link labels found on
// the page and is also used (after sanitizing) as the output filename.
// Frozen because it is shared, read-only configuration.
const TARGET_RESOURCES = Object.freeze([
  // Rules & Resources for All
  'Play! Pokémon Terms of Use',
  'Play! Pokémon Standards of Conduct',
  'Play! Pokémon Inclusion Policy',
  'Play! Pokémon Accessibility Policy',
  'Play! Pokémon Trainer Username and Team Name Policy',
  'Play! Pokémon Premier Events Sponsorship Policy',
  'Play! Pokémon Tournament Rules Handbook',
  'Play! Pokémon COVID-19 Protocols',
  'Play! Pokémon Attire and Cosplay Policy',
  'Play! Pokémon Penalty Guidelines',

  // Pokémon TCG Rules & Resources
  'Pokémon TCG Rulebook',
  'Play! Pokémon Deck List (8.5x11)',
  'Play! Pokémon Deck List (A4)',
  'TCG Errata',
  'Pokémon TCG Banned Card List',
  'Mega Evolution—Phantasmal Flames Banned List and Rule Changes Announcement',
  'Pokémon TCG Promo Card Legality Status',
  'Pokémon TCG Alternative Play Handbook',
  'Pokémon TCG Tournament Handbook',

  // Video Game Rules & Resources
  'Play! Pokémon Video Game Championships Tournament Handbook',
  'Pokémon Video Game Team List',

  // Pokémon GO Rules & Resources
  'Play! Pokémon Pokémon GO Tournament Handbook',
  'Pokémon GO Team List',
  'Play! Pokémon Pokémon GO Championship Series Banned Pokémon List',
  'Organizing Pokémon GO Events',

  // Pokémon UNITE Rules & Resources
  'Pokémon UNITE Championship Series Handbook',

  // Pokémon League Rules & Resources
  'Play! Pokémon Store Handbook',
  'Play! Pokémon League Challenges, Cups, and Prerelease Guide',
  'League Roster',
  'League Flyer',

  // Pokémon Club Rules & Resources
  'Pokémon Activity Sheets',

  // Further Resources for Players
  'World Championships Battle Dictionary',
  'Play! Pokémon Scholarship Program Terms and Conditions',
  'Championship Event Awards Disbursement Information',

  // Training Videos
  'League Management Demos',
  'Tournament Software and Reporting Events',
  'Championship Series Reporting',
  'TOM Training Videos',
  'Tools Overview',
  'Installation and Set-up',
  'Setting Up Your Tournament',
  'Tournament Detail Verification',
  'Running & Completing the Tournament',
  'Reporting Matches',
  'Adding Players'
]);
|
|
|
|
/**
 * Clean a resource name so it can be used as a filename on any common filesystem.
 *
 * - Characters reserved on Windows (< > : " / \ | ? *) become '-'.
 * - Runs of whitespace (including newlines and tabs) collapse to a single space.
 * - Remaining ASCII control characters are dropped; they are not valid in
 *   Windows filenames and are invisible everywhere else.
 * - An input that sanitizes to nothing yields 'untitled', so callers never
 *   produce a bare extension such as '.pdf'.
 *
 * @param {string} name - Original name
 * @returns {string} Safe, non-empty filename (without extension)
 */
function sanitizeFilename(name) {
  const cleaned = name
    .replace(/[<>:"/\\|?*]/g, '-')
    // Whitespace is collapsed before control characters are stripped so that
    // newlines and tabs still turn into spaces rather than vanishing.
    .replace(/\s+/g, ' ')
    .replace(/[\u0000-\u001f\u007f]/g, '')
    .trim();

  return cleaned === '' ? 'untitled' : cleaned;
}
|
|
|
|
/**
 * Download a PDF through the browser page and write it to disk.
 *
 * The request is issued with fetch() inside the page context using
 * `credentials: 'include'`, so the browser attaches the session's cookies
 * itself; nothing has to be copied out of the page on the Node side.
 *
 * @param {Page} page - Puppeteer page whose context performs the fetch
 * @param {string} url - File URL
 * @param {string} filepath - Destination path
 * @returns {Promise<void>}
 * @throws {Error} If the in-page fetch fails or returns a non-OK status, or if
 *   the downloaded bytes do not start with the '%PDF' signature (nothing is
 *   written to disk in that case)
 */
async function downloadFile(page, url, filepath) {
  // page.evaluate can only return serializable values, so the binary payload
  // is sent back as a plain array of byte values.
  const bytes = await page.evaluate(async downloadUrl => {
    const response = await fetch(downloadUrl, {
      method: 'GET',
      credentials: 'include'
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const arrayBuffer = await response.arrayBuffer();
    return Array.from(new Uint8Array(arrayBuffer));
  }, url);

  const pdfData = Buffer.from(bytes);

  // Verify it's actually a PDF; an HTML error or bot-check page would
  // otherwise be saved under a .pdf name.
  const header = pdfData.subarray(0, 5).toString();
  if (!header.startsWith('%PDF')) {
    throw new Error(`Downloaded file is not a PDF (got: ${header})`);
  }

  await fs.promises.writeFile(filepath, pdfData);
}
|
|
|
|
/**
 * Extract the readable text content from a web page.
 *
 * Runs inside the page context. Note that it mutates the live DOM: script,
 * style, nav, footer and header elements are removed before the text is read,
 * so the page should not be reused for anything that needs them afterwards.
 *
 * @param {Page} page - Puppeteer page
 * @returns {Promise<string>} Trimmed text of the first main/article/.content/
 *   #content element, or of the whole body when none of those exist
 */
async function extractPageText(page) {
  return page.evaluate(() => {
    // Drop non-content elements so they do not end up in the extracted text
    const boilerplate = document.querySelectorAll(
      'script, style, nav, footer, header'
    );
    boilerplate.forEach(el => el.remove());

    // Get main content
    const main =
      document.querySelector('main, article, .content, #content') ||
      document.body;

    // innerText only exists on HTML elements; fall back to textContent for
    // any other matched node instead of throwing on undefined.
    return (main.innerText ?? main.textContent ?? '').trim();
  });
}
|
|
|
|
/**
 * Pick the scraped link that best matches a target resource name.
 *
 * Matching is case-insensitive and uses the link's visible text, falling back
 * to its title and then its aria-label. Links without any label are ignored,
 * because an empty label is a substring of every target and would otherwise
 * match everything. Preference order:
 *   1. label equals the target,
 *   2. label contains the target,
 *   3. target contains the label - only when the label covers at least half
 *      of the target, and the longest such label wins, so that short generic
 *      links (e.g. a navigation entry) do not capture unrelated targets.
 *
 * @param {Array<{text?: string, title?: string, ariaLabel?: string, href: string}>} links
 * @param {string} targetName - Resource name to look for
 * @returns {object|undefined} The matching link, or undefined when none fits
 */
function findResourceLink(links, targetName) {
  const target = targetName.trim().toLowerCase();
  const candidates = links
    .map(link => ({
      link,
      label: (link.text || link.title || link.ariaLabel || '')
        .trim()
        .toLowerCase()
    }))
    .filter(({ label }) => label.length > 0);

  const exact = candidates.find(({ label }) => label === target);
  if (exact) {
    return exact.link;
  }

  const superset = candidates.find(({ label }) => label.includes(target));
  if (superset) {
    return superset.link;
  }

  const minLabelLength = Math.ceil(target.length / 2);
  let best;
  for (const candidate of candidates) {
    const { label } = candidate;
    if (label.length < minLabelLength || !target.includes(label)) {
      continue;
    }
    if (!best || label.length > best.label.length) {
      best = candidate;
    }
  }
  return best?.link;
}

/**
 * Tell whether a URL points at a PDF, ignoring any query string or fragment.
 *
 * @param {string} href - Absolute URL
 * @returns {boolean} True when the URL path ends in ".pdf" (case-insensitive)
 */
function isPdfUrl(href) {
  try {
    return new URL(href).pathname.toLowerCase().endsWith('.pdf');
  } catch {
    // Not parseable as a URL: fall back to a plain suffix check
    return href.toLowerCase().endsWith('.pdf');
  }
}

/**
 * Main scraping function.
 *
 * Loads the resources page, collects its links, and for every entry in
 * TARGET_RESOURCES either downloads the PDF, records the video URL in a text
 * file, or saves the linked page's text. Debug artifacts (a screenshot and
 * the page HTML) are written to the output directory as well.
 *
 * A failure on a single resource is logged and counted as skipped. A fatal
 * error is logged and sets a non-zero process exit code; the browser is
 * always closed before the function returns.
 *
 * @returns {Promise<void>}
 */
async function scrapeResources() {
  console.log('🚀 Starting Pokémon Play! Resources Scraper');
  console.log(`📁 Output directory: ${OUTPUT_DIR}\n`);

  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    console.log('✅ Created output directory\n');
  }

  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
      '--disable-web-security',
      '--disable-features=IsolateOrigins,site-per-process'
    ]
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Set realistic user agent
    await page.setUserAgent(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Set extra headers to appear more like a real browser
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      Accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    });

    // Navigate to main page
    console.log('🌐 Loading main page...');
    await page.goto(BASE_URL, { waitUntil: 'networkidle0', timeout: 90000 });

    // Wait for content to load - try waiting for a specific element
    console.log('⏳ Waiting for content to render...');
    try {
      await page.waitForSelector('a[href*=".pdf"], .resource, article', {
        timeout: 10000
      });
    } catch {
      // Best effort: the selectors are guesses, so carry on with whatever rendered
      console.log(
        '⚠️ Timeout waiting for specific selectors, continuing anyway...'
      );
    }

    // Extra settle time for late client-side rendering
    await new Promise(resolve => setTimeout(resolve, 5000));

    console.log('✅ Page loaded\n');

    // Debug: Take a screenshot
    await page.screenshot({
      path: path.join(OUTPUT_DIR, 'debug-screenshot.png'),
      fullPage: true
    });
    console.log('📸 Screenshot saved for debugging\n');

    // Debug: Dump HTML content
    const html = await page.content();
    fs.writeFileSync(path.join(OUTPUT_DIR, 'debug-page-source.html'), html);
    console.log('📄 HTML source saved for debugging\n');

    // Get all links on the page with multiple strategies
    const links = await page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('a'));
      const allLinks = anchors
        .map(a => ({
          text: a.innerText.trim(),
          href: a.href,
          title: a.title || '',
          ariaLabel: a.getAttribute('aria-label') || ''
        }))
        .filter(
          link =>
            (link.text || link.title || link.ariaLabel) &&
            link.href &&
            !link.href.startsWith('javascript:') &&
            !link.href.includes('#')
        );

      // Also try to get download links specifically
      const downloadLinks = Array.from(
        document.querySelectorAll('[download], a[href*=".pdf"]')
      ).map(a => ({
        text: a.innerText.trim() || a.getAttribute('download') || a.title,
        href: a.href
      }));

      // De-duplicate by href, keeping the first occurrence
      return [...allLinks, ...downloadLinks].filter(
        (link, index, self) =>
          index === self.findIndex(l => l.href === link.href)
      );
    });

    console.log(`📋 Found ${links.length} total links on page`);

    // Debug: Show first 10 links
    if (links.length > 0) {
      console.log('\n📝 Sample links found:');
      links.slice(0, 10).forEach((link, i) => {
        console.log(` ${i + 1}. ${link.text.substring(0, 60)}...`);
      });
      console.log('');
    } else {
      console.log(
        '⚠️ No links found - page may require different loading strategy\n'
      );
    }

    // Process each target resource
    let processed = 0;
    let downloaded = 0;
    let skipped = 0;

    for (const targetName of TARGET_RESOURCES) {
      const link = findResourceLink(links, targetName);

      if (!link) {
        console.log(`⚠️ Could not find link for: ${targetName}`);
        skipped++;
        continue;
      }

      processed++;
      const safeFilename = sanitizeFilename(targetName);

      try {
        // Check if it's a PDF
        if (isPdfUrl(link.href)) {
          const filepath = path.join(OUTPUT_DIR, `${safeFilename}.pdf`);
          console.log(`📥 Downloading PDF: ${targetName}`);
          console.log(` URL: ${link.href}`);
          await downloadFile(page, link.href, filepath);
          console.log(` ✅ Saved: ${safeFilename}.pdf\n`);
          downloaded++;
        }
        // Check if it's a video link (YouTube, Vimeo, etc.)
        else if (
          link.href.includes('youtube.com') ||
          link.href.includes('youtu.be') ||
          link.href.includes('vimeo.com') ||
          link.href.includes('video')
        ) {
          const filepath = path.join(
            OUTPUT_DIR,
            `${safeFilename} - Video URL.txt`
          );
          console.log(`🎥 Saving video URL: ${targetName}`);
          fs.writeFileSync(
            filepath,
            `${targetName}\n\nVideo URL: ${link.href}\n`
          );
          console.log(` ✅ Saved: ${safeFilename} - Video URL.txt\n`);
          downloaded++;
        }
        // Otherwise, extract page text
        else {
          console.log(`📄 Extracting text from: ${targetName}`);
          console.log(` URL: ${link.href}`);

          const contentPage = await browser.newPage();
          let text;
          try {
            await contentPage.goto(link.href, {
              waitUntil: 'networkidle2',
              timeout: 60000
            });
            text = await extractPageText(contentPage);
          } finally {
            // Close the tab even when navigation or extraction fails, so
            // failed resources do not accumulate open pages.
            await contentPage.close();
          }

          const filepath = path.join(OUTPUT_DIR, `${safeFilename}.txt`);
          fs.writeFileSync(
            filepath,
            `${targetName}\n\nSource: ${link.href}\n\n${text}\n`
          );
          console.log(` ✅ Saved: ${safeFilename}.txt\n`);
          downloaded++;
        }
      } catch (error) {
        console.error(
          ` ❌ Error processing ${targetName}: ${error.message}\n`
        );
        skipped++;
      }
    }

    console.log('🎉 Scraping complete!');
    console.log(`📊 Statistics:`);
    console.log(` Total targets: ${TARGET_RESOURCES.length}`);
    console.log(` Processed: ${processed}`);
    console.log(` Downloaded: ${downloaded}`);
    console.log(` Skipped: ${skipped}`);
  } catch (error) {
    console.error('❌ Fatal error:', error.message);
    // Set the exit code instead of calling process.exit(): exiting here would
    // terminate the process before the finally block closes the browser.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}
|
|
|
|
// Run scraper. Failures that happen before the scraper's own error handling
// is in place (for example creating the output directory or launching the
// browser) reject the returned promise, so handle that here instead of
// leaving the promise floating.
scrapeResources().catch(error => {
  console.error('❌ Fatal error:', error.message);
  process.exitCode = 1;
});
|