🗑️ Remove unused and archived files across multiple directories and update project dependencies in package files
This commit is contained in:
389
code/utils/scrape-pokemon-resources.js
Normal file
389
code/utils/scrape-pokemon-resources.js
Normal file
@@ -0,0 +1,389 @@
|
||||
/**
|
||||
* Pokémon Play! Resources Scraper
|
||||
*
|
||||
* Downloads official tournament rules, resources, and documentation from pokemon.com
|
||||
* - PDFs: Downloads directly
|
||||
* - Videos: Saves video URLs to text files
|
||||
* - Web pages: Extracts and saves text content
|
||||
*
|
||||
* Usage:
|
||||
* node code/utils/scrape-pokemon-resources.js
|
||||
* npm run scrape:pokemon
|
||||
*
|
||||
* Output: docs/projects/pokemon-professor/Pokemon Rules & Resources/
|
||||
*/
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
import https from 'https';
|
||||
import http from 'http';
|
||||
|
||||
// Add stealth plugin to avoid bot detection
puppeteer.use(StealthPlugin());

// Recreate CommonJS-style __filename/__dirname (not provided in ES modules).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Configuration
// Page listing official Play! Pokémon tournament rules and resources.
const BASE_URL =
  'https://www.pokemon.com/us/play-pokemon/about/tournaments-rules-and-resources';
// Destination folder for downloaded PDFs, video-URL text files, and extracted page text.
const OUTPUT_DIR = path.resolve(
  __dirname,
  '../../docs/projects/pokemon-professor/Pokemon Rules & Resources'
);
|
||||
|
||||
// Target resource names from the page.
// Matched against link text with a case-insensitive, fuzzy (substring) match,
// so these must stay close to the titles shown on the resources page.
const TARGET_RESOURCES = [
  // Rules & Resources for All
  'Play! Pokémon Terms of Use',
  'Play! Pokémon Standards of Conduct',
  'Play! Pokémon Inclusion Policy',
  'Play! Pokémon Accessibility Policy',
  'Play! Pokémon Trainer Username and Team Name Policy',
  'Play! Pokémon Premier Events Sponsorship Policy',
  'Play! Pokémon Tournament Rules Handbook',
  'Play! Pokémon COVID-19 Protocols',
  'Play! Pokémon Attire and Cosplay Policy',
  'Play! Pokémon Penalty Guidelines',

  // Pokémon TCG Rules & Resources
  'Pokémon TCG Rulebook',
  'Play! Pokémon Deck List (8.5x11)',
  'Play! Pokémon Deck List (A4)',
  'TCG Errata',
  'Pokémon TCG Banned Card List',
  'Mega Evolution—Phantasmal Flames Banned List and Rule Changes Announcement',
  'Pokémon TCG Promo Card Legality Status',
  'Pokémon TCG Alternative Play Handbook',
  'Pokémon TCG Tournament Handbook',

  // Video Game Rules & Resources
  'Play! Pokémon Video Game Championships Tournament Handbook',
  'Pokémon Video Game Team List',

  // Pokémon GO Rules & Resources
  'Play! Pokémon Pokémon GO Tournament Handbook',
  'Pokémon GO Team List',
  'Play! Pokémon Pokémon GO Championship Series Banned Pokémon List',
  'Organizing Pokémon GO Events',

  // Pokémon UNITE Rules & Resources
  'Pokémon UNITE Championship Series Handbook',

  // Pokémon League Rules & Resources
  'Play! Pokémon Store Handbook',
  'Play! Pokémon League Challenges, Cups, and Prerelease Guide',
  'League Roster',
  'League Flyer',

  // Pokémon Club Rules & Resources
  'Pokémon Activity Sheets',

  // Further Resources for Players
  'World Championships Battle Dictionary',
  'Play! Pokémon Scholarship Program Terms and Conditions',
  'Championship Event Awards Disbursement Information',

  // Training Videos
  'League Management Demos',
  'Tournament Software and Reporting Events',
  'Championship Series Reporting',
  'TOM Training Videos',
  'Tools Overview',
  'Installation and Set-up',
  'Setting Up Your Tournament',
  'Tournament Detail Verification',
  'Running & Completing the Tournament',
  'Reporting Matches',
  'Adding Players'
];
|
||||
|
||||
/**
 * Make a resource name safe to use as a filename.
 *
 * Replaces characters that are illegal in Windows/macOS filenames with a
 * hyphen, collapses runs of whitespace into single spaces, and trims the
 * result.
 *
 * @param {string} name - Original name
 * @returns {string} Safe filename
 */
function sanitizeFilename(name) {
  const ILLEGAL_CHARS = /[<>:"/\\|?*]/g;
  const withoutIllegal = name.replace(ILLEGAL_CHARS, '-');
  const collapsed = withoutIllegal.replace(/\s+/g, ' ');
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Download a file from a URL using the page's browser context.
 *
 * Fetches inside the page (`credentials: 'include'` makes the browser send
 * the session's cookies automatically), transfers the bytes back to Node,
 * verifies the payload is a real PDF, and writes it to disk.
 *
 * Fixes vs. previous version: removed dead code that collected
 * `page.cookies()` into a cookie string that was never used, and replaced
 * the deprecated `Buffer.slice` with `Buffer.subarray`.
 *
 * @param {Page} page - Puppeteer page whose session is used for the request
 * @param {string} url - File URL
 * @param {string} filepath - Destination path
 * @returns {Promise<void>}
 * @throws {Error} If the HTTP request fails or the payload is not a PDF
 */
async function downloadFile(page, url, filepath) {
  // Fetch inside the page so the request carries the page's cookies and
  // origin; serialize the bytes as a plain array to cross the evaluate()
  // serialization boundary (ArrayBuffer does not survive it).
  const bytes = await page.evaluate(async downloadUrl => {
    const response = await fetch(downloadUrl, {
      method: 'GET',
      credentials: 'include'
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const arrayBuffer = await response.arrayBuffer();
    return Array.from(new Uint8Array(arrayBuffer));
  }, url);

  const bufferData = Buffer.from(bytes);

  // Verify it's actually a PDF — servers sometimes answer 200 with an HTML
  // error or bot-check page, which would otherwise be saved as a .pdf.
  const header = bufferData.subarray(0, 5).toString();
  if (!header.startsWith('%PDF')) {
    throw new Error(`Downloaded file is not a PDF (got: ${header})`);
  }

  fs.writeFileSync(filepath, bufferData);
}
|
||||
|
||||
/**
 * Extract the readable text content of the currently loaded web page.
 *
 * NOTE: this mutates the live DOM (removes script/style/nav/footer/header
 * elements), so it should only be used on throwaway pages — as it is here,
 * where the page is closed right after extraction.
 *
 * @param {Page} page - Puppeteer page
 * @returns {Promise<string>} Page text content
 */
async function extractPageText(page) {
  return await page.evaluate(() => {
    // Strip non-content elements before reading text.
    const noise = document.querySelectorAll(
      'script, style, nav, footer, header'
    );
    for (const el of noise) {
      el.remove();
    }

    // Prefer a semantic content container; fall back to the whole body.
    const container =
      document.querySelector('main, article, .content, #content') ||
      document.body;
    return container.innerText.trim();
  });
}
|
||||
|
||||
/**
 * Main scraping function.
 *
 * Loads the resources page, collects all links (with debug artifacts:
 * screenshot + HTML dump), then for each TARGET_RESOURCES entry either
 * downloads the PDF, records the video URL to a text file, or extracts the
 * linked page's text — writing everything into OUTPUT_DIR.
 *
 * Fix vs. previous version: the fuzzy link match now requires non-empty
 * link text. `targetName.toLowerCase().includes('')` is always true, so an
 * empty-text link (kept by the collector when it has a title/aria-label, or
 * produced by the `[download]` pass) used to match every single target.
 */
async function scrapeResources() {
  console.log('🚀 Starting Pokémon Play! Resources Scraper');
  console.log(`📁 Output directory: ${OUTPUT_DIR}\n`);

  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    console.log('✅ Created output directory\n');
  }

  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
      '--disable-web-security',
      '--disable-features=IsolateOrigins,site-per-process'
    ]
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Set realistic user agent
    await page.setUserAgent(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Set extra headers to appear more like a real browser
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      Accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    });

    // Navigate to main page
    console.log('🌐 Loading main page...');
    await page.goto(BASE_URL, { waitUntil: 'networkidle0', timeout: 90000 });

    // Wait for content to load - try waiting for a specific element
    console.log('⏳ Waiting for content to render...');
    try {
      await page.waitForSelector('a[href*=".pdf"], .resource, article', {
        timeout: 10000
      });
    } catch (e) {
      console.log(
        '⚠️ Timeout waiting for specific selectors, continuing anyway...'
      );
    }

    // Extra settle time for late client-side rendering.
    await new Promise(resolve => setTimeout(resolve, 5000));

    console.log('✅ Page loaded\n');

    // Debug: Take a screenshot
    await page.screenshot({
      path: path.join(OUTPUT_DIR, 'debug-screenshot.png'),
      fullPage: true
    });
    console.log('📸 Screenshot saved for debugging\n');

    // Debug: Dump HTML content
    const html = await page.content();
    fs.writeFileSync(path.join(OUTPUT_DIR, 'debug-page-source.html'), html);
    console.log('📄 HTML source saved for debugging\n');

    // Get all links on the page with multiple strategies
    const links = await page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('a'));
      const allLinks = anchors
        .map(a => ({
          text: a.innerText.trim(),
          href: a.href,
          title: a.title || '',
          ariaLabel: a.getAttribute('aria-label') || ''
        }))
        .filter(
          link =>
            (link.text || link.title || link.ariaLabel) &&
            link.href &&
            !link.href.startsWith('javascript:') &&
            !link.href.includes('#')
        );

      // Also try to get download links specifically
      const downloadLinks = Array.from(
        document.querySelectorAll('[download], a[href*=".pdf"]')
      ).map(a => ({
        text: a.innerText.trim() || a.getAttribute('download') || a.title,
        href: a.href
      }));

      // Merge both passes, deduplicating by href.
      return [...allLinks, ...downloadLinks].filter(
        (link, index, self) =>
          index === self.findIndex(l => l.href === link.href)
      );
    });

    console.log(`📋 Found ${links.length} total links on page`);

    // Debug: Show first 10 links
    if (links.length > 0) {
      console.log('\n📝 Sample links found:');
      links.slice(0, 10).forEach((link, i) => {
        console.log(`   ${i + 1}. ${link.text.substring(0, 60)}...`);
      });
      console.log('');
    } else {
      console.log(
        '⚠️ No links found - page may require different loading strategy\n'
      );
    }

    // Process each target resource
    let processed = 0;
    let downloaded = 0;
    let skipped = 0;

    for (const targetName of TARGET_RESOURCES) {
      // Find matching link (case-insensitive, fuzzy match).
      // Require non-empty text: includes('') is always true, so an
      // empty-text link would otherwise match every target.
      const link = links.find(
        l =>
          l.text &&
          (l.text.toLowerCase().includes(targetName.toLowerCase()) ||
            targetName.toLowerCase().includes(l.text.toLowerCase()))
      );

      if (!link) {
        console.log(`⚠️ Could not find link for: ${targetName}`);
        skipped++;
        continue;
      }

      processed++;
      const safeFilename = sanitizeFilename(targetName);

      try {
        // Check if it's a PDF
        if (link.href.toLowerCase().endsWith('.pdf')) {
          const filepath = path.join(OUTPUT_DIR, `${safeFilename}.pdf`);
          console.log(`📥 Downloading PDF: ${targetName}`);
          console.log(`   URL: ${link.href}`);
          await downloadFile(page, link.href, filepath);
          console.log(`   ✅ Saved: ${safeFilename}.pdf\n`);
          downloaded++;
        }
        // Check if it's a video link (YouTube, Vimeo, etc.)
        else if (
          link.href.includes('youtube.com') ||
          link.href.includes('youtu.be') ||
          link.href.includes('vimeo.com') ||
          link.href.includes('video')
        ) {
          const filepath = path.join(
            OUTPUT_DIR,
            `${safeFilename} - Video URL.txt`
          );
          console.log(`🎥 Saving video URL: ${targetName}`);
          fs.writeFileSync(
            filepath,
            `${targetName}\n\nVideo URL: ${link.href}\n`
          );
          console.log(`   ✅ Saved: ${safeFilename} - Video URL.txt\n`);
          downloaded++;
        }
        // Otherwise, extract page text
        else {
          console.log(`📄 Extracting text from: ${targetName}`);
          console.log(`   URL: ${link.href}`);

          const contentPage = await browser.newPage();
          await contentPage.goto(link.href, {
            waitUntil: 'networkidle2',
            timeout: 60000
          });
          const text = await extractPageText(contentPage);
          await contentPage.close();

          const filepath = path.join(OUTPUT_DIR, `${safeFilename}.txt`);
          fs.writeFileSync(
            filepath,
            `${targetName}\n\nSource: ${link.href}\n\n${text}\n`
          );
          console.log(`   ✅ Saved: ${safeFilename}.txt\n`);
          downloaded++;
        }
      } catch (error) {
        // Per-resource failures are logged and counted; the loop continues.
        console.error(
          `   ❌ Error processing ${targetName}: ${error.message}\n`
        );
        skipped++;
      }
    }

    console.log('🎉 Scraping complete!');
    console.log(`📊 Statistics:`);
    console.log(`   Total targets: ${TARGET_RESOURCES.length}`);
    console.log(`   Processed: ${processed}`);
    console.log(`   Downloaded: ${downloaded}`);
    console.log(`   Skipped: ${skipped}`);
  } catch (error) {
    console.error('❌ Fatal error:', error.message);
    process.exit(1);
  } finally {
    // Always release the browser, even after a fatal error.
    await browser.close();
  }
}
|
||||
|
||||
// Run scraper. Catch rejections that happen before scrapeResources' own
// try/catch (e.g. puppeteer.launch failures) so the script exits non-zero
// instead of producing an unhandled promise rejection.
scrapeResources().catch(error => {
  console.error('❌ Fatal error:', error.message);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user