🗑️ Remove unused and archived files across multiple directories and update project dependencies in package files
This commit is contained in:
389
code/utils/scrape-pokemon-resources.js
Normal file
389
code/utils/scrape-pokemon-resources.js
Normal file
@@ -0,0 +1,389 @@
|
||||
/**
|
||||
* Pokémon Play! Resources Scraper
|
||||
*
|
||||
* Downloads official tournament rules, resources, and documentation from pokemon.com
|
||||
* - PDFs: Downloads directly
|
||||
* - Videos: Saves video URLs to text files
|
||||
* - Web pages: Extracts and saves text content
|
||||
*
|
||||
* Usage:
|
||||
* node code/utils/scrape-pokemon-resources.js
|
||||
* npm run scrape:pokemon
|
||||
*
|
||||
* Output: docs/projects/pokemon-professor/Pokemon Rules & Resources/
|
||||
*/
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import { fileURLToPath } from 'url';
|
||||
import https from 'https';
|
||||
import http from 'http';
|
||||
|
||||
// Add stealth plugin to avoid bot detection
puppeteer.use(StealthPlugin());

// Recreate CommonJS-style __filename/__dirname (not provided in ES modules).
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Configuration
// Page listing official Play! Pokémon tournament rules and resources.
const BASE_URL =
  'https://www.pokemon.com/us/play-pokemon/about/tournaments-rules-and-resources';
// Destination folder for downloaded PDFs, video-URL text files, and extracted page text.
const OUTPUT_DIR = path.resolve(
  __dirname,
  '../../docs/projects/pokemon-professor/Pokemon Rules & Resources'
);
|
||||
|
||||
// Target resource names from the page.
// Matched against link text with a case-insensitive, fuzzy (substring) match,
// so these must stay close to the titles shown on the resources page.
const TARGET_RESOURCES = [
  // Rules & Resources for All
  'Play! Pokémon Terms of Use',
  'Play! Pokémon Standards of Conduct',
  'Play! Pokémon Inclusion Policy',
  'Play! Pokémon Accessibility Policy',
  'Play! Pokémon Trainer Username and Team Name Policy',
  'Play! Pokémon Premier Events Sponsorship Policy',
  'Play! Pokémon Tournament Rules Handbook',
  'Play! Pokémon COVID-19 Protocols',
  'Play! Pokémon Attire and Cosplay Policy',
  'Play! Pokémon Penalty Guidelines',

  // Pokémon TCG Rules & Resources
  'Pokémon TCG Rulebook',
  'Play! Pokémon Deck List (8.5x11)',
  'Play! Pokémon Deck List (A4)',
  'TCG Errata',
  'Pokémon TCG Banned Card List',
  'Mega Evolution—Phantasmal Flames Banned List and Rule Changes Announcement',
  'Pokémon TCG Promo Card Legality Status',
  'Pokémon TCG Alternative Play Handbook',
  'Pokémon TCG Tournament Handbook',

  // Video Game Rules & Resources
  'Play! Pokémon Video Game Championships Tournament Handbook',
  'Pokémon Video Game Team List',

  // Pokémon GO Rules & Resources
  'Play! Pokémon Pokémon GO Tournament Handbook',
  'Pokémon GO Team List',
  'Play! Pokémon Pokémon GO Championship Series Banned Pokémon List',
  'Organizing Pokémon GO Events',

  // Pokémon UNITE Rules & Resources
  'Pokémon UNITE Championship Series Handbook',

  // Pokémon League Rules & Resources
  'Play! Pokémon Store Handbook',
  'Play! Pokémon League Challenges, Cups, and Prerelease Guide',
  'League Roster',
  'League Flyer',

  // Pokémon Club Rules & Resources
  'Pokémon Activity Sheets',

  // Further Resources for Players
  'World Championships Battle Dictionary',
  'Play! Pokémon Scholarship Program Terms and Conditions',
  'Championship Event Awards Disbursement Information',

  // Training Videos
  'League Management Demos',
  'Tournament Software and Reporting Events',
  'Championship Series Reporting',
  'TOM Training Videos',
  'Tools Overview',
  'Installation and Set-up',
  'Setting Up Your Tournament',
  'Tournament Detail Verification',
  'Running & Completing the Tournament',
  'Reporting Matches',
  'Adding Players'
];
|
||||
|
||||
/**
 * Make a resource name safe to use as a filename.
 *
 * Replaces characters that are illegal in Windows/macOS filenames with a
 * hyphen, collapses runs of whitespace into single spaces, and trims the
 * result.
 *
 * @param {string} name - Original name
 * @returns {string} Safe filename
 */
function sanitizeFilename(name) {
  const ILLEGAL_CHARS = /[<>:"/\\|?*]/g;
  const withoutIllegal = name.replace(ILLEGAL_CHARS, '-');
  const collapsed = withoutIllegal.replace(/\s+/g, ' ');
  return collapsed.trim();
}
|
||||
|
||||
/**
 * Download a file from a URL using the page's browser context.
 *
 * Fetches inside the page (`credentials: 'include'` makes the browser send
 * the session's cookies automatically), transfers the bytes back to Node,
 * verifies the payload is a real PDF, and writes it to disk.
 *
 * Fixes vs. previous version: removed dead code that collected
 * `page.cookies()` into a cookie string that was never used, and replaced
 * the deprecated `Buffer.slice` with `Buffer.subarray`.
 *
 * @param {Page} page - Puppeteer page whose session is used for the request
 * @param {string} url - File URL
 * @param {string} filepath - Destination path
 * @returns {Promise<void>}
 * @throws {Error} If the HTTP request fails or the payload is not a PDF
 */
async function downloadFile(page, url, filepath) {
  // Fetch inside the page so the request carries the page's cookies and
  // origin; serialize the bytes as a plain array to cross the evaluate()
  // serialization boundary (ArrayBuffer does not survive it).
  const bytes = await page.evaluate(async downloadUrl => {
    const response = await fetch(downloadUrl, {
      method: 'GET',
      credentials: 'include'
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    const arrayBuffer = await response.arrayBuffer();
    return Array.from(new Uint8Array(arrayBuffer));
  }, url);

  const bufferData = Buffer.from(bytes);

  // Verify it's actually a PDF — servers sometimes answer 200 with an HTML
  // error or bot-check page, which would otherwise be saved as a .pdf.
  const header = bufferData.subarray(0, 5).toString();
  if (!header.startsWith('%PDF')) {
    throw new Error(`Downloaded file is not a PDF (got: ${header})`);
  }

  fs.writeFileSync(filepath, bufferData);
}
|
||||
|
||||
/**
 * Extract the readable text content of the currently loaded web page.
 *
 * NOTE: this mutates the live DOM (removes script/style/nav/footer/header
 * elements), so it should only be used on throwaway pages — as it is here,
 * where the page is closed right after extraction.
 *
 * @param {Page} page - Puppeteer page
 * @returns {Promise<string>} Page text content
 */
async function extractPageText(page) {
  return await page.evaluate(() => {
    // Strip non-content elements before reading text.
    const noise = document.querySelectorAll(
      'script, style, nav, footer, header'
    );
    for (const el of noise) {
      el.remove();
    }

    // Prefer a semantic content container; fall back to the whole body.
    const container =
      document.querySelector('main, article, .content, #content') ||
      document.body;
    return container.innerText.trim();
  });
}
|
||||
|
||||
/**
 * Main scraping function.
 *
 * Loads the resources page, collects all links (with debug artifacts:
 * screenshot + HTML dump), then for each TARGET_RESOURCES entry either
 * downloads the PDF, records the video URL to a text file, or extracts the
 * linked page's text — writing everything into OUTPUT_DIR.
 *
 * Fix vs. previous version: the fuzzy link match now requires non-empty
 * link text. `targetName.toLowerCase().includes('')` is always true, so an
 * empty-text link (kept by the collector when it has a title/aria-label, or
 * produced by the `[download]` pass) used to match every single target.
 */
async function scrapeResources() {
  console.log('🚀 Starting Pokémon Play! Resources Scraper');
  console.log(`📁 Output directory: ${OUTPUT_DIR}\n`);

  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    console.log('✅ Created output directory\n');
  }

  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
      '--disable-web-security',
      '--disable-features=IsolateOrigins,site-per-process'
    ]
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Set realistic user agent
    await page.setUserAgent(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Set extra headers to appear more like a real browser
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      Accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    });

    // Navigate to main page
    console.log('🌐 Loading main page...');
    await page.goto(BASE_URL, { waitUntil: 'networkidle0', timeout: 90000 });

    // Wait for content to load - try waiting for a specific element
    console.log('⏳ Waiting for content to render...');
    try {
      await page.waitForSelector('a[href*=".pdf"], .resource, article', {
        timeout: 10000
      });
    } catch (e) {
      console.log(
        '⚠️ Timeout waiting for specific selectors, continuing anyway...'
      );
    }

    // Extra settle time for late client-side rendering.
    await new Promise(resolve => setTimeout(resolve, 5000));

    console.log('✅ Page loaded\n');

    // Debug: Take a screenshot
    await page.screenshot({
      path: path.join(OUTPUT_DIR, 'debug-screenshot.png'),
      fullPage: true
    });
    console.log('📸 Screenshot saved for debugging\n');

    // Debug: Dump HTML content
    const html = await page.content();
    fs.writeFileSync(path.join(OUTPUT_DIR, 'debug-page-source.html'), html);
    console.log('📄 HTML source saved for debugging\n');

    // Get all links on the page with multiple strategies
    const links = await page.evaluate(() => {
      const anchors = Array.from(document.querySelectorAll('a'));
      const allLinks = anchors
        .map(a => ({
          text: a.innerText.trim(),
          href: a.href,
          title: a.title || '',
          ariaLabel: a.getAttribute('aria-label') || ''
        }))
        .filter(
          link =>
            (link.text || link.title || link.ariaLabel) &&
            link.href &&
            !link.href.startsWith('javascript:') &&
            !link.href.includes('#')
        );

      // Also try to get download links specifically
      const downloadLinks = Array.from(
        document.querySelectorAll('[download], a[href*=".pdf"]')
      ).map(a => ({
        text: a.innerText.trim() || a.getAttribute('download') || a.title,
        href: a.href
      }));

      // Merge both passes, deduplicating by href.
      return [...allLinks, ...downloadLinks].filter(
        (link, index, self) =>
          index === self.findIndex(l => l.href === link.href)
      );
    });

    console.log(`📋 Found ${links.length} total links on page`);

    // Debug: Show first 10 links
    if (links.length > 0) {
      console.log('\n📝 Sample links found:');
      links.slice(0, 10).forEach((link, i) => {
        console.log(`   ${i + 1}. ${link.text.substring(0, 60)}...`);
      });
      console.log('');
    } else {
      console.log(
        '⚠️ No links found - page may require different loading strategy\n'
      );
    }

    // Process each target resource
    let processed = 0;
    let downloaded = 0;
    let skipped = 0;

    for (const targetName of TARGET_RESOURCES) {
      // Find matching link (case-insensitive, fuzzy match).
      // Require non-empty text: includes('') is always true, so an
      // empty-text link would otherwise match every target.
      const link = links.find(
        l =>
          l.text &&
          (l.text.toLowerCase().includes(targetName.toLowerCase()) ||
            targetName.toLowerCase().includes(l.text.toLowerCase()))
      );

      if (!link) {
        console.log(`⚠️ Could not find link for: ${targetName}`);
        skipped++;
        continue;
      }

      processed++;
      const safeFilename = sanitizeFilename(targetName);

      try {
        // Check if it's a PDF
        if (link.href.toLowerCase().endsWith('.pdf')) {
          const filepath = path.join(OUTPUT_DIR, `${safeFilename}.pdf`);
          console.log(`📥 Downloading PDF: ${targetName}`);
          console.log(`   URL: ${link.href}`);
          await downloadFile(page, link.href, filepath);
          console.log(`   ✅ Saved: ${safeFilename}.pdf\n`);
          downloaded++;
        }
        // Check if it's a video link (YouTube, Vimeo, etc.)
        else if (
          link.href.includes('youtube.com') ||
          link.href.includes('youtu.be') ||
          link.href.includes('vimeo.com') ||
          link.href.includes('video')
        ) {
          const filepath = path.join(
            OUTPUT_DIR,
            `${safeFilename} - Video URL.txt`
          );
          console.log(`🎥 Saving video URL: ${targetName}`);
          fs.writeFileSync(
            filepath,
            `${targetName}\n\nVideo URL: ${link.href}\n`
          );
          console.log(`   ✅ Saved: ${safeFilename} - Video URL.txt\n`);
          downloaded++;
        }
        // Otherwise, extract page text
        else {
          console.log(`📄 Extracting text from: ${targetName}`);
          console.log(`   URL: ${link.href}`);

          const contentPage = await browser.newPage();
          await contentPage.goto(link.href, {
            waitUntil: 'networkidle2',
            timeout: 60000
          });
          const text = await extractPageText(contentPage);
          await contentPage.close();

          const filepath = path.join(OUTPUT_DIR, `${safeFilename}.txt`);
          fs.writeFileSync(
            filepath,
            `${targetName}\n\nSource: ${link.href}\n\n${text}\n`
          );
          console.log(`   ✅ Saved: ${safeFilename}.txt\n`);
          downloaded++;
        }
      } catch (error) {
        // Per-resource failures are logged and counted; the loop continues.
        console.error(
          `   ❌ Error processing ${targetName}: ${error.message}\n`
        );
        skipped++;
      }
    }

    console.log('🎉 Scraping complete!');
    console.log(`📊 Statistics:`);
    console.log(`   Total targets: ${TARGET_RESOURCES.length}`);
    console.log(`   Processed: ${processed}`);
    console.log(`   Downloaded: ${downloaded}`);
    console.log(`   Skipped: ${skipped}`);
  } catch (error) {
    console.error('❌ Fatal error:', error.message);
    process.exit(1);
  } finally {
    // Always release the browser, even after a fatal error.
    await browser.close();
  }
}
|
||||
|
||||
// Run scraper. Catch rejections that happen before scrapeResources' own
// try/catch (e.g. puppeteer.launch failures) so the script exits non-zero
// instead of producing an unhandled promise rejection.
scrapeResources().catch(error => {
  console.error('❌ Fatal error:', error.message);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user