// memory-infrastructure-palace/code/utils/scrape-pokemon-resources.js

/**
 * Pokémon Play! Resources Scraper
 *
 * Downloads official tournament rules, resources, and documentation from pokemon.com
 * - PDFs: Downloads directly
 * - Videos: Saves video URLs to text files
 * - Web pages: Extracts and saves text content
 *
 * Usage:
 *   node code/utils/scrape-pokemon-resources.js
 *   npm run scrape:pokemon
 *
 * Output: docs/projects/pokemon-professor/Pokemon Rules & Resources/
 */
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import https from 'https';
import http from 'http';

// Add stealth plugin to avoid bot detection
puppeteer.use(StealthPlugin());

// ES modules do not define __filename/__dirname, so rebuild both from this
// module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Configuration
// BASE_URL is the page that gets loaded and scanned for resource links.
const BASE_URL =
  'https://www.pokemon.com/us/play-pokemon/about/tournaments-rules-and-resources';
// OUTPUT_DIR is where every artifact is written. It is resolved relative to
// this file's directory (two levels up, then into docs/...), so it does not
// depend on the current working directory.
const OUTPUT_DIR = path.resolve(
  __dirname,
  '../../docs/projects/pokemon-professor/Pokemon Rules & Resources'
);

// Target resource names from the page.
// Each entry is compared case-insensitively against the text of the links
// found on BASE_URL, and the entry itself (after sanitizing) becomes the base
// name of the file that is saved for it.
// The inline comments group the entries by what appear to be the page's
// section headings.
const TARGET_RESOURCES = [
  // Rules & Resources for All
  'Play! Pokémon Terms of Use',
  'Play! Pokémon Standards of Conduct',
  'Play! Pokémon Inclusion Policy',
  'Play! Pokémon Accessibility Policy',
  'Play! Pokémon Trainer Username and Team Name Policy',
  'Play! Pokémon Premier Events Sponsorship Policy',
  'Play! Pokémon Tournament Rules Handbook',
  'Play! Pokémon COVID-19 Protocols',
  'Play! Pokémon Attire and Cosplay Policy',
  'Play! Pokémon Penalty Guidelines',

  // Pokémon TCG Rules & Resources
  'Pokémon TCG Rulebook',
  'Play! Pokémon Deck List (8.5x11)',
  'Play! Pokémon Deck List (A4)',
  'TCG Errata',
  'Pokémon TCG Banned Card List',
  'Mega Evolution—Phantasmal Flames Banned List and Rule Changes Announcement',
  'Pokémon TCG Promo Card Legality Status',
  'Pokémon TCG Alternative Play Handbook',
  'Pokémon TCG Tournament Handbook',

  // Video Game Rules & Resources
  'Play! Pokémon Video Game Championships Tournament Handbook',
  'Pokémon Video Game Team List',

  // Pokémon GO Rules & Resources
  'Play! Pokémon Pokémon GO Tournament Handbook',
  'Pokémon GO Team List',
  'Play! Pokémon Pokémon GO Championship Series Banned Pokémon List',
  'Organizing Pokémon GO Events',

  // Pokémon UNITE Rules & Resources
  'Pokémon UNITE Championship Series Handbook',

  // Pokémon League Rules & Resources
  'Play! Pokémon Store Handbook',
  'Play! Pokémon League Challenges, Cups, and Prerelease Guide',
  'League Roster',
  'League Flyer',

  // Pokémon Club Rules & Resources
  'Pokémon Activity Sheets',

  // Further Resources for Players
  'World Championships Battle Dictionary',
  'Play! Pokémon Scholarship Program Terms and Conditions',
  'Championship Event Awards Disbursement Information',

  // Training Videos
  'League Management Demos',
  'Tournament Software and Reporting Events',
  'Championship Series Reporting',
  'TOM Training Videos',
  'Tools Overview',
  'Installation and Set-up',
  'Setting Up Your Tournament',
  'Tournament Detail Verification',
  'Running & Completing the Tournament',
  'Reporting Matches',
  'Adding Players'
];

/**
 * Turn a resource title into a string that is safe to use as a filename.
 *
 * Each of the characters `<`, `>`, `:`, `"`, `/`, `\`, `|`, `?` and `*` is
 * replaced with a hyphen, every run of whitespace is collapsed to a single
 * space, and leading/trailing whitespace is removed.
 *
 * @param {string} name - Original name
 * @returns {string} Safe filename
 */
function sanitizeFilename(name) {
  const reservedChars = /[<>:"/\\|?*]/g;
  const whitespaceRuns = /\s+/g;

  const withoutReserved = name.replace(reservedChars, '-');
  const singleSpaced = withoutReserved.replace(whitespaceRuns, ' ');

  return singleSpaced.trim();
}

/**
 * Download a PDF through the browser page and write it to disk.
 *
 * The request is issued with `fetch` inside the page context using
 * `credentials: 'include'`, so the browser attaches the session's cookies on
 * its own; nothing has to be read out of the page beforehand.
 *
 * @param {Page} page - Puppeteer page whose context performs the fetch
 * @param {string} url - File URL
 * @param {string} filepath - Destination path
 * @returns {Promise<void>}
 * @throws {Error} When the response status is not OK, or when the payload does
 *   not start with the `%PDF` signature. Nothing is written in either case.
 */
async function downloadFile(page, url, filepath) {
  const bytes = await page.evaluate(async downloadUrl => {
    const response = await fetch(downloadUrl, {
      method: 'GET',
      credentials: 'include'
    });

    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }

    // The result has to cross from the page back to Node, so the binary body
    // is handed over as a plain array of byte values.
    const arrayBuffer = await response.arrayBuffer();
    return Array.from(new Uint8Array(arrayBuffer));
  }, url);

  const bufferData = Buffer.from(bytes);

  // Verify it's actually a PDF: an error or bot-challenge page would come back
  // as HTML and must not be saved under a .pdf name.
  const header = bufferData.subarray(0, 5).toString();
  if (!header.startsWith('%PDF')) {
    throw new Error(`Downloaded file is not a PDF (got: ${header})`);
  }

  fs.writeFileSync(filepath, bufferData);
}

/**
 * Pull the readable text out of the page that is currently loaded.
 *
 * Runs inside the page context: every script, style, nav, footer and header
 * element is removed from the document first, then the trimmed inner text of
 * the first element matching `main, article, .content, #content` is returned,
 * falling back to the document body when nothing matches.
 *
 * Note that the page's DOM is modified as a side effect.
 *
 * @param {Page} page - Puppeteer page
 * @returns {Promise<string>} Page text content
 */
async function extractPageText(page) {
  // Must stay self-contained: it is executed in the browser, not in Node.
  const collectText = () => {
    const noiseSelector = 'script, style, nav, footer, header';
    for (const node of document.querySelectorAll(noiseSelector)) {
      node.remove();
    }

    const contentRoot = document.querySelector(
      'main, article, .content, #content'
    );
    const container = contentRoot || document.body;
    return container.innerText.trim();
  };

  const text = await page.evaluate(collectText);
  return text;
}

/**
 * Normalise a piece of link or target text so two labels can be compared:
 * whitespace runs collapse to one space, the ends are trimmed, and the result
 * is lowercased.
 *
 * @param {string} text - Raw label
 * @returns {string} Comparable form ('' for empty or missing text)
 */
function normalizeLabel(text) {
  return String(text ?? '')
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase();
}

/**
 * Pick the link that best matches a target resource name.
 *
 * Preference order: identical label, then a label that contains the target,
 * then the longest label that is itself contained in the target. Candidates
 * never have an empty label, because an empty string is "contained" in every
 * target and would otherwise match all of them.
 *
 * @param {Array<{link: object, label: string}>} candidates - Links paired with
 *   their normalised, non-empty labels
 * @param {string} targetName - Resource name to look for
 * @returns {object|undefined} The matching link, or undefined when none fits
 */
function findLinkForTarget(candidates, targetName) {
  const wanted = normalizeLabel(targetName);
  if (wanted === '') {
    return undefined;
  }

  const exact = candidates.find(({ label }) => label === wanted);
  if (exact) {
    return exact.link;
  }

  const superset = candidates.find(({ label }) => label.includes(wanted));
  if (superset) {
    return superset.link;
  }

  // Last resort: link text that is only a fragment of the target. Take the
  // longest fragment so a specific link wins over a generic one.
  let best;
  for (const candidate of candidates) {
    const isFragment = wanted.includes(candidate.label);
    if (isFragment && (!best || candidate.label.length > best.label.length)) {
      best = candidate;
    }
  }
  return best?.link;
}

/**
 * Decide how a resource URL should be saved.
 *
 * @param {string} href - Absolute link URL
 * @returns {'pdf'|'video'|'page'} Handling category
 */
function classifyResourceUrl(href) {
  // Look at the path only, so "file.pdf?v=2" or "file.pdf#page=3" still count
  // as PDFs.
  let pathname = href;
  try {
    pathname = new URL(href).pathname;
  } catch {
    // Not parseable as an absolute URL; fall back to the raw string.
  }

  if (pathname.toLowerCase().endsWith('.pdf')) {
    return 'pdf';
  }

  // NOTE(review): the bare 'video' marker is a broad heuristic and will also
  // catch ordinary pages whose URL merely contains that word.
  const videoMarkers = ['youtube.com', 'youtu.be', 'vimeo.com', 'video'];
  return videoMarkers.some(marker => href.includes(marker)) ? 'video' : 'page';
}

/**
 * Collect the links present on the loaded page, de-duplicated by href.
 *
 * Two passes are merged: every anchor that has some label (text, title or
 * aria-label) and a usable href, plus every `[download]` element or anchor
 * whose href contains ".pdf". The first entry seen for an href is kept.
 *
 * @param {Page} page - Puppeteer page that has finished loading
 * @returns {Promise<Array<{text: string, href: string}>>} Unique links
 */
async function collectPageLinks(page) {
  // Must stay self-contained: it is executed in the browser, not in Node.
  return page.evaluate(() => {
    const textOf = el => (el.innerText || '').trim();
    const hasUsableHref = link =>
      typeof link.href === 'string' && link.href !== '';

    const allLinks = Array.from(document.querySelectorAll('a'))
      .map(a => ({
        text: textOf(a),
        href: a.href,
        title: a.title || '',
        ariaLabel: a.getAttribute('aria-label') || ''
      }))
      .filter(
        link =>
          (link.text || link.title || link.ariaLabel) &&
          hasUsableHref(link) &&
          !link.href.startsWith('javascript:') &&
          !link.href.includes('#')
      );

    // Also try to get download links specifically
    const downloadLinks = Array.from(
      document.querySelectorAll('[download], a[href*=".pdf"]')
    )
      .map(a => ({
        text: textOf(a) || a.getAttribute('download') || a.title || '',
        href: a.href
      }))
      .filter(hasUsableHref);

    const seenHrefs = new Set();
    return [...allLinks, ...downloadLinks].filter(link => {
      if (seenHrefs.has(link.href)) {
        return false;
      }
      seenHrefs.add(link.href);
      return true;
    });
  });
}

/**
 * Save one matched resource to OUTPUT_DIR according to its URL category:
 * PDFs are downloaded, video links are written to a small text file, and any
 * other page is opened in a temporary tab and saved as text.
 *
 * @param {Browser} browser - Puppeteer browser, used to open the temporary tab
 * @param {Page} page - Main page, whose session is used for PDF downloads
 * @param {string} targetName - Resource name (also the output base name)
 * @param {{href: string}} link - Matched link
 * @returns {Promise<void>}
 * @throws {Error} Whatever the download, navigation or file write raises
 */
async function saveResource(browser, page, targetName, link) {
  const safeFilename = sanitizeFilename(targetName);
  const kind = classifyResourceUrl(link.href);

  if (kind === 'pdf') {
    const filepath = path.join(OUTPUT_DIR, `${safeFilename}.pdf`);
    console.log(`📥 Downloading PDF: ${targetName}`);
    console.log(`   URL: ${link.href}`);
    await downloadFile(page, link.href, filepath);
    console.log(`   ✅ Saved: ${safeFilename}.pdf\n`);
    return;
  }

  if (kind === 'video') {
    const filepath = path.join(OUTPUT_DIR, `${safeFilename} - Video URL.txt`);
    console.log(`🎥 Saving video URL: ${targetName}`);
    fs.writeFileSync(filepath, `${targetName}\n\nVideo URL: ${link.href}\n`);
    console.log(`   ✅ Saved: ${safeFilename} - Video URL.txt\n`);
    return;
  }

  console.log(`📄 Extracting text from: ${targetName}`);
  console.log(`   URL: ${link.href}`);

  const contentPage = await browser.newPage();
  let text;
  try {
    await contentPage.goto(link.href, {
      waitUntil: 'networkidle2',
      timeout: 60000
    });
    text = await extractPageText(contentPage);
  } finally {
    // Close the tab even when navigation or extraction fails, so failed
    // targets do not pile up open tabs.
    await contentPage.close();
  }

  const filepath = path.join(OUTPUT_DIR, `${safeFilename}.txt`);
  fs.writeFileSync(
    filepath,
    `${targetName}\n\nSource: ${link.href}\n\n${text}\n`
  );
  console.log(`   ✅ Saved: ${safeFilename}.txt\n`);
}

/**
 * Main scraping function.
 *
 * Loads BASE_URL in a headless browser, writes a debug screenshot and HTML
 * dump to OUTPUT_DIR, then tries to find and save a link for every entry of
 * TARGET_RESOURCES, printing a summary at the end. A failure on one target is
 * logged and counted as skipped; it does not stop the run.
 *
 * A fatal error inside the browser session is logged and reported through
 * `process.exitCode = 1`, so the browser is still closed before the process
 * ends. Errors raised before the session starts (creating the output
 * directory, launching the browser) reject the returned promise.
 *
 * @returns {Promise<void>}
 */
async function scrapeResources() {
  console.log('🚀 Starting Pokémon Play! Resources Scraper');
  console.log(`📁 Output directory: ${OUTPUT_DIR}\n`);

  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    console.log('✅ Created output directory\n');
  }

  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
      // NOTE(review): presumably needed so the in-page fetch in downloadFile
      // can read PDFs served from another origin; confirm before removing.
      '--disable-web-security',
      '--disable-features=IsolateOrigins,site-per-process'
    ]
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });

    // Set realistic user agent
    await page.setUserAgent(
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Set extra headers to appear more like a real browser
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
      'Accept-Encoding': 'gzip, deflate, br',
      Accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    });

    // Navigate to main page
    console.log('🌐 Loading main page...');
    await page.goto(BASE_URL, { waitUntil: 'networkidle0', timeout: 90000 });

    // Wait for content to load - try waiting for a specific element
    console.log('⏳ Waiting for content to render...');
    try {
      await page.waitForSelector('a[href*=".pdf"], .resource, article', {
        timeout: 10000
      });
    } catch {
      // Best effort only: the link scan below still runs on whatever rendered.
      console.log(
        '⚠️  Timeout waiting for specific selectors, continuing anyway...'
      );
    }

    // Extra settle time for late-rendering content.
    await new Promise(resolve => setTimeout(resolve, 5000));

    console.log('✅ Page loaded\n');

    // Debug: Take a screenshot
    await page.screenshot({
      path: path.join(OUTPUT_DIR, 'debug-screenshot.png'),
      fullPage: true
    });
    console.log('📸 Screenshot saved for debugging\n');

    // Debug: Dump HTML content
    const html = await page.content();
    fs.writeFileSync(path.join(OUTPUT_DIR, 'debug-page-source.html'), html);
    console.log('📄 HTML source saved for debugging\n');

    const links = await collectPageLinks(page);

    console.log(`📋 Found ${links.length} total links on page`);

    // Debug: Show first 10 links
    if (links.length > 0) {
      console.log('\n📝 Sample links found:');
      links.slice(0, 10).forEach((link, i) => {
        console.log(`   ${i + 1}. ${link.text.substring(0, 60)}...`);
      });
      console.log('');
    } else {
      console.log(
        '⚠️  No links found - page may require different loading strategy\n'
      );
    }

    // Normalise every label once; links without any text cannot be matched
    // by name and are left out.
    const candidates = links
      .map(link => ({ link, label: normalizeLabel(link.text) }))
      .filter(({ label }) => label !== '');

    // Process each target resource
    let processed = 0;
    let downloaded = 0;
    let skipped = 0;

    for (const targetName of TARGET_RESOURCES) {
      const link = findLinkForTarget(candidates, targetName);

      if (!link) {
        console.log(`⚠️  Could not find link for: ${targetName}`);
        skipped++;
        continue;
      }

      processed++;

      try {
        await saveResource(browser, page, targetName, link);
        downloaded++;
      } catch (error) {
        console.error(
          `   ❌ Error processing ${targetName}: ${error.message}\n`
        );
        skipped++;
      }
    }

    console.log('🎉 Scraping complete!');
    console.log(`📊 Statistics:`);
    console.log(`   Total targets: ${TARGET_RESOURCES.length}`);
    console.log(`   Processed: ${processed}`);
    console.log(`   Downloaded: ${downloaded}`);
    console.log(`   Skipped: ${skipped}`);
  } catch (error) {
    console.error('❌ Fatal error:', error.message);
    // Not process.exit(): that would end the process immediately and skip the
    // browser shutdown in the finally block.
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Run scraper.
// scrapeResources() handles errors from the browser session itself, but
// failures raised before that (creating the output directory, launching the
// browser) reject the returned promise, so they are reported here instead of
// surfacing as an unhandled rejection.
scrapeResources().catch(error => {
  console.error('❌ Fatal error:', error.message);
  process.exitCode = 1;
});