/**
 * This script scrapes a website for table data using Puppeteer and Cheerio.
 * It specifically looks for tables with headers "Date", "Venue", and "Location",
 * and extracts the data from these tables.
 *
 * The script performs the following steps:
 * 1. Launches a Puppeteer browser instance.
 * 2. Sets the user agent and viewport to mimic a real browser.
 * 3. Navigates to the specified URL.
 * 4. Simulates human-like interactions (mouse movements and delays).
 * 5. Extracts the HTML content of the page.
 * 6. Loads the HTML content into Cheerio for parsing.
 * 7. Finds all table elements and checks if they contain the headers "Date", "Venue", and "Location".
 * 8. Extracts the data from the matching tables and returns it.
 *
 * Example usage:
 * const url = 'https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships';
 * scrapeWebsite(url)
 *   .then(data => console.log(data))
 *   .catch(error => console.error(error));
 *
 * Required npm packages:
 * - puppeteer: ^10.0.0
 * - cheerio: ^1.0.0-rc.10
 *
 * Currently not working due to reCAPTCHA on the P!P site.
 */

const puppeteer = require('puppeteer');
const cheerio = require('cheerio');

/**
 * Resolves after the given number of milliseconds.
 *
 * Used instead of `page.waitForTimeout`, which is deprecated and no longer
 * available in recent Puppeteer releases.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Parses an HTML document and extracts the rows of every table whose first
 * row contains the headers "Date", "Venue" and "Location" (case-insensitive).
 *
 * @param {string} html - The HTML content to parse.
 * @returns {Array<Object>} One object per data row; each may carry
 *   `startDate`, `endDate`, `venue` and `location`, depending on which cells
 *   the row actually has.
 */
function extractEventRows(html) {
  const $ = cheerio.load(html);
  const data = [];

  $('table').each((index, table) => {
    const rows = $(table).find('tr');

    // Header cells are normally <th>; <td> is accepted too so tables that
    // mark up their header row with plain cells are still recognised.
    const headers = rows
      .first()
      .find('th, td')
      .map((i, cell) => $(cell).text().trim().toLowerCase())
      .get();

    const isEventTable =
      headers.includes('date') &&
      headers.includes('venue') &&
      headers.includes('location');
    if (!isEventTable) {
      return;
    }

    // Skip the header row and read each remaining row by column position.
    rows.slice(1).each((i, row) => {
      const rowData = {};

      $(row)
        .find('td')
        .each((j, cell) => {
          const header = headers[j];
          const cellText = $(cell).text().trim();

          if (header === 'date') {
            // A range is written "start - end"; a single date is both.
            const dates = cellText.split(' - ');
            rowData.startDate = dates[0];
            rowData.endDate = dates[1] || dates[0];
          } else if (header === 'venue') {
            rowData.venue = cellText;
          } else if (header === 'location') {
            rowData.location = cellText;
          }
        });

      // Rows without any recognised cell (e.g. sub-header or spacer rows)
      // would otherwise show up as empty objects in the result.
      if (Object.keys(rowData).length > 0) {
        data.push(rowData);
      }
    });
  });

  return data;
}

/**
 * Loads a page in a Puppeteer-controlled browser and scrapes the event
 * tables (headers "Date", "Venue", "Location") from the rendered HTML.
 *
 * @param {string} url - The URL of the website to scrape.
 * @returns {Promise<Array<Object>>} A promise that resolves to an array of
 *   objects containing the scraped data (`startDate`, `endDate`, `venue`,
 *   `location`).
 */
async function scrapeWebsite(url) {
  // Launch Puppeteer
  const browser = await puppeteer.launch({
    headless: false, // Run with a visible browser window (not headless)
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
      '--window-size=1920,1080', // Chromium expects "width,height"
    ],
  });

  let content;
  try {
    const page = await browser.newPage();

    // Set user agent to mimic a real browser
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    );

    // Set viewport to mimic a real browser
    await page.setViewport({ width: 1920, height: 1080 });

    // Navigate to the URL
    await page.goto(url, { waitUntil: 'networkidle2' });

    // Simulate human-like interactions
    await delay(2000); // Wait for 2 seconds
    await page.mouse.move(100, 100); // Move mouse to a specific position
    await page.mouse.move(200, 200, { steps: 10 }); // Move mouse with steps
    await delay(1000); // Wait for 1 second

    // Get the HTML content
    content = await page.content();
  } finally {
    // Always release the browser process, even when navigation or
    // interaction fails part-way through.
    await browser.close();
  }

  return extractEventRows(content);
}

// Example usage
const url = 'https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships';
scrapeWebsite(url)
  .then(data => console.log(data))
  .catch(error => console.error(error));