126 lines
4.0 KiB
JavaScript
126 lines
4.0 KiB
JavaScript
/**
 * This script scrapes a website for table data using Puppeteer and Cheerio.
 * It specifically looks for tables with headers "Date", "Venue", and "Location",
 * and extracts the data from these tables.
 *
 * The script performs the following steps:
 * 1. Launches a Puppeteer browser instance.
 * 2. Sets the user agent and viewport to mimic a real browser.
 * 3. Navigates to the specified URL.
 * 4. Simulates human-like interactions (mouse movements and delays).
 * 5. Extracts the HTML content of the page.
 * 6. Loads the HTML content into Cheerio for parsing.
 * 7. Finds all table elements and checks if they contain the headers "Date", "Venue", and "Location".
 * 8. Extracts the data from the matching tables and returns it.
 *
 * @param {string} url - The URL of the website to scrape.
 * @returns {Promise<Array<Object>>} - A promise that resolves to an array of objects containing the scraped data.
 *
 * Example usage:
 *   const url = 'https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships';
 *   scrapeWebsite(url)
 *     .then(data => console.log(data))
 *     .catch(error => console.error(error));
 *
 * Required npm packages:
 * - puppeteer: ^10.0.0
 * - cheerio: ^1.0.0-rc.10
 *
 * Currently not working due to reCAPTCHA on the Play! Pokémon (P!P) site.
 */
|
|
const puppeteer = require('puppeteer');
|
|
const cheerio = require('cheerio');
|
|
|
|
/**
 * Resolves after the given number of milliseconds.
 *
 * Used instead of `page.waitForTimeout`, which is deprecated and absent from
 * newer Puppeteer releases; a plain timer works on every version.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function pause(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Parses HTML and extracts rows from every table whose first row contains
 * the header cells "Date", "Venue" and "Location" (case-insensitive).
 *
 * @param {string} html - Full HTML of the page.
 * @returns {Array<Object>} One object per data row with any of the keys
 *   `startDate`, `endDate`, `venue`, `location`.
 */
function extractEventRows(html) {
  const requiredHeaders = ['date', 'venue', 'location'];
  const $ = cheerio.load(html);
  const data = [];

  $('table').each((index, table) => {
    const rows = $(table).find('tr');

    // Header texts come from the CELLS of the first row. (The previous code
    // searched for <tr> inside the <tr>, which never matches, so no table was
    // ever recognised.) <td> is accepted too for tables that do not use <th>.
    const headers = rows
      .first()
      .find('th, td')
      .toArray()
      .map((cell) => $(cell).text().trim().toLowerCase());

    if (!requiredHeaders.every((name) => headers.includes(name))) {
      return; // Not an events table; continue with the next one.
    }

    // Every row after the header row is treated as a data row.
    rows.slice(1).each((i, row) => {
      const cells = $(row).find('td');
      if (cells.length === 0) {
        return; // Skip rows without data cells instead of emitting `{}`.
      }

      const rowData = {};
      cells.each((j, cell) => {
        // Cells are matched to headers by column position.
        const header = headers[j];
        const cellText = $(cell).text().trim();

        if (header === 'date') {
          // "start - end" range; a single date is used for both ends.
          const [startDate, endDate] = cellText.split(' - ');
          rowData.startDate = startDate;
          // `||` on purpose: an empty end part also falls back to the start.
          rowData.endDate = endDate || startDate;
        } else if (header === 'venue') {
          rowData.venue = cellText;
        } else if (header === 'location') {
          rowData.location = cellText;
        }
      });

      data.push(rowData);
    });
  });

  return data;
}

/**
 * Loads `url` in a Puppeteer-controlled browser, performs a few human-like
 * interactions, and extracts the rows of every table that has "Date",
 * "Venue" and "Location" headers.
 *
 * The browser is always closed, even when navigation or extraction fails.
 *
 * @param {string} url - The URL of the page to scrape.
 * @returns {Promise<Array<Object>>} Resolves to the extracted rows
 *   (`startDate`, `endDate`, `venue`, `location`); empty when no table matches.
 */
async function scrapeWebsite(url) {
  const browser = await puppeteer.launch({
    headless: false, // Opens a visible browser window (not headless)
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
      // Chromium expects "width,height" for this switch, not "widthxheight".
      '--window-size=1920,1080',
    ],
  });

  let content;
  try {
    const page = await browser.newPage();

    // Set user agent to mimic a real browser
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    );

    // Set viewport to mimic a real browser
    await page.setViewport({ width: 1920, height: 1080 });

    // Navigate to the URL
    await page.goto(url, { waitUntil: 'networkidle2' });

    // Simulate human-like interactions
    await pause(2000);
    await page.mouse.move(100, 100);
    await page.mouse.move(200, 200, { steps: 10 });
    await pause(1000);

    // Get the HTML content
    content = await page.content();
  } finally {
    // Release the browser process whether or not the steps above succeeded.
    await browser.close();
  }

  return extractEventRows(content);
}
|
|
|
|
// Example usage: scrape the 2025 Regional & Special Championships page and
// print either the extracted rows or the error that occurred.
const url =
  'https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships';

(async () => {
  try {
    const data = await scrapeWebsite(url);
    console.log(data);
  } catch (error) {
    console.error(error);
  }
})();
|