Initial commit

This commit is contained in:
2026-01-26 16:43:01 -05:00
commit 23cb27503e
39 changed files with 96557 additions and 0 deletions

125
code/junk-drawer/scrape.js Normal file
View File

@@ -0,0 +1,125 @@
/**
* This script scrapes a website for table data using Puppeteer and Cheerio.
* It specifically looks for tables with headers "Date", "Venue", and "Location",
* and extracts the data from these tables.
*
* The script performs the following steps:
* 1. Launches a Puppeteer browser instance.
* 2. Sets the user agent and viewport to mimic a real browser.
* 3. Navigates to the specified URL.
* 4. Simulates human-like interactions (mouse movements and delays).
* 5. Extracts the HTML content of the page.
* 6. Loads the HTML content into Cheerio for parsing.
* 7. Finds all table elements and checks if they contain the headers "Date", "Venue", and "Location".
* 8. Extracts the data from the matching tables and returns it.
*
* @param {string} url - The URL of the website to scrape.
* @returns {Promise<Array<Object>>} - A promise that resolves to an array of objects containing the scraped data.
*
* Example usage:
* const url = 'https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships';
* scrapeWebsite(url)
* .then(data => console.log(data))
* .catch(error => console.error(error));
*
* Required npm packages:
* - puppeteer: ^10.0.0
* - cheerio: ^1.0.0-rc.10
*
* Currently not working due to reCAPTCHA on the Play! Pokémon site.
*/
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
/**
 * Scrapes tables whose header row contains "Date", "Venue", and "Location"
 * from the given URL, using Puppeteer to render the page and Cheerio to parse it.
 *
 * @param {string} url - The URL of the website to scrape.
 * @returns {Promise<Array<Object>>} Rows shaped as
 *   { startDate, endDate, venue, location }.
 * @throws Propagates Puppeteer navigation/launch errors; the browser is
 *   always closed before the error escapes.
 */
async function scrapeWebsite(url) {
  // Launch with a visible window (headless: false) — presumably to look less
  // like a bot to the site's detection; the original comment claiming
  // "headless mode" contradicted the flag and has been corrected.
  const browser = await puppeteer.launch({
    headless: false,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-accelerated-2d-canvas",
      "--disable-gpu",
      "--window-size=1920x1080",
    ],
  });

  let content;
  try {
    const page = await browser.newPage();
    // Mimic a real desktop browser (user agent + viewport).
    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    );
    await page.setViewport({ width: 1920, height: 1080 });
    await page.goto(url, { waitUntil: "networkidle2" });

    // Simulate human-like interactions (pauses and mouse movement).
    await page.waitForTimeout(2000);
    await page.mouse.move(100, 100);
    await page.mouse.move(200, 200, { steps: 10 });
    await page.waitForTimeout(1000);

    content = await page.content();
  } finally {
    // FIX: close the browser even when navigation or scraping throws,
    // so a failed run does not leak a Chromium process.
    await browser.close();
  }

  const $ = cheerio.load(content);
  const data = [];

  $("table").each((index, table) => {
    const rows = $(table).find("tr");

    // FIX: header cells are <th>/<td> elements inside the first row.
    // The original called firstRow.find("tr"), which looks for nested rows,
    // always yields an empty header list, and therefore never matched a table.
    const headers = [];
    rows
      .first()
      .find("th, td")
      .each((i, cell) => {
        headers.push($(cell).text().trim().toLowerCase());
      });

    if (
      headers.includes("date") &&
      headers.includes("venue") &&
      headers.includes("location")
    ) {
      // Remaining rows are data rows; map each cell to its column header.
      rows.slice(1).each((i, row) => {
        const rowData = {};
        $(row)
          .find("td")
          .each((j, cell) => {
            const header = headers[j];
            const cellText = $(cell).text().trim();
            if (header === "date") {
              // "Jan 1 - Jan 2" → start/end; a single date fills both fields.
              const dates = cellText.split(" - ");
              rowData.startDate = dates[0];
              rowData.endDate = dates[1] || dates[0];
            } else if (header === "venue") {
              rowData.venue = cellText;
            } else if (header === "location") {
              rowData.location = cellText;
            }
          });
        data.push(rowData);
      });
    }
  });

  return data;
}
// Example usage: scrape the 2025 Regional/Special Championships schedule page.
const url =
  "https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships";

(async () => {
  try {
    const data = await scrapeWebsite(url);
    console.log(data);
  } catch (error) {
    console.error(error);
  }
})();