Initial commit

2026-01-26 16:43:01 -05:00
commit 23cb27503e
39 changed files with 96557 additions and 0 deletions
--- a/code/junk-drawer/scrape.js
+++ b/code/junk-drawer/scrape.js
@@ -0,0 +1,125 @@
+/**
+ * This script scrapes a website for table data using Puppeteer and Cheerio.
+ * It specifically looks for tables with headers "Date", "Venue", and "Location",
+ * and extracts the data from these tables.
+ *
+ * The script performs the following steps:
+ * 1. Launches a Puppeteer browser instance.
+ * 2. Sets the user agent and viewport to mimic a real browser.
+ * 3. Navigates to the specified URL.
+ * 4. Simulates human-like interactions (mouse movements and delays).
+ * 5. Extracts the HTML content of the page.
+ * 6. Loads the HTML content into Cheerio for parsing.
+ * 7. Finds all table elements and checks if they contain the headers "Date", "Venue", and "Location".
+ * 8. Extracts the data from the matching tables and returns it.
+ *
+ * @param {string} url - The URL of the website to scrape.
+ * @returns {Promise<Array<Object>>} - A promise that resolves to an array of objects containing the scraped data.
+ *
+ * Example usage:
+ * const url = 'https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships';
+ * scrapeWebsite(url)
+ *   .then(data => console.log(data))
+ *   .catch(error => console.error(error));
+ *
+ * Required npm packages:
+ * - puppeteer: ^10.0.0
+ * - cheerio: ^1.0.0-rc.10
+ *
+ * Currently not working due to reCAPTCHA on the P!P site.
+ */
+const puppeteer = require("puppeteer");
+const cheerio = require("cheerio");
+
+async function scrapeWebsite(url) {
+  // Launch Puppeteer
+  const browser = await puppeteer.launch({
+    headless: false, // Run in headless mode
+    args: [
+      "--no-sandbox",
+      "--disable-setuid-sandbox",
+      "--disable-dev-shm-usage",
+      "--disable-accelerated-2d-canvas",
+      "--disable-gpu",
+      "--window-size=1920x1080",
+    ],
+  });
+  const page = await browser.newPage();
+  // Set user agent to mimic a real browser
+  await page.setUserAgent(
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+  );
+
+  // Set viewport to mimic a real browser
+  await page.setViewport({ width: 1920, height: 1080 });
+
+  // Navigate to the URL
+  await page.goto(url, { waitUntil: "networkidle2" });
+
+  // Simulate human-like interactions
+  await page.waitForTimeout(2000); // Wait for 2 seconds
+  await page.mouse.move(100, 100); // Move mouse to a specific position
+  await page.mouse.move(200, 200, { steps: 10 }); // Move mouse with steps
+  await page.waitForTimeout(1000); // Wait for 1 second
+
+  // Get the HTML content
+  const content = await page.content();
+  await browser.close();
+
+  // Load the HTML content into Cheerio
+  const $ = cheerio.load(content);
+
+  // Find all table elements
+  const tables = $("table");
+  const data = [];
+
+  // Loop through each table
+  tables.each((index, table) => {
+    const headers = [];
+    const rows = $(table).find("tr");
+
+    // Check if the first row contains the headers Date, Venue, and Location
+    const firstRow = rows.first();
+    firstRow.find("tr").each((i, th) => {
+      headers.push($(th).text().trim().toLowerCase());
+    });
+
+    if (
+      headers.includes("date") &&
+      headers.includes("venue") &&
+      headers.includes("location")
+    ) {
+      // Loop through the remaining rows and extract data
+      rows.slice(1).each((i, row) => {
+        const cells = $(row).find("td");
+        const rowData = {};
+
+        cells.each((j, cell) => {
+          const header = headers[j];
+          const cellText = $(cell).text().trim();
+
+          if (header === "date") {
+            const dates = cellText.split(" - ");
+            rowData.startDate = dates[0];
+            rowData.endDate = dates[1] || dates[0];
+          } else if (header === "venue") {
+            rowData.venue = cellText;
+          } else if (header === "location") {
+            rowData.location = cellText;
+          }
+        });
+
+        data.push(rowData);
+      });
+    }
+  });
+
+  return data;
+}
+
+// Example usage
+const url =
+  "https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships";
+scrapeWebsite(url)
+  .then((data) => console.log(data))
+  .catch((error) => console.error(error));