Initial commit
This commit is contained in:
125
code/junk-drawer/scrape.js
Normal file
125
code/junk-drawer/scrape.js
Normal file
@@ -0,0 +1,125 @@
|
||||
/**
|
||||
* This script scrapes a website for table data using Puppeteer and Cheerio.
|
||||
* It specifically looks for tables with headers "Date", "Venue", and "Location",
|
||||
* and extracts the data from these tables.
|
||||
*
|
||||
* The script performs the following steps:
|
||||
* 1. Launches a Puppeteer browser instance.
|
||||
* 2. Sets the user agent and viewport to mimic a real browser.
|
||||
* 3. Navigates to the specified URL.
|
||||
* 4. Simulates human-like interactions (mouse movements and delays).
|
||||
* 5. Extracts the HTML content of the page.
|
||||
* 6. Loads the HTML content into Cheerio for parsing.
|
||||
* 7. Finds all table elements and checks if they contain the headers "Date", "Venue", and "Location".
|
||||
* 8. Extracts the data from the matching tables and returns it.
|
||||
*
|
||||
* @param {string} url - The URL of the website to scrape.
|
||||
* @returns {Promise<Array<Object>>} - A promise that resolves to an array of objects containing the scraped data.
|
||||
*
|
||||
* Example usage:
|
||||
* const url = 'https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships';
|
||||
* scrapeWebsite(url)
|
||||
* .then(data => console.log(data))
|
||||
* .catch(error => console.error(error));
|
||||
*
|
||||
* Required npm packages:
|
||||
* - puppeteer: ^10.0.0
|
||||
* - cheerio: ^1.0.0-rc.10
|
||||
*
|
||||
* Currently not working due to reCAPTCHA on the Play! Pokemon (P!P) site.
|
||||
*/
|
||||
const puppeteer = require("puppeteer");
|
||||
const cheerio = require("cheerio");
|
||||
|
||||
/**
 * Resolves after `ms` milliseconds.
 *
 * A plain timer is used instead of `page.waitForTimeout`, which newer
 * Puppeteer releases no longer provide.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function pause(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Splits the text of a "Date" cell into a start and end date.
 *
 * The separator is a hyphen, en dash or em dash surrounded by whitespace.
 * A cell holding a single date yields the same value for both fields.
 *
 * @param {string} text - Trimmed cell text, e.g. "Sep 13 - Sep 15".
 * @returns {{startDate: string, endDate: string}}
 */
function splitDateRange(text) {
  const [startDate, endDate] = text.split(/\s+[-–—]\s+/);
  return { startDate, endDate: endDate || startDate };
}

/**
 * Collects the data rows of every table whose first row contains the
 * headers "Date", "Venue" and "Location" (case-insensitive).
 *
 * @param {Function} $ - Cheerio instance returned by `cheerio.load`.
 * @returns {Array<Object>} One object per data row with `startDate`,
 *   `endDate`, `venue` and `location` (only the fields present in the row).
 */
function extractEventRows($) {
  const requiredHeaders = ["date", "venue", "location"];
  const events = [];

  $("table").each((_, table) => {
    const rows = $(table).find("tr");

    // Header cells are normally <th>, but some tables use <td> in the first row.
    const headers = [];
    rows
      .first()
      .find("th, td")
      .each((i, cell) => {
        headers.push($(cell).text().trim().toLowerCase());
      });

    if (!requiredHeaders.every((name) => headers.includes(name))) {
      return;
    }

    rows.slice(1).each((i, row) => {
      const rowData = {};

      // Cells are matched to headers by column position.
      $(row)
        .find("td")
        .each((column, cell) => {
          const cellText = $(cell).text().trim();

          switch (headers[column]) {
            case "date":
              Object.assign(rowData, splitDateRange(cellText));
              break;
            case "venue":
              rowData.venue = cellText;
              break;
            case "location":
              rowData.location = cellText;
              break;
            default:
              break;
          }
        });

      // Rows without any recognised cell (e.g. repeated header rows) carry no data.
      if (Object.keys(rowData).length > 0) {
        events.push(rowData);
      }
    });
  });

  return events;
}

/**
 * Opens `url` in a Puppeteer-controlled browser, performs a few human-like
 * interactions, and extracts the rows of every table that has "Date",
 * "Venue" and "Location" headers.
 *
 * The browser is always closed, even when navigation or any page call fails.
 *
 * @param {string} url - The URL of the website to scrape.
 * @param {Object} [options] - Optional timing overrides.
 * @param {number} [options.initialDelayMs=2000] - Wait after the page has loaded.
 * @param {number} [options.finalDelayMs=1000] - Wait after the mouse movements.
 * @returns {Promise<Array<Object>>} Resolves to the scraped rows.
 */
async function scrapeWebsite(
  url,
  { initialDelayMs = 2000, finalDelayMs = 1000 } = {},
) {
  const browser = await puppeteer.launch({
    // `false` opens a visible browser window (i.e. NOT headless).
    // NOTE(review): the previous comment said "Run in headless mode" - confirm which is intended.
    headless: false,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--disable-accelerated-2d-canvas",
      "--disable-gpu",
      // Chromium expects "width,height" for this switch.
      "--window-size=1920,1080",
    ],
  });

  let content;
  try {
    const page = await browser.newPage();

    // Set user agent and viewport to mimic a real browser
    await page.setUserAgent(
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    );
    await page.setViewport({ width: 1920, height: 1080 });

    // Navigate to the URL
    await page.goto(url, { waitUntil: "networkidle2" });

    // Simulate human-like interactions
    await pause(initialDelayMs);
    await page.mouse.move(100, 100);
    await page.mouse.move(200, 200, { steps: 10 });
    await pause(finalDelayMs);

    // Get the HTML content
    content = await page.content();
  } finally {
    // Release the browser process on success and on failure alike.
    await browser.close();
  }

  // Load the HTML content into Cheerio and pull out the matching tables
  return extractEventRows(cheerio.load(content));
}
|
||||
|
||||
// Example usage: scrape the 2025 Regional & Special Championships listing
// and print either the extracted rows or the error that occurred.
const url =
  "https://www.pokemon.com/us/play-pokemon/pokemon-events/championship-series/2025/regional-special-championships";

(async () => {
  try {
    const data = await scrapeWebsite(url);
    console.log(data);
  } catch (error) {
    console.error(error);
  }
})();
|
||||
Reference in New Issue
Block a user