// Constants const CONFIG = { USER_AGENT: "Googlebot/2.1 (+http://www.google.com/bot.html)", RECIPIENT_EMAIL: "tom.spaccialbelli@gmail.com", PRODUCT_BLOCK_REGEX: /]*data-sku[^>]*>([\s\S]*?)(?=]*data-sku)/g, LINK_REGEX: /(\d+)/g, SEE_MORE_PRICE_REGEX: /class="hideFromPro lpPictosMsg"[^>]*>\+ d'offres à partir de (\d+),\d+€/g, SHEETS: { CAT: "CAT", PRODUCTS: "PRODUCTS", COUNT: "COUNT" }, RANGES: { CAT_URLS: "B2:B", PRODUCTS_DATA: { startRow: 2, startCol: 1, numCols: 2 }, COUNT_DATA: { startRow: 2, startCol: 1, numCols: 2 } }, BOT_DETECTION_TEXT: "Comment activer le javascript", PRICE_CHANGE_THRESHOLD: 0.4, BASE_URL: "https://www.cdiscount.com/ProductListUC.mvc/UpdateJsonPage?page=", POST_HEADER : { "accept": "application/json, text/javascript, */*; q=0.01", "accept-language": "en-US,en;q=0.5", "cache-control": "no-cache", "content-type": "application/x-www-form-urlencoded; charset=UTF-8", "pragma": "no-cache", "user-agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", "sec-ch-ua": "\"Googlebot\";v=\"2.1\", \"(compatible; Googlebot/2.1; +http://www.google.com/bot.html)\"", "sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": "\"Linux\"", "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin", "sec-gpc": "1", "x-requested-with": "XMLHttpRequest" }, }; function getCountData(countSheet) { const { startRow, startCol, numCols } = CONFIG.RANGES.COUNT_DATA; const lastRow = countSheet.getLastRow(); if (lastRow >= startRow) { const numRows = lastRow - startRow + 1; return countSheet.getRange(startRow, startCol, numRows, numCols).getValues(); } else { return []; } } function updateCountData(countSheet, requestCount) { const today = new Date().toLocaleDateString(); const countData = getCountData(countSheet); let rowIndex = countData.findIndex(data => { const dateStr = data[0].toLocaleDateString(); return dateStr === today; }); if (rowIndex === -1) { // If the date doesn't exist, add a new row with the date and request count rowIndex = countData.length; countSheet.getRange(CONFIG.RANGES.COUNT_DATA.startRow + rowIndex, CONFIG.RANGES.COUNT_DATA.startCol).setValue(today); countSheet.getRange(CONFIG.RANGES.COUNT_DATA.startRow + rowIndex, CONFIG.RANGES.COUNT_DATA.startCol + 1).setValue(requestCount); } else { // If the date already exists, update the request count in the existing row const existingCount = countData[rowIndex][1]; countSheet.getRange(CONFIG.RANGES.COUNT_DATA.startRow + rowIndex, CONFIG.RANGES.COUNT_DATA.startCol + 1).setValue(existingCount + requestCount); } } // Utility function to parse price from string to integer function parsePrice(priceString) { return priceString ? parseInt(priceString, 10) : null; } // Utility function to log price change function logPriceChange(url, previousPrice, newPrice) { Logger.log(`Price change detected for ${url}`); Logger.log(`Previous price: ${previousPrice}, New price: ${newPrice}`); } function checkBotDetection(responseText) { return responseText.includes(CONFIG.BOT_DETECTION_TEXT); } function parseProductData(productBlockMatch) { const linkMatches = productBlockMatch.match(CONFIG.LINK_REGEX) || []; const priceMatches = productBlockMatch.match(CONFIG.PRICE_REGEX) || []; if (linkMatches.length === 0 || priceMatches.length === 0) { return null; } const productUrl = linkMatches[0].match(/href="([^"]*)"/)[1]; const price = parsePrice(priceMatches[0].split('>')[1]); const seeMorePriceMatch = productBlockMatch.match(CONFIG.SEE_MORE_PRICE_REGEX); const seeMorePrice = seeMorePriceMatch ? parsePrice(seeMorePriceMatch[0].match(/(\d+),\d+/)) : null; const lowestPrice = seeMorePrice !== null && seeMorePrice < price ? seeMorePrice : price; return { productUrl, lowestPrice }; } function comparePrice(previousPrice, lowestPrice) { if (!isNaN(previousPrice) && !isNaN(lowestPrice) && lowestPrice !== null && lowestPrice !== "" && lowestPrice < previousPrice * CONFIG.PRICE_CHANGE_THRESHOLD) { return true; } return false; } function processProductData(productBlockMatches, existingData) { const updatedProductData = []; const newProductData = []; for (const productBlockMatch of productBlockMatches) { if (productBlockMatch.includes("{{nodeid}}") || productBlockMatch.includes("sponsorisés")) { continue; } const productData = parseProductData(productBlockMatch); if (productData) { const { productUrl, lowestPrice } = productData; const existingProductIndex = existingData.findIndex(data => data[0] === productUrl); if (existingProductIndex !== -1) { const previousPrice = parsePrice(existingData[existingProductIndex][1]); if (comparePrice(previousPrice, lowestPrice)) { sendEmailNotification('', productUrl, previousPrice, lowestPrice); updatedProductData.push([productUrl, lowestPrice]); } } else { newProductData.push([productUrl, lowestPrice]); } } } return { updatedProductData, newProductData }; } function fetchProductData(responseText, payload, existingData) { if (checkBotDetection(responseText)) { return { botBlacklisted: true }; } try { const data = JSON.parse(responseText); const html = data.productsHtml; const productBlockMatches = html.match(CONFIG.PRODUCT_BLOCK_REGEX) || []; const { updatedProductData, newProductData } = processProductData(productBlockMatches, existingData); return { updatedProductData, newProductData, totalResultCount: data.totalResultCount }; } catch (error) { Logger.log(`Error parsing response: ${error.message}`); return { updatedProductData: [], newProductData: [], totalResultCount: 0 }; } } // Function to get existing product data from the sheet function getExistingProductData(productsSheet) { const { startRow, startCol, numCols } = CONFIG.RANGES.PRODUCTS_DATA; const lastRow = productsSheet.getLastRow(); if (lastRow >= startRow) { const numRows = lastRow - startRow + 1; return productsSheet.getRange(startRow, startCol, numRows, numCols).getValues(); } else { // If there are no rows in the specified range, return an empty array return []; } } function writeProductData(productsSheet, updatedProductData, newProductData) { const { startRow, startCol, numCols } = CONFIG.RANGES.PRODUCTS_DATA; const lastRow = productsSheet.getLastRow(); let existingData = []; if (lastRow >= startRow) { const numRows = lastRow - startRow + 1; existingData = productsSheet.getRange(startRow, startCol, numRows, numCols).getValues(); } // Update existing product prices for (const [productUrl, newPrice] of updatedProductData) { const rowIndex = existingData.findIndex(data => data[0] === productUrl); if (rowIndex !== -1) { productsSheet.getRange(startRow + rowIndex, startCol + 1).setValue(newPrice); } } // Append new product data if (newProductData.length > 0) { const newDataStartRow = existingData.length + startRow; const newDataRange = productsSheet.getRange(newDataStartRow, startCol, newProductData.length, numCols); newDataRange.setValues(newProductData); } } // Main function to scrape Cdiscount pages and write product data to the "PRODUCTS" sheet function scrapeCdiscountPages() { const spreadsheet = SpreadsheetApp.getActiveSpreadsheet(); const catSheet = spreadsheet.getSheetByName(CONFIG.SHEETS.CAT); const productsSheet = spreadsheet.getSheetByName(CONFIG.SHEETS.PRODUCTS); const countSheet = spreadsheet.getSheetByName(CONFIG.SHEETS.COUNT); const payloads = catSheet.getRange(CONFIG.RANGES.CAT_URLS).getValues().flat().filter(payload => payload !== ""); const existingData = getExistingProductData(productsSheet); const updatedProductData = []; let newProductData = []; // Initial request to check if the bot is blacklisted const initialUrl = "https://www.cdiscount.com/electromenager/lavage-sechage/achat-seche-linge/l-1100105.html"; const initialResponse = UrlFetchApp.fetch(initialUrl, { method: "GET", headers: { "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" }, followRedirects: false }); if (initialResponse.getContentText().includes(CONFIG.BOT_DETECTION_TEXT)) { Logger.log("Not the good IP address"); return; } const firstPageRequests = payloads.map(payload => ({ url: CONFIG.BASE_URL + 1, method: "POST", headers: CONFIG.POST_HEADER, payload: payload, followRedirects: false, validateHttpsCertificates: false, })); const firstPageResponses = UrlFetchApp.fetchAll(firstPageRequests); let totalRequestCount = 0; for (let i = 0; i < payloads.length; i++) { const payload = payloads[i]; const { totalResultCount, botBlacklisted } = fetchProductData(firstPageResponses[i].getContentText(), payload, existingData); if (botBlacklisted) { Logger.log("Not the good IP address"); return; } const totalPages = Math.ceil(totalResultCount / 50); const pageUrls = []; for (let page = 1; page <= totalPages; page++) { const url = CONFIG.BASE_URL + page; pageUrls.push(url); } const requests = pageUrls.map(url => ({ url: url, method: "POST", headers: CONFIG.POST_HEADER, payload: payload, followRedirects: false, muteHttpExceptions: true, validateHttpsCertificates: false, })); try { const responses = UrlFetchApp.fetchAll(requests); totalRequestCount += responses.length; for (const response of responses) { const { updatedProductData: updatedData, newProductData: newData } = fetchProductData(response.getContentText(), payload, existingData); updatedProductData.push(...updatedData); newProductData.push(...newData); } } catch (error) { Logger.log(`Error fetching requests: ${error.message}`); // Continue with the next payload continue; } } // Remove duplicates from newProductData newProductData = Array.from(new Set(newProductData.map(JSON.stringify)), JSON.parse); writeProductData(productsSheet, updatedProductData, newProductData); updateCountData(countSheet, totalRequestCount); } function sendEmailNotification(name, url, previousPrice, newPrice) { const subject = "Price Change Alert"; let body = `The price of the product has changed.\n\nURL: ${url}\nPrevious Price: ${previousPrice}\nNew Price: ${newPrice}`; if (name) { body = `The price of the product "${name}" has changed.\n\nURL: ${url}\nPrevious Price: ${previousPrice}\nNew Price: ${newPrice}`; } GmailApp.sendEmail(CONFIG.RECIPIENT_EMAIL, subject, body, {cc: 'valentin.francon@gmail.com'}); }