[go: nahoru, domu]

blob: 977de0d7e5ad91c2b6153372910622bba9b04881 [file] [log] [blame]
// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Debug logging level used throughout this file: code guarded by
// `verbose > 0` logs per-item decisions, `verbose > 1` adds per-candidate
// detail.
var verbose = 0;
// Aliexpress uses 'US $12.34' format in the price.
// Macy's uses "$12.34 to 56.78" format.
// Alternation of textual tags that may precede a price ("sale $12").
var priceCleanupPrefix = 'total price|sale price|price|sale|' +
'with offer|only|our price|now|starting at';
// Per-unit suffix that may follow a price ("$12/each").
var priceCleanupPostfix = '(/(each|set))';
// Shared pattern source: optional leading tag, optional "N /" quantity,
// optional US/USD marker, a dollar amount, an optional "to"/dash range, and
// an optional per-unit suffix.
var priceRegexTemplate = '((reg|regular|orig|from|' + priceCleanupPrefix +
')\\s+)?' +
'(\\d+\\s*/\\s*)?(US(D)?\\s*)?' +
'\\$\\s*[\\d.,]+(\\s+(to|-|–)\\s+(\\$)?[\\d.,]+)?' +
priceCleanupPostfix + '?';
// Anchored variant: the whole string (plus an optional " ea") must be a price.
var priceRegexFull = new RegExp('^' + priceRegexTemplate + '( ea)?$', 'i');
// Unanchored variant: matches a price anywhere inside a longer text.
var priceRegex = new RegExp(priceRegexTemplate, 'i');
// Matches either a leading cleanup prefix (with its trailing whitespace) or a
// trailing per-unit suffix, so it can be stripped with String.replace().
var priceCleanupRegex = new RegExp(
'^((' + priceCleanupPrefix + ')\\s+)|' + priceCleanupPostfix + '$', 'i');
// Matched against an element's outerHTML in isCartItem().
var cartItemHTMLRegex = new RegExp(
'(cart|basket|bundle)[-_]?((\\w+)[-_])?(item|product)', 'i');
// Matched against an element's textContent in isCartItem().
var cartItemTextRegex = new RegExp(
'remove|delete|save for later|move to (favo(u?)rite|list|wish( ?)list)s?',
'i');
var cartItemQtyRegex = new RegExp('qty', 'i');
// Text patterns that make isCartItem() reject an element.
var moveToCartTextRegex = new RegExp('move to (cart|bag)', 'i');
var addToCartTextRegex = new RegExp('add to cart', 'i');
var cartPriceTextRegex = new RegExp('((estimated (sales )?)|(sales ))tax', 'i');
var minicartHTMLRegex = new RegExp('mini-cart-product', 'i');
// NOTE(review): the two productId regexes below are not referenced elsewhere
// in the visible part of this file - confirm whether they are still needed.
var productIdHTMLRegex = new RegExp('<a href="#modal-(\\w+)', 'i');
var productIdURLRegex = new RegExp(
'((\\w+)-\\d+-medium)|(images.cymax.com/Images/\\d+/(\\w+)-)', 'i');
// Used by isInSavedForLater() against an item's lower-cased text.
var saveForLaterRegex = new RegExp('save for later', 'i');
/**
 * Looks for an image URL stored in one of the known lazy-loading attributes.
 *
 * Attributes are tried in a fixed order; the first one holding a value that
 * starts with "http" (after upgrading protocol-relative "//host/..." values
 * to https) wins.
 *
 * @param {Element} image Element exposing getAttribute().
 * @return {?string} The URL, or null when no attribute holds a usable one.
 */
function getLazyLoadingURL(image) {
  // FIXME: some lazy images in Nordstrom and Staples don't have URLs in the
  // DOM.
  // TODO: add more lazy-loading attributes.
  const lazyAttributes = [
    'data-src', 'data-img-url', 'data-config-src', 'data-echo', 'data-lazy',
  ];
  for (const attribute of lazyAttributes) {
    let url = image.getAttribute(attribute);
    if (url == null)
      continue;
    // Protocol-relative URL: assume https.
    if (url.startsWith('//'))
      url = 'https:' + url;
    // Relative paths and other schemes are not usable; try the next attribute.
    if (!url.startsWith('http'))
      continue;
    return url;
  }
  // Explicit null (rather than falling off the end) so callers comparing with
  // `!= null` and `!== null` both behave the same way.
  return null;
}
/**
 * Collects the image-like elements under |root| that are tall enough to be a
 * product image and are not hidden via CSS visibility.
 *
 * Candidates are <img> elements; if there are none, <amp-img> elements; if
 * there are still none, elements with class "bg-img".
 *
 * @param {Element|Document} root Subtree to search.
 * @param {number} atLeast Minimum offsetHeight for an image to be kept.
 * @param {boolean=} relaxed When true, shorter images are still kept if they
 *     are aria-hidden, carry a lazy-loading URL, or sit inside a <picture>.
 * @return {!Array<Element>} Matching elements in DOM order.
 */
function getLargeImages(root, atLeast, relaxed = false) {
  let candidates = root.querySelectorAll('img');
  if (candidates.length == 0) {
    // Aliexpress
    candidates = root.querySelectorAll('amp-img');
  }
  if (candidates.length == 0) {
    // Google store
    candidates = root.querySelectorAll('.bg-img');
  }
  // Declared locally: the previous bare assignment created a global and
  // throws in strict mode.
  const images = [];
  // Decides whether an image below the height threshold is kept anyway.
  function shouldStillKeep(image) {
    if (!relaxed)
      return false;
    if (image.getAttribute('aria-hidden') == 'true')
      return true;
    if (getLazyLoadingURL(image) != null)
      return true;
    // For test files on target.com the images aren't preserved for
    // some products so we need to look for the images in the parent
    // picture tag.
    if (image.parentElement.tagName == 'PICTURE')
      return true;
    return false;
  }
  for (const image of candidates) {
    if (verbose > 1)
      console.log('offsetHeight', image, image.offsetHeight);
    if (image.offsetHeight < atLeast) {
      if (!shouldStillKeep(image))
        continue;
    }
    if (window.getComputedStyle(image)['visibility'] == 'hidden')
      continue;
    images.push(image);
  }
  return images;
}
/**
 * Filters |list| down to the elements that occupy layout space.
 *
 * An element is dropped when either its offsetHeight or its offsetWidth is 0.
 * (The previous code tested offsetHeight twice, so zero-width elements were
 * never filtered out.)
 *
 * @param {Iterable<Element>} list Elements to filter.
 * @return {!Array<Element>} The elements with non-zero height and width, in
 *     their original order.
 */
function getVisibleElements(list) {
  // Declared locally: the previous bare assignment created a global and
  // throws in strict mode.
  const visible = [];
  for (const ele of list) {
    if (ele.offsetHeight == 0 || ele.offsetWidth == 0)
      continue;
    visible.push(ele);
  }
  return visible;
}
// Tells whether the current page's host is one where an item may legitimately
// contain more than one large image (e.g. CraigsList).
function multipleImagesSupported() {
  // When saving target.com to mhtml, the color selecting images become very
  // large and are picked up. Listing target.com here is a workaround for this
  // problem. In target we only get one image per product.
  const multiImageHostSuffixes = [
    'craigslist.org',
    'target.com',
    'zazzle.com',
    'ashleyfurniture.com',
    'chewy.com',
  ];
  const pageHost = new URL(document.baseURI).hostname;
  return multiImageHostSuffixes.some((suffix) => pageHost.endsWith(suffix));
}
/**
 * Finds the product image URL for a cart item element.
 *
 * @param {Element} item Candidate item element.
 * @return {?string} The image URL, or null when no single suitable image
 *     could be identified.
 */
function extractImage(item) {
  const pageHost = new URL(document.baseURI).hostname;
  // Some merchant sites have product images as background of a div element.
  // Those are located by a site-specific selector instead of by size.
  let backgroundSelector = null;
  if (pageHost.endsWith("americastire.com") ||
      pageHost.endsWith("discounttire.com")) {
    backgroundSelector = ".product-image__image-block";
  } else if (pageHost.endsWith("discounttiredirect.com")) {
    backgroundSelector = ".cart-item__product-image";
  }
  if (backgroundSelector !== null) {
    const block = item.querySelector(backgroundSelector);
    return block == null ? null : extractImageUrl(block);
  }

  // Sometimes an item contains small icons, which need to be filtered out.
  // TODO: two pass getLargeImages() is probably too slow.
  let largeImages = getLargeImages(item, 40);
  if (largeImages.length == 0) {
    // Second, more forgiving pass.
    largeImages = getLargeImages(item, 30, true);
  }
  if (largeImages.length == 0) {
    return null;
  }

  // On most sites an item with several large images is ambiguous: give up.
  if (!multipleImagesSupported()) {
    if (verbose > 0)
      console.assert(
          largeImages.length == 1, 'image extraction error', item,
          largeImages);
    if (largeImages.length != 1) {
      return null;
    }
  }

  // Only chewy.com gets to fall back to later images; everywhere else just
  // the first image is tried.
  const toTry = document.URL.includes("chewy.com") ?
      largeImages :
      largeImages.slice(0, 1);
  for (const candidate of toTry) {
    const resolved = extractImageUrl(candidate);
    if (resolved !== null) return resolved;
  }
  return null;
}
/**
 * Resolves the URL of a single image-like element.
 *
 * Sources are tried in this order: a lazy-loading attribute, the CSS
 * background-image (for the known background-image classes only), the
 * element's src, and finally the first entry of a source set.
 *
 * @param {Element} image <img>, <amp-img> or a div with a background image.
 * @return {?string} The URL, or null when none could be determined.
 */
function extractImageUrl(image) {
  const lazyUrl = getLazyLoadingURL(image);
  if (lazyUrl != null)
    return lazyUrl;
  // Special handling for Google store, America's Tire and Discount
  // Tire Direct.
  if (image.className === "bg-img"
      || image.className.includes("product-image__image-block")
      || image.className.includes("cart-item__product-image")) {
    if (image.style.backgroundImage == undefined) {
      return null;
    }
    // Pull the quoted URL out of `url("...")`.
    const matches = image.style.backgroundImage.match('[\"\'](.*)[\"\']');
    if (matches === null) {
      return null;
    } else {
      return matches[1];
    }
  }
  // If |image| is <amp-img>, image.src won't work.
  const src = image.src || image.getAttribute('src');
  if (verbose > 1)
    console.log('image src', src);
  if (src != null) {
    // data: images are usually placeholders.
    // Even if it's valid, we prefer http(s) URLs.
    if (!src.startsWith('data:')) {
      // Get absolute URL in case it's <amp-img>.
      return (new URL(src, document.location)).href;
    }
  }
  let sourceSet = image.getAttribute('data-search-image-source-set');
  if (sourceSet == null && image.parentElement.tagName == 'PICTURE') {
    const sources = image.parentElement.querySelectorAll('source');
    if (sources.length >= 1) {
      sourceSet = getAbsoluteUrlOfSrcSet(sources[0]);
    }
  }
  if (sourceSet == null)
    return null;
  console.assert(sourceSet.includes(' '), 'image extraction error', image);
  // TODO: Pick the one with right pixel density?
  // Declared locally: the previous bare assignment wrote to a global named
  // `imageUrl` and throws in strict mode.
  const imageUrl = sourceSet.split(' ')[0];
  console.assert(imageUrl.length > 0, 'image extraction error', sourceSet);
  return imageUrl;
}
// Resolves an element's srcset value through its src property: the srcset is
// temporarily assigned to src, src is read back, and the original src is then
// restored.
// Same self-assigning trick as
// https://github.com/chromium/dom-distiller/blob/ccfe233400cc214717ccc80973be431ab0e33cf7/java/org/chromium/distiller/DomUtil.java#L438
function getAbsoluteUrlOfSrcSet(image) {
  const originalSrc = image.src;
  image.src = image.srcset;
  const resolved = image.src;
  // Put the element back the way it was found.
  image.src = originalSrc;
  return resolved;
}
/**
 * Extracts the product page URL for a cart item.
 *
 * Returns the href of the first usable anchor in DOM order. The old body also
 * carried a "largest area" reduction over a `filtered` array, but the loop
 * always returned as soon as it pushed an anchor, so that code could never
 * run; it has been removed without changing behaviour.
 *
 * @param {Element} item The item element, or the anchor itself.
 * @return {?string} "" on sites known not to expose product links, the URL
 *     when one is found, or null otherwise.
 */
function extractUrl(item) {
  // Some sites don't use <a> tags or explicitly state href. E.g. ae.com
  // shows a side panel after clicking on each item instead of directing to
  // the product page, and some sites might trigger JS to initiate navigation
  // instead of <a>.
  const sitesWithoutProductLinks = [
    "ae.com",
    "kiehls.com",
    "discounttiredirect.com",
    "shutterfly.com",
    "bkstr.com",
  ];
  if (sitesWithoutProductLinks.some((site) => document.URL.includes(site))) {
    return "";
  }
  const anchors = item.tagName == 'A' ? [item] : item.querySelectorAll('a');
  console.assert(anchors.length >= 1, 'url extraction error', item);
  for (const anchor of anchors) {
    // Links that end in "/#" are skipped.
    if (anchor.href.match(/\/#$/))
      continue;
    // href="javascript:" would be sanitized when serialized to MHTML.
    if (anchor.href.match(/^javascript:/))
      continue;
    if (anchor.href == '') {
      // For Sears
      let href = anchor.getAttribute('bot-href');
      if (href != null && href.length > 0) {
        // Resolve to absolute URL by round-tripping through the href
        // property, then undo the temporary attribute.
        anchor.href = href;
        href = anchor.href;
        anchor.removeAttribute('href');
        if (href != '')
          return href;
      }
      continue;
    }
    // TODO: This returns the first URL in DOM order.
    // Use the one with largest area instead?
    return anchor.href;
  }
  // No anchors at all, or none of them usable.
  return null;
}
/**
 * Tells whether the element's computed `display` value contains "inline"
 * (so "inline", "inline-block", "inline-flex", ... all qualify).
 *
 * @param {Element} element
 * @return {boolean}
 */
function isInlineDisplay(element) {
  const {display} = window.getComputedStyle(element);
  return display.includes('inline');
}
/**
 * Counts the direct children of |element| whose display is not inline.
 *
 * The previous condition was inverted: it counted the inline children, which
 * is the opposite of what the function name promises.
 *
 * @param {Element} element
 * @return {number} Number of non-inline child elements.
 */
function childElementCountExcludingInline(element) {
  let count = 0;
  for (const child of element.children) {
    if (!isInlineDisplay(child))
      count += 1;
  }
  return count;
}
/**
 * True when |element| itself is not inline, or when any of its descendants
 * is not inline.
 *
 * @param {Element} element
 * @return {boolean}
 */
function hasNonInlineDescendentsInclusive(element) {
  return !isInlineDisplay(element) || hasNonInlineDescendents(element);
}
/**
 * True when any descendant of |element| (not counting |element| itself) has
 * a non-inline display.
 *
 * @param {Element} element
 * @return {boolean}
 */
function hasNonInlineDescendents(element) {
  return Array.from(element.children)
      .some((child) => hasNonInlineDescendentsInclusive(child));
}
/**
 * True when |element| has at least one direct text-node child whose content
 * is not just whitespace.
 *
 * @param {Element} element
 * @return {boolean}
 */
function hasNonWhiteTextNodes(element) {
  return Array.from(element.childNodes).some(
      (node) => node.nodeType == document.TEXT_NODE &&
          node.nodeValue.trim() != '');
}
// Builds one string out of the className and id of |element| and of its
// ancestors, walking upwards for at most |maxDepth| levels and stopping
// before any ancestor that contains |excludingElement|.
// |element| itself always contributes, even when it already contains
// |excludingElement|.
// |maxDepth| counts the starting level, so maxDepth = 1 means just |element|.
// maxDepth >= 3 causes error in Walmart deals if not deducting "price".
function ancestorIdAndClassNames(element, excludingElement, maxDepth = 3) {
  let collected = '';
  let current = element;
  for (let level = 1;; level++) {
    collected += current.className + current.id;
    current = current.parentElement;
    const reachedLimit = level >= maxDepth;
    if (reachedLimit || !current || current.contains(excludingElement))
      return collected;
  }
}
/*
Picks the best title element out of |elementArray| by pairwise comparison.
Criteria are applied in this order; the first one that differs decides:
- having at least 2 characters of trimmed text
- score from ancestorIdAndClassNames: +1 for a "title"-like name, -1 for a
  "price"/"model"-like name
- larger bounding-box area
- larger font size
- longer text (on a tie the later element wins)
*/
function chooseTitle(elementArray) {
  const titleRegex = /name|title|truncate|desc|brand/i;
  const negativeRegex = /price|model/i;

  // Titles are typically 2 characters or more.
  const isLongEnough = (element) => element.innerText.trim().length >= 2;
  const nameScore = (names) =>
      (names.match(titleRegex) != null) - (names.match(negativeRegex) != null);
  // Use getBoundingClientRect() to avoid int rounding error in
  // offsetHeight/Width.
  const rectArea = (element) => element.getBoundingClientRect().width *
      element.getBoundingClientRect().height;
  const fontSizeOf = (element) =>
      parseFloat(window.getComputedStyle(element)['font-size']);

  // Returns whichever of the two elements ranks higher.
  const pickBetter = (a, b) => {
    const aLongEnough = isLongEnough(a);
    const bLongEnough = isLongEnough(b);
    if (aLongEnough != bLongEnough) {
      return aLongEnough ? a : b;
    }

    const aNames = ancestorIdAndClassNames(a, b);
    const bNames = ancestorIdAndClassNames(b, a);
    const aScore = nameScore(aNames);
    const bScore = nameScore(bNames);
    if (verbose > 1)
      console.log('className score', aScore, bScore, aNames, bNames, a, b);
    if (aScore != bScore) {
      return aScore > bScore ? a : b;
    }

    const aArea = rectArea(a);
    const bArea = rectArea(b);
    if (verbose > 1)
      console.log(
          'getBoundingClientRect', a.getBoundingClientRect(),
          b.getBoundingClientRect(), a, b);
    if (aArea != bArea) {
      return aArea > bArea ? a : b;
    }

    const aFont = fontSizeOf(a);
    const bFont = fontSizeOf(b);
    if (verbose > 1)
      console.log('font size', aFont, bFont, a, b);
    if (aFont != bFont) {
      return aFont > bFont ? a : b;
    }

    return a.innerText.length > b.innerText.length ? a : b;
  };

  return elementArray.reduce((best, next) => pickBetter(best, next));
}
/**
 * Extracts the product title text from a cart item element.
 *
 * Candidate elements are filtered (pure layout containers, near-invisible
 * elements, empty text, "sponsored" labels, wrappers whose text equals that
 * of their first or last child) and, when several remain, ranked with
 * chooseTitle().
 *
 * @param {Element} item Candidate item element.
 * @return {?string} The trimmed title text, or null when none was found.
 */
function extractTitle(item) {
  const possible_titles =
      item.querySelectorAll('a, span, p, div, h1, h2, h3, h4, h5, strong');
  let titles = [];
  for (const candidate of possible_titles) {
    // Skip containers that only hold block-level children and no text of
    // their own.
    if (hasNonInlineDescendents(candidate) &&
        !hasNonWhiteTextNodes(candidate)) {
      continue;
    }
    // Too small to be a title.
    if (candidate.offsetWidth <= 1 || candidate.offsetHeight <= 1)
      continue;
    if (candidate.innerText.trim() == '')
      continue;
    if (candidate.innerText.trim().toLowerCase() == 'sponsored')
      continue;
    if (candidate.childElementCount > 0) {
      // A wrapper whose whole text comes from its first or last child adds
      // nothing; that child is considered on its own.
      if (candidate.textContent.trim() ==
              candidate.lastElementChild.textContent.trim() ||
          candidate.textContent.trim() ==
              candidate.firstElementChild.textContent.trim()) {
        continue;
      }
    }
    // Aliexpress has many items without title. Without the following filter,
    // the title would be the price.
    // if (title.innerText.trim().match(priceRegexFull)) continue;
    titles.push(candidate);
  }
  if (titles.length > 1) {
    if (verbose > 1)
      console.log('all generic titles', item, titles);
    titles = [chooseTitle(titles)];
  }
  if (verbose > 0)
    console.log('titles', item, titles);
  console.assert(titles.length == 1, 'titles extraction error', item, titles);
  if (titles.length != 1)
    return null;
  // Returned directly: the previous code stored the result in an undeclared
  // `title`, creating a global (and throwing in strict mode).
  return titles[0].innerText.trim();
}
/**
 * Re-inserts the decimal point into a price whose cents are rendered in a
 * separate child element (so innerText reads e.g. "$1299" for $12.99).
 *
 * The cents are taken from the last child that has non-empty text and no
 * "/". The adjustment is applied only when that text is exactly two
 * characters, forms the tail of the price text, and is preceded by a digit.
 *
 * @param {Element} priceElement Element whose text matched the price pattern.
 * @return {?string} The price with a "." before the cents, or null when the
 *     element does not follow this layout.
 */
function adjustBeautifiedCents(priceElement) {
  const text = priceElement.innerText.trim().replace(/\/(each|set)$/i, '');
  let cents;
  const children = priceElement.children;
  // Walk backwards: the cents are expected in the last meaningful child.
  for (let i = children.length - 1; i >= 0; i--) {
    const t = children[i].innerText.trim();
    if (t == '')
      continue;
    // Skip per-unit suffixes such as "/each".
    if (t.indexOf('/') != -1)
      continue;
    cents = t;
    break;
  }
  if (cents == null)
    return null;
  if (verbose > 0)
    console.log('cents', cents, priceElement);
  if (cents.length == 2 && cents == text.slice(-cents.length) &&
      text.slice(-3, -2).match(/\d/)) {
    return text.slice(0, text.length - cents.length) + '.' + cents;
  }
  // Explicit null instead of falling off the end with undefined.
  return null;
}
/**
 * Checks whether |element| or one of its nearest ancestors is rendered with
 * a line-through text decoration.
 *
 * @param {?Element} element Starting element.
 * @param {number=} maxDepth How many levels to inspect, counting |element|.
 *     The walk also stops at <body> or when there is no parent left.
 * @return {boolean}
 */
function anyLineThroughInAncentry(element, maxDepth = 2) {
  let node = element;
  for (let inspected = 1; node != null && node.tagName != 'BODY';
       inspected++) {
    const decoration = window.getComputedStyle(node)['text-decoration'];
    if (decoration.includes('line-through'))
      return true;
    if (inspected >= maxDepth)
      return false;
    node = node.parentElement;
  }
  return false;
}
/**
 * Parses the numeric value of a price string after stripping what
 * priceCleanupRegex matches and any leading "$" characters.
 *
 * @param {string} str Price text such as "sale $12.34".
 * @return {number} The parsed number, or NaN when nothing numeric is left.
 */
function forgivingParseFloat(str) {
  const withoutTags = str.replace(priceCleanupRegex, '');
  const withoutDollar = withoutTags.replace(/^[$]*/, '');
  return parseFloat(withoutDollar);
}
/**
 * Picks the most plausible current price out of the captured price strings
 * and returns it with what priceCleanupRegex matches removed.
 *
 * Pairwise ranking, first difference decides: contains a positive tag
 * ("with offer", then "sale", then "now"); does not contain a per-unit tag
 * ("/set", then "/each"); smaller numeric value (on a tie the earlier string
 * is kept).
 *
 * @param {!Array<string>} priceArray Captured price texts.
 * @return {?string} The chosen price, or null for an empty array.
 */
function choosePrice(priceArray) {
  if (priceArray.length == 0)
    return null;

  const mentions = (text, tag) => text.toLowerCase().indexOf(tag) != -1;

  const pickBetter = (a, b) => {
    // Positive tags
    for (const tag of ['with offer', 'sale', 'now']) {
      const inA = mentions(a, tag);
      const inB = mentions(b, tag);
      if (inA != inB) {
        return inA ? a : b;
      }
    }
    // Negative tags
    for (const tag of ['/set', '/each']) {
      const inA = mentions(a, tag);
      const inB = mentions(b, tag);
      if (inA != inB) {
        return inA ? b : a;
      }
    }
    // Guess the smallest numerical value.
    // The tags like "now" don't always fall inside element boundary.
    // See Nordstrom/homepage-eager.mhtml.
    return forgivingParseFloat(a) > forgivingParseFloat(b) ? b : a;
  };

  let best = priceArray[0];
  for (let index = 1; index < priceArray.length; index++) {
    best = pickBetter(best, priceArray[index]);
  }
  return best.replace(priceCleanupRegex, '');
}
/**
 * Extracts the price text of a cart item.
 *
 * @param {Element} item Candidate item element.
 * @return {?string} "" on shein.com (price deliberately not extracted), the
 *     price text when one is found, or null when the item has no recognisable
 *     price.
 */
function extractPrice(item) {
  const hostname = new URL(document.baseURI).hostname;
  // shein.com shows price by one element per digit and it's challenging
  // to decide based on textContent.
  if (hostname.endsWith("shein.com")) {
    return "";
  }
  // Etsy mobile
  const prices = item.querySelectorAll('.currency-value');
  if (prices.length == 1) {
    let ans = prices[0].textContent.trim();
    if (ans.match(/^\d/))
      ans = '$' + ans;  // for Etsy
    if (ans != '')
      return ans;
  }
  // Generic heuristic to search for price elements.
  const captured_prices = [];
  for (const price of item.querySelectorAll(
           'span, b, p, div, h3, td, li, em, strong, ins')) {
    let candidate = price.innerText.trim();
    // Some sites render several lines inside the price element; pick the
    // line that carries the price.
    if (hostname.endsWith("urbanoutfitters.com") ||
        hostname.endsWith("freepeople.com")) {
      // Declared locally: the previous bare assignment created a global and
      // throws in strict mode.
      const priceParts = candidate.split("\n");
      if (priceParts.length >= 2) {
        candidate = priceParts[1];
      }
    } else if (hostname.endsWith("thecompanystore.com") ||
        hostname.endsWith("childrensplace.com") ||
        hostname.endsWith("chewy.com")) {
      candidate = candidate.split("\n")[0];
    }
    if (!candidate.match(priceRegexFull))
      continue;
    if (verbose > 1)
      console.log('price candidate', candidate, price);
    if (price.childElementCount > 0) {
      // Avoid matching the parent element of the real price element.
      // Otherwise adjustBeautifiedCents would break.
      if (price.innerText.trim() == price.lastElementChild.innerText.trim() ||
          price.innerText.trim() == price.firstElementChild.innerText.trim()) {
        // If the wanted child is not scanned, change the querySelectorAll
        // string.
        if (verbose > 1)
          console.log('skip redundant parent', price);
        continue;
      }
    }
    // TODO: check child elements recursively.
    // Struck-through prices are skipped.
    if (anyLineThroughInAncentry(price)) {
      if (verbose > 1)
        console.log('line-through', price);
      continue;
    }
    // for Amazon and HomeDepot
    if (candidate.indexOf('.') == -1 && price.lastElementChild != null) {
      const adjusted = adjustBeautifiedCents(price);
      if (adjusted != null)
        return adjusted;
    }
    captured_prices.push(candidate);
  }
  if (verbose > 0)
    console.log('captured_prices', captured_prices);
  return choosePrice(captured_prices);
}
/**
 * Picks the product ID out of a String.match() result.
 *
 * @param {?Array<string|undefined>} productIdMatches Match result, or null
 *     when the pattern did not match.
 * @param {number=} matchIndex Capture group to use. When omitted, the last
 *     entry that is not undefined is used.
 * @return {?string|undefined} null when there was no match or nothing was
 *     captured; otherwise the selected entry (which may be undefined when
 *     |matchIndex| names a group that did not participate).
 */
function getProductIdFromMatches(productIdMatches, matchIndex = undefined) {
  if (productIdMatches === null) {
    return null;
  }
  if (matchIndex !== undefined) {
    return productIdMatches[matchIndex];
  }
  // Scan from the end towards the front for the last participating group.
  let index = productIdMatches.length;
  while (index-- > 0) {
    const captured = productIdMatches[index];
    if (captured !== undefined) {
      return captured;
    }
  }
  return null;
}
/**
 * Applies the host-specific ID pattern from |patternMap| to the matching
 * source text in |sourceMap|.
 *
 * Only the first source (in |sourceMap| key order) that has a pattern for the
 * current hostname is evaluated; its result is returned even when it is null.
 *
 * @param {!Object<string, string>} sourceMap Source name -> text to search.
 * @param {!Object<string, !Object>} patternMap Source name -> hostname ->
 *     either a pattern string or a [pattern, captureGroupIndex] pair.
 * @return {?string|undefined} Result of getProductIdFromMatches(), or null
 *     when no pattern applies to this host.
 */
function getProductIdWithPattern(sourceMap, patternMap) {
  const pageHost = window.location.hostname;
  for (const [sourceName, sourceText] of Object.entries(sourceMap)) {
    const patternsByHost = patternMap[sourceName];
    if (patternsByHost === undefined || !(pageHost in patternsByHost)) {
      continue;
    }
    const heuristic = patternsByHost[pageHost];
    // Normalise both accepted shapes to (pattern, optional group index).
    const [pattern, groupIndex] =
        Array.isArray(heuristic) ? heuristic : [heuristic, undefined];
    const matches = sourceText.match(new RegExp(pattern, 'i'));
    return getProductIdFromMatches(matches, groupIndex);
  }
  return null;
}
/**
 * Derives an optional product ID for an item using the externally supplied
 * pattern maps |idExtractionMap| and |couponIdExtractionMap|, when they are
 * defined. The ID map is consulted first.
 *
 * @param {string} url Product URL.
 * @param {string} imageUrl Product image URL.
 * @param {Element} item Item element; its outerHTML is the third source.
 * @return {?string} The product ID, or null when neither map is defined or
 *     neither yields a non-null ID.
 */
function extractProductId(url, imageUrl, item) {
  const hasIdMap = typeof idExtractionMap !== 'undefined';
  const hasCouponMap = typeof couponIdExtractionMap !== 'undefined';
  if (!hasIdMap && !hasCouponMap) {
    return null;
  }
  const sourceMap = {
    "product_url": url,
    "product_image_url": imageUrl,
    "product_element": item.outerHTML,
  };
  // Maps to try, in priority order.
  const patternMaps = [];
  if (hasIdMap) patternMaps.push(idExtractionMap);
  if (hasCouponMap) patternMaps.push(couponIdExtractionMap);
  for (const patternMap of patternMaps) {
    const productId = getProductIdWithPattern(sourceMap, patternMap);
    if (productId !== null) return productId;
  }
  return null;
}
/**
 * Extracts all product fields from one candidate item element.
 *
 * Image, URL, title and price are all required; as soon as one of them is
 * missing (null) the item is rejected. productId is optional.
 *
 * @param {Element} item Candidate item element.
 * @return {?{url: string, imageUrl: string, title: string, price: string,
 *     productId: (string|undefined)}} The extracted product, or null.
 */
function extractItem(item) {
  // All locals are declared: the previous bare assignments to imageUrl, url
  // and title created globals and throw in strict mode.
  const imageUrl = extractImage(item);
  if (imageUrl == null) {
    if (verbose > 0)
      console.warn('no images found', item);
    return null;
  }
  const url = extractUrl(item);
  // Some items in Sears and Staples only have ng-click or onclick handlers,
  // so it's impossible to extract URL.
  if (url == null) {
    if (verbose > 0)
      console.warn('no url found', item);
    return null;
  }
  const title = extractTitle(item);
  if (title == null) {
    if (verbose > 0)
      console.warn('no title found', item);
    return null;
  }
  const price = extractPrice(item);
  // eBay "You may also like" and "Guides" are not product items.
  // Not having price is one hint.
  // FIXME: "Also viewed" items in Gap don't have prices.
  if (price == null) {
    if (verbose > 0)
      console.warn('no price found', item);
    return null;
  }
  const extractionResult =
      {'url': url, 'imageUrl': imageUrl, 'title': title, 'price': price};
  // productId is an optional field for extraction.
  const productId = extractProductId(url, imageUrl, item);
  if (productId !== null) {
    extractionResult['productId'] = productId;
  }
  return extractionResult;
}
/**
 * Returns the nearest element that contains both |a| and |b|: |a| itself
 * when it contains |b|, otherwise the first ancestor of |a| that does.
 *
 * @param {Element} a
 * @param {Element} b
 * @return {Element}
 */
function commonAncestor(a, b) {
  return a.contains(b) ? a : commonAncestor(a.parentElement, b);
}
/**
 * Folds commonAncestor() over |list| to find the nearest element containing
 * every entry. A single-element list yields that element; an empty list
 * throws, as Array.prototype.reduce does without an initial value.
 *
 * @param {!Array<Element>} list Non-empty array of elements.
 * @return {Element}
 */
function commonAncestorList(list) {
  return list.reduce((shared, next) => commonAncestor(shared, next));
}
/**
 * True when |target| contains, or is contained by, any element of |list|.
 *
 * @param {Element} target
 * @param {Iterable<Element>} list
 * @return {boolean}
 */
function hasOverlap(target, list) {
  return Array.from(list).some(
      (other) => other.contains(target) || target.contains(other));
}
/**
 * Matches |pattern| against the lower-cased text or markup of |item|.
 *
 * @param {?Element} item Element to inspect.
 * @param {RegExp|string} pattern Pattern handed to String.match().
 * @param {boolean} matchText True to search textContent, false to search
 *     outerHTML.
 * @return {false|?Array<string>} false when |item| is null; otherwise the
 *     String.match() result (an array, or null when nothing matched).
 */
function matchPattern(item, pattern, matchText) {
  if (item === null) return false;
  const haystack = matchText ? item.textContent : item.outerHTML;
  const normalized = haystack.toLowerCase();
  return normalized.match(pattern);
}
/**
 * Heuristically decides whether |item| is an entry of the shopping cart.
 *
 * @param {Element} item Candidate element.
 * @return {*} false when a rejection rule fires; otherwise the result of the
 *     first accepting pattern match (a truthy match array), or null when no
 *     accepting pattern matches.
 */
function isCartItem(item) {
  // TODO: Improve the heuristic here to accommodate more formats of cart item.
  const pageUrl = document.URL;
  const onSite = (domain) => pageUrl.includes(domain);
  const textMatch = (regex) => matchPattern(item, regex, true);
  const htmlMatch = (regex) => matchPattern(item, regex, false);

  // Rejected when the item text offers "move to cart".
  if (textMatch(moveToCartTextRegex)) return false;
  // Walmart has 'move to cart' outside of the div.cart-item.
  if (matchPattern(item.parentElement, moveToCartTextRegex, true)) return false;
  // Rejected when the text mentions (estimated/sales) tax.
  if (textMatch(cartPriceTextRegex)) return false;

  // Item element in bestbuy.com contains "add to cart" for things
  // like protection plans.
  const toleratesAddToCart =
      onSite("bestbuy.com") || onSite("orientaltrading.com");
  if (!toleratesAddToCart && textMatch(addToCartTextRegex)) return false;

  const isAshley = onSite("ashleyfurniture.com");
  const checksMinicart =
      isAshley || onSite("gnc.com") || onSite("bathandbodyworks.com");
  if (checksMinicart && htmlMatch(minicartHTMLRegex)) return false;
  // On ashleyfurniture.com a cart item must mention a quantity.
  if (isAshley && textMatch(cartItemQtyRegex) === null) return false;

  return textMatch(cartItemTextRegex) || textMatch(cartItemQtyRegex) ||
      htmlMatch(cartItemHTMLRegex);
}
// Runs the cart-item filters on one candidate element and, if it passes,
// extracts it with extractItem().
//
// item: candidate element.
// extracted_items: array of elements already extracted; appended to on
//     success and used to reject overlapping candidates.
// processed: Set of elements already examined; used to skip repeats.
// output: Map from element to its extraction result; written on success.
// savedForLaterSection: element returned by getSavedForLaterSection(), or
//     null.
// skipFiltering: when true, all filters are bypassed and the item is
//     extracted directly.
//
// Returns nothing; results are reported through |output| and
// |extracted_items|.
function extractOneItem(item, extracted_items, processed, output,
savedForLaterSection, skipFiltering) {
if (skipFiltering) {
const extraction = extractItem(item);
if (extraction != null) {
output.set(item, extraction);
extracted_items.push(item);
}
return;
}
if (verbose > 1) {
console.log('trying', item);
}
if (item.childElementCount == 0 && item.parentElement.tagName != 'BODY') {
// Amazon store page uses overlay <a>.
// A childless element is replaced by its parent before filtering.
// NOTE(review): the null check below runs after item.parentElement.tagName
// has already been read above, so it cannot guard that access.
item = item.parentElement;
if (item == null)
return;
}
if (processed.has(item)) {
if (verbose > 0)
console.log('processed', item);
return;
}
processed.add(item);
// Size filters: reject elements that are too tall, too short or too large
// in area.
if (item.scrollHeight > 1000) {
if (verbose > 0)
console.log('too tall', item);
return;
}
// Reject anything nested in, or wrapping, an already extracted item.
if (hasOverlap(item, extracted_items)) {
if (verbose > 0)
console.log('overlap', item);
return;
}
// scrollHeight could be 0 while getBoundingClientRect().height > 0.
const bounding_rect = item.getBoundingClientRect();
if (bounding_rect.height < 50) {
if (verbose > 0)
console.log('too short', item);
return;
}
// Area check relative to the viewport width (logged as 'too tall' as well).
if (bounding_rect.height * bounding_rect.width > 800 * window.innerWidth) {
if (verbose > 0)
console.log('too tall', item);
return;
}
// Content filters: the element must contain an image-like element and text
// that looks like a price.
if (item.querySelectorAll('img, amp-img, .bg-img').length == 0) {
if (verbose > 0)
console.log('no image', item);
return;
}
if (!item.textContent.match(priceRegex)) {
if (verbose > 0)
console.log('no price', item);
return;
}
// Site-specific: elements at the very top of the viewport on these two
// sites are treated as the page header.
if (bounding_rect.top <= 10 &&
(document.URL.includes('partycity.com') ||
document.URL.includes('chewy.com'))) {
if (verbose > 0)
console.log('likely cart page header', item);
return;
}
if (isInSavedForLater(item, savedForLaterSection)) {
if (verbose > 0)
console.log('in save for later', item);
return;
}
if (!isCartItem(item)) {
if (verbose > 0)
console.log('not cart item', item);
return;
}
if (verbose > 0)
console.log('try extracting', item);
const extraction = extractItem(item);
if (extraction != null) {
output.set(item, extraction);
extracted_items.push(item);
}
}
/**
 * Decides whether |item| belongs to the "saved for later" part of the page:
 * it must be positioned below the top of |savedForLaterSection| and its text
 * must not contain "save for later".
 *
 * @param {Element} item Candidate item element.
 * @param {?Element} savedForLaterSection Section heading element, or null
 *     when the page has no such section.
 * @return {boolean}
 */
function isInSavedForLater(item, savedForLaterSection) {
  if (savedForLaterSection === null) return false;
  const sectionTop = savedForLaterSection.getBoundingClientRect().top;
  const itemTop = item.getBoundingClientRect().top;
  if (!(sectionTop < itemTop)) return false;
  return !item.textContent.toLowerCase().match(saveForLaterRegex);
}
// Locates the element that introduces the "saved for later" part of the page.
//
// Returns the last element (in document order) that has non-zero offsetHeight
// and offsetWidth and whose text() contains one of the phrases below, or
// null when there is none.
function getSavedForLaterSection() {
// This regex should match the XPath pattern below.
// It is a cheap pre-check: the XPath query only runs when the page text
// contains one of the phrases at all.
const shortCutRegex = new RegExp(
'(your saved items)|(saved for later)|(my saved items)|(wishlist items)',
'i');
if (!document.body.innerText.match(shortCutRegex))
return null;
// translate() maps upper-case to lower-case letters so that contains() is
// effectively case-insensitive.
const nodes = document.evaluate(
"//*[contains(translate(" +
"text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'), " +
"'your saved items')" +
"or contains(translate(" +
"text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), " +
"'saved for later')" +
"or contains(translate(" +
"text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), " +
"'my saved items')" +
"or contains(translate(" +
"text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), " +
"'wishlist items')]", document,
null, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
let node = nodes.iterateNext();
let section = null;
// Keep overwriting |section| so that the last node with a non-zero size
// wins.
while (node) {
if (node!= null && node.offsetHeight >= 1 && node.offsetWidth >= 1) {
section = node;
}
node = nodes.iterateNext();
}
return section
}
/**
 * Reads the externally supplied |isImprovementEnabled| flag.
 *
 * @return {boolean} The flag's value when it is defined and a boolean; false
 *     in every other case.
 */
function isHeuristicsImprovementEnabled() {
  return typeof isImprovementEnabled === 'boolean' ? isImprovementEnabled :
                                                     false;
}
/**
 * Array.prototype.sort() comparator that orders nodes by document position.
 *
 * @param {Node} a
 * @param {Node} b
 * @return {number} -1 when |b| follows or is contained by |a|, 1 when |b|
 *     precedes or contains |a|, 0 when they are the same node or no such
 *     relation is reported.
 */
function documentPositionComparator(a, b) {
  if (a === b)
    return 0;
  const relation = a.compareDocumentPosition(b);
  const bComesAfter =
      Node.DOCUMENT_POSITION_FOLLOWING | Node.DOCUMENT_POSITION_CONTAINED_BY;
  const bComesBefore =
      Node.DOCUMENT_POSITION_PRECEDING | Node.DOCUMENT_POSITION_CONTAINS;
  // The "after" bits are tested first, so they win if both kinds are set.
  if (relation & bComesAfter)
    return -1;
  if (relation & bComesBefore)
    return 1;
  return 0;
}
// Drops products whose URL already appeared earlier in |output|, keeping the
// first occurrence. Only applied on sourcebmx.com; on every other site
// |output| is returned untouched.
function deduplicateResults(output) {
  if (!document.URL.includes("sourcebmx.com")) return output;
  const seenUrls = new Set();
  return output.filter((product) => {
    const productUrl = product["url"];
    if (seenUrls.has(productUrl)) {
      return false;
    }
    seenUrls.add(productUrl);
    return true;
  });
}
// Cooperative throttle: callers invoke maybeSleep() between chunks of work,
// and it yields the main thread long enough to keep the fraction of time
// spent working near the configured duty cycle.
// Only defined when no Sleeper exists yet.
if (typeof Sleeper === 'undefined') {
  var Sleeper = class {
    constructor() {
      // 99.9th percentile of the individual task execution times should be
      // < 50ms.
      // The task time is defined as exclusive CPU usage, from last time
      // sleeping is done to the beginning of the next sleep.
      // Overridable through kSleeperMinTaskTimeMs; defaults to 10 ms.
      this.min_task_time = typeof kSleeperMinTaskTimeMs !== 'undefined' ?
          kSleeperMinTaskTimeMs :
          10;
      // Avoid monopolizing JavaScript main thread execution time.
      // Overridable through kSleeperDutyCycle; clamped to [0.01, 1].
      const requestedDutyCycle =
          typeof kSleeperDutyCycle !== 'undefined' ? kSleeperDutyCycle : 0.05;
      this.duty_cycle = Math.max(0.01, Math.min(requestedDutyCycle, 1));
      this.last_sleep = performance.now();
      this.start = performance.now();
      this.longest_task = 0;
      this.total_tasks_time = 0;
    }

    // Sleeps if more than min_task_time has passed since the last sleep;
    // otherwise returns without yielding.
    async maybeSleep() {
      const sinceLastSleep = performance.now() - this.last_sleep;
      if (sinceLastSleep <= this.min_task_time)
        return;
      this.longest_task = Math.max(this.longest_task, sinceLastSleep);
      this.total_tasks_time += sinceLastSleep;
      if (verbose > 1) {
        console.log('longest task', this.longest_task);
      }
      // Calculate the delay aiming for the target duty cycle.
      // duty_cycle = (working time) / (working time + sleeping time)
      //            = elapsed / (elapsed + delay)
      const delay = sinceLastSleep * (1 - this.duty_cycle) / this.duty_cycle;
      await new Promise((resolve) => setTimeout(resolve, delay));
      this.last_sleep = performance.now();
    }

    // Longest task so far, counting the one currently in progress.
    get longestTask() {
      const inProgress = performance.now() - this.last_sleep;
      return Math.max(this.longest_task, inProgress);
    }

    // Total working time so far, counting the task currently in progress.
    get totalTasksTime() {
      const inProgress = performance.now() - this.last_sleep;
      return this.total_tasks_time + inProgress;
    }

    // Wall-clock time since construction.
    get elapsed() {
      return performance.now() - this.start;
    }
  };
}
// Extracts every cart product found under |root|.
//
// Returns (asynchronously):
// - [] when |root| is null and the document has finished loading,
// - false when |root| is null and the document is still loading,
// - otherwise an object with the extracted 'products' in document order plus
//   timing statistics ('longest_task_ms', 'total_tasks_ms', 'elapsed_ms') and
//   a 'timedout' flag telling whether extraction stopped early.
//
// The working-time budget is kTimeoutMs when that name is defined, else
// 250 ms.
async function extractAllItems(root) {
let timeout = 250;
if (typeof kTimeoutMs !== 'undefined') {
timeout = kTimeoutMs;
}
let items = [];
const sleeper = new Sleeper();
// Root element being null could be due to the
// fact that the cart is emptied, or the cart
// element has not been loaded yet.
if (root == null) {
if (document.readyState == 'complete') {
return [];
} else {
return false;
}
}
// Sites with a known item selector skip the generic filters entirely.
let skipFiltering = true;
if (document.URL.includes("kiehls.com")
|| document.URL.includes("laroche-posay.us")) {
items = root.querySelectorAll(".c-product-table__row");
} else if (document.URL.includes("americastire.com")
|| document.URL.includes("discounttire.com")) {
items = root.querySelectorAll("[role=\"listitem\"]");
} else if (document.URL.includes("discounttiredirect.com")) {
items = root.querySelectorAll(".cart-item");
} else if (document.URL.includes("shutterfly.com")){
items = root.querySelectorAll(".cartitem");
} else {
skipFiltering = false;
// Generic pattern
// Candidates are every anchor plus, for each distinct href, the common
// ancestor of all anchors sharing that href.
const candidates = new Set();
items = root.querySelectorAll('a');
const urlMap = new Map();
for (const item of items) {
if (!urlMap.has(item.href)) {
urlMap.set(item.href, new Set());
}
urlMap.get(item.href).add(item);
}
for (const [key, value] of urlMap) {
const ancestor = commonAncestorList(Array.from(value));
if (!candidates.has(ancestor))
candidates.add(ancestor);
}
for (const item of items) {
candidates.add(item);
}
// Expand every candidate to itself and its ancestors up to 7 levels up,
// visiting all candidates at depth 0 first, then depth 1, and so on.
const ancestors = new Set();
// TODO: optimize this part.
for (let depth = 0; depth < 8; depth++) {
for (let item of candidates) {
for (let i = 0; i < depth; i++) {
item = item.parentElement;
if (!item)
break;
}
if (item)
ancestors.add(item);
}
}
items = Array.from(ancestors);
}
await sleeper.maybeSleep();
if (verbose > 0)
console.log(items);
const outputMap = new Map();
const processed = new Set();
const extracted_items = [];
// The saved-for-later lookup only runs when the improvement flag is on.
let savedForLaterSection = null;
if (isHeuristicsImprovementEnabled()) {
savedForLaterSection = getSavedForLaterSection();
if (verbose > 0)
console.log(savedForLaterSection);
await sleeper.maybeSleep();
}
let i = 0;
let early_abort = false;
for (const item of items) {
extractOneItem(item, extracted_items, processed, outputMap,
savedForLaterSection, skipFiltering);
// Checking for every item is too slow.
// So yield, and test the time budget, only on every 10th item.
if (i++ % 10 == 0) {
await sleeper.maybeSleep();
if (sleeper.totalTasksTime > timeout) {
if (verbose > 0) {
console.log('aborted due to timeout');
}
early_abort = true;
break;
}
}
}
// Items were visited in candidate order; report them in document order.
const keysInDocOrder =
Array.from(outputMap.keys()).sort(documentPositionComparator);
const output = [];
for (const key of keysInDocOrder) {
output.push(outputMap.get(key));
}
await sleeper.maybeSleep();
return {
'products': deduplicateResults(output),
'longest_task_ms': sleeper.longestTask,
'total_tasks_ms': sleeper.totalTasksTime,
'elapsed_ms': sleeper.elapsed,
'timedout': early_abort,
};
}
// Start extraction over the whole document as soon as the script is
// evaluated and expose the pending result through an undeclared (global)
// name.
// NOTE(review): presumably the embedder reads `extracted_results_promise`
// after injecting this script - confirm before declaring or renaming it.
extracted_results_promise = extractAllItems(document);