1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
9 ChromeUtils.defineESModuleGetters(lazy, {
10 clearTimeout: "resource://gre/modules/Timer.sys.mjs",
11 setTimeout: "resource://gre/modules/Timer.sys.mjs",
14 XPCOMUtils.defineLazyPreferenceGetter(
16 "serpEventTelemetryCategorization",
17 "browser.search.serpEventTelemetryCategorization.enabled",
/**
 * Settings for domain categorization.
 *
 * NOTE(review): MAX_DOMAINS_TO_CATEGORIZE is presumably the cap enforced by
 * DomainExtractor's threshold check; that helper is outside this view — confirm.
 */
export const CATEGORIZATION_SETTINGS = {
  MAX_DOMAINS_TO_CATEGORIZE: 10,
};
// Duplicated from SearchSERPTelemetry to avoid loading the module on content
// startup. PROVIDER_INFO is the sharedData key that SearchProviders reads and
// watches for changes.
const SEARCH_TELEMETRY_SHARED = {
  PROVIDER_INFO: "SearchTelemetry:ProviderInfo",
  LOAD_TIMEOUT: "SearchTelemetry:LoadTimeout",
  SPA_LOAD_TIMEOUT: "SearchTelemetry:SPALoadTimeout",
};
34 * Standard events mapped to the telemetry action.
// Looked up by ListenerHelper.addListener as EVENT_TYPE_TO_ACTION[eventType]
// when an event listener definition carries no explicit `action`.
// NOTE(review): the entries of this map are not visible in this listing.
36 const EVENT_TYPE_TO_ACTION = {
/**
 * A map of object conditions mapped to the condition that should be run when
 * an event is triggered. The condition name is referenced in Remote Settings
 * under the optional `condition` string for an event listener.
 *
 * Each condition receives the DOM event and returns whether the engagement
 * should be reported.
 */
const CONDITIONS = {
  // Only report keydown events produced by the Enter key.
  keydownEnter: event => event.key == "Enter",
};
// Fraction (0.5 = 50%) used as a visibility threshold.
// NOTE(review): presumably the `threshold` argument passed to VisibilityHelper;
// the call sites' argument lists are not visible here — confirm.
export const VISIBILITY_THRESHOLD = 0.5;
/**
 * SearchProviders looks after keeping track of the search provider information
 * received from the main process.
 *
 * It is separate to SearchTelemetryChild so that it is not constructed for each
 * tab, but once per process.
 */
class SearchProviders {
  constructor() {
    // Lazily populated by the `info` getter; null means "not loaded yet" or
    // "invalidated by a sharedData change".
    this._searchProviderInfo = null;
    // The instance itself is the listener object, so notifications arrive in
    // handleEvent().
    Services.cpmm.sharedData.addEventListener("change", this);
  }

  /**
   * Gets the search provider information for any provider with advert information.
   * If there is nothing in the cache, it will obtain it from shared data.
   *
   * @returns {object} Returns the search provider information, or null when
   *   shared data holds nothing under the provider-info key.
   * @see SearchTelemetry.sys.mjs
   */
  get info() {
    if (this._searchProviderInfo) {
      return this._searchProviderInfo;
    }

    this._searchProviderInfo = Services.cpmm.sharedData.get(
      SEARCH_TELEMETRY_SHARED.PROVIDER_INFO
    );

    if (!this._searchProviderInfo) {
      return null;
    }

    this._searchProviderInfo = this._searchProviderInfo
      // Filter-out non-ad providers so that we're not trying to match against
      // those unnecessarily.
      .filter(p => "extraAdServersRegexps" in p)
      // Pre-build the regular expressions.
      .map(p => {
        p.adServerAttributes = p.adServerAttributes ?? [];
        if (p.shoppingTab?.inspectRegexpInSERP) {
          p.shoppingTab.regexp = new RegExp(p.shoppingTab.regexp);
        }
        return {
          ...p,
          searchPageRegexp: new RegExp(p.searchPageRegexp),
          extraAdServersRegexps: p.extraAdServersRegexps.map(
            r => new RegExp(r)
          ),
        };
      });

    return this._searchProviderInfo;
  }

  /**
   * Handles events received from sharedData notifications.
   *
   * @param {object} event The event details.
   */
  handleEvent(event) {
    switch (event.type) {
      case "change": {
        if (event.changedKeys.includes(SEARCH_TELEMETRY_SHARED.PROVIDER_INFO)) {
          // Just null out the provider information for now, we'll fetch it next
          // time we need it.
          this._searchProviderInfo = null;
        }
        break;
      }
    }
  }
}
126 * @typedef {object} EventListenerParam
127 * @property {string} eventType
128 * The type of event the listener should listen for. If the event type is
129 * non-standard, it should correspond to a definition in
130 * CUSTOM_EVENT_TYPE_TO_DATA that will re-map it to a standard type. TODO
131 * @property {string} target
132 * The type of component that was the source of the event.
133 * @property {string | null} action
134 * The action that should be reported in telemetry.
/**
 * Provides a way to add listeners to elements, as well as unload them.
 */
class ListenerHelper {
  /**
   * Adds each event listener in an array of event listeners to each element
   * in an array of elements, and sets their unloading.
   *
   * Nothing is registered when either list is empty or when no callback is
   * stored for the elements' document in documentToEventCallbackMap.
   *
   * @param {Array<Element>} elements
   *  DOM elements to add event listeners to.
   * @param {Array<EventListenerParam>} eventListenerParams
   *  The type of event to add the listener to.
   * @param {string} target
   *  The component type to report, unless a listener definition overrides it.
   */
  static addListeners(elements, eventListenerParams, target) {
    if (!elements?.length || !eventListenerParams?.length) {
      return;
    }

    let document = elements[0].ownerGlobal.document;
    let callback = documentToEventCallbackMap.get(document);
    if (!callback) {
      return;
    }

    // The map might have entries from previous callers, so we must ensure
    // we don't discard existing event listener callbacks.
    let removeListenerCallbacks = [];
    if (documentToRemoveEventListenersMap.has(document)) {
      removeListenerCallbacks = documentToRemoveEventListenersMap.get(document);
    }

    for (let params of eventListenerParams) {
      let removeListeners = ListenerHelper.addListener(
        elements,
        params,
        target,
        callback
      );
      removeListenerCallbacks = removeListenerCallbacks.concat(removeListeners);
    }

    documentToRemoveEventListenersMap.set(document, removeListenerCallbacks);
  }

  /**
   * Add an event listener to each element in an array of elements.
   *
   * @param {Array<Element>} elements
   *  DOM elements to add event listeners to.
   * @param {EventListenerParam} eventListenerParam
   *  The listener definition. Its optional `target` overrides the `target`
   *  argument; its optional `action` overrides the standard action for the
   *  event type; its optional `condition` names an entry in CONDITIONS.
   * @param {string} target
   *  The component type to report when the definition has no own target.
   * @param {Function} callback
   *  Called with `{ action, target }` when the event should be reported.
   * @returns {Array<function>} Array of remove event listener functions.
   *  Empty when no action or no known condition could be resolved.
   */
  static addListener(elements, eventListenerParam, target, callback) {
    let { action, eventType, target: customTarget } = eventListenerParam;

    if (customTarget) {
      target = customTarget;
    }

    if (!action) {
      // Only own entries count: a plain `EVENT_TYPE_TO_ACTION[eventType]`
      // would also resolve inherited members such as "toString".
      action = Object.hasOwn(EVENT_TYPE_TO_ACTION, eventType)
        ? EVENT_TYPE_TO_ACTION[eventType]
        : undefined;
      if (!action) {
        return [];
      }
    }

    // Some events might have specific conditions we want to check before
    // registering an engagement event.
    let eventCallback;
    if (eventListenerParam.condition) {
      // Own-property check for the same reason as above: a condition named
      // e.g. "constructor" must be treated as unknown, not as always-true.
      if (Object.hasOwn(CONDITIONS, eventListenerParam.condition)) {
        let condition = CONDITIONS[eventListenerParam.condition];
        eventCallback = async event => {
          let start = Cu.now();
          if (condition(event)) {
            callback({ action, target });
          }
          ChromeUtils.addProfilerMarker(
            "SearchSERPTelemetryChild._eventCallback",
            start,
            "Call cached function before callback."
          );
        };
      } else {
        // If a component included a condition, but it wasn't found it is
        // due to the fact that it was added in a more recent Firefox version
        // than what is provided via search-telemetry-v2. Since the version of
        // Firefox the user is using doesn't include this condition,
        // we shouldn't add the event.
        return [];
      }
    } else {
      eventCallback = () => {
        callback({ action, target });
      };
    }

    let removeListenerCallbacks = [];
    for (let element of elements) {
      element.addEventListener(eventType, eventCallback);
      removeListenerCallbacks.push(() => {
        element.removeEventListener(eventType, eventCallback);
      });
    }
    return removeListenerCallbacks;
  }
}
249 * Scans SERPs for ad components.
251 class SearchAdImpression {
253 * A reference to ad component information that is used if an anchor
254 * element could not be categorized to a specific ad component.
// Assigned by the providerInfo setter from the component flagged `default`;
// #findDataForAnchor reads its `type` as the fallback.
258 #defaultComponent = null;
261 * Maps DOM elements to AdData.
263 * @type {Map<Element, AdData>}
267 * @property {string} type
268 * The type of ad component.
269 * @property {number} adsLoaded
270 * The number of ads counted as loaded for the component.
271 * @property {boolean} countChildren
272 * Whether all the children were counted for the component.
// Written by #recordElementData and cleared at the end of categorize().
274 #elementToAdDataMap = new Map();
277 * An array of components to do a top-down search.
// Rebuilt by the providerInfo setter from components flagged `topDown`;
// iterated by #categorizeDocument.
279 #topDownComponents = [];
282 * A reference to the providerInfo for this SERP.
// Provider configuration for the current SERP; null until the setter runs.
286 #providerInfo = null;
// Stores the new provider configuration and rebuilds the component lookups
// derived from it.
288 set providerInfo(providerInfo) {
// Providers are compared by telemetryId. NOTE(review): the body of this check
// is not visible in this listing — presumably an early return; confirm.
289 if (this.#providerInfo?.telemetryId == providerInfo.telemetryId) {
293 this.#providerInfo = providerInfo;
// Only #topDownComponents is cleared here; #defaultComponent is overwritten
// below when a component is flagged `default`.
296 this.#topDownComponents = [];
298 for (let component of this.#providerInfo.components) {
299 if (component.default) {
300 this.#defaultComponent = component;
303 if (component.topDown) {
304 this.#topDownComponents.push(component);
310 * Check if the page has a shopping tab.
312 * @param {Document} document
314 * Whether the page has a shopping tab. Defaults to false.
// Looks for the provider's shopping tab in `document`; a matching element is
// handed to #recordElementData with type "shopping_tab".
316 hasShoppingTab(document) {
// No provider info, or a provider without a shoppingTab entry: nothing to find.
317 if (!this.#providerInfo?.shoppingTab) {
321 // If a provider has the inspectRegexpInSERP, we assume there must be an
322 // associated regexp that must be used on any hrefs matched by the elements
323 // found using the selector. If inspectRegexpInSERP is false, then check if
324 // the number of items found using the selector matches exactly one element
325 // to ensure we've used a fine-grained search.
326 let elements = document.querySelectorAll(
327 this.#providerInfo.shoppingTab.selector
329 if (this.#providerInfo.shoppingTab.inspectRegexpInSERP) {
// Used via .test() below, so this must already be a RegExp; SearchProviders
// compiles shoppingTab.regexp when inspectRegexpInSERP is set.
330 let regexp = this.#providerInfo.shoppingTab.regexp;
331 for (let element of elements) {
// The raw attribute is tested, so relative hrefs are matched as written.
332 let href = element.getAttribute("href");
333 if (href && regexp.test(href)) {
334 this.#recordElementData(element, {
335 type: "shopping_tab",
341 } else if (elements.length == 1) {
342 this.#recordElementData(elements[0], {
343 type: "shopping_tab",
352 * Examine the list of anchors and the document object and find components
355 * With the list of anchors, go through each and find the component it
356 * belongs to and save it in elementToAdDataMap.
358 * Then, with the document object find components and save the results to
359 * elementToAdDataMap.
361 * Lastly, combine the results together in a new Map that contains the number
362 * of loaded, visible, and blocked results for the component.
364 * @param {HTMLCollectionOf<HTMLAnchorElement>} anchors
365 * @param {Document} document
367 * @returns {{componentToVisibilityMap: Map<string, object>, hrefToComponentMap: Map<string, string>}}
368 * An object with two maps: componentToVisibilityMap, keyed by ad component
369 * type with values holding adsLoaded, adsVisible and adsHidden for that
370 * component; and hrefToComponentMap, keyed by href with the component type as value.
// Runs the bottom-up and top-down scans, then folds #elementToAdDataMap into
// the two maps returned together as one object at the end.
372 categorize(anchors, document) {
373 // Used for various functions to make relative URLs absolute.
374 let origin = new URL(document.documentURI).origin;
376 // Bottom up approach.
377 this.#categorizeAnchors(anchors, origin);
379 // Top down approach.
380 this.#categorizeDocument(document);
382 let componentToVisibilityMap = new Map();
383 let hrefToComponentMap = new Map();
// Viewport metrics are read once, before the loop. NOTE(review): presumably
// forwarded to #countVisibleAndHiddenAds; its argument list is not visible in
// this listing — confirm.
385 let innerWindowHeight = document.ownerGlobal.innerHeight;
386 let scrollY = document.ownerGlobal.scrollY;
388 // Iterate over the results:
389 // - If it's searchbox add event listeners.
390 // - If it is a non_ads_link, map its href to component type.
391 // - For others, map its component type and check visibility.
392 for (let [element, data] of this.#elementToAdDataMap.entries()) {
393 if (data.type == "incontent_searchbox") {
394 // Bug 1880413: Deprecate hard coding the incontent search box.
395 // If searchbox has child elements, observe those, otherwise
396 // fallback to its parent element.
397 let searchElements = data.childElements.length
400 ListenerHelper.addListeners(
403 { eventType: "click", target: data.type },
405 eventType: "keydown",
// Resolved by ListenerHelper.addListener against the CONDITIONS map.
408 condition: "keydownEnter",
// Every child href (or the element's own href when it has no children) is
// mapped to the component type.
415 if (data.childElements.length) {
416 for (let child of data.childElements) {
417 let href = this.#extractHref(child, origin);
419 hrefToComponentMap.set(href, data.type);
423 let href = this.#extractHref(element, origin);
425 hrefToComponentMap.set(href, data.type);
429 // If the component is a non_ads_link, skip visibility checks.
430 if (data.type == "non_ads_link") {
434 // If proxy children were found, check the visibility of all of them
435 // otherwise just check the visibility of the first child.
437 if (data.proxyChildElements.length) {
438 childElements = data.proxyChildElements;
439 } else if (data.childElements.length) {
440 childElements = [data.childElements[0]];
443 let count = this.#countVisibleAndHiddenAds(
// Several elements can share a component type, so counts are summed per type.
450 if (componentToVisibilityMap.has(data.type)) {
451 let componentInfo = componentToVisibilityMap.get(data.type);
452 componentInfo.adsLoaded += data.adsLoaded;
453 componentInfo.adsVisible += count.adsVisible;
454 componentInfo.adsHidden += count.adsHidden;
456 componentToVisibilityMap.set(data.type, {
457 adsLoaded: data.adsLoaded,
458 adsVisible: count.adsVisible,
459 adsHidden: count.adsHidden,
464 // Release the DOM elements from the Map.
465 this.#elementToAdDataMap.clear();
467 return { componentToVisibilityMap, hrefToComponentMap };
471 * Given an element, find the href that is most likely to make the request if
472 * the element is clicked. If the element contains a specific data attribute
473 * known to contain the url used to make the initial request, use it,
474 * otherwise use its href. Specific character conversions are done to mimic
475 * conversions likely to take place when urls are observed in network
478 * @param {Element} element
479 * The element to inspect.
480 * @param {string} origin
481 * The origin for relative urls.
483 * The href of the element.
// Picks the URL most likely to be requested when `element` is clicked and
// normalizes it to the form seen in network observations.
485 #extractHref(element, origin) {
487 // Prioritize the href from a known data attribute value instead of
488 // its href property, as the former is the initial url the page will
489 // navigate to before being re-directed to the href.
490 for (let name of this.#providerInfo.adServerAttributes) {
// A data attribute only wins when its value matches an ad-server regexp.
492 element.dataset[name] &&
493 this.#providerInfo.extraAdServersRegexps.some(regexp =>
494 regexp.test(element.dataset[name])
497 href = element.dataset[name];
501 // If a data attribute value was not found, fallback to the href.
502 href = href ?? element.getAttribute("href");
507 // Avoid extracting or fixing up Javascript URLs.
// NOTE(review): this prefix test has no ":" so a relative href that merely
// begins with "javascript" would also match — confirm that is acceptable.
508 if (href.startsWith("javascript")) {
512 // Hrefs can be relative.
// Plain concatenation: assumes a relative href begins with "/" — TODO confirm.
513 if (!href.startsWith("https://") && !href.startsWith("http://")) {
514 href = origin + href;
516 // Per Bug 376844, apostrophes in query params are escaped, and thus, are
517 // percent-encoded by the time they are observed in the network. Even
518 // though it's more comprehensive, we avoid using newURI because it's more
519 // expensive and conversions should be the exception.
520 // e.g. /path'?q=Mozilla's -> /path'?q=Mozilla%27s
// Only hrefs containing exactly one "?" are rewritten (arr.length == 2).
521 let arr = href.split("?");
522 if (arr.length == 2 && arr[1].includes("'")) {
523 href = arr[0] + "?" + arr[1].replaceAll("'", "%27");
529 * Given a list of anchor elements, group them into ad components.
531 * The first step in the process is to check if the anchor should be
532 * inspected. This is based on whether it contains an href or a
533 * data-attribute values that matches an ad link, or if it contains a
534 * pattern caught by a components included regular expression.
536 * Determine which component it belongs to and the number of matches for
537 * the component. The heuristic is described in findDataForAnchor.
538 * If there was a result and we haven't seen it before, save it in
539 * elementToAdDataMap.
541 * @param {HTMLCollectionOf<HTMLAnchorElement>} anchors
542 * The list of anchors to inspect.
543 * @param {string} origin
544 * The origin of the document the anchors belong to.
// Bottom-up pass: each anchor that looks like an ad link is resolved to its
// component via #findDataForAnchor and recorded under the component's root
// element.
546 #categorizeAnchors(anchors, origin) {
547 for (let anchor of anchors) {
548 if (this.#shouldInspectAnchor(anchor, origin)) {
549 let result = this.#findDataForAnchor(anchor);
// Recorded under result.element, the component root chosen by
// #findDataForAnchor, not under the anchor itself.
551 this.#recordElementData(result.element, {
554 proxyChildElements: result.proxyChildElements,
555 childElements: result.childElements,
558 if (result.relatedElements?.length) {
559 // Bug 1880413: Deprecate related elements.
560 // Bottom-up approach with related elements are only used for
561 // non-link elements related to ads, like carousel arrows.
562 ListenerHelper.addListeners(
563 result.relatedElements,
578 * Find components from the document object. This is mostly relevant for
579 * components that are non-ads and don't have an obvious regular expression
580 * that could match the pattern of the href.
582 * @param {Document} document
// Top-down pass: for each top-down component, query the document for its
// parent selector, attach configured listeners, and record matched elements.
584 #categorizeDocument(document) {
585 // using the subset of components that are top down,
586 // go through each one.
587 for (let component of this.#topDownComponents) {
588 // Top-down searches must have the topDown attribute.
// Defensive: the providerInfo setter only pushes components with `topDown`
// set into #topDownComponents.
589 if (!component.topDown) {
592 // Top down searches must include a parent.
593 if (!component.included?.parent) {
596 let parents = document.querySelectorAll(
597 component.included.parent.selector
599 if (parents.length) {
600 let eventListeners = component.included.parent.eventListeners;
601 if (eventListeners?.length) {
602 ListenerHelper.addListeners(parents, eventListeners, component.type);
604 for (let parent of parents) {
605 // Bug 1880413: Deprecate related elements.
606 // Top-down related elements are either used for auto-suggested
607 // elements of a searchbox, or elements on a page which we can't
608 // find through a bottom up approach but we want to add a listener,
609 // like carousels with arrows.
610 if (component.included.related?.selector) {
611 let relatedElements = parent.querySelectorAll(
612 component.included.related.selector
614 if (relatedElements.length) {
615 // For the search box, related elements with event listeners are
616 // auto-suggested terms. For everything else (e.g. carousels)
617 // they are expanded.
618 ListenerHelper.addListeners(
623 component.type == "incontent_searchbox"
633 if (component.included.children) {
634 for (let child of component.included.children) {
635 let childElements = parent.querySelectorAll(child.selector);
636 if (childElements.length) {
637 if (child.eventListeners) {
// querySelectorAll returns a NodeList; convert it to a real array before
// passing it on.
638 childElements = Array.from(childElements);
639 ListenerHelper.addListeners(
641 child.eventListeners,
// A child may report under its own type; otherwise the component's type is used.
642 child.type ?? component.type
// skipCount opts a child (or, below, the parent) out of being recorded.
645 if (!child.skipCount) {
646 this.#recordElementData(parent, {
647 type: component.type,
648 childElements: Array.from(childElements),
653 } else if (!component.included.parent.skipCount) {
654 this.#recordElementData(parent, {
655 type: component.type,
664 * Evaluates whether an anchor should be inspected based on matching
665 * regular expressions on either its href or specified data-attribute values.
667 * @param {HTMLAnchorElement} anchor
668 * @param {string} origin
// Decides whether `anchor` is worth categorizing: either one of the provider's
// ad-server data attributes or its (absolutized) href must match an
// ad-server regexp.
671 #shouldInspectAnchor(anchor, origin) {
672 let href = anchor.getAttribute("href");
677 // Some hrefs might be relative.
// Same absolutizing rule as in #extractHref: prefix the origin unless the
// href already starts with an http(s) scheme.
678 if (!href.startsWith("https://") && !href.startsWith("http://")) {
679 href = origin + href;
682 let regexps = this.#providerInfo.extraAdServersRegexps;
683 // Anchors can contain ad links in a data-attribute.
684 for (let name of this.#providerInfo.adServerAttributes) {
685 let attributeValue = anchor.dataset[name];
688 regexps.some(regexp => regexp.test(attributeValue))
693 // Anchors can contain ad links in a specific href.
694 if (regexps.some(regexp => regexp.test(href))) {
701 * Find the component data for an anchor.
703 * To categorize the anchor, we iterate over the list of possible components
704 * the anchor could be categorized. If the component is default, we skip
705 * checking because the fallback option for all anchor links is the default.
707 * First, get the "parent" of the anchor which best represents the DOM element
708 * that contains the anchor links for the component and no other component.
709 * This parent will be cached so that other anchors that share the same
710 * parent can be counted together.
712 * The check for a parent is a loop because we can define more than one best
713 * parent since on certain SERPs, it's possible for a "better" DOM element
714 * parent to appear occasionally.
716 * If no parent is found, skip this component.
718 * If a parent was found, check for specific child elements.
720 * Finding child DOM elements of a parent is optional. One reason to do so is
721 * to use child elements instead of anchor links to count the number of ads for
722 * a component via the `countChildren` property. This is provided because some ads
723 * (i.e. carousels) have multiple ad links in a single child element that go to the
724 * same location. In this scenario, all instances of the child are recorded as ads.
725 * Subsequent anchor elements that map to the same parent are ignored.
727 * Whether or not a child was found, return the information that was found,
728 * including whether or not all child elements were counted instead of anchors.
730 * If another anchor belonging to a parent that was previously recorded is the input
731 * for this function, we either increment the ad count by 1 or don't increment the ad
732 * count because the parent used `countChildren` completed the calculation in a
736 * @param {HTMLAnchorElement} anchor
737 * The anchor to be inspected.
739 * An object containing the element representing the root DOM element for
740 * the component, the type of component, how many ads were counted,
741 * and whether or not the count was of all the children.
// Walks the provider's components in order and returns data for the first one
// whose parent selector encloses `anchor`; falls back to the default
// component's type when none matches.
743 #findDataForAnchor(anchor) {
744 for (let component of this.#providerInfo.components) {
745 // First, check various conditions for skipping a component.
747 // A component should always have at least one included statement.
748 if (!component.included) {
752 // Top down searches are done after the bottom up search.
753 if (component.topDown) {
757 // The default component doesn't need to be checked,
758 // as it will be the fallback option.
759 if (component.default) {
763 // The anchor shouldn't belong to an excluded parent component if one
766 component.excluded?.parent?.selector &&
767 anchor.closest(component.excluded.parent.selector)
772 // All components with included should have a parent entry.
773 if (!component.included.parent) {
777 // Find the parent of the anchor.
// closest() walks up from the anchor, so this is the nearest matching ancestor
// (or the anchor itself).
778 let parent = anchor.closest(component.included.parent.selector);
784 // If we've already inspected the parent, add the child element to the
785 // list of anchors. Don't increment the ads loaded count, as we only care
786 // about grouping the anchor with the correct parent.
787 if (this.#elementToAdDataMap.has(parent)) {
790 childElements: [anchor],
794 let relatedElements = [];
795 if (component.included.related?.selector) {
796 relatedElements = parent.querySelectorAll(
797 component.included.related.selector
801 // If the component has no defined children, return the parent element.
802 if (component.included.children) {
803 // Look for the first instance of a matching child selector.
804 for (let child of component.included.children) {
805 // If counting by child, get all of them at once.
806 if (child.countChildren) {
807 let proxyChildElements = parent.querySelectorAll(child.selector);
808 if (proxyChildElements.length) {
811 type: child.type ?? component.type,
812 proxyChildElements: Array.from(proxyChildElements),
// With countChildren, the ad count is the number of matched child elements.
813 count: proxyChildElements.length,
814 childElements: [anchor],
818 } else if (parent.querySelector(child.selector)) {
821 type: child.type ?? component.type,
822 childElements: [anchor],
828 // If no children were defined for this component, or none were found
829 // in the DOM, use the default definition.
832 type: component.type,
833 childElements: [anchor],
837 // If no component was found, use default values.
// NOTE(review): #defaultComponent starts as null and is only set when a
// component is flagged `default` — confirm every provider defines one.
840 type: this.#defaultComponent.type,
845 * Determines whether or not an ad was visible or hidden.
847 * An ad is considered visible if the parent element containing the
848 * component has non-zero dimensions, and all child elements in the
849 * component have non-zero dimensions and mostly (50% height) fits within
850 * the window at the time when the impression was taken. If the element is to
851 * the left of the visible area, we also consider it viewed as it's possible
852 * the user interacted with a carousel which typically scrolls new content
855 * For some components, like text ads, we don't send every child
856 * element for visibility, just the first text ad. For other components
857 * like carousels, we send all child elements because we do care about
858 * counting how many elements of the carousel were visible.
860 * @param {Element} element
861 * Element to be inspected
862 * @param {number} adsLoaded
863 * Number of ads initially determined to be loaded for this element.
864 * @param {Array<Element>} childElements
865 * List of children belonging to element.
866 * @param {number} innerWindowHeight
867 * Current height of the window containing the elements.
868 * @param {number} scrollY
869 * Current distance the window has been scrolled.
871 * Contains adsVisible which is the number of ads shown for the element
872 * and adsHidden, the number of ads not visible to the user.
// Classifies the ads of one component as visible or hidden and bumps the
// matching Glean adsBlockedCount counter when a blocked pattern is detected.
874 #countVisibleAndHiddenAds(
// Bounds come from windowUtils.getBoundsWithoutFlushing rather than
// getBoundingClientRect.
882 element.ownerGlobal.windowUtils.getBoundsWithoutFlushing(element);
884 // If the parent element is not visible, assume all ads within are
887 !element.checkVisibility({
888 visibilityProperty: true,
889 opacityProperty: true,
892 Glean.serp.adsBlockedCount.hidden_parent.add();
// Every loaded ad of the component is reported as hidden in this case.
895 adsHidden: adsLoaded,
899 // If an ad is far above the possible visible area of a window, an
900 // adblocker might be doing it as a workaround for blocking the ad.
// "Far above": the bottom edge is negative and its magnitude exceeds
// innerWindowHeight + scrollY.
902 elementRect.bottom < 0 &&
903 innerWindowHeight + scrollY + elementRect.bottom < 0
905 Glean.serp.adsBlockedCount.beyond_viewport.add();
908 adsHidden: adsLoaded,
912 // If the element has no child elements, check if the element
913 // was ever viewed by the user at this moment.
914 if (!childElements?.length) {
915 // Most ads don't require horizontal scrolling to view it. Thus, we only
916 // check if it could've appeared with some vertical scrolling.
917 let visible = VisibilityHelper.elementWasVisibleVertically(
// A childless component counts as at most one visible ad.
923 adsVisible: visible ? 1 : 0,
// From here on each child element is checked individually.
930 for (let child of childElements) {
932 !child.checkVisibility({
933 visibilityProperty: true,
934 opacityProperty: true,
938 Glean.serp.adsBlockedCount.hidden_child.add();
943 child.ownerGlobal.windowUtils.getBoundsWithoutFlushing(child);
944 // If the child element is to the right of the containing element and
945 // can't be viewed, skip it. We do this check because some elements like
946 // carousels can hide additional content horizontally. We don't apply the
947 // same logic if the element is to the left because we assume carousels
948 // scroll elements to the left when the user wants to see more contents.
949 // Thus, the elements to the left must've been visible.
951 !VisibilityHelper.childElementWasVisibleHorizontally(
960 // If the height of child element is not visible, skip it.
962 !VisibilityHelper.elementWasVisibleVertically(
980 * Caches ad data for a DOM element. The key of the map is by Element rather
981 * than Component for fast lookup on whether an Element has already been
982 * categorized as a component. Subsequent calls to this passing the same
983 * element will update the list of child elements.
985 * @param {Element} element
986 * The element considered to be the root for the component.
987 * @param {object} params
988 * Various parameters that can be recorded. Whether the input values exist
989 * or not depends on which component was found, which heuristic should be used
990 * to determine whether an ad was visible, and whether we've already seen this
992 * @param {string | null} params.type
993 * The type of component.
994 * @param {number} params.count
995 * The number of ads found for a component. The number represents either
996 * the number of elements that match an ad expression or the number of DOM
997 * elements containing an ad link.
998 * @param {Array<Element>} params.proxyChildElements
999 * An array of DOM elements that should be inspected for visibility instead
1000 * of the actual child elements, possibly because they are grouped.
1001 * @param {Array<Element>} params.childElements
1002 * An array of DOM elements to inspect.
// Options default to one ad and no (proxy) children when omitted.
1006 { type, count = 1, proxyChildElements = [], childElements = [] } = {}
// Already-recorded element: only the child list grows.
// NOTE(review): type and count of the later call appear to be ignored on this
// path; the remaining lines of the branch are not visible here — confirm.
1008 if (this.#elementToAdDataMap.has(element)) {
1009 let recordedValues = this.#elementToAdDataMap.get(element);
1010 if (childElements.length) {
1011 recordedValues.childElements =
1012 recordedValues.childElements.concat(childElements);
// First sighting of this element: create its entry.
1015 this.#elementToAdDataMap.set(element, {
/**
 * Geometry helpers that decide, from element bounds alone, whether an element
 * could have been seen by the user.
 */
export class VisibilityHelper {
  /**
   * Whether the element was vertically visible. It assumes elements above the
   * viewable area were visible at some point in time.
   *
   * @param {DOMRect} rect
   *   The bounds of the element.
   * @param {number} innerWindowHeight
   *   The height of the window.
   * @param {number} threshold
   *   What percentage of the element should vertically be visible.
   * @returns {boolean}
   *   Whether the element was visible.
   */
  static elementWasVisibleVertically(rect, innerWindowHeight, threshold) {
    // Only the lower window edge is tested: the point `threshold` of the way
    // down the element must not lie below it.
    return rect.top + rect.height * threshold <= innerWindowHeight;
  }

  /**
   * Whether the child element was horizontally visible. It assumes elements to
   * the left were visible at some point in time.
   *
   * @param {DOMRect} parentRect
   *   The bounds of the element that contains the child.
   * @param {DOMRect} childRect
   *   The bounds of the child element.
   * @param {number} threshold
   *   What percentage of the child element should horizontally be visible.
   * @returns {boolean}
   *   Whether the child element was visible.
   */
  static childElementWasVisibleHorizontally(parentRect, childRect, threshold) {
    // Only the parent's right edge is tested, so children left of the parent
    // always pass.
    return (
      childRect.left + childRect.width * threshold <=
      parentRect.left + parentRect.width
    );
  }
}
1065 * An object indicating which elements to examine for domains to extract and
1066 * which heuristic technique to use to extract that element's domain.
1068 * @typedef {object} ExtractorInfo
1069 * @property {string} selectors
1070 * A string representing the CSS selector that targets the elements on the
1071 * page that contain domains we want to extract.
1072 * @property {string} method
1073 * A string representing which domain extraction heuristic to use.
1074 * One of: "href", "dataAttribute" or "textContent".
1075 * @property {object | null} options
1076 * Options related to the domain extraction heuristic used.
1077 * @property {string | null} options.dataAttributeKey
1078 * The key name of the data attribute to lookup.
1079 * @property {string | null} options.queryParamKey
1080 * The key name of the query param value to lookup.
1081 * @property {boolean | null} options.queryParamValueIsHref
1082 * Whether the query param value is expected to contain an href.
1086 * DomainExtractor examines elements on a page to retrieve the domains.
1088 class DomainExtractor {
1090 * Extract domains from the page using an array of information pertaining to
1093 * @param {Document} document
1094 * The document for the SERP we are extracting domains from.
1095 * @param {Array<ExtractorInfo>} extractorInfos
1096 * Information used to target the domains we need to extract.
1097 * @param {string} providerName
1098 * Name of the search provider.
1099 * @return {Set<string>}
1100 * A set of the domains extracted from the page.
// Runs every extractor description against `document` and collects the
// results in one Set, which is returned even when nothing was extracted.
1102 extractDomainsFromDocument(document, extractorInfos, providerName) {
1103 let extractedDomains = new Set();
1104 if (!extractorInfos?.length) {
1105 return extractedDomains;
1108 for (let extractorInfo of extractorInfos) {
// An extractor without selectors cannot target any element.
1109 if (!extractorInfo.selectors) {
1113 let elements = document.querySelectorAll(extractorInfo.selectors);
// Dispatch on the heuristic named by the extractor. All helpers share the same
// result Set (see their documented parameters).
1118 switch (extractorInfo.method) {
1120 // Origin is used in case a URL needs to be made absolute.
1121 let origin = new URL(document.documentURI).origin;
1122 this.#fromElementsConvertHrefsIntoDomains(
// `options` is optional on an extractor, hence the optional chaining.
1127 extractorInfo.options?.queryParamKey,
1128 extractorInfo.options?.queryParamValueIsHref
1132 case "dataAttribute": {
1133 this.#fromElementsRetrieveDataAttributeValues(
1136 extractorInfo.options?.dataAttributeKey,
1141 case "textContent": {
1142 this.#fromElementsRetrieveTextContent(
1152 return extractedDomains;
1156 * Given a list of elements, extract domains using href attributes. If the
1157 * URL in the href includes the specified query param, the domain will be
1158 * that query param's value. Otherwise it will be the hostname of the href
1161 * @param {NodeList<Element>} elements
1162 * A list of elements from the page whose href attributes we want to
1164 * @param {string} origin
1165 * Origin of the current page.
1166 * @param {string} providerName
1167 * The name of the search provider.
1168 * @param {Set<string>} extractedDomains
1169 * The result set of domains extracted from the page.
1170 * @param {string | null} queryParam
1171 * An optional query param to search for in an element's href attribute.
1172 * @param {boolean | null} queryParamValueIsHref
1173 * Whether the query param value is expected to contain an href.
1175 #fromElementsConvertHrefsIntoDomains(
1181 queryParamValueIsHref
1183 for (let element of elements) {
1184 if (this.#exceedsThreshold(extractedDomains.size)) {
1188 let href = element.getAttribute("href");
1192 url = new URL(href, origin);
1197 // Ignore non-standard protocols.
1198 if (url.protocol != "https:" && url.protocol != "http:") {
1203 let paramValue = url.searchParams.get(queryParam);
1204 if (queryParamValueIsHref) {
1206 paramValue = new URL(paramValue).hostname;
1210 paramValue = this.#processDomain(paramValue, providerName);
1212 if (paramValue && !extractedDomains.has(paramValue)) {
1213 extractedDomains.add(paramValue);
1215 } else if (url.hostname) {
1216 let processedHostname = this.#processDomain(url.hostname, providerName);
1217 if (processedHostname && !extractedDomains.has(processedHostname)) {
1218 extractedDomains.add(processedHostname);
1225 * Given a list of elements, examine each for the specified data attribute.
1226 * If found, add that data attribute's value to the result set of extracted
1229 * @param {NodeList<Element>} elements
1230 * A list of elements from the page whose data attributes we want to
1232 * @param {string} providerName
1233 * The name of the search provider.
1234 * @param {string} attribute
1235 * The name of a data attribute to search for within an element.
1236 * @param {Set<string>} extractedDomains
1237 * The result set of domains extracted from the page.
1239 #fromElementsRetrieveDataAttributeValues(
1245 for (let element of elements) {
1246 if (this.#exceedsThreshold(extractedDomains.size)) {
1249 let value = element.dataset[attribute];
1250 value = this.#processDomain(value, providerName);
1251 if (value && !extractedDomains.has(value)) {
1252 extractedDomains.add(value);
/**
 * Given a list of elements, examine the text content for each element, which
1258 * may be 1) a URL from which we can extract a domain or 2) text we can fix
1259 * up to create a best guess as to a URL. If either condition is met, we add
1260 * the domain to the result set.
1262 * @param {NodeList<Element>} elements
1263 * A list of elements from the page whose text content we want to inspect.
1264 * @param {Set<string>} extractedDomains
1265 * The result set of domains extracted from the page.
1266 * @param {string} providerName
1267 * The name of the search provider.
1269 #fromElementsRetrieveTextContent(elements, extractedDomains, providerName) {
1270 // Not an exhaustive regex, but it fits our purpose for this method.
1271 const LOOSE_URL_REGEX =
1272 /^(?:https?:\/\/)?(?:www\.)?(?:[\w\-]+\.)+(?:[\w\-]{2,})/i;
1274 // Known but acceptable limitations to this function, where the return
1275 // value won't be correctly fixed up:
1276 // 1) A url is embedded within other text. Ex: "xkcd.com is cool."
1277 // 2) The url contains legal but unusual characters. Ex: $ ! * '
1278 function fixup(textContent) {
1281 .replaceAll(" ", "")
1286 for (let element of elements) {
1287 if (this.#exceedsThreshold(extractedDomains.size)) {
1290 let textContent = element.textContent;
1296 if (LOOSE_URL_REGEX.test(textContent)) {
1297 // Creating a new URL object will throw if the protocol is missing.
1298 if (!/^https?:\/\//.test(textContent)) {
1299 textContent = "https://" + textContent;
1303 domain = new URL(textContent).hostname;
1305 domain = fixup(textContent);
1308 domain = fixup(textContent);
1311 let processedDomain = this.#processDomain(domain, providerName);
1312 if (processedDomain && !extractedDomains.has(processedDomain)) {
1313 extractedDomains.add(processedDomain);
1319 * Processes a raw domain extracted from the SERP into its final form before
1322 * @param {string} domain
1323 * The domain extracted from the page.
1324 * @param {string} providerName
1325 * The provider associated with the page.
1327 * The domain without any subdomains.
1329 #processDomain(domain, providerName) {
1331 domain.startsWith(`${providerName}.`) ||
1332 domain.includes(`.${providerName}.`)
1336 return this.#stripDomainOfSubdomains(domain);
1340 * Helper to strip domains of any subdomains.
1342 * @param {string} domain
1343 * The domain to strip of any subdomains.
 * @returns {string}
 *  The given domain with any subdomains removed, or an empty string if the
 *  domain could not be reduced.
1347 #stripDomainOfSubdomains(domain) {
1349 // Can throw an exception if the input has too few domain levels.
1351 tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
1356 let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
1357 let secondLevelDomain = domainWithoutTLD.split(".").at(-2);
1359 return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
1363 * Per a request from Data Science, we need to limit the number of domains
1364 * categorized to 10 non-ad domains and 10 ad domains.
1366 * @param {number} nDomains The number of domains processed.
1367 * @returns {boolean} Whether or not the threshold was exceeded.
1369 #exceedsThreshold(nDomains) {
1370 return nDomains >= CATEGORIZATION_SETTINGS.MAX_DOMAINS_TO_CATEGORIZE;
// Module-level singletons shared by every actor instance created from this
// module. `domainExtractor` is exported; the other two are private.
export const domainExtractor = new DomainExtractor();
const searchProviders = new SearchProviders();
const searchAdImpression = new SearchAdImpression();

// Per-document state, keyed weakly by document.
// Document -> callback that forwards page actions to the parent process.
const documentToEventCallbackMap = new WeakMap();
// Document -> iterable of callbacks that remove previously added listeners.
const documentToRemoveEventListenersMap = new WeakMap();
// Document -> true once a "submitted" action has been seen for it.
const documentToSubmitMap = new WeakMap();
1383 * SearchTelemetryChild monitors for pages that are partner searches, and
 * looks through them to find links which look like adverts and sends back
1385 * a notification to SearchTelemetry for possible telemetry reporting.
1387 * Only the partner details and the fact that at least one ad was found on the
1388 * page are returned to SearchTelemetry. If no ads are found, no notification is
1391 export class SearchSERPTelemetryChild extends JSWindowActorChild {
1393 * Amount of time to wait after a page event before examining the page
1396 * @type {number | null}
1400 * Determines if there is a provider that matches the supplied URL and returns
1401 * the information associated with that provider.
1403 * @param {string} url The url to check
 * @returns {object|undefined} Returns undefined if there's no match,
 *   otherwise the provider information whose search page regexp matches.
1407 _getProviderInfoForUrl(url) {
1408 return searchProviders.info?.find(info => info.searchPageRegexp.test(url));
1412 * Checks to see if the page is a partner and has an ad link within it. If so,
1413 * it will notify SearchTelemetry.
1415 _checkForAdLink(eventType) {
1417 if (!this.contentWindow) {
1421 // unload occurred before the timer expired
1425 let doc = this.document;
1426 let url = doc.documentURI;
1427 let providerInfo = this._getProviderInfoForUrl(url);
1428 if (!providerInfo) {
1432 let regexps = providerInfo.extraAdServersRegexps;
1433 let anchors = doc.getElementsByTagName("a");
1435 for (let anchor of anchors) {
1439 for (let name of providerInfo.adServerAttributes) {
1440 hasAds = regexps.some(regexp => regexp.test(anchor.dataset[name]));
1446 hasAds = regexps.some(regexp => regexp.test(anchor.href));
1454 this.sendAsyncMessage("SearchTelemetry:PageInfo", {
1461 providerInfo.components?.length &&
1462 (eventType == "load" || eventType == "pageshow")
1464 // Start performance measurements.
1465 let start = Cu.now();
1466 let timerId = Glean.serp.categorizationDuration.start();
1468 let pageActionCallback = info => {
1469 if (info.action == "submitted") {
1470 documentToSubmitMap.set(doc, true);
1472 this.sendAsyncMessage("SearchTelemetry:Action", {
1473 target: info.target,
1475 action: info.action,
1478 documentToEventCallbackMap.set(this.document, pageActionCallback);
1480 let componentToVisibilityMap, hrefToComponentMap;
1482 let result = searchAdImpression.categorize(anchors, doc);
1483 componentToVisibilityMap = result.componentToVisibilityMap;
1484 hrefToComponentMap = result.hrefToComponentMap;
1486 // Cancel the timer if an error encountered.
1487 Glean.serp.categorizationDuration.cancel(timerId);
1490 if (componentToVisibilityMap && hrefToComponentMap) {
1491 // End measurements.
1492 ChromeUtils.addProfilerMarker(
1493 "SearchSERPTelemetryChild._checkForAdLink",
1495 "Checked anchors for visibility"
1497 Glean.serp.categorizationDuration.stopAndAccumulate(timerId);
1498 this.sendAsyncMessage("SearchTelemetry:AdImpressions", {
1499 adImpressions: componentToVisibilityMap,
1507 lazy.serpEventTelemetryCategorization &&
1508 providerInfo.domainExtraction &&
1509 (eventType == "load" || eventType == "pageshow")
1511 let start = Cu.now();
1512 let nonAdDomains = domainExtractor.extractDomainsFromDocument(
1514 providerInfo.domainExtraction.nonAds,
1515 providerInfo.telemetryId
1517 let adDomains = domainExtractor.extractDomainsFromDocument(
1519 providerInfo.domainExtraction.ads,
1520 providerInfo.telemetryId
1523 this.sendAsyncMessage("SearchTelemetry:Domains", {
1529 ChromeUtils.addProfilerMarker(
1530 "SearchSERPTelemetryChild._checkForAdLink",
1532 "Extract domains from elements"
1538 * Checks for the presence of certain components on the page that are
1539 * required for recording the page impression.
1541 #checkForPageImpressionComponents() {
1542 let url = this.document.documentURI;
1543 let providerInfo = this._getProviderInfoForUrl(url);
1544 if (providerInfo.components?.length) {
1545 searchAdImpression.providerInfo = providerInfo;
1546 let start = Cu.now();
1547 let shoppingTabDisplayed = searchAdImpression.hasShoppingTab(
1550 ChromeUtils.addProfilerMarker(
1551 "SearchSERPTelemetryChild.#recordImpression",
1553 "Checked for shopping tab"
1555 this.sendAsyncMessage("SearchTelemetry:PageImpression", {
1557 shoppingTabDisplayed,
1562 #removeEventListeners() {
1563 let callbacks = documentToRemoveEventListenersMap.get(this.document);
1565 for (let callback of callbacks) {
1568 documentToRemoveEventListenersMap.delete(this.document);
1573 * Handles events received from the actor child notifications.
1575 * @param {object} event The event details.
1577 handleEvent(event) {
1578 if (!this.#urlIsSERP(this.document.documentURI)) {
1581 switch (event.type) {
1583 // If a page is loaded from the bfcache, we won't get a "DOMContentLoaded"
1584 // event, so we need to rely on "pageshow" in this case. Note: we do this
1585 // so that we remain consistent with the *.in-content:sap* count for the
1586 // SEARCH_COUNTS histogram.
1587 if (event.persisted) {
1588 this.#checkForPageImpressionComponents();
1589 this.#check(event.type);
1593 case "DOMContentLoaded": {
1594 this.#checkForPageImpressionComponents();
1595 this.#check(event.type);
1599 // We check both DOMContentLoaded and load in case the page has
1600 // taken a long time to load and the ad is only detected on load.
1601 // We still check at DOMContentLoaded because if the page hasn't
1602 // finished loading and the user navigates away, we still want to know
1603 // if there were ads on the page or not at that time.
1604 this.#check(event.type);
1608 let callbacks = documentToRemoveEventListenersMap.get(this.document);
1610 for (let removeEventListenerCallback of callbacks) {
1611 removeEventListenerCallback();
1613 documentToRemoveEventListenersMap.delete(this.document);
1615 this.#cancelCheck();
1621 async receiveMessage(message) {
1622 switch (message.name) {
1623 case "SearchSERPTelemetry:WaitForSPAPageLoad":
1624 lazy.setTimeout(() => {
1625 this.#checkForPageImpressionComponents();
1626 this._checkForAdLink("load");
1627 }, Services.cpmm.sharedData.get(SEARCH_TELEMETRY_SHARED.SPA_LOAD_TIMEOUT));
1629 case "SearchSERPTelemetry:StopTrackingDocument":
1630 this.#removeDocumentFromSubmitMap();
1631 this.#removeEventListeners();
1633 case "SearchSERPTelemetry:DidSubmit":
1634 return this.#didSubmit();
1640 return documentToSubmitMap.get(this.document);
1643 #removeDocumentFromSubmitMap() {
1644 documentToSubmitMap.delete(this.document);
1648 let provider = this._getProviderInfoForUrl(this.document.documentURI);
1650 // Some URLs can match provider info but also be the provider's homepage
1651 // instead of a SERP.
1652 // e.g. https://example.com/ vs. https://example.com/?foo=bar
1653 // To check this, we look for the presence of the query parameter
1654 // that contains a search term.
1655 let queries = new URLSearchParams(url.split("#")[0].split("?")[1]);
1656 for (let queryParamName of provider.queryParamNames) {
1657 if (queries.get(queryParamName)) {
1666 if (this._waitForContentTimeout) {
1667 lazy.clearTimeout(this._waitForContentTimeout);
1672 if (!this.#adTimeout) {
1673 this.#adTimeout = Services.cpmm.sharedData.get(
1674 SEARCH_TELEMETRY_SHARED.LOAD_TIMEOUT
1677 this.#cancelCheck();
1678 this._waitForContentTimeout = lazy.setTimeout(() => {
1679 this._checkForAdLink(eventType);
1680 }, this.#adTimeout);