1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
9 ChromeUtils.defineESModuleGetters(lazy, {
10 clearTimeout: "resource://gre/modules/Timer.sys.mjs",
11 setTimeout: "resource://gre/modules/Timer.sys.mjs",
14 XPCOMUtils.defineLazyPreferenceGetter(
17 "browser.search.serpEventTelemetry.enabled",
21 XPCOMUtils.defineLazyPreferenceGetter(
23 "serpEventTelemetryCategorization",
24 "browser.search.serpEventTelemetryCategorization.enabled",
28 export const CATEGORIZATION_SETTINGS = {
29 MAX_DOMAINS_TO_CATEGORIZE: 10,
32 // Duplicated from SearchSERPTelemetry to avoid loading the module on content
34 const SEARCH_TELEMETRY_SHARED = {
35 PROVIDER_INFO: "SearchTelemetry:ProviderInfo",
36 LOAD_TIMEOUT: "SearchTelemetry:LoadTimeout",
37 SPA_LOAD_TIMEOUT: "SearchTelemetry:SPALoadTimeout",
41 * Standard events mapped to the telemetry action.
43 const EVENT_TYPE_TO_ACTION = {
48 * A map of object conditions mapped to the condition that should be run when
49 * an event is triggered. The condition name is referenced in Remote Settings
50 * under the optional `condition` string for an event listener.
53 keydownEnter: event => event.key == "Enter",
57 * SearchProviders looks after keeping track of the search provider information
58 * received from the main process.
60 * It is separate to SearchTelemetryChild so that it is not constructed for each
61 * tab, but once per process.
63 class SearchProviders {
65 this._searchProviderInfo = null;
66 Services.cpmm.sharedData.addEventListener("change", this);
70 * Gets the search provider information for any provider with advert information.
71 * If there is nothing in the cache, it will obtain it from shared data.
73 * @returns {object} Returns the search provider information.
74 * @see SearchTelemetry.sys.mjs
77 if (this._searchProviderInfo) {
78 return this._searchProviderInfo;
81 this._searchProviderInfo = Services.cpmm.sharedData.get(
82 SEARCH_TELEMETRY_SHARED.PROVIDER_INFO
85 if (!this._searchProviderInfo) {
89 this._searchProviderInfo = this._searchProviderInfo
90 // Filter-out non-ad providers so that we're not trying to match against
91 // those unnecessarily.
92 .filter(p => "extraAdServersRegexps" in p)
93 // Pre-build the regular expressions.
95 p.adServerAttributes = p.adServerAttributes ?? [];
96 if (p.shoppingTab?.inspectRegexpInSERP) {
97 p.shoppingTab.regexp = new RegExp(p.shoppingTab.regexp);
101 searchPageRegexp: new RegExp(p.searchPageRegexp),
102 extraAdServersRegexps: p.extraAdServersRegexps.map(
108 return this._searchProviderInfo;
112 * Handles events received from sharedData notifications.
114 * @param {object} event The event details.
117 switch (event.type) {
119 if (event.changedKeys.includes(SEARCH_TELEMETRY_SHARED.PROVIDER_INFO)) {
120 // Just null out the provider information for now, we'll fetch it next
122 this._searchProviderInfo = null;
131 * @typedef {object} EventListenerParam
132 * @property {string} eventType
133 * The type of event the listener should listen for. If the event type is
134 * non-standard, it should correspond to a definition in
135 * CUSTOM_EVENT_TYPE_TO_DATA that will re-map it to a standard type. TODO
136 * @property {string} target
137 * The type of component that was the source of the event.
138 * @property {string | null} action
139 * The action that should be reported in telemetry.
143 * Provides a way to add listeners to elements, as well as unload them.
145 class ListenerHelper {
147 * Adds each event listener in an array of event listeners to each element
148 * in an array of elements, and sets their unloading.
150 * @param {Array<Element>} elements
151 * DOM elements to add event listeners to.
152 * @param {Array<EventListenerParam>} eventListenerParams
153 * The event listener definitions to add to each element.
154 * @param {string} target
156 static addListeners(elements, eventListenerParams, target) {
157 if (!elements?.length || !eventListenerParams?.length) {
161 let document = elements[0].ownerGlobal.document;
162 let callback = documentToEventCallbackMap.get(document);
167 // The map might have entries from previous callers, so we must ensure
168 // we don't discard existing event listener callbacks.
169 let removeListenerCallbacks = [];
170 if (documentToRemoveEventListenersMap.has(document)) {
171 removeListenerCallbacks = documentToRemoveEventListenersMap.get(document);
174 for (let params of eventListenerParams) {
175 let removeListeners = ListenerHelper.addListener(
181 removeListenerCallbacks = removeListenerCallbacks.concat(removeListeners);
184 documentToRemoveEventListenersMap.set(document, removeListenerCallbacks);
188 * Add an event listener to each element in an array of elements.
190 * @param {Array<Element>} elements
191 * DOM elements to add event listeners to.
192 * @param {EventListenerParam} eventListenerParam
193 * @param {string} target
194 * @param {Function} callback
195 * @returns {Array<function>} Array of remove event listener functions.
197 static addListener(elements, eventListenerParam, target, callback) {
198 let { action, eventType, target: customTarget } = eventListenerParam;
201 target = customTarget;
205 action = EVENT_TYPE_TO_ACTION[eventType];
211 // Some events might have specific conditions we want to check before
212 // registering an engagement event.
214 if (eventListenerParam.condition) {
215 if (CONDITIONS[eventListenerParam.condition]) {
216 let condition = CONDITIONS[eventListenerParam.condition];
217 eventCallback = async event => {
218 let start = Cu.now();
219 if (condition(event)) {
220 callback({ action, target });
222 ChromeUtils.addProfilerMarker(
223 "SearchSERPTelemetryChild._eventCallback",
225 "Call cached function before callback."
229 // If a component included a condition, but it wasn't found it is
230 // due to the fact that it was added in a more recent Firefox version
231 // than what is provided via search-telemetry-v2. Since the version of
232 // Firefox the user is using doesn't include this condition,
233 // we shouldn't add the event.
237 eventCallback = () => {
238 callback({ action, target });
242 let removeListenerCallbacks = [];
243 for (let element of elements) {
244 element.addEventListener(eventType, eventCallback);
245 removeListenerCallbacks.push(() => {
246 element.removeEventListener(eventType, eventCallback);
249 return removeListenerCallbacks;
254 * Scans SERPs for ad components.
256 class SearchAdImpression {
258 * A reference to ad component information that is used if an anchor
259 * element could not be categorized to a specific ad component.
263 #defaultComponent = null;
266 * Maps DOM elements to AdData.
268 * @type {Map<Element, AdData>}
272 * @property {string} type
273 * The type of ad component.
274 * @property {number} adsLoaded
275 * The number of ads counted as loaded for the component.
276 * @property {boolean} countChildren
277 * Whether all the children were counted for the component.
279 #elementToAdDataMap = new Map();
282 * An array of components to do a top-down search.
284 #topDownComponents = [];
287 * A reference to the providerInfo for this SERP.
291 #providerInfo = null;
293 set providerInfo(providerInfo) {
294 if (this.#providerInfo?.telemetryId == providerInfo.telemetryId) {
298 this.#providerInfo = providerInfo;
301 this.#topDownComponents = [];
303 for (let component of this.#providerInfo.components) {
304 if (component.default) {
305 this.#defaultComponent = component;
308 if (component.topDown) {
309 this.#topDownComponents.push(component);
315 * Check if the page has a shopping tab.
317 * @param {Document} document
319 * Whether the page has a shopping tab. Defaults to false.
321 hasShoppingTab(document) {
322 if (!this.#providerInfo?.shoppingTab) {
326 // If a provider has the inspectRegexpInSERP, we assume there must be an
327 // associated regexp that must be used on any hrefs matched by the elements
328 // found using the selector. If inspectRegexpInSERP is false, then check if
329 // the number of items found using the selector matches exactly one element
330 // to ensure we've used a fine-grained search.
331 let elements = document.querySelectorAll(
332 this.#providerInfo.shoppingTab.selector
334 if (this.#providerInfo.shoppingTab.inspectRegexpInSERP) {
335 let regexp = this.#providerInfo.shoppingTab.regexp;
336 for (let element of elements) {
337 let href = element.getAttribute("href");
338 if (href && regexp.test(href)) {
339 this.#recordElementData(element, {
340 type: "shopping_tab",
346 } else if (elements.length == 1) {
347 this.#recordElementData(elements[0], {
348 type: "shopping_tab",
357 * Examine the list of anchors and the document object and find components
360 * With the list of anchors, go through each and find the component it
361 * belongs to and save it in elementToAdDataMap.
363 * Then, with the document object find components and save the results to
364 * elementToAdDataMap.
366 * Lastly, combine the results together in a new Map that contains the number
367 * of loaded, visible, and blocked results for the component.
369 * @param {HTMLCollectionOf<HTMLAnchorElement>} anchors
370 * @param {Document} document
372 * @returns {{componentToVisibilityMap: Map<string, object>, hrefToComponentMap: Map<string, string>}}
373 * componentToVisibilityMap maps the type of ad component to an object
374 * containing the number of adsLoaded, adsVisible, and adsHidden within the
375 * component. hrefToComponentMap maps an extracted href to its component type.
377 categorize(anchors, document) {
378 // Used for various functions to make relative URLs absolute.
379 let origin = new URL(document.documentURI).origin;
381 // Bottom up approach.
382 this.#categorizeAnchors(anchors, origin);
384 // Top down approach.
385 this.#categorizeDocument(document);
387 let componentToVisibilityMap = new Map();
388 let hrefToComponentMap = new Map();
390 let innerWindowHeight = document.ownerGlobal.innerHeight;
391 let scrollY = document.ownerGlobal.scrollY;
393 // Iterate over the results:
394 // - If it's a searchbox, add event listeners.
395 // - If it is a non_ads_link, map its href to component type.
396 // - For others, map its component type and check visibility.
397 for (let [element, data] of this.#elementToAdDataMap.entries()) {
398 if (data.type == "incontent_searchbox") {
399 // Bug 1880413: Deprecate hard coding the incontent search box.
400 // If searchbox has child elements, observe those, otherwise
401 // fallback to its parent element.
402 let searchElements = data.childElements.length
405 ListenerHelper.addListeners(
408 { eventType: "click", target: data.type },
410 eventType: "keydown",
413 condition: "keydownEnter",
420 if (data.childElements.length) {
421 for (let child of data.childElements) {
422 let href = this.#extractHref(child, origin);
424 hrefToComponentMap.set(href, data.type);
428 let href = this.#extractHref(element, origin);
430 hrefToComponentMap.set(href, data.type);
434 // If the component is a non_ads_link, skip visibility checks.
435 if (data.type == "non_ads_link") {
439 // If proxy children were found, check the visibility of all of them
440 // otherwise just check the visibility of the first child.
442 if (data.proxyChildElements.length) {
443 childElements = data.proxyChildElements;
444 } else if (data.childElements.length) {
445 childElements = [data.childElements[0]];
448 let count = this.#countVisibleAndHiddenAds(
455 if (componentToVisibilityMap.has(data.type)) {
456 let componentInfo = componentToVisibilityMap.get(data.type);
457 componentInfo.adsLoaded += data.adsLoaded;
458 componentInfo.adsVisible += count.adsVisible;
459 componentInfo.adsHidden += count.adsHidden;
461 componentToVisibilityMap.set(data.type, {
462 adsLoaded: data.adsLoaded,
463 adsVisible: count.adsVisible,
464 adsHidden: count.adsHidden,
469 // Release the DOM elements from the Map.
470 this.#elementToAdDataMap.clear();
472 return { componentToVisibilityMap, hrefToComponentMap };
476 * Given an element, find the href that is most likely to make the request if
477 * the element is clicked. If the element contains a specific data attribute
478 * known to contain the url used to make the initial request, use it,
479 * otherwise use its href. Specific character conversions are done to mimic
480 * conversions likely to take place when urls are observed in network
483 * @param {Element} element
484 * The element to inspect.
485 * @param {string} origin
486 * The origin for relative urls.
488 * The href of the element.
490 #extractHref(element, origin) {
492 // Prioritize the href from a known data attribute value instead of
493 // its href property, as the former is the initial url the page will
494 // navigate to before being re-directed to the href.
495 for (let name of this.#providerInfo.adServerAttributes) {
497 element.dataset[name] &&
498 this.#providerInfo.extraAdServersRegexps.some(regexp =>
499 regexp.test(element.dataset[name])
502 href = element.dataset[name];
506 // If a data attribute value was not found, fallback to the href.
507 href = href ?? element.getAttribute("href");
512 // Avoid extracting or fixing up Javascript URLs.
513 if (href.startsWith("javascript")) {
517 // Hrefs can be relative.
518 if (!href.startsWith("https://") && !href.startsWith("http://")) {
519 href = origin + href;
521 // Per Bug 376844, apostrophes in query params are escaped, and thus, are
522 // percent-encoded by the time they are observed in the network. Even
523 // though it's more comprehensive, we avoid using newURI because it's more
524 // expensive and conversions should be the exception.
525 // e.g. /path'?q=Mozilla's -> /path'?q=Mozilla%27s
526 let arr = href.split("?");
527 if (arr.length == 2 && arr[1].includes("'")) {
528 href = arr[0] + "?" + arr[1].replaceAll("'", "%27");
534 * Given a list of anchor elements, group them into ad components.
536 * The first step in the process is to check if the anchor should be
537 * inspected. This is based on whether it contains an href or a
538 * data-attribute value that matches an ad link, or if it contains a
539 * pattern caught by a component's included regular expression.
541 * Determine which component it belongs to and the number of matches for
542 * the component. The heuristic is described in findDataForAnchor.
543 * If there was a result and we haven't seen it before, save it in
544 * elementToAdDataMap.
546 * @param {HTMLCollectionOf<HTMLAnchorElement>} anchors
547 * The list of anchors to inspect.
548 * @param {string} origin
549 * The origin of the document the anchors belong to.
551 #categorizeAnchors(anchors, origin) {
552 for (let anchor of anchors) {
553 if (this.#shouldInspectAnchor(anchor, origin)) {
554 let result = this.#findDataForAnchor(anchor);
556 this.#recordElementData(result.element, {
559 proxyChildElements: result.proxyChildElements,
560 childElements: result.childElements,
563 if (result.relatedElements?.length) {
564 // Bug 1880413: Deprecate related elements.
565 // Bottom-up approach with related elements are only used for
566 // non-link elements related to ads, like carousel arrows.
567 ListenerHelper.addListeners(
568 result.relatedElements,
583 * Find components from the document object. This is mostly relevant for
584 * components that are non-ads and don't have an obvious regular expression
585 * that could match the pattern of the href.
587 * @param {Document} document
589 #categorizeDocument(document) {
590 // using the subset of components that are top down,
591 // go through each one.
592 for (let component of this.#topDownComponents) {
593 // Top-down searches must have the topDown attribute.
594 if (!component.topDown) {
597 // Top down searches must include a parent.
598 if (!component.included?.parent) {
601 let parents = document.querySelectorAll(
602 component.included.parent.selector
604 if (parents.length) {
605 let eventListeners = component.included.parent.eventListeners;
606 if (eventListeners?.length) {
607 ListenerHelper.addListeners(parents, eventListeners, component.type);
609 for (let parent of parents) {
610 // Bug 1880413: Deprecate related elements.
611 // Top-down related elements are either used for auto-suggested
612 // elements of a searchbox, or elements on a page which we can't
613 // find through a bottom up approach but we want to add a listener,
614 // like carousels with arrows.
615 if (component.included.related?.selector) {
616 let relatedElements = parent.querySelectorAll(
617 component.included.related.selector
619 if (relatedElements.length) {
620 // For the search box, related elements with event listeners are
621 // auto-suggested terms. For everything else (e.g. carousels)
622 // they are expanded.
623 ListenerHelper.addListeners(
628 component.type == "incontent_searchbox"
638 if (component.included.children) {
639 for (let child of component.included.children) {
640 let childElements = parent.querySelectorAll(child.selector);
641 if (childElements.length) {
642 if (child.eventListeners) {
643 childElements = Array.from(childElements);
644 ListenerHelper.addListeners(
646 child.eventListeners,
647 child.type ?? component.type
650 if (!child.skipCount) {
651 this.#recordElementData(parent, {
652 type: component.type,
653 childElements: Array.from(childElements),
658 } else if (!component.included.parent.skipCount) {
659 this.#recordElementData(parent, {
660 type: component.type,
669 * Evaluates whether an anchor should be inspected based on matching
670 * regular expressions on either its href or specified data-attribute values.
672 * @param {HTMLAnchorElement} anchor
673 * @param {string} origin
676 #shouldInspectAnchor(anchor, origin) {
677 let href = anchor.getAttribute("href");
682 // Some hrefs might be relative.
683 if (!href.startsWith("https://") && !href.startsWith("http://")) {
684 href = origin + href;
687 let regexps = this.#providerInfo.extraAdServersRegexps;
688 // Anchors can contain ad links in a data-attribute.
689 for (let name of this.#providerInfo.adServerAttributes) {
690 let attributeValue = anchor.dataset[name];
693 regexps.some(regexp => regexp.test(attributeValue))
698 // Anchors can contain ad links in a specific href.
699 if (regexps.some(regexp => regexp.test(href))) {
706 * Find the component data for an anchor.
708 * To categorize the anchor, we iterate over the list of possible components
709 * the anchor could be categorized as. If the component is default, we skip
710 * checking because the fallback option for all anchor links is the default.
712 * First, get the "parent" of the anchor which best represents the DOM element
713 * that contains the anchor links for the component and no other component.
714 * This parent will be cached so that other anchors that share the same
715 * parent can be counted together.
717 * The check for a parent is a loop because we can define more than one best
718 * parent since on certain SERPs, it's possible for a "better" DOM element
719 * parent to appear occasionally.
721 * If no parent is found, skip this component.
723 * If a parent was found, check for specific child elements.
725 * Finding child DOM elements of a parent is optional. One reason to do so is
726 * to use child elements instead of anchor links to count the number of ads for
727 * a component via the `countChildren` property. This is provided because some ads
728 * (i.e. carousels) have multiple ad links in a single child element that go to the
729 * same location. In this scenario, all instances of the child are recorded as ads.
730 * Subsequent anchor elements that map to the same parent are ignored.
732 * Whether or not a child was found, return the information that was found,
733 * including whether or not all child elements were counted instead of anchors.
735 * If another anchor belonging to a parent that was previously recorded is the input
736 * for this function, we either increment the ad count by 1 or don't increment the ad
737 * count because the parent used `countChildren` completed the calculation in a
741 * @param {HTMLAnchorElement} anchor
742 * The anchor to be inspected.
744 * An object containing the element representing the root DOM element for
745 * the component, the type of component, how many ads were counted,
746 * and whether or not the count was of all the children.
748 #findDataForAnchor(anchor) {
749 for (let component of this.#providerInfo.components) {
750 // First, check various conditions for skipping a component.
752 // A component should always have at least one included statement.
753 if (!component.included) {
757 // Top down searches are done after the bottom up search.
758 if (component.topDown) {
762 // The default component doesn't need to be checked,
763 // as it will be the fallback option.
764 if (component.default) {
768 // The anchor shouldn't belong to an excluded parent component if one
771 component.excluded?.parent?.selector &&
772 anchor.closest(component.excluded.parent.selector)
777 // All components with included should have a parent entry.
778 if (!component.included.parent) {
782 // Find the parent of the anchor.
783 let parent = anchor.closest(component.included.parent.selector);
789 // If we've already inspected the parent, add the child element to the
790 // list of anchors. Don't increment the ads loaded count, as we only care
791 // about grouping the anchor with the correct parent.
792 if (this.#elementToAdDataMap.has(parent)) {
795 childElements: [anchor],
799 let relatedElements = [];
800 if (component.included.related?.selector) {
801 relatedElements = parent.querySelectorAll(
802 component.included.related.selector
806 // If the component has no defined children, return the parent element.
807 if (component.included.children) {
808 // Look for the first instance of a matching child selector.
809 for (let child of component.included.children) {
810 // If counting by child, get all of them at once.
811 if (child.countChildren) {
812 let proxyChildElements = parent.querySelectorAll(child.selector);
813 if (proxyChildElements.length) {
816 type: child.type ?? component.type,
817 proxyChildElements: Array.from(proxyChildElements),
818 count: proxyChildElements.length,
819 childElements: [anchor],
823 } else if (parent.querySelector(child.selector)) {
826 type: child.type ?? component.type,
827 childElements: [anchor],
833 // If no children were defined for this component, or none were found
834 // in the DOM, use the default definition.
837 type: component.type,
838 childElements: [anchor],
842 // If no component was found, use default values.
845 type: this.#defaultComponent.type,
850 * Determines whether or not an ad was visible or hidden.
852 * An ad is considered visible if the parent element containing the
853 * component has non-zero dimensions, and all child elements in the
854 * component have non-zero dimensions and fit within the window
855 * at the time when the impression was taken.
857 * For some components, like text ads, we don't send every child
858 * element for visibility, just the first text ad. For other components
859 * like carousels, we send all child elements because we do care about
860 * counting how many elements of the carousel were visible.
862 * @param {Element} element
863 * Element to be inspected
864 * @param {number} adsLoaded
865 * Number of ads initially determined to be loaded for this element.
866 * @param {Array<Element>} childElements
867 * List of children belonging to element.
868 * @param {number} innerWindowHeight
869 * Current height of the window containing the elements.
870 * @param {number} scrollY
871 * Current distance the window has been scrolled.
873 * Contains adsVisible which is the number of ads shown for the element
874 * and adsHidden, the number of ads not visible to the user.
876 #countVisibleAndHiddenAds(
884 element.ownerGlobal.windowUtils.getBoundsWithoutFlushing(element);
886 // If the element lacks a dimension, assume all ads that
887 // were contained within it are hidden.
888 if (elementRect.width == 0 || elementRect.height == 0) {
891 adsHidden: adsLoaded,
895 // If an ad is far above the possible visible area of a window, an
896 // adblocker might be doing it as a workaround for blocking the ad.
898 elementRect.bottom < 0 &&
899 innerWindowHeight + scrollY + elementRect.bottom < 0
903 adsHidden: adsLoaded,
907 // Since the parent element has dimensions but no child elements we want
908 // to inspect, check the parent itself is within the viewable area.
909 if (!childElements || !childElements.length) {
910 if (innerWindowHeight < elementRect.y + elementRect.height) {
924 for (let child of childElements) {
926 child.ownerGlobal.windowUtils.getBoundsWithoutFlushing(child);
928 // If the child element we're inspecting has no dimension, it is hidden.
929 if (itemRect.height == 0 || itemRect.width == 0) {
934 // If the child element is to the left of the containing element, or to
935 // the right of the containing element, skip it.
937 itemRect.x < elementRect.x ||
938 itemRect.x + itemRect.width > elementRect.x + elementRect.width
943 // If the child element is too far down, skip it.
944 if (innerWindowHeight < itemRect.y + itemRect.height) {
957 * Caches ad data for a DOM element. The key of the map is by Element rather
958 * than Component for fast lookup on whether an Element has already been
959 * categorized as a component. Subsequent calls to this passing the same
960 * element will update the list of child elements.
962 * @param {Element} element
963 * The element considered to be the root for the component.
964 * @param {object} params
965 * Various parameters that can be recorded. Whether the input values exist
966 * or not depends on which component was found, which heuristic should be used
967 * to determine whether an ad was visible, and whether we've already seen this
969 * @param {string | null} params.type
970 * The type of component.
971 * @param {number} params.count
972 * The number of ads found for a component. The number represents either
973 * the number of elements that match an ad expression or the number of DOM
974 * elements containing an ad link.
975 * @param {Array<Element>} params.proxyChildElements
976 * An array of DOM elements that should be inspected for visibility instead
977 * of the actual child elements, possibly because they are grouped.
978 * @param {Array<Element>} params.childElements
979 * An array of DOM elements to inspect.
983 { type, count = 1, proxyChildElements = [], childElements = [] } = {}
985 if (this.#elementToAdDataMap.has(element)) {
986 let recordedValues = this.#elementToAdDataMap.get(element);
987 if (childElements.length) {
988 recordedValues.childElements =
989 recordedValues.childElements.concat(childElements);
992 this.#elementToAdDataMap.set(element, {
1003 * An object indicating which elements to examine for domains to extract and
1004 * which heuristic technique to use to extract that element's domain.
1006 * @typedef {object} ExtractorInfo
1007 * @property {string} selectors
1008 * A string representing the CSS selector that targets the elements on the
1009 * page that contain domains we want to extract.
1010 * @property {string} method
1011 * A string representing which domain extraction heuristic to use.
1012 * One of: "href", "dataAttribute" or "textContent".
1013 * @property {object | null} options
1014 * Options related to the domain extraction heuristic used.
1015 * @property {string | null} options.dataAttributeKey
1016 * The key name of the data attribute to lookup.
1017 * @property {string | null} options.queryParamKey
1018 * The key name of the query param value to lookup.
1019 * @property {boolean | null} options.queryParamValueIsHref
1020 * Whether the query param value is expected to contain an href.
1024 * DomainExtractor examines elements on a page to retrieve the domains.
1026 class DomainExtractor {
1028 * Extract domains from the page using an array of information pertaining to
1031 * @param {Document} document
1032 * The document for the SERP we are extracting domains from.
1033 * @param {Array<ExtractorInfo>} extractorInfos
1034 * Information used to target the domains we need to extract.
1035 * @param {string} providerName
1036 * Name of the search provider.
1037 * @returns {Set<string>}
1038 * A set of the domains extracted from the page.
1040 extractDomainsFromDocument(document, extractorInfos, providerName) {
1041 let extractedDomains = new Set();
1042 if (!extractorInfos?.length) {
1043 return extractedDomains;
1046 for (let extractorInfo of extractorInfos) {
1047 if (!extractorInfo.selectors) {
1051 let elements = document.querySelectorAll(extractorInfo.selectors);
1056 switch (extractorInfo.method) {
1058 // Origin is used in case a URL needs to be made absolute.
1059 let origin = new URL(document.documentURI).origin;
1060 this.#fromElementsConvertHrefsIntoDomains(
1065 extractorInfo.options?.queryParamKey,
1066 extractorInfo.options?.queryParamValueIsHref
1070 case "dataAttribute": {
1071 this.#fromElementsRetrieveDataAttributeValues(
1074 extractorInfo.options?.dataAttributeKey,
1079 case "textContent": {
1080 this.#fromElementsRetrieveTextContent(elements, extractedDomains);
1086 return extractedDomains;
1090 * Given a list of elements, extract domains using href attributes. If the
1091 * URL in the href includes the specified query param, the domain will be
1092 * that query param's value. Otherwise it will be the hostname of the href
1095 * @param {NodeList<Element>} elements
1096 * A list of elements from the page whose href attributes we want to
1098 * @param {string} origin
1099 * Origin of the current page.
1100 * @param {string} providerName
1101 * The name of the search provider.
1102 * @param {Set<string>} extractedDomains
1103 * The result set of domains extracted from the page.
1104 * @param {string | null} queryParam
1105 * An optional query param to search for in an element's href attribute.
1106 * @param {boolean | null} queryParamValueIsHref
1107 * Whether the query param value is expected to contain an href.
1109 #fromElementsConvertHrefsIntoDomains(
1115 queryParamValueIsHref
1117 for (let element of elements) {
1118 if (this.#exceedsThreshold(extractedDomains.size)) {
1122 let href = element.getAttribute("href");
1126 url = new URL(href, origin);
1131 // Ignore non-standard protocols.
1132 if (url.protocol != "https:" && url.protocol != "http:") {
1137 let paramValue = url.searchParams.get(queryParam);
1138 if (queryParamValueIsHref) {
1140 paramValue = new URL(paramValue).hostname;
1144 paramValue = this.#processDomain(paramValue, providerName);
1146 if (paramValue && !extractedDomains.has(paramValue)) {
1147 extractedDomains.add(paramValue);
1149 } else if (url.hostname) {
1150 let processedHostname = this.#processDomain(url.hostname, providerName);
1151 if (processedHostname && !extractedDomains.has(processedHostname)) {
1152 extractedDomains.add(processedHostname);
1159 * Given a list of elements, examine each for the specified data attribute.
1160 * If found, add that data attribute's value to the result set of extracted
1163 * @param {NodeList<Element>} elements
1164 * A list of elements from the page whose data attributes we want to
1166 * @param {string} providerName
1167 * The name of the search provider.
1168 * @param {string} attribute
1169 * The name of a data attribute to search for within an element.
1170 * @param {Set<string>} extractedDomains
1171 * The result set of domains extracted from the page.
1173 #fromElementsRetrieveDataAttributeValues(
1179 for (let element of elements) {
1180 if (this.#exceedsThreshold(extractedDomains.size)) {
1183 let value = element.dataset[attribute];
1184 value = this.#processDomain(value, providerName);
1185 if (value && !extractedDomains.has(value)) {
1186 extractedDomains.add(value);
1191 /* Given a list of elements, examine the text content for each element, which
1192 * may be 1) a URL from which we can extract a domain or 2) text we can fix
1193 * up to create a best guess as to a URL. If either condition is met, we add
1194 * the domain to the result set.
1196 * @param {NodeList<Element>} elements
1197 * A list of elements from the page whose text content we want to inspect.
1198 * @param {Set<string>} extractedDomains
1199 * The result set of domains extracted from the page.
1201 #fromElementsRetrieveTextContent(elements, extractedDomains) {
1202 for (let element of elements) {
1203 if (this.#exceedsThreshold(extractedDomains.size)) {
1206 let textContent = element.textContent;
1213 domain = new URL(textContent).hostname;
1215 domain = textContent.toLowerCase().replaceAll(" ", "");
1216 // If the attempt to turn the text content into a URL object only fails
1217 // because we're missing a protocol, ".com" may already be present.
1218 if (!domain.endsWith(".com")) {
1219 domain = domain.concat(".com");
1222 if (!extractedDomains.has(domain)) {
1223 extractedDomains.add(domain);
1229 * Processes a raw domain extracted from the SERP into its final form before
1232 * @param {string} domain
1233 * The domain extracted from the page.
1234 * @param {string} providerName
1235 * The provider associated with the page.
1237 * The domain without any subdomains.
1239 #processDomain(domain, providerName) {
1241 domain.startsWith(`${providerName}.`) ||
1242 domain.includes(`.${providerName}.`)
1246 return this.#stripDomainOfSubdomains(domain);
1250 * Helper to strip domains of any subdomains.
1252 * @param {string} domain
1253 * The domain to strip of any subdomains.
1254 * @returns {object} browser
1255 * The given domain with any subdomains removed.
1257 #stripDomainOfSubdomains(domain) {
1259 // Can throw an exception if the input has too few domain levels.
1261 tld = Services.eTLD.getKnownPublicSuffixFromHost(domain);
1266 let domainWithoutTLD = domain.substring(0, domain.length - tld.length);
1267 let secondLevelDomain = domainWithoutTLD.split(".").at(-2);
1269 return secondLevelDomain ? `${secondLevelDomain}.${tld}` : "";
1273 * Per a request from Data Science, we need to limit the number of domains
1274 * categorized to 10 non-ad domains and 10 ad domains.
1276 * @param {number} nDomains The number of domains processed.
1277 * @returns {boolean} Whether or not the threshold was exceeded.
1279 #exceedsThreshold(nDomains) {
1280 return nDomains >= CATEGORIZATION_SETTINGS.MAX_DOMAINS_TO_CATEGORIZE;
// Shared instances created once when this module is loaded.
export const domainExtractor = new DomainExtractor();
const searchProviders = new SearchProviders();
const searchAdImpression = new SearchAdImpression();

// Per-document state, keyed weakly by document:
// - the callback used to report page actions back to the parent,
const documentToEventCallbackMap = new WeakMap();
// - the callbacks that must be run to remove the installed event listeners,
const documentToRemoveEventListenersMap = new WeakMap();
// - whether a "submitted" action has been recorded for the document.
const documentToSubmitMap = new WeakMap();
/**
 * SearchSERPTelemetryChild monitors for pages that are partner searches, and
 * looks through them to find links which look like adverts and sends back
 * a notification to SearchTelemetry for possible telemetry reporting.
 *
 * Only the partner details and the fact that at least one ad was found on the
 * page are returned to SearchTelemetry. If no ads are found, no notification is
 * given.
 */
1301 export class SearchSERPTelemetryChild extends JSWindowActorChild {
1303 * Amount of time to wait after a page event before examining the page
1306 * @type {number | null}
1310 * Determines if there is a provider that matches the supplied URL and returns
1311 * the information associated with that provider.
1313 * @param {string} url The url to check
1314 * @returns {array|null} Returns null if there's no match, otherwise an array
1315 * of provider name and the provider information.
1317 _getProviderInfoForUrl(url) {
1318 return searchProviders.info?.find(info => info.searchPageRegexp.test(url));
1322 * Checks to see if the page is a partner and has an ad link within it. If so,
1323 * it will notify SearchTelemetry.
1325 _checkForAdLink(eventType) {
1327 if (!this.contentWindow) {
1331 // unload occurred before the timer expired
1335 let doc = this.document;
1336 let url = doc.documentURI;
1337 let providerInfo = this._getProviderInfoForUrl(url);
1338 if (!providerInfo) {
1342 let regexps = providerInfo.extraAdServersRegexps;
1343 let anchors = doc.getElementsByTagName("a");
1345 for (let anchor of anchors) {
1349 for (let name of providerInfo.adServerAttributes) {
1350 hasAds = regexps.some(regexp => regexp.test(anchor.dataset[name]));
1356 hasAds = regexps.some(regexp => regexp.test(anchor.href));
1364 this.sendAsyncMessage("SearchTelemetry:PageInfo", {
1371 lazy.serpEventsEnabled &&
1372 providerInfo.components?.length &&
1373 (eventType == "load" || eventType == "pageshow")
1375 // Start performance measurements.
1376 let start = Cu.now();
1377 let timerId = Glean.serp.categorizationDuration.start();
1379 let pageActionCallback = info => {
1380 if (info.action == "submitted") {
1381 documentToSubmitMap.set(doc, true);
1383 this.sendAsyncMessage("SearchTelemetry:Action", {
1384 target: info.target,
1386 action: info.action,
1389 documentToEventCallbackMap.set(this.document, pageActionCallback);
1391 let componentToVisibilityMap, hrefToComponentMap;
1393 let result = searchAdImpression.categorize(anchors, doc);
1394 componentToVisibilityMap = result.componentToVisibilityMap;
1395 hrefToComponentMap = result.hrefToComponentMap;
1397 // Cancel the timer if an error encountered.
1398 Glean.serp.categorizationDuration.cancel(timerId);
1401 if (componentToVisibilityMap && hrefToComponentMap) {
1402 // End measurements.
1403 ChromeUtils.addProfilerMarker(
1404 "SearchSERPTelemetryChild._checkForAdLink",
1406 "Checked anchors for visibility"
1408 Glean.serp.categorizationDuration.stopAndAccumulate(timerId);
1409 this.sendAsyncMessage("SearchTelemetry:AdImpressions", {
1410 adImpressions: componentToVisibilityMap,
1418 lazy.serpEventTelemetryCategorization &&
1419 providerInfo.domainExtraction &&
1420 (eventType == "load" || eventType == "pageshow")
1422 let start = Cu.now();
1423 let nonAdDomains = domainExtractor.extractDomainsFromDocument(
1425 providerInfo.domainExtraction.nonAds,
1426 providerInfo.telemetryId
1428 let adDomains = domainExtractor.extractDomainsFromDocument(
1430 providerInfo.domainExtraction.ads,
1431 providerInfo.telemetryId
1434 this.sendAsyncMessage("SearchTelemetry:Domains", {
1440 ChromeUtils.addProfilerMarker(
1441 "SearchSERPTelemetryChild._checkForAdLink",
1443 "Extract domains from elements"
1449 * Checks for the presence of certain components on the page that are
1450 * required for recording the page impression.
1452 #checkForPageImpressionComponents() {
1453 let url = this.document.documentURI;
1454 let providerInfo = this._getProviderInfoForUrl(url);
1455 if (providerInfo.components?.length) {
1456 searchAdImpression.providerInfo = providerInfo;
1457 let start = Cu.now();
1458 let shoppingTabDisplayed = searchAdImpression.hasShoppingTab(
1461 ChromeUtils.addProfilerMarker(
1462 "SearchSERPTelemetryChild.#recordImpression",
1464 "Checked for shopping tab"
1466 this.sendAsyncMessage("SearchTelemetry:PageImpression", {
1468 shoppingTabDisplayed,
1473 #removeEventListeners() {
1474 let callbacks = documentToRemoveEventListenersMap.get(this.document);
1476 for (let callback of callbacks) {
1479 documentToRemoveEventListenersMap.delete(this.document);
1484 * Handles events received from the actor child notifications.
1486 * @param {object} event The event details.
1488 handleEvent(event) {
1489 if (!this.#urlIsSERP(this.document.documentURI)) {
1492 switch (event.type) {
1494 // If a page is loaded from the bfcache, we won't get a "DOMContentLoaded"
1495 // event, so we need to rely on "pageshow" in this case. Note: we do this
1496 // so that we remain consistent with the *.in-content:sap* count for the
1497 // SEARCH_COUNTS histogram.
1498 if (event.persisted) {
1499 this.#check(event.type);
1500 if (lazy.serpEventsEnabled) {
1501 this.#checkForPageImpressionComponents();
1506 case "DOMContentLoaded": {
1507 if (lazy.serpEventsEnabled) {
1508 this.#checkForPageImpressionComponents();
1510 this.#check(event.type);
1514 // We check both DOMContentLoaded and load in case the page has
1515 // taken a long time to load and the ad is only detected on load.
1516 // We still check at DOMContentLoaded because if the page hasn't
1517 // finished loading and the user navigates away, we still want to know
1518 // if there were ads on the page or not at that time.
1519 this.#check(event.type);
1523 let callbacks = documentToRemoveEventListenersMap.get(this.document);
1525 for (let removeEventListenerCallback of callbacks) {
1526 removeEventListenerCallback();
1528 documentToRemoveEventListenersMap.delete(this.document);
1530 this.#cancelCheck();
1536 async receiveMessage(message) {
1537 switch (message.name) {
1538 case "SearchSERPTelemetry:WaitForSPAPageLoad":
1539 lazy.setTimeout(() => {
1540 this.#checkForPageImpressionComponents();
1541 this._checkForAdLink("load");
1542 }, Services.cpmm.sharedData.get(SEARCH_TELEMETRY_SHARED.SPA_LOAD_TIMEOUT));
1544 case "SearchSERPTelemetry:StopTrackingDocument":
1545 this.#removeDocumentFromSubmitMap();
1546 this.#removeEventListeners();
1548 case "SearchSERPTelemetry:DidSubmit":
1549 return this.#didSubmit();
1555 return documentToSubmitMap.get(this.document);
1558 #removeDocumentFromSubmitMap() {
1559 documentToSubmitMap.delete(this.document);
1563 let provider = this._getProviderInfoForUrl(this.document.documentURI);
1565 // Some URLs can match provider info but also be the provider's homepage
1566 // instead of a SERP.
1567 // e.g. https://example.com/ vs. https://example.com/?foo=bar
1568 // To check this, we look for the presence of the query parameter
1569 // that contains a search term.
1570 let queries = new URLSearchParams(url.split("#")[0].split("?")[1]);
1571 for (let queryParamName of provider.queryParamNames) {
1572 if (queries.get(queryParamName)) {
1581 if (this._waitForContentTimeout) {
1582 lazy.clearTimeout(this._waitForContentTimeout);
1587 if (!this.#adTimeout) {
1588 this.#adTimeout = Services.cpmm.sharedData.get(
1589 SEARCH_TELEMETRY_SHARED.LOAD_TIMEOUT
1592 this.#cancelCheck();
1593 this._waitForContentTimeout = lazy.setTimeout(() => {
1594 this._checkForAdLink(eventType);
1595 }, this.#adTimeout);