diff options
Diffstat (limited to 'browser/components/search/SearchSERPTelemetry.sys.mjs')
-rw-r--r-- | browser/components/search/SearchSERPTelemetry.sys.mjs | 839 |
1 files changed, 839 insertions, 0 deletions
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  BrowserSearchTelemetry: "resource:///modules/BrowserSearchTelemetry.sys.mjs",
  SearchUtils: "resource://gre/modules/SearchUtils.sys.mjs",
});

XPCOMUtils.defineLazyModuleGetters(lazy, {
  RemoteSettings: "resource://services-settings/remote-settings.js",
});

// The various histograms and scalars that we report to.
const SEARCH_CONTENT_SCALAR_BASE = "browser.search.content.";
const SEARCH_WITH_ADS_SCALAR_BASE = "browser.search.withads.";
const SEARCH_AD_CLICKS_SCALAR_BASE = "browser.search.adclicks.";
const SEARCH_DATA_TRANSFERRED_SCALAR = "browser.search.data_transferred";
const SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX = "pb";

const TELEMETRY_SETTINGS_KEY = "search-telemetry-v2";

XPCOMUtils.defineLazyGetter(lazy, "logConsole", () => {
  return console.createInstance({
    prefix: "SearchTelemetry",
    maxLogLevel: lazy.SearchUtils.loggingEnabled ? "Debug" : "Warn",
  });
});

/**
 * TelemetryHandler is the main class handling Search Engine Result Page (SERP)
 * telemetry. It primarily deals with tracking of what pages are loaded into tabs.
 *
 * It handles the *in-content:sap* keys of the SEARCH_COUNTS histogram.
 */
class TelemetryHandler {
  // Whether or not this class is initialised.
  _initialized = false;

  // An instance of ContentHandler.
  _contentHandler;

  // The original provider information, mainly used for tests.
  _originalProviderInfo = null;

  // The current search provider info.
  _searchProviderInfo = null;

  // An instance of remote settings that is used to access the provider info.
  _telemetrySettings;

  // _browserInfoByURL is a map of tracked search urls to objects containing:
  // * {object} info
  //   the search provider information associated with the url.
  // * {WeakMap} browsers
  //   a weak map of browsers that have the url loaded and their ad report state.
  // * {integer} count
  //   a manual count of browsers logged.
  // * {string} source
  //   the source of the most recent search recorded for the url.
  // * {string} [newtabSessionId]
  //   the newtab session id of the most recent search recorded for the url.
  // We keep a weak map of browsers, in case we miss something on our counts
  // and cause a memory leak - worst case our map is slightly bigger than it
  // needs to be.
  // The manual count is because WeakMap doesn't give us size/length
  // information, but we want to know when we can clean up our associated
  // entry.
  _browserInfoByURL = new Map();

  // _browserSourceMap is a map of the latest search source for a particular
  // browser - one of the KNOWN_SEARCH_SOURCES in BrowserSearchTelemetry.
  _browserSourceMap = new WeakMap();

  // _browserNewtabSessionMap is a map of the newtab session id for particular
  // browsers.
  _browserNewtabSessionMap = new WeakMap();

  constructor() {
    this._contentHandler = new ContentHandler({
      browserInfoByURL: this._browserInfoByURL,
      findBrowserItemForURL: (...args) => this._findBrowserItemForURL(...args),
      checkURLForSerpMatch: (...args) => this._checkURLForSerpMatch(...args),
    });
  }

  /**
   * Initializes the TelemetryHandler and its ContentHandler. It will add
   * appropriate listeners to the window so that window opening and closing
   * can be tracked.
   */
  async init() {
    if (this._initialized) {
      return;
    }

    this._telemetrySettings = lazy.RemoteSettings(TELEMETRY_SETTINGS_KEY);
    let rawProviderInfo = [];
    try {
      rawProviderInfo = await this._telemetrySettings.get();
    } catch (ex) {
      // Deliberately best-effort: without settings we continue with an empty
      // provider list rather than failing initialization.
      lazy.logConsole.error("Could not get settings:", ex);
    }

    // Send the provider info to the child handler.
    this._contentHandler.init(rawProviderInfo);
    this._originalProviderInfo = rawProviderInfo;

    // Now convert the regexps into RegExp objects for local use.
    this._setSearchProviderInfo(rawProviderInfo);

    for (let win of Services.wm.getEnumerator("navigator:browser")) {
      this._registerWindow(win);
    }
    Services.wm.addListener(this);

    this._initialized = true;
  }

  /**
   * Uninitializes the TelemetryHandler and its ContentHandler.
   */
  uninit() {
    if (!this._initialized) {
      return;
    }

    this._contentHandler.uninit();

    for (let win of Services.wm.getEnumerator("navigator:browser")) {
      this._unregisterWindow(win);
    }
    Services.wm.removeListener(this);

    this._initialized = false;
  }

  /**
   * Records the search source for particular browsers, in case it needs
   * to be associated with a SERP.
   *
   * @param {browser} browser
   *   The browser where the search originated.
   * @param {string} source
   *    Where the search originated from.
   */
  recordBrowserSource(browser, source) {
    this._browserSourceMap.set(browser, source);
  }

  /**
   * Records the newtab source for particular browsers, in case it needs
   * to be associated with a SERP.
   *
   * @param {browser} browser
   *   The browser where the search originated.
   * @param {string} newtabSessionId
   *    The sessionId of the newtab session the search originated from.
   */
  recordBrowserNewtabSession(browser, newtabSessionId) {
    this._browserNewtabSessionMap.set(browser, newtabSessionId);
  }

  /**
   * Handles the TabClose event received from the listeners.
   *
   * @param {object} event
   *   The event object provided by the listener.
   */
  handleEvent(event) {
    if (event.type != "TabClose") {
      // Use the module's prefixed console so the message is attributable.
      lazy.logConsole.error(`Received unexpected event type ${event.type}`);
      return;
    }

    this._browserNewtabSessionMap.delete(event.target.linkedBrowser);
    this.stopTrackingBrowser(event.target.linkedBrowser);
  }

  /**
   * Test-only function, used to override the provider information, so that
   * unit tests can set it to easy to test values.
   *
   * @param {Array} providerInfo
   *   See {@link https://searchfox.org/mozilla-central/search?q=search-telemetry-schema.json}
   *   for type information. A falsy value restores the provider information
   *   that was obtained during init().
   */
  overrideSearchTelemetryForTests(providerInfo) {
    let info = providerInfo ? providerInfo : this._originalProviderInfo;
    this._contentHandler.overrideSearchTelemetryForTests(info);
    this._setSearchProviderInfo(info);
  }

  /**
   * Used to set the local version of the search provider information.
   * This automatically maps the regexps to RegExp objects so that
   * we don't have to create a new instance each time.
   *
   * @param {Array} providerInfo
   *   A raw array of provider information to set.
   */
  _setSearchProviderInfo(providerInfo) {
    this._searchProviderInfo = providerInfo.map(provider => {
      let newProvider = {
        ...provider,
        searchPageRegexp: new RegExp(provider.searchPageRegexp),
      };
      if (provider.extraAdServersRegexps) {
        newProvider.extraAdServersRegexps = provider.extraAdServersRegexps.map(
          r => new RegExp(r)
        );
      }
      return newProvider;
    });
    // The content handler shares the converted list for ad click detection.
    this._contentHandler._searchProviderInfo = this._searchProviderInfo;
  }

  /**
   * Reports that a page has adverts by forwarding to the ContentHandler.
   *
   * @param {object} info
   *   The page information, see ContentHandler._reportPageWithAds.
   * @param {object} browser
   *   The browser associated with the page.
   */
  reportPageWithAds(info, browser) {
    this._contentHandler._reportPageWithAds(info, browser);
  }

  /**
   * This may start tracking a tab based on the URL. If the URL matches a search
   * partner, and it has a code, then we'll start tracking it. This will aid
   * determining if it is a page we should be tracking for adverts.
   *
   * @param {object} browser
   *   The browser associated with the page.
   * @param {string} url
   *   The url that was loaded in the browser.
   * @param {nsIDocShell.LoadCommand} loadType
   *   The load type associated with the page load.
   */
  updateTrackingStatus(browser, url, loadType) {
    if (
      !lazy.BrowserSearchTelemetry.shouldRecordSearchCount(
        browser.getTabBrowser()
      )
    ) {
      return;
    }
    let info = this._checkURLForSerpMatch(url);
    if (!info) {
      this._browserNewtabSessionMap.delete(browser);
      this.stopTrackingBrowser(browser);
      return;
    }

    let source = "unknown";
    if (loadType & Ci.nsIDocShell.LOAD_CMD_RELOAD) {
      source = "reload";
    } else if (loadType & Ci.nsIDocShell.LOAD_CMD_HISTORY) {
      source = "tabhistory";
    } else if (this._browserSourceMap.has(browser)) {
      source = this._browserSourceMap.get(browser);
      this._browserSourceMap.delete(browser);
    }

    let newtabSessionId;
    if (this._browserNewtabSessionMap.has(browser)) {
      newtabSessionId = this._browserNewtabSessionMap.get(browser);
      // We leave the newtabSessionId in the map for this browser
      // until we stop loading SERP pages or the tab is closed.
    }

    this._reportSerpPage(info, source, url);

    let item = this._browserInfoByURL.get(url);

    if (item) {
      // Only count a browser once per URL. stopTrackingBrowser() decrements
      // the count once per browser, so counting the same browser twice would
      // leave the entry in the map forever.
      if (!item.browsers.has(browser)) {
        item.count++;
      }
      item.browsers.set(browser, "no ads reported");
      item.source = source;
      item.newtabSessionId = newtabSessionId;
    } else {
      this._browserInfoByURL.set(url, {
        browsers: new WeakMap().set(browser, "no ads reported"),
        info,
        count: 1,
        source,
        newtabSessionId,
      });
    }
  }

  /**
   * Stops tracking of a tab, for example the tab has loaded a different URL.
   *
   * @param {object} browser The browser associated with the tab to stop being
   *                         tracked.
   */
  stopTrackingBrowser(browser) {
    for (let [url, item] of this._browserInfoByURL) {
      if (item.browsers.has(browser)) {
        item.browsers.delete(browser);
        item.count--;
      }

      if (!item.count) {
        this._browserInfoByURL.delete(url);
      }
    }
  }

  /**
   * Parts of the URL, like search params and hashes, may be mutated by scripts
   * on a page we're tracking. Since we don't want to keep track of that
   * ourselves in order to keep the list of browser objects a weak-referenced
   * set, we do optional fuzzy matching of URLs to fetch the most relevant item
   * that contains tracking information.
   *
   * @param {string} url URL to fetch the tracking data for.
   * @returns {object|null|undefined} Returns null if `url` cannot be parsed
   *   and undefined if no tracked URL shares at least the hostname. Otherwise
   *   returns a map containing the following members:
   *   - {WeakMap} browsers
   *     Map of browser elements that belong to `url` and their ad report state.
   *   - {object} info
   *     Info dictionary as returned by `_checkURLForSerpMatch`.
   *   - {number} count
   *     The number of browser element we can most accurately tell we're
   *     tracking, since they're inside a WeakMap.
   */
  _findBrowserItemForURL(url) {
    try {
      url = new URL(url);
    } catch (ex) {
      return null;
    }

    const compareURLs = (url1, url2) => {
      // In case of an exact match, well, that's an obvious winner.
      if (url1.href == url2.href) {
        return Infinity;
      }

      // Each step we get closer to the two URLs being the same, we increase the
      // score. The consumer of this method will use these scores to see which
      // of the URLs is the best match.
      let score = 0;
      if (url1.hostname == url2.hostname) {
        ++score;
        if (url1.pathname == url2.pathname) {
          ++score;
          for (let [key1, value1] of url1.searchParams) {
            // Let's not fuss about the ordering of search params, since the
            // score effect will solve that.
            if (url2.searchParams.has(key1)) {
              ++score;
              if (url2.searchParams.get(key1) == value1) {
                ++score;
              }
            }
          }
          if (url1.hash == url2.hash) {
            ++score;
          }
        }
      }
      return score;
    };

    let item;
    let currentBestMatch = 0;
    for (let [trackingURL, candidateItem] of this._browserInfoByURL) {
      if (currentBestMatch === Infinity) {
        break;
      }
      try {
        // Make sure to cache the parsed URL object, since there's no reason to
        // do it twice.
        trackingURL =
          candidateItem._trackingURL ||
          (candidateItem._trackingURL = new URL(trackingURL));
      } catch (ex) {
        continue;
      }
      let score = compareURLs(url, trackingURL);
      if (score > currentBestMatch) {
        item = candidateItem;
        currentBestMatch = score;
      }
    }

    return item;
  }

  // nsIWindowMediatorListener

  /**
   * This is called when a new window is opened, and handles registration of
   * that window if it is a browser window.
   *
   * @param {nsIAppWindow} appWin The xul window that was opened.
   */
  onOpenWindow(appWin) {
    let win = appWin.docShell.domWindow;
    win.addEventListener(
      "load",
      () => {
        if (
          win.document.documentElement.getAttribute("windowtype") !=
          "navigator:browser"
        ) {
          return;
        }

        this._registerWindow(win);
      },
      { once: true }
    );
  }

  /**
   * Listener that is called when a window is closed, and handles deregistration of
   * that window if it is a browser window.
   *
   * @param {nsIAppWindow} appWin The xul window that was closed.
   */
  onCloseWindow(appWin) {
    let win = appWin.docShell.domWindow;

    if (
      win.document.documentElement.getAttribute("windowtype") !=
      "navigator:browser"
    ) {
      return;
    }

    this._unregisterWindow(win);
  }

  /**
   * Adds the TabClose event listener for the window.
   *
   * @param {object} win The window to register.
   */
  _registerWindow(win) {
    win.gBrowser.tabContainer.addEventListener("TabClose", this);
  }

  /**
   * Stops tracking the browsers of the window's tabs and removes the TabClose
   * event listener for the window.
   *
   * @param {object} win The window to unregister.
   */
  _unregisterWindow(win) {
    for (let tab of win.gBrowser.tabs) {
      // Tracking is keyed by browser, not by tab (see handleEvent).
      this.stopTrackingBrowser(tab.linkedBrowser);
    }

    win.gBrowser.tabContainer.removeEventListener("TabClose", this);
  }

  /**
   * Searches for provider information for a given url.
   *
   * @param {string} url The url to match for a provider.
   * @returns {object|undefined} Returns the first provider information whose
   *   searchPageRegexp matches the url, or undefined if none matches.
   */
  _getProviderInfoForURL(url) {
    return this._searchProviderInfo.find(info =>
      info.searchPageRegexp.test(url)
    );
  }

  /**
   * Checks to see if a url is a search partner location, and determines the
   * provider and codes used.
   *
   * @param {string} url The url to match.
   * @returns {null|object} Returns null if there is no match found. Otherwise,
   *   returns an object of strings for provider, code and type.
   */
  _checkURLForSerpMatch(url) {
    let searchProviderInfo = this._getProviderInfoForURL(url);
    if (!searchProviderInfo) {
      return null;
    }
    // Only the query string is of interest: drop the hash, then the part
    // before the "?".
    let queries = new URLSearchParams(url.split("#")[0].split("?")[1]);
    if (!queries.get(searchProviderInfo.queryParamName)) {
      return null;
    }
    // Default to organic to simplify things.
    // We override type in the sap cases.
    let type = "organic";
    let code;
    if (searchProviderInfo.codeParamName) {
      code = queries.get(searchProviderInfo.codeParamName);
      if (code) {
        // The code is only included if it matches one of the specific ones.
        if (searchProviderInfo.taggedCodes.includes(code)) {
          type = "tagged";
          if (
            searchProviderInfo.followOnParamNames &&
            searchProviderInfo.followOnParamNames.some(p => queries.has(p))
          ) {
            type += "-follow-on";
          }
        } else if (searchProviderInfo.organicCodes.includes(code)) {
          type = "organic";
        } else if (searchProviderInfo.expectedOrganicCodes?.includes(code)) {
          code = "none";
        } else {
          code = "other";
        }
      } else if (searchProviderInfo.followOnCookies) {
        // Especially Bing requires lots of extra work related to cookies.
        for (let followOnCookie of searchProviderInfo.followOnCookies) {
          if (followOnCookie.extraCodeParamName) {
            let eCode = queries.get(followOnCookie.extraCodeParamName);
            if (
              !eCode ||
              !followOnCookie.extraCodePrefixes.some(p => eCode.startsWith(p))
            ) {
              continue;
            }
          }

          // If this cookie is present, it's probably an SAP follow-on.
          // This might be an organic follow-on in the same session, but there
          // is no way to tell the difference.
          for (let cookie of Services.cookies.getCookiesFromHost(
            followOnCookie.host,
            {}
          )) {
            if (cookie.name != followOnCookie.name) {
              continue;
            }

            let [cookieParam, cookieValue] = cookie.value
              .split("=")
              .map(p => p.trim());
            if (
              cookieParam == followOnCookie.codeParamName &&
              searchProviderInfo.taggedCodes.includes(cookieValue)
            ) {
              type = "tagged-follow-on";
              code = cookieValue;
              // Note: this only stops scanning the cookies of the current
              // follow-on cookie definition; later definitions are still
              // checked.
              break;
            }
          }
        }
      }
    }
    return { provider: searchProviderInfo.telemetryId, type, code };
  }

  /**
   * Logs telemetry for a search provider visit.
   *
   * @param {object} info The search provider information.
   * @param {string} info.provider The name of the provider.
   * @param {string} info.type The type of search.
   * @param {string} [info.code] The code for the provider.
   * @param {string} source Where the search originated from.
   * @param {string} url The url that was matched (for debug logging only).
   */
  _reportSerpPage(info, source, url) {
    let payload = `${info.provider}:${info.type}:${info.code || "none"}`;
    Services.telemetry.keyedScalarAdd(
      SEARCH_CONTENT_SCALAR_BASE + source,
      payload,
      1
    );

    lazy.logConsole.debug("Counting", payload, "for", url);
  }
}

/**
 * ContentHandler deals with handling telemetry of the content within a tab -
 * when ads detected and when they are selected.
 */
class ContentHandler {
  /**
   * Constructor.
   *
   * @param {object} options
   *   The options for the handler.
   * @param {Map} options.browserInfoByURL
   *   The map of urls from TelemetryHandler.
   * @param {Function} options.findBrowserItemForURL
   *   A function that obtains the tracking item that best matches a url.
   * @param {Function} options.checkURLForSerpMatch
   *   A function that checks whether a url is a search partner location and
   *   returns the provider, type and code, or null.
   */
  constructor(options) {
    this._browserInfoByURL = options.browserInfoByURL;
    this._findBrowserItemForURL = options.findBrowserItemForURL;
    this._checkURLForSerpMatch = options.checkURLForSerpMatch;
  }

  /**
   * Initializes the content handler. This will also set up the shared data that is
   * shared with the SearchTelemetryChild actor.
   *
   * @param {Array} providerInfo
   *  The provider information for the search telemetry to record.
   */
  init(providerInfo) {
    Services.ppmm.sharedData.set("SearchTelemetry:ProviderInfo", providerInfo);

    Services.obs.addObserver(this, "http-on-examine-response");
    Services.obs.addObserver(this, "http-on-stop-request");
  }

  /**
   * Uninitializes the content handler.
   */
  uninit() {
    Services.obs.removeObserver(this, "http-on-examine-response");
    Services.obs.removeObserver(this, "http-on-stop-request");
  }

  /**
   * Test-only function to override the search provider information for use
   * with tests. Passes it to the SearchTelemetryChild actor.
   *
   * @param {object} providerInfo @see SEARCH_PROVIDER_INFO for type information.
   */
  overrideSearchTelemetryForTests(providerInfo) {
    Services.ppmm.sharedData.set("SearchTelemetry:ProviderInfo", providerInfo);
  }

  /**
   * Reports bandwidth used by the given channel if it is used by search requests.
   *
   * @param {object} aChannel The channel that generated the activity.
   */
  _reportChannelBandwidth(aChannel) {
    if (!(aChannel instanceof Ci.nsIChannel)) {
      return;
    }
    let wrappedChannel = ChannelWrapper.get(aChannel);

    // Determines the URL of the top-level page the request belongs to.
    let getTopURL = channel => {
      // top-level document
      if (
        channel.loadInfo &&
        channel.loadInfo.externalContentPolicyType ==
          Ci.nsIContentPolicy.TYPE_DOCUMENT
      ) {
        return channel.finalURL;
      }

      // iframe
      let frameAncestors;
      try {
        frameAncestors = channel.frameAncestors;
      } catch (e) {
        frameAncestors = null;
      }
      if (frameAncestors) {
        let ancestor = frameAncestors.find(obj => obj.frameId == 0);
        if (ancestor) {
          return ancestor.url;
        }
      }

      // top-level resource
      if (channel.loadInfo && channel.loadInfo.loadingPrincipal) {
        return channel.loadInfo.loadingPrincipal.spec;
      }

      return null;
    };

    let topUrl = getTopURL(wrappedChannel);
    if (!topUrl) {
      return;
    }

    let info = this._checkURLForSerpMatch(topUrl);
    if (!info) {
      return;
    }

    let bytesTransferred =
      wrappedChannel.requestSize + wrappedChannel.responseSize;
    let { provider } = info;

    let isPrivate =
      wrappedChannel.loadInfo &&
      wrappedChannel.loadInfo.originAttributes.privateBrowsingId > 0;
    if (isPrivate) {
      provider += `-${SEARCH_TELEMETRY_PRIVATE_BROWSING_KEY_SUFFIX}`;
    }

    Services.telemetry.keyedScalarAdd(
      SEARCH_DATA_TRANSFERRED_SCALAR,
      provider,
      bytesTransferred
    );
  }

  /**
   * Observer entry point for the topics registered in init().
   *
   * @param {object} aSubject The subject of the notification, passed on as
   *   the channel.
   * @param {string} aTopic The notification topic.
   * @param {string} aData Unused.
   */
  observe(aSubject, aTopic, aData) {
    switch (aTopic) {
      case "http-on-stop-request":
        this._reportChannelBandwidth(aSubject);
        break;
      case "http-on-examine-response":
        this.observeActivity(aSubject);
        break;
    }
  }

  /**
   * Listener that observes network activity, so that we can determine if a link
   * from a search provider page was followed, and if then if that link was an
   * ad click or not.
   *
   * @param {nsIChannel} channel The channel that generated the activity.
   */
  observeActivity(channel) {
    if (!(channel instanceof Ci.nsIChannel)) {
      return;
    }

    let wrappedChannel = ChannelWrapper.get(channel);
    if (wrappedChannel._adClickRecorded) {
      lazy.logConsole.debug("Ad click already recorded");
      return;
    }

    Services.tm.dispatchToMainThread(() => {
      // We suspect that No Content (204) responses are used to transfer or
      // update beacons. They used to lead to double-counting ad-clicks, so let's
      // ignore them.
      if (wrappedChannel.statusCode == 204) {
        lazy.logConsole.debug("Ignoring activity from ambiguous responses");
        return;
      }

      // The wrapper is consistent across redirects, so we can use it to track state.
      let originURL = wrappedChannel.originURI && wrappedChannel.originURI.spec;
      if (!originURL) {
        return;
      }
      let item = this._findBrowserItemForURL(originURL);
      if (!item) {
        return;
      }

      let finalURL = wrappedChannel.finalURL;

      let telemetryId = item.info.provider;
      // The provider list may have been replaced since the page was tracked,
      // so the lookup is allowed to come back empty.
      let info = this._searchProviderInfo?.find(
        provider => provider.telemetryId == telemetryId
      );

      if (!info?.extraAdServersRegexps?.some(regex => regex.test(finalURL))) {
        return;
      }

      try {
        lazy.logConsole.debug(
          "Counting ad click in page for",
          info.telemetryId,
          item.source,
          originURL,
          finalURL
        );
        Services.telemetry.keyedScalarAdd(
          SEARCH_AD_CLICKS_SCALAR_BASE + item.source,
          `${info.telemetryId}:${item.info.type}`,
          1
        );
        wrappedChannel._adClickRecorded = true;
        if (item.newtabSessionId) {
          Glean.newtabSearchAd.click.record({
            newtab_visit_id: item.newtabSessionId,
            search_access_point: item.source,
            is_follow_on: item.info.type.endsWith("follow-on"),
            is_tagged: item.info.type.startsWith("tagged"),
            telemetry_id: item.info.provider,
          });
        }
      } catch (e) {
        lazy.logConsole.error(e);
      }
    });
  }

  /**
   * Logs telemetry for a page with adverts, if it is one of the partner search
   * provider pages that we're tracking.
   *
   * @param {object} info
   *     The search provider information for the page.
   * @param {boolean} info.hasAds
   *     Whether or not the page has adverts.
   * @param {string} info.url
   *     The url of the page.
   * @param {object} browser
   *     The browser associated with the page.
   */
  _reportPageWithAds(info, browser) {
    let item = this._findBrowserItemForURL(info.url);
    if (!item) {
      lazy.logConsole.warn(
        "Expected to report URI for",
        info.url,
        "with ads but couldn't find the information"
      );
      return;
    }

    // Each browser reports ads at most once per tracked page.
    let adReportState = item.browsers.get(browser);
    if (adReportState == "ad reported") {
      lazy.logConsole.debug(
        "Ad was previously reported for browser with URI",
        info.url
      );
      return;
    }

    lazy.logConsole.debug(
      "Counting ads in page for",
      item.info.provider,
      item.info.type,
      item.source,
      info.url
    );
    Services.telemetry.keyedScalarAdd(
      SEARCH_WITH_ADS_SCALAR_BASE + item.source,
      `${item.info.provider}:${item.info.type}`,
      1
    );

    item.browsers.set(browser, "ad reported");

    if (item.newtabSessionId) {
      Glean.newtabSearchAd.impression.record({
        newtab_visit_id: item.newtabSessionId,
        search_access_point: item.source,
        is_follow_on: item.info.type.endsWith("follow-on"),
        is_tagged: item.info.type.startsWith("tagged"),
        telemetry_id: item.info.provider,
      });
    }
  }
}

export var SearchSERPTelemetry = new TelemetryHandler();