1 files changed, 390 insertions, 0 deletions
diff --git a/browser/components/newtab/lib/UserDomainAffinityProvider.jsm b/browser/components/newtab/lib/UserDomainAffinityProvider.jsm
new file mode 100644
index 0000000000..f165aad1e5
--- /dev/null
+++ b/browser/components/newtab/lib/UserDomainAffinityProvider.jsm
@@ -0,0 +1,390 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+"use strict";
+
+const { Services } = ChromeUtils.import("resource://gre/modules/Services.jsm");
+
+ChromeUtils.defineModuleGetter(
+  this,
+  "PlacesUtils",
+  "resource://gre/modules/PlacesUtils.jsm"
+);
+
+const DEFAULT_TIME_SEGMENTS = [
+  { id: "hour", startTime: 3600, endTime: 0, weightPosition: 1 },
+  { id: "day", startTime: 86400, endTime: 3600, weightPosition: 0.75 },
+  { id: "week", startTime: 604800, endTime: 86400, weightPosition: 0.5 },
+  { id: "weekPlus", startTime: 0, endTime: 604800, weightPosition: 0.25 },
+  { id: "alltime", startTime: 0, endTime: 0, weightPosition: 0.25 },
+];
+
+const DEFAULT_PARAMETER_SETS = {
+  "linear-frequency": {
+    recencyFactor: 0.4,
+    frequencyFactor: 0.5,
+    combinedDomainFactor: 0.5,
+    perfectFrequencyVisits: 10,
+    perfectCombinedDomainScore: 2,
+    multiDomainBoost: 0.1,
+    itemScoreFactor: 0,
+  },
+};
+
+const DEFAULT_MAX_HISTORY_QUERY_RESULTS = 1000;
+
+function merge(...args) {
+  return Object.assign.apply(this, args);
+}
+
+/**
+ * Provides functionality to personalize content recommendations by calculating
+ * user domain affinity scores. These scores are used to calculate relevance
+ * scores for items/recs/stories that have domain affinities.
+ *
+ * The algorithm works as follows:
+ *
+ * - The recommendation endpoint returns a settings object containing
+ * timeSegments and parametersets.
+ *
+ * - For every time segment we calculate the corresponding domain visit counts,
+ * yielding result objects of the following structure: {"mozilla.org": 12,
+ * "mozilla.com": 34} (see UserDomainAffinityProvider#queryVisits)
+ *
+ * - These visit counts are transformed to domain affinity scores for all
+ * provided parameter sets: {"mozilla.org": {"paramSet1": 0.8,
+ * "paramSet2": 0.9}, "mozilla.org": {"paramSet1": 1, "paramSet2": 0.9}}
+ * (see UserDomainAffinityProvider#calculateScoresForParameterSets)
+ *
+ * - The parameter sets provide factors for weighting which allows for
+ * flexible targeting. The functionality to calculate final scores can
+ * be seen in UserDomainAffinityProvider#calculateScores
+ *
+ * - The user domain affinity scores are summed up across all time segments
+ * see UserDomainAffinityProvider#calculateAllUserDomainAffinityScores
+ *
+ * - An item's domain affinities are matched to the user's domain affinity
+ * scores by calculating an item relevance score
+ * (see UserDomainAffinityProvider#calculateItemRelevanceScore)
+ *
+ * - The item relevance scores are used to sort items (see TopStoriesFeed for
+ * more details)
+ *
+ * - The data structure was chosen to allow for fast cache lookups during
+ * relevance score calculation. While user domain affinities are calculated
+ * infrequently (i.e. only once a day), the item relevance score (potentially)
+ * needs to be calculated every time the feed updates. Therefore allowing cache
+ * lookups of scores[domain][parameterSet] is beneficial
+ */
+this.UserDomainAffinityProvider = class UserDomainAffinityProvider {
+  constructor(
+    timeSegments = DEFAULT_TIME_SEGMENTS,
+    parameterSets = DEFAULT_PARAMETER_SETS,
+    maxHistoryQueryResults = DEFAULT_MAX_HISTORY_QUERY_RESULTS,
+    version,
+    scores
+  ) {
+    this.timeSegments = timeSegments;
+    this.maxHistoryQueryResults = maxHistoryQueryResults;
+    this.version = version;
+    if (scores) {
+      this.parameterSets = parameterSets;
+      this.scores = scores;
+    } else {
+      this.parameterSets = this.prepareParameterSets(parameterSets);
+      this.scores = this.calculateAllUserDomainAffinityScores();
+    }
+  }
+
+  /**
+   * Adds dynamic parameters to the given parameter sets that need to be
+   * computed based on time segments.
+   *
+   * @param ps The parameter sets
+   * @return Updated parameter sets with additional fields (i.e. timeSegmentWeights)
+   */
+  prepareParameterSets(ps) {
+    return (
+      Object.keys(ps)
+        // Add timeSegmentWeight fields to param sets e.g. timeSegmentWeights: {"hour": 1, "day": 0.8915, ...}
+        .map(k => ({
+          [k]: merge(ps[k], {
+            timeSegmentWeights: this.calculateTimeSegmentWeights(
+              ps[k].recencyFactor
+            ),
+          }),
+        }))
+        .reduce((acc, cur) => merge(acc, cur))
+    );
+  }
+
+  /**
+   * Calculates a time segment weight based on the provided recencyFactor.
+   *
+   * @param recencyFactor The recency factor indicating how to weigh recency
+   * @return An object containing time segment weights: {"hour": 0.987, "day": 1}
+   */
+  calculateTimeSegmentWeights(recencyFactor) {
+    return this.timeSegments.reduce(
+      (acc, cur) =>
+        merge(acc, {
+          [cur.id]: this.calculateScore(cur.weightPosition, 1, recencyFactor),
+        }),
+      {}
+    );
+  }
+
+  /**
+   * Calculates user domain affinity scores based on browsing history and the
+   * available times segments and parameter sets.
+   */
+  calculateAllUserDomainAffinityScores() {
+    return (
+      this.timeSegments
+        // Calculate parameter set specific domain scores for each time segment
+        // => [{"a.com": {"ps1": 12, "ps2": 34}, "b.com": {"ps1": 56, "ps2": 78}}, ...]
+        .map(ts => this.calculateUserDomainAffinityScores(ts))
+        // Keep format, but reduce to single object, with combined scores across all time segments
+        // => "{a.com":{"ps1":2,"ps2":2}, "b.com":{"ps1":3,"ps2":3}}""
+        .reduce((acc, cur) => this._combineScores(acc, cur))
+    );
+  }
+
+  /**
+   * Calculates the user domain affinity scores for the given time segment.
+   *
+   * @param ts The time segment
+   * @return The parameter specific scores for all domains with visits in
+   * this time segment: {"a.com": {"ps1": 12, "ps2": 34}, "b.com" ...}
+   */
+  calculateUserDomainAffinityScores(ts) {
+    // Returns domains and visit counts for this time segment: {"a.com": 1, "b.com": 2}
+    let visits = this.queryVisits(ts);
+
+    return Object.keys(visits).reduce(
+      (acc, d) =>
+        merge(acc, {
+          [d]: this.calculateScoresForParameterSets(ts, visits[d]),
+        }),
+      {}
+    );
+  }
+
+  /**
+   * Calculates the scores for all parameter sets for the given time segment
+   * and domain visit count.
+   *
+   * @param ts The time segment
+   * @param vc The domain visit count in the given time segment
+   * @return The parameter specific scores for the visit count in
+   * this time segment: {"ps1": 12, "ps2": 34}
+   */
+  calculateScoresForParameterSets(ts, vc) {
+    return Object.keys(this.parameterSets).reduce(
+      (acc, ps) =>
+        merge(acc, {
+          [ps]: this.calculateScoreForParameterSet(
+            ts,
+            vc,
+            this.parameterSets[ps]
+          ),
+        }),
+      {}
+    );
+  }
+
+  /**
+   * Calculates the final affinity score in the given time segment for the given parameter set
+   *
+   * @param timeSegment The time segment
+   * @param visitCount The domain visit count in the given time segment
+   * @param parameterSet The parameter set to use for scoring
+   * @return The final score
+   */
+  calculateScoreForParameterSet(timeSegment, visitCount, parameterSet) {
+    return this.calculateScore(
+      visitCount * parameterSet.timeSegmentWeights[timeSegment.id],
+      parameterSet.perfectFrequencyVisits,
+      parameterSet.frequencyFactor
+    );
+  }
+
+  /**
+   * Keeps the same format, but reduces the two objects to a single object, with
+   * combined scores across all time segments  => {a.com":{"ps1":2,"ps2":2},
+   * "b.com":{"ps1":3,"ps2":3}}
+   */
+  _combineScores(a, b) {
+    // Merge both score objects so we get a combined object holding all domains.
+    // This is so we can combine them without missing domains that are in a and not in b and vice versa.
+    const c = merge({}, a, b);
+    return Object.keys(c).reduce(
+      (acc, d) => merge(acc, this._combine(a, b, c, d)),
+      {}
+    );
+  }
+
+  _combine(a, b, c, d) {
+    return (
+      Object.keys(c[d])
+        // Summing up the parameter set specific scores of each domain
+        .map(ps => ({
+          [d]: {
+            [ps]: Math.min(
+              1,
+              ((a[d] && a[d][ps]) || 0) + ((b[d] && b[d][ps]) || 0)
+            ),
+          },
+        }))
+        // Reducing from an array of objects with a single parameter set to a single object
+        // [{"a.com":{"ps1":11}}, {"a.com: {"ps2":12}}] => {"a.com":{"ps1":11,"ps2":12}}
+        .reduce((acc, cur) => ({ [d]: merge(acc[d], cur[d]) }))
+    );
+  }
+
+  /**
+   * Calculates a value on the curve described by the provided parameters. The curve we're using is
+   * (a^(b*x) - 1) / (a^b - 1): https://www.desmos.com/calculator/maqhpttupp
+   *
+   * @param {number} score A value between 0 and maxScore, representing x.
+   * @param {number} maxScore Highest possible score.
+   * @param {number} factor The slope describing the curve to get to maxScore. A low slope value
+   * [0, 0.5] results in a log-shaped curve, a high slope [0.5, 1] results in a exp-shaped curve,
+   * a slope of exactly 0.5 is linear.
+   * @param {number} ease Adjusts how much bend is in the curve i.e. how dramatic the maximum
+   * effect of the slope can be. This represents b in the formula above.
+   * @return {number} the final score
+   */
+  calculateScore(score, maxScore, factor, ease = 2) {
+    let a = 0;
+    let x = Math.max(0, score / maxScore);
+
+    if (x >= 1) {
+      return 1;
+    }
+
+    if (factor === 0.5) {
+      return x;
+    }
+
+    if (factor < 0.5) {
+      // We want a log-shaped curve so we scale "a" between 0 and .99
+      a = (factor / 0.5) * 0.49;
+    } else if (factor > 0.5) {
+      // We want an exp-shaped curve so we scale "a" between 1.01 and 10
+      a = 1 + ((factor - 0.5) / 0.5) * 9;
+    }
+
+    return (Math.pow(a, ease * x) - 1) / (Math.pow(a, ease) - 1);
+  }
+
+  /**
+   * Queries the visit counts in the given time segment.
+   *
+   * @param ts the time segment
+   * @return the visit count object: {"a.com": 1, "b.com": 2}
+   */
+  queryVisits(ts) {
+    const visitCounts = {};
+    const query = PlacesUtils.history.getNewQuery();
+    if (!query) {
+      return visitCounts;
+    }
+    const wwwRegEx = /^www\./;
+
+    query.beginTimeReference = query.TIME_RELATIVE_NOW;
+    query.beginTime =
+      ts.startTime && ts.startTime !== 0
+        ? -(ts.startTime * 1000 * 1000)
+        : -(Date.now() * 1000);
+
+    query.endTimeReference = query.TIME_RELATIVE_NOW;
+    query.endTime =
+      ts.endTime && ts.endTime !== 0 ? -(ts.endTime * 1000 * 1000) : 0;
+
+    const options = PlacesUtils.history.getNewQueryOptions();
+    options.sortingMode = options.SORT_BY_VISITCOUNT_DESCENDING;
+    options.maxResults = this.maxHistoryQueryResults;
+
+    const { root } = PlacesUtils.history.executeQuery(query, options);
+    root.containerOpen = true;
+    for (let i = 0; i < root.childCount; i++) {
+      let node = root.getChild(i);
+      let host = Services.io.newURI(node.uri).host.replace(wwwRegEx, "");
+      if (!visitCounts[host]) {
+        visitCounts[host] = 0;
+      }
+      visitCounts[host] += node.accessCount;
+    }
+    root.containerOpen = false;
+    return visitCounts;
+  }
+
+  /**
+   * Calculates an item's relevance score.
+   *
+   * @param item the item (story), must contain domain affinities, otherwise a
+   * score of 1 is returned.
+   * @return the calculated item's score or 1 if item has no domain_affinities
+   * or references an unknown parameter set.
+   */
+  calculateItemRelevanceScore(item) {
+    const params = this.parameterSets[item.parameter_set];
+    if (!item.domain_affinities || !params) {
+      return item.item_score;
+    }
+
+    const scores = Object.keys(item.domain_affinities).reduce(
+      (acc, d) => {
+        let userDomainAffinityScore = this.scores[d]
+          ? this.scores[d][item.parameter_set]
+          : false;
+        if (userDomainAffinityScore) {
+          acc.combinedDomainScore +=
+            userDomainAffinityScore * item.domain_affinities[d];
+          acc.matchingDomainsCount++;
+        }
+        return acc;
+      },
+      { combinedDomainScore: 0, matchingDomainsCount: 0 }
+    );
+
+    // Boost the score as configured in the provided parameter set
+    const boostedCombinedDomainScore =
+      scores.combinedDomainScore *
+      Math.pow(params.multiDomainBoost + 1, scores.matchingDomainsCount);
+
+    // Calculate what the score would be if the item score is ignored
+    const normalizedCombinedDomainScore = this.calculateScore(
+      boostedCombinedDomainScore,
+      params.perfectCombinedDomainScore,
+      params.combinedDomainFactor
+    );
+
+    // Calculate the final relevance score using the itemScoreFactor. The itemScoreFactor
+    // allows weighting the item score in relation to the normalizedCombinedDomainScore:
+    // An itemScoreFactor of 1 results in the item score and ignores the combined domain score
+    // An itemScoreFactor of 0.5 results in the the average of item score and combined domain score
+    // An itemScoreFactor of 0 results in the combined domain score and ignores the item score
+    return (
+      params.itemScoreFactor *
+        (item.item_score - normalizedCombinedDomainScore) +
+      normalizedCombinedDomainScore
+    );
+  }
+
+  /**
+   * Returns an object holding the settings and affinity scores of this provider instance.
+   */
+  getAffinities() {
+    return {
+      timeSegments: this.timeSegments,
+      parameterSets: this.parameterSets,
+      maxHistoryQueryResults: this.maxHistoryQueryResults,
+      version: this.version,
+      scores: this.scores,
+    };
+  }
+};
+
+const EXPORTED_SYMBOLS = ["UserDomainAffinityProvider"];