chromium/chrome/android/java/src/org/chromium/chrome/browser/suggestions/UrlSimilarityScorer.java

// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

package org.chromium.chrome.browser.suggestions;

import android.annotation.SuppressLint;
import android.text.TextUtils;

import androidx.annotation.IntDef;
import androidx.annotation.Nullable;

import org.chromium.base.metrics.RecordHistogram;
import org.chromium.chrome.browser.tabmodel.TabList;
import org.chromium.components.embedder_support.util.UrlConstants;
import org.chromium.url.GURL;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * Heuristically scores the similarity between a "key" URL with one or more "candidate" URLs. The
 * main use is to find the best match among "candidate" URLs from a TabList.
 */
public class UrlSimilarityScorer {

    /** Return value for findTabWithMostSimilarUrl(). */
    static class MatchResult {
        public final int index;
        public final int score;

        public MatchResult(int index, int score) {
            this.index = index;
            this.score = score;
        }
    }

    // These values are persisted to logs. Entries should not be renumbered and numeric values
    // should never be reused.
    // Information on the tile clicked by the user. The values must be consistent with
    // MvtReselectUrlMatchResult in enums.xml.
    @IntDef({
        MvtReselectUrlMatchResult.NONE,
        MvtReselectUrlMatchResult.EXACT,
        MvtReselectUrlMatchResult.PARTIAL,
        MvtReselectUrlMatchResult.NUM_ENTRIES
    })
    @interface MvtReselectUrlMatchResult {
        int NONE = 0;
        int EXACT = 1;
        int PARTIAL = 2;
        int NUM_ENTRIES = 3;
    }

    private static final String HISTOGRAM_MATCH_PREFIX = "NewTabPage.MostVisited.ReselectMatch.";
    private static final String HISTOGRAM_ARM_LAX_UP_TO_PATH = "LaxUpToPath";
    private static final String HISTOGRAM_ARM_LAX_UP_TO_QUERY = "LaxUpToQuery";
    private static final String HISTOGRAM_ARM_LAX_UP_TO_REF = "LaxUpToRef";
    private static final String HISTOGRAM_ARM_STRICT = "Strict";

    // Value for candidate URL rejection, set to negative since match scores are non-negative.
    public static final int MISMATCHED = -1;
    // Exact matches beat all, and so is assigned maximal value.
    public static final int EXACT = Integer.MAX_VALUE;

    public static final int PATH_MATCH_MAX_SCORE = 99;
    public static final int SCORE_PATH_MATCH_MULTIPLIER = 10;
    public static final int SCORE_QUERY_MATCH = 2;
    public static final int SCORE_REF_MATCH = 1;

    // These are the same ones used in VisitSegmentDatabase::ComputeSegmentName().
    private static Set<String> sDiscardableHostPrefixes =
            new HashSet<>(Arrays.asList("www", "m", "mobile", "touch"));

    private final GURL mKeyUrl;
    // Match parameters, ordered so intended usage would specify "true" before "false".
    private final boolean mLaxSchemeHost;
    private final boolean mLaxRef;
    private final boolean mLaxQuery;
    private final boolean mLaxPath;

    // Cached parts of {@link mKeyUrl}, following standard order within a URL.
    private final Set<String> mCompatibleSchemes;
    private final String mProcessedKeyHost;
    private final String mKeyPort;
    private final String mProcessedKeyPath;
    private final String mKeyQuery;
    private final String mKeyRef;

    /**
     * @param keyUrl URL to be compare against.
     * @param laxSchemeHost Whether scheme differences (identical or http -> https) and / or host
     *     differences (canonicalized) are allowed.
     * @param laxRef Whether #ref difference are allowed.
     * @param laxQuery Whether ?query difference are allowed.
     * @param laxPath Whether /path difference (limited to drill-down matches) are allowed.
     */
    public UrlSimilarityScorer(
            GURL keyUrl, boolean laxSchemeHost, boolean laxRef, boolean laxQuery, boolean laxPath) {
        mKeyUrl = keyUrl;
        mLaxSchemeHost = laxSchemeHost;
        mLaxRef = laxRef;
        mLaxQuery = laxQuery;
        mLaxPath = laxPath;

        mCompatibleSchemes = new HashSet<String>();
        String keyScheme = mKeyUrl.getScheme();
        mCompatibleSchemes.add(keyScheme);
        if (laxSchemeHost && keyScheme.contentEquals(UrlConstants.HTTP_SCHEME)) {
            // Special case: Allow key scheme "http" to match candidate scheme "https", i.e.,
            // enter security boundary.
            mCompatibleSchemes.add(UrlConstants.HTTPS_SCHEME);
        }

        mProcessedKeyHost =
                mLaxSchemeHost ? canonicalizeHost(mKeyUrl.getHost()) : mKeyUrl.getHost();
        mKeyPort = mKeyUrl.getPort();
        // If lax /path match, ensure /path used begin and end with "/" to erase the distinction
        // between directory and files, and simplify comparison.
        mProcessedKeyPath = mLaxPath ? ensureSlashSentinel(mKeyUrl.getPath()) : mKeyUrl.getPath();
        mKeyQuery = mKeyUrl.getQuery();
        mKeyRef = mKeyUrl.getRef();
    }

    /**
     * Removes frequently seen parts of a URL host string, e.g., "www." to eliminate superficial
     * differences during matching.
     */
    @SuppressLint("DefaultLocale")
    static String canonicalizeHost(String host) {
        String buf = host.toLowerCase();
        int head = 0;
        while (true) {
            int dotPos = buf.indexOf(".", head);
            if (dotPos < 0) {
                break;
            }
            String prefix = buf.substring(head, dotPos);
            if (!sDiscardableHostPrefixes.contains(prefix)) {
                break;
            }
            head = dotPos + 1;
        }
        return buf.substring(head).toString();
    }

    /** Returns potentially modified {@param path} that starts with and ends with "/". */
    static String ensureSlashSentinel(String path) {
        boolean hasLeading = path.startsWith("/");
        boolean hasTrailing = path.endsWith("/");
        if (hasLeading && hasTrailing) return path;

        StringBuilder builder = new StringBuilder();
        if (!hasLeading) {
            builder.append('/');
        }
        builder.append(path);
        if (!hasTrailing && !path.isEmpty()) {
            builder.append('/');
        }
        return builder.toString();
    }

    /**
     * Determines whether {@param ancestorPath} is a (non-strict) ancestor of {@param path}. If so,
     * returns the latter's relative depth, with 0 being identical. Otherwise returns null. Both
     * paths must begin and end with "/".
     */
    static Integer getPathAncestralDepth(String ancestorPath, String path) {
        assert ancestorPath.startsWith("/") && ancestorPath.endsWith("/");
        assert path.startsWith("/") && path.endsWith("/");
        if (!path.startsWith(ancestorPath)) return null;

        int commonLength = ancestorPath.length();
        if (path.length() == commonLength) return 0;

        int depth = 0;
        int end = path.length();
        for (int i = commonLength; i < end; ++i) {
            if (path.charAt(i) == '/') {
                ++depth;
            }
        }
        return depth;
    }

    /** Computes the similarity score of {@param candidateUrl}. */
    public int scoreSimilarity(GURL candidateUrl) {
        if (candidateUrl.equals(mKeyUrl)) {
            return EXACT;
        }

        // Port difference (e.g., example.com:443 vs. example.com:8000) is MISMATCHED to support
        // common usage. Scheme compatibility is determined by lookup, and is MISMATCHED on failure.
        if (!candidateUrl.getPort().contentEquals(mKeyPort)
                || !mCompatibleSchemes.contains(candidateUrl.getScheme())) {
            return MISMATCHED;
        }

        String host =
                mLaxSchemeHost ? canonicalizeHost(candidateUrl.getHost()) : candidateUrl.getHost();
        if (!host.contentEquals(mProcessedKeyHost)) {
            return MISMATCHED;
        }

        int score = 0;
        if (candidateUrl.getQuery().contentEquals(mKeyQuery)) {
            score += SCORE_QUERY_MATCH;
        } else if (!mLaxQuery) {
            return MISMATCHED;
        }

        if (candidateUrl.getRef().contentEquals(mKeyRef)) {
            score += SCORE_REF_MATCH;
        } else if (!mLaxRef) {
            return MISMATCHED;
        }

        int pathScore = PATH_MATCH_MAX_SCORE;
        if (mLaxPath) {
            String modifiedCandidatePath = ensureSlashSentinel(candidateUrl.getPath());
            Integer depth = getPathAncestralDepth(mProcessedKeyPath, modifiedCandidatePath);
            // Drill-down match: Require "key" /path to be equal to or an ancestor of "candidate"
            // /path. Decrease /path score by 1 for each level of depth, until 1 is reached.
            if (depth == null) {
                return MISMATCHED;
            }
            pathScore = Math.max(1, pathScore - depth.intValue());
        } else if (!mProcessedKeyPath.contentEquals(candidateUrl.getPath())) {
            return MISMATCHED;
        }
        score += pathScore * SCORE_PATH_MATCH_MULTIPLIER;

        return score;
    }

    /**
     * Finds the tab in {@param tabList} whose URL attains the highest similarity score, taking the
     * first if a tie exists, and returns the result.
     */
    public MatchResult findTabWithMostSimilarUrl(TabList tabList) {
        int bestIndex = TabList.INVALID_TAB_INDEX;
        int bestScore = MISMATCHED;
        int count = tabList.getCount();
        for (int i = 0; i < count; ++i) {
            int score = scoreSimilarity(tabList.getTabAt(i).getUrl());
            if (score != MISMATCHED && bestScore < score) {
                bestScore = score;
                bestIndex = i;
                // Early-exit on finding identical match.
                if (bestScore == EXACT) break;
            }
        }
        return new MatchResult(bestIndex, bestScore);
    }

    /**
     * Returns the suffix string to compute histogram name, based on lax match configurations.
     * Returns null if the configuration is unrecognized for logging.
     */
    public @Nullable String getHistogramStrictnessSuffix() {
        if (!mLaxSchemeHost) {
            return !mLaxRef && !mLaxQuery && !mLaxPath ? HISTOGRAM_ARM_STRICT : null;
        }
        if (!mLaxRef) return null;

        if (!mLaxQuery) return mLaxPath ? null : HISTOGRAM_ARM_LAX_UP_TO_REF;

        return mLaxPath ? HISTOGRAM_ARM_LAX_UP_TO_PATH : HISTOGRAM_ARM_LAX_UP_TO_QUERY;
    }

    /**
     * Records histograms for the given {@param matchResult}, which is assumed to be generated by
     * this class instance (whose configs are used to find the right histogram suffix).
     */
    public void recordMatchResult(MatchResult matchResult) {
        String suffix = getHistogramStrictnessSuffix();
        if (TextUtils.isEmpty(suffix)) return;

        RecordHistogram.recordEnumeratedHistogram(
                HISTOGRAM_MATCH_PREFIX + suffix,
                scoreToMvtReselectUrlMatchResult(matchResult.score),
                MvtReselectUrlMatchResult.NUM_ENTRIES);
    }

    int scoreToMvtReselectUrlMatchResult(int score) {
        if (score == MISMATCHED) return MvtReselectUrlMatchResult.NONE;

        if (score == EXACT) return MvtReselectUrlMatchResult.EXACT;

        return MvtReselectUrlMatchResult.PARTIAL;
    }
}