// Copyright 2021 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef ASH_PUBLIC_CPP_TAB_CLUSTER_CORRELATION_CLUSTERER_H_
#define ASH_PUBLIC_CPP_TAB_CLUSTER_CORRELATION_CLUSTERER_H_
#include <map>
#include <optional>
#include <set>
#include <vector>
#include "ash/public/cpp/ash_public_export.h"
#include "ash/public/cpp/tab_cluster/undirected_graph.h"
namespace ash {
class UndirectedGraph;
class EdgeSum;
// Adapted from
// https://github.com/google-research/google-research/blob/HEAD/parallel_clustering/clustering/config.proto#L66
// Consider a graph with vertex set V, edge set E, non-negative vertex weights
// k_u, edge weights w_uv, and a "resolution" parameter which must be
// non-negative. We define "rescaled" edge weights w'_uv for all u, v, in V as:
// { 0 if u == v
// w'_{uv} = { w_uv - resolution k_u k_v if {u, v} in E,
// { -resolution k_u k_v otherwise
// The correlation clustering objective is to maximize
// sum_{u, v in the same cluster} w'_uv,
// which is equivalent (up to sign and an additive constant) to the
// "maximizing agreements" and "minimizing disagreements" formulations of
// correlation clustering that are used in the approximation algorithms
// literature. Assuming the total edge weight in the graph is M, modularity
// partitioning can be expressed in this form by:
// * setting resolution = 1/(2*M),
// * setting the weight of each node to the total weight of its incident edges.
// Note that the final correlation clustering objective is monotonic in, but not
// equal to modularity. In particular, if the correlation clustering objective
// is C, we have:
// modularity = (C - resolution * sum_v (deg_v)^2 / (4 * M)) / M.
//
// To optimize this objective we use local search. We start with each vertex in
// its own cluster. We consider moves of the following form: move all vertices
// in a "move set" S of vertices to either one of the existing clusters or to a
// newly created cluster. We currently consider the following options for S:
// - One move set per current cluster with all the vertices currently in it.
// With these move sets we're effectively considering merging two clusters.
// For each move set considered we move that move set to the cluster that
// improves the objective the most if an improving move exists.
// TODO(crbug/1231303): //ash/public mostly contains interfaces, shared
// constants, types and utility classes, which are used to share code between
// //ash and //chrome. It's not common to put all implementation inside
// //ash/public. Find a better place to move this class to.
class ASH_PUBLIC_EXPORT CorrelationClusterer {
public:
struct CorrelationClustererConfig {
double resolution = 1.0;
};
CorrelationClusterer();
CorrelationClusterer(const CorrelationClusterer&) = delete;
CorrelationClusterer& operator=(const CorrelationClusterer&) = delete;
~CorrelationClusterer();
// Returns a list of clusters for a given undirected graph.
std::vector<std::vector<int>> Cluster(
const UndirectedGraph& undirected_graph);
private:
// The main clustering function.
// initial_clustering must include every node in the range
// [0, MutableGraph().NumNodes()) exactly once. If it doesn't this function
// will log an error message and return clusters of singleton.
void RefineClusters(std::vector<std::vector<int>>* clusters_ptr);
// Fills `clustering_` with a map of node (index) to cluster id (value)
// from the given set of `clusters`, returns false with non-empty
// error string when a problem occurred.
bool SetClustering(const std::vector<std::vector<int>>& clusters,
std::string* error);
// Moves node from its current cluster to a new cluster.
void MoveNodeToCluster(const int node, const int new_cluster);
void MoveNodesToCluster(const std::set<int>& nodes,
std::optional<int> new_cluster);
// Returns a pair of:
// * The best cluster to move `moving_nodes` to according to the correlation
// clustering objective function. Null optional means create a new cluster.
// * The change in objective function achieved by that move. May be positive
// or negative.
// The runtime is linear in the number of edges incident to `moving_nodes`.
std::pair<std::optional<int>, double> BestMove(
const std::set<int>& moving_nodes);
// Computes the best move given certain pre-computed sums of edge weights of
// the following classes of vertices in relation to a fixed set of
// `moving_nodes` that may change clusters:
// * Class 0: Neither node is moving.
// * Class 1: Exactly one node is moving.
// * Class 2: Both nodes are moving.
// where "moving" means in moving_nodes.
//
// Change in objective if we move all `moving nodes` to cluster i:
// class_2_currently_separate + class_1_together_after[i] -
// class_1_currently_together
// where
// class_2_currently_separate = Weight of edges in class 2 where the
// endpoints are in different clusters currently
// class_1_together_after[i] = Weight of edges in class 1 where the
// non-moving node is in cluster i
// class_1_currently_together = Weight of edges in class 1 where the
// endpoints are in the same cluster currently
//
// Two complications:
// * We need to avoid double-counting pairs in class 2
// * We need to account for missing edges, which have weight
// -resolution. To do so we subtract the number of edges we see in each
// category from the max possible number of edges (i.e. the number of
// edges we'd have if the graph was complete).
std::pair<std::optional<int>, double> BestMoveFromStats(
double moving_nodes_weight,
std::map<int, double>& cluster_moving_weights,
const EdgeSum& class_2_currently_separate,
const EdgeSum& class_1_currently_together,
const std::map<int, EdgeSum>& class_1_together_after);
// Increments `next_cluster_id_`.
int NewClusterId();
// Gets the cluster of a given `node` from `clustering_`.
int ClusterForNode(int node) const;
// Gets cluster weight of a `cluster_id` from `cluster_weights_` map.
double GetClusterWeight(int cluster_id) const;
// Resets data members before performing clustering.
void Reset();
UndirectedGraph graph_;
CorrelationClustererConfig config_;
// Maps node to its cluster.
std::vector<int> clustering_;
// Number of nodes in each cluster.
// Currently this is used only to detect when a cluster is empty so its
// entry can be removed from cluster_sizes_ and cluster_weights_.
std::map<int, int32_t> cluster_sizes_;
// Sum of the weights of the nodes in each cluster.
std::map<int, double> cluster_weights_;
int next_cluster_id_ = 0;
int num_nodes_ = 0;
};
} // namespace ash
#endif // ASH_PUBLIC_CPP_TAB_CLUSTER_CORRELATION_CLUSTERER_H_