chromium/third_party/blink/renderer/build/scripts/cluster.py

import math
import random as rand


def l2_pairwise_distance(v1, v2):
    """Euclidean distance from each point in v1 to each point in v2

    Args:
        v1: list of point 1
        v2: list of point 2

    Returns:
        distance matrix between each point in v1 and v2
    """
    nrow = len(v1)
    ncol = len(v2)
    dist_mat = [[0 for _ in range(ncol)] for _ in range(nrow)]
    for i in range(nrow):
        for j in range(ncol):
            dist_mat[i][j] = math.sqrt((v1[i] - v2[j])**2)
    return dist_mat


def calculate_error(k_means_matrix):
    """Calculate the sum of distance from each point to its nearest cluster center

    Args:
        k_means_matrix: distance matrix of point to cluster center

    Returns:
        Sum of distance from each point to its nearest cluster center
    """
    return sum([min(dist) for dist in k_means_matrix])


def k_means(x_input, n_cluster=3, n_iter=100, n_tries=10):
    """Perform 1-D k-means clustering on a list of numbers x_input

    Args:
        x_input: list of numbers
        n_cluster: number of clusters
        n_iter: number of iterations

    Returns:
        centers: list of n_cluster elements containing the cluster centers
        min_dist_idx: list of len(x_input) elements containing the nearest cluster center's id
        error_value: sum of all distance from each point to its nearest cluster center
    """
    results = []
    for _ in range(n_tries):
        error_value = 0
        rand.seed(None)
        centers = sorted([rand.uniform(0.0, 100.0) for i in range(n_cluster)])
        min_dist_idx = [0] * len(x_input)
        i = 0
        while i < n_iter:
            failed = False
            dist_mat = l2_pairwise_distance(x_input, centers)
            error_value = calculate_error(dist_mat)
            min_dist_idx = [dist.index(min(dist)) for dist in dist_mat]
            centers = [0] * n_cluster
            count = [0] * n_cluster
            for j in range(len(x_input)):
                centers[min_dist_idx[j]] += x_input[j]
                count[min_dist_idx[j]] += 1

            for j in range(n_cluster):
                if count[j] == 0:
                    centers = sorted(
                        [rand.uniform(0.0, 100.0) for i in range(n_cluster)])
                    failed = True
                    break

            if failed:
                i = 0
                continue

            for j in range(n_cluster):
                centers[j] = centers[j] / count[j]
            i += 1

        results.append((centers, min_dist_idx, error_value))

    return min(results, key=lambda x: x[2])