import math
import random as rand
def l2_pairwise_distance(v1, v2):
"""Euclidean distance from each point in v1 to each point in v2
Args:
v1: list of point 1
v2: list of point 2
Returns:
distance matrix between each point in v1 and v2
"""
nrow = len(v1)
ncol = len(v2)
dist_mat = [[0 for _ in range(ncol)] for _ in range(nrow)]
for i in range(nrow):
for j in range(ncol):
dist_mat[i][j] = math.sqrt((v1[i] - v2[j])**2)
return dist_mat
def calculate_error(k_means_matrix):
"""Calculate the sum of distance from each point to its nearest cluster center
Args:
k_means_matrix: distance matrix of point to cluster center
Returns:
Sum of distance from each point to its nearest cluster center
"""
return sum([min(dist) for dist in k_means_matrix])
def k_means(x_input, n_cluster=3, n_iter=100, n_tries=10):
"""Perform 1-D k-means clustering on a list of numbers x_input
Args:
x_input: list of numbers
n_cluster: number of clusters
n_iter: number of iterations
Returns:
centers: list of n_cluster elements containing the cluster centers
min_dist_idx: list of len(x_input) elements containing the nearest cluster center's id
error_value: sum of all distance from each point to its nearest cluster center
"""
results = []
for _ in range(n_tries):
error_value = 0
rand.seed(None)
centers = sorted([rand.uniform(0.0, 100.0) for i in range(n_cluster)])
min_dist_idx = [0] * len(x_input)
i = 0
while i < n_iter:
failed = False
dist_mat = l2_pairwise_distance(x_input, centers)
error_value = calculate_error(dist_mat)
min_dist_idx = [dist.index(min(dist)) for dist in dist_mat]
centers = [0] * n_cluster
count = [0] * n_cluster
for j in range(len(x_input)):
centers[min_dist_idx[j]] += x_input[j]
count[min_dist_idx[j]] += 1
for j in range(n_cluster):
if count[j] == 0:
centers = sorted(
[rand.uniform(0.0, 100.0) for i in range(n_cluster)])
failed = True
break
if failed:
i = 0
continue
for j in range(n_cluster):
centers[j] = centers[j] / count[j]
i += 1
results.append((centers, min_dist_idx, error_value))
return min(results, key=lambda x: x[2])