    raise TypeError("k-prototypes does not support sparse data.")
if categorical is None or not categorical and verbose:
    print("No categorical data selected, effectively doing k-means.")
if isinstance(categorical, int):
    categorical = [categorical]
assert len(categorical) != X.shape[1], \
    "All columns are categorical, use k-modes instead of k-prototypes."
After Change
    raise TypeError("k-prototypes does not support sparse data.")
if categorical is None or not categorical:
    raise NotImplementedError(
        "No categorical data selected, effectively doing k-means. "
        "Present a list of categorical columns, or use scikit-learn's "
        "KMeans instead."
    )
if isinstance(categorical, int):
    categorical = [categorical]
assert len(categorical) != X.shape[1], \
    "All columns are categorical, use k-modes instead of k-prototypes."
assert max(categorical) < X.shape[1], \
    "Categorical index larger than number of columns."
ncatattrs = len(categorical)
nnumattrs = X.shape[1] - ncatattrs
npoints = X.shape[0]
assert n_clusters < npoints, "More clusters than data points?"
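# Split the data into its numerical and categorical parts and validate both.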
Xnum, Xcat = _split_num_cat(X, categorical)
Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
# Convert the categorical values in Xcat to integers for speed.
# Based on the unique values in Xcat, we can make a mapping to achieve this.
Xcat, enc_map = encode_features(Xcat)
# Estimate a good value for gamma, which determines the weighing of
# categorical values in clusters (see Huang [1997]).
if gamma is None:
    gamma = 0.5 * Xnum.std()
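# The centroids, labels, costs and iteration counts of each init run are
# collected in these lists.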
all_centroids = []
all_labels = []
all_costs = []
all_n_iters = []
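# Run the procedure n_init times, each with its own initialization.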
for init_no in range(n_init):
    # For the numerical part of the initialization, we have no guarantee
    # that no cluster ends up empty, so we need to retry until there are
    # no empty clusters.
    init_tries = 0
    while True:
        init_tries += 1
        # _____ INIT _____
        if verbose:
            print("Init: initializing centroids")
if isinstance(init, str) and init == "Huang":
centroids = kmodes.init_huang(Xcat, n_clusters)
elif isinstance(init, str) and init == "Cao":
centroids = kmodes.init_cao(Xcat, n_clusters)
elif isinstance(init, str) and init == "random":
seeds = np.random.choice(range(npoints), n_clusters)
centroids = Xcat[seeds]
elif isinstance(init, list):
assert init[0].shape[0] == n_clusters, \
"Incorrect number of initial numerical centroids in init."
assert init[0].shape[1] == nnumattrs, \
"Incorrect number of numerical attributes in init"
assert init[1].shape[0] == n_clusters, \
"Incorrect number of initial categorical centroids in init."
assert init[1].shape[1] == ncatattrs, \
"Incorrect number of categorical attributes in init"
centroids = [np.asarray(init[0], dtype=np.float64),
np.asarray(init[1], dtype=np.uint8)]
else:
raise NotImplementedError("Initialization method not supported.")
if not isinstance(init, list):
// Numerical is initialized by drawing from normal distribution,
// categorical following the k-modes methods.
meanx = np.mean(Xnum, axis=0)
stdx = np.std(Xnum, axis=0)
centroids = [
meanx + np.random.randn(n_clusters, nnumattrs) * stdx,
centroids
]
if verbose:
print("Init: initializing clusters")
        membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
        # Keep track of the sum of attribute values per cluster so that we
        # can do k-means on the numerical attributes.
        cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64)
        # cl_attr_freq is a list of lists with dictionaries that contain
        # the frequencies of values per cluster and attribute.
        cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)]
                        for _ in range(n_clusters)]
        for ipoint in range(npoints):
            # Initial assignment to clusters
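            # A point goes to the cluster with the smallest combined
            # dissimilarity: euclidean_dissim on the numerical attributes
            # plus gamma times matching_dissim (the number of mismatching
            # categorical attributes).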
            clust = np.argmin(
                euclidean_dissim(centroids[0], Xnum[ipoint]) +
                gamma * matching_dissim(centroids[1], Xcat[ipoint])
            )
            membship[clust, ipoint] = 1
            # Count attribute values per cluster.
            for iattr, curattr in enumerate(Xnum[ipoint]):
                cl_attr_sum[clust, iattr] += curattr
            for iattr, curattr in enumerate(Xcat[ipoint]):
                cl_attr_freq[clust][iattr][curattr] += 1
        # If no empty clusters, then consider initialization finalized.
        if membship.sum(axis=1).min() > 0:
            break
        if init_tries == MAX_INIT_TRIES:
            # Could not get rid of empty clusters. Randomly
            # initialize instead.
            init = "random"
        elif init_tries == RAISE_INIT_TRIES:
            raise ValueError(
                "Clustering algorithm could not initialize. "
                "Consider assigning the initial clusters manually."
            )
    # Perform an initial centroid update.
    for ik in range(n_clusters):
        for iattr in range(nnumattrs):