bbeebd26abdd2ca6f03bcde5c4045b0b74e3821e,kmodes/kprototypes.py,,k_prototypes,#,122

Before Change


        raise TypeError("k-prototypes does not support sparse data.")

    if categorical is None or not categorical and verbose:
        print("No categorical data selected, effectively doing k-means.")
    if isinstance(categorical, int):
        categorical = [categorical]
    assert len(categorical) != X.shape[1], \
        "All columns are categorical, use k-modes instead of k-prototypes."

After Change


        raise TypeError("k-prototypes does not support sparse data.")

    if categorical is None or not categorical:
        raise NotImplementedError(
            "No categorical data selected, effectively doing k-means. "
            "Present a list of categorical columns, or use scikit-learn"s "
            "KMeans instead."
        )
    if isinstance(categorical, int):
        categorical = [categorical]
    assert len(categorical) != X.shape[1], \
        "All columns are categorical, use k-modes instead of k-prototypes."
    assert max(categorical) < X.shape[1], \
        "Categorical index larger than number of columns."

    ncatattrs = len(categorical)
    nnumattrs = X.shape[1] - ncatattrs
    npoints = X.shape[0]
    assert n_clusters < npoints, "More clusters than data points?"

    Xnum, Xcat = _split_num_cat(X, categorical)
    Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)

    // Convert the categorical values in Xcat to integers for speed.
    // Based on the unique values in Xcat, we can make a mapping to achieve this.
    Xcat, enc_map = encode_features(Xcat)

    // Estimate a good value for gamma, which determines the weighing of
    // categorical values in clusters (see Huang [1997]).
    if gamma is None:
        gamma = 0.5 * Xnum.std()

    all_centroids = []
    all_labels = []
    all_costs = []
    all_n_iters = []
    for init_no in range(n_init):

        // For numerical part of initialization, we don"t have a guarantee
        // that there is not an empty cluster, so we need to retry until
        // there is none.
        init_tries = 0
        while True:
            init_tries += 1
            // _____ INIT _____
            if verbose:
                print("Init: initializing centroids")
            if isinstance(init, str) and init == "Huang":
                centroids = kmodes.init_huang(Xcat, n_clusters)
            elif isinstance(init, str) and init == "Cao":
                centroids = kmodes.init_cao(Xcat, n_clusters)
            elif isinstance(init, str) and init == "random":
                seeds = np.random.choice(range(npoints), n_clusters)
                centroids = Xcat[seeds]
            elif isinstance(init, list):
                assert init[0].shape[0] == n_clusters, \
                    "Incorrect number of initial numerical centroids in init."
                assert init[0].shape[1] == nnumattrs, \
                    "Incorrect number of numerical attributes in init"
                assert init[1].shape[0] == n_clusters, \
                    "Incorrect number of initial categorical centroids in init."
                assert init[1].shape[1] == ncatattrs, \
                    "Incorrect number of categorical attributes in init"
                centroids = [np.asarray(init[0], dtype=np.float64),
                             np.asarray(init[1], dtype=np.uint8)]
            else:
                raise NotImplementedError("Initialization method not supported.")

            if not isinstance(init, list):
                // Numerical is initialized by drawing from normal distribution,
                // categorical following the k-modes methods.
                meanx = np.mean(Xnum, axis=0)
                stdx = np.std(Xnum, axis=0)
                centroids = [
                    meanx + np.random.randn(n_clusters, nnumattrs) * stdx,
                    centroids
                ]

            if verbose:
                print("Init: initializing clusters")
            membship = np.zeros((n_clusters, npoints), dtype=np.uint8)
            // Keep track of the sum of attribute values per cluster so that we
            // can do k-means on the numerical attributes.
            cl_attr_sum = np.zeros((n_clusters, nnumattrs), dtype=np.float64)
            // cl_attr_freq is a list of lists with dictionaries that contain
            // the frequencies of values per cluster and attribute.
            cl_attr_freq = [[defaultdict(int) for _ in range(ncatattrs)]
                            for _ in range(n_clusters)]
            for ipoint in range(npoints):
                // Initial assignment to clusters
                clust = np.argmin(
                    euclidean_dissim(centroids[0], Xnum[ipoint]) +
                    gamma * matching_dissim(centroids[1], Xcat[ipoint])
                )
                membship[clust, ipoint] = 1
                // Count attribute values per cluster.
                for iattr, curattr in enumerate(Xnum[ipoint]):
                    cl_attr_sum[clust, iattr] += curattr
                for iattr, curattr in enumerate(Xcat[ipoint]):
                    cl_attr_freq[clust][iattr][curattr] += 1

            // If no empty clusters, then consider initialization finalized.
            if membship.sum(axis=1).min() > 0:
                break

            if init_tries == MAX_INIT_TRIES:
                // Could not get rid of empty clusters. Randomly
                // initialize instead.
                init = "random"
            elif init_tries == RAISE_INIT_TRIES:
                raise ValueError(
                    "Clustering algorithm could not initialize. "
                    "Consider assigning the initial clusters manually."
                )

        // Perform an initial centroid update.
        for ik in range(n_clusters):
            for iattr in range(nnumattrs):
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 5

Instances


Project Name: nicodv/kmodes
Commit Name: bbeebd26abdd2ca6f03bcde5c4045b0b74e3821e
Time: 2016-06-16
Author: nico.devos@autogrid.com
File Name: kmodes/kprototypes.py
Class Name:
Method Name: k_prototypes


Project Name: inspirehep/magpie
Commit Name: d000d0750c958e2ec1b3d9eb3c9ea9cf63ab14f0
Time: 2017-07-07
Author: stypka@spotify.com
File Name: magpie/main.py
Class Name: MagpieModel
Method Name: train


Project Name: inspirehep/magpie
Commit Name: d000d0750c958e2ec1b3d9eb3c9ea9cf63ab14f0
Time: 2017-07-07
Author: stypka@spotify.com
File Name: magpie/main.py
Class Name: MagpieModel
Method Name: batch_train