Error in code of KMeans

Hi,
I am learning unit4 of “machine learning for robotics”. In the example code of K-Means, I found some error. It is alright when only 2 cluster centroids are applied, but when I change self.K to higher digit, it goes wrong. The original code in course book is:

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import numpy as np
import random

class Kmeans_dev:
    """
    class definition
    """
    
    def __init__(self,X,K):
        """
        class constructor
        """
        self.X = X
        self.output = {}
        self.centroids = np.array([]).reshape(self.X.shape[1],0)
        self.K = K
        self.m = self.X.shape[0]
        
    
    def start_centroid_pos(self, X, K):
        """
        Random initialization of K centroids.
        """
        m,n = X.shape[0], X.shape[1]
        centroids = np.zeros((K,n))

        for i in range(K):
        #for i in range(1,K+1,1):
            centroids[i] = X[np.random.randint(0,m),:]

        return centroids
    

    def fit(self,n_iter):
        """
        Method to train the data set (position of centroids()
        """
        #randomly Initialize the centroids (callstart_centroid_pos() )
        self.centroids=self.start_centroid_pos(self.X,self.K)
        
        #compute Euclidian distances and assign clusters
        for n in range(n_iter):
            EuclidianDistance=np.array([]).reshape(self.m,0)
            for k in range(self.K):
                tempDist=np.sum((self.X-self.centroids[:,k])**2,axis=1)
                EuclidianDistance=np.c_[EuclidianDistance,tempDist]
            C=np.argmin(EuclidianDistance,axis=1)+1
            
            #adjust the centroids
            Y={}
            for k in range(self.K):
                Y[k+1]=np.array([]).reshape(2,0)
            for i in range(self.m):
                Y[C[i]]=np.c_[Y[C[i]],self.X[i]]
        
            for k in range(self.K):
                Y[k+1] = Y[k+1].T
            for k in range(self.K):
                self.centroids[:,k] = np.mean(Y[k+1],axis = 0)
                
            self.output=Y
            
    
    def predict(self):
        """
        Return of data set adherence to certain cluster
        """
        
        return self.output,self.centroids.T

# data set
X = np.array([[0.3,8.3 ], [3, 8], [2, 9],[0.3, 8.9],[1.7, 9.7 ],
              [0.9, 10.5], [10.3, 2.1],[10, 2],[7, 7 ], [6.9, 6.5],
              [6, 6],[1, 2], [1.5, 1.8], [5, 8 ], [8, 8], [1, 0.6],
              [9,11],[12, 5], [4.5, 4.8], [4.5, 3 ], [2, 8], [9, 3], [9,7]])


"""
Run k-means algorithm on given data set. Printing the output (position of centrod) and adherence of point to
certain cluster
"""

K= 2 # number of cluster you would like to create

#creation of class object and training (fit)
kmeans=Kmeans_dev(X,K)
kmeans.fit(50)

color=['blue','green']
labels=['cluster1','cluster2']

fig, axs = plt.subplots(5,2, figsize=(14, 28), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)

axs = axs.ravel()

#you print the test results iteration by iteration (in order to show how the centroids moves)

for i in range(10):
    kmeans=Kmeans_dev(X,K)

    kmeans.fit(1)

    Output,Centroids=kmeans.predict()
    
    for k in range(K):
        axs[i].scatter(Output[k+1][:,0],Output[k+1][:,1],c=color[k],label=labels[k])
    axs[i].scatter(Centroids[:,0],Centroids[:,1],s=300,c='red',label='Centroids', marker='*')
    
    axs[i].set_title("Centroids movement. Iteration "+ str(1+i))
    
plt.show()

The errors take place:
1.
in def fit(self,n_iter):, the second for iteration,

            for k in range(self.K):
                tempDist=np.sum((self.X-self.centroids[:,k])**2,axis=1)
                EuclidianDistance=np.c_[EuclidianDistance,tempDist]

it should be:

            for k in range(self.K):
                tempDist=np.sum((self.X-self.centroids[k,:])**2,axis=1)
                EuclidianDistance=np.c_[EuclidianDistance,tempDist]

the index k is opposite.
2.
the same function, in 6th for iteration:

            for k in range(self.K):
                self.centroids[k, :] = np.mean(Y[k+1], axis=0)

it is with the same index problem. It should be:

            for k in range(self.K):
                self.centroids[:, k] = np.mean(Y[k+1], axis=0)

in plot part, when we draw steriod mark on final centroids:

    axs[i].scatter(Centroids[:,0],Centroids[:,1],s=300,c='red',label='Centroids', marker='*')

it should be:

    axs[i].scatter(Centroids[0, :],Centroids[1, :],s=300,c='red',label='Centroids', marker='*')

the whole rectified code is:

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import numpy as np
import random

class Kmeans_dev:
    """
    class definition
    """
    
    def __init__(self,X,K):
        """
        class constructor
        """
        self.X = X
        self.output = {}
        self.centroids = np.array([]).reshape(self.X.shape[1],0)
        self.K = K
        self.m = self.X.shape[0]
        
    
    def start_centroid_pos(self, X, K):
        """
        Random initialization of K centroids.
        """
        m,n = X.shape[0], X.shape[1]
        centroids = np.zeros((K,n))

        for i in range(K):
        #for i in range(1,K+1,1):
            centroids[i] = X[np.random.randint(0,m),:]

        return centroids
    

    def fit(self,n_iter):
        """
        Method to train the data set (position of centroids()
        """
        #randomly Initialize the centroids (callstart_centroid_pos() )
        self.centroids=self.start_centroid_pos(self.X,self.K)
        
        #compute Euclidian distances and assign clusters
        for n in range(n_iter):
            EuclidianDistance=np.array([]).reshape(self.m,0)
            for k in range(self.K):
                tempDist=np.sum((self.X-self.centroids[k, :])**2,axis=1)
                EuclidianDistance=np.c_[EuclidianDistance,tempDist]
            C=np.argmin(EuclidianDistance,axis=1)+1
            
            #adjust the centroids
            Y={}
            for k in range(self.K):
                Y[k+1]=np.array([]).reshape(2,0)
            for i in range(self.m):
                Y[C[i]]=np.c_[Y[C[i]],self.X[i]]
        
            for k in range(self.K):
                Y[k+1] = Y[k+1].T
            for k in range(self.K):
                self.centroids[k, :] = np.mean(Y[k+1],axis = 0)
                
            self.output=Y
            
    
    def predict(self):
        """
        Return of data set adherence to certain cluster
        """
        
        return self.output,self.centroids.T

# data set
X = np.array([[0.3,8.3 ], [3, 8], [2, 9],[0.3, 8.9],[1.7, 9.7 ],
              [0.9, 10.5], [10.3, 2.1],[10, 2],[7, 7 ], [6.9, 6.5],
              [6, 6],[1, 2], [1.5, 1.8], [5, 8 ], [8, 8], [1, 0.6],
              [9,11],[12, 5], [4.5, 4.8], [4.5, 3 ], [2, 8], [9, 3], [9,7]])


"""
Run k-means algorithm on given data set. Printing the output (position of centrod) and adherence of point to
certain cluster
"""

K= 2 # number of cluster you would like to create

#creation of class object and training (fit)
kmeans=Kmeans_dev(X,K)
kmeans.fit(50)

color=['blue','green']
labels=['cluster1','cluster2']

fig, axs = plt.subplots(5,2, figsize=(14, 28), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)

axs = axs.ravel()

#you print the test results iteration by iteration (in order to show how the centroids moves)

for i in range(10):
    kmeans=Kmeans_dev(X,K)

    kmeans.fit(1)

    Output,Centroids=kmeans.predict()
    
    for k in range(K):
        axs[i].scatter(Output[k+1][:,0],Output[k+1][:,1],c=color[k],label=labels[k])
    axs[i].scatter(Centroids[0, :],Centroids[1, :],s=300,c='red',label='Centroids', marker='*')
    
    axs[i].set_title("Centroids movement. Iteration "+ str(1+i))
    
plt.show()

change K from 2 to 3 or 4, the error will show up.

Hello @MeineLiebeAxt ,

Thanks for the feedback. We will analyze this in the following days and update the course accordingly.

This topic was automatically closed after 7 days. New replies are no longer allowed.