【人工智慧】機器學習之聚類演算法Kmeans及其應用,呼叫sklearn中聚類演算法以及手動實現Kmeans演算法。

語言: CN / TW / HK

本文已參與「新人創作禮」活動, 一起開啟掘金創作之路。

使用Kmeans演算法實現聚類

要求: 1、根據演算法流程,手動實現Kmeans演算法; 2、呼叫sklearn中聚類演算法,對給定資料集進行聚類分析; 3、對比上述2種Kmeans演算法的聚類效果。

讀取檔案

def loadFile(path):
    """Load a tab-separated numeric text file into a NumPy matrix.

    Every non-blank line of the file is one sample; its fields are separated
    by tab characters and each field is converted to ``float``.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        numpy.matrix: One row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # Text mode with UTF-8 decoding; the context manager guarantees the
    # handle is closed even if reading fails.
    with open(path, "r", encoding="UTF-8") as fr:
        record = fr.read()
    # Split into lines, skip blank ones, and parse the tab-separated fields.
    dataList = [
        list(map(float, line.split("\t")))
        for line in record.splitlines()
        if line.strip()
    ]
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    return np.asmatrix(dataList)

手動實現Kmeans演算法

def kMeans(dataset, k):
    """Cluster the rows of ``dataset`` into ``k`` groups with k-means.

    Starting from random centres (``randCents``), the function alternates
    between assigning every sample to its nearest centre (``distEclud``) and
    moving every centre to the mean of its samples, until no sample changes
    cluster.

    Args:
        dataset: 2-D matrix with one sample per row (for example the
            ``numpy.matrix`` returned by ``loadFile``).
        k: Number of clusters.

    Returns:
        tuple: ``(cents, ClustDist)``. ``cents`` is the ``k x n`` matrix of
        cluster centres. ``ClustDist`` is an ``m x 2`` matrix holding, for
        each sample, the index of its cluster and the distance to that
        cluster's centre.
    """
    m = np.shape(dataset)[0]
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    ClustDist = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so that the first pass always counts as
    # a change, even for samples whose nearest centre has index 0.
    ClustDist[:, 0] = -1
    cents = randCents(dataset, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centre.
        for i in range(m):
            DistList = [distEclud(dataset[i, :], cents[jk, :]) for jk in range(k)]
            minDist = min(DistList)
            minIndex = DistList.index(minDist)
            if ClustDist[i, 0] != minIndex:
                clusterChanged = True
            ClustDist[i, :] = minIndex, minDist

        # Update step: move every centre to the column-wise mean of its samples.
        for cent in range(k):
            ptsInClust = dataset[np.nonzero(ClustDist[:, 0].A == cent)[0]]
            if ptsInClust.shape[0] == 0:
                # An empty cluster keeps its previous centre; the mean of zero
                # rows would be NaN and poison every later distance.
                continue
            cents[cent, :] = np.mean(ptsInClust, axis=0)
    return cents, ClustDist

處理資料

# Load the tab-separated samples and cluster them with the hand-written k-means.
path_file = "TESTDATA.TXT"
k = 4  # number of clusters requested
recordMat = loadFile(path_file)

cents, distMat = kMeans(recordMat, k)

繪製資料散點圖

# Draw the hand-written k-means result in the top panel of a 3-row layout.
plt.subplot(311)
plt.grid(True)  # show grid lines

# Colour used for each cluster index; samples with any other index are skipped.
cluster_colors = {0: 'red', 1: 'blue', 2: 'cyan', 3: 'green'}
for indx in range(len(distMat)):
    color = cluster_colors.get(distMat[indx, 0])
    if color is not None:
        plt.scatter(recordMat[indx, 0], recordMat[indx, 1], c=color, marker='o')

繪製聚類中心

# Overlay the k cluster centres as large yellow markers.
centre_ids = range(k)
x = [cents[idx, 0] for idx in centre_ids]
y = [cents[idx, 1] for idx in centre_ids]
plt.scatter(x, y, s=80, c='yellow', marker='o')
plt.title('Kmeans')

呼叫sklearn中聚類演算法

from sklearn.cluster import KMeans

# Convert the matrix of samples to a plain 2-D ndarray for scikit-learn.
X = np.array(recordMat)

# Build exactly one model. Here the initial centres are picked at random;
# pass init='k-means++' instead to use the k-means++ seeding strategy.
kmeans_model = KMeans(n_clusters=k, init='random')
kmeans_model.fit(X)  # train the clustering model

繪製k-Means聚類結果

# Draw the scikit-learn result in the middle panel of the current figure.
# No plt.figure() call here: opening a new window would separate this panel
# from the 311 panel it is meant to be compared with.
plt.subplot(312)
# Axis limits: the data range padded by one unit on every side.
plt.axis([np.min(X[:, 0]) - 1, np.max(X[:, 0]) + 1,
          np.min(X[:, 1]) - 1, np.max(X[:, 1]) + 1])
plt.grid(True)  # show grid lines

colors = ['r', 'g', 'b', 'c']    # one colour per cluster
markers = ['o', 's', 'D', '+']   # one marker per cluster
labels = kmeans_model.labels_
# One plot call per cluster instead of one per sample.
for cluster in range(k):
    members = X[labels == cluster]
    plt.plot(members[:, 0], members[:, 1],
             color=colors[cluster], marker=markers[cluster], ls='None')
plt.title('K = %s,random' % (k))

對比效果:

在這裡插入圖片描述

整合程式碼:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def loadFile(path):
    """Load a tab-separated numeric text file into a NumPy matrix.

    Every non-blank line of the file is one sample; its fields are separated
    by tab characters and each field is converted to ``float``.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        numpy.matrix: One row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    # Text mode with UTF-8 decoding; the context manager guarantees the
    # handle is closed even if reading fails.
    with open(path, "r", encoding="UTF-8") as fr:
        record = fr.read()
    # Split into lines, skip blank ones, and parse the tab-separated fields.
    dataList = [
        list(map(float, line.split("\t")))
        for line in record.splitlines()
        if line.strip()
    ]
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    return np.asmatrix(dataList)

def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (a 1-D array or a single-row matrix).
        vecB: Second vector, with the same shape as ``vecA``.

    Returns:
        The Euclidean (L2) distance between ``vecA`` and ``vecB`` as a NumPy
        float scalar.
    """
    # Flatten first: on 2-D input such as np.matrix rows, norm(ord=2) means
    # the matrix spectral norm (computed through an SVD); on the flattened
    # difference the default is the plain vector 2-norm.
    diff = np.asarray(vecA - vecB).ravel()
    return np.linalg.norm(diff)

def randCents(dataSet, k):
    """Create ``k`` random initial cluster centres inside the data's range.

    For every feature (column) the centres are drawn uniformly between the
    smallest and the largest value observed in that column, so each centre
    lies inside the bounding box of the data.

    Args:
        dataSet: 2-D matrix or array with one sample per row and one feature
            per column.
        k: Number of centres to generate.

    Returns:
        numpy.matrix: A ``k x n`` matrix of centres, where ``n`` is the
        number of columns of ``dataSet``.
    """
    n = np.shape(dataSet)[1]
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    cents = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataSet[:, j]
        # .min()/.max() reduce straight to scalars, avoiding the builtin
        # min()/max() over 1x1 matrix rows and the float() conversion of a
        # non-0-d array.
        minCol = column.min()
        spread = float(column.max() - minCol)
        # np.random.rand yields values in [0, 1), scaled to the column range.
        cents[:, j] = minCol + spread * np.random.rand(k, 1)
    return cents

def kMeans(dataset, k):
    """Cluster the rows of ``dataset`` into ``k`` groups with k-means.

    Starting from random centres (``randCents``), the function alternates
    between assigning every sample to its nearest centre (``distEclud``) and
    moving every centre to the mean of its samples, until no sample changes
    cluster.

    Args:
        dataset: 2-D matrix with one sample per row (for example the
            ``numpy.matrix`` returned by ``loadFile``).
        k: Number of clusters.

    Returns:
        tuple: ``(cents, ClustDist)``. ``cents`` is the ``k x n`` matrix of
        cluster centres. ``ClustDist`` is an ``m x 2`` matrix holding, for
        each sample, the index of its cluster and the distance to that
        cluster's centre.
    """
    m = np.shape(dataset)[0]
    # np.asmatrix rather than the np.mat alias, which NumPy 2.0 removed.
    ClustDist = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so that the first pass always counts as
    # a change, even for samples whose nearest centre has index 0.
    ClustDist[:, 0] = -1
    cents = randCents(dataset, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centre.
        for i in range(m):
            DistList = [distEclud(dataset[i, :], cents[jk, :]) for jk in range(k)]
            minDist = min(DistList)
            minIndex = DistList.index(minDist)
            if ClustDist[i, 0] != minIndex:
                clusterChanged = True
            ClustDist[i, :] = minIndex, minDist

        # Update step: move every centre to the column-wise mean of its samples.
        for cent in range(k):
            ptsInClust = dataset[np.nonzero(ClustDist[:, 0].A == cent)[0]]
            if ptsInClust.shape[0] == 0:
                # An empty cluster keeps its previous centre; the mean of zero
                # rows would be NaN and poison every later distance.
                continue
            cents[cent, :] = np.mean(ptsInClust, axis=0)
    return cents, ClustDist

# Load the samples and cluster them with the hand-written k-means.
path_file = "TESTDATA.TXT"
recordMat = loadFile(path_file)
k = 4

cents, distMat = kMeans(recordMat, k)

# Plain ndarray view of the samples, used for plotting and for scikit-learn.
X = np.array(recordMat)

# --- Panel 1 (top): scatter plot of the hand-written k-means result ---
# All three panels live in one figure so they can be compared directly, so
# no plt.figure() is called between them.
plt.subplot(311)
plt.grid(True)  # show grid lines
assigned = np.asarray(distMat[:, 0]).ravel()
# One scatter call per cluster instead of one per sample.
for cluster, color in enumerate(['red', 'blue', 'cyan', 'green']):
    members = X[assigned == cluster]
    plt.scatter(members[:, 0], members[:, 1], c=color, marker='o')

# Overlay the cluster centres.
x = [cents[i, 0] for i in range(k)]
y = [cents[i, 1] for i in range(k)]
plt.scatter(x, y, s=80, c='yellow', marker='o')
plt.title('Kmeans')

# --- Panels 2 and 3: scikit-learn KMeans, one panel per seeding strategy ---
colors = ['r', 'g', 'b', 'c']    # one colour per cluster
markers = ['o', 's', 'D', '+']   # one marker per cluster
for position, init in ((312, 'random'), (313, 'k-means++')):
    plt.subplot(position)
    # Axis limits: the data range padded by one unit on every side.
    plt.axis([np.min(X[:, 0]) - 1, np.max(X[:, 0]) + 1,
              np.min(X[:, 1]) - 1, np.max(X[:, 1]) + 1])
    plt.grid(True)  # show grid lines

    # Fit a model with the seeding strategy this panel's title announces.
    kmeans_model = KMeans(n_clusters=k, init=init)
    kmeans_model.fit(X)

    # Plot the clustering result, one plot call per cluster.
    labels = kmeans_model.labels_
    for cluster in range(k):
        members = X[labels == cluster]
        plt.plot(members[:, 0], members[:, 1],
                 color=colors[cluster], marker=markers[cluster], ls='None')
    plt.title('K = %s,%s' % (k, init))

plt.show()

```