KNN算法

这个算法比较简单：计算待分类样本与每个训练样本的特征向量之间的距离，取距离最近的 k 个训练样本，并以其中出现次数最多的类别作为预测结果。

先写一个计算欧几里得距离的函数

import numpy as np
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two feature vectors.

    Args:
        x1: First vector; a NumPy array or any numeric array-like
            (list, tuple).
        x2: Second vector, broadcast-compatible with ``x1`` (normally the
            same length).

    Returns:
        ``sqrt(sum((x1 - x2) ** 2))`` as a NumPy float for 1-D inputs.
        The squared differences are reduced along the first axis.
    """
    # Convert to float arrays before subtracting: plain lists do not support
    # ``-``, and unsigned-integer arrays would wrap around on negative
    # differences (e.g. uint8 0 - 20 == 236).
    diff = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    # np.sum reduces in native code; axis=0 is the axis the builtin sum
    # iterated over, so 1-D inputs yield a single scalar distance.
    return np.sqrt(np.sum(diff ** 2, axis=0))

from collections import Counter
class KNN:
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    ``fit`` only stores the training data; all work happens at prediction
    time, when each query sample is compared against every training sample.
    """

    def __init__(self, k):
        """Create a classifier that votes among the ``k`` nearest samples.

        Args:
            k: Number of nearest training samples that take part in the vote.

        Raises:
            ValueError: If ``k`` is not a positive integer.
        """
        # Reject bad values here; otherwise k=0 only surfaces later as an
        # IndexError on an empty vote inside _predict.
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training samples and their labels.

        Args:
            X: Array-like of shape (n_samples, n_features) with numeric
                features.
            y: Sequence of ``n_samples`` labels; stored as given so predicted
                labels keep their original type.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length or are empty.
        """
        # Float conversion lets plain nested lists work and avoids
        # unsigned-integer wrap-around when differences are taken later.
        X = np.asarray(X, dtype=float)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        if len(X) == 0:
            raise ValueError("cannot fit on an empty training set")
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every sample in ``X``.

        Args:
            X: Array-like of shape (n_queries, n_features).

        Returns:
            A list with one predicted label per row of ``X``.
        """
        return [self._predict(x) for x in np.asarray(X, dtype=float)]

    def _predict(self, x):
        """Return the majority label among the k training samples nearest to ``x``."""
        # Row-wise Euclidean distance from x to every training sample, computed
        # in one vectorised expression instead of one Python call per row.
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        # argsort returns indices in ascending order of distance, so the first
        # k entries are the indices of the k nearest training samples.
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        # most_common(1) gives [(label, count)]. On a tied vote Counter keeps
        # first-encountered order, i.e. the label seen first among the
        # distance-sorted neighbours wins.
        return Counter(k_nearest_labels).most_common(1)[0][0]

训练与测试：加载鸢尾花（iris）数据集，划分训练集和测试集，并绘制样本散点图

from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Define the color map: three colors (red, green, blue), one per class label.
cmap = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Load the iris dataset; X holds the feature matrix, y the class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; random_state pins the shuffle so
# the split (and the results printed below) are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Scatter plot of feature columns 2 and 3 (petal length and petal width in the
# iris dataset), colored by class label.
plt.figure()
plt.scatter(X[:, 2], X[:, 3], c=y, cmap=cmap, edgecolor='k', s=20)
plt.show()

（此处为上面代码输出的散点图：鸢尾花数据集第 3、4 列特征的分布，颜色表示类别）

# Fit a 5-nearest-neighbour classifier on the training split and predict a
# label for every sample in the held-out test split.
clf = KNN(k=5)
clf.fit(X_train, y_train)
# NOTE: the name `predicition` (sic) is also read by the accuracy computation
# further down; rename both places together if the spelling is ever fixed.
predicition = clf.predict(X_test)
print(predicition)

[1, 2, 2, 0, 1, 0, 0, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2, 0, 2, 1, 1, 1, 1, 1, 2, 0, 2, 1, 2, 0]

# Accuracy = fraction of test samples whose predicted label equals the true one.
# Both sides are converted to arrays first so the comparison is always
# element-wise; `==` between two plain lists would yield a single bool and
# silently produce a wrong accuracy.
acc = np.mean(np.asarray(predicition) == np.asarray(y_test))
print(acc)

0.9666666666666667