基于之前讲的这篇:机器学习-基于KNN和LMKNN的心脏病预测_牛大了2022的博客-CSDN博客 之后还会尝试其他模型。
由于两者差别在predict函数部分,所以着重解析那里。
import scipy.spatial
from collections import Counter


class KNN:
    """k-nearest-neighbours classifier with Euclidean distance and majority vote."""

    def __init__(self, k):
        """Store ``k``, the number of nearest neighbours that vote.

        Raises:
            ValueError: if ``k`` is smaller than 1 (no neighbour could vote).
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``.

        Both are kept as passed in; they only need ``len()`` and integer
        indexing.
        """
        self.X_train = X
        self.y_train = y

    def distance(self, X1, X2):
        """Return the Euclidean distance between the vectors ``X1`` and ``X2``."""
        return scipy.spatial.distance.euclidean(X1, X2)

    def predict(self, X_test):
        """Return a list holding the predicted label of every sample in ``X_test``.

        Each test sample gets the most common label among its ``k`` closest
        training samples.
        """
        final_output = []
        for i in range(len(X_test)):
            # [distance, training index] pairs for the current test sample.
            d = []
            # Labels of the k closest training samples.
            votes = []
            # The training data lives on the instance, not in a global name.
            for j in range(len(self.X_train)):
                dist = self.distance(self.X_train[j], X_test[i])
                d.append([dist, j])
            # Ascending by distance; equal distances fall back to the index.
            d.sort()
            d = d[0:self.k]
            for _, j in d:
                votes.append(self.y_train[j])
            # On a tied vote Counter keeps the label that was counted first,
            # i.e. the one belonging to the closer neighbour.
            ans = Counter(votes).most_common(1)[0][0]
            final_output.append(ans)
        return final_output

    def score(self, X_test, y_test):
        """Return the fraction of ``X_test`` samples whose prediction equals ``y_test``."""
        predictions = self.predict(X_test)
        value = 0
        for predicted, expected in zip(predictions, y_test):
            if predicted == expected:
                value += 1
        return value / len(y_test)
predict部分解析,最经典的KNN模型:
final_output = []
:创建一个空列表,将最终的预测结果存储在其中。
for i in range(len(X_test)):
:对于测试集中的每个样本,执行以下操作:
d = []
:创建一个空列表,存储测试样本到训练样本之间的距离和对应的训练样本的索引。
votes = []
:创建一个空列表,存储与测试样本距离最近的前k个训练样本的类别。
for j in range(len(self.X_train)):
:对于每个训练样本,执行以下操作:
dist = scipy.spatial.distance.euclidean(self.X_train[j], X_test[i])
:计算测试样本和训练样本之间的欧氏距离。
d.append([dist, j])
:将距离和训练样本的索引添加到列表d中。
d.sort()
:按距离从小到大排序。
d = d[0:self.k]
:选择距离最近的前k个训练样本。
for d, j in d:
:对于每个距离最近的训练样本,执行以下操作:
votes.append(self.y_train[j])
:将该训练样本的类别添加到列表votes中。
ans = Counter(votes).most_common(1)[0][0]
:从votes中找到出现最多的类别,并将其作为预测结果。
final_output.append(ans)
:将预测结果添加到列表final_output中。
return final_output
:返回最终的预测结果列表。
LMKNN定义:基于局部均值的k-最近邻(LMKNN)规则是KNN的扩展,是一种简单而鲁棒的非参数模式分类算法。其基本思想是:进行分类决策时,对每个类别分别在训练集中找出测试点的k个最近邻,计算这k个近邻的局部均值向量,再把测试点归入局部均值向量离它最近的那个类别。
from collections import Counter
from operator import itemgetter

import numpy as np
import scipy.spatial
class LMKNN:
    """Local-mean-based k-nearest-neighbour (LMKNN) classifier.

    For every class the ``k`` training samples of that class closest to the
    query are averaged into a local mean vector; the query gets the label of
    the class whose local mean is nearest.
    """

    def __init__(self, k):
        """Store ``k``, the number of neighbours averaged per class.

        Raises:
            ValueError: if ``k`` is smaller than 1 (nothing could be averaged).
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``."""
        self.X_train = X
        self.y_train = y

    def distance(self, X1, X2):
        """Return the Euclidean distance between the vectors ``X1`` and ``X2``."""
        return scipy.spatial.distance.euclidean(X1, X2)

    def predict(self, X_test):
        """Return a list holding the predicted label of every sample in ``X_test``."""
        final_output = []
        # Selecting rows with a list of indices needs a NumPy array, and the
        # training data lives on the instance rather than in a global name.
        X_train = np.asarray(self.X_train)
        # Distinct labels in order of first appearance, so that ties between
        # classes are resolved the same way on every run.
        myclass = list(dict.fromkeys(self.y_train))

        for i in range(len(X_test)):
            # [distance, training index, label] for every training sample.
            eucDist = []
            for j in range(len(X_train)):
                dist = self.distance(X_train[j], X_test[i])
                eucDist.append([dist, j, self.y_train[j]])
            # Ascending by distance; equal distances fall back to the index.
            eucDist.sort()

            # Indices of the (at most) k closest training samples per class.
            indexData = []
            for c in myclass:
                temp_index = []
                for entry in eucDist:
                    if len(temp_index) == self.k:
                        break
                    if entry[2] == c:
                        temp_index.append(entry[1])
                indexData.append(temp_index)

            # Local mean vector of each class: feature-wise mean of its neighbours.
            centroid = np.array(
                [X_train[temp_index].mean(axis=0) for temp_index in indexData]
            )

            # Distance from the test sample to every local mean vector.
            eucDist_final = []
            for b in range(len(centroid)):
                dist = self.distance(centroid[b], X_test[i])
                eucDist_final.append([dist, myclass[b]])

            # The class with the closest local mean wins; min() keeps the
            # earlier class when two distances are equal.
            final_output.append(min(eucDist_final, key=itemgetter(0))[1])
        return final_output

    def score(self, X_test, y_test):
        """Return the fraction of ``X_test`` samples whose prediction equals ``y_test``."""
        predictions = self.predict(X_test)
        value = 0
        for predicted, expected in zip(predictions, y_test):
            if predicted == expected:
                value += 1
        return value / len(y_test)
predict函数解析:
def predict(self, X_test):
    """Return a list holding the LMKNN-predicted label of every sample in ``X_test``.

    Reads ``self.k``, ``self.X_train`` and ``self.y_train``.
    """
    # Empty list that collects the prediction for every test sample.
    final_output = []
    # Selecting rows with a list of indices (used for the local means below)
    # needs a NumPy array, and the training data lives on the instance.
    X_train = np.asarray(self.X_train)
    # Distinct class labels of the training set, in order of first appearance
    # so that ties between classes are resolved the same way on every run.
    myclass = list(dict.fromkeys(self.y_train))

    # Loop over every test sample.
    for i in range(len(X_test)):
        # Distance from every training sample to the current test sample,
        # stored together with the training index and its label.
        eucDist = []
        for j in range(len(X_train)):
            dist = scipy.spatial.distance.euclidean(X_train[j], X_test[i])
            eucDist.append([dist, j, self.y_train[j]])

        # Sort ascending by Euclidean distance.
        eucDist.sort()

        # For each class keep the (at most) k closest training samples of
        # that class; scanning stops as soon as k have been collected.
        minimum_dist_per_class = []
        for c in myclass:
            minimum_class = []
            for entry in eucDist:
                if len(minimum_class) == self.k:
                    break
                if entry[2] == c:
                    minimum_class.append(entry)
            minimum_dist_per_class.append(minimum_class)

        # Keep only the training indices of those per-class neighbours.
        indexData = []
        for neighbours in minimum_dist_per_class:
            indexData.append([entry[1] for entry in neighbours])

        # For each class, average its neighbours feature by feature to get
        # the local mean vector (centroid) of that class.
        centroid = []
        for temp_index in indexData:
            centroid.append(X_train[temp_index].mean(axis=0))
        centroid = np.array(centroid)

        # Distance from the test sample to the local mean vector of each class.
        eucDist_final = []
        for b in range(len(centroid)):
            dist = scipy.spatial.distance.euclidean(centroid[b], X_test[i])
            eucDist_final.append([dist, myclass[b]])

        # The class whose local mean is closest is the prediction for this
        # sample; min() keeps the earlier class when two distances are equal.
        final_output.append(min(eucDist_final, key=itemgetter(0))[1])

    # One predicted label per test sample, in input order.
    return final_output