Commit d32c46d
luokn committed Oct 7, 2022
1 parent: cfa2147
Showing 12 changed files with 59 additions and 79 deletions.
13 changes: 6 additions & 7 deletions src/adaboost.py
@@ -3,12 +3,12 @@
 # @Author: Luokun
 # @Email : olooook@outlook.com
 
-
 import numpy as np
 from matplotlib import pyplot as plt
 
 
 class AdaBoost:
+
     def __init__(self, n_estimators: int, lr=1e-2, eps=1e-5):
         """
         Args:
@@ -52,6 +52,7 @@ def __call__(self, X: np.ndarray) -> np.ndarray:
 
 
 class WeakEstimator:  # weak estimator: a one-level decision tree (decision stump)
+
     def __init__(self, lr: float):
         self.lr, self.feature, self.threshold, self.sign = lr, None, None, None  # split feature, split threshold, sign in {-1, 1}
 
@@ -73,12 +74,10 @@ def __call__(self, X: np.ndarray) -> np.ndarray:
 
 
 def load_data(n_samples_per_class=500):
-    X = np.concatenate(
-        [
-            np.random.randn(n_samples_per_class, 2) + np.array([1, -1]),
-            np.random.randn(n_samples_per_class, 2) + np.array([-1, 1]),
-        ]
-    )
+    X = np.concatenate([
+        np.random.randn(n_samples_per_class, 2) + np.array([1, -1]),
+        np.random.randn(n_samples_per_class, 2) + np.array([-1, 1]),
+    ])
     y = np.array([1] * n_samples_per_class + [-1] * n_samples_per_class)
 
     training_set, test_set = np.split(np.random.permutation(len(X)), [int(len(X) * 0.8)])
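Aside: the WeakEstimator edited above is a decision stump, which predicts from a single feature and threshold. A minimal sketch of that prediction rule, assuming the feature/threshold/sign attributes seen in the diff (the training loop is omitted):

    import numpy as np

    def stump_predict(X: np.ndarray, feature: int, threshold: float, sign: int) -> np.ndarray:
        # Label one side of the threshold +sign and the other side -sign, with sign in {-1, 1}.
        return np.where(X[:, feature] > threshold, sign, -sign)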
39 changes: 18 additions & 21 deletions src/decision_tree.py
@@ -3,7 +3,6 @@
 # @Author: Luokun
 # @Email : olooook@outlook.com
 
-
 import numpy as np
 
 
@@ -70,26 +69,24 @@ def calc_info_gain(cls, X: np.ndarray, y: np.ndarray, rows: np.ndarray, col: int
 
 
 def load_data():
-    X = np.array(
-        [
-            [0, 0, 0],
-            [0, 0, 0],
-            [0, 0, 1],
-            [0, 0, 1],
-            [0, 1, 0],
-            [0, 1, 0],
-            [0, 1, 1],
-            [0, 1, 1],
-            [1, 0, 0],
-            [1, 0, 0],
-            [1, 0, 1],
-            [1, 0, 1],
-            [1, 1, 0],
-            [1, 1, 0],
-            [1, 1, 1],
-            [1, 1, 1],
-        ]
-    )
+    X = np.array([
+        [0, 0, 0],
+        [0, 0, 0],
+        [0, 0, 1],
+        [0, 0, 1],
+        [0, 1, 0],
+        [0, 1, 0],
+        [0, 1, 1],
+        [0, 1, 1],
+        [1, 0, 0],
+        [1, 0, 0],
+        [1, 0, 1],
+        [1, 0, 1],
+        [1, 1, 0],
+        [1, 1, 0],
+        [1, 1, 1],
+        [1, 1, 1],
+    ])
     y = np.where(X.sum(axis=1) >= 2, 1, 0)
     return X, y
 
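Aside: calc_info_gain in the hunk header above computes the information gain of splitting on a column. A sketch of the standard computation under that assumption (illustrative names, not the repo's exact signature):

    import numpy as np

    def entropy(y: np.ndarray) -> float:
        # H(y) = -sum_k p_k * log2(p_k) over the class frequencies of y.
        p = np.bincount(y) / len(y)
        p = p[p > 0]
        return float(-(p * np.log2(p)).sum())

    def info_gain(X: np.ndarray, y: np.ndarray, col: int) -> float:
        # Entropy before the split, minus the size-weighted entropy of each branch.
        gain = entropy(y)
        for value in np.unique(X[:, col]):
            mask = X[:, col] == value
            gain -= mask.mean() * entropy(y[mask])
        return gain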
6 changes: 2 additions & 4 deletions src/em.py
@@ -3,7 +3,6 @@
 # @Author: Luokun
 # @Email : olooook@outlook.com
 
-
 import numpy as np
 
 
@@ -22,8 +21,8 @@ def fit(self, X: np.ndarray, iterations=100):
 
     def _expect(self, X: np.ndarray):  # E-step
         p1, p2, p3 = self.prob
-        a = p1 * (p2 ** X) * ((1 - p2) ** (1 - X))
-        b = (1 - p1) * (p3 ** X) * ((1 - p3) ** (1 - X))
+        a = p1 * (p2**X) * ((1 - p2)**(1 - X))
+        b = (1 - p1) * (p3**X) * ((1 - p3)**(1 - X))
         return a / (a + b)
 
     def _maximize(self, X: np.ndarray, M: np.ndarray):  # M-step
@@ -34,7 +33,6 @@ def _maximize(self, X: np.ndarray, M: np.ndarray):  # M-step
 
 # For the EM algorithm with Gaussian mixture models, see ./gmm.py
 
-
 if __name__ == "__main__":
     x = np.array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1])
 
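Aside: the _expect change above is the E-step of the classic three-coin model: p1 is the probability of choosing the first coin, and p2, p3 are the head probabilities of the two coins. A standalone rendering of one EM iteration on the sample data, with illustrative initial parameters and the standard M-step update (the repo's own M-step lives in _maximize, not shown in full here):

    import numpy as np

    x = np.array([1, 1, 0, 1, 0, 0, 1, 0, 1, 1])
    p1, p2, p3 = 0.5, 0.6, 0.5  # illustrative initial values, not from the repo

    # E-step: responsibility that each flip came from the first coin.
    a = p1 * (p2**x) * ((1 - p2)**(1 - x))
    b = (1 - p1) * (p3**x) * ((1 - p3)**(1 - x))
    mu = a / (a + b)

    # M-step: re-estimate the parameters from the responsibilities.
    p1 = mu.mean()
    p2 = (mu * x).sum() / mu.sum()
    p3 = ((1 - mu) * x).sum() / (1 - mu).sum()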
10 changes: 4 additions & 6 deletions src/gmm.py
@@ -65,12 +65,10 @@ def maximize(self, X: np.ndarray, G: np.ndarray):  # M-step
 
 
 def load_data(n_samples_per_class=500):
-    X = np.concatenate(
-        [
-            np.random.multivariate_normal(mean=[4, 0], cov=[[2, 0], [0, 2]], size=[n_samples_per_class]),
-            np.random.multivariate_normal(mean=[0, 4], cov=[[2, 0], [0, 2]], size=[n_samples_per_class]),
-        ]
-    )
+    X = np.concatenate([
+        np.random.multivariate_normal(mean=[4, 0], cov=[[2, 0], [0, 2]], size=[n_samples_per_class]),
+        np.random.multivariate_normal(mean=[0, 4], cov=[[2, 0], [0, 2]], size=[n_samples_per_class]),
+    ])
     y = np.array([0] * n_samples_per_class + [1] * n_samples_per_class)
     return X, y
 
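Aside: the maximize method in the hunk header is the M-step of EM for a Gaussian mixture; the matching E-step needs the multivariate normal density of each component. A sketch of that density, assuming nothing about the repo's own implementation (which this diff does not show):

    import numpy as np

    def gaussian_pdf(X: np.ndarray, mean: np.ndarray, cov: np.ndarray) -> np.ndarray:
        # N(x | mean, cov), evaluated row-wise over the sample matrix X.
        d = X.shape[1]
        diff = X - mean
        quad = np.einsum("ij,jk,ik->i", diff, np.linalg.inv(cov), diff)  # Mahalanobis term
        norm = np.sqrt((2 * np.pi) ** d * np.linalg.det(cov))
        return np.exp(-0.5 * quad) / norm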
12 changes: 5 additions & 7 deletions src/kmeans.py
@@ -35,13 +35,11 @@ def fit(self, X: np.ndarray):
             y_pred = self(X)
 
             # Use the mean of each cluster as the new centers.
-            centers = np.stack(
-                [
-                    # If any points belong to cluster i, take the mean of those points; otherwise pick a random point as the center of cluster i.
-                    np.mean(X[y_pred == i], axis=0) if np.any(y_pred == i) else random.choice(X)
-                    for i in range(self.n_clusters)
-                ]
-            )
+            centers = np.stack([
+                # If any points belong to cluster i, take the mean of those points; otherwise pick a random point as the center of cluster i.
+                np.mean(X[y_pred == i], axis=0) if np.any(y_pred == i) else random.choice(X)
+                for i in range(self.n_clusters)
+            ])
 
             # Stop iterating once the largest center update is below eps.
             if np.abs(self.centers - centers).max() < self.eps:
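Aside: the reformatted np.stack above is the center-update step; the assignment step it pairs with (y_pred = self(X)) is typically a nearest-center argmin. A sketch under that assumption:

    import numpy as np

    def assign_clusters(X: np.ndarray, centers: np.ndarray) -> np.ndarray:
        # Distance from every point to every center, shape (n_samples, n_clusters).
        dist = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=-1)
        return dist.argmin(axis=1)  # index of the nearest center for each point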
2 changes: 1 addition & 1 deletion src/knn.py
@@ -27,7 +27,7 @@ def __call__(self, X: np.ndarray):
         y_pred = np.zeros([len(X)], dtype=int)  # predicted class for each sample in X
         for i, x in enumerate(X):
             dist = LA.norm(self.X - x, axis=1)  # distances from x to all points with known classes
-            topk = np.argsort(dist)[: self.k]  # indices of the k nearest points
+            topk = np.argsort(dist)[:self.k]  # indices of the k nearest points
             y_pred[i] = np.bincount(self.y[topk]).argmax()  # the most common class among the neighbors becomes the class of x
         return y_pred
 
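Aside: the np.bincount(...).argmax() idiom on the line after the touched slice is a majority vote over the k nearest neighbors (labels must be non-negative integers). A quick illustration:

    import numpy as np

    labels = np.array([2, 0, 2, 2, 1])  # classes of the k nearest neighbors
    counts = np.bincount(labels)        # array([1, 1, 3]): one vote for 0, one for 1, three for 2
    print(counts.argmax())              # 2, the majority class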
14 changes: 6 additions & 8 deletions src/lda.py
@@ -30,7 +30,7 @@ def fit(self, X: np.ndarray, Y: np.ndarray):
             S_W += (Xi - Mi).T @ (Xi - Mi)
             S_B += len(Xi) * (Mi - M).reshape(-1, 1) @ (Mi - M).reshape(1, -1)
         L, V = LA.eig(LA.inv(S_W) @ S_B)  # eigenvalues and eigenvectors of S_W^{-1} S_B
-        topk = np.argsort(L)[::-1][: self.k]  # sort eigenvalues in descending order and take the top k
+        topk = np.argsort(L)[::-1][:self.k]  # sort eigenvalues in descending order and take the top k
         self.W = V[:, topk]  # eigenvectors corresponding to the top-k eigenvalues
 
     def __call__(self, X: np.ndarray):
@@ -52,20 +52,18 @@ def __init__(self, k: int):
     def __call__(self, X: np.ndarray):
         X_norm = X - X.mean(axis=0)  # center the data
         L, V = np.linalg.eig(X_norm.T @ X_norm)  # eigendecomposition of the covariance matrix
-        topk = np.argsort(L)[::-1][: self.k]  # indices of the top-k eigenvalues
+        topk = np.argsort(L)[::-1][:self.k]  # indices of the top-k eigenvalues
         return X_norm @ V[:, topk]  # project the centered X onto the top-k eigenvectors
 
 
 def load_data(n_samples_per_class=500):
     theta = np.pi / 4
     scale = np.array([[2, 0], [0, 0.5]])  # scaling
     rotate = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])  # rotation
-    X = np.concatenate(
-        [
-            np.random.randn(n_samples_per_class, 2) + np.array([0, -2]),
-            np.random.randn(n_samples_per_class, 2) + np.array([0, +2]),
-        ]
-    )
+    X = np.concatenate([
+        np.random.randn(n_samples_per_class, 2) + np.array([0, -2]),
+        np.random.randn(n_samples_per_class, 2) + np.array([0, +2]),
+    ])
     X = X @ scale @ rotate  # scale and rotate the data
     y = np.array([0] * n_samples_per_class + [1] * n_samples_per_class)
     return X, y
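Aside: the touched topk lines in lda.py (and again in pca.py below) all use the same idiom: np.argsort sorts ascending, so reversing with [::-1] and slicing [:k] yields the indices of the k largest eigenvalues. A small worked example:

    import numpy as np

    L = np.array([0.5, 3.0, 1.2])   # eigenvalues
    topk = np.argsort(L)[::-1][:2]  # argsort -> [0, 2, 1]; reversed -> [1, 2, 0]; top 2 -> [1, 2]
    print(topk)                     # [1 2]: indices of the two largest eigenvalues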
12 changes: 5 additions & 7 deletions src/logistic_regression.py
@@ -41,12 +41,10 @@ def sigmoid(x):
 
 
 def load_data(n_samples_per_class=500):
-    X = np.concatenate(
-        [
-            np.random.randn(n_samples_per_class, 2) + np.array([1, -1]),
-            np.random.randn(n_samples_per_class, 2) + np.array([-1, 1]),
-        ]
-    )
+    X = np.concatenate([
+        np.random.randn(n_samples_per_class, 2) + np.array([1, -1]),
+        np.random.randn(n_samples_per_class, 2) + np.array([-1, 1]),
+    ])
     y = np.array([0] * n_samples_per_class + [1] * n_samples_per_class)
 
     training_set, test_set = np.split(np.random.permutation(len(X)), [int(len(X) * 0.6)])
@@ -58,7 +56,7 @@ def train_logistic_regression(model, X, y, epochs=100, batch_size=32):
     for _ in range(epochs):
         np.random.shuffle(indices)
         for i in range(batch_size, len(X) + 1, batch_size):
-            model.fit(X[indices[i - batch_size : i]], y[indices[i - batch_size : i]])
+            model.fit(X[indices[i - batch_size:i]], y[indices[i - batch_size:i]])
 
 
 if __name__ == "__main__":
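Aside: the reformatted slice in train_logistic_regression walks through the shuffled indices one mini-batch at a time; a trailing remainder smaller than batch_size is silently dropped. The pattern in isolation:

    import numpy as np

    indices = np.random.permutation(100)
    batch_size = 32
    for i in range(batch_size, len(indices) + 1, batch_size):  # i = 32, 64, 96
        batch = indices[i - batch_size:i]  # slices [0:32], [32:64], [64:96]; the last 4 indices are skipped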
5 changes: 2 additions & 3 deletions src/naive_bayes.py
@@ -28,9 +28,8 @@ def fit(self, X: np.ndarray, y: np.ndarray):
     def __call__(self, X: np.ndarray) -> np.ndarray:
         y_pred = np.zeros([len(X)], dtype=int)
         for i, x in enumerate(X):
-            P = np.log(self.P_prior) + np.array(
-                [np.log(p_cond[np.arange(len(x)), x]).sum() for p_cond in self.P_cond]
-            )  # log of the prior probability plus the log of the conditional probabilities
+            # Log of the prior probability plus the log of the conditional probabilities.
+            P = np.log(self.P_prior) + np.array([np.log(p_cond[np.arange(len(x)), x]).sum() for p_cond in self.P_cond])
             y_pred[i] = np.argmax(P)
         return y_pred
 
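Aside: the rewritten line gathers one conditional probability per feature with p_cond[np.arange(len(x)), x] (row j, column x[j] of a per-class table) and sums logs rather than multiplying raw probabilities, which avoids numerical underflow. The indexing in isolation, on an illustrative 3-feature table:

    import numpy as np

    # p_cond[j, v] = P(feature j takes value v | class); 3 features, 2 values each.
    p_cond = np.array([[0.9, 0.1],
                       [0.4, 0.6],
                       [0.7, 0.3]])
    x = np.array([0, 1, 1])
    probs = p_cond[np.arange(len(x)), x]  # [0.9, 0.6, 0.3]
    log_likelihood = np.log(probs).sum()  # log P(x | class) under the independence assumption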
14 changes: 6 additions & 8 deletions src/pca.py
@@ -22,18 +22,16 @@ def __init__(self, k: int):
     def __call__(self, X: np.ndarray):
         X_norm = X - X.mean(axis=0)  # center the data
        L, V = np.linalg.eig(X_norm.T @ X_norm)  # eigendecomposition of the covariance matrix
-        topk = np.argsort(L)[::-1][: self.k]  # indices of the top-k eigenvalues
+        topk = np.argsort(L)[::-1][:self.k]  # indices of the top-k eigenvalues
         return X_norm @ V[:, topk]  # project the centered X onto the top-k eigenvectors
 
 
 def load_data(n_samples_per_class=200):
-    X = np.concatenate(
-        [
-            np.random.randn(n_samples_per_class, 2) + np.array([2, 0]),
-            np.random.randn(n_samples_per_class, 2),
-            np.random.randn(n_samples_per_class, 2) + np.array([-2, 0]),
-        ]
-    )
+    X = np.concatenate([
+        np.random.randn(n_samples_per_class, 2) + np.array([2, 0]),
+        np.random.randn(n_samples_per_class, 2),
+        np.random.randn(n_samples_per_class, 2) + np.array([-2, 0]),
+    ])
     theta = np.pi / 4  # rotate 45° counterclockwise
     scale = np.diag([1.2, 0.5])  # scaling matrix
     rotate = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])  # rotation matrix
10 changes: 4 additions & 6 deletions src/perceptron.py
@@ -36,12 +36,10 @@ def pad(x):
 
 
 def load_data(n_samples_per_class=500):
-    X = np.concatenate(
-        [
-            np.random.randn(n_samples_per_class, 2) + np.array([1, -1]),
-            np.random.randn(n_samples_per_class, 2) + np.array([-1, 1]),
-        ]
-    )
+    X = np.concatenate([
+        np.random.randn(n_samples_per_class, 2) + np.array([1, -1]),
+        np.random.randn(n_samples_per_class, 2) + np.array([-1, 1]),
+    ])
     y = np.array([-1] * n_samples_per_class + [1] * n_samples_per_class)
 
     training_set, test_set = np.split(np.random.permutation(len(X)), [int(len(X) * 0.6)])
1 change: 0 additions & 1 deletion src/utils.py
@@ -5,7 +5,6 @@
 # @Author: Kun Luo
 # @Email : olooook@outlook.com
 
-
 from typing import List, Optional
 
 import numpy as np
