Commit f2f9591
LuoKun committed Jan 5, 2022 · 1 parent 50bff95
Showing 1 changed file with 32 additions and 46 deletions: src/decision_tree.py
@@ -12,68 +12,54 @@ class DecisionTree:
     Decision tree classifier (ID3 generation algorithm)
     """

-    def __init__(self, rate: float = 0.95):
-        self.rate, self.tree = rate, None
+    def __init__(self):
+        self.rate, self.root = None, None

-    def fit(self, X: np.ndarray, Y: np.ndarray):
-        indices = np.arange(X.shape[0])  # all row indices
-        features = np.arange(X.shape[1])  # all column indices
-        self.tree = self._create_tree(X, Y, indices, features)
+    def fit(self, X: np.ndarray, Y: np.ndarray, rate: float = 0.95):
+        self.rate = rate
+        self.root = self._create_tree(X, Y, np.arange(X.shape[0]), np.arange(X.shape[1]))

     def __call__(self, X: np.ndarray):
-        Y = np.zeros([len(X)], dtype=int)  # output array
-        for i, x in enumerate(X):
-            Y[i] = self._predict(self.tree, x)
-        return Y
+        return np.array([self._predict(self.root, x) for x in X])

-    def _predict(self, node, x):
+    def _predict(self, node, x: np.ndarray):
         if isinstance(node, dict):  # if the node is a subtree (dict)
             feature, trees = node["feature"], node["trees"]
             return self._predict(trees[x[feature]], x)  # recurse on the branch matching this feature value
         return node  # if the node is a leaf, return its value directly

-    def _create_tree(self, X, Y, indices, features):
-        category, rate = self._select_category(Y, indices)  # most frequent category and its frequency
-        if len(features) == 0 or rate > self.rate:  # no features left to split on, or purity threshold reached
-            return category  # return the dominant category
-        feature = self._select_feature(X, Y, indices, features)  # select the feature with minimal Shannon entropy
-        sub_trees = {}
-        sub_features = features[features != feature]  # remove the chosen feature
-        for v in np.unique(X[indices, feature]).tolist():  # build a subtree for each value of this feature
-            sub_indices = indices[X[indices, feature] == v]
-            sub_trees[v] = self._create_tree(X, Y, sub_indices, sub_features)  # recursively build the subtree
-        return {"feature": feature, "trees": sub_trees}
+    def _create_tree(self, X: np.ndarray, Y: np.ndarray, indices: np.ndarray, features: np.ndarray):
+        cat_counter = np.bincount(Y[indices])  # count occurrences of each category
+        if len(features) == 0 or np.max(cat_counter) / len(indices) > self.rate:  # no features left to split on, or purity threshold reached
+            return np.argmax(cat_counter)  # return the dominant category
+        k = np.argmax([self._calc_info_gain(X, Y, indices, f) for f in features])  # feature with maximal information gain
+        feature = features[k]
+        features = np.delete(features, k)  # remove the chosen feature
+        trees = {
+            value: self._create_tree(X, Y, indices[X[indices, feature] == value], features)
+            for value in np.unique(X[indices, feature]).tolist()  # build a subtree for each value of this feature
+        }
+        return {"feature": feature, "trees": trees}

     @staticmethod
-    def _select_category(Y, indices):
-        counter = np.bincount(Y[indices])  # count category frequencies
-        category = np.argmax(counter)
-        return category, counter[category] / len(indices)  # return the most frequent category and its frequency
-
-    @staticmethod
-    def _calc_entropy(Y, indices):  # compute empirical entropy
+    def _calc_exp_ent(Y: np.ndarray, indices: np.ndarray):  # compute empirical entropy
         prob = np.bincount(Y[indices]) / len(indices)
         prob = prob[prob.nonzero()]  # drop zero probabilities
         return np.sum(-prob * np.log(prob))  # empirical entropy

     @classmethod
-    def _calc_cond_entropy(cls, X, Y, indices, feature):  # compute conditional entropy
-        cond_ent = 0  # empirical conditional entropy
+    def _calc_cnd_ent(cls, X: np.ndarray, Y: np.ndarray, indices: np.ndarray, feature: int):  # compute conditional entropy
+        ent = 0  # empirical conditional entropy
         for value in np.unique(X[indices, feature]):
-            sub_indices = indices[X[indices, feature] == value]
-            cond_ent += len(sub_indices) / len(indices) * cls._calc_entropy(Y, sub_indices)
-        return cond_ent  # conditional entropy
-
-    @classmethod
-    def _calc_info_gain(cls, X, Y, indices, feature):  # compute information gain
-        ent = cls._calc_entropy(Y, indices)  # empirical entropy
-        cond_ent = cls._calc_cond_entropy(X, Y, indices, feature)  # empirical conditional entropy
-        return ent - cond_ent  # information gain
+            indices_ = indices[X[indices, feature] == value]
+            ent += len(indices_) / len(indices) * cls._calc_exp_ent(Y, indices_)
+        return ent  # conditional entropy

     @classmethod
-    def _select_feature(cls, X, Y, indices, features):
-        gains = np.array([cls._calc_info_gain(X, Y, indices, feature) for feature in features])  # information gain of every feature in features
-        return features[gains.argmax()]  # return the feature with maximal information gain
+    def _calc_info_gain(cls, X: np.ndarray, Y: np.ndarray, indices: np.ndarray, feature: int):  # compute information gain
+        exp_ent = cls._calc_exp_ent(Y, indices)  # empirical entropy
+        cnd_ent = cls._calc_cnd_ent(X, Y, indices, feature)  # empirical conditional entropy
+        return exp_ent - cnd_ent  # information gain


 def load_data():
@@ -103,11 +89,11 @@ def load_data():

 if __name__ == "__main__":
     x, y = load_data()
-    decision_tree = DecisionTree(rate=0.95)
-    decision_tree.fit(x, y)
+    decision_tree = DecisionTree()
+    decision_tree.fit(x, y, rate=0.95)
     pred = decision_tree(x)

-    print(decision_tree.tree)
+    print(decision_tree.root)
     print(y)
     print(pred)
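For intuition about what the renamed helpers compute, here is a small self-contained sketch (the toy arrays are invented for illustration, not the repository's dataset): the empirical entropy H(D) = -Σ p·log p of the labels, the conditional entropy H(D|A) of the labels given one feature, and their difference, the information gain that `_create_tree` maximizes when choosing a split.

import numpy as np

# Toy data: feature 0 separates the classes perfectly, feature 1 is pure noise.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([0, 0, 1, 1])
indices = np.arange(len(Y))

prob = np.bincount(Y) / len(Y)              # [0.5, 0.5]
exp_ent = np.sum(-prob * np.log(prob))      # H(D) = log 2 ≈ 0.693

def cnd_ent(feature):                       # empirical conditional entropy H(D|A)
    ent = 0.0
    for value in np.unique(X[indices, feature]):
        idx = indices[X[indices, feature] == value]
        p = np.bincount(Y[idx]) / len(idx)
        p = p[p.nonzero()]                  # drop zero probabilities
        ent += len(idx) / len(indices) * np.sum(-p * np.log(p))
    return ent

print(exp_ent - cnd_ent(0))  # ≈ 0.693: feature 0 removes all label uncertainty
print(exp_ent - cnd_ent(1))  # ≈ 0.0: feature 1 tells us nothing about the label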

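And a minimal usage sketch of the refactored API, reusing the toy data above and assuming `DecisionTree` is importable from src/decision_tree. The `rate` argument now arrives via fit() instead of the constructor and acts as a pre-pruning purity threshold: a node becomes a leaf once the majority class exceeds that fraction of its samples. The fitted root is a plain nested dict (internal nodes are {"feature": column, "trees": {value: subtree}}; leaves are class labels); since `_predict` indexes trees[x[feature]] directly, a feature value never seen during fitting would raise a KeyError.

import numpy as np
from src.decision_tree import DecisionTree

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([0, 0, 1, 1])

model = DecisionTree()
model.fit(X, Y, rate=0.95)   # purity threshold moved from __init__ to fit

print(model.root)  # {'feature': 0, 'trees': {0: 0, 1: 1}} (numpy scalar types elided)
print(model(X))    # [0 0 1 1]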