Commit f2f9591
LuoKun committed Jan 5, 2022 · 1 parent 50bff95
Showing 1 changed file with 32 additions and 46 deletions: src/decision_tree.py
@@ -12,68 +12,54 @@ class DecisionTree:
     Decision tree classifier (ID3 generation algorithm)
     """

-    def __init__(self, rate: float = 0.95):
-        self.rate, self.tree = rate, None
+    def __init__(self):
+        self.rate, self.root = None, None

-    def fit(self, X: np.ndarray, Y: np.ndarray):
-        indices = np.arange(X.shape[0])  # all row indices
-        features = np.arange(X.shape[1])  # all column indices
-        self.tree = self._create_tree(X, Y, indices, features)
+    def fit(self, X: np.ndarray, Y: np.ndarray, rate: float = 0.95):
+        self.rate = rate
+        self.root = self._create_tree(X, Y, np.arange(X.shape[0]), np.arange(X.shape[1]))

     def __call__(self, X: np.ndarray):
-        Y = np.zeros([len(X)], dtype=int)  # output array
-        for i, x in enumerate(X):
-            Y[i] = self._predict(self.tree, x)
-        return Y
+        return np.array([self._predict(self.root, x) for x in X])

-    def _predict(self, node, x):
+    def _predict(self, node, x: np.ndarray):
         if isinstance(node, dict):  # if the node is a subtree (dict)
             feature, trees = node["feature"], node["trees"]
             return self._predict(trees[x[feature]], x)  # recurse on the branch matching this feature value
         return node  # if the node is a leaf, return its value directly

-    def _create_tree(self, X, Y, indices, features):
-        category, rate = self._select_category(Y, indices)  # most frequent category and its frequency
-        if len(features) == 0 or rate > self.rate:  # no features left to split on, or purity threshold reached
-            return category  # return the dominant category
-        feature = self._select_feature(X, Y, indices, features)  # select the feature with minimal Shannon entropy
-        sub_trees = {}
-        sub_features = features[features != feature]  # remove the chosen feature
-        for v in np.unique(X[indices, feature]).tolist():  # build a subtree for each value of this feature
-            sub_indices = indices[X[indices, feature] == v]
-            sub_trees[v] = self._create_tree(X, Y, sub_indices, sub_features)  # recursively build the subtree
-        return {"feature": feature, "trees": sub_trees}
+    def _create_tree(self, X: np.ndarray, Y: np.ndarray, indices: np.ndarray, features: np.ndarray):
+        cat_counter = np.bincount(Y[indices])  # count occurrences of each category
+        if len(features) == 0 or np.max(cat_counter) / len(indices) > self.rate:  # no features left to split on, or purity threshold reached
+            return np.argmax(cat_counter)  # return the dominant category
+        k = np.argmax([self._calc_info_gain(X, Y, indices, f) for f in features])  # feature with maximal information gain
+        feature = features[k]
+        features = np.delete(features, k)  # remove the chosen feature
+        trees = {
+            value: self._create_tree(X, Y, indices[X[indices, feature] == value], features)
+            for value in np.unique(X[indices, feature]).tolist()  # build a subtree for each value of this feature
+        }
+        return {"feature": feature, "trees": trees}

     @staticmethod
-    def _select_category(Y, indices):
-        counter = np.bincount(Y[indices])  # count category frequencies
-        category = np.argmax(counter)
-        return category, counter[category] / len(indices)  # return the most frequent category and its frequency
-
-    @staticmethod
-    def _calc_entropy(Y, indices):  # compute empirical entropy
+    def _calc_exp_ent(Y: np.ndarray, indices: np.ndarray):  # compute empirical entropy
         prob = np.bincount(Y[indices]) / len(indices)
         prob = prob[prob.nonzero()]  # drop zero probabilities
         return np.sum(-prob * np.log(prob))  # empirical entropy

     @classmethod
-    def _calc_cond_entropy(cls, X, Y, indices, feature):  # compute conditional entropy
-        cond_ent = 0  # empirical conditional entropy
+    def _calc_cnd_ent(cls, X: np.ndarray, Y: np.ndarray, indices: np.ndarray, feature: int):  # compute conditional entropy
+        ent = 0  # empirical conditional entropy
         for value in np.unique(X[indices, feature]):
-            sub_indices = indices[X[indices, feature] == value]
-            cond_ent += len(sub_indices) / len(indices) * cls._calc_entropy(Y, sub_indices)
-        return cond_ent  # conditional entropy
-
-    @classmethod
-    def _calc_info_gain(cls, X, Y, indices, feature):  # compute information gain
-        ent = cls._calc_entropy(Y, indices)  # empirical entropy
-        cond_ent = cls._calc_cond_entropy(X, Y, indices, feature)  # empirical conditional entropy
-        return ent - cond_ent  # information gain
+            indices_ = indices[X[indices, feature] == value]
+            ent += len(indices_) / len(indices) * cls._calc_exp_ent(Y, indices_)
+        return ent  # conditional entropy

     @classmethod
-    def _select_feature(cls, X, Y, indices, features):
-        gains = np.array([cls._calc_info_gain(X, Y, indices, feature) for feature in features])  # information gain of every feature in features
-        return features[gains.argmax()]  # return the feature with maximal information gain
+    def _calc_info_gain(cls, X: np.ndarray, Y: np.ndarray, indices: np.ndarray, feature: int):  # compute information gain
+        exp_ent = cls._calc_exp_ent(Y, indices)  # empirical entropy
+        cnd_ent = cls._calc_cnd_ent(X, Y, indices, feature)  # empirical conditional entropy
+        return exp_ent - cnd_ent  # information gain


 def load_data():
@@ -103,11 +89,11 @@ def load_data():

 if __name__ == "__main__":
     x, y = load_data()
-    decision_tree = DecisionTree(rate=0.95)
-    decision_tree.fit(x, y)
+    decision_tree = DecisionTree()
+    decision_tree.fit(x, y, rate=0.95)
     pred = decision_tree(x)

-    print(decision_tree.tree)
+    print(decision_tree.root)
     print(y)
     print(pred)
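For intuition about what the renamed helpers compute, here is a small self-contained sketch (the toy arrays are invented for illustration, not the repository's dataset): the empirical entropy H(D) = -Σ p·log p of the labels, the conditional entropy H(D|A) of the labels given one feature, and their difference, the information gain that `_create_tree` maximizes when choosing a split.

import numpy as np

# Toy data: feature 0 separates the classes perfectly, feature 1 is pure noise.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([0, 0, 1, 1])
indices = np.arange(len(Y))

prob = np.bincount(Y) / len(Y)              # [0.5, 0.5]
exp_ent = np.sum(-prob * np.log(prob))      # H(D) = log 2 ≈ 0.693

def cnd_ent(feature):                       # empirical conditional entropy H(D|A)
    ent = 0.0
    for value in np.unique(X[indices, feature]):
        idx = indices[X[indices, feature] == value]
        p = np.bincount(Y[idx]) / len(idx)
        p = p[p.nonzero()]                  # drop zero probabilities
        ent += len(idx) / len(indices) * np.sum(-p * np.log(p))
    return ent

print(exp_ent - cnd_ent(0))  # ≈ 0.693: feature 0 removes all label uncertainty
print(exp_ent - cnd_ent(1))  # ≈ 0.0: feature 1 tells us nothing about the label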

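And a minimal usage sketch of the refactored API, reusing the toy data above and assuming `DecisionTree` is importable from src/decision_tree. The `rate` argument now arrives via fit() instead of the constructor and acts as a pre-pruning purity threshold: a node becomes a leaf once the majority class exceeds that fraction of its samples. The fitted root is a plain nested dict (internal nodes are {"feature": column, "trees": {value: subtree}}; leaves are class labels); since `_predict` indexes trees[x[feature]] directly, a feature value never seen during fitting would raise a KeyError.

import numpy as np
from src.decision_tree import DecisionTree

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([0, 0, 1, 1])

model = DecisionTree()
model.fit(X, Y, rate=0.95)   # purity threshold moved from __init__ to fit

print(model.root)  # {'feature': 0, 'trees': {0: 0, 1: 1}} (numpy scalar types elided)
print(model(X))    # [0 0 1 1]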