计算熵函数(二元熵)
def compute_entropy(y):
"""
Computes the entropy for
Args:
y (ndarray): Numpy array indicating whether each example at a node is
edible (`1`) or poisonous (`0`)
Returns:
entropy (float): Entropy at that node
"""
entropy = 0
p=len(y[y==1])/len(y)
if pos==size or pos==0:
entropy=0
else:
entropy=-p*np.log2(p)-(1-p)*np.log2(1-p)
return entropy
根据特征分裂结点
def split_dataset(X, node_indices, feature):
"""
Splits the data at the given node into
left and right branches
Args:
X (ndarray): Data matrix of shape(n_samples, n_features)
node_indices (ndarray): List containing the active indices. I.e, the samples being considered at this step.
feature (int): Index of feature to split on
Returns:
left_indices (ndarray): Indices with feature value == 1
right_indices (ndarray): Indices with feature value == 0
"""
left_indices = []
right_indices = []
for i in node_indices:
if X[i,feature]==1:
left_indices.append(i)
else:
right_indices.append(i)
return left_indices, right_indices
计算信息增益
def compute_information_gain(X, y, node_indices, feature):
"""
Compute the information of splitting the node on a given feature
Args:
X (ndarray): Data matrix of shape(n_samples, n_features)
y (array like): list or ndarray with n_samples containing the target variable
node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
Returns:
cost (float): Cost computed
"""
left_indices, right_indices = split_dataset(X, node_indices, feature)
X_node, y_node = X[node_indices], y[node_indices]
X_left, y_left = X[left_indices], y[left_indices]
X_right, y_right = X[right_indices], y[right_indices]
information_gain = 0
lefts=len(y_left)
rights=len(y_right)
w_left=lefts/(lefts+rights)
w_right=rights/(lefts+rights)
H_left=compute_entropy(y_left)
H_right=compute_entropy(y_right)
H_root=compute_entropy(y_node)
information_gain=H_root-(w_left*H_left+w_right*H_right)
return information_gain
选择最佳分裂特征
def get_best_split(X, y, node_indices):
"""
Returns the optimal feature and threshold value
to split the node data
Args:
X (ndarray): Data matrix of shape(n_samples, n_features)
y (array like): list or ndarray with n_samples containing the target variable
node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
Returns:
best_feature (int): The index of the best feature to split
"""
num_features = X.shape[1]
best_feature = -1
IG=0
for i in range(num_features):
ig=compute_information_gain(X,y,node_indices,i)
if ig>IG:
IG=ig
best_feature=i
return best_feature
随机森林算法
def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
"""
Build a tree using the recursive algorithm that split the dataset into 2 subgroups at each node.
This function just prints the tree.
Args:
X (ndarray): Data matrix of shape(n_samples, n_features)
y (array like): list or ndarray with n_samples containing the target variable
node_indices (ndarray): List containing the active indices. I.e, the samples being considered in this step.
branch_name (string): Name of the branch. ['Root', 'Left', 'Right']
max_depth (int): Max depth of the resulting tree.
current_depth (int): Current depth. Parameter used during recursive call.
"""
if current_depth == max_depth:
formatting = " "*current_depth + "-"*current_depth
print(formatting, "%s leaf node with indices" % branch_name, node_indices)
return
best_feature = get_best_split(X, y, node_indices)
tree.append((current_depth, branch_name, best_feature, node_indices))
formatting = "-"*current_depth
print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))
left_indices, right_indices = split_dataset(X, node_indices, best_feature)
build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth+1)
build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth+1)