Decision Trees
import numpy as np
import matplotlib.pyplot as plt
from public_tests import *
from utils import *
%matplotlib inline
X_train = np.array([[1,1,1],[1,0,1],[1,0,0],[1,0,0],[1,1,1],[0,1,1],[0,0,0],[1,0,1],[0,1,0],[1,0,0]])
y_train = np.array([1,1,0,0,1,0,0,1,1,0])
print("First few elements of X_train:\n", X_train[:5])
print("Type of X_train:",type(X_train))
print("First few elements of y_train:", y_train[:5])
print("Type of y_train:",type(y_train))
print ('The shape of X_train is:', X_train.shape)
print ('The shape of y_train is: ', y_train.shape)
print ('Number of training examples (m):', len(X_train))
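As a quick sanity check on the toy dataset (an illustrative cell, not one of the graded exercises), you can confirm that the classes are balanced before computing any entropies:
# Illustrative check: the 10 examples should contain 5 edible (1) and 5 poisonous (0) mushrooms
print("Edible examples:", np.sum(y_train == 1), "out of", len(y_train))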
def compute_entropy(y):
    """
    Computes the entropy for the examples at a node
    Args:
        y (ndarray): Numpy array indicating whether each example at a node is
                     edible (`1`) or poisonous (`0`)
    Returns:
        entropy (float): Entropy at that node
    """
    # You need to return the following variables correctly
    entropy = 0.
    if len(y) != 0:
        # p1 is the fraction of edible (label 1) examples at the node
        p1 = len(y[y == 1]) / len(y)
        if p1 != 0 and p1 != 1:
            entropy = -p1 * np.log2(p1) - (1 - p1) * np.log2(1 - p1)
        else:
            entropy = 0.
    return entropy
# Compute entropy at the root node (i.e. with all examples)
# Since we have 5 edible and 5 non-edible mushrooms, the entropy should be 1
print("Entropy at root node: ", compute_entropy(y_train))
# UNIT TESTS
compute_entropy_test(compute_entropy)
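Beyond the balanced root node, it can help to sanity-check compute_entropy on an imbalanced node. This is an illustrative example with made-up labels, not one of the graded cells:
# Illustrative: a node with 2 positives and 1 negative should have entropy
# -(2/3)*log2(2/3) - (1/3)*log2(1/3) ≈ 0.918
print("Entropy of an imbalanced node:", compute_entropy(np.array([1, 1, 0])))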
def split_dataset(X, node_indices, feature):
    """
    Splits the data at the given node into
    left and right branches
    Args:
        X (ndarray): Data matrix of shape (n_samples, n_features)
        node_indices (list): List containing the active indices, i.e. the samples being considered at this step.
        feature (int): Index of feature to split on
    Returns:
        left_indices (list): Indices with feature value == 1
        right_indices (list): Indices with feature value == 0
    """
    # You need to return the following variables correctly
    left_indices = []
    right_indices = []
    # Send each active example to the left branch if the feature is 1, else to the right branch
    for i in node_indices:
        if X[i][feature] == 1:
            left_indices.append(i)
        else:
            right_indices.append(i)
    return left_indices, right_indices
# Case 1
root_indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# Feel free to play around with these variables
# The dataset only has three features, so this value can be 0 (Brown Cap), 1 (Tapering Stalk Shape) or 2 (Solitary)
feature = 0
left_indices, right_indices = split_dataset(X_train, root_indices, feature)
print("CASE 1:")
print("Left indices: ", left_indices)
print("Right indices: ", right_indices)
# Visualize the split
generate_split_viz(root_indices, left_indices, right_indices, feature)
print()
# Case 2
root_indices_subset = [0, 2, 4, 6, 8]
left_indices, right_indices = split_dataset(X_train, root_indices_subset, feature)
print("CASE 2:")
print("Left indices: ", left_indices)
print("Right indices: ", right_indices)
# Visualize the split
generate_split_viz(root_indices_subset, left_indices, right_indices, feature)
# UNIT TESTS
split_dataset_test(split_dataset)
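Since the notebook invites you to play around with these variables, here is one more illustrative case (not graded) that splits the full root on the Solitary feature instead:
# Illustrative extra case: split the root on feature 2 (Solitary)
solitary_left, solitary_right = split_dataset(X_train, root_indices, 2)
print("Solitary == 1 indices:", solitary_left)
print("Solitary == 0 indices:", solitary_right)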
def compute_information_gain(X, y, node_indices, feature):
    """
    Compute the information gain of splitting the node on a given feature
    Args:
        X (ndarray): Data matrix of shape (n_samples, n_features)
        y (array like): list or ndarray with n_samples containing the target variable
        node_indices (list): List containing the active indices, i.e. the samples being considered in this step.
        feature (int): Index of feature to split on
    Returns:
        information_gain (float): Information gain from making the split
    """
    # Split dataset
    left_indices, right_indices = split_dataset(X, node_indices, feature)
    # Some useful variables
    X_node, y_node = X[node_indices], y[node_indices]
    X_left, y_left = X[left_indices], y[left_indices]
    X_right, y_right = X[right_indices], y[right_indices]
    # You need to return the following variables correctly
    information_gain = 0
    # Entropy at the parent node and at each branch
    node_entropy = compute_entropy(y_node)
    left_entropy = compute_entropy(y_left)
    right_entropy = compute_entropy(y_right)
    # Fraction of examples sent to each branch
    w_left = len(X_left) / len(X_node)
    w_right = len(X_right) / len(X_node)
    weighted_entropy = w_left * left_entropy + w_right * right_entropy
    information_gain = node_entropy - weighted_entropy
    return information_gain
info_gain0 = compute_information_gain(X_train, y_train, root_indices, feature=0)
print("Information Gain from splitting the root on brown cap: ", info_gain0)
info_gain1 = compute_information_gain(X_train, y_train, root_indices, feature=1)
print("Information Gain from splitting the root on tapering stalk shape: ", info_gain1)
info_gain2 = compute_information_gain(X_train, y_train, root_indices, feature=2)
print("Information Gain from splitting the root on solitary: ", info_gain2)
# UNIT TESTS
compute_information_gain_test(compute_information_gain)
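If you want to double-check compute_information_gain by hand, the same quantity can be reassembled directly from compute_entropy and split_dataset. This is an illustrative cross-check, not part of the assignment:
# Illustrative cross-check: recompute the gain for feature 2 (Solitary) from its building blocks
chk_left, chk_right = split_dataset(X_train, root_indices, 2)
h_root = compute_entropy(y_train[root_indices])
h_left = compute_entropy(y_train[chk_left])
h_right = compute_entropy(y_train[chk_right])
w_l, w_r = len(chk_left) / len(root_indices), len(chk_right) / len(root_indices)
print("Manual gain on solitary:", h_root - (w_l * h_left + w_r * h_right))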
def get_best_split(X, y, node_indices):
    """
    Returns the index of the best feature on which to split the node data
    Args:
        X (ndarray): Data matrix of shape (n_samples, n_features)
        y (array like): list or ndarray with n_samples containing the target variable
        node_indices (list): List containing the active indices, i.e. the samples being considered in this step.
    Returns:
        best_feature (int): The index of the best feature to split on (-1 if no split improves the entropy)
    """
    # Some useful variables
    num_features = X.shape[1]
    # You need to return the following variables correctly
    best_feature = -1
    # Keep the feature with the largest (strictly positive) information gain
    max_info_gain = 0
    for feature in range(num_features):
        info_gain = compute_information_gain(X, y, node_indices, feature)
        if info_gain > max_info_gain:
            max_info_gain = info_gain
            best_feature = feature
    return best_feature
best_feature = get_best_split(X_train, y_train, root_indices)
print("Best feature to split on: %d" % best_feature)
# UNIT TESTS
get_best_split_test(get_best_split)
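The best feature generally changes once the data is narrowed down to a child node. As an illustrative (non-graded) check, you can re-run get_best_split on just the examples that would fall into one branch of the root:
# Illustrative: best split for the subset of examples with Solitary == 1
child_indices, _ = split_dataset(X_train, root_indices, 2)
print("Best feature for this child node:", get_best_split(X_train, y_train, child_indices))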
tree = []
def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
    """
    Build a tree using the recursive algorithm that splits the dataset into 2 subgroups at each node.
    This function just prints the tree.
    Args:
        X (ndarray): Data matrix of shape (n_samples, n_features)
        y (array like): list or ndarray with n_samples containing the target variable
        node_indices (list): List containing the active indices, i.e. the samples being considered in this step.
        branch_name (string): Name of the branch. ['Root', 'Left', 'Right']
        max_depth (int): Max depth of the resulting tree.
        current_depth (int): Current depth. Parameter used during recursive call.
    """
    # Maximum depth reached - stop splitting
    if current_depth == max_depth:
        formatting = " " * current_depth + "-" * current_depth
        print(formatting, "%s leaf node with indices" % branch_name, node_indices)
        return
    # Otherwise, get the best feature to split on at this node
    best_feature = get_best_split(X, y, node_indices)
    formatting = "-" * current_depth
    print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))
    # Split the dataset on the best feature and record the split for visualization
    left_indices, right_indices = split_dataset(X, node_indices, best_feature)
    tree.append((left_indices, right_indices, best_feature))
    # Continue splitting the left and the right children, incrementing the current depth
    build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth + 1)
    build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth + 1)
build_tree_recursive(X_train, y_train, root_indices, "Root", max_depth=2, current_depth=0)
generate_tree_viz(root_indices, y_train, tree)
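As a final illustrative check (assuming the cells above have been run), you can verify that with max_depth=2 the leaves of this tree end up pure, i.e. every depth-2 leaf has zero entropy:
# Illustrative: entropies of the four depth-2 leaves should all be 0 for this dataset
root_best = get_best_split(X_train, y_train, root_indices)
l1, r1 = split_dataset(X_train, root_indices, root_best)
for name, child in (("Left", l1), ("Right", r1)):
    child_best = get_best_split(X_train, y_train, child)
    cl, cr = split_dataset(X_train, child, child_best)
    print(name, "branch leaf entropies:", compute_entropy(y_train[cl]), compute_entropy(y_train[cr]))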