import numpy as np
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# Compute the cost-complexity pruning path: the effective alphas and the
# total leaf impurities of the corresponding pruned subtrees
clf = DecisionTreeClassifier(random_state=1)
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
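# To see how total leaf impurity grows as alpha increases, the path can be
# plotted first (a minimal sketch, assuming matplotlib is available):
import matplotlib.pyplot as plt

plt.plot(ccp_alphas, impurities, marker="o", drawstyle="steps-post")
plt.xlabel("effective alpha")
plt.ylabel("total impurity of leaves")
plt.show()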
# Fit one tree per candidate alpha
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=1, ccp_alpha=ccp_alpha)
    clf.fit(X_train, y_train)
    clfs.append(clf)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(clfs[-1].tree_.node_count, ccp_alphas[-1]))
# Drop the last tree and its alpha: it is the trivial single-node tree
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
train_scores = [clf.score(X_train, y_train) for clf in clfs]
test_scores = [clf.score(X_test, y_test) for clf in clfs]
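# Optional: plot accuracy against alpha to see where the test curve peaks
# (again a sketch assuming matplotlib):
import matplotlib.pyplot as plt

plt.plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
plt.plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
plt.xlabel("alpha")
plt.ylabel("accuracy")
plt.legend()
plt.show()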
# Pick the tree with the highest test accuracy
index_best_model = np.argmax(test_scores)
best_model = clfs[index_best_model]
print(best_model)
print('Training accuracy of best model:', best_model.score(X_train, y_train))
print('Test accuracy of best model:', best_model.score(X_test, y_test))
# Accuracy isn't the right metric for our data; we want high recall,
# so re-select the tree using recall instead
recall_train = []
for clf in clfs:
    pred_train3 = clf.predict(X_train)
    values_train = metrics.recall_score(y_train, pred_train3)
    recall_train.append(values_train)

recall_test = []
for clf in clfs:
    pred_test3 = clf.predict(X_test)
    values_test = metrics.recall_score(y_test, pred_test3)
    recall_test.append(values_test)
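# The same picture for recall (a sketch assuming matplotlib):
import matplotlib.pyplot as plt

plt.plot(ccp_alphas, recall_train, marker="o", label="train", drawstyle="steps-post")
plt.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
plt.xlabel("alpha")
plt.ylabel("recall")
plt.legend()
plt.show()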
# Select the model with the highest test recall
index_best_model = np.argmax(recall_test)
best_model = clfs[index_best_model]
print(best_model)
# Instead of pasting the printed parameters back in, refit the best tree
# directly from its alpha (equivalent, and runnable as written):
clf = DecisionTreeClassifier(random_state=1, ccp_alpha=ccp_alphas[index_best_model])
clf.fit(X_train, y_train)
make_confusion_matrix(clf, X_test, y_test)
get_accuracy_and_recall_score(clf, X_train, X_test, y_train, y_test)
get_feature_importances_and_visualize(clf, X_test)
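# The three helpers above are presumably defined earlier in the notebook.
# Minimal sketches of what they might look like -- the bodies below are
# assumptions built only from standard sklearn/matplotlib/pandas calls
# (sklearn >= 1.0 for ConfusionMatrixDisplay.from_estimator; X is assumed
# to be a pandas DataFrame):
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, recall_score

def make_confusion_matrix(estimator, X_test, y_test):
    # Confusion matrix of the fitted estimator on the test split
    ConfusionMatrixDisplay.from_estimator(estimator, X_test, y_test)
    plt.show()

def get_accuracy_and_recall_score(estimator, X_train, X_test, y_train, y_test):
    # Accuracy and recall on both splits
    for name, X, y in (("train", X_train, y_train), ("test", X_test, y_test)):
        pred = estimator.predict(X)
        print(f"{name}: accuracy={accuracy_score(y, pred):.3f}, "
              f"recall={recall_score(y, pred):.3f}")

def get_feature_importances_and_visualize(estimator, X_test):
    # Horizontal bar plot of feature importances (assumes DataFrame columns)
    importances = pd.Series(estimator.feature_importances_, index=X_test.columns)
    importances.sort_values().plot.barh()
    plt.xlabel("importance")
    plt.show()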