#####
#
# This script was written in Python 3.7 on 13 December 2019 by [REDACTED]
# with reference to Galarnyk, Michael. 2019. “Understanding Decision Trees for Classification (Python).” Medium. November 4, 2019.
# https://towardsdatascience.com/understanding-decision-trees-for-classification-python-9663d683c952.
#
#####

import csv

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# `sklearn.datasets.base` was removed in scikit-learn 0.24; `Bunch` lives in
# `sklearn.utils`. Fall back to the old location for very old installs.
try:
    from sklearn.utils import Bunch
except ImportError:
    from sklearn.datasets.base import Bunch

#  CHANGE BEFORE RUNNING:
#    - the input CSV file name (inside load_my_dataset)
#    - n_samples: number of data rows, NOT counting the header row
#    - n_features: number of feature columns, NOT counting the final target column
#    - feature_names: one label per feature column
#      (e.g. 'topic 0' ... 'topic N-1' for an N-topic MALLET composition file)
#    - target_names: one label per target class
#    - output_filename for the rendered decision-tree graph


def load_my_dataset():
    """Load 'mallet_composition_file.csv' as a scikit-learn Bunch.

    Expects one header row, then `n_samples` rows, each holding
    `n_features` float topic proportions followed by one integer
    target-class column.

    Returns:
        Bunch with `data` (n_samples x n_features float64 array),
        `target` (int array of class indices), `feature_names`,
        and `target_names`.
    """
    n_samples = 14619  # number of data rows, header excluded
    n_features = 25    # number of feature columns, target column excluded
    # Derive the labels from n_features instead of hand-typing the list,
    # so changing n_features keeps the two in sync.
    feature_names = ['topic %d' % i for i in range(n_features)]
    target_names = ['notes', 'summary', 'text']  # adjust accordingly

    data = np.empty((n_samples, n_features))
    # `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `int` gives the same platform-default integer dtype.
    target = np.empty((n_samples,), dtype=int)

    # newline='' is the documented way to open files for the csv module.
    with open('mallet_composition_file.csv', newline='') as csv_file:
        data_file = csv.reader(csv_file)
        next(data_file)  # skip the header row
        for i, sample in enumerate(data_file):
            data[i] = np.asarray(sample[:-1], dtype=np.float64)
            target[i] = int(sample[-1])

    return Bunch(data=data, target=target,
                 feature_names=feature_names, target_names=target_names)

# Load the composition file and move it into a DataFrame so the feature
# columns keep their names through training and reporting.
data = load_my_dataset()
print('dataset loaded')


df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Default split: 75% train / 25% test, reproducible via random_state=0.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[data.feature_names], df['target'], random_state=0)

from sklearn.tree import DecisionTreeClassifier

# Fit a deliberately tiny tree (3 leaves) so the rendered graph stays
# legible; DecisionTreeClassifier.fit returns the fitted estimator.
clf = DecisionTreeClassifier(max_leaf_nodes=3).fit(X_train, Y_train)

# Render the fitted tree to a file with graphviz.
from sklearn import tree
import graphviz

output_filename = "decision_tree_results"
dot_source = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=data.feature_names,
    class_names=data.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graphviz.Source(dot_source).render(output_filename)
print('output file: ', output_filename)

# Demonstration predictions (return values intentionally unused).
one_row = X_test.iloc[0].values.reshape(1, -1)
clf.predict(one_row)         # predict for 1 observation
clf.predict(X_test[0:10])    # predict for multiple observations

# Mean accuracy of the model on the held-out test set.
score = clf.score(X_test, Y_test)
print(score)

# Sweep max_depth = 1..10 and record the test-set accuracy of each tree.
max_depth_range = list(range(1, 11))
accuracy = []
for depth in max_depth_range:
    fitted = DecisionTreeClassifier(
        max_depth=depth, min_samples_leaf=1, random_state=0,
    ).fit(X_train, Y_train)
    accuracy.append(fitted.score(X_test, Y_test))
print(accuracy)



# Sweep max_leaf_nodes = 2..10 and record the test-set accuracy of each tree.
max_leaf_nodes = list(range(2, 11))
accuracy_nodes = []
for node in max_leaf_nodes:
    # random_state pinned for reproducible results, matching the depth
    # sweep above; without it, tree construction tie-breaks can vary
    # between runs (and so would the importances printed below).
    clf = DecisionTreeClassifier(max_leaf_nodes=node, random_state=0)
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    accuracy_nodes.append(score)
# NOTE: `clf` is left bound to the last sweep model (max_leaf_nodes=10);
# the feature-importance report below reads it.
print(accuracy_nodes)
#####



# Disable pandas output truncation so the full importances table prints.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# -1 for max_colwidth was deprecated in pandas 1.0 and rejected by pandas
# 2.x; None is the supported "no limit" value.
pd.set_option('display.max_colwidth', None)

# Feature importances of the most recently fitted tree (the final model
# from the max_leaf_nodes sweep), sorted descending.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': np.round(clf.feature_importances_, 3),
})
importances = importances.sort_values('importance', ascending=False)
print(importances)

print("Done")