Haikoo Khandor
July 10, 2023
%pip install xlwt # pip installable libraries
Requirement already satisfied: xlwt in /usr/local/lib/python3.10/dist-packages (1.3.0)
import random # Necessary imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import _tree
import xlwt
import csv
import json
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from google.colab import drive # mounting google drive for the dataset
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
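The Adult census dataset is then loaded from the mounted Drive into a dataframe df and previewed with df.head(), which produces the table below. The exact path on Drive is not shown in this section, so the cell here is only a hypothetical reconstruction:
# Hypothetical load cell -- the actual Drive path/filename is an assumption
df = pd.read_csv('/content/drive/MyDrive/adult.csv')  # UCI Adult census dataset
df.head()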
| | age | workclass | fnlwgt | education | education.num | marital.status | occupation | relationship | race | sex | capital.gain | capital.loss | hours.per.week | native.country | income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 90 | ? | 77053 | HS-grad | 9 | Widowed | ? | Not-in-family | White | Female | 0 | 4356 | 40 | United-States | <=50K |
| 1 | 82 | Private | 132870 | HS-grad | 9 | Widowed | Exec-managerial | Not-in-family | White | Female | 0 | 4356 | 18 | United-States | <=50K |
| 2 | 66 | ? | 186061 | Some-college | 10 | Widowed | ? | Unmarried | Black | Female | 0 | 4356 | 40 | United-States | <=50K |
| 3 | 54 | Private | 140359 | 7th-8th | 4 | Divorced | Machine-op-inspct | Unmarried | White | Female | 0 | 3900 | 40 | United-States | <=50K |
| 4 | 41 | Private | 264663 | Some-college | 10 | Separated | Prof-specialty | Own-child | White | Female | 0 | 3900 | 40 | United-States | <=50K |
Let us say we have a dataset \(D\) containing \(n\) numerical variables and \(c\) categorical/enum variables. A constraint on a variable defines the values it cannot take. These constraints are often non-intuitive: two variables may be related so that only particular combinations satisfy them (for instance, education and education.num must encode the same education level). It is impractical for a human to search every constraint by hand without missing a few or noting a wrong one, so we use decision trees to learn these constraints.
Now, we are given a var file which lists all the variables, their types, their lowest and highest values (if specified) and their default values. This file has a fixed format that a machine can read easily. The functions below use it to extract the variable information: var_int_finder handles numerical (int and float) variables, while var_enum_finder handles categorical variables.
# The row passed contains int or float; there are two possible forms, namely:
# int O5 = 2
# int[2..4] a = 3
# In the first case no range is given, so we take 0-1000 as the default range; otherwise we use the given range, and the assigned value as the default value.
# Input: first column of the row, i.e. the row passed here is the first column of a csv row
# Output: name (of the variable), default (the assigned default value), low and high
# Function: extracts the name, default, low and high from the first column passed and handles the two cases mentioned above
def var_int_finder(row):
# Initializing the defaults
name = ''
low = 0
high = 1000
default = 0
indicator = -1 # whether we have a [] or not
left_brac = -1 # respective indexes
right_brac = -1
double_dot = -1
for i in range(len(row)):
if row[i] == '[': # storing index of left square bracket if present
left_brac = i
if i != len(row)-1:
# storing index of double dot(..) if present
if row[i] == '.' and row[i+1] == '.':
double_dot = i
if row[i] == ']': # storing index of right square bracket if present
right_brac = i
indicator = i
# if we have a [], then storing the name as per the index
if row[i] == '=' and indicator >= 0:
name = row[indicator+1:i]
default = row[i+1:]
if left_brac != -1:
low = row[left_brac+1:double_dot]
high = row[double_dot+2:right_brac]
if indicator == -1: # if not []
i = 0
while row[i] != 't': # checking for int/float last letter which is t
i += 1
j = i
while row[j] != '=':
j += 1
name = row[i+1:j]
default = row[j+1:]
return name, default, low, high
# row passed has enum
# Input: a csv row whose first column contains the keyword "enum"
# Output: type_enum (the name of the enum variable), possible (list of all values the enum variable can take)
# Function: Extracts the type-enum and possible list for each row consisting of enum variables
def var_enum_finder(row):
check = 0
while row[0][check] != 'm': # checks for m
check += 1
opening = 0
while row[0][opening] != '{':
opening += 1
type_enum = row[0][check+2:opening]
possible = [] # all possible elements of the enum variable
possible.append(row[0][opening+1:].strip())
blank_check = 0
while (len(row[blank_check]) != 0): # checking for number of elements
blank_check += 1
if blank_check == len(row):
break
if blank_check == 1: # in case of one option
possible[0] = possible[0][0:-1]
for i in range(blank_check-1):
if i != blank_check-2:
possible.append(row[i+1].strip())
else:
possible.append(row[i+1][:-1].strip())
return type_enum, possible
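As a quick sanity check, the two parsers can be called on hand-written rows. The exact format of adult_var_new.csv is not shown here, so the rows below are only illustrative, inferred from the parsing code and from the master_dict output further down; note that default, low and high come back as strings and are cast to int/float later.
# Illustrative rows (not taken from the actual var file)
print(var_int_finder('int O5 = 2'))            # (' O5 ', ' 2', 0, 1000)  -- no range given
print(var_int_finder('int[2..4] a = 3'))       # (' a ', ' 3', '2', '4')  -- explicit range
print(var_enum_finder(['enum sex{Male', ' Female}']))  # ('sex', ['Male', 'Female'])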
# This loop goes through all the rows of the var file, categorizes it based on int/float and enum, passes it to var_int_finder and var_enum_finder, gets the
# values and appends it to master_dict and enum_store respectively
master_dict = {}  # numerical variables: name -> [type, low, high, default]
enum_store = {}   # categorical variables: name -> list of possible values
with open('adult_var_new.csv') as file_obj: # parsing the var file
reader_obj = csv.reader(file_obj)
for row in reader_obj:
if 'int ' in row[0] or 'int[' in row[0]:
name, default, low, high = var_int_finder(row[0])
master_dict[name] = list()
master_dict[name].append('int')
master_dict[name].append(int(low))
master_dict[name].append(int(high))
master_dict[name].append(int(default))
elif 'float ' in row[0] or 'float[' in row[0]:
name, default, low, high = var_int_finder(row[0])
master_dict[name] = list()
master_dict[name].append('float')
master_dict[name].append(float(low))
master_dict[name].append(float(high))
master_dict[name].append(float(default))
elif 'enum' in row[0]:
type_num, possible = var_enum_finder(row)
enum_store[type_num] = list(possible)
master_dict_keys = list(master_dict.keys())  # names of the numerical variables (printed below)
{' age ': ['int', 17, 90, 45],
' fnlwgt ': ['int', 12285, 1484705, 15000],
' education.num ': ['int', 1, 16, 10],
' capital.gain ': ['int', 0, 99999, 50000],
' capital.loss ': ['int', 0, 4356, 2000],
' hours.per.week ': ['int', 1, 99, 5]}
[' age ',
' fnlwgt ',
' education.num ',
' capital.gain ',
' capital.loss ',
' hours.per.week ']
After getting the variable information, we perform a depth-first search on the decision tree learned by scikit-learn/XGBoost. For each node we need the left child, the right child, the variable used at the split and the split threshold. For scikit-learn this is exposed through built-in attributes of the fitted tree; for XGBoost we recurse over the JSON dump of the tree, which is a nested dictionary. We one-hot encode the categorical variables, since both libraries internally follow a 0/1 encoding rule for each category of an enum variable.
For XGBoost
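The JSON dump obtained later via booster.get_dump(dump_format='json') is a nested dictionary: internal nodes carry "split", "split_condition", "yes"/"no" child ids and a "children" list, while leaves carry only "nodeid" and "leaf". The node below is purely illustrative (hypothetical feature name and values), but it shows exactly what extract_splits and extract_children walk through:
# Illustrative dumped node (values are made up)
example_node = {
    "nodeid": 0, "split": "age", "split_condition": 30.5,
    "yes": 1, "no": 2, "missing": 1,
    "children": [
        {"nodeid": 1, "leaf": -0.4},
        {"nodeid": 2, "leaf": 0.7},
    ],
}
print(extract_splits(example_node, [], []))    # (['age', -1, -1], [30.5, -1, -1])
print(extract_children(example_node, [], []))  # ([1, -1, -1], [2, -1, -1])
These per-node lists are produced in depth-first order and are re-indexed by nodeid inside params_XG.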
def extract_splits(tree_dict, splits=[], threshold=[]):
if "leaf" in tree_dict:
splits.append(-1)
threshold.append(-1)
elif "split" in tree_dict:
splits.append(tree_dict["split"])
threshold.append(tree_dict["split_condition"])
if "children" in tree_dict:
for child in tree_dict["children"]:
extract_splits(child, splits, threshold)
return splits, threshold
def extract_children(tree_dict, left_children=[], right_children=[]):
if tree_dict["nodeid"] == 0:
left_children.append(tree_dict["yes"])
right_children.append(tree_dict["no"])
if "children" in tree_dict:
for child in tree_dict["children"]:
if "yes" in child:
left_children.append(child["yes"])
right_children.append(child["no"])
elif "no" in child:
left_children.append(child["yes"])
right_children.append(child["no"])
extract_children(child, left_children, right_children)
else:
left_children.append(-1)
right_children.append(-1)
return left_children, right_children
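params_XG below also relies on extract_nodeids to recover the node ids in the same depth-first order as the two helpers above. Its definition is not shown in this section; a minimal sketch consistent with those helpers (an assumption, not necessarily the original) is:
def extract_nodeids(tree_dict, nodeids=[]):
    # collect node ids in the same preorder DFS as extract_splits/extract_children
    nodeids.append(tree_dict["nodeid"])
    if "children" in tree_dict:
        for child in tree_dict["children"]:
            extract_nodeids(child, nodeids)
    return nodeids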
The params function returns the essential parameters required:

1. true_classes: class labels
2. children_left: 0-indexed array where children_left[i] is the left child of the i-th node
3. children_right: 0-indexed array where children_right[i] is the right child of the i-th node
4. feature: 0-indexed array where input_vars[feature[i]] is the variable used for splitting the i-th node
5. threshold: array where threshold[i] is the split threshold at the i-th node (-2 for a leaf node)
6. input_vars: all numerical + categorical variables in one-hot encoded format
7. last_node: last leaf node, used to stop the recursion
8. adjacency_list: a simple parent-to-children key-value dictionary
9. output: the name of the target column
# Returns the essential parameters of the tree:
# children_left: ex: [1,2,3,4,5] means 0th node has 1st node as left child, 1st node has 2nd node as left child and so on..
# children_right: ex: [1,2,3,4,5] means 0th node has 1st node as right child, 1st node has 2nd node as right child and so on..
# feature: ex: [1,2,3,4,5] and let input_vars = [a,b,c,d,e] means 0th node has feature index as 1 i.e. input_vars[1] i.e. b is the feature used for splitting the node
# threshold: ex: [2.5,3.5,-2,-2] means 0th node has threshold split as 2.5, a value of -2 means it is a leaf node
# input_vars: consists of all the num + one-hot encoded categorical
# last_node: last leaf node
# adjacency_list: consists of the children of each node, empty if no child i.e leaf node
def params_sklearn(instance, dataset):
true_classes = {} # classification target names
for i in range(len(np.unique(dataset.iloc[:, -1:]))):
true_classes[i] = np.unique(dataset.iloc[:, -1:])[i]
# id of the left child of node i or -1 if leaf node, eg: if 2 is Left child of 0 then at index 0, 2 will be stored
children_left = instance.tree_.children_left
# id of the right child of node i or -1 if leaf node, eg: if 2 is Right child of 0 then at index 0, 2 will be stored
children_right = instance.tree_.children_right
feature = instance.tree_.feature # index of feature used for splitting node i
threshold = instance.tree_.threshold # threshold value at node i
df_x = dataset.drop('income', axis=1) # Train
df_y = dataset['income'] # To be predicted
df_encoded = pd.get_dummies(df_x, drop_first=False)
input_vars = df_encoded.columns.values.tolist() # feature names
last_node = children_right.max() # last child
# stores the parent nodes and their child nodes in a key(parent)->value list(children)
adjacency_list = {}
n_nodes = instance.tree_.node_count # total node count
output = df_y.name
for i in range(n_nodes):
adjacency_list[i] = list()
if (children_left[i] > 0):
adjacency_list[i].append(children_left[i])
if (children_right[i] > 0):
adjacency_list[i].append(children_right[i])
return true_classes, children_left, children_right, feature, threshold, input_vars, last_node, adjacency_list, output
def params_XG(tree_obj, dataset):
# true_classes = {0:'<=50k', 1:">50k"}
true_classes = [0, 1]
df_x = dataset.drop('income', axis=1) # Train
df_y = dataset['income'] # To be predicted
df_encoded = pd.get_dummies(df_x, drop_first=False)
input_vars = df_encoded.columns.values.tolist() # feature names
left_children, right_children = extract_children(tree_obj, [], [])
nodeids = extract_nodeids(tree_obj, [])
splits, threshold = extract_splits(tree_obj, [], [])
fin_splits = [0]*len(splits)
fin_threshold = [0]*len(threshold)
fin_left_children = [0]*len(left_children)
fin_right_children = [0]*len(right_children)
for i in range(len(splits)):
fin_splits[nodeids[i]] = splits[i]
fin_threshold[nodeids[i]] = threshold[i]
fin_left_children[nodeids[i]] = left_children[i]
fin_right_children[nodeids[i]] = right_children[i]
fin_adjacency_list = {}
for i in range(len(splits)):
fin_adjacency_list[i] = list()
if (fin_left_children[i] > 0):
fin_adjacency_list[i].append(fin_left_children[i])
if (fin_right_children[i] > 0):
fin_adjacency_list[i].append(fin_right_children[i])
last_node = nodeids[-1]
output = 'income'
return true_classes, fin_left_children, fin_right_children, fin_splits, fin_threshold, input_vars, last_node, fin_adjacency_list, output
Common function for choosing the library based on the user input
def params(id, instance=[], dataset=[], tree_obj=[]):
if id == 0: # sklearn
true_classes, left_children, right_children, splits, threshold, input_vars, last_node, adjacency_list, output = params_sklearn(
instance, dataset)
return true_classes, left_children, right_children, splits, threshold, input_vars, last_node, adjacency_list, output
elif id == 1: # XGBoost
true_classes, left_children, right_children, splits, threshold, input_vars, last_node, adjacency_list, output = params_XG(
tree_obj, dataset)
return true_classes, left_children, right_children, splits, threshold, input_vars, last_node, adjacency_list, output
After storing the constraints for each variable, we use these data structures to extract a value for each variable. The workflow is as follows (a hypothetical example is shown after this list):

1. path_cat contains the categorical variables found along the tree path, each with a Yes/No answer. If Yes, the variable takes that category's value; if No, a value is chosen at random from all categories except those with a "No" attached. If the variable is not found in the path at all, a random value from its possible set is chosen.
2. path_num contains the numerical variables found along the tree path. We separate the greater-than and less-than-or-equal constraints into two lists and generate a value between the maximum of the ">" thresholds and the minimum of the "<=" thresholds. If constraints appear on only one side, the other bound is taken from the range given in the var file. The range is satisfied at all times.
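For illustration, a single root-to-leaf path might hand post_process dictionaries like the following (hypothetical contents, not taken from the actual run):
# Hypothetical contents collected along one root-to-leaf path
path_num = {'age': ['>', 30.5, '<=', 61.5],      # 30.5 < age <= 61.5
            'hours.per.week': ['<=', 42.5]}      # only an upper bound on this path
path_cat = {'sex_Female': 'Yes',                 # one-hot column answered Yes -> sex = Female
            'workclass_Private': 'No'}           # answered No -> Private is excluded
# post_process would draw age between 30.5 and 61.5, draw hours.per.week between its
# var-file lower bound and 42.5, set sex to Female, pick workclass from every class
# except Private, and fill all remaining variables randomly within their ranges.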
# Input: path_cat - dictionary containing only categorical variables with yes no instances
# path_num - dictionary containing only numerical variables with a list consisting of <,>= conditions
# dataset: dataset
# Returns: a testcase is appended to the main list
# Function: Numerical: processes the path_num and generates the value based on the constraints for each feature, if feature not found, generate value between low and high from var file
# Categorical: processes the path_cat and makes a not_list, then chooses value based on the constraints and again if feature not found, choose random class from the list of that variable
def post_process(path_cat, path_num, ans_cat={}, ans_num={}):
ans_cat = {} # stores categorical variables
ans_num = {} # stores int, float variables
not_list = {} # not-list
# Numerical starts
for i in path_num.keys(): # iterating through the input vars in the path dictionary
less_than = [] # stores all values which come after "<="
greater = [] # stores all values which come after ">"
# iterating through the list for that input var
for j in range(len(path_num[i])):
if path_num[i][j] == '<=':
less_than.append(path_num[i][j+1])
elif path_num[i][j] == '>':
greater.append(path_num[i][j+1])
max_2 = master_dict[str(i)][2]
min_2 = master_dict[str(i)][1]
# <=, <= -
# <=, >
# >, >
# not seen
if len(greater) > 0:
min_2 = max(greater)
# less_than = [7073.5], greater = [500], min_2 = 500, max_2 = 7073.5
if len(less_than) > 0:
max_2 = min(less_than)
if master_dict[i][0] == 'int':
if (max_2-min_2 < 1):
ans_num[i] = (max_2 + min_2)/2
else:
ans_num[i] = np.random.randint(min_2, max_2)
else:
# generating the test case value
ans_num[i] = random.uniform(min_2, max_2)
for j in master_dict_keys: # if some features not present in the test case, populate it with a random value
if j not in ans_num:
if master_dict[j][0] == 'int':
if master_dict[j][2]-master_dict[j][1] < 1:
ans_num[j] = (master_dict[j][1]+master_dict[j][2])/2
else:
ans_num[j] = np.random.randint(
master_dict[j][1], master_dict[j][2])
else:
ans_num[j] = random.uniform(
master_dict[j][1], master_dict[j][2])
# Numerical ends
# Categorical starts
for k, v in path_cat.items():
for j in enum_store_key:
# if that cat variable is in the path # native.country(j), one-hot-encoding: native.country_India(k)
if j in k:
if v == "No": # if no
if j not in not_list:
not_list[j] = list()
not_list[j].append(k) # append to list
elif v == "Yes": # if yes append directly
ans_cat[j] = k[len(j)+1:]
not_list_2 = {} # removing the unnecessary pre-names like ex: native.country_India to India
keys = list(not_list.keys())
for i in range(len(not_list)):
cut = len(keys[i])
not_list_2[keys[i]] = list()
for j in range(len(not_list[keys[i]])):
not_list_2[keys[i]].append(not_list[keys[i]][j][cut+1:])
for j in enum_store_key:
if j not in ans_cat and j in not_list_2:
# picking ans from whole - not_list
temp = [i for i in enum_store[j] if i not in not_list_2[j]]
ans_cat[j] = random.choice(temp)
# if feature not found in the path, populate it with a random name
elif j not in ans_cat and j not in not_list_2:
ans_cat[j] = random.choice(enum_store[j])
# Categorical ends
ans_cat[output] = random.choice(true_classes)
# concatenate the dictionaries, append it to main
res = {**ans_num.copy(), **ans_cat.copy()}
main.append(res.copy())
traverse_sk performs the DFS on the tree using the adjacency list. The base case is a leaf node. If the current node is not a leaf, we determine whether each child is a left or right child and whether the split variable is numerical or categorical, store the appropriate constraint, and recurse on the children. If it is a leaf node, we have reached the end of a path and store the values collected so far. Since the DFS does not restart from the root for every path, we cannot simply empty the entire data structure; we have to delete only certain elements from it. Both libraries produce full binary trees, so after a left leaf we can simply delete the last entry, because a right sibling is about to be visited. For a right leaf the choice is harder: we move up the tree, recording which variables to remove, until we reach a node that is a left child, and then delete the required entries from the data structure. If it is the last leaf node, we return.
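This bookkeeping is easiest to see on a small tree. The sketch below is not traverse_sk itself; it is a self-contained illustration (on the iris toy dataset) of the path enumeration the DFS performs, using the same children_left/children_right/feature/threshold arrays:
# Self-contained illustration of DFS path collection on a tiny tree
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
toy = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
t = toy.tree_

def show_paths(node=0, prefix=[]):
    if t.children_left[node] < 0:  # leaf: one complete constraint path
        print(prefix)
        return
    show_paths(t.children_left[node], prefix + [(t.feature[node], '<=', t.threshold[node])])
    show_paths(t.children_right[node], prefix + [(t.feature[node], '>', t.threshold[node])])

show_paths()  # each printed list is one set of constraints a test case must satisfy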
# Input: node (current node); children_left, children_right, last_node, feature, adjacency_list, threshold (tree structure); cat and num store the current path
# Function: if not a leaf node, check whether the split variable is num or cat and append the constraint to num or cat respectively; if a leaf node, pass the path to post_process to build a test case,
# then pop certain features from cat and num while backtracking, as DFS is performed here
# Popping: Right child: since all categorical vars are one-hot encoded, their names are unique, so we pop them straight away; for numerical vars we look up the feature names (as they may repeat)
# and pop them at the end
# Left child: only pop once; the trees are full, so a right sibling always exists. Again pop based on whether the variable is cat or num, as mentioned above
def traverse_sk(node, children_left, children_right, last_node, feature, adjacency_list, threshold, cat={}, num={}):
if children_left[node] < 0: # leaf node
post_process(cat.copy(), num.copy()) # send for adding to testcase
parent = -1
if node == last_node:
cat.clear()
num.clear()
for i in range(len(children_left)):
if children_left[i] == node:
parent = i
break
# Categorical parent variable popping
# left child, direct pop as all names are distinct in the case of cat
if node in children_left and input_vars[feature[parent]] in input_cat_vars:
cat.popitem()
for i in range(len(children_right)):
if children_right[i] == node:
parent = i
break
# right child whose parent split is cat: walk upwards, popping cat vars directly and storing num feature names to pop afterwards
if node in children_right and input_vars[feature[parent]] in input_cat_vars and node != last_node:
pop_len_2 = 0 # popping length for numerical variable
temp_2 = node # current right child
store = [] # storing the feature names as we traverse to the upper last right child for numerical variables
while temp_2 in children_right:
idx = 0
for i in range(len(children_right)):
if children_right[i] == temp_2:
idx = i
temp_2 = idx
if input_vars[feature[temp_2]] in master_dict_keys:
pop_len_2 += 1
store.append(input_vars[feature[temp_2]])
else:
cat.popitem()
for i in range(len(children_left)):
if children_left[i] == temp_2:
idx = i
temp_2 = idx
if input_vars[feature[temp_2]] in master_dict_keys:
pop_len_2 += 1
store.append(input_vars[feature[temp_2]])
else:
cat.popitem()
for i in range(pop_len_2):
# last feature is added first in store so delete the last element in that feature name
last = store[i]
num[last].pop()
num[last].pop()
if len(num[last]) == 0: # if some feature list is empty, remove it completely
num.pop(last)
for i in range(len(children_left)):
if children_left[i] == node:
parent = i
break
# Numerical parent variable popping
# if left child, we will pop the last two elements added in the last feature name only
if node in children_left and input_vars[feature[parent]] in master_dict_keys:
idx = 0
# finding last feature name by finding the parent of the left child
for i in range(len(children_left)):
if children_left[i] == node:
idx = i
last = input_vars[feature[idx]]
num[last].pop()
num[last].pop()
for i in range(len(children_right)):
if children_right[i] == node:
parent = i
break
# right child, pop if cat else store the feature name and pop at end for num
if node in children_right and node != last_node and input_vars[feature[parent]] in master_dict_keys:
pop_len_2 = 0 # popping length
temp_2 = node # current right child
store = [] # storing the feature names as we traverse to the upper last right child
while temp_2 in children_right:
idx = 0
for i in range(len(children_right)):
if children_right[i] == temp_2:
idx = i
temp_2 = idx
if input_vars[feature[temp_2]] in master_dict_keys:
pop_len_2 += 1
store.append(input_vars[feature[temp_2]])
else:
cat.popitem()
for i in range(len(children_left)):
if children_left[i] == temp_2:
idx = i
temp_2 = idx
if input_vars[feature[temp_2]] in master_dict_keys:
pop_len_2 += 1
store.append(input_vars[feature[temp_2]])
else:
cat.popitem()
for i in range(pop_len_2):
# last feature is added first in store so delete the last element in that feature name
last = store[i]
num[last].pop()
num[last].pop()
if len(num[last]) == 0: # if some feature list is empty, remove it completely
num.pop(last)
else: # not a leaf node
for child in adjacency_list[node]:
# cat left node
if child == children_left[node] and input_vars[feature[node]] in input_cat_vars:
cat[input_vars[feature[node]]] = "No"
elif child == children_left[node]: # num left child
# if not added then add the list otherwise add the value in the list
if input_vars[feature[node]] not in num:
num[input_vars[feature[node]]] = list()
num[input_vars[feature[node]]].append("<=")
num[input_vars[feature[node]]].append(threshold[node])
# cat right node
elif child == children_right[node] and input_vars[feature[node]] in input_cat_vars:
cat[input_vars[feature[node]]] = "Yes"
elif child == children_right[node]: # num right node
# if not added then add the list otherwise add the value in the list
if input_vars[feature[node]] not in num:
num[input_vars[feature[node]]] = list()
num[input_vars[feature[node]]].append(">")
num[input_vars[feature[node]]].append(threshold[node])
traverse_sk(child, children_left, children_right, last_node,
feature, adjacency_list, threshold) # recursive call
# Input: node (current node); children_left, children_right, last_node, feature, adjacency_list, threshold (tree structure); cat and num store the current path
# Function: if not a leaf node, check whether the split variable is num or cat and append the constraint to num or cat respectively; if a leaf node, pass the path to post_process to build a test case,
# then pop certain features from cat and num while backtracking, as DFS is performed here
# Popping: Right child: since all categorical vars are one-hot encoded, their names are unique, so we pop them straight away; for numerical vars we look up the feature names (as they may repeat)
# and pop them at the end
# Left child: only pop once; the trees are full, so a right sibling always exists. Again pop based on whether the variable is cat or num, as mentioned above
def traverse_XG(node, children_left, children_right, last_node, feature, adjacency_list, threshold, cat={}, num={}):
if children_left[node] < 0: # leaf node
post_process(cat.copy(), num.copy()) # send for adding to testcase
parent = -1
if node == last_node:
cat.clear()
num.clear()
for i in range(len(children_left)):
if children_left[i] == node:
parent = i
break
# Categorical parent variable popping
# left child, direct pop as all names are distinct in the case of cat
if node in children_left and feature[parent] in input_cat_vars:
cat.popitem()
for i in range(len(children_right)):
if children_right[i] == node:
parent = i
break
# right child whose parent split is cat: walk upwards, popping cat vars directly and storing num feature names to pop afterwards
if node in children_right and feature[parent] in input_cat_vars and node != last_node:
pop_len_2 = 0 # popping length for numerical variable
temp_2 = node # current right child
store = [] # storing the feature names as we traverse to the upper last right child for numerical variables
while temp_2 in children_right:
idx = 0
for i in range(len(children_right)):
if children_right[i] == temp_2:
idx = i
temp_2 = idx
if feature[temp_2] in master_dict_keys:
pop_len_2 += 1
store.append(feature[temp_2])
else:
cat.popitem()
for i in range(len(children_left)):
if children_left[i] == temp_2:
idx = i
temp_2 = idx
if feature[temp_2] in master_dict_keys:
pop_len_2 += 1
store.append(feature[temp_2])
else:
cat.popitem()
for i in range(pop_len_2):
# last feature is added first in store so delete the last element in that feature name
last = store[i]
num[last].pop()
num[last].pop()
if len(num[last]) == 0: # if some feature list is empty, remove it completely
num.pop(last)
for i in range(len(children_left)):
if children_left[i] == node:
parent = i
break
# Numerical parent variable popping
# if left child, we will pop the last two elements added in the last feature name only
if node in children_left and feature[parent] in master_dict_keys:
idx = 0
# finding last feature name by finding the parent of the left child
for i in range(len(children_left)):
if children_left[i] == node:
idx = i
last = feature[idx]
num[last].pop()
num[last].pop()
for i in range(len(children_right)):
if children_right[i] == node:
parent = i
break
# right child, pop if cat else store the feature name and pop at end for num
if node in children_right and node != last_node and feature[parent] in master_dict_keys:
pop_len_2 = 0 # popping length
temp_2 = node # current right child
store = [] # storing the feature names as we traverse to the upper last right child
while temp_2 in children_right:
idx = 0
for i in range(len(children_right)):
if children_right[i] == temp_2:
idx = i
temp_2 = idx
if feature[temp_2] in master_dict_keys:
pop_len_2 += 1
store.append(feature[temp_2])
else:
cat.popitem()
for i in range(len(children_left)):
if children_left[i] == temp_2:
idx = i
temp_2 = idx
if feature[temp_2] in master_dict_keys:
pop_len_2 += 1
store.append(feature[temp_2])
else:
cat.popitem()
for i in range(pop_len_2):
# last feature is added first in store so delete the last element in that feature name
last = store[i]
num[last].pop()
num[last].pop()
if len(num[last]) == 0: # if some feature list is empty, remove it completely
num.pop(last)
else: # not a leaf node
for child in adjacency_list[node]:
# cat left node
if child == children_left[node] and feature[node] in input_cat_vars:
cat[feature[node]] = "No"
elif child == children_left[node]: # num left child
# if not added then add the list otherwise add the value in the list
if feature[node] not in num:
num[feature[node]] = list()
num[feature[node]].append("<=")
num[feature[node]].append(threshold[node])
# cat right node
elif child == children_right[node] and feature[node] in input_cat_vars:
cat[feature[node]] = "Yes"
elif child == children_right[node]: # num right node
# if not added then add the list otherwise add the value in the list
if feature[node] not in num:
num[feature[node]] = list()
num[feature[node]].append(">")
num[feature[node]].append(threshold[node])
traverse_XG(child, children_left, children_right, last_node,
feature, adjacency_list, threshold) # recursive call
Common function for choosing the library to traverse based on the user input
def traverse(id, node, children_left, children_right, last_node, feature, adjacency_list, threshold):
if id == 0: # sklearn
traverse_sk(node, children_left, children_right,
last_node, feature, adjacency_list, threshold)
elif id == 1: # XGBoost
traverse_XG(node, children_left, children_right,
last_node, feature, adjacency_list, threshold)
We split the generated test cases in the same train:test ratio as the original data, add them back to the corresponding splits and to the full dataset, and return the updated dataframes.
def data(id, df, i, testcases=[]):
if i == 0:
if id == 1:
df['income'] = df['income'].map({'>50K': 1, '<=50K': 0})
df_x = df.drop('income', axis=1)
df_y = df['income']
df_encoded = pd.get_dummies(df_x)
X_train, X_test, y_train, y_test = train_test_split(
df_encoded, df_y, test_size=0.2, random_state=0)
if id == 1:
y_train = y_train.astype("category")
y_test = y_test.astype("category")
return df, X_train, y_train, X_test, y_test
test_x = testcases.drop('income', axis=1) # Train
test_y = testcases['income'] # To be predicted
X_train_tc, X_test_tc, y_train_tc, y_test_tc = train_test_split(
test_x, test_y, test_size=0.2, random_state=0)
if id == 1:
y_train_tc = y_train_tc.astype("category")
y_test_tc = y_test_tc.astype("category")
df_x = df.drop('income', axis=1) # Train
df_y = df['income'] # To be predicted
X_train_or, X_test_or, y_train_or, y_test_or = train_test_split(
df_x, df_y, test_size=0.2, random_state=0)
if id == 1:
y_train_or = y_train_or.astype("category")
y_test_or = y_test_or.astype("category")
df = pd.concat([df, testcases])
X_train = pd.concat([X_train_or, X_train_tc])
y_train = pd.concat([y_train_or, y_train_tc])
X_test = pd.concat([X_test_or, X_test_tc])
y_test = pd.concat([y_test_or, y_test_tc])
X_train = pd.get_dummies(X_train, drop_first=False)
X_test = pd.get_dummies(X_test, drop_first=False)
if 'native.country_Holand-Netherlands' not in X_test.columns.values.tolist():
X_test.insert(81, 'native.country_Holand-Netherlands', 0)
return df, X_train, y_train, X_test, y_test
The user can run this for as many iterations as needed; id switches between the libraries (0 for scikit-learn, 1 for XGBoost).
iterations = 5
id = 0
accuracy = []
for i in range(iterations):
if i == 0:
df, X_train, y_train, X_test, y_test = data(id, df, i)
df_x = df.drop('income', axis=1)
df_encoded = pd.get_dummies(df_x)
input_vars = df_encoded.columns.values.tolist() # feature names
input_cat_vars = input_vars.copy()
for i in range(len(master_dict)):
input_cat_vars.remove(master_dict_keys[i])
enum_store_key = list(enum_store.keys())
tree_dict = []
if id == 0:
tree = DecisionTreeClassifier(max_depth=10, random_state=0)
elif id == 1:
tree = xgb.XGBClassifier(
n_estimators=1, tree_method="gpu_hist", enable_categorical=True, max_depth=8)
tree.fit(X_train, y_train)
if id == 1:
booster = tree.get_booster()
tree = booster.get_dump(dump_format='json')[0]
tree_dict = json.loads(tree)
true_classes, children_left, children_right, feature, threshold, input_vars, last_node, adjacency_list, output = params(
id, tree, df, tree_dict)
main = [] # stores the testcases
traverse(id, 0, children_left, children_right,
last_node, feature, adjacency_list, threshold)
preds = tree.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, preds))
accuracy.append(metrics.accuracy_score(y_test, preds))
test_cases = pd.DataFrame(main) # testcases
test_cases = test_cases[df.columns.values.tolist()]
df, X_train, y_train, X_test, y_test = data(id, df, i, test_cases)
Accuracy: 0.8498387839705205
Accuracy: 0.8475247524752475
Accuracy: 0.8507936507936508
Accuracy: 0.856157192140393
Accuracy: 0.8386090057958092
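Since matplotlib is already imported, the collected accuracy values can be inspected across iterations, for example:
plt.plot(range(1, iterations + 1), accuracy, marker='o')  # accuracy recorded in the loop above
plt.xlabel('Iteration')
plt.ylabel('Test accuracy')
plt.title('Accuracy after adding generated test cases each iteration')
plt.show()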