Haikoo Khandor
July 10, 2023
%pip install xlwt # pip installable libraries
Requirement already satisfied: xlwt in /usr/local/lib/python3.10/dist-packages (1.3.0)
import random # Necessary imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import _tree
import xlwt
import csv
import json
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from google.colab import drive # mounting google drive for the dataset
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
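The Adult census dataset is then loaded from the mounted Drive into a dataframe df and previewed with df.head(), which produces the table below. The exact path on Drive is not shown in this section, so the cell here is only a hypothetical reconstruction:
# Hypothetical load cell -- the actual Drive path/filename is an assumption
df = pd.read_csv('/content/drive/MyDrive/adult.csv')  # UCI Adult census dataset
df.head()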
| | age | workclass | fnlwgt | education | education.num | marital.status | occupation | relationship | race | sex | capital.gain | capital.loss | hours.per.week | native.country | income |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 90 | ? | 77053 | HS-grad | 9 | Widowed | ? | Not-in-family | White | Female | 0 | 4356 | 40 | United-States | <=50K |
| 1 | 82 | Private | 132870 | HS-grad | 9 | Widowed | Exec-managerial | Not-in-family | White | Female | 0 | 4356 | 18 | United-States | <=50K |
| 2 | 66 | ? | 186061 | Some-college | 10 | Widowed | ? | Unmarried | Black | Female | 0 | 4356 | 40 | United-States | <=50K |
| 3 | 54 | Private | 140359 | 7th-8th | 4 | Divorced | Machine-op-inspct | Unmarried | White | Female | 0 | 3900 | 40 | United-States | <=50K |
| 4 | 41 | Private | 264663 | Some-college | 10 | Separated | Prof-specialty | Own-child | White | Female | 0 | 3900 | 40 | United-States | <=50K |
Let us say we have a dataset \(D\) containing \(n\) numerical variables and \(c\) categorical/enum variables. A constraint on a variable defines the values it cannot take. These constraints are often non-intuitive: two variables may be related so that only particular combinations satisfy them (for instance, education and education.num must encode the same education level). It is impractical for a human to search every constraint by hand without missing a few or noting a wrong one, so we use decision trees to learn these constraints.
Now, we are given a var file which lists all the variables, their types, their lowest and highest values (if specified) and their default values. This file has a fixed format that a machine can read easily. The functions below use it to extract the variable information: var_int_finder handles numerical (int and float) variables, while var_enum_finder handles categorical variables.
# The row passed contains int or float; there are two possible forms, namely:
# int O5 = 2
# int[2..4] a = 3
# In the first case no range is given, so we take 0-1000 as the default range; otherwise we use the given range, and the assigned value as the default value.
# Input: first column of the row, i.e. the row passed here is the first column of a csv row
# Output: name (of the variable), default (the assigned default value), low and high
# Function: extracts the name, default, low and high from the first column passed and handles the two cases mentioned above
def var_int_finder(row):
# Initializing the defaults
name = ''
low = 0
high = 1000
default = 0
indicator = -1 # whether we have a [] or not
left_brac = -1 # respective indexes
right_brac = -1
double_dot = -1
for i in range(len(row)):
if row[i] == '[': # storing index of left square bracket if present
left_brac = i
if i != len(row)-1:
# storing index of double dot(..) if present
if row[i] == '.' and row[i+1] == '.':
double_dot = i
if row[i] == ']': # storing index of right square bracket if present
right_brac = i
indicator = i
# if we have a [], then storing the name as per the index
if row[i] == '=' and indicator >= 0:
name = row[indicator+1:i]
default = row[i+1:]
if left_brac != -1:
low = row[left_brac+1:double_dot]
high = row[double_dot+2:right_brac]
if indicator == -1: # if not []
i = 0
while row[i] != 't': # checking for int/float last letter which is t
i += 1
j = i
while row[j] != '=':
j += 1
name = row[i+1:j]
default = row[j+1:]
return name, default, low, high
# row passed has enum
# Input: a csv row whose first column contains the keyword "enum"
# Output: type_enum (the name of the enum variable), possible (list of all values the enum variable can take)
# Function: Extracts the type-enum and possible list for each row consisting of enum variables
def var_enum_finder(row):
check = 0
while row[0][check] != 'm': # checks for m
check += 1
opening = 0
while row[0][opening] != '{':
opening += 1
type_enum = row[0][check+2:opening]
possible = [] # all possible elements of the enum variable
possible.append(row[0][opening+1:].strip())
blank_check = 0
while (len(row[blank_check]) != 0): # checking for number of elements
blank_check += 1
if blank_check == len(row):
break
if blank_check == 1: # in case of one option
possible[0] = possible[0][0:-1]
for i in range(blank_check-1):
if i != blank_check-2:
possible.append(row[i+1].strip())
else:
possible.append(row[i+1][:-1].strip())
return type_enum, possible
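As a quick sanity check, the two parsers can be called on hand-written rows. The exact format of adult_var_new.csv is not shown here, so the rows below are only illustrative, inferred from the parsing code and from the master_dict output further down; note that default, low and high come back as strings and are cast to int/float later.
# Illustrative rows (not taken from the actual var file)
print(var_int_finder('int O5 = 2'))            # (' O5 ', ' 2', 0, 1000)  -- no range given
print(var_int_finder('int[2..4] a = 3'))       # (' a ', ' 3', '2', '4')  -- explicit range
print(var_enum_finder(['enum sex{Male', ' Female}']))  # ('sex', ['Male', 'Female'])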
# This loop goes through all the rows of the var file, categorizes it based on int/float and enum, passes it to var_int_finder and var_enum_finder, gets the
# values and appends it to master_dict and enum_store respectively
master_dict = {}  # numerical variables: name -> [type, low, high, default]
enum_store = {}   # categorical variables: name -> list of possible values
with open('adult_var_new.csv') as file_obj: # parsing the var file
reader_obj = csv.reader(file_obj)
for row in reader_obj:
if 'int ' in row[0] or 'int[' in row[0]:
name, default, low, high = var_int_finder(row[0])
master_dict[name] = list()
master_dict[name].append('int')
master_dict[name].append(int(low))
master_dict[name].append(int(high))
master_dict[name].append(int(default))
elif 'float ' in row[0] or 'float[' in row[0]:
name, default, low, high = var_int_finder(row[0])
master_dict[name] = list()
master_dict[name].append('float')
master_dict[name].append(float(low))
master_dict[name].append(float(high))
master_dict[name].append(float(default))
elif 'enum' in row[0]:
type_num, possible = var_enum_finder(row)
enum_store[type_num] = list(possible)
master_dict_keys = list(master_dict.keys())  # names of the numerical variables (printed below)
{' age ': ['int', 17, 90, 45],
' fnlwgt ': ['int', 12285, 1484705, 15000],
' education.num ': ['int', 1, 16, 10],
' capital.gain ': ['int', 0, 99999, 50000],
' capital.loss ': ['int', 0, 4356, 2000],
' hours.per.week ': ['int', 1, 99, 5]}
[' age ',
' fnlwgt ',
' education.num ',
' capital.gain ',
' capital.loss ',
' hours.per.week ']
After getting the variable information, we perform a depth-first search on the decision tree learned by scikit-learn/XGBoost. For each node we need the left child, the right child, the variable used at the split and the split threshold. For scikit-learn this is exposed through built-in attributes of the fitted tree; for XGBoost we recurse over the JSON dump of the tree, which is a nested dictionary. We one-hot encode the categorical variables, since both libraries internally follow a 0/1 encoding rule for each category of an enum variable.
For XGBoost
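The JSON dump obtained later via booster.get_dump(dump_format='json') is a nested dictionary: internal nodes carry "split", "split_condition", "yes"/"no" child ids and a "children" list, while leaves carry only "nodeid" and "leaf". The node below is purely illustrative (hypothetical feature name and values), but it shows exactly what extract_splits and extract_children walk through:
# Illustrative dumped node (values are made up)
example_node = {
    "nodeid": 0, "split": "age", "split_condition": 30.5,
    "yes": 1, "no": 2, "missing": 1,
    "children": [
        {"nodeid": 1, "leaf": -0.4},
        {"nodeid": 2, "leaf": 0.7},
    ],
}
print(extract_splits(example_node, [], []))    # (['age', -1, -1], [30.5, -1, -1])
print(extract_children(example_node, [], []))  # ([1, -1, -1], [2, -1, -1])
These per-node lists are produced in depth-first order and are re-indexed by nodeid inside params_XG.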
def extract_splits(tree_dict, splits=[], threshold=[]):
if "leaf" in tree_dict:
splits.append(-1)
threshold.append(-1)
elif "split" in tree_dict:
splits.append(tree_dict["split"])
threshold.append(tree_dict["split_condition"])
if "children" in tree_dict:
for child in tree_dict["children"]:
extract_splits(child, splits, threshold)
return splits, threshold
def extract_children(tree_dict, left_children=[], right_children=[]):
if tree_dict["nodeid"] == 0:
left_children.append(tree_dict["yes"])
right_children.append(tree_dict["no"])
if "children" in tree_dict:
for child in tree_dict["children"]:
if "yes" in child:
left_children.append(child["yes"])
right_children.append(child["no"])
elif "no" in child:
left_children.append(child["yes"])
right_children.append(child["no"])
extract_children(child, left_children, right_children)
else:
left_children.append(-1)
right_children.append(-1)
return left_children, right_children
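params_XG below also relies on extract_nodeids to recover the node ids in the same depth-first order as the two helpers above. Its definition is not shown in this section; a minimal sketch consistent with those helpers (an assumption, not necessarily the original) is:
def extract_nodeids(tree_dict, nodeids=[]):
    # collect node ids in the same preorder DFS as extract_splits/extract_children
    nodeids.append(tree_dict["nodeid"])
    if "children" in tree_dict:
        for child in tree_dict["children"]:
            extract_nodeids(child, nodeids)
    return nodeids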
The params function returns the essential parameters required:

1. true_classes: class labels
2. children_left: 0-indexed array where children_left[i] is the left child of the i-th node
3. children_right: 0-indexed array where children_right[i] is the right child of the i-th node
4. feature: 0-indexed array where input_vars[feature[i]] is the variable used for splitting the i-th node
5. threshold: array where threshold[i] is the split threshold at the i-th node (-2 for a leaf node)
6. input_vars: all numerical + categorical variables in one-hot encoded format
7. last_node: last leaf node, used to stop the recursion
8. adjacency_list: a simple parent-to-children key-value dictionary
9. output: the name of the target column
# Returns the essential parameters of the tree:
# children_left: ex: [1,2,3,4,5] means 0th node has 1st node as left child, 1st node has 2nd node as left child and so on..
# children_right: ex: [1,2,3,4,5] means 0th node has 1st node as right child, 1st node has 2nd node as right child and so on..
# feature: ex: [1,2,3,4,5] and let input_vars = [a,b,c,d,e] means 0th node has feature index as 1 i.e. input_vars[1] i.e. b is the feature used for splitting the node
# threshold: ex: [2.5,3.5,-2,-2] means 0th node has threshold split as 2.5, a value of -2 means it is a leaf node
# input_vars: consists of all the num + one-hot encoded categorical
# last_node: last leaf node
# adjacency_list: consists of the children of each node, empty if no child i.e leaf node
def params_sklearn(instance, dataset):
true_classes = {} # classification target names
for i in range(len(np.unique(dataset.iloc[:, -1:]))):
true_classes[i] = np.unique(dataset.iloc[:, -1:])[i]
# id of the left child of node i or -1 if leaf node, eg: if 2 is Left child of 0 then at index 0, 2 will be stored
children_left = instance.tree_.children_left
# id of the right child of node i or -1 if leaf node, eg: if 2 is Right child of 0 then at index 0, 2 will be stored
children_right = instance.tree_.children_right
feature = instance.tree_.feature # index of feature used for splitting node i
threshold = instance.tree_.threshold # threshold value at node i
df_x = dataset.drop('income', axis=1) # Train
df_y = dataset['income'] # To be predicted
df_encoded = pd.get_dummies(df_x, drop_first=False)
input_vars = df_encoded.columns.values.tolist() # feature names
last_node = children_right.max() # last child
# stores the parent nodes and their child nodes in a key(parent)->value list(children)
adjacency_list = {}
n_nodes = instance.tree_.node_count # total node count
output = df_y.name
for i in range(n_nodes):
adjacency_list[i] = list()
if (children_left[i] > 0):
adjacency_list[i].append(children_left[i])
if (children_right[i] > 0):
adjacency_list[i].append(children_right[i])
return true_classes, children_left, children_right, feature, threshold, input_vars, last_node, adjacency_list, output
def params_XG(tree_obj, dataset):
# true_classes = {0:'<=50k', 1:">50k"}
true_classes = [0, 1]
df_x = dataset.drop('income', axis=1) # Train
df_y = dataset['income'] # To be predicted
df_encoded = pd.get_dummies(df_x, drop_first=False)
input_vars = df_encoded.columns.values.tolist() # feature names
left_children, right_children = extract_children(tree_obj, [], [])
nodeids = extract_nodeids(tree_obj, [])
splits, threshold = extract_splits(tree_obj, [], [])
fin_splits = [0]*len(splits)
fin_threshold = [0]*len(threshold)
fin_left_children = [0]*len(left_children)
fin_right_children = [0]*len(right_children)
for i in range(len(splits)):
fin_splits[nodeids[i]] = splits[i]
fin_threshold[nodeids[i]] = threshold[i]
fin_left_children[nodeids[i]] = left_children[i]
fin_right_children[nodeids[i]] = right_children[i]
fin_adjacency_list = {}
for i in range(len(splits)):
fin_adjacency_list[i] = list()
if (fin_left_children[i] > 0):
fin_adjacency_list[i].append(fin_left_children[i])
if (fin_right_children[i] > 0):
fin_adjacency_list[i].append(fin_right_children[i])
last_node = nodeids[-1]
output = 'income'
return true_classes, fin_left_children, fin_right_children, fin_splits, fin_threshold, input_vars, last_node, fin_adjacency_list, output
Common function for choosing the library based on the user input
def params(id, instance=[], dataset=[], tree_obj=[]):
if id == 0: # sklearn
true_classes, left_children, right_children, splits, threshold, input_vars, last_node, adjacency_list, output = params_sklearn(
instance, dataset)
return true_classes, left_children, right_children, splits, threshold, input_vars, last_node, adjacency_list, output
elif id == 1: # XGBoost
true_classes, left_children, right_children, splits, threshold, input_vars, last_node, adjacency_list, output = params_XG(
tree_obj, dataset)
return true_classes, left_children, right_children, splits, threshold, input_vars, last_node, adjacency_list, output
After storing the constraints for each variable, we use these data structures to extract a value for each variable. The workflow is as follows (a hypothetical example is shown after this list):

1. path_cat contains the categorical variables found along the tree path, each with a Yes/No answer. If Yes, the variable takes that category's value; if No, a value is chosen at random from all categories except those with a "No" attached. If the variable is not found in the path at all, a random value from its possible set is chosen.
2. path_num contains the numerical variables found along the tree path. We separate the greater-than and less-than-or-equal constraints into two lists and generate a value between the maximum of the ">" thresholds and the minimum of the "<=" thresholds. If constraints appear on only one side, the other bound is taken from the range given in the var file. The range is satisfied at all times.
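For illustration, a single root-to-leaf path might hand post_process dictionaries like the following (hypothetical contents, not taken from the actual run):
# Hypothetical contents collected along one root-to-leaf path
path_num = {'age': ['>', 30.5, '<=', 61.5],      # 30.5 < age <= 61.5
            'hours.per.week': ['<=', 42.5]}      # only an upper bound on this path
path_cat = {'sex_Female': 'Yes',                 # one-hot column answered Yes -> sex = Female
            'workclass_Private': 'No'}           # answered No -> Private is excluded
# post_process would draw age between 30.5 and 61.5, draw hours.per.week between its
# var-file lower bound and 42.5, set sex to Female, pick workclass from every class
# except Private, and fill all remaining variables randomly within their ranges.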
# Input: path_cat - dictionary containing only categorical variables with yes no instances
# path_num - dictionary containing only numerical variables with a list consisting of <,>= conditions
# dataset: dataset
# Returns: a testcase is appended to the main list
# Function: Numerical: processes the path_num and generates the value based on the constraints for each feature, if feature not found, generate value between low and high from var file
# Categorical: processes the path_cat and makes a not_list, then chooses value based on the constraints and again if feature not found, choose random class from the list of that variable
def post_process(path_cat, path_num, ans_cat={}, ans_num={}):
ans_cat = {} # stores categorical variables
ans_num = {} # stores int, float variables
not_list = {} # not-list
# Numerical starts
for i in path_num.keys(): # iterating through the input vars in the path dictionary
less_than = [] # stores all values which come after "<="
greater = [] # stores all values which come after ">"
# iterating through the list for that input var
for j in range(len(path_num[i])):
if path_num[i][j] == '<=':
less_than.append(path_num[i][j+1])
elif path_num[i][j] == '>':
greater.append(path_num[i][j+1])
max_2 = master_dict[str(i)][2]
min_2 = master_dict[str(i)][1]
# <=, <= -
# <=, >
# >, >
# not seen
if len(greater) > 0:
min_2 = max(greater)
# less_than = [7073.5], greater = [500], min_2 = 500, max_2 = 7073.5
if len(less_than) > 0:
max_2 = min(less_than)
if master_dict[i][0] == 'int':
if (max_2-min_2 < 1):
ans_num[i] = (max_2 + min_2)/2
else:
ans_num[i] = np.random.randint(min_2, max_2)
else:
# generating the test case value
ans_num[i] = random.uniform(min_2, max_2)
for j in master_dict_keys: # if some features not present in the test case, populate it with a random value
if j not in ans_num:
if master_dict[j][0] == 'int':
if master_dict[j][2]-master_dict[j][1] < 1:
ans_num[j] = (master_dict[j][1]+master_dict[j][2])/2
else:
ans_num[j] = np.random.randint(
master_dict[j][1], master_dict[j][2])
else:
ans_num[j] = random.uniform(
master_dict[j][1], master_dict[j][2])
# Numerical ends
# Categorical starts
for k, v in path_cat.items():
for j in enum_store_key:
# if that cat variable is in the path # native.country(j), one-hot-encoding: native.country_India(k)
if j in k:
if v == "No": # if no
if j not in not_list:
not_list[j] = list()
not_list[j].append(k) # append to list
elif v == "Yes": # if yes append directly
ans_cat[j] = k[len(j)+1:]
not_list_2 = {} # removing the unnecessary pre-names like ex: native.country_India to India
keys = list(not_list.keys())
for i in range(len(not_list)):
cut = len(keys[i])
not_list_2[keys[i]] = list()
for j in range(len(not_list[keys[i]])):
not_list_2[keys[i]].append(not_list[keys[i]][j][cut+1:])
for j in enum_store_key:
if j not in ans_cat and j in not_list_2:
# picking ans from whole - not_list
temp = [i for i in enum_store[j] if i not in not_list_2[j]]
ans_cat[j] = random.choice(temp)
# if feature not found in the path, populate it with a random name
elif j not in ans_cat and j not in not_list_2:
ans_cat[j] = random.choice(enum_store[j])
# Categorical ends
ans_cat[output] = random.choice(true_classes)
# concatenate the dictionaries, append it to main
res = {**ans_num.copy(), **ans_cat.copy()}
main.append(res.copy())
traverse_sk performs the DFS on the tree using the adjacency list. The base case is a leaf node. If the current node is not a leaf, we determine whether each child is a left or right child and whether the split variable is numerical or categorical, store the appropriate constraint, and recurse on the children. If it is a leaf node, we have reached the end of a path and store the values collected so far. Since the DFS does not restart from the root for every path, we cannot simply empty the entire data structure; we have to delete only certain elements from it. Both libraries produce full binary trees, so after a left leaf we can simply delete the last entry, because a right sibling is about to be visited. For a right leaf the choice is harder: we move up the tree, recording which variables to remove, until we reach a node that is a left child, and then delete the required entries from the data structure. If it is the last leaf node, we return.
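This bookkeeping is easiest to see on a small tree. The sketch below is not traverse_sk itself; it is a self-contained illustration (on the iris toy dataset) of the path enumeration the DFS performs, using the same children_left/children_right/feature/threshold arrays:
# Self-contained illustration of DFS path collection on a tiny tree
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
toy = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X, y)
t = toy.tree_

def show_paths(node=0, prefix=[]):
    if t.children_left[node] < 0:  # leaf: one complete constraint path
        print(prefix)
        return
    show_paths(t.children_left[node], prefix + [(t.feature[node], '<=', t.threshold[node])])
    show_paths(t.children_right[node], prefix + [(t.feature[node], '>', t.threshold[node])])

show_paths()  # each printed list is one set of constraints a test case must satisfy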
# Input: node (current node); children_left, children_right, last_node, feature, adjacency_list, threshold (tree structure); cat and num store the current path
# Function: if not a leaf node, check whether the split variable is num or cat and append the constraint to num or cat respectively; if a leaf node, pass the path to post_process to build a test case,
# then pop certain features from cat and num while backtracking, as DFS is performed here
# Popping: Right child: since all categorical vars are one-hot encoded, their names are unique, so we pop them straight away; for numerical vars we look up the feature names (as they may repeat)
# and pop them at the end
# Left child: only pop once; the trees are full, so a right sibling always exists. Again pop based on whether the variable is cat or num, as mentioned above
def traverse_sk(node, children_left, children_right, last_node, feature, adjacency_list, threshold, cat={}, num={}):
if children_left[node] < 0: # leaf node
post_process(cat.copy(), num.copy()) # send for adding to testcase
parent = -1
if node == last_node:
cat.clear()
num.clear()
for i in range(len(children_left)):
if children_left[i] == node:
parent = i
break
# Categorical parent variable popping
# left child, direct pop as all names are distinct in the case of cat
if node in children_left and input_vars[feature[parent]] in input_cat_vars:
cat.popitem()
for i in range(len(children_right)):
if children_right[i] == node:
parent = i
break
# right child whose parent split is cat: walk upwards, popping cat vars directly and storing num feature names to pop afterwards
if node in children_right and input_vars[feature[parent]] in input_cat_vars and node != last_node:
pop_len_2 = 0 # popping length for numerical variable
temp_2 = node # current right child
store = [] # storing the feature names as we traverse to the upper last right child for numerical variables
while temp_2 in children_right:
idx = 0
for i in range(len(children_right)):
if children_right[i] == temp_2:
idx = i
temp_2 = idx
if input_vars[feature[temp_2]] in master_dict_keys:
pop_len_2 += 1
store.append(input_vars[feature[temp_2]])
else:
cat.popitem()
for i in range(len(children_left)):
if children_left[i] == temp_2:
idx = i
temp_2 = idx
if input_vars[feature[temp_2]] in master_dict_keys:
pop_len_2 += 1
store.append(input_vars[feature[temp_2]])
else:
cat.popitem()
for i in range(pop_len_2):
# last feature is added first in store so delete the last element in that feature name
last = store[i]
num[last].pop()
num[last].pop()
if len(num[last]) == 0: # if some feature list is empty, remove it completely
num.pop(last)
for i in range(len(children_left)):
if children_left[i] == node:
parent = i
break
# Numerical parent variable popping
# if left child, we will pop the last two elements added in the last feature name only
if node in children_left and input_vars[feature[parent]] in master_dict_keys:
idx = 0
# finding last feature name by finding the parent of the left child
for i in range(len(children_left)):
if children_left[i] == node:
idx = i
last = input_vars[feature[idx]]
num[last].pop()
num[last].pop()
for i in range(len(children_right)):
if children_right[i] == node:
parent = i
break
# right child, pop if cat else store the feature name and pop at end for num
if node in children_right and node != last_node and input_vars[feature[parent]] in master_dict_keys:
pop_len_2 = 0 # popping length
temp_2 = node # current right child
store = [] # storing the feature names as we traverse to the upper last right child
while temp_2 in children_right:
idx = 0
for i in range(len(children_right)):
if children_right[i] == temp_2:
idx = i
temp_2 = idx
if input_vars[feature[temp_2]] in master_dict_keys:
pop_len_2 += 1
store.append(input_vars[feature[temp_2]])
else:
cat.popitem()
for i in range(len(children_left)):
if children_left[i] == temp_2:
idx = i
temp_2 = idx
if input_vars[feature[temp_2]] in master_dict_keys:
pop_len_2 += 1
store.append(input_vars[feature[temp_2]])
else:
cat.popitem()
for i in range(pop_len_2):
# last feature is added first in store so delete the last element in that feature name
last = store[i]
num[last].pop()
num[last].pop()
if len(num[last]) == 0: # if some feature list is empty, remove it completely
num.pop(last)
else: # not a leaf node
for child in adjacency_list[node]:
# cat left node
if child == children_left[node] and input_vars[feature[node]] in input_cat_vars:
cat[input_vars[feature[node]]] = "No"
elif child == children_left[node]: # num left child
# if not added then add the list otherwise add the value in the list
if input_vars[feature[node]] not in num:
num[input_vars[feature[node]]] = list()
num[input_vars[feature[node]]].append("<=")
num[input_vars[feature[node]]].append(threshold[node])
# cat right node
elif child == children_right[node] and input_vars[feature[node]] in input_cat_vars:
cat[input_vars[feature[node]]] = "Yes"
elif child == children_right[node]: # num right node
# if not added then add the list otherwise add the value in the list
if input_vars[feature[node]] not in num:
num[input_vars[feature[node]]] = list()
num[input_vars[feature[node]]].append(">")
num[input_vars[feature[node]]].append(threshold[node])
traverse_sk(child, children_left, children_right, last_node,
feature, adjacency_list, threshold) # recursive call
# Input: node (current node); children_left, children_right, last_node, feature, adjacency_list, threshold (tree structure); cat and num store the current path
# Function: if not a leaf node, check whether the split variable is num or cat and append the constraint to num or cat respectively; if a leaf node, pass the path to post_process to build a test case,
# then pop certain features from cat and num while backtracking, as DFS is performed here
# Popping: Right child: since all categorical vars are one-hot encoded, their names are unique, so we pop them straight away; for numerical vars we look up the feature names (as they may repeat)
# and pop them at the end
# Left child: only pop once; the trees are full, so a right sibling always exists. Again pop based on whether the variable is cat or num, as mentioned above
def traverse_XG(node, children_left, children_right, last_node, feature, adjacency_list, threshold, cat={}, num={}):
if children_left[node] < 0: # leaf node
post_process(cat.copy(), num.copy()) # send for adding to testcase
parent = -1
if node == last_node:
cat.clear()
num.clear()
for i in range(len(children_left)):
if children_left[i] == node:
parent = i
break
# Categorical parent variable popping
# left child, direct pop as all names are distinct in the case of cat
if node in children_left and feature[parent] in input_cat_vars:
cat.popitem()
for i in range(len(children_right)):
if children_right[i] == node:
parent = i
break
# right child whose parent split is cat: walk upwards, popping cat vars directly and storing num feature names to pop afterwards
if node in children_right and feature[parent] in input_cat_vars and node != last_node:
pop_len_2 = 0 # popping length for numerical variable
temp_2 = node # current right child
store = [] # storing the feature names as we traverse to the upper last right child for numerical variables
while temp_2 in children_right:
idx = 0
for i in range(len(children_right)):
if children_right[i] == temp_2:
idx = i
temp_2 = idx
if feature[temp_2] in master_dict_keys:
pop_len_2 += 1
store.append(feature[temp_2])
else:
cat.popitem()
for i in range(len(children_left)):
if children_left[i] == temp_2:
idx = i
temp_2 = idx
if feature[temp_2] in master_dict_keys:
pop_len_2 += 1
store.append(feature[temp_2])
else:
cat.popitem()
for i in range(pop_len_2):
# last feature is added first in store so delete the last element in that feature name
last = store[i]
num[last].pop()
num[last].pop()
if len(num[last]) == 0: # if some feature list is empty, remove it completely
num.pop(last)
for i in range(len(children_left)):
if children_left[i] == node:
parent = i
break
# Numerical parent variable popping
# if left child, we will pop the last two elements added in the last feature name only
if node in children_left and feature[parent] in master_dict_keys:
idx = 0
# finding last feature name by finding the parent of the left child
for i in range(len(children_left)):
if children_left[i] == node:
idx = i
last = feature[idx]
num[last].pop()
num[last].pop()
for i in range(len(children_right)):
if children_right[i] == node:
parent = i
break
# right child, pop if cat else store the feature name and pop at end for num
if node in children_right and node != last_node and feature[parent] in master_dict_keys:
pop_len_2 = 0 # popping length
temp_2 = node # current right child
store = [] # storing the feature names as we traverse to the upper last right child
while temp_2 in children_right:
idx = 0
for i in range(len(children_right)):
if children_right[i] == temp_2:
idx = i
temp_2 = idx
if feature[temp_2] in master_dict_keys:
pop_len_2 += 1
store.append(feature[temp_2])
else:
cat.popitem()
for i in range(len(children_left)):
if children_left[i] == temp_2:
idx = i
temp_2 = idx
if feature[temp_2] in master_dict_keys:
pop_len_2 += 1
store.append(feature[temp_2])
else:
cat.popitem()
for i in range(pop_len_2):
# last feature is added first in store so delete the last element in that feature name
last = store[i]
num[last].pop()
num[last].pop()
if len(num[last]) == 0: # if some feature list is empty, remove it completely
num.pop(last)
else: # not a leaf node
for child in adjacency_list[node]:
# cat left node
if child == children_left[node] and feature[node] in input_cat_vars:
cat[feature[node]] = "No"
elif child == children_left[node]: # num left child
# if not added then add the list otherwise add the value in the list
if feature[node] not in num:
num[feature[node]] = list()
num[feature[node]].append("<=")
num[feature[node]].append(threshold[node])
# cat right node
elif child == children_right[node] and feature[node] in input_cat_vars:
cat[feature[node]] = "Yes"
elif child == children_right[node]: # num right node
# if not added then add the list otherwise add the value in the list
if feature[node] not in num:
num[feature[node]] = list()
num[feature[node]].append(">")
num[feature[node]].append(threshold[node])
traverse_XG(child, children_left, children_right, last_node,
feature, adjacency_list, threshold) # recursive call
Common function for choosing the library to traverse based on the user input
def traverse(id, node, children_left, children_right, last_node, feature, adjacency_list, threshold):
if id == 0: # sklearn
traverse_sk(node, children_left, children_right,
last_node, feature, adjacency_list, threshold)
elif id == 1: # XGBoost
traverse_XG(node, children_left, children_right,
last_node, feature, adjacency_list, threshold)
We split the generated test cases in the same train:test ratio as the original data, add them back to the corresponding splits and to the full dataset, and return the updated dataframes.
def data(id, df, i, testcases=[]):
if i == 0:
if id == 1:
df['income'] = df['income'].map({'>50K': 1, '<=50K': 0})
df_x = df.drop('income', axis=1)
df_y = df['income']
df_encoded = pd.get_dummies(df_x)
X_train, X_test, y_train, y_test = train_test_split(
df_encoded, df_y, test_size=0.2, random_state=0)
if id == 1:
y_train = y_train.astype("category")
y_test = y_test.astype("category")
return df, X_train, y_train, X_test, y_test
test_x = testcases.drop('income', axis=1) # Train
test_y = testcases['income'] # To be predicted
X_train_tc, X_test_tc, y_train_tc, y_test_tc = train_test_split(
test_x, test_y, test_size=0.2, random_state=0)
if id == 1:
y_train_tc = y_train_tc.astype("category")
y_test_tc = y_test_tc.astype("category")
df_x = df.drop('income', axis=1) # Train
df_y = df['income'] # To be predicted
X_train_or, X_test_or, y_train_or, y_test_or = train_test_split(
df_x, df_y, test_size=0.2, random_state=0)
if id == 1:
y_train_or = y_train_or.astype("category")
y_test_or = y_test_or.astype("category")
df = pd.concat([df, testcases])
X_train = pd.concat([X_train_or, X_train_tc])
y_train = pd.concat([y_train_or, y_train_tc])
X_test = pd.concat([X_test_or, X_test_tc])
y_test = pd.concat([y_test_or, y_test_tc])
X_train = pd.get_dummies(X_train, drop_first=False)
X_test = pd.get_dummies(X_test, drop_first=False)
if 'native.country_Holand-Netherlands' not in X_test.columns.values.tolist():
X_test.insert(81, 'native.country_Holand-Netherlands', 0)
return df, X_train, y_train, X_test, y_test
The user can run this for as many iterations as needed; id switches between the libraries (0 for scikit-learn, 1 for XGBoost).
iterations = 5
id = 0
accuracy = []
for i in range(iterations):
if i == 0:
df, X_train, y_train, X_test, y_test = data(id, df, i)
df_x = df.drop('income', axis=1)
df_encoded = pd.get_dummies(df_x)
input_vars = df_encoded.columns.values.tolist() # feature names
input_cat_vars = input_vars.copy()
for i in range(len(master_dict)):
input_cat_vars.remove(master_dict_keys[i])
enum_store_key = list(enum_store.keys())
tree_dict = []
if id == 0:
tree = DecisionTreeClassifier(max_depth=10, random_state=0)
elif id == 1:
tree = xgb.XGBClassifier(
n_estimators=1, tree_method="gpu_hist", enable_categorical=True, max_depth=8)
tree.fit(X_train, y_train)
if id == 1:
booster = tree.get_booster()
tree = booster.get_dump(dump_format='json')[0]
tree_dict = json.loads(tree)
true_classes, children_left, children_right, feature, threshold, input_vars, last_node, adjacency_list, output = params(
id, tree, df, tree_dict)
main = [] # stores the testcases
traverse(id, 0, children_left, children_right,
last_node, feature, adjacency_list, threshold)
preds = tree.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, preds))
accuracy.append(metrics.accuracy_score(y_test, preds))
test_cases = pd.DataFrame(main) # testcases
test_cases = test_cases[df.columns.values.tolist()]
df, X_train, y_train, X_test, y_test = data(id, df, i, test_cases)
Accuracy: 0.8498387839705205
Accuracy: 0.8475247524752475
Accuracy: 0.8507936507936508
Accuracy: 0.856157192140393
Accuracy: 0.8386090057958092
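Since matplotlib is already imported, the collected accuracy values can be inspected across iterations, for example:
plt.plot(range(1, iterations + 1), accuracy, marker='o')  # accuracy recorded in the loop above
plt.xlabel('Iteration')
plt.ylabel('Test accuracy')
plt.title('Accuracy after adding generated test cases each iteration')
plt.show()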