import argparse import pandas as pd import numpy as np import pickle from pathlib import Path from collections import de
Posted: Wed Apr 27, 2022 3:46 pm
import argparse import pandas as pd import numpy as np import pickle from pathlib import Path from collections import defaultdict [10 points] Problem 1 - Building a Decision Tree A sample dataset has been provided to you at the path './data/dataset.csv'. Use this dataset to test your functions. Here are the attributes of the dataset: • Age - ["<=30", "31-40", ">40"] • Income - ["low", "medium", "high"] • Student - ["no", "yes"] • Credit Rating - ["fair", "excellent"] • Loan - ["no", "yes"] Note: • A sample dataset to test your code has been provided in the location "data/dataset.csv". Please keep it there, as it will be needed for grading. • Do not change the variable names of the returned values. After calculating each of those values, assign it to the corresponding variable that is being returned. • The "Loan" attribute should be used as the target variable when making calculations for your decision tree.
import math

import pandas as pd


def information_gain_target(dataset_file):
    """Return the information (entropy, in bits) of the target variable.

    Input:
        dataset_file - A string variable which references the path to the
                       dataset file (CSV with a header row).

    Output:
        ig_loan - A floating point variable which holds the information
                  associated with the target variable "Loan".

    NOTE:
        1. The Loan attribute is the target variable.
        2. The pandas dataframe has the following attributes:
           Age, Income, Student, Credit Rating, Loan
        3. An empty dataset yields 0.0 rather than raising.
    """
    df = pd.read_csv(dataset_file)

    # Class frequencies of the target column; missing values are ignored by
    # value_counts, so probabilities are normalised by the counted total.
    class_counts = df["Loan"].value_counts()
    total = class_counts.sum()

    ig_loan = 0.0
    for count in class_counts:
        if count > 0:  # 0 * log2(0) is taken as 0 by convention
            probability = float(count) / float(total)
            ig_loan -= probability * math.log2(probability)
    return ig_loan
# Possible values of every non-target attribute in the dataset.
attribute_values = {
    "Age": ["<=30", "31-40", ">40"],
    "Income": ["low", "medium", "high"],
    "Student": ["yes", "no"],
    "Credit Rating": ["fair", "excellent"],
}

# Non-target attributes, in the order they are evaluated.
attributes = ["Age", "Income", "Student", "Credit Rating"]


def information_gain(p_count_yes, p_count_no):
    """Return the information (binary entropy, in bits) for yes/no counts.

    A helper function that returns the information when given counts of the
    number of "yes" and "no" values. It is intended to be completed before
    the information_gain_attributes function.

    Input:
        p_count_yes - Number of samples whose target value is "yes".
        p_count_no  - Number of samples whose target value is "no".

    Output:
        ig - A float in [0, 1]; 0.0 when either count (or both) is zero.
    """
    import math  # local import keeps this block self-contained

    total = p_count_yes + p_count_no
    ig = 0.0
    if total == 0:
        return ig

    for count in (p_count_yes, p_count_no):
        if count > 0:  # 0 * log2(0) is taken as 0 by convention
            probability = count / total
            ig -= probability * math.log2(probability)
    return ig
import operator


def _label_entropy(labels):
    """Return the Shannon entropy (in bits) of a pandas Series of labels."""
    import math  # local import keeps this block self-contained

    counts = labels.value_counts()
    total = counts.sum()
    entropy = 0.0
    for count in counts:
        if count > 0:  # 0 * log2(0) is taken as 0 by convention
            probability = float(count) / float(total)
            entropy -= probability * math.log2(probability)
    return entropy


def information_gain_attributes(dataset_file, ig_loan, attributes, attribute_values):
    """Compute the information gain of every attribute w.r.t. "Loan".

    Input:
        1. dataset_file - A string variable which references the path to the
           dataset file.
        2. ig_loan - A floating point variable representing the information
           of the target variable "Loan".
        3. attributes - A python list which has all the attributes of the
           dataset.
        4. attribute_values - A python dictionary representing the values
           each attribute can hold. Only the values listed here contribute
           to an attribute's conditional entropy.

    Output:
        results - A python dictionary with two entries:
        1. ig_attributes - A sub dictionary representing the information
           gain for each attribute.
        2. best_attribute - The attribute which has the highest information
           gain (None when `attributes` is empty).

    NOTE:
        1. The Loan attribute is the target variable.
        2. The pandas dataframe has the following attributes:
           Age, Income, Student, Credit Rating, Loan
    """
    results = {
        "ig_attributes": {attribute: 0 for attribute in attributes},
        "best_attribute": None,
    }

    df = pd.read_csv(dataset_file)
    d_range = len(df)

    for attribute in attributes:
        # Weighted average entropy of "Loan" after splitting on `attribute`.
        ig_attribute = 0.0
        vcount = df[attribute].value_counts()
        for att_value in attribute_values[attribute]:
            subset_size = vcount.get(att_value, 0)
            if subset_size == 0:
                continue  # value absent from the data: contributes nothing
            subset_labels = df.loc[df[attribute] == att_value, "Loan"]
            weight = float(subset_size) / d_range
            ig_attribute += weight * _label_entropy(subset_labels)
        results["ig_attributes"][attribute] = ig_loan - ig_attribute

    if results["ig_attributes"]:
        results["best_attribute"] = max(
            results["ig_attributes"].items(), key=operator.itemgetter(1)
        )[0]
    return results
import math

import pandas as pd


def information_gain_target(dataset_file):
    """Return the information (entropy, in bits) of the target variable.

    Input:
        dataset_file - A string variable which references the path to the
                       dataset file (CSV with a header row).

    Output:
        ig_loan - A floating point variable which holds the information
                  associated with the target variable "Loan".

    NOTE:
        1. The Loan attribute is the target variable.
        2. The pandas dataframe has the following attributes:
           Age, Income, Student, Credit Rating, Loan
        3. An empty dataset yields 0.0 rather than raising.
    """
    df = pd.read_csv(dataset_file)

    # Class frequencies of the target column; missing values are ignored by
    # value_counts, so probabilities are normalised by the counted total.
    class_counts = df["Loan"].value_counts()
    total = class_counts.sum()

    ig_loan = 0.0
    for count in class_counts:
        if count > 0:  # 0 * log2(0) is taken as 0 by convention
            probability = float(count) / float(total)
            ig_loan -= probability * math.log2(probability)
    return ig_loan
# Possible values of every non-target attribute in the dataset.
attribute_values = {
    "Age": ["<=30", "31-40", ">40"],
    "Income": ["low", "medium", "high"],
    "Student": ["yes", "no"],
    "Credit Rating": ["fair", "excellent"],
}

# Non-target attributes, in the order they are evaluated.
attributes = ["Age", "Income", "Student", "Credit Rating"]


def information_gain(p_count_yes, p_count_no):
    """Return the information (binary entropy, in bits) for yes/no counts.

    A helper function that returns the information when given counts of the
    number of "yes" and "no" values. It is intended to be completed before
    the information_gain_attributes function.

    Input:
        p_count_yes - Number of samples whose target value is "yes".
        p_count_no  - Number of samples whose target value is "no".

    Output:
        ig - A float in [0, 1]; 0.0 when either count (or both) is zero.
    """
    import math  # local import keeps this block self-contained

    total = p_count_yes + p_count_no
    ig = 0.0
    if total == 0:
        return ig

    for count in (p_count_yes, p_count_no):
        if count > 0:  # 0 * log2(0) is taken as 0 by convention
            probability = count / total
            ig -= probability * math.log2(probability)
    return ig
import operator


def _label_entropy(labels):
    """Return the Shannon entropy (in bits) of a pandas Series of labels."""
    import math  # local import keeps this block self-contained

    counts = labels.value_counts()
    total = counts.sum()
    entropy = 0.0
    for count in counts:
        if count > 0:  # 0 * log2(0) is taken as 0 by convention
            probability = float(count) / float(total)
            entropy -= probability * math.log2(probability)
    return entropy


def information_gain_attributes(dataset_file, ig_loan, attributes, attribute_values):
    """Compute the information gain of every attribute w.r.t. "Loan".

    Input:
        1. dataset_file - A string variable which references the path to the
           dataset file.
        2. ig_loan - A floating point variable representing the information
           of the target variable "Loan".
        3. attributes - A python list which has all the attributes of the
           dataset.
        4. attribute_values - A python dictionary representing the values
           each attribute can hold. Only the values listed here contribute
           to an attribute's conditional entropy.

    Output:
        results - A python dictionary with two entries:
        1. ig_attributes - A sub dictionary representing the information
           gain for each attribute.
        2. best_attribute - The attribute which has the highest information
           gain (None when `attributes` is empty).

    NOTE:
        1. The Loan attribute is the target variable.
        2. The pandas dataframe has the following attributes:
           Age, Income, Student, Credit Rating, Loan
    """
    results = {
        "ig_attributes": {attribute: 0 for attribute in attributes},
        "best_attribute": None,
    }

    df = pd.read_csv(dataset_file)
    d_range = len(df)

    for attribute in attributes:
        # Weighted average entropy of "Loan" after splitting on `attribute`.
        ig_attribute = 0.0
        vcount = df[attribute].value_counts()
        for att_value in attribute_values[attribute]:
            subset_size = vcount.get(att_value, 0)
            if subset_size == 0:
                continue  # value absent from the data: contributes nothing
            subset_labels = df.loc[df[attribute] == att_value, "Loan"]
            weight = float(subset_size) / d_range
            ig_attribute += weight * _label_entropy(subset_labels)
        results["ig_attributes"][attribute] = ig_loan - ig_attribute

    if results["ig_attributes"]:
        results["best_attribute"] = max(
            results["ig_attributes"].items(), key=operator.itemgetter(1)
        )[0]
    return results