# Source code for doenut.models.averaged_model

import logging
from typing import Tuple

import numpy as np
import pandas as pd
import doenut.utils
import copy

from doenut.data.modifiable_data_set import ModifiableDataSet
from doenut.models.model_set import ModelSet
from doenut.models.model import Model


# Module-level logger shared by everything in this file; it is created through
# doenut's own helper with DEBUG requested as the level.
logger = doenut.utils.initialise_log(__name__, logging.DEBUG)


class AveragedModel(Model):
    """Model scored as the average of multiple models generated from a single
    set of inputs via a leave-one-out approach.

    Parameters
    ----------
    data : doenut.data.ModifiableDataSet
        The data to run / test against. It is deep-copied before any
        scaling or de-duplication is applied.
    scale_data : bool, default True
        Whether to scale the overall data before running it.
    scale_run_data : bool, default True
        Whether to normalise the data for each leave-one-out run, using the
        training rows only.
    fit_intercept : bool, default True
        Whether the models fit an intercept. The flag is passed through to
        the base ``Model`` and to every leave-one-out model.
    response_key : str, optional
        For multi-column responses, which one to test on. May be omitted
        only when there is a single response column.
    drop_duplicates : {'yes', 'no', 'average'}
        Whether to drop duplicate values or not. 'average' causes them to be
        dropped, but the one left will have its response value(s) set to the
        average of all the duplicates. Matching is case-insensitive. A
        non-string value skips the de-duplication step.

    Raises
    ------
    ValueError
        If ``response_key`` is omitted while there are multiple response
        columns, or if ``drop_duplicates`` is an unrecognised string.
    """

    @classmethod
    def tune_model(
        cls,
        data: ModifiableDataSet,
        fit_intercept: bool = True,
        response_key: str = None,
        drop_duplicates: str = "yes",
    ) -> Tuple["AveragedModel", "AveragedModel"]:
        """Generate a pair of models from the same set of data.

        One uses scaled data, the other unscaled. The scaled model can then
        be used for determining which columns to drop for later models, and
        the unscaled model for checking the model's performance against
        validation data (or just for using once done).

        Parameters
        ----------
        data : doenut.data.ModifiableDataSet
            The dataset to test against. This should be unscaled.
        fit_intercept : bool, default True
            Whether to fit the intercept or not (usually yes).
        response_key : str, optional
            If there is more than one response column, which to use.
        drop_duplicates : {'yes', 'no', 'average'}
            Whether to drop duplicate values or not. 'average' causes them
            to be dropped, but the one left will have its response value(s)
            set to the average of all the duplicates.

        Returns
        -------
        AveragedModel
            The generated scaled model.
        AveragedModel
            The generated unscaled model.
        """
        logger.info("Running Tune Model")
        logger.debug("Generating scaled model")
        scaled_model = AveragedModel(
            data,
            scale_data=True,
            scale_run_data=True,
            fit_intercept=fit_intercept,
            response_key=response_key,
            drop_duplicates=drop_duplicates,
        )
        logger.debug("Generating unscaled model")
        unscaled_model = AveragedModel(
            data,
            scale_data=False,
            scale_run_data=False,
            fit_intercept=fit_intercept,
            response_key=response_key,
            drop_duplicates=drop_duplicates,
        )
        return scaled_model, unscaled_model

    @staticmethod
    def _apply_duplicate_policy(proc_data, drop_duplicates) -> None:
        """Call the duplicate-handling method on *proc_data* that matches
        *drop_duplicates*; raise ``ValueError`` for an unknown string."""
        if not isinstance(drop_duplicates, str):
            # Non-string values have always been tolerated here and simply
            # leave the data as it is.
            return
        policy = drop_duplicates.lower()
        if policy == "yes":
            proc_data.drop_duplicates()
        elif policy == "average":
            proc_data.average_duplicates()
        elif policy != "no":
            raise ValueError(
                f"Invalid drop_duplicates value {drop_duplicates}"
                " - should be one of 'yes', 'no', 'average'"
            )

    def __init__(
        self,
        data: ModifiableDataSet,
        scale_data: bool = True,
        scale_run_data: bool = True,
        fit_intercept: bool = True,
        response_key: str = None,
        drop_duplicates: str = "yes",
    ):
        logger.info("Constructing AveragedModel")
        # Work on a private copy so the caller's data set is not modified by
        # the scaling / de-duplication below.
        proc_data = copy.deepcopy(data)
        if scale_data:
            proc_data.scale(False)

        # Call super to set up basic model
        super().__init__(proc_data.get(), fit_intercept)

        # Check the columns: with a single response column the key can be
        # inferred, otherwise the caller has to name one.
        responses = self.data.get_responses()
        if response_key is None:
            if len(responses.columns) > 1:
                raise ValueError(
                    "No response key specified and multiple response columns"
                )
            response_key = responses.columns[0]
            logger.info(f"Setting response_key to {response_key}")

        # Get the processed inputs + responses (after filtering + dedupe)
        self._apply_duplicate_policy(proc_data, drop_duplicates)
        final_data = proc_data.get()
        proc_inputs = final_data.get_inputs()
        proc_responses = final_data.get_responses()
        logger.debug(
            f"Final data sizes: inputs {proc_inputs.shape}, responses {proc_responses.shape}"
        )

        # Use leave-one-out on the input data rows to generate a set of models
        self.models = ModelSet(None, None, fit_intercept)
        model_predictions = []
        model_responses = []
        for i, row_idx in enumerate(proc_inputs.index):
            logger.debug(f"Testing against row {row_idx}")
            # The held-out row is selected by position, the training rows by
            # dropping its index label.
            test_input = proc_inputs.iloc[i].to_numpy().reshape(1, -1)
            test_response = proc_responses.iloc[i]
            train_input = proc_inputs.drop(row_idx).to_numpy()
            train_responses = proc_responses.drop(row_idx)
            # We need to re-scale each column, using the training data *only*,
            # but then applying the same scaling to the test data.
            if scale_run_data:
                train_input, mj, rj = doenut.orthogonal_scaling(train_input, 0)
                test_input = doenut.scale_by(test_input, mj, rj)
            model = self.models.add_model(
                train_input, train_responses, False, fit_intercept
            )
            predictions = model.get_predictions_for(test_input)[0]
            model_predictions.append(predictions)
            model_responses.append(test_response)

        # Average the per-run coefficients and intercepts across all models.
        self.coeffs = self.models.get_attributes("coef_")
        self.intercepts = self.models.get_attributes("intercept_")
        self.averaged_coeffs = np.mean(np.array(self.coeffs), axis=0)
        self.averaged_intercepts = np.mean(np.array(self.intercepts), axis=0)
        self.r2s = self.models.get_r2s()

        # replace our initial model with the averaged one to determine R2/Q2.
        self.model.coef_ = self.averaged_coeffs
        self.model.intercept_ = self.averaged_intercepts
        self.r2 = self.get_r2_for(final_data)

        # Now calculate q2 from the held-out predictions and their true values
        self.q2_predictions = pd.DataFrame.from_records(
            model_predictions, columns=proc_responses.columns
        )
        self.q2_ground_truths = pd.DataFrame.from_records(
            model_responses, columns=proc_responses.columns
        )
        self.q2 = doenut.Calculate_Q2(
            self.q2_ground_truths,
            self.q2_predictions,
            proc_responses,
            response_key,
        )

        # finally make a fitted model.
        # NOTE(review): this fits on the data set as passed in (`data`), not
        # on the processed copy, while the predictions below are made for the
        # processed inputs - confirm that this is intended.
        start_data = data.get()
        self.model.fit(start_data.get_inputs(), start_data.get_responses())
        self.predictions = self.get_predictions_for(proc_inputs)
        logger.info("Constructed AveragedModel")