Source code for doenut.models.averaged_model_set

import logging

import pandas as pd

import doenut.utils
from doenut.data import ModifiableDataSet
from doenut.models.model_set import ModelSet
from doenut.models.averaged_model import AveragedModel

# Module-level logger, obtained from doenut's own helper using this module's
# name and the DEBUG level.
logger = doenut.utils.initialise_log(__name__, logging.DEBUG)



[docs]
class AveragedModelSet(ModelSet):
    """Class to train and hold a group of related (averaged) models.

    When constructing the AveragedModelSet, you can define default values.
    Then when adding a new model to the set you only have to specify the
    parameters which differ from the default.

    Parameters
    ----------
    default_inputs: pd.DataFrame, optional
        The default inputs to the model
    default_responses: pd.DataFrame, optional
        The default responses for the model
    default_scale_data: bool, optional
        Whether to scale the data before adding to the model by default
    default_scale_run_data: bool, optional
        Whether to scale the data for each train/test set by default
    default_fit_intercept: bool, optional
        Whether to fit the model's intercept to the axis by default
    default_response_key: str or list, optional
        The default column(s) to pick from the responses. When omitted,
        ``[0]`` is used.
    default_drop_duplicates: {'no', 'yes', 'averages'}, optional
        What to do with duplicates in the inputs, by default
    default_input_selector: List, optional
        What columns from the input data to select by default. When omitted,
        an empty list is used, which means no filtering is applied when a
        model is added.

    """

    @classmethod
    def multiple_response_columns(
        cls,
        inputs: pd.DataFrame = None,
        responses: pd.DataFrame = None,
        scale_data: bool = True,
        scale_run_data: bool = True,
        fit_intercept: bool = True,
        drop_duplicates: str = "yes",
        input_selector: list = None,
    ) -> "AveragedModelSet":
        """Build a model set holding one model per column of ``responses``.

        The arguments become the defaults of the new set, and ``add_model``
        is then called once for every column in ``responses.columns`` with
        that column as the response key.

        Parameters
        ----------
        inputs: pd.DataFrame
            The inputs shared by every model
        responses: pd.DataFrame
            The responses; one model is added per column
        scale_data: bool, optional
            Whether to scale the data before adding to the model
        scale_run_data: bool, optional
            Whether to scale the data for each train/test set
        fit_intercept: bool, optional
            Whether to fit the model's intercept to the axis
        drop_duplicates: {'no', 'yes', 'averages'}, optional
            What to do with duplicates in the inputs
        input_selector: List, optional
            What columns from the input data to select. Defaults to an
            empty list (no filtering).

        Returns
        -------
        AveragedModelSet
            The populated set (an instance of ``cls``)

        """
        # A fresh list per call: a literal ``[]`` default would be one object
        # shared by every set created through this constructor.
        if input_selector is None:
            input_selector = []

        logger.info("Generating AveragedModelSet")
        # Use ``cls`` so subclasses get an instance of their own type.
        result = cls(
            default_inputs=inputs,
            default_responses=responses,
            default_scale_data=scale_data,
            default_scale_run_data=scale_run_data,
            default_fit_intercept=fit_intercept,
            # The response key is supplied per model in the loop below.
            default_response_key=[],
            default_drop_duplicates=drop_duplicates,
            default_input_selector=input_selector,
        )
        for column in responses.columns:
            logger.debug("Adding model for response key %s", column)
            result.add_model(response_key=column)
        return result

    def __init__(
        self,
        default_inputs: pd.DataFrame = None,
        default_responses: pd.DataFrame = None,
        default_scale_data: bool = True,
        default_scale_run_data: bool = True,
        default_fit_intercept: bool = True,
        default_response_key: list = None,
        default_drop_duplicates: str = "yes",
        default_input_selector: list = None,
    ):
        """Store the default values used by ``add_model``.

        See the class docstring for the meaning of each parameter.
        """
        super().__init__(
            default_inputs,
            default_responses,
            default_scale_data,
            default_fit_intercept,
        )
        self.default_scale_run_data = default_scale_run_data
        # Build the list defaults per instance rather than in the signature,
        # so that no two sets ever share (and accidentally mutate) one list.
        self.default_response_key = (
            [0] if default_response_key is None else default_response_key
        )
        self.default_drop_duplicates = default_drop_duplicates
        self.default_input_selector = (
            [] if default_input_selector is None else default_input_selector
        )

    def add_model(
        self,
        inputs=None,
        responses=None,
        scale_data=None,
        scale_run_data=None,
        fit_intercept=None,
        response_key=None,
        drop_duplicates=None,
        input_selector=None,
    ):
        """Add a new AveragedModel to the set

        Every argument is passed through ``_validate_value`` together with
        its name, and the value it returns is the one actually used.

        Parameters
        ----------
        inputs: pd.DataFrame, optional
            The inputs to the model
        responses: pd.DataFrame, optional
            The responses for the model
        scale_data: bool, optional
            Whether to scale the data before adding to the model
        scale_run_data: bool, optional
            Whether to scale the data for each train/test set
        fit_intercept: bool, optional
            Whether to fit the model's intercept to the axis
        response_key: str, optional
            The column to pick from the responses
        drop_duplicates: {'no', 'yes', 'averages'}, optional
            What to do with duplicates in the inputs
        input_selector: List, optional
            What columns from the input data to select

        Returns
        -------
        doenut.models.AveragedModel
            The generated model

        """
        inputs = self._validate_value("inputs", inputs)
        responses = self._validate_value("responses", responses)
        scale_data = self._validate_value("scale_data", scale_data)
        scale_run_data = self._validate_value("scale_run_data", scale_run_data)
        fit_intercept = self._validate_value("fit_intercept", fit_intercept)
        response_key = self._validate_value("response_key", response_key)
        drop_duplicates = self._validate_value(
            "drop_duplicates", drop_duplicates
        )
        input_selector = self._validate_value("input_selector", input_selector)

        data = ModifiableDataSet(inputs, responses)
        # The data set is not scaled here; the ``scale_data`` flag is handed
        # to AveragedModel below instead.
        if input_selector:
            # An empty selector means "keep every input column".
            data.filter(input_selector)
        model = AveragedModel(
            data,
            scale_data,
            scale_run_data,
            fit_intercept,
            response_key,
            drop_duplicates,
        )
        self.models.append(model)
        return model