Source code for doenut.designer

############################################################################################################
#
#              DoENUT Designer
#
############################################################################################################
from typing import Dict, Any, Iterable, List

import doepy.build

# !!! TO-DO !!!
#
# make this into a nice proper class
import pandas as pd
import numpy as np
import copy
import doenut.utils
import logging


logger = doenut.utils.initialise_log(__name__, logging.DEBUG)


[docs] def _check_is_input_dict(data: Dict[Any, Iterable]) -> None: """ Validate an input dictionary's type. Most of these functions require a dictionary of lists as their input data This is a helper function that will throw an appropriate assert if needed. Parameters ---------- data : Dict[Any, Iterable] The data dictionary to validate Raises ------ TypeError If the data is of the wrong type, or if any of the values in the dictionary are not iterable """ if not isinstance(data, dict): raise TypeError("Input data must be a dictionary") for key, value in data.items(): try: _ = iter(value) except TypeError as e: print(f"Parameter {key} is not iterable") raise e
def get_ranges(data: Dict[Any, Iterable[float]]) -> Dict[Any, List[float]]:
    """
    Find the range of each entry in an input dictionary.

    Walk a dictionary of value collections and return a dictionary with the
    same keys, where each value has been reduced to ``[min, max]``.

    Parameters
    ----------
    data : Dict[Any, Iterable[float]]
        The input dictionary to parse.

    Returns
    -------
    Dict[Any, List[float]]
        A dictionary with the same keys, each mapped to a two-element list
        holding the minimum and maximum of the corresponding input values.

    Raises
    ------
    TypeError
        If ``data`` is not a dictionary of iterables
        (raised by ``_check_is_input_dict``).
    ValueError
        If any of the value collections is empty (raised by ``min``).
    """
    # first check we are being passed something sane
    _check_is_input_dict(data)

    result = {}
    for key, value in data.items():
        # Materialise once: the signature allows any iterable, and a
        # one-shot iterator would be exhausted by min() before max() could
        # read it.
        values = list(value)
        result[key] = [min(values), max(values)]
        logger.debug(f"Result range for {key}: {result[key]}")
    return result
def full_fact(data: Dict[Any, List[float]]) -> pd.DataFrame:
    """Generate a full factorial design from the supplied parameters.

    Every combination of the supplied levels appears exactly once. The first
    column varies slowest and the last column varies fastest.

    Parameters
    ----------
    data : Dict[Any, List[float]]
        Dictionary mapping each parameter to the list of values it may take.

    Returns
    -------
    pd.DataFrame
        An object-dtype dataframe with one column per parameter and one row
        per generated experiment. If any parameter has no levels the frame
        has the expected columns but zero rows.

    Raises
    ------
    TypeError
        If any of the values in ``data`` is not iterable.
    """
    # First validate the inputs are all lists or list-like and, while we are
    # here, work out how big the design is.
    row_count = 1
    for key, value in data.items():
        try:
            _ = iter(value)
        except TypeError as e:
            print(f"Parameter {key} is not iterable")
            raise e
        row_count *= len(value)

    columns = list(data.keys())
    logger.info(
        f"Creating full factoral model of shape {row_count}x{len(columns)}"
    )
    result = np.zeros((row_count, len(columns)), dtype="O")

    if row_count == 0:
        # At least one parameter has no levels, so there are no combinations
        # to enumerate (and the block arithmetic below would divide by zero).
        return pd.DataFrame(columns=columns, dtype=object, data=result)

    # Now build up the data column by column.
    # left_data: how many row 'blocks' there are to the left of this column.
    left_data = 1
    # right_data: how many rows each of those blocks spans.
    right_data = row_count
    # All divisions below are exact (row_count is the product of the level
    # counts), so floor division keeps everything as ints.
    for column_idx, (column, values) in enumerate(data.items()):
        logger.debug(f"Generating for column {column}")
        value_count = len(values)
        # how many times do we need to write this value in a row?
        rows_per_value = right_data // value_count
        for group in range(left_data):
            # work out where this group begins
            offset = rows_per_value * value_count * group
            for idx, value in enumerate(values):
                start = offset + idx * rows_per_value
                end = start + rows_per_value
                result[start:end, column_idx] = value
        # re-establish the invariants
        left_data *= value_count
        right_data = rows_per_value

    return pd.DataFrame(columns=columns, dtype=object, data=result)
def frac_fact(
    data: Dict[Any, List[float]], resolution: int = None
) -> pd.DataFrame:
    """Build a 2-level fractional factorial design.

    Only the minimum and maximum of each parameter's values are used as the
    two levels; the design itself is produced by
    ``doepy.build.frac_fact_res``.

    Parameters
    ----------
    data : Dict[Any, List[float]]
        Dictionary mapping each parameter to its list of values.
    resolution : int, optional
        What resolution design to build. Defaults to
        ``len(data) // 2 + 1``.

    Returns
    -------
    pd.DataFrame
        A dataframe of all the experiments.

    Raises
    ------
    TypeError
        If ``data`` is not a dictionary of iterables.
    ValueError
        If ``resolution`` is not smaller than the number of parameters, or
        is less than 2.
    """
    _check_is_input_dict(data)
    param_count = len(data)
    if resolution is None:
        resolution = param_count // 2 + 1
    if resolution >= param_count:
        raise ValueError(
            "Resolution has to be less than the number of parameters"
        )
    if resolution == 1:
        raise ValueError("Resolution of 1 is meaningless")
    if resolution < 1:
        # Zero and negative resolutions are just as meaningless as 1; reject
        # them here rather than passing them on to doepy.
        raise ValueError("Resolution must be a positive integer")
    # only want the limits
    data_ranges = get_ranges(data)
    # TODO:: Now implement the hard bit!
    return doepy.build.frac_fact_res(data_ranges, resolution)
# TODO:: this should be a base class with the actual experiment overridden in the sub-class
def experiment_designer(
    levels, res, do_midpoints=True, shuffle=True, repeats=1, num_midpoints=3
):
    """Build a fractional factorial design with optional repeats and midpoints.

    Parameters
    ----------
    levels : dict
        Dictionary mapping each factor name to its list of levels.
    res : int
        The resolution passed to ``doepy.build.frac_fact_res``.
    do_midpoints : bool
        Whether to add midpoint experiments. (Default value = True)
    shuffle : bool
        Whether to shuffle the rows of the finished design.
        (Default value = True)
    repeats : int
        How many times the NON-MIDPOINT experiments appear in the design;
        values of 1 or less leave the base design as is.
        (Default value = 1)
    num_midpoints : int
        How many midpoint experiments to add. (Default value = 3)

    Returns
    -------
    pd.DataFrame
        The design, one row per experiment.
    """
    # deepcopy as doepy's code overwrites the levels it is given
    levels_in = copy.deepcopy(levels)
    design = doepy.build.frac_fact_res(levels_in, res=res)

    if repeats > 1:
        # Repeat only the factorial points; midpoints are added afterwards
        # and have their own count.
        design = pd.concat([design] * repeats, ignore_index=True)

    if do_midpoints and num_midpoints > 0:
        midpoints = {}
        for factor, values in levels.items():
            # Median when intermediate levels were supplied, otherwise the
            # mean of the two extremes.
            if len(values) > 2:
                centre = np.median(values)
            else:
                centre = np.mean(values)
            midpoints[factor] = np.repeat(centre, num_midpoints)
        midpoint_df = pd.DataFrame.from_dict(midpoints)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to add rows.
        design = pd.concat([design, midpoint_df], ignore_index=True)

    if shuffle:
        design = design.sample(frac=1)
    return design
def frac_fact_res_designer(
    levels, res, do_midpoints=True, shuffle=True, repeats=1, num_midpoints=3
):
    """Build a fractional factorial design, optionally with midpoints.

    Parameters
    ----------
    levels : dict
        Dictionary mapping each factor name to its list of levels.
    res : int
        The resolution passed to ``doepy.build.frac_fact_res``.
    do_midpoints : bool
        Whether to add midpoint experiments. (Default value = True)
    shuffle : bool
        Whether to shuffle the rows of the finished design.
        (Default value = True)
    repeats : int
        Accepted for signature compatibility; this function does not use it.
        (Default value = 1)
    num_midpoints : int
        How many midpoint experiments to add. (Default value = 3)

    Returns
    -------
    pd.DataFrame
        The design, one row per experiment.
    """
    # Work on a copy so the caller's dictionary is never modified by doepy.
    levels_in = copy.deepcopy(levels)
    design = doepy.build.frac_fact_res(levels_in, res=res)

    if do_midpoints and num_midpoints > 0:
        midpoints = {}
        for factor, values in levels.items():
            # Median when intermediate levels were supplied, otherwise the
            # mean of the two extremes.
            if len(values) > 2:
                centre = np.median(values)
            else:
                centre = np.mean(values)
            midpoints[factor] = np.repeat(centre, num_midpoints)
        midpoint_df = pd.DataFrame.from_dict(midpoints)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to add rows.
        design = pd.concat([design, midpoint_df], ignore_index=True)

    if shuffle:
        design = design.sample(frac=1)
    return design
def fact_designer(
    levels, do_midpoints=True, shuffle=True, repeats=1, num_midpoints=3
):
    """Build a full factorial design, optionally with midpoints.

    Parameters
    ----------
    levels : dict
        Dictionary mapping each factor name to its list of levels.
    do_midpoints : bool
        Whether to add midpoint experiments. (Default value = True)
    shuffle : bool
        Whether to shuffle the rows of the finished design.
        (Default value = True)
    repeats : int
        Not used by this function. (Default value = 1)
    num_midpoints : int
        How many midpoint experiments to add. (Default value = 3)

    Returns
    -------
    pd.DataFrame
        The design, one row per experiment.
    """
    # The factorial part is built from a private copy of the levels.
    design = full_fact(copy.deepcopy(levels))

    if do_midpoints:
        centre_rows = {}
        for name, values in levels.items():
            # Median when intermediate levels were supplied, otherwise the
            # mean of the two extremes.
            centre = np.median(values) if len(values) > 2 else np.mean(values)
            centre_rows[name] = np.repeat(centre, num_midpoints)
        design = pd.concat(
            [design, pd.DataFrame.from_dict(centre_rows)], ignore_index=True
        )

    return design.sample(frac=1) if shuffle else design