Source code for pybaselines.optimizers

# -*- coding: utf-8 -*-
"""High level functions for making better use of baseline algorithms.

Functions in this module make use of other baseline algorithms in
pybaselines to provide better results or optimize parameters.

Created on March 3, 2021
@author: Donald Erb

"""

from collections import defaultdict
import itertools
from math import ceil

import numpy as np

from . import classification, misc, morphological, polynomial, smooth, spline, whittaker
from ._algorithm_setup import _Algorithm, _class_wrapper
from ._validation import _check_optional_array
from .utils import _check_scalar, _get_edges, _sort_array, gaussian


class _Optimizers(_Algorithm):
    """A base class for all optimizer algorithms."""

[docs] @_Algorithm._register(ensure_1d=False, skip_sorting=True) def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=None): """ Collaborative Penalized Least Squares (collab-PLS). Averages the data or the fit weights for an entire dataset to get more optimal results. Uses any Whittaker-smoothing-based or weighted spline algorithm. Parameters ---------- data : array-like, shape (M, N) An array with shape (M, N) where M is the number of entries in the dataset and N is the number of data points in each entry. average_dataset : bool, optional If True (default) will average the dataset before fitting to get the weighting. If False, will fit each individual entry in the dataset and then average the weights to get the weighting for the dataset. method : str, optional A string indicating the Whittaker-smoothing-based or weighted spline method to use for fitting the baseline. Default is 'asls'. method_kwargs : dict, optional A dictionary of keyword arguments to pass to the selected `method` function. Default is None, which will use an empty dictionary. Returns ------- baselines : np.ndarray, shape (M, N) An array of all of the baselines. params : dict A dictionary with the following items: * 'average_weights': numpy.ndarray, shape (N,) The weight array used to fit all of the baselines. * 'average_alpha': numpy.ndarray, shape (N,) Only returned if `method` is 'aspls' or 'pspline_aspls'. The `alpha` array used to fit all of the baselines for the :meth:`~.Baseline.aspls` or :meth:`~.Baseline.pspline_aspls` methods. * 'method_params': dict[str, list] A dictionary containing the output parameters for each individual fit. Keys will depend on the selected method and will have a list of values, with each item corresponding to a fit. Notes ----- If `method` is 'aspls' or 'pspline_aspls', `collab_pls` will also calculate the `alpha` array for the entire dataset in the same manner as the weights. References ---------- Chen, L., et al. Collaborative Penalized Least Squares for Background Correction of Multiple Raman Spectra. Journal of Analytical Methods in Chemistry, 2018, 2018. """ dataset, baseline_func, _, method_kws, _ = self._setup_optimizer( data, method, (whittaker, morphological, classification, spline), method_kwargs, True ) data_shape = dataset.shape if len(data_shape) != 2: raise ValueError(( 'the input data must have a shape of (number of measurements, number of points), ' f'but instead has a shape of {data_shape}' )) method = method.lower() # if using aspls or pspline_aspls, also need to calculate the alpha array # for the entire dataset calc_alpha = method in ('aspls', 'pspline_aspls') # step 1: calculate weights for the entire dataset if average_dataset: _, fit_params = baseline_func(np.mean(dataset, axis=0), **method_kws) method_kws['weights'] = fit_params['weights'] if calc_alpha: method_kws['alpha'] = fit_params['alpha'] else: weights = np.empty(data_shape) if calc_alpha: alpha = np.empty(data_shape) for i, entry in enumerate(dataset): _, fit_params = baseline_func(entry, **method_kws) weights[i] = fit_params['weights'] if calc_alpha: alpha[i] = fit_params['alpha'] method_kws['weights'] = np.mean(weights, axis=0) if calc_alpha: method_kws['alpha'] = np.mean(alpha, axis=0) # step 2: use the dataset weights from step 1 (stored in method_kws['weights']) # to fit each individual data entry; set tol to infinity so that only one # iteration is done and new weights are not calculated if method not in ('mpls', 'pspline_mpls', 'fabc'): method_kws['tol'] = np.inf if method in ('brpls', 'pspline_brpls'): method_kws['tol_2'] = np.inf baselines = np.empty(data_shape) params = {'average_weights': method_kws['weights'], 'method_params': defaultdict(list)} if calc_alpha: params['average_alpha'] = method_kws['alpha'] if method == 'fabc': # set weights as mask so it just fits the data method_kws['weights_as_mask'] = True for i, entry in enumerate(dataset): baselines[i], param = baseline_func(entry, **method_kws) for key, value in param.items(): params['method_params'][key].append(value) return baselines, params
[docs] @_Algorithm._register(skip_sorting=True) def optimize_extended_range(self, data, method='asls', side='both', width_scale=0.1, height_scale=1., sigma_scale=1 / 12, min_value=2, max_value=8, step=1, pad_kwargs=None, method_kwargs=None): """ Extends data and finds the best parameter value for the given baseline method. Adds additional data to the left and/or right of the input data, and then iterates through parameter values to find the best fit. Useful for calculating the optimum `lam` or `poly_order` value required to optimize other algorithms. Parameters ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. method : str, optional A string indicating the Whittaker-smoothing-based, polynomial, or spline method to use for fitting the baseline. Default is 'asls'. side : {'both', 'left', 'right'}, optional The side of the measured data to extend. Default is 'both'. width_scale : float, optional The number of data points added to each side is `width_scale` * N. Default is 0.1. height_scale : float, optional The height of the added Gaussian peak(s) is calculated as `height_scale` * max(`data`). Default is 1. sigma_scale : float, optional The sigma value for the added Gaussian peak(s) is calculated as `sigma_scale` * `width_scale` * N. Default is 1/12, which will make the Gaussian span +- 6 sigma, making its total width about half of the added length. min_value : int or float, optional The minimum value for the `lam` or `poly_order` value to use with the indicated method. If using a polynomial method, `min_value` must be an integer. If using a Whittaker-smoothing-based method, `min_value` should be the exponent to raise to the power of 10 (eg. a `min_value` value of 2 designates a `lam` value of 10**2). Default is 2. max_value : int or float, optional The maximum value for the `lam` or `poly_order` value to use with the indicated method. If using a polynomial method, `max_value` must be an integer. If using a Whittaker-smoothing-based method, `max_value` should be the exponent to raise to the power of 10 (eg. a `max_value` value of 3 designates a `lam` value of 10**3). Default is 8. step : int or float, optional The step size for iterating the parameter value from `min_value` to `max_value`. If using a polynomial method, `step` must be an integer. If using a Whittaker-smoothing-based method, `step` should be the exponent to raise to the power of 10 (eg. a `step` value of 1 designates a `lam` value of 10**1). Default is 1. pad_kwargs : dict, optional A dictionary of options to pass to :func:`.pad_edges` for padding the edges of the data when adding the extended left and/or right sections. Default is None, which will use an empty dictionary. method_kwargs : dict, optional A dictionary of keyword arguments to pass to the selected `method` function. Default is None, which will use an empty dictionary. Returns ------- baseline : numpy.ndarray, shape (N,) The baseline calculated with the optimum parameter. method_params : dict A dictionary with the following items: * 'optimal_parameter': int or float The `lam` or `poly_order` value that produced the lowest root-mean-squared-error. * 'min_rmse': float .. deprecated:: 1.2.0 The 'min_rmse' key will be removed from the ``method_params`` dictionary in pybaselines version 1.4.0 in favor of the new 'rmse' key which returns all root-mean-squared-error values. * 'rmse' : numpy.ndarray The array of the calculated root-mean-squared-error for each of the fits. * 'method_params': dict A dictionary containing the output parameters for the optimal fit. Items will depend on the selected method. Raises ------ ValueError Raised if `side` is not 'left', 'right', or 'both'. TypeError Raised if using a polynomial method and `min_value`, `max_value`, or `step` is not an integer. ValueError Raised if using a Whittaker-smoothing-based method and `min_value`, `max_value`, or `step` is greater than 100. Notes ----- Based on the extended range penalized least squares (erPLS) method from [1]_. The method proposed by [1]_ was for optimizing lambda only for the aspls method by extending only the right side of the spectrum. The method was modified by allowing extending either side following [2]_, and for optimizing lambda or the polynomial degree for all of the affected algorithms in pybaselines. It should be noted that the optimal ``lam`` value obtained from :meth:`~.Baseline.optimize_extended_range` cannot be directly used for fitting other data using the same ``method`` since the optimal ``lam`` value corresponds to the padded data; since ``lam`` has a dependance on data size, the optimal ``lam`` value for fitting non-padded data will be slightly lower than the optimal value obtained from :meth:`~.Baseline.optimize_extended_range`. References ---------- .. [1] Zhang, F., et al. An Automatic Baseline Correction Method Based on the Penalized Least Squares Method. Sensors, 2020, 20(7), 2015. .. [2] Krishna, H., et al. Range-independent background subtraction algorithm for recovery of Raman spectra of biological tissue. Journal of Raman Spectroscopy. 2012, 43(12), 1884-1894. """ side = side.lower() if side not in ('left', 'right', 'both'): raise ValueError('side must be "left", "right", or "both"') y, _, func_module, method_kws, fit_object = self._setup_optimizer( data, method, (whittaker, polynomial, morphological, spline, classification), method_kwargs, True ) method = method.lower() if func_module == 'polynomial' or method in ('dietrich', 'cwt_br'): if any(not isinstance(val, int) for val in (min_value, max_value, step)): raise TypeError(( 'min_value, max_value, and step must all be integers when' ' using a polynomial method' )) param_name = 'poly_order' else: if any(val > 15 for val in (min_value, max_value, step)): raise ValueError(( 'min_value, max_value, and step should be the power of 10 to use ' '(eg. min_value=2 denotes 10**2), not the actual "lam" value, and ' 'thus should not be greater than 15' )) param_name = 'lam' if step == 0 or min_value == max_value: do_optimization = False else: do_optimization = True if max_value < min_value and step > 0: step = -step added_window = int(self._size * width_scale) for key in ('weights', 'alpha'): if key in method_kws: method_kws[key] = np.pad( method_kws[key], [0 if side == 'right' else added_window, 0 if side == 'left' else added_window], 'constant', constant_values=1 ) min_x = self.x_domain[0] max_x = self.x_domain[1] x_range = max_x - min_x known_background = np.array([]) fit_x_data = self.x fit_data = y lower_bound = upper_bound = 0 if pad_kwargs is None: pad_kwargs = {} added_left, added_right = _get_edges( _sort_array(y, self._sort_order), added_window, **pad_kwargs ) added_gaussian = gaussian( np.linspace(-added_window / 2, added_window / 2, added_window), height_scale * abs(y.max()), 0, added_window * sigma_scale ) if side in ('right', 'both'): added_x = np.linspace( max_x, max_x + x_range * (width_scale / 2), added_window + 1 )[1:] fit_x_data = np.concatenate((fit_x_data, added_x)) fit_data = np.concatenate((fit_data, added_gaussian + added_right)) known_background = added_right upper_bound += added_window if side in ('left', 'both'): added_x = np.linspace( min_x - x_range * (width_scale / 2), min_x, added_window + 1 )[:-1] fit_x_data = np.concatenate((added_x, fit_x_data)) fit_data = np.concatenate((added_gaussian + added_left, fit_data)) known_background = np.concatenate((known_background, added_left)) lower_bound += added_window added_len = 2 * added_window if side == 'both' else added_window if self._sort_order is None: new_sort_order = None else: if side == 'right': new_sort_order = np.concatenate(( self._sort_order, np.arange(self._size, self._size + added_len, dtype=np.intp) ), dtype=np.intp) elif side == 'left': new_sort_order = np.concatenate(( np.arange(added_len, dtype=np.intp), self._sort_order + added_len ), dtype=np.intp) else: new_sort_order = np.concatenate(( np.arange(added_window, dtype=np.intp), self._sort_order + added_window, np.arange(self._size + added_window, self._size + added_len, dtype=np.intp) ), dtype=np.intp) new_fitter = fit_object._override_x(fit_x_data, new_sort_order=new_sort_order) baseline_func = getattr(new_fitter, method) if do_optimization: if param_name == 'poly_order': variables = np.arange(min_value, max_value + step, step) else: # use linspace for floats since it ensures endpoints are included; use # logspace to skip having to do 10.0**linspace(...) variables = np.logspace( min_value, max_value, ceil((max_value - min_value) / step), base=10.0 ) # double check that variables has at least one item; otherwise skip the optimization if variables.size == 0: do_optimization = False if not do_optimization: variables = np.array([min_value]) if param_name == 'lam': variables = 10.0**variables upper_idx = len(fit_data) - upper_bound min_sum_squares = np.inf best_idx = 0 sum_squares_tot = np.zeros_like(variables) for i, var in enumerate(variables): method_kws[param_name] = var fit_baseline, fit_params = baseline_func(fit_data, **method_kws) # TODO change the known baseline so that np.roll does not have to be # calculated each time, since it requires additional time residual = ( known_background - np.roll(fit_baseline, upper_bound)[:added_len] ) # just calculate the sum of squares to reduce time from using sqrt for rmse sum_squares = residual.dot(residual) sum_squares_tot[i] = sum_squares if sum_squares < min_sum_squares: baseline = fit_baseline[lower_bound:upper_idx] method_params = fit_params best_idx = i min_sum_squares = sum_squares sum_squares_tot = np.sqrt(sum_squares_tot / added_len) params = { 'optimal_parameter': variables[best_idx], 'min_rmse': sum_squares_tot[best_idx], 'rmse': sum_squares_tot, 'method_params': method_params } for key in ('weights', 'alpha'): if key in params['method_params']: params['method_params'][key] = params['method_params'][key][ 0 if side == 'right' else added_window: None if side == 'left' else -added_window ] return baseline, params
[docs] @_Algorithm._register(skip_sorting=True) def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, constrained_fraction=0.01, constrained_weight=1e5, estimation_poly_order=2, method_kwargs=None): """ Fits polynomials of different orders and uses the maximum values as the baseline. Each polynomial order fit is done both unconstrained and constrained at the endpoints. Parameters ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. poly_order : int or Sequence(int, int) or None, optional The two polynomial orders to use for fitting. If a single integer is given, then will use the input value and one plus the input value. Default is None, which will do a preliminary fit using a polynomial of order `estimation_poly_order` and then select the appropriate polynomial orders according to [1]_. method : {'modpoly', 'imodpoly'}, optional The method to use for fitting each polynomial. Default is 'modpoly'. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. constrained_fraction : float or Sequence(float, float), optional The fraction of points at the left and right edges to use for the constrained fit. Default is 0.01. If `constrained_fraction` is a sequence, the first item is the fraction for the left edge and the second is the fraction for the right edge. constrained_weight : float or Sequence(float, float), optional The weighting to give to the endpoints. Higher values ensure that the end points are fit, but can cause large fluctuations in the other sections of the polynomial. Default is 1e5. If `constrained_weight` is a sequence, the first item is the weight for the left edge and the second is the weight for the right edge. estimation_poly_order : int, optional The polynomial order used for estimating the baseline-to-signal ratio to select the appropriate polynomial orders if `poly_order` is None. Default is 2. method_kwargs : dict, optional Additional keyword arguments to pass to :meth:`~.Baseline.modpoly` or :meth:`~.Baseline.imodpoly`. These include `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`. Returns ------- numpy.ndarray, shape (N,) The calculated baseline. params : dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) The weight array used for fitting the data. * 'constrained_weights': numpy.ndarray, shape (N,) The weight array used for the endpoint-constrained fits. * 'poly_order': numpy.ndarray, shape (2,) An array of the two polynomial orders used for the fitting. * 'method_params': dict[str, list] A dictionary containing the output parameters for each individual fit. Keys will depend on the selected method and will have a list of values, with each item corresponding to a fit. References ---------- .. [1] Cao, A., et al. A robust method for automated background subtraction of tissue fluorescence. Journal of Raman Spectroscopy, 2007, 38, 1199-1205. """ y, baseline_func, _, method_kws, _ = self._setup_optimizer( data, method, [polynomial], method_kwargs, False ) sort_weights = weights is not None weight_array = _check_optional_array(self._size, weights, check_finite=self._check_finite) if poly_order is None: poly_orders = _determine_polyorders( y, estimation_poly_order, weight_array, baseline_func, **method_kws ) else: poly_orders, scalar_poly_order = _check_scalar(poly_order, 2, True, dtype=int) if scalar_poly_order: poly_orders[1] += 1 # add 1 since they are initially equal if scalar input # use high weighting rather than Lagrange multipliers to constrain the points # to better work with noisy data weightings = _check_scalar(constrained_weight, 2, True)[0] constrained_fractions = _check_scalar(constrained_fraction, 2, True)[0] if np.any(constrained_fractions < 0) or np.any(constrained_fractions > 1): raise ValueError('constrained_fraction must be between 0 and 1') # have to temporarily sort weights to match x- and y-ordering so that left and right edges # are correct if sort_weights: weight_array = _sort_array(weight_array, self._sort_order) constrained_weights = weight_array.copy() constrained_weights[:ceil(self._size * constrained_fractions[0])] = weightings[0] constrained_weights[ self._size - ceil(self._size * constrained_fractions[1]): ] = weightings[1] # and now change back to original ordering if sort_weights: weight_array = _sort_array(weight_array, self._inverted_order) constrained_weights = _sort_array(constrained_weights, self._inverted_order) params = { 'weights': weight_array, 'constrained_weights': constrained_weights, 'poly_order': poly_orders, 'method_params': defaultdict(list) } # order of inputs is (poly_orders[0], weight_array), (poly_orders[0], constrained_weights), # (poly_orders[1], weight_array), (poly_orders[1], constrained_weights) baselines = np.empty((4, self._size)) for i, (p_order, weight) in enumerate( itertools.product(poly_orders, (weight_array, constrained_weights)) ): baselines[i], method_params = baseline_func( data=y, poly_order=p_order, weights=weight, **method_kws ) for key, value in method_params.items(): params['method_params'][key].append(value) return np.maximum.reduce(baselines), params
[docs] @_Algorithm._register def custom_bc(self, data, method='asls', regions=((None, None),), sampling=1, lam=None, diff_order=2, method_kwargs=None): """ Customized baseline correction for fine tuned stiffness of the baseline at specific regions. Divides the data into regions with variable number of data points and then uses other baseline algorithms to fit the truncated data. Regions with less points effectively makes the fit baseline more stiff in those regions. Parameters ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. method : str, optional A string indicating the algorithm to use for fitting the baseline; can be any non-optimizer algorithm in pybaselines. Default is 'asls'. regions : array-like, shape (M, 2), optional The two dimensional array containing the start and stop indices for each region of interest. Each region is defined as ``data[start:stop]``. Default is ((None, None),), which will use all points. sampling : int or array-like, optional The sampling step size for each region defined in `regions`. If `sampling` is an integer, then all regions will use the same index step size; if `sampling` is an array-like, its length must be equal to `M`, the first dimension in `regions`. Default is 1, which will use all points. lam : float or None, optional The value for smoothing the calculated interpolated baseline using Whittaker smoothing, in order to reduce the kinks between regions. Default is None, which will not smooth the baseline; a value of 0 will also not perform smoothing. diff_order : int, optional The difference order used for Whittaker smoothing of the calculated baseline. Default is 2. method_kwargs : dict, optional A dictionary of keyword arguments to pass to the selected `method` function. Default is None, which will use an empty dictionary. Returns ------- baseline : numpy.ndarray, shape (N,) The baseline calculated with the optimum parameter. params : dict A dictionary with the following items: * 'x_fit': numpy.ndarray, shape (P,) The truncated x-values used for fitting the baseline. * 'y_fit': numpy.ndarray, shape (P,) The truncated y-values used for fitting the baseline. * 'baseline_fit': numpy.ndarray, shape (P,) The truncated baseline before interpolating from `P` points to `N` points. * 'method_params': dict A dictionary containing the output parameters for the fit using the selected method. Raises ------ ValueError Raised if `regions` is not two dimensional, if `sampling` is not the same length as `rois.shape[0]`, if any values in `sampling` or `regions` is less than 1, if segments in `regions` overlap, or if any value in `regions` is greater than the length of the input data. Notes ----- Uses Whittaker smoothing to smooth the transitions between regions rather than LOESS as used in [1]_. Uses binning rather than direct truncation of the regions in order to get better results for noisy data. References ---------- .. [1] Liland, K., et al. Customized baseline correction. Chemometrics and Intelligent Laboratory Systems, 2011, 109(1), 51-56. """ y, _, _, method_kws, fitting_object = self._setup_optimizer( data, method, (classification, misc, morphological, polynomial, smooth, spline, whittaker), method_kwargs, True ) roi = np.atleast_2d(regions) roi_shape = roi.shape if len(roi_shape) != 2 or roi_shape[1] != 2: raise ValueError('rois must be a two dimensional sequence of (start, stop) values') steps = _check_scalar(sampling, roi_shape[0], fill_scalar=True, dtype=np.intp)[0] if np.any(steps < 1): raise ValueError('all step sizes in "sampling" must be >= 1') x_sections = [] y_sections = [] x_mask = np.ones(self._shape, dtype=bool) last_stop = -1 include_first = True include_last = True for (start, stop), step in zip(roi, steps): if start is None: start = 0 if stop is None: stop = self._size if start < last_stop: raise ValueError('Sections cannot overlap') else: last_stop = stop if start < 0 or stop < 0: raise ValueError('values in regions must be positive') elif stop > self._size: raise ValueError('values in regions must be less than len(data)') sections = (stop - start) // step if sections == 0: sections = 1 # will create one section using the midpoint indices = np.linspace(start, stop, sections + 1, dtype=np.intp) for left_idx, right_idx in zip(indices[:-1], indices[1:]): if left_idx == 0 and right_idx == 1: include_first = False elif right_idx == self._size and left_idx == self._size - 1: include_last = False y_sections.append(np.mean(y[left_idx:right_idx])) x_sections.append(np.mean(self.x[left_idx:right_idx])) x_mask[start:stop] = False # ensure first and last indices are included in the fit to avoid edge effects if include_first: x_mask[0] = True if include_last: x_mask[-1] = True x_sections.extend(self.x[x_mask]) y_sections.extend(y[x_mask]) x_fit = np.array(x_sections) sort_order = np.argsort(x_fit, kind='mergesort') x_fit = x_fit[sort_order] y_fit = np.array(y_sections)[sort_order] # param sorting will be wrong, but most params that need sorting will have # no meaning since they correspond to a truncated dataset params = {'x_fit': x_fit, 'y_fit': y_fit} new_fitter = fitting_object._override_x(x_fit) baseline_fit, params['method_params'] = getattr(new_fitter, method.lower())( y_fit, **method_kws ) baseline = np.interp(self.x, x_fit, baseline_fit) params['baseline_fit'] = baseline_fit if lam is not None and lam != 0: _, _, whittaker_system = self._setup_whittaker(y, lam=lam, diff_order=diff_order) baseline = whittaker_system.solve( whittaker_system.add_diagonal(1.), baseline, overwrite_ab=True, overwrite_b=True ) return baseline, params
_optimizers_wrapper = _class_wrapper(_Optimizers)
[docs] @_optimizers_wrapper def collab_pls(data, average_dataset=True, method='asls', method_kwargs=None, x_data=None): """ Collaborative Penalized Least Squares (collab-PLS). Averages the data or the fit weights for an entire dataset to get more optimal results. Uses any Whittaker-smoothing-based or weighted spline algorithm. Parameters ---------- data : array-like, shape (M, N) An array with shape (M, N) where M is the number of entries in the dataset and N is the number of data points in each entry. average_dataset : bool, optional If True (default) will average the dataset before fitting to get the weighting. If False, will fit each individual entry in the dataset and then average the weights to get the weighting for the dataset. method : str, optional A string indicating the Whittaker-smoothing-based or weighted spline method to use for fitting the baseline. Default is 'asls'. method_kwargs : dict, optional A dictionary of keyword arguments to pass to the selected `method` function. Default is None, which will use an empty dictionary. x_data : array-like, shape (N,), optional The x values for the data. Not used by most Whittaker-smoothing algorithms. Returns ------- baselines : np.ndarray, shape (M, N) An array of all of the baselines. params : dict A dictionary with the following items: * 'average_weights': numpy.ndarray, shape (N,) The weight array used to fit all of the baselines. * 'average_alpha': numpy.ndarray, shape (N,) Only returned if `method` is 'aspls' or 'pspline_aspls'. The `alpha` array used to fit all of the baselines for the :meth:`~.Baseline.aspls` or :meth:`~.Baseline.pspline_aspls` methods. * 'method_params': dict[str, list] A dictionary containing the output parameters for each individual fit. Keys will depend on the selected method and will have a list of values, with each item corresponding to a fit. Notes ----- If `method` is 'aspls' or 'pspline_aspls', `collab_pls` will also calculate the `alpha` array for the entire dataset in the same manner as the weights. References ---------- Chen, L., et al. Collaborative Penalized Least Squares for Background Correction of Multiple Raman Spectra. Journal of Analytical Methods in Chemistry, 2018, 2018. """
[docs] @_optimizers_wrapper def optimize_extended_range(data, x_data=None, method='asls', side='both', width_scale=0.1, height_scale=1., sigma_scale=1. / 12., min_value=2, max_value=8, step=1, pad_kwargs=None, method_kwargs=None): """ Extends data and finds the best parameter value for the given baseline method. Adds additional data to the left and/or right of the input data, and then iterates through parameter values to find the best fit. Useful for calculating the optimum `lam` or `poly_order` value required to optimize other algorithms. Parameters ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. x_data : array-like, shape (N,), optional The x-values of the measured data. Default is None, which will create an array from -1 to 1 with N points. method : str, optional A string indicating the Whittaker-smoothing-based, polynomial, or spline method to use for fitting the baseline. Default is 'asls'. side : {'both', 'left', 'right'}, optional The side of the measured data to extend. Default is 'both'. width_scale : float, optional The number of data points added to each side is `width_scale` * N. Default is 0.1. height_scale : float, optional The height of the added Gaussian peak(s) is calculated as `height_scale` * max(`data`). Default is 1. sigma_scale : float, optional The sigma value for the added Gaussian peak(s) is calculated as `sigma_scale` * `width_scale` * N. Default is 1/12, which will make the Gaussian span +- 6 sigma, making its total width about half of the added length. min_value : int or float, optional The minimum value for the `lam` or `poly_order` value to use with the indicated method. If using a polynomial method, `min_value` must be an integer. If using a Whittaker-smoothing-based method, `min_value` should be the exponent to raise to the power of 10 (eg. a `min_value` value of 2 designates a `lam` value of 10**2). Default is 2. max_value : int or float, optional The maximum value for the `lam` or `poly_order` value to use with the indicated method. If using a polynomial method, `max_value` must be an integer. If using a Whittaker-smoothing-based method, `max_value` should be the exponent to raise to the power of 10 (eg. a `max_value` value of 3 designates a `lam` value of 10**3). Default is 8. step : int or float, optional The step size for iterating the parameter value from `min_value` to `max_value`. If using a polynomial method, `step` must be an integer. pad_kwargs : dict, optional A dictionary of options to pass to :func:`.pad_edges` for padding the edges of the data when adding the extended left and/or right sections. Default is None, which will use an empty dictionary. method_kwargs : dict, optional A dictionary of keyword arguments to pass to the selected `method` function. Default is None, which will use an empty dictionary. Returns ------- baseline : numpy.ndarray, shape (N,) The baseline calculated with the optimum parameter. method_params : dict A dictionary with the following items: * 'optimal_parameter': int or float The `lam` or `poly_order` value that produced the lowest root-mean-squared-error. * 'min_rmse': float The minimum root-mean-squared-error obtained when using the optimal parameter. * 'method_params': dict A dictionary containing the output parameters for the optimal fit. Items will depend on the selected method. Raises ------ ValueError Raised if `side` is not 'left', 'right', or 'both'. TypeError Raised if using a polynomial method and `min_value`, `max_value`, or `step` is not an integer. ValueError Raised if using a Whittaker-smoothing-based method and `min_value`, `max_value`, or `step` is greater than 100. Notes ----- Based on the extended range penalized least squares (erPLS) method from [1]_. The method proposed by [1]_ was for optimizing lambda only for the aspls method by extending only the right side of the spectrum. The method was modified by allowing extending either side following [2]_, and for optimizing lambda or the polynomial degree for all of the affected algorithms in pybaselines. References ---------- .. [1] Zhang, F., et al. An Automatic Baseline Correction Method Based on the Penalized Least Squares Method. Sensors, 2020, 20(7), 2015. .. [2] Krishna, H., et al. Range-independent background subtraction algorithm for recovery of Raman spectra of biological tissue. Journal of Raman Spectroscopy. 2012, 43(12), 1884-1894. """
def _determine_polyorders(y, poly_order, weights, fit_function, **fit_kwargs): """ Selects the appropriate polynomial orders based on the baseline-to-signal ratio. Parameters ---------- y : numpy.ndarray The array of y-values. poly_order : int The polynomial order for fitting. weights : numpy.ndarray The weight array for fitting. fit_function : Callable The function to use for the polynomial fit. **fit_kwargs Additional keyword arguments to pass to `fit_function`. Returns ------- orders : numpy.ndarray, shape (2,) The two polynomial orders to use based on the baseline to signal ratio according to the reference. References ---------- Cao, A., et al. A robust method for automated background subtraction of tissue fluorescence. Journal of Raman Spectroscopy, 2007, 38, 1199-1205. """ baseline = fit_function(y, poly_order=poly_order, weights=weights, **fit_kwargs)[0] signal = y - baseline baseline_to_signal = (baseline.max() - baseline.min()) / (signal.max() - signal.min()) # Table 2 in reference if baseline_to_signal < 0.2: orders = (1, 2) elif baseline_to_signal < 0.75: orders = (2, 3) elif baseline_to_signal < 8.5: orders = (3, 4) elif baseline_to_signal < 55: orders = (4, 5) elif baseline_to_signal < 240: orders = (5, 6) elif baseline_to_signal < 517: orders = (6, 7) else: orders = (6, 8) # not a typo, use 6 and 8 rather than 7 and 8 return np.array(orders)
[docs] @_optimizers_wrapper def adaptive_minmax(data, x_data=None, poly_order=None, method='modpoly', weights=None, constrained_fraction=0.01, constrained_weight=1e5, estimation_poly_order=2, method_kwargs=None): """ Fits polynomials of different orders and uses the maximum values as the baseline. Each polynomial order fit is done both unconstrained and constrained at the endpoints. Parameters ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. x_data : array-like, shape (N,), optional The x-values of the measured data. Default is None, which will create an array from -1 to 1 with N points. poly_order : int or Sequence(int, int) or None, optional The two polynomial orders to use for fitting. If a single integer is given, then will use the input value and one plus the input value. Default is None, which will do a preliminary fit using a polynomial of order `estimation_poly_order` and then select the appropriate polynomial orders according to [3]_. method : {'modpoly', 'imodpoly'}, optional The method to use for fitting each polynomial. Default is 'modpoly'. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. constrained_fraction : float or Sequence(float, float), optional The fraction of points at the left and right edges to use for the constrained fit. Default is 0.01. If `constrained_fraction` is a sequence, the first item is the fraction for the left edge and the second is the fraction for the right edge. constrained_weight : float or Sequence(float, float), optional The weighting to give to the endpoints. Higher values ensure that the end points are fit, but can cause large fluctuations in the other sections of the polynomial. Default is 1e5. If `constrained_weight` is a sequence, the first item is the weight for the left edge and the second is the weight for the right edge. estimation_poly_order : int, optional The polynomial order used for estimating the baseline-to-signal ratio to select the appropriate polynomial orders if `poly_order` is None. Default is 2. method_kwargs : dict, optional Additional keyword arguments to pass to :meth:`~.Baseline.modpoly` or :meth:`~.Baseline.imodpoly`. These include `tol`, `max_iter`, `use_original`, `mask_initial_peaks`, and `num_std`. Returns ------- numpy.ndarray, shape (N,) The calculated baseline. params : dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) The weight array used for fitting the data. * 'constrained_weights': numpy.ndarray, shape (N,) The weight array used for the endpoint-constrained fits. * 'poly_order': numpy.ndarray, shape (2,) An array of the two polynomial orders used for the fitting. * 'method_params': dict[str, list] A dictionary containing the output parameters for each individual fit. Keys will depend on the selected method and will have a list of values, with each item corresponding to a fit. References ---------- .. [3] Cao, A., et al. A robust method for automated background subtraction of tissue fluorescence. Journal of Raman Spectroscopy, 2007, 38, 1199-1205. """
[docs] @_optimizers_wrapper def custom_bc(data, x_data=None, method='asls', regions=((None, None),), sampling=1, lam=None, diff_order=2, method_kwargs=None): """ Customized baseline correction for fine tuned stiffness of the baseline at specific regions. Divides the data into regions with variable number of data points and then uses other baseline algorithms to fit the truncated data. Regions with less points effectively makes the fit baseline more stiff in those regions. Parameters ---------- data : array-like, shape (N,) The y-values of the measured data, with N data points. method : str, optional A string indicating the algorithm to use for fitting the baseline; can be any non-optimizer algorithm in pybaselines. Default is 'asls'. regions : array-like, shape (M, 2), optional The two dimensional array containing the start and stop indices for each region of interest. Each region is defined as ``data[start:stop]``. Default is ((None, None),), which will use all points. sampling : int or array-like, optional The sampling step size for each region defined in `regions`. If `sampling` is an integer, then all regions will use the same index step size; if `sampling` is an array-like, its length must be equal to `M`, the first dimension in `regions`. Default is 1, which will use all points. lam : float or None, optional The value for smoothing the calculated interpolated baseline using Whittaker smoothing, in order to reduce the kinks between regions. Default is None, which will not smooth the baseline; a value of 0 will also not perform smoothing. diff_order : int, optional The difference order used for Whittaker smoothing of the calculated baseline. Default is 2. method_kwargs : dict, optional A dictionary of keyword arguments to pass to the selected `method` function. Default is None, which will use an empty dictionary. Returns ------- baseline : numpy.ndarray, shape (N,) The baseline calculated with the optimum parameter. params : dict A dictionary with the following items: * 'x_fit': numpy.ndarray, shape (P,) The truncated x-values used for fitting the baseline. * 'y_fit': numpy.ndarray, shape (P,) The truncated y-values used for fitting the baseline. * 'baseline_fit': numpy.ndarray, shape (P,) The truncated baseline before interpolating from `P` points to `N` points. * 'method_params': dict A dictionary containing the output parameters for the fit using the selected method. Raises ------ ValueError Raised if `regions` is not two dimensional, if `sampling` is not the same length as `rois.shape[0]`, if any values in `sampling` or `regions` is less than 1, if segments in `regions` overlap, or if any value in `regions` is greater than the length of the input data. Notes ----- Uses Whittaker smoothing to smooth the transitions between regions rather than LOESS as used in [4]_. Uses binning rather than direct truncation of the regions in order to get better results for noisy data. References ---------- .. [4] Liland, K., et al. Customized baseline correction. Chemometrics and Intelligent Laboratory Systems, 2011, 109(1), 51-56. """