Source code for pybaselines.utils

# -*- coding: utf-8 -*-
"""Helper functions for pybaselines.

Created on March 5, 2021
@author: Donald Erb

"""

from math import ceil

import numpy as np
from scipy.ndimage import grey_opening
from scipy.signal import convolve
from scipy.special import binom

from ._banded_utils import PenalizedSystem
from ._banded_utils import difference_matrix as _difference_matrix
from ._compat import jit
from ._spline_utils import PSpline, SplineBasis
from ._validation import (
    _check_array, _check_optional_array, _check_scalar, _get_row_col_values, _yx_arrays
)


# the minimum positive float values such that 1 + _MIN_FLOAT != 1
# TODO this is mostly used to prevent dividing by 0; is there a better way to do that?
# especially since it is usually max(value, _MIN_FLOAT) and in some cases value could be
# < _MIN_FLOAT but still > 0 and useful; think about it
_MIN_FLOAT = np.finfo(float).eps



[docs]
class ParameterWarning(UserWarning):
    """
    Warning issued when a parameter value is outside of the recommended range.

    For cases where a parameter value is valid and will not cause errors, but is
    outside of the recommended range of values and as a result may cause issues
    such as numerical instability that would otherwise be hard to diagnose.
    """




[docs]
class SortingWarning(UserWarning):
    """Issued when `assume_sorted` is set to True and inputs are not in ascending order."""




[docs]
def relative_difference(old, new, norm_order=None):
    """
    Calculates the relative difference, ``(norm(new-old) / norm(old))``, of two values.

    Used as an exit criteria in many baseline algorithms.

    Parameters
    ----------
    old : numpy.ndarray or float
        The array or single value from the previous iteration.
    new : numpy.ndarray or float
        The array or single value from the current iteration.
    norm_order : int, optional
        The type of norm to calculate. Default is None, which is l2
        norm for arrays, abs for scalars.

    Returns
    -------
    float
        The relative difference between the old and new values.

    """
    numerator = np.linalg.norm(new - old, norm_order)
    denominator = np.maximum(np.linalg.norm(old, norm_order), _MIN_FLOAT)
    return numerator / denominator




[docs]
def gaussian(x, height=1.0, center=0.0, sigma=1.0):
    """
    Generates a Gaussian distribution based on height, center, and sigma.

    Parameters
    ----------
    x : numpy.ndarray
        The x-values at which to evaluate the distribution.
    height : float, optional
        The maximum height of the distribution. Default is 1.0.
    center : float, optional
        The center of the distribution. Default is 0.0.
    sigma : float, optional
        The standard deviation of the distribution. Default is 1.0.

    Returns
    -------
    numpy.ndarray
        The Gaussian distribution evaluated with x.

    Raises
    ------
    ValueError
        Raised if `sigma` is not greater than 0.

    """
    if sigma <= 0:
        raise ValueError('sigma must be greater than 0')
    return height * np.exp(-0.5 * ((x - center)**2) / sigma**2)




[docs]
def gaussian2d(x, z, height=1.0, center_x=0.0, center_z=0.0, sigma_x=1.0, sigma_z=1.0):
    """
    Generates a Gaussian distribution based on height, center, and sigma.

    Parameters
    ----------
    x : numpy.ndarray, shape (M, N)
        The x-values at which to evaluate the distribution.
    z : numpy.ndarray, shape (M, N)
        The z-values at which to evaluate the distribution.
    height : float, optional
        The maximum height of the distribution. Default is 1.0.
    center_x : float, optional
        The center of the distribution in the x-axis. Default is 0.0.
    sigma_x : float, optional
        The standard deviation of the distribution in the x-axis. Default is 1.0.
    center_z : float, optional
        The center of the distribution in the z-axis. Default is 0.0.
    sigma_z : float, optional
        The standard deviation of the distribution in the z-axis. Default is 1.0.

    Returns
    -------
    numpy.ndarray, shape (M, N)
        The Gaussian distribution evaluated with x and z.

    Raises
    ------
    ValueError
        Raised if the input `x` or `z` are not two dimensional.

    Notes
    -----
    The input `x` and `z` should be two dimensional arrays, which can be gotten
    from their one dimensional counterparts by using :func:`numpy.meshgrid`.

    """
    if x.ndim != 2 or z.ndim != 2:
        raise ValueError('x and z should be two dimensional')
    return height * gaussian(x, 1, center_x, sigma_x) * gaussian(z, 1, center_z, sigma_z)




[docs]
def gaussian_kernel(window_size, sigma=1.0):
    """
    Creates an area-normalized gaussian kernel for convolution.

    Parameters
    ----------
    window_size : int
        The number of points for the entire kernel.
    sigma : float, optional
        The standard deviation of the gaussian model.

    Returns
    -------
    numpy.ndarray, shape (window_size,)
        The area-normalized gaussian kernel.

    Notes
    -----
    Return gaus/sum(gaus) rather than creating a unit-area gaussian
    since the unit-area gaussian would have an area smaller than 1
    for window_size < ~ 6 * sigma.

    """
    # centers distribution from -half_window to half_window
    window_size = max(1, window_size)
    x = np.arange(window_size) - (window_size - 1) / 2
    gaus = gaussian(x, 1, 0, sigma)
    return gaus / np.sum(gaus)



def _mollifier_kernel(half_window):
    """
    A kernel for smoothing/mollification.

    Parameters
    ----------
    half_window : int
        The half-window length of the kernel.

    Returns
    -------
    numpy.ndarray, shape (``2 * half_window + 1``,)
        The area normalized kernel.

    References
    ----------
    Chen, H., et al. An Adaptive and Fully Automated Baseline Correction
    Method for Raman Spectroscopy Based on Morphological Operations and
    Mollifications. Applied Spectroscopy, 2019, 73(3), 284-293.

    """
    if half_window == 0:
        return np.ones(1)
    x = (np.arange(2 * half_window + 1) - half_window) / half_window
    kernel = np.zeros_like(x)
    # x[1:-1] is same as x[abs(x) < 1]
    kernel[1:-1] = np.exp(-1 / (1 - (x[1:-1])**2))
    return kernel / kernel.sum()


def _get_edges(data, pad_length, mode='extrapolate', extrapolate_window=None, **pad_kwargs):
    """
    Provides the left and right edges for padding data.

    Parameters
    ----------
    data : array-like
        The array of the data.
    pad_length : int
        The number of points to add to the left and right edges.
    mode : str or Callable, optional
        The method for padding. Default is 'extrapolate'. Any method other than
        'extrapolate' will use numpy.pad.
    extrapolate_window : int, optional
        The number of values to use for linear fitting on the left and right
        edges. Default is None, which will set the extrapolate window size equal
        to `pad_length`.
    **pad_kwargs
        Any keyword arguments to pass to numpy.pad, which will be used if `mode`
        is not 'extrapolate'.

    Returns
    -------
    left_edge : numpy.ndarray, shape(pad_length,)
        The array of data for the left padding.
    right_edge : numpy.ndarray, shape(pad_length,)
        The array of data for the right padding.

    Raises
    ------
    ValueError
        Raised if `pad_length` is < 0, or if `extrapolate_window` is <= 0 and
        `mode` is `extrapolate`.

    Notes
    -----
    If mode is 'extrapolate', then the left and right edges will be fit with
    a first order polynomial and then extrapolated. Otherwise, uses :func:`numpy.pad`.

    """
    y = np.asarray(data)
    if pad_length == 0:
        return np.array([]), np.array([])
    elif pad_length < 0:
        raise ValueError('pad length must be greater or equal to 0')

    if isinstance(mode, str):
        mode = mode.lower()
    if mode == 'extrapolate':
        if extrapolate_window is None:
            extrapolate_window = pad_length
        extrapolate_windows = _check_scalar(extrapolate_window, 2, True, dtype=int)[0]

        if np.any(extrapolate_windows <= 0):
            raise ValueError('extrapolate_window must be greater than 0')
        left_edge = np.empty(pad_length)
        right_edge = np.empty(pad_length)
        # use x[pad_length:-pad_length] for fitting to ensure x and y are
        # same shape regardless of extrapolate window value
        x = np.arange(len(y) + 2 * pad_length)
        for i, array in enumerate((left_edge, right_edge)):
            extrapolate_window_i = extrapolate_windows[i]
            if extrapolate_window_i == 1:
                # just use the edges rather than trying to fit a line
                array[:] = y[0] if i == 0 else y[-1]
            elif i == 0:
                poly = np.polynomial.Polynomial.fit(
                    x[pad_length:-pad_length][:extrapolate_window_i],
                    y[:extrapolate_window_i], 1
                )
                array[:] = poly(x[:pad_length])
            else:
                poly = np.polynomial.Polynomial.fit(
                    x[pad_length:-pad_length][-extrapolate_window_i:],
                    y[-extrapolate_window_i:], 1
                )
                array[:] = poly(x[-pad_length:])
    else:
        padded_data = np.pad(y, pad_length, mode, **pad_kwargs)
        left_edge = padded_data[:pad_length]
        right_edge = padded_data[-pad_length:]

    return left_edge, right_edge



[docs]
def pad_edges(data, pad_length, mode='extrapolate',
              extrapolate_window=None, **pad_kwargs):
    """
    Adds left and right edges to the data.

    Parameters
    ----------
    data : array-like
        The array of the data.
    pad_length : int
        The number of points to add to the left and right edges.
    mode : str or Callable, optional
        The method for padding. Default is 'extrapolate'. Any method other than
        'extrapolate' will use :func:`numpy.pad`.
    extrapolate_window : int, optional
        The number of values to use for linear fitting on the left and right
        edges. Default is None, which will set the extrapolate window size equal
        to `pad_length`.
    **pad_kwargs
        Any keyword arguments to pass to :func:`numpy.pad`, which will be used if `mode`
        is not 'extrapolate'.

    Returns
    -------
    padded_data : numpy.ndarray, shape (N + 2 * half_window,)
        The data with padding on the left and right edges.

    Notes
    -----
    If mode is 'extrapolate', then the left and right edges will be fit with
    a first order polynomial and then extrapolated. Otherwise, uses :func:`numpy.pad`.

    """
    y = np.asarray(data)
    if pad_length == 0:
        return y

    if isinstance(mode, str):
        mode = mode.lower()
    if mode == 'extrapolate':
        left_edge, right_edge = _get_edges(y, pad_length, mode, extrapolate_window)
        padded_data = np.concatenate((left_edge, y, right_edge))
    else:
        padded_data = np.pad(y, pad_length, mode, **pad_kwargs)

    return padded_data



def _extrapolate2d(y, total_padding, extrapolate_window=None):
    """
    Extrapolates each edge of two dimensional data.

    Corners are calculated by averaging linear fits of the extended data.

    Parameters
    ----------
    y : numpy.ndarray
        _description_
    total_padding : Sequence[int, int, int, int]
        The padding for the top, bottom, left, and right. The padding of top and
        bottom are assumed to be equal, as are the left and right.
    extrapolate_window : int or Sequence[int, int] or Sequence[int, int, int, int], optional
        The number of values to use for linear fitting on the top, bottom, left, and right
        edges. Default is None, which will set the extrapolate window size equal
        to `total_padding`.

    Returns
    -------
    output : numpy.ndarray
        The data with padding

    Raises
    ------
    NotImplementedError
        Raised if any value in `total_padding` is zero.
    ValueError
        Raised if any extrapolation window is less than 1.

    Notes
    -----
    Uses the Moore-Penrose pseudo-inverse to speed up the calculation of the linear fits
    for each edge. Using the Vandermonde with `numpy.linalg.lstsq` would also work but is
    a little slower.

    """
    if np.equal(total_padding, 0).any():
        raise NotImplementedError('pad length of 0 is not supported in 2D')
    elif np.less(total_padding, 0).any():
        raise ValueError('pad length must be greater or equal to 0')

    if extrapolate_window is None:
        extrapolate_windows = total_padding
    else:
        extrapolate_windows = _get_row_col_values(extrapolate_window).reshape((2, 2))

    if np.less_equal(extrapolate_windows, 0).any():
        raise ValueError('extrapolate_window must be greater than 0')
    # pad length for left and right or top and bottom should be equal, so ignore the repeats
    total_padding = [total_padding[0][0], total_padding[1][0]]

    output = np.empty(
        (y.shape[0] + total_padding[0] * 2, y.shape[1] + total_padding[1] * 2)
    )
    output[total_padding[0]:-total_padding[0], total_padding[1]:-total_padding[1]] = y

    x = np.arange(y.shape[0] + 2 * total_padding[0])
    z = np.arange(y.shape[1] + 2 * total_padding[1])

    vander_x = np.polynomial.polynomial.polyvander(x, 1)
    vander_z = np.polynomial.polynomial.polyvander(z, 1)
    pinv_top = np.linalg.pinv(
        vander_x[total_padding[0]:-total_padding[0]][:extrapolate_windows[0][0]]
    )
    pinv_bottom = np.linalg.pinv(
        vander_x[total_padding[0]:-total_padding[0]][-extrapolate_windows[0][1]:]
    )
    pinv_left = np.linalg.pinv(
        vander_z[total_padding[1]:-total_padding[1]][:extrapolate_windows[1][0]]
    )
    pinv_right = np.linalg.pinv(
        vander_z[total_padding[1]:-total_padding[1]][-extrapolate_windows[1][1]:]
    )

    top = vander_x[:total_padding[0]] @ (pinv_top @ y[:extrapolate_windows[0][0]])
    bottom = vander_x[-total_padding[0]:] @ (pinv_bottom @ y[-extrapolate_windows[0][1]:])

    output[:total_padding[0], total_padding[1]:-total_padding[1]] = top
    output[-total_padding[0]:, total_padding[1]:-total_padding[1]] = bottom

    left = vander_z[:total_padding[1]] @ (pinv_left @ y[:, :extrapolate_windows[1][0]].T)
    right = vander_z[-total_padding[1]:] @ (pinv_right @ y[:, -extrapolate_windows[1][1]:].T)

    output[total_padding[0]:-total_padding[0], :total_padding[1]] = left.T
    output[total_padding[0]:-total_padding[0], -total_padding[1]:] = right.T

    # now fill the corners by averaging the extensions of the corners
    top_left = vander_z[:total_padding[1]] @ (
        pinv_left @ output[
            :total_padding[0], total_padding[1]:-total_padding[1]
        ][:, :extrapolate_windows[1][0]].T
    )
    top_right = vander_z[-total_padding[1]:] @ (
        pinv_right @ output[
            :total_padding[0], total_padding[1]:-total_padding[1]
        ][:, -extrapolate_windows[1][1]:].T
    )

    bottom_left = vander_z[:total_padding[1]] @ (
        pinv_left @ output[
            -total_padding[0]:, total_padding[1]:-total_padding[1]
        ][:, :extrapolate_windows[1][0]].T
    )
    bottom_right = vander_z[-total_padding[1]:] @ (
        pinv_right @ output[
            -total_padding[0]:, total_padding[1]:-total_padding[1]
        ][:, -extrapolate_windows[1][1]:].T
    )

    left_top = vander_x[:total_padding[0]] @ (
        pinv_top @ output[
            total_padding[0]:-total_padding[0], :total_padding[1]
        ][:extrapolate_windows[0][0]]
    )
    left_bottom = vander_x[-total_padding[0]:] @ (
        pinv_bottom @ output[
            total_padding[0]:-total_padding[0], :total_padding[1]:
        ][-extrapolate_windows[0][1]:]
    )

    right_top = vander_x[:total_padding[0]] @ (
        pinv_top @ output[
            total_padding[0]:-total_padding[0], -total_padding[1]:
        ][:extrapolate_windows[0][0]]
    )
    right_bottom = vander_x[-total_padding[0]:] @ (
        pinv_bottom @ output[
            total_padding[0]:-total_padding[0], -total_padding[1]:
        ][-extrapolate_windows[0][1]:]
    )

    output[:total_padding[0], :total_padding[1]] = 0.5 * (top_left.T + left_top)
    output[:total_padding[0], -total_padding[1]:] = 0.5 * (top_right.T + right_top)
    output[-total_padding[0]:, :total_padding[1]] = 0.5 * (bottom_left.T + left_bottom)
    output[-total_padding[0]:, -total_padding[1]:] = 0.5 * (bottom_right.T + right_bottom)

    return output



[docs]
def pad_edges2d(data, pad_length, mode='edge', extrapolate_window=None, **pad_kwargs):
    """
    Adds left, right, top, and bottom edges to the data.

    Parameters
    ----------
    data : array-like, shape (M, N)
        The 2D array of the data.
    pad_length : int or Sequence[int, int]
        The number of points to add to the top, bottom, left, and right edges. If a single
        value is given, all edges have the same padding. If a sequence of two values is
        given, the first value will be the padding on the top and bottom (rows), and the second
        value will pad the left and right (columns).
    mode : str or Callable, optional
        The method for padding. Default is 'edge'. Any method other than
        'extrapolate' will use :func:`numpy.pad`.
    extrapolate_window : int or Sequence[int, int] or Sequence[int, int, int, int], optional
        The number of values to use for linear fitting on the top, bottom, left, and right
        edges. Default is None, which will set the extrapolate window size equal
        to `pad_length`.
    **pad_kwargs
        Any keyword arguments to pass to :func:`numpy.pad`, which will be used if `mode`
        is not 'extrapolate'.

    Returns
    -------
    padded_data : numpy.ndarray
        The data with padding on the top, bottom, left, and right edges.

    Notes
    -----
    If mode is 'extrapolate', then each edge will be extended by linear fits along each
    row and column, and the corners are calculated by averaging the linear sections.

    """
    y = np.asarray(data)
    if y.ndim != 2:
        raise ValueError('input data must be two dimensional')
    total_padding = _get_row_col_values(pad_length).reshape((2, 2))

    if isinstance(mode, str):
        mode = mode.lower()
    if mode == 'extrapolate':
        output = _extrapolate2d(y, total_padding, extrapolate_window)
    else:
        output = np.pad(data, total_padding, mode=mode, **pad_kwargs)

    return output




[docs]
def padded_convolve(data, kernel, mode='reflect', **pad_kwargs):
    """
    Pads data before convolving to reduce edge effects.

    Parameters
    ----------
    data : array-like, shape (N,)
        The data to convolve.
    kernel : array-like, shape (M,)
        The convolution kernel.
    mode : str or Callable, optional
        The method for padding to pass to :func:`.pad_edges`. Default is 'reflect'.
    **pad_kwargs
        Any additional keyword arguments to pass to :func:`.pad_edges`.

    Returns
    -------
    convolution : numpy.ndarray, shape (N,)
        The convolution output.

    """
    # TODO need to revisit this and ensure everything is correct
    # TODO look at using scipy.ndimage.convolve1d instead, or at least
    # comparing the output in tests; that function should have a similar usage
    padding = ceil(min(len(data), len(kernel)) / 2)
    convolution = convolve(
        pad_edges(data, padding, mode, **pad_kwargs), kernel, mode='same'
    )
    return convolution[padding:-padding]



@jit(nopython=True, cache=True)
def _interp_inplace(x, y, y_start, y_end):
    """
    Interpolates values inplace between the two ends of an array.

    Parameters
    ----------
    x : numpy.ndarray
        The x-values for interpolation. All values are assumed to be valid.
    y : numpy.ndarray
        The y-values. The two endpoints, y[0] and y[-1] are assumed to be valid,
        and all values inbetween (ie. y[1:-1]) will be replaced by interpolation.
    y_start : float, optional
        The initial y-value for interpolation.
    y_end : float, optional
        The end y-value for interpolation.

    Returns
    -------
    y : numpy.ndarray
        The input `y` array, with the interpolation performed inplace.

    """
    y[1:-1] = y_start + (x[1:-1] - x[0]) * ((y_end - y_start) / (x[-1] - x[0]))

    return y


def _poly_transform_matrix(num_coefficients, original_domain):
    """
    Creates the matrix that transforms polynomial coefficents from one domain to another.

    The polynomial coefficient array `d` computed with `v` can be transformed to the
    coefficient array `c` computed with `x` where ``v = scale * x + offset`` by applying
    ``c = T @ d``, where `T` is the transformation matrix.

    Parameters
    ----------
    num_coefficients : int
        The number of polynomial coefficients, ie. the polynomial degree + 1.
    original_domain : Sequence[float, float]
        The domain, [min(x), max(x)], of the original data used for fitting.

    Returns
    -------
    transformation : numpy.ndarray, shape (`num_coefficients`, `num_coefficients`)
        The transformation matrix to convert domains.

    Notes
    -----
    The calculation of the transformation matrix is based on the math from
    https://stackoverflow.com/questions/141422/how-can-a-transform-a-polynomial-to-another-coordinate-system#comment57358951_142436.

    This function assumes the original coefficients were computed with the domain [-1, 1].

    """
    offset, scale = np.polynomial.polyutils.mapparms(np.array([-1., 1.]), original_domain)
    transformation = np.zeros((num_coefficients, num_coefficients))
    skip_offset = np.equal(offset, 0)  # 0 raised to negative powers causes nan
    for i in range(num_coefficients):
        for j in range(num_coefficients):
            if skip_offset:
                if j == i:
                    transformation[i, j] = binom(j, i) * (scale)**(-j)
            else:
                transformation[i, j] = binom(j, i) * (scale)**(-j) * (-offset)**(j - i)

    return transformation


def _convert_coef(coef, original_domain):
    """
    Scales the polynomial coefficients back to the original domain of the data.

    For fitting, the x-values are scaled from their original domain, [min(x),
    max(x)], to [-1, 1] in order to improve the numerical stability of fitting.
    This function rescales the retrieved polynomial coefficients for the fit
    x-values back to the original domain.

    Parameters
    ----------
    coef : numpy.ndarray, shape (a,)
        The array of coefficients for the polynomial. Should increase in
        order, for example (c0, c1, c2) from `y = c0 + c1 * x + c2 * x**2`.
    original_domain : Sequence[float, float]
        The domain, [min(x), max(x)], of the original data used for fitting.

    Returns
    -------
    numpy.ndarray, shape (a,)
        The array of coefficients scaled for the original domain.

    Notes
    -----
    Could slightly reduce computation time by computing offset and scale once within
    the _Algorithm object, but doing it this way with `original_domain` is backwards
    compatible and this function is probably not called enough to justify the change.

    """
    transformation = _poly_transform_matrix(coef.shape[0], original_domain)
    return transformation @ coef


def _convert_coef2d(coef, poly_degree_x, poly_degree_z, original_x_domain, original_z_domain):
    """
    Scales the polynomial coefficients back to the original domain of the data.

    For fitting, the x-values and z-values are scaled from their original domain,
    [min(x), max(x)] and [min(z), max(z)], to [-1, 1] in order to improve the numerical
    stability of fitting. This function rescales the retrieved polynomial coefficients
    for the fit x-values and z-values back to their original domains.

    Parameters
    ----------
    coef : numpy.ndarray, shape (``a * b``,)
        The 1d array of coefficients for the polynomial. Should increase in
        order. The shape should be (``a * b``,), where `a` is the polynomial degree + 1 for
        the x-values and `b` is the polynomial degree + 1 for the z-values.
    poly_degree_x : int
        The polynomial degree for the x-values
    poly_degree_z : int
        The polynomial degree for the z-values
    original_x_domain : Sequence[float, float]
        The domain, [min(x), max(x)], of the original x-values used for fitting.
    original_z_domain : Sequence[float, float]
        The domain, [min(z), max(z)], of the original z-values used for fitting.

    Returns
    -------
    numpy.ndarray, shape (a, b)
        The 2D array of coefficients scaled for the original domains.

    Notes
    -----
    Reshapes the coefficient array into the correct shape for use with
    :func:`numpy.polynomial.polynomial.polyval2d`.

    """
    x_order = poly_degree_x + 1
    z_order = poly_degree_z + 1
    transformation_x = _poly_transform_matrix(x_order, original_x_domain)
    transformation_z = _poly_transform_matrix(z_order, original_z_domain)

    return transformation_x @ coef.reshape((x_order, z_order)) @ transformation_z.T



[docs]
def difference_matrix(data_size, diff_order=2, diff_format=None):
    """
    Creates an n-order finite-difference matrix.

    Parameters
    ----------
    data_size : int
        The number of data points.
    diff_order : int, optional
        The integer differential order; must be >= 0. Default is 2.
    diff_format : str or None, optional
        The sparse format to use for the difference matrix. Default is None,
        which will use the default specified in :func:`scipy.sparse.diags`.

    Returns
    -------
    diff_matrix : scipy.sparse.spmatrix or scipy.sparse.sparray
        The sparse difference matrix.

    Raises
    ------
    ValueError
        Raised if `diff_order` or `data_size` is negative.

    Notes
    -----
    The resulting matrices are sparse versions of::

        import numpy as np
        np.diff(np.eye(data_size), diff_order, axis=0)

    This implementation allows using the differential matrices are they
    are written in various publications, ie. ``D.T @ D``.

    Most baseline algorithms use 2nd order differential matrices when
    doing penalized least squared fitting or Whittaker-smoothing-based fitting.

    """
    # difference_matrix moved to pybaselines._banded_utils in version 1.0.0 in order to more
    # easily use it in other modules without creating circular imports; this function
    # exposes it through pybaselines.utils for backwards compatibility in user code
    return _difference_matrix(data_size, diff_order=diff_order, diff_format=diff_format)




[docs]
def optimize_window(data, increment=1, max_hits=3, window_tol=1e-6,
                    max_half_window=None, min_half_window=None):
    """
    Optimizes the morphological half-window size.

    Parameters
    ----------
    data : array-like
        The measured data values. Can be one or two dimensional.
    increment : int, optional
        The step size for iterating half windows. Default is 1.
    max_hits : int, optional
        The number of consecutive half windows that must produce the same
        morphological opening before accepting the half window as the optimum
        value. Default is 3.
    window_tol : float, optional
        The tolerance value for considering two morphological openings as
        equivalent. Default is 1e-6.
    max_half_window : int, optional
        The maximum allowable half-window size. If None (default), will be set
        to (len(data) - 1) / 2.
    min_half_window : int, optional
        The minimum half-window size. If None (default), will be set to 1.

    Returns
    -------
    half_window : int or numpy.ndarray[int, int]
        The optimized half window size(s). If `data` is one dimensional, the
        output is a single integer, and if `data` is two dimensional, the output
        is an array of two integers.

    Notes
    -----
    May only provide good results for some morphological algorithms, so use with
    caution.

    References
    ----------
    Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for
    Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600.

    """
    y = np.asarray(data)
    if max_half_window is None:
        max_half_window = (y.shape[-1] - 1) // 2
    if min_half_window is None:
        min_half_window = 1

    y_dims = y.ndim
    # TODO would it be better to allow padding the data?
    opening = grey_opening(y, [2 * min_half_window + 1] * y_dims)
    hits = 0
    half_window = 1  # in case min_half_window is set incorrectly
    best_half_window = min_half_window
    for half_window in range(min_half_window + increment, max_half_window, increment):
        new_opening = grey_opening(y, [half_window * 2 + 1] * y_dims)
        if relative_difference(opening, new_opening) < window_tol:
            if hits == 0:
                # keep just the first window that fits tolerance
                best_half_window = half_window - increment
            hits += 1
            if hits >= max_hits:
                half_window = best_half_window
                break
        elif hits:
            hits = 0
        opening = new_opening

    if y_dims == 2:
        output = np.maximum([half_window, half_window], [1, 1])
    else:
        output = max(half_window, 1)  # ensure half window is at least 1
    return output



def _inverted_sort(sort_order):
    """
    Finds the indices that invert a sorting.

    Given an array `a`, and the indices that sort the array, `sort_order`, the
    inverted sort is defined such that it gives the original index order of `a`,
    ie. ``a == a[sort_order][inverted_order]``.

    Parameters
    ----------
    sort_order : numpy.ndarray, shape (N,)
        The original index array for sorting.

    Returns
    -------
    inverted_order : numpy.ndarray, shape (N,)
        The array that inverts the sort given by `sort_order`.

    Notes
    -----
    This function is equivalent to doing::

        inverted_order = sort_order.argsort()

    but is faster for large arrays since no additional sorting is performed.

    """
    num_points = len(sort_order)
    inverted_order = np.empty(num_points, dtype=np.intp)
    inverted_order[sort_order] = np.arange(num_points, dtype=np.intp)

    return inverted_order


def _determine_sorts(data):
    """
    Provides the arrays for sorting and inverting sorting, if needed.

    Parameters
    ----------
    data : numpy.ndarray, shape (N,)
        The array to potentially sort.

    Returns
    -------
    output : tuple(numpy.ndarray, numpy.ndarray) or tuple(None, None)
        A tuple of the index array for sorting the input array and the array
        that inverts that sorting. If the input array is already sorted, then
        the output will be (None, None).

    """
    sort_order = data.argsort(kind='mergesort')
    skip_sorting = (sort_order[1:] > sort_order[:-1]).all()
    if skip_sorting:
        output = (None, None)
    else:
        output = (sort_order, _inverted_sort(sort_order))

    return output


def _sort_array2d(array, sort_order=None):
    """
    Sorts the input 2D array only if given a non-None sorting order.

    Parameters
    ----------
    array : numpy.ndarray
        The array to sort. Must be two or three dimensional.
    sort_order : numpy.ndarray, optional
        The array(s) defining the sort order for the input array. Default is None, which
        will not sort the input.

    Returns
    -------
    output : numpy.ndarray
        The input array after optionally sorting.

    Notes
    -----
    For all inputs, assumes the last 2 axes correspond to the data that needs sorted.

    Raises
    ------
    ValueError
        Raised if the input array is not two or three dimensional.

    """
    if sort_order is None:
        output = array
    else:
        n_dims = array.ndim
        if n_dims == 2:
            output = array[sort_order]
        elif n_dims == 3:
            if isinstance(sort_order, tuple):
                if sort_order[0] is Ellipsis:
                    output = array[sort_order]
                else:
                    output = array[:, sort_order[0], sort_order[1]]
            else:
                output = array[:, sort_order, :]
        else:
            raise ValueError('too many dimensions to sort the data')

    return output


def _sort_array(array, sort_order=None):
    """
    Sorts the input array only if given a non-None sorting order.

    Parameters
    ----------
    array : numpy.ndarray
        The array to sort.
    sort_order : numpy.ndarray, optional
        The array defining the sort order for the input array. Default is None, which
        will not sort the input.

    Returns
    -------
    output : numpy.ndarray
        The input array after optionally sorting.

    Notes
    -----
    For all inputs, assumes the last axis corresponds to the data that needs sorted.

    Raises
    ------
    ValueError
        Raised if the input array has more than two dimensions.

    """
    if sort_order is None:
        output = array
    else:
        n_dims = array.ndim
        if n_dims == 1:
            output = array[sort_order]
        elif n_dims == 2:
            output = array[:, sort_order]
        else:
            raise ValueError('too many dimensions to sort the data')

    return output



[docs]
def whittaker_smooth(data, lam=1e6, diff_order=2, weights=None, check_finite=True):
    """
    Smooths the input data using Whittaker smoothing.

    The input is smoothed by solving the equation ``(W + lam * D.T @ D) y_smooth = W @ y``,
    where `W` is a matrix with `weights` on the diagonals and `D` is the finite difference
    matrix.

    Parameters
    ----------
    data : array-like, shape (N,)
        The y-values of the measured data, with N data points.
    lam : float, optional
        The smoothing parameter. Larger values will create smoother baselines.
        Default is 1e6.
    diff_order : int, optional
        The order of the finite difference matrix. Must be greater than or equal to 0.
        Default is 2 (second order differential matrix). Typical values are 2 or 1.
    weights : array-like, shape (N,), optional
        The weighting array, used to override the function's baseline identification
        to designate peak points. Only elements with 0 or False values will have
        an effect; all non-zero values are considered baseline points. If None
        (default), then will be an array with size equal to N and all values set to 1.
    check_finite : bool, optional
        If True, will raise an error if any values if `data` or `weights` are not finite.
        Default is False, which skips the check.

    Returns
    -------
    y_smooth : numpy.ndarray, shape (N,)
        The smoothed data.

    References
    ----------
    Eilers, P. A Perfect Smoother. Analytical Chemistry, 2003, 75(14), 3631-3636.

    """
    y = _check_array(data, check_finite=check_finite, ensure_1d=True)
    len_y = len(y)
    penalized_system = PenalizedSystem(len_y, lam=lam, diff_order=diff_order)
    weight_array = _check_optional_array(len_y, weights, check_finite=check_finite)

    y_smooth = penalized_system.solve(
        penalized_system.add_diagonal(weight_array),
        weight_array * y, overwrite_ab=True, overwrite_b=True
    )

    return y_smooth




[docs]
def pspline_smooth(data, x_data=None, lam=1e1, num_knots=100, spline_degree=3, diff_order=2,
                   weights=None, check_finite=True):
    """
    Smooths the input data using Penalized Spline smoothing.

    The input is smoothed by solving the equation
    ``(B.T @ W @ B + lam * D.T @ D) y_smooth = B.T @ W @ y``, where `W` is a matrix with
    `weights` on the diagonals, `D` is the finite difference matrix, and `B` is the
    spline basis matrix.

    Parameters
    ----------
    data : array-like, shape (N,)
        The y-values of the measured data, with N data points.
    x_data : array-like, shape (N,), optional
        The x-values of the measured data. Default is None, which will create an
        array from -1 to 1 with N points.
    lam : float, optional
        The smoothing parameter. Larger values will create smoother baselines.
        Default is 1e1.
    num_knots : int, optional
        The number of knots for the spline. Default is 100.
    spline_degree : int, optional
        The degree of the spline. Default is 3, which is a cubic spline.
    diff_order : int, optional
        The order of the finite difference matrix. Must be greater than or equal to 0.
        Default is 2 (second order differential matrix). Typical values are 2 or 1.
    weights : array-like, shape (N,), optional
        The weighting array, used to override the function's baseline identification
        to designate peak points. Only elements with 0 or False values will have
        an effect; all non-zero values are considered baseline points. If None
        (default), then will be an array with size equal to N and all values set to 1.
    check_finite : bool, optional
        If True, will raise an error if any values if `data` or `weights` are not finite.
        Default is False, which skips the check.

    Returns
    -------
    y_smooth : numpy.ndarray, shape (N,)
        The smoothed data.
    tuple(numpy.ndarray, numpy.ndarray, int)
        A tuple of the spline knots, spline coefficients, and spline degree, which can be used to
        reconstruct the fit spline. Useful if needing to recreate the spline with different
        x-values.

    References
    ----------
    Eilers, P., et al. Splines, knots, and penalties. Wiley Interdisciplinary
    Reviews: Computational Statistics, 2010, 2(6), 637-653.

    """
    y, x = _yx_arrays(data, x_data, check_finite=check_finite, ensure_1d=True)
    spline_basis = SplineBasis(x, num_knots, spline_degree, check_finite)
    pspline = PSpline(spline_basis, lam, diff_order)

    weight_array = _check_optional_array(
        len(y), weights, dtype=float, order='C', check_finite=check_finite
    )
    y_smooth = pspline.solve_pspline(y, weight_array)

    return y_smooth, pspline.tck



def _make_data(data_size=1000, bkg_type='exponential'):
    """
    Creates simple datasets for use in examples.

    Parameters
    ----------
    data_size : int, optional
        The number of data points. Default is 1000.
    bkg_type : {'exponential', 'gaussian', 'linear', 'sine'}, optional
        A string designating the type of baseline to add to the data. Default is 'exponential'.

    Returns
    -------
    x : numpy.ndarray, shape (`data_size`,)
        The x-values for the example data.
    y : numpy.ndarray, shape (`data_size`,)
        The y-values for the example data.
    baseline : numpy.ndarray, shape (`data_size`,)
        The true baseline for the example data.

    Raises
    ------
    ValueError
        Raised if `bkg_type` is not an allowed value.

    Notes
    -----
    The example data generated by this function may change without notice to meet the needs
    of the examples in the pybaselines documentation, so outside users are advised not to
    rely on the exact output.

    """
    x = np.linspace(0, 1000, data_size)
    signal = (
        gaussian(x, 9, 100, 12)
        + gaussian(x, 6, 180, 5)
        + gaussian(x, 8, 350, 11)
        + gaussian(x, 15, 400, 18)
        + gaussian(x, 6, 550, 6)
        + gaussian(x, 13, 700, 8)
        + gaussian(x, 9, 800, 9)
        + gaussian(x, 9, 880, 7)
    )
    if bkg_type == 'exponential':
        baseline = 5 + 15 * np.exp(-x / 200)
    elif bkg_type == 'gaussian':
        baseline = 30 + gaussian(x, 20, 500, 150)
    elif bkg_type == 'linear':
        baseline = 1 + x * 0.005
    elif bkg_type == 'sine':
        baseline = 70 + 5 * np.sin(x / 50)
    else:
        raise ValueError(f'unknown bkg_type {bkg_type}')

    noise = np.random.default_rng(0).normal(0, 0.1, data_size)
    y = signal + baseline + noise

    return x, y, baseline