"""This file performs standard scaler normalization on the data."""
from typing import Union
from anndata import AnnData
from anndata.experimental import AnnCollection
import numpy as np
from scalr.data.preprocess import PreprocessorBase
from scalr.utils import EventLogger
class StandardScaler(PreprocessorBase):
    """Standard-scaler normalization (feature-wise centering and scaling).

    Computes the per-feature mean and standard deviation of the train data
    in memory-friendly chunks. The standard deviation is the population
    std (ddof=0), consistent with sklearn's ``StandardScaler``.
    """

    def __init__(self, with_mean: bool = True, with_std: bool = True):
        """Initialize parameters for standard scaler normalization.

        Args:
            with_mean: Whether to center features by the per-feature mean
                during transformation.
            with_std: Whether to scale features by the per-feature standard
                deviation during transformation.
        """
        self.with_mean = with_mean
        self.with_std = with_std

        # Fitted statistics, populated by `fit()`.
        # Each has shape (1, n_features) once fitted.
        self.train_mean = None
        self.train_std = None

        self.event_logger = EventLogger('Standard Scaler Normalization')

    def fit(self, data: Union[AnnData, AnnCollection],
            sample_chunksize: int) -> None:
        """Calculate standard-scaler parameters from the train data.

        Args:
            data: Data to calculate the required parameters of.
            sample_chunksize: Number of samples that can be loaded into
                memory at once.

        Raises:
            ValueError: If `sample_chunksize` is not a positive integer.
        """
        # Guard early: a non-positive chunk size would otherwise surface as
        # an obscure ZeroDivisionError / OverflowError inside the chunk loop.
        if sample_chunksize <= 0:
            raise ValueError('`sample_chunksize` must be a positive integer.')

        self.event_logger.info('\n\nStarting standardscaler normalization')
        self.event_logger.info('\nFitting standard scaler object on train data')

        # Mean must be computed first: the std pass accumulates squared
        # deviations from `self.train_mean`.
        self.calculate_mean(data, sample_chunksize)
        self.calculate_std(data, sample_chunksize)

    def calculate_mean(self, data: Union[AnnData, AnnCollection],
                       sample_chunksize: int) -> None:
        """Calculate the mean of each feature in the train data.

        Args:
            data: Data to calculate the mean of.
            sample_chunksize: Number of samples that can be loaded into
                memory at once.

        Returns:
            Nothing, stores the per-feature mean of the train data in
            `self.train_mean` with shape (1, n_features).
        """
        self.event_logger.info('Calculating mean of data...')

        n_samples, n_features = data.shape
        train_sum = np.zeros((1, n_features))

        # Accumulate per-feature sums chunk by chunk so the full data matrix
        # never has to be materialized in memory at once.
        for start in range(0, n_samples, sample_chunksize):
            train_sum += data[start:start + sample_chunksize].X.sum(axis=0)

        self.train_mean = train_sum / n_samples

        if not self.with_mean:
            self.event_logger.info(
                '`train_mean` will be set to zero during `transform()`, as `with_mean` is set to False!'
            )

    def calculate_std(self, data: Union[AnnData, AnnCollection],
                      sample_chunksize: int) -> None:
        """Calculate the standard deviation of each feature in the train data.

        Args:
            data: Data to calculate the standard deviation of.
            sample_chunksize: Number of samples that can be loaded into
                memory at once.

        Returns:
            Nothing, stores the per-feature standard deviation of the train
            data in `self.train_std` with shape (1, n_features).
        """
        if self.with_std:
            self.event_logger.info('Calculating standard deviation of data...')

            n_samples, n_features = data.shape
            squared_dev_sum = np.zeros((1, n_features))

            # Accumulate squared deviations from the already-fitted mean,
            # one chunk at a time.
            for start in range(0, n_samples, sample_chunksize):
                chunk = data[start:start + sample_chunksize].X
                squared_dev_sum += np.sum(
                    np.power(chunk - self.train_mean, 2), axis=0)

            # Population standard deviation (ddof=0).
            self.train_std = np.sqrt(squared_dev_sum / n_samples)

            # Constant features have std 0; replace with 1 so that dividing
            # by `train_std` during transformation never divides by zero.
            self.train_std[self.train_std == 0] = 1
        else:
            # If `with_std` is False, scaling is a no-op: use all-ones std.
            self.event_logger.info(
                'Setting `train_std` to be 1, as `with_std` is set to False!')
            self.train_std = np.ones((1, data.shape[1]))

    @classmethod
    def get_default_params(cls) -> dict:
        """Class method to get default params for preprocess_config."""
        return dict(with_mean=True, with_std=True)