"""This file performs standard scaler normalization on the data."""
from typing import Union
from anndata import AnnData
from anndata.experimental import AnnCollection
import numpy as np
from scalr.data.preprocess import PreprocessorBase
from scalr.utils import EventLogger
class StandardScaler(PreprocessorBase):
    """Standard-scaler normalization (feature-wise centering and scaling).

    Computes the per-feature mean and standard deviation of the train data
    in memory-friendly chunks. The standard deviation is the population
    std (ddof=0), consistent with sklearn's ``StandardScaler``.
    """

    def __init__(self, with_mean: bool = True, with_std: bool = True):
        """Initialize parameters for standard scaler normalization.

        Args:
            with_mean: Whether to center features by the per-feature mean
                during transformation.
            with_std: Whether to scale features by the per-feature standard
                deviation during transformation.
        """
        self.with_mean = with_mean
        self.with_std = with_std

        # Fitted statistics, populated by `fit()`.
        # Each has shape (1, n_features) once fitted.
        self.train_mean = None
        self.train_std = None

        self.event_logger = EventLogger('Standard Scaler Normalization')

    def fit(self, data: Union[AnnData, AnnCollection],
            sample_chunksize: int) -> None:
        """Calculate standard-scaler parameters from the train data.

        Args:
            data: Data to calculate the required parameters of.
            sample_chunksize: Number of samples that can be loaded into
                memory at once.

        Raises:
            ValueError: If `sample_chunksize` is not a positive integer.
        """
        # Guard early: a non-positive chunk size would otherwise surface as
        # an obscure ZeroDivisionError / OverflowError inside the chunk loop.
        if sample_chunksize <= 0:
            raise ValueError('`sample_chunksize` must be a positive integer.')

        self.event_logger.info('\n\nStarting standardscaler normalization')
        self.event_logger.info('\nFitting standard scaler object on train data')

        # Mean must be computed first: the std pass accumulates squared
        # deviations from `self.train_mean`.
        self.calculate_mean(data, sample_chunksize)
        self.calculate_std(data, sample_chunksize)

    def calculate_mean(self, data: Union[AnnData, AnnCollection],
                       sample_chunksize: int) -> None:
        """Calculate the mean of each feature in the train data.

        Args:
            data: Data to calculate the mean of.
            sample_chunksize: Number of samples that can be loaded into
                memory at once.

        Returns:
            Nothing, stores the per-feature mean of the train data in
            `self.train_mean` with shape (1, n_features).
        """
        self.event_logger.info('Calculating mean of data...')

        n_samples, n_features = data.shape
        train_sum = np.zeros((1, n_features))

        # Accumulate per-feature sums chunk by chunk so the full data matrix
        # never has to be materialized in memory at once.
        for start in range(0, n_samples, sample_chunksize):
            train_sum += data[start:start + sample_chunksize].X.sum(axis=0)

        self.train_mean = train_sum / n_samples

        if not self.with_mean:
            self.event_logger.info(
                '`train_mean` will be set to zero during `transform()`, as `with_mean` is set to False!'
            )

    def calculate_std(self, data: Union[AnnData, AnnCollection],
                      sample_chunksize: int) -> None:
        """Calculate the standard deviation of each feature in the train data.

        Args:
            data: Data to calculate the standard deviation of.
            sample_chunksize: Number of samples that can be loaded into
                memory at once.

        Returns:
            Nothing, stores the per-feature standard deviation of the train
            data in `self.train_std` with shape (1, n_features).
        """
        if self.with_std:
            self.event_logger.info('Calculating standard deviation of data...')

            n_samples, n_features = data.shape
            squared_dev_sum = np.zeros((1, n_features))

            # Accumulate squared deviations from the already-fitted mean,
            # one chunk at a time.
            for start in range(0, n_samples, sample_chunksize):
                chunk = data[start:start + sample_chunksize].X
                squared_dev_sum += np.sum(
                    np.power(chunk - self.train_mean, 2), axis=0)

            # Population standard deviation (ddof=0).
            self.train_std = np.sqrt(squared_dev_sum / n_samples)

            # Constant features have std 0; replace with 1 so that dividing
            # by `train_std` during transformation never divides by zero.
            self.train_std[self.train_std == 0] = 1
        else:
            # If `with_std` is False, scaling is a no-op: use all-ones std.
            self.event_logger.info(
                'Setting `train_std` to be 1, as `with_std` is set to False!')
            self.train_std = np.ones((1, data.shape[1]))

    @classmethod
    def get_default_params(cls) -> dict:
        """Class method to get default params for preprocess_config."""
        return dict(with_mean=True, with_std=True)