"""This file performs standard scaler normalization on the data."""
from typing import Union
from anndata import AnnData
from anndata.experimental import AnnCollection
import numpy as np
from scalr.data.preprocess import PreprocessorBase
class StandardScaler(PreprocessorBase):
    """Class for standard scaler normalization."""

    def __init__(self, with_mean: bool = True, with_std: bool = True):
        """Initialize parameters for standard scaler normalization.

        Args:
            with_mean: Whether to center each feature by subtracting its mean.
            with_std: Whether to scale each feature by its standard deviation.
        """
        self.with_mean = with_mean
        self.with_std = with_std

        # Per-feature statistics learned from the training data.
        self.train_mean = None
        self.train_std = None
    def fit(self, data: Union[AnnData, AnnCollection],
            sample_chunksize: int) -> None:
        """Calculates the standard scaler parameters from the train data.

        Args:
            data: Data to calculate the required parameters from.
            sample_chunksize: Number of samples to load into memory at once.
        """
        # The mean must be computed first, since `calculate_std` uses it.
        self.calculate_mean(data, sample_chunksize)
        self.calculate_std(data, sample_chunksize)
    def calculate_mean(self, data: Union[AnnData, AnnCollection],
                       sample_chunksize: int) -> None:
        """Calculates the mean of each feature in the train data.

        Args:
            data: Data to calculate the mean of.
            sample_chunksize: Number of samples to load into memory at once.

        Returns:
            Nothing, stores the per-feature mean of the train data.
        """
        train_sum = np.zeros(data.shape[1]).reshape(1, -1)

        # Iterate through chunks of data to accumulate the per-feature sum.
        for i in range(int(np.ceil(data.shape[0] / sample_chunksize))):
            train_sum += data[i * sample_chunksize:(i + 1) *
                              sample_chunksize].X.sum(axis=0)

        self.train_mean = train_sum / data.shape[0]
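    # Note: `train_mean` keeps shape (1, n_features) so that it broadcasts
    # row-wise against each chunk of `X` in `calculate_std` below.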
    def calculate_std(self, data: Union[AnnData, AnnCollection],
                      sample_chunksize: int) -> None:
        """Calculates the standard deviation of each feature in the train data.

        Args:
            data: Data to calculate the standard deviation of.
            sample_chunksize: Number of samples to load into memory at once.

        Returns:
            Nothing, stores the per-feature standard deviation of the train data.
        """
        if self.with_std:
            self.train_std = np.zeros(data.shape[1]).reshape(1, -1)

            # Iterate through chunks of data to accumulate the per-feature
            # sum of squared deviations from the mean.
            for i in range(int(np.ceil(data.shape[0] / sample_chunksize))):
                chunk = data[i * sample_chunksize:(i + 1) * sample_chunksize].X
                self.train_std += np.sum(np.power(chunk - self.train_mean, 2),
                                         axis=0)
            self.train_std /= data.shape[0]
            self.train_std = np.sqrt(self.train_std)

            # Replace zero standard deviations with 1 to avoid division by
            # zero when scaling constant features.
            self.train_std[self.train_std == 0] = 1
        else:
            # If `with_std` is False, set `train_std` to 1 (no scaling).
            self.train_std = np.ones((1, data.shape[1]))
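    # Note: dividing by the full sample count yields the population (biased)
    # standard deviation (n, not n - 1), the same convention sklearn's
    # StandardScaler follows.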
    @classmethod
    def get_default_params(cls) -> dict:
        """Class method to get default params for preprocess_config."""
        return dict(with_mean=True, with_std=True)
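

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not part of the library API):
    # fit the scaler on a small in-memory AnnData and check the chunked
    # statistics against NumPy's full-data statistics. A disk-backed
    # AnnCollection would go through the same chunked code path.
    adata = AnnData(np.random.rand(100, 5).astype(np.float32))

    scaler = StandardScaler(with_mean=True, with_std=True)
    scaler.fit(adata, sample_chunksize=32)

    # The chunked mean/std should match the full-data (population) statistics.
    assert np.allclose(scaler.train_mean, adata.X.mean(axis=0), atol=1e-5)
    assert np.allclose(scaler.train_std, adata.X.std(axis=0), atol=1e-5)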