Source code for Stats_Analysis.Base_Dist.NormalDistribution_Class

import numpy as np
from scipy.stats import norm
from scipy.integrate import quad
import matplotlib.pyplot as plt


[docs]
class NormalDistribution:   
    """
    Normal distribution probability distribution.

    This class supports computation of the PDF and CDF for scalar and array inputs, with optional truncation.
    Uses `scipy.stats.norm` for the underlying normal distribution.

    Parameters
    ----------
    mu : float
        The mean of the normal distribution.
    sigma : float
        The standard deviation of the normal distribution. Must be sigma > 0.
    lower_bound : float, optional
        The lower bound. Default is None, meaning no lower bound is applied.
    upper_bound : float, optional
        The upper bound. Default is None, meaning no upper bound is applied.

    Raises
    ------
    ValueError
        If sigma (standard deviation) is not positive.
        If lower_bound >= upper_bound.
    """



[docs]
    def __init__(self, mu, sigma, lower_bound=None, upper_bound=None):
        """
        Initialize the normal distribution with optional truncation.
        """

        self.mu = mu

        # Checks for sigma > 0 as no real meaning of negative standard deviation
        if sigma <= 0:
            raise ValueError("sigma (standard deviation) must be positive.")
        
        self.sigma = sigma

        # Exploit scipys normal distribution as underlying distribution
        self.dist = norm(loc=mu, scale=sigma)

        # If both lower and upper bounds are declared, check that lower_bound < upper_bound, ie the correct way around
        # Also catches for the case where the bounds are equal
        if lower_bound is not None and upper_bound is not None and lower_bound >= upper_bound:
            raise ValueError("lower_bound must be less than upper_bound")

        self.lower_bound = lower_bound
        self.upper_bound = upper_bound

        # This allows truncation from the start
        # Also allows only one bound to be set - ie upper limit only
        if lower_bound is not None or upper_bound is not None:
            self.untrunc_cdf_upper, self.untrunc_cdf_lower = self._calc_trunc_fact()
            self.truncation_factor = self.untrunc_cdf_upper - self.untrunc_cdf_lower
        else:
            self.untrunc_cdf_upper = 1
            self.untrunc_cdf_lower = 0
            self.truncation_factor = 1



[docs]
    def _calc_trunc_fact(self):
        """
        Calculate the normalisation factor if the distribution has been truncated. ie limits applied
        """
        # Calculate the untruncated CDF at the lower bound if it is set
        if self.lower_bound is not None:
            untrunc_cdf_lower = self.dist.cdf(self.lower_bound)
        else:
            untrunc_cdf_lower = 0
        
        # Calculate the untruncated CDF at the upper bound if it is set
        if self.upper_bound is not None:
            untrunc_cdf_upper = self.dist.cdf(self.upper_bound)
        else:
            untrunc_cdf_upper = 1

        # Calculate the area under the curve between the bounds, ie the area that need to be normalised to 1
        return untrunc_cdf_upper, untrunc_cdf_lower

    

[docs]
    def pdf(self, X):
        """
        Calculate the Probability Density Function (PDF), automatically including truncation if bounds are set.

        Parameters
        ----------
        X : float or np.ndarray
            The value(s) at which to evaluate the PDF.

        Returns
        -------
        float or np.ndarray
            The normalized PDF value(s), accounting for optional truncation.

        Notes
        -----
        - For `Z > -beta`, the PDF is defined by lower_bound Gaussian core.
        - For `Z <= -beta`, the PDF transitions to a power-law tail.
        - If truncation bounds are provided, the PDF is zero outside the truncation range.
        """

        # Calculate the untruncated PDF using scipy.stats.norm
        pdf_non_norm = self.dist.pdf(X)

        # If a bound has been set, set the PDF is set to 0 outside the bounds
        # If X is outside the bounds - condition is met and set to 0
        # If X is within the bounds - condition is not met and set to pdf_non_norm
        if self.lower_bound is not None or self.upper_bound is not None:
            pdf = np.where(np.logical_or(X < self.lower_bound, X > self.upper_bound), 0, pdf_non_norm)

        # If no bounds are set, the PDF is untouched
        else:
            pdf = pdf_non_norm

        # Apply the truncation factor calculated earlier (for non truncated is simply 1)
        return pdf / self.truncation_factor



[docs]
    def pdf_fitting(self, X, mu, sigma):
        """
        Calculate the Probability Density Function (PDF) for a given mean and standard deviation for use with MLE fitting.
        """
        # Calculate the untruncated PDF using scipy.stats.norm
        pdf_non_norm = norm.pdf(X, mu, sigma)

        # If a bound has been set, set the PDF is set to 0 outside the bounds
        # If X is outside the bounds - condition is met and set to 0
        # If X is within the bounds - condition is not met and set to pdf_non_norm
        if self.lower_bound is not None or self.upper_bound is not None:
            pdf = np.where(np.logical_or(X < self.lower_bound, X > self.upper_bound), 0, pdf_non_norm)
            truncation_factor_fit = norm.cdf(self.upper_bound, mu, sigma) - norm.cdf(self.lower_bound, mu, sigma)
        # If no bounds are set, the PDF is untouched
        else:
            pdf = pdf_non_norm
            truncation_factor_fit = 1 

        # Apply the truncation factor calculated earlier (for non truncated is simply 1)
        return pdf / truncation_factor_fit

    

[docs]
    def cdf(self, X):
        """
        Calculate the Cumulative Distribution Function (CDF).

        Parameters
        ----------
        X : float or np.ndarray
            The value(s) at which to evaluate the truncated CDF.

        Returns
        -------
        float or np.ndarray
            The truncated CDF value(s).

        Notes
        -----
        - For values less than lower_bound, the CDF equals 0.
        - For values greater than upper_bound, the CDF equals 1
        - Within the bounds, the CDF is scaled by the truncation factor.
        """
        
        # Calculate the untruncated CDF using scipy.stats.norm
        cdf = self.dist.cdf(X)

        # If a lower bound has been declared, set the CDF to be 0 when lower than
        # All contributions are before the lower bound are removed
        if self.lower_bound is not None:
            cdf = np.where(X < self.lower_bound, 0, cdf - self.untrunc_cdf_lower)

        # If an upper bound has been declared, set the CDF to be 1 - any probability not accounted for 
        if self.upper_bound is not None:
            cdf = np.where(X > self.upper_bound, self.untrunc_cdf_upper - self.untrunc_cdf_lower, cdf)

        # Apply the truncation factor to the CDF
        return cdf / self.truncation_factor

    

[docs]
    def cdf_fitting(self, X,  mu, sigma):
        """
        Calculate the Cumulative Distribution Function (CDF) with no set parameters, automatically including truncation if bounds are set, for use in Binned MLE fitting.

        Parameters
        ----------
        X : float or np.ndarray
            The value(s) at which to evaluate the truncated CDF.

        Returns
        -------
        float or np.ndarray
            The truncated CDF value(s).

        Notes
        -----
        - For values less than lower_bound, the CDF equals 0.
        - For values greater than upper_bound, the CDF equals 1
        - Within the bounds, the CDF is scaled by the truncation factor.
        """
        
        # Calculate the untruncated CDF using scipy.expon
        cdf = norm.cdf(X, mu, sigma)
        if self.lower_bound is not None or self.upper_bound is not None:
                    untrunc_cdf_lower = norm.cdf(self.lower_bound, mu, sigma)
                    untrunc_cdf_upper = norm.cdf(self.upper_bound, mu, sigma)
                    cdf = np.where(X < self.lower_bound, 0, cdf - untrunc_cdf_lower)
                    cdf = np.where(X > self.upper_bound, untrunc_cdf_upper - untrunc_cdf_lower, cdf)
                    truncation_factor_fit = untrunc_cdf_upper - untrunc_cdf_lower
                    return cdf / truncation_factor_fit
        else:
            return cdf

        
        

[docs]
    def normalisation_check(self):
        """
        Perform a numerical integration using scipy.integrate.quad to check the normalization of the PDF.

        If the PDF has been truncated:
        It is first performed over the region the PDF is defined [lower_bound, upper_bound]

        It is then performed over the entire real line (-infinity to infinity).

        Prints the results of the numerical integrations.
        """
        
        if self.lower_bound is None:
            lower_bound = -np.inf
        else:
            lower_bound = self.lower_bound

        if self.upper_bound is None:
            upper_bound = np.inf
        else:
            upper_bound = self.upper_bound

        if self.lower_bound is not None or self.upper_bound is not None:
            print(f"Normalisation over the region the PDF is defined/truncated: [{lower_bound},{upper_bound}]")
            integral_bounded, error_bounded = quad(lambda x: self.pdf(x), lower_bound, upper_bound)
            print(f"Integral: {integral_bounded} \u00B1 {error_bounded}")

        print(f"Normalisation over the whole real line: [infinity to infinity]")
        integral_inf, error_inf = quad(lambda x: self.pdf(x), -np.inf, np.inf)
        print(f"Integral: {integral_inf} \u00B1 {error_inf}")



[docs]
    def plot_dist(self):
        """
        Plot the PDF and CDF for the Crystal Ball distribution.

        If both lower and upper bounds are set:
        The PDF and CDF are plotted between[lower_bound, upper_bound].

        If both lower and upper bounds are not set:
        The PDF and CDF is plotted between [mu - 5*sigma, mu + 5*sigma]
        """

        # Set the range of the plot
        if self.lower_bound is None and self.upper_bound is None:
            X = np.linspace(self.mu - 5*self.sigma, self.mu + 5*self.sigma, 1000)

        if self.lower_bound is not None and self.upper_bound is None:
            X = np.linspace(self.lower_bound, self.lower_bound + self.sigma*10, 1000)

        if self.lower_bound is None and self.upper_bound is not None:
            X = np.linspace(self.upper_bound - self.sigma*10, self.upper_bound, 1000)

        else:
            X = np.linspace(self.lower_bound-0.1*(self.upper_bound-self.lower_bound), self.upper_bound+0.1*(self.upper_bound-self.lower_bound), 1000)


        # LHS Plot: the PDF
        plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.plot(X, self.pdf(X), color='black', linestyle='-', label='PDF')
        plt.xlim(X[0], X[-1])
        plt.axvline(self.mu, color='r', linestyle='--', label=r'Mean: $\mu$')
        plt.axvline(self.mu+self.sigma*2, color='b', linestyle='--', label=r'+/- 2$\sigma$')
        plt.axvline(self.mu-self.sigma*2, color='b', linestyle='--')
        plt.xlabel('X', fontsize=14)
        plt.ylabel('Normal PDF(X)', fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.legend(fontsize=14)
        plt.legend()

        # RHS plot: the CDF
        plt.subplot(1, 2, 2)
        plt.plot(X, self.cdf(X), color='black', linestyle='-', label='CDF')
        plt.xlim(X[0], X[-1])
        plt.axvline(self.mu, color='r', linestyle='--', label=r'Mean: $\mu$')
        plt.axvline(self.mu+self.sigma*2, color='b', linestyle='--', label=r'+/- 2$\sigma$')
        plt.axvline(self.mu-self.sigma*2, color='b', linestyle='--')
        plt.xlabel('X', fontsize=14)
        plt.ylabel('Normal CDF(X)', fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.legend(fontsize=14)
        plt.legend()

        plt.tight_layout()
        plt.show()