Add docstrings

cba4fa90 · Theo Serralta · c531573b · cba4fa90
Commit cba4fa90 authored Nov 06, 2024 by Theo Serralta
Hide whitespace changes
Inline Side-by-side

Showing with 238 additions and 193 deletions

cnv_sv_caller_gpu.py cnv_sv_caller_gpu.py +238 -193

No files found.
--- a/cnv_sv_caller_gpu.py
+++ b/cnv_sv_caller_gpu.py
@@ -4,7 +4,7 @@
 ## find_SV_gpu.py
 ## Version : 2.10.0
 ## Description : This script find CNV and SV events
-## Usage : python test_gpu_time_SV.py -b <input/bamfile> -c <num_chr (int) | "All"> -w 100 -s 10 -z 1.5 -l 200 -p <output/tsv/pairs> -m <output/tsv/splits> -e <log/file>
+## Usage : python3 cnv_sv_caller_gpu.py -b <input/bamfile> -c <num_chr (int) | "All"> -w 100 -s 10 -z 1.5 -l 200 -o <output/vcf/cnv> -p <output/tsv/pairs> -m <output/tsv/splits> -e <log/file>
 ## Output : An events file for CNV, paired-reads and split-reads
 ## Requirements : python 3.0+
@@ -13,7 +13,6 @@
 ## last revision date : 20240719
 ## Known bugs : Don't find correct events
 import pysam
 import sys
 import getopt
@@ -39,7 +38,8 @@ def parse_arguments():
    """
    Parse command-line arguments.
-    This function parses the command-line arguments provided to the script and extracts the values for various parameters.
+    This function parses the command-line arguments provided to the script and extracts values for various parameters
+    related to CNV and SV detection in genomic data.
    Parameters
    ----------
@@ -49,20 +49,26 @@ def parse_arguments():
    -------
    tuple
        A tuple containing the following elements:
-        bamfile_path : str or None
+        bamfile_path : Optional[str]
-            The path to the BAM file.
+            The path to the BAM file containing genomic data.
-        window_size : int or None
+        num_chr : Optional[str]
-            The size of the window.
+            The specific chromosome number (if any) to be processed.
-        step_size : int or None
+        window_size : Optional[int]
-            The step size for the analysis.
+            The size of the window used for analysis.
-        zscore_threshold : float or None
+        step_size : Optional[int]
-            The threshold for the Z-score.
+            The step size between windows for scanning the genome.
-        lengthFilter : int or None
+        zscore_threshold : Optional[float]
-            The filter for length.
+            The Z-score threshold to identify significant variants.
-        output_file : str or None
+        lengthFilter : Optional[int]
-            The path to the output file.
+            The minimum length filter for variants.
-        logfile : str or None
+        output_file : Optional[str]
-            The path to the log file.
+            The path where the main output file will be saved.
+        output_file_pairs : Optional[str]
+            The path for the output file that stores paired reads data.
+        output_file_splits : Optional[str]
+            The path for the output file that stores split reads data.
+        logfile : Optional[str]
+            The path where the log file will be stored.
        Each element can be None if the corresponding argument was not provided.
    """
@@ -143,7 +149,7 @@ if __name__ == "__main__":
 # Code CUDA
 mod = SourceModule(
    """
-//Kernel pour calculer la profondeur moyenne brute
+// Kernel to compute the raw average depth
 __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_size, int step_size, float *depth_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -153,7 +159,6 @@ __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_
        int pos_end = pos_start + window_size;
        int count_reads = 0;
        for (int i = pos_start; i < pos_end; i++) {
            count_reads += depth_data[i];
        }
@@ -163,7 +168,7 @@ __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_
    }
 }
-//Kernel pour calculer le GC content
+// Kernel to compute GC content
 __global__ void calcul_gc_kernel(char *gc_data, int seq_length, int window_size, int step_size, float *gc_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -172,24 +177,19 @@ __global__ void calcul_gc_kernel(char *gc_data, int seq_length, int window_size,
        int pos_start = (idx * step_size) + 1;
        int pos_end = pos_start + window_size;
        int gc_count = 0;
-        //printf(gc_data);
-        //printf(" ");
        for (int i = pos_start; i <= pos_end; ++i) {
-            if ((gc_data[i] == 'G') or (gc_data[i] == 'C') or (gc_data[i] == 'g') or (gc_data[i] == 'c')) {
+            if ((gc_data[i] == 'G') || (gc_data[i] == 'C') || (gc_data[i] == 'g') || (gc_data[i] == 'c')) {
-                //printf(&gc_data[i]);
-                //printf(" ");
                gc_count++;
            }
        }
        float avg_gc = ((float)gc_count / window_size) * 100;
        gc_results[idx] = avg_gc;
    }
 }
-// Kernel pour calculer la mappabilite moyenne ponderee
+// Kernel to compute weighted average mappability
 __global__ void calcul_map_kernel(float *map_data, int seq_length, int window_size, int step_size, float *map_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -211,7 +211,7 @@ __global__ void calcul_map_kernel(float *map_data, int seq_length, int window_si
    }
 }
-// Kernel pour filtrer selon le gc
+// Kernel to filter based on GC content
 __global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, int n, float *gc_35, float *gc_40, float *gc_45, float *gc_50, float *gc_55) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
@@ -222,7 +222,6 @@ __global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, i
        int gc_value_rounded = round(gc_value);
        if (gc_value_rounded == 35) {
-            //printf(gc_value_rounded)
            int pos = atomicAdd((int*)&gc_35[0], 1);
            gc_35[pos + 1] = depth_value;
        } else if (gc_value_rounded == 40) {
@@ -242,7 +241,7 @@ __global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, i
 }
-// Kernel pour calculer la lecture de profondeur corrigee par la mappabilitee
+// Kernel to compute depth corrected by mappability
 __global__ void calcul_depth_correction_kernel(float *depth_results, float *map_results, int seq_length, int window_size, int step_size, float *depth_correction_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -251,14 +250,14 @@ __global__ void calcul_depth_correction_kernel(float *depth_results, float *map_
        float avg_depth = depth_results[idx];
        float avg_map = map_results[idx];
-        // Verification si avg_map est egal a 0 pour eviter la division par 0
+        // Check if avg_map is equal to 0 to avoid division by 0
        float depth_correction = (avg_map != 0.0f) ? (avg_depth / avg_map) : 0.0f;
        depth_correction_results[idx] = depth_correction;
    }
 }
-// Kernel pour normaliser la profondeur corrigee
+// Kernel to normalize corrected depth
 __global__ void normalize_depth_kernel(float *depth_correction, float *gc_results, float m, float *gc_to_median, int seq_length, int window_size, int step_size, float *depth_normalize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -266,13 +265,13 @@ __global__ void normalize_depth_kernel(float *depth_correction, float *gc_result
    if (idx < seq_length - window_size + 1) {
        float mGC = gc_to_median[(int)gc_results[idx]];
-        // Verification si mGC est egal a 0 pour eviter la division par 0
+        // Check if mGC is equal to 0 to avoid division by 0
        float depth_normalize_val = (mGC != 0.0f) ? (depth_correction[idx] * m / mGC) : 0.0f;
        depth_normalize[idx] = depth_normalize_val;
    }
 }
-// Kernel pour calculer le ratio par window
+// Kernel to compute ratio per window
 __global__ void ratio_par_window_kernel(float *depth_results, float mean_chr, int seq_length, int window_size, int step_size, float *ratio_par_window_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -283,7 +282,7 @@ __global__ void ratio_par_window_kernel(float *depth_results, float mean_chr, in
    }
 }
-// Kernel pour calculer le ratio normalise par window
+// Kernel to compute normalized ratio per window
 __global__ void ratio_par_window_norm_kernel(float *normalize_depth_results, float mean_chr_norm, int seq_length, int window_size, int step_size, float *ratio_par_window_norm_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -294,7 +293,7 @@ __global__ void ratio_par_window_norm_kernel(float *normalize_depth_results, flo
    }
 }
-// Kernel pour calculer le z_score
+// Kernel to compute Z-score
 __global__ void z_score_kernel(float *ratio_norm, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -302,18 +301,18 @@ __global__ void z_score_kernel(float *ratio_norm, float mean_ratio, float std_ra
    if (idx < seq_length - window_size + 1) {
        float value = ratio_norm[idx];
-        // Verification si la valeur est inf ou NaN, et remplacer par 0 si c est le cas
+        // Check if the value is infinite or NaN, and replace it with 0 if so
        if (isinf(value) || isnan(value)) {
            value = 0.0f;
        }
-        // Calcul du Z-score
+        // Compute the Z-score
        float z_score = (value - mean_ratio) / std_ratio;
        z_score_results[idx] = z_score;
    }
 }
-// Kernel pour calculer le ratio divise par le ratio moyen
+// Kernel to compute the ratio divided by the mean ratio
 __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -321,12 +320,12 @@ __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio,
    if (idx < seq_length - window_size + 1) {
        float value = ratio_norm[idx];
-        // Verification si la valeur est inf ou NaN et remplacement par 0 si c est le cas
+        // Check if the value is infinite or NaN and replace it with 0 if so
        if (isinf(value) || isnan(value)) {
            value = 0.0f;
        }
-        // Calcul du ratio par rapport au ratio moyen
+        // Compute the ratio relative to the mean ratio
        float ratio_mean_ratio = value / mean_ratio;
        ratio_par_mean_ratio_results[idx] = ratio_mean_ratio;
    }
@@ -335,7 +334,7 @@ __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio,
 """
 )
-# Obtention de la fonction de kernel compilée
+# Retrieving the compiled kernel function
 calcul_depth_kernel_cuda = mod.get_function("calcul_depth_kernel")
 calcul_gc_kernel_cuda = mod.get_function("calcul_gc_kernel")
 calcul_map_kernel_cuda = mod.get_function("calcul_map_kernel")
@@ -586,153 +585,28 @@ def calcul_gc_content(seq_length, chr, seq):
    return gc_data
-def calcul_depth_seq(seq_length, bamfile_path, chr):
-    """
-    Calculate the sequencing depth for a given chromosome.
-    This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
-    Parameters
-    ----------
-    seq_length : int
-        The length of the sequence.
-    bamfile_path : pysam.AlignmentFile
-        The path to the BAM file opened with pysam.AlignmentFile.
-    chr : str
-        The chromosome for which the depth is calculated.
-    Returns
-    -------
-    numpy.ndarray
-        An array of integers representing the sequencing depth for the given sequence length.
-    """
-    logging.info(f"Entering calcul_depth_seq for {chr}")
-    start_time = time.time()
-    depth_data = np.zeros(seq_length, dtype=np.int32)
-    for pileupcolumn in bamfile_path.pileup(reference = chr):
-        pos = pileupcolumn.reference_pos
-        if pos >= seq_length:
-            break
-        depth_data[pos] = pileupcolumn.nsegments
-    #depth_data = bamfile_path.pileup().nsegments
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
-    return depth_data
-def calcul_depth_seq_copy(seq_length, bamfile_path, chr):
-    """
-    Calculate the sequencing depth for a given chromosome.
-    This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
-    Parameters
-    ----------
-    seq_length : int
-        The length of the sequence.
-    bamfile_path : pysam.AlignmentFile
-        The path to the BAM file opened with pysam.AlignmentFile.
-    chr : str
-        The chromosome for which the depth is calculated.
-    Returns
-    -------
-    numpy.ndarray
-        An array of integers representing the sequencing depth for the given sequence length.
-    """
-    logging.info(f"Entering calcul_depth_seq for {chr}")
-    start_time = time.time()
-    depth_data = np.zeros(seq_length, dtype=np.int32)
-    i = 0
-    it = bamfile_path.pileup(reference = chr)
-    while i < (seq_length - 1):
-        t = next(it)
-        pos = t.reference_pos
-        depth_data[pos] = t.nsegments
-        print(i)
-        i += 1
-    #for pileupcolumn in bamfile_path.pileup():
-      #  pos = pileupcolumn.reference_pos
-     #   if pos >= seq_length:
-       #     break
-        #depth_data[pos] = pileupcolumn.nsegments
-    #print(bamfile_path.pileup())    print("########################")
-#    depth_data = np.array(bamfile_path.pileup().nsegments)
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
-    return depth_data
-def calcul_depth_seq_count(seq_length, bamfile_path, chr):
-    """
-    Calculate the sequencing depth for a given chromosome.
-    This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
-    Parameters
-    ----------
-    seq_length : int
-        The length of the sequence.
-    bamfile_path : pysam.AlignmentFile
-        The path to the BAM file opened with pysam.AlignmentFile.
-    chr : str
-        The chromosome for which the depth is calculated.
-    Returns
-    -------
-    numpy.ndarray
-        An array of integers representing the sequencing depth for the given sequence length.
-    """
-    logging.info(f"Entering calcul_depth_seq_count for {chr}")
-    start_time = time.time()
-    # Use count_coverage to get coverage depth
-    depth_data_count = bamfile_path.count_coverage(contig = chr)
-    type(depth_data_count)
-    #print(depth_data_count[:10056])
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
-    return depth_data_count
 def calcul_depth_seq_samtools(seq_length, bamfile_path, chr, num_threads=12):
    """
    Calculate the sequencing depth for a given chromosome using a parallelized bash script.
+    This function uses a bash script with multithreading to compute the depth for a specific chromosome 
+    from a BAM file, and returns the depth values as a numpy array.
    Parameters
    ----------
    seq_length : int
        The length of the sequence.
    bamfile_path : str
-        The path to the BAM file.
+        The path to the BAM file containing sequencing data.
    chr : str
        The chromosome for which the depth is calculated.
-    num_threads : int
+    num_threads : int, optional
-        The number of threads to use for parallel processing.
+        The number of threads to use for parallel processing (default is 12).
    Returns
    -------
    numpy.ndarray
-        An array of integers representing the sequencing depth for the given sequence length.
+        An array of integers representing the sequencing depth for the specified sequence length.
    """
    logging.info(f"Entering calcul_depth_seq for {chr}")
    start_time = time.time()
@@ -771,12 +645,15 @@ def calcul_med_total(depth_correction_results, chr):
    """
    Calculate the median of non-zero depth correction results.
-    This function filters out zero values from the depth correction results and computes the median of the remaining values.
+    This function filters out zero values from the depth correction results and computes the median 
+    of the remaining non-zero values.
    Parameters
    ----------
    depth_correction_results : list or numpy.ndarray
        A list or array of depth correction values.
+    chr : str
+        The chromosome identifier for which the median is calculated.
    Returns
    -------
@@ -805,7 +682,8 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr):
    """
    Calculate the median depth correction for each unique GC content value.
-    This function computes the median depth correction values for each unique GC content value, filtering out zero values.
+    This function computes the median depth correction values for each unique GC content value in `gc_results`, 
+    filtering out zero values from `depth_correction_results`.
    Parameters
    ----------
@@ -813,6 +691,8 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr):
        A list or array of GC content values.
    depth_correction_results : list or numpy.ndarray
        A list or array of depth correction values.
+    chr : str
+        The chromosome identifier for which the medians are calculated.
    Returns
    -------
@@ -857,12 +737,15 @@ def calcul_moy_totale(normalize_depth_results, chr):
    """
    Calculate the mean of non-zero normalized depth results.
-    This function filters out zero values from the normalized depth results and computes the mean of the remaining values.
+    This function filters out zero values from the normalized depth results and computes the mean 
+    of the remaining non-zero values.
    Parameters
    ----------
    normalize_depth_results : list or numpy.ndarray
        A list or array of normalized depth values.
+    chr : str
+        The chromosome identifier for which the mean is calculated.
    Returns
    -------
@@ -886,18 +769,19 @@ def calcul_moy_totale(normalize_depth_results, chr):
    return mean_chr
 def calcul_std(normalize_depth_results, chr):
    """
    Calculate the standard deviation of non-zero normalized depth results.
-    This function filters out zero values from the normalized depth results and computes the standard deviation of the remaining values.
+    This function filters out zero values from the normalized depth results and computes the standard deviation 
+    of the remaining non-zero values.
    Parameters
    ----------
    normalize_depth_results : list or numpy.ndarray
        A list or array of normalized depth values.
+    chr : str
+        The chromosome identifier for which the standard deviation is calculated.
    Returns
    -------
@@ -927,17 +811,21 @@ def compute_mean_std_med(ratio_par_window_results, chr):
    """
    Compute the mean, standard deviation, and median of non-zero ratio results per window.
-    This function filters out zero and -1 values from the ratio results per window and computes the mean, standard deviation, and median of the remaining values. Values greater than or equal to 5 are capped at 5.
+    This function filters out zero and -1 values from the ratio results per window and computes the mean, 
+    standard deviation, and median of the remaining values. Values greater than or equal to 5 are capped at 5.
    Parameters
    ----------
    ratio_par_window_results : list or numpy.ndarray
        A list or array of ratio values per window.
+    chr : str
+        The chromosome identifier for which the statistics are calculated.
    Returns
    -------
    tuple
-        A tuple containing the mean, standard deviation, and median of the filtered ratio values.
+        A tuple containing the mean, standard deviation, and median of the filtered ratio values:
+        (mean_ratio, std_ratio, med_ratio).
    """
    logging.info(f"Entering compute_mean_std_med for {chr}")
    start_time = time.time()
@@ -980,6 +868,8 @@ def cn_level(x, chr):
    ----------
    x : float
        The input value used to determine the copy number level.
+    chr : str
+        The chromosome identifier for which the copy number level is calculated.
    Returns
    -------
@@ -1017,7 +907,6 @@ def get_sample_name(bamfile_path):
    str
        The sample name extracted from the BAM file. If no sample name is found, returns "UnknownSample".
    """
    logging.info(f"Entering get_sample_name")
    start_time = time.time()
@@ -1029,9 +918,31 @@ def get_sample_name(bamfile_path):
    end_time = time.time()
    elapsed_time = end_time - start_time
    logging.info(f"Leaving get_sample_name (Time taken: {elapsed_time:.4f} seconds)")
    return "UnknownSample"
 def filter_read_gc(gc_results, depth_results, chr):
+    """
+    Filter depth results by GC content categories.
+    This function groups depth values based on rounded GC content categories (35, 40, 45, 50, 55) and returns 
+    the depth values for each category as separate arrays.
+    Parameters
+    ----------
+    gc_results : list or numpy.ndarray
+        A list or array of GC content values.
+    depth_results : list or numpy.ndarray
+        A list or array of depth values corresponding to the GC content values.
+    chr : str
+        The chromosome identifier for which the filtering is performed.
+    Returns
+    -------
+    tuple
+        A tuple containing five numpy arrays, each corresponding to depth values for rounded GC content categories:
+        (gc_35, gc_40, gc_45, gc_50, gc_55).
+    """
    logging.info(f"Entering filter_read_gc for {chr}")
    start_time = time.time()
@@ -1066,6 +977,34 @@ def filter_read_gc(gc_results, depth_results, chr):
    return gc_35, gc_40, gc_45, gc_50, gc_55
 def filter_read_gc_gpu(gc_results, d_gc_results, d_depth_results, grid_size, block_size, chr):
+    """
+    Filter depth results by GC content categories using GPU computation.
+    This function utilizes a GPU kernel to group depth values based on rounded GC content categories (35, 40, 45, 50, 55) 
+    and returns the depth values for each category as separate arrays. The function allocates memory on the GPU, 
+    copies the data, executes the kernel, and retrieves the results.
+    Parameters
+    ----------
+    gc_results : list or numpy.ndarray
+        A list or array of GC content values.
+    d_gc_results : pycuda.gpuarray.GPUArray
+        A GPU array containing GC content values.
+    d_depth_results : pycuda.gpuarray.GPUArray
+        A GPU array containing depth values.
+    grid_size : int
+        The grid size for the GPU kernel execution.
+    block_size : int
+        The block size for the GPU kernel execution.
+    chr : str
+        The chromosome identifier for which the filtering is performed.
+    Returns
+    -------
+    tuple
+        A tuple containing five numpy arrays, each corresponding to depth values for rounded GC content categories:
+        (gc_35, gc_40, gc_45, gc_50, gc_55).
+    """
    logging.info(f"Entering filter_read_gc_gpu for {chr}")
    start_time = time.time()
@@ -1126,6 +1065,33 @@ def filter_read_gc_gpu(gc_results, d_gc_results, d_depth_results, grid_size, blo
    return gc_35, gc_40, gc_45, gc_50, gc_55
 def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr):
+    """
+    Calculate polynomial fits for depth arrays grouped by GC content.
+    This function computes a 3rd-degree polynomial fit for each depth array (gc_35, gc_40, gc_45, gc_50, gc_55).
+    Each fit is represented as a polynomial object. If a depth array is empty, `None` is returned for that GC category.
+    Parameters
+    ----------
+    gc_35 : numpy.ndarray
+        Array of depth values for GC content category 35.
+    gc_40 : numpy.ndarray
+        Array of depth values for GC content category 40.
+    gc_45 : numpy.ndarray
+        Array of depth values for GC content category 45.
+    gc_50 : numpy.ndarray
+        Array of depth values for GC content category 50.
+    gc_55 : numpy.ndarray
+        Array of depth values for GC content category 55.
+    chr : str
+        The chromosome identifier for which the polynomial fits are calculated.
+    Returns
+    -------
+    list
+        A list of polynomial objects (numpy.poly1d) for each GC content category, where `None` is returned
+        for categories with empty depth arrays.
+    """
    logging.info(f"Entering calc_polynom for {chr}")
    start_time = time.time()
@@ -1147,6 +1113,24 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr):
    return polynomials
 def calc_mean_chr(depth_results, chr):
+    """
+    Calculate the mean of non-zero depth results for a chromosome.
+    This function filters out zero values from the depth results and computes the mean of the remaining 
+    non-zero values for the specified chromosome.
+    Parameters
+    ----------
+    depth_results : list or numpy.ndarray
+        A list or array of depth values.
+    chr : str
+        The chromosome identifier for which the mean depth is calculated.
+    Returns
+    -------
+    float
+        The mean of the non-zero depth values, or 0 if no non-zero values are present.
+    """
    logging.info(f"Entering calc_mean_chr for {chr}")
    start_time = time.time()   
@@ -1165,6 +1149,31 @@ def calc_mean_chr(depth_results, chr):
    return mean_chr
 def find_polynom(ratio_par_window_results, chr, polynomials, depth_results):
+    """
+    Find the best polynomial fit for copy number ratio results.
+    This function identifies the polynomial from a list of polynomials that best fits the copy number (CN) 
+    values calculated from the ratio per window results. The best fit is determined by minimizing the 
+    difference between normalized polynomial values and unique depth values.
+    Parameters
+    ----------
+    ratio_par_window_results : list or numpy.ndarray
+        A list or array of ratio values per window.
+    chr : str
+        The chromosome identifier for which the best polynomial is determined.
+    polynomials : list of numpy.poly1d or None
+        A list of polynomial objects, where each polynomial corresponds to a GC content category.
+        Polynomials may be `None` if no fit is available for that category.
+    depth_results : list or numpy.ndarray
+        A list or array of depth values used for comparison with polynomial predictions.
+    Returns
+    -------
+    numpy.poly1d or None
+        The polynomial that best fits the data based on the minimum difference in normalized values, 
+        or `None` if no suitable polynomial is found.
+    """
    logging.info(f"Entering find_polynom for {chr}")
    start_time = time.time()
@@ -1207,6 +1216,28 @@ def find_polynom(ratio_par_window_results, chr, polynomials, depth_results):
    return best_polynom
 def normalize(depth_results, best_polynom, map_results):
+    """
+    Normalize depth results using a polynomial model and mappability results.
+    This function normalizes depth values by dividing each non-zero depth value by the product of the 
+    corresponding polynomial value and mappability score. If no polynomial is provided, the depth value is 
+    used as-is. Zero values in either depth or mappability results are set to zero in the output.
+    Parameters
+    ----------
+    depth_results : list or numpy.ndarray
+        A list or array of depth values to be normalized.
+    best_polynom : numpy.poly1d or None
+        A polynomial model for normalization. If `None`, no polynomial normalization is applied.
+    map_results : list or numpy.ndarray
+        A list or array of mappability values corresponding to the depth results.
+    Returns
+    -------
+    numpy.ndarray
+        An array of normalized depth values, where zero values are retained where either depth or mappability 
+        was zero.
+    """
    logging.info("Entering normalization")
    start_time = time.time()
@@ -1222,7 +1253,6 @@ def normalize(depth_results, best_polynom, map_results):
    #normalize_depth_results_unique = np.unique(normalize_depth_results)
    #logging.info(f"normalize_depth_results_unique = {normalize_depth_results_unique}") 
    has_nan = np.isnan(normalize_depth_results).any()
    has_inf = np.isinf(normalize_depth_results).any()
@@ -1242,11 +1272,12 @@ def create_signal(signal, chr, z_score_results, step_size):
    Create a signal dictionary for a specific chromosome based on z-score results.
    This function populates a signal dictionary with positions and corresponding z-score results for a given chromosome.
+    Each position is calculated based on the step size and is mapped to the corresponding z-score value.
    Parameters
    ----------
    signal : dict
-        A dictionary to store the signal data.
+        A dictionary to store the signal data, where chromosome names are keys and position-to-z-score mappings are values.
    chr : str
        The chromosome for which the signal is created.
    z_score_results : list or numpy.ndarray
@@ -1257,7 +1288,7 @@ def create_signal(signal, chr, z_score_results, step_size):
    Returns
    -------
    None
-        The function modifies the signal dictionary in place.
+        The function modifies the `signal` dictionary in place.
    """
    logging.info(f"Entering create_signal for {chr} (include copy_number_level)")
    start_time = time.time()
@@ -1334,19 +1365,26 @@ def segmentation(events, segment, chr):
    """
    Segment the detected events into contiguous regions with the same copy number level.
-    This function processes the detected events and groups contiguous regions with the same copy number level into segments.
+    This function processes the detected events and groups contiguous regions with the same copy number level 
+    into segments, modifying the `segment` dictionary in place to store the start, end, and copy number 
+    information for each contiguous region.
    Parameters
    ----------
    events : dict
-        A dictionary of detected events for each chromosome.
+        A dictionary of detected events for each chromosome. The keys are chromosome names, and the values are dictionaries 
+        with positions as keys and copy number levels as values.
    segment : dict
-        A dictionary to store the segmented regions.
+        A dictionary to store the segmented regions, with chromosome names as keys. Each chromosome's value is a dictionary 
+        where keys represent segment ranges (e.g., "start-end") and values are dictionaries containing 'start', 'end', and 
+        'cn' (copy number level) for the segment.
+    chr : str
+        The chromosome identifier being processed.
    Returns
    -------
    None
-        The function modifies the segment dictionary in place.
+        The function modifies the `segment` dictionary in place, adding segmented regions.
    """
    logging.info(f"Entering segmentation for {chr}")
    start_time = time.time()
@@ -1395,7 +1433,10 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr,
    """
    Generate a VCF file containing structural variant calls based on segmented regions and signal data.
-    This function creates a VCF (Variant Call Format) file containing structural variant calls derived from segmented regions and signal data. The structural variant type (DEL for deletion or DUP for duplication) is determined based on copy number levels and signal values. The resulting VCF file includes information about the chromosome, position, type of structural variant, copy number, and other relevant information.
+    This function creates a VCF (Variant Call Format) file containing structural variant calls derived from 
+    segmented regions and signal data. The structural variant type (DEL for deletion or DUP for duplication) 
+    is determined based on copy number levels and signal values. The resulting VCF file includes information 
+    about the chromosome, position, type of structural variant, copy number, and other relevant information.
    Parameters
    ----------
@@ -1409,6 +1450,10 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr,
        The minimum length threshold for including variants in the VCF file.
    output_file : str
        The path to the output VCF file.
+    chr : str
+        The chromosome identifier for which the VCF output is generated.
+    normalize_depth_results : list or numpy.ndarray
+        An array or list of normalized depth values used to filter structural variant calls.
    Returns
    -------