Add docstrings

f3fb9d8e · Theo Serralta · cba4fa90 · f3fb9d8e
Commit f3fb9d8e authored Nov 06, 2024 by Theo Serralta
Hide whitespace changes
Inline Side-by-side

Showing with 147 additions and 23 deletions

cnv_sv_caller_gpu.py cnv_sv_caller_gpu.py +147 -23

No files found.
--- a/cnv_sv_caller_gpu.py
+++ b/cnv_sv_caller_gpu.py
@@ -1523,6 +1523,26 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr,


 def calcul_distance(bamfile_handle, chr, seq_length):
+    """
+    Calculate template distances for paired-end reads in a specified chromosome.
+
+    This function calculates the absolute template length (distance) for each paired-end read in the specified chromosome, 
+    filtering out unmapped reads and reads with unmapped mates. The distances are stored in an array for further analysis.
+
+    Parameters
+    ----------
+    bamfile_handle : pysam.AlignmentFile
+        A handle to an open BAM file for reading.
+    chr : str
+        The chromosome identifier for which distances are calculated.
+    seq_length : int
+        Length of the chromosome sequence; not used directly by this function.
+
+    Returns
+    -------
+    numpy.ndarray
+        An array of integer distances representing the absolute template lengths for paired-end reads in the chromosome.
+    """
    logging.info(f"Entering calcul_distance for {chr}")
    start_time = time.time()
    
@@ -1545,6 +1565,27 @@ def calcul_distance(bamfile_handle, chr, seq_length):
    return distances_data

 def stats_distances(distances_data, chr):
+    """
+    Calculate statistics on template distances for a specified chromosome.
+
+    This function calculates the standard deviation and median of the template distances for a given chromosome.
+    It then uses these values to determine a minimum and maximum distance range based on one standard deviation 
+    from the median.
+
+    Parameters
+    ----------
+    distances_data : numpy.ndarray
+        An array of integer distances representing the template lengths for paired-end reads.
+    chr : str
+        The chromosome identifier for which the distance statistics are calculated.
+
+    Returns
+    -------
+    tuple
+        A tuple containing:
+        - MIN_DISTANCE : float, the lower bound distance (median - standard deviation).
+        - MAX_DISTANCE : float, the upper bound distance (median + standard deviation).
+    """
    logging.info(f"Entering stats_distances for {chr}")
    start_time = time.time()
    
@@ -1564,6 +1605,37 @@ def stats_distances(distances_data, chr):
    return MIN_DISTANCE, MAX_DISTANCE
        
 def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle, seq_length):
+    """
+    Detect paired-end and split-read structural variant signals in a chromosome.
+
+    This function scans paired-end and split-read data from a BAM file to identify structural variant signals
+    (e.g., inversions, deletions, insertions, translocations) based on distance thresholds and alignment patterns.
+    Detected signals are filtered and grouped based on proximity and event type, and then written to specified output files.
+
+    Parameters
+    ----------
+    chr : str
+        The chromosome identifier for which structural variant signals are detected.
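+    MIN_DISTANCE : int or float
+        Lower bound of the expected template-length range, used to flag abnormal paired-end distances (e.g. the value returned by stats_distances).
+    MAX_DISTANCE : int or float
+        Upper bound of the expected template-length range, used to flag abnormal paired-end distances (e.g. the value returned by stats_distances).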
+    depth_data : numpy.ndarray
+        An array of depth values for the chromosome.
+    output_file_pairs : str
+        The path to the output file for storing paired-end read signals.
+    output_file_splits : str
+        The path to the output file for storing split-read signals.
+    bamfile_handle : pysam.AlignmentFile
+        A handle to an open BAM file for reading.
+    seq_length : int
+        The length of the sequence being processed.
+
+    Returns
+    -------
+    None
+        The function writes paired-end and split-read structural variant signals to the specified output files.
+    """
    logging.info(f"Entering find_paired_split for {chr}")
    start_time = time.time()
    
@@ -1748,9 +1820,43 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
    logging.info(f"Leaving find_paired_split for {chr} (Time taken: {elapsed_time:.4f} seconds)")

 def are_coordinates_close(coord1, coord2, tolerance):
+    """
+    Check if two genomic coordinates are within a specified tolerance.
+
+    Parameters
+    ----------
+    coord1 : int
+        The first genomic coordinate.
+    coord2 : int
+        The second genomic coordinate.
+    tolerance : int
+        The maximum distance within which the coordinates are considered close.
+
+    Returns
+    -------
+    bool
+        True if the absolute difference between the coordinates is less than or equal to the tolerance, False otherwise.
+    """
    return abs(coord1 - coord2) <= tolerance

 def are_secondary_alignments_same(read1, read2):
+    """
+    Check if two reads carry identical SA (supplementary alignment) tags.
+
+    This function compares the raw SA tag values of two reads; the SA tag lists the other alignments of a chimeric (split) read. Both reads must carry an SA tag, since pysam's get_tag raises KeyError when the tag is absent.
+
+    Parameters
+    ----------
+    read1 : pysam.AlignedSegment
+        The first read to compare.
+    read2 : pysam.AlignedSegment
+        The second read to compare.
+
+    Returns
+    -------
+    bool
+        True if the SA tag values of the two reads are identical, False otherwise.
+    """
    return read1.get_tag("SA") == read2.get_tag("SA")
             
 def main_calcul(
@@ -1768,37 +1874,55 @@ def main_calcul(
    sample,
 ):
    """
-    Perform structural variant detection and VCF file generation.
+    Perform copy number variation (CNV) and structural variant (SV) detection with GPU-accelerated computations.

-    This function orchestrates a series of computations and data manipulations,
-    leveraging GPU acceleration for performance improvements in genomic data analysis.
+    This function orchestrates a complex series of computations for genomic data analysis, including CNV and SV detection,
+    and uses GPU acceleration to enhance performance. The process includes mappability, GC content, and depth calculations,
+    as well as distance metrics and normalization. Results are saved in various output files, including a VCF file for 
+    structural variants.

    Parameters
    ----------
-        bamfile_path : str
-            Path to the BAM file containing aligned reads.
-        chr : str
-            Chromosome identifier for which analysis is performed.
-        seq_length : int
-            Length of the chromosome sequence.
-        window_size : int
-            Size of the sliding window used for analysis.
-        step_size : int
-            Size of the step when moving the window along the chromosome.
-        zscore_threshold : float
-            Threshold value for detecting significant events based on Z-scores.
-        lengthFilter : int
-            Minimum length threshold for including variants in the VCF file.
-        output_file : str
-            Path to the output VCF file.
-        sample : str
-            Name of the sample being analyzed.
+    bamfile_path : str
+        Path to the BAM file containing aligned reads.
+    bamfile_handle : pysam.AlignmentFile
+        An open handle to the BAM file for read access.
+    chr : str
+        Chromosome identifier for which analysis is performed.
+    seq_length : int
+        Length of the chromosome sequence.
+    window_size : int
+        Size of the sliding window used for analysis.
+    step_size : int
+        Size of the step when moving the window along the chromosome.
+    zscore_threshold : float
+        Threshold for detecting significant events based on Z-scores.
+    lengthFilter : int
+        Minimum length threshold for including variants in the VCF file.
+    output_file : str
+        Path to the main output VCF file containing variant calls.
+    output_file_pairs : str
+        Path to the output file for paired-read SV events.
+    output_file_splits : str
+        Path to the output file for split-read SV events.
+    sample : str
+        Sample name used in the VCF header.

    Returns
    -------
-        None
+    None
+        The function performs CNV and SV detection, along with associated calculations, and writes the results to 
+        specified output files.
+    
+    Notes
+    -----
+    - Uses CUDA-enabled GPU processing to perform mappability, GC content, depth, and normalization calculations 
+      efficiently over large genomic datasets.
+    - Detects CNVs by analyzing depth and copy number variations across windows and applies Z-score analysis to 
+      identify statistically significant changes.
+    - Identifies SVs (such as inversions, deletions, and translocations) based on read pairing and alignment patterns.
+    - Writes detailed output for each stage, including paired-read and split-read SVs, in separate files.
    """
-
    sys.stderr.write("\t entering main_calcul\n")
    global seq
    events = {}