Commit cba4fa90 authored by Theo Serralta's avatar Theo Serralta

Add docstrings

parent c531573b
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
## find_SV_gpu.py ## find_SV_gpu.py
## Version : 2.10.0 ## Version : 2.10.0
## Description : This script find CNV and SV events ## Description : This script find CNV and SV events
## Usage : python test_gpu_time_SV.py -b <input/bamfile> -c <num_chr (int) | "All"> -w 100 -s 10 -z 1.5 -l 200 -p <output/tsv/pairs> -m <output/tsv/splits> -e <log/file> ## Usage : python3 cnv_sv_caller_gpu.py -b <input/bamfile> -c <num_chr (int) | "All"> -w 100 -s 10 -z 1.5 -l 200 -o <output/vcf/cnv> -p <output/tsv/pairs> -m <output/tsv/splits> -e <log/file>
## Output : An events file for CNV, paired-reads and split-reads ## Output : An events file for CNV, paired-reads and split-reads
## Requirements : python 3.0+ ## Requirements : python 3.0+
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
## last revision date : 20240719 ## last revision date : 20240719
## Known bugs : Don't find correct events ## Known bugs : Don't find correct events
import pysam import pysam
import sys import sys
import getopt import getopt
...@@ -39,7 +38,8 @@ def parse_arguments(): ...@@ -39,7 +38,8 @@ def parse_arguments():
""" """
Parse command-line arguments. Parse command-line arguments.
This function parses the command-line arguments provided to the script and extracts the values for various parameters. This function parses the command-line arguments provided to the script and extracts values for various parameters
related to CNV and SV detection in genomic data.
Parameters Parameters
---------- ----------
...@@ -49,20 +49,26 @@ def parse_arguments(): ...@@ -49,20 +49,26 @@ def parse_arguments():
------- -------
tuple tuple
A tuple containing the following elements: A tuple containing the following elements:
bamfile_path : str or None bamfile_path : Optional[str]
The path to the BAM file. The path to the BAM file containing genomic data.
window_size : int or None num_chr : Optional[str]
The size of the window. The specific chromosome number (if any) to be processed.
step_size : int or None window_size : Optional[int]
The step size for the analysis. The size of the window used for analysis.
zscore_threshold : float or None step_size : Optional[int]
The threshold for the Z-score. The step size between windows for scanning the genome.
lengthFilter : int or None zscore_threshold : Optional[float]
The filter for length. The Z-score threshold to identify significant variants.
output_file : str or None lengthFilter : Optional[int]
The path to the output file. The minimum length filter for variants.
logfile : str or None output_file : Optional[str]
The path to the log file. The path where the main output file will be saved.
output_file_pairs : Optional[str]
The path for the output file that stores paired reads data.
output_file_splits : Optional[str]
The path for the output file that stores split reads data.
logfile : Optional[str]
The path where the log file will be stored.
Each element can be None if the corresponding argument was not provided. Each element can be None if the corresponding argument was not provided.
""" """
...@@ -143,7 +149,7 @@ if __name__ == "__main__": ...@@ -143,7 +149,7 @@ if __name__ == "__main__":
# Code CUDA # Code CUDA
mod = SourceModule( mod = SourceModule(
""" """
//Kernel pour calculer la profondeur moyenne brute // Kernel to compute the raw average depth
__global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_size, int step_size, float *depth_results) { __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_size, int step_size, float *depth_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -153,7 +159,6 @@ __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_ ...@@ -153,7 +159,6 @@ __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_
int pos_end = pos_start + window_size; int pos_end = pos_start + window_size;
int count_reads = 0; int count_reads = 0;
for (int i = pos_start; i < pos_end; i++) { for (int i = pos_start; i < pos_end; i++) {
count_reads += depth_data[i]; count_reads += depth_data[i];
} }
...@@ -163,7 +168,7 @@ __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_ ...@@ -163,7 +168,7 @@ __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_
} }
} }
//Kernel pour calculer le GC content // Kernel to compute GC content
__global__ void calcul_gc_kernel(char *gc_data, int seq_length, int window_size, int step_size, float *gc_results) { __global__ void calcul_gc_kernel(char *gc_data, int seq_length, int window_size, int step_size, float *gc_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -172,24 +177,19 @@ __global__ void calcul_gc_kernel(char *gc_data, int seq_length, int window_size, ...@@ -172,24 +177,19 @@ __global__ void calcul_gc_kernel(char *gc_data, int seq_length, int window_size,
int pos_start = (idx * step_size) + 1; int pos_start = (idx * step_size) + 1;
int pos_end = pos_start + window_size; int pos_end = pos_start + window_size;
int gc_count = 0; int gc_count = 0;
//printf(gc_data);
//printf(" ");
for (int i = pos_start; i <= pos_end; ++i) { for (int i = pos_start; i <= pos_end; ++i) {
if ((gc_data[i] == 'G') or (gc_data[i] == 'C') or (gc_data[i] == 'g') or (gc_data[i] == 'c')) { if ((gc_data[i] == 'G') || (gc_data[i] == 'C') || (gc_data[i] == 'g') || (gc_data[i] == 'c')) {
//printf(&gc_data[i]);
//printf(" ");
gc_count++; gc_count++;
} }
} }
float avg_gc = ((float)gc_count / window_size) * 100; float avg_gc = ((float)gc_count / window_size) * 100;
gc_results[idx] = avg_gc; gc_results[idx] = avg_gc;
} }
} }
// Kernel pour calculer la mappabilite moyenne ponderee // Kernel to compute weighted average mappability
__global__ void calcul_map_kernel(float *map_data, int seq_length, int window_size, int step_size, float *map_results) { __global__ void calcul_map_kernel(float *map_data, int seq_length, int window_size, int step_size, float *map_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -211,7 +211,7 @@ __global__ void calcul_map_kernel(float *map_data, int seq_length, int window_si ...@@ -211,7 +211,7 @@ __global__ void calcul_map_kernel(float *map_data, int seq_length, int window_si
} }
} }
// Kernel pour filtrer selon le gc // Kernel to filter based on GC content
__global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, int n, float *gc_35, float *gc_40, float *gc_45, float *gc_50, float *gc_55) { __global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, int n, float *gc_35, float *gc_40, float *gc_45, float *gc_50, float *gc_55) {
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx = blockIdx.x * blockDim.x + threadIdx.x;
...@@ -222,7 +222,6 @@ __global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, i ...@@ -222,7 +222,6 @@ __global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, i
int gc_value_rounded = round(gc_value); int gc_value_rounded = round(gc_value);
if (gc_value_rounded == 35) { if (gc_value_rounded == 35) {
//printf(gc_value_rounded)
int pos = atomicAdd((int*)&gc_35[0], 1); int pos = atomicAdd((int*)&gc_35[0], 1);
gc_35[pos + 1] = depth_value; gc_35[pos + 1] = depth_value;
} else if (gc_value_rounded == 40) { } else if (gc_value_rounded == 40) {
...@@ -242,7 +241,7 @@ __global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, i ...@@ -242,7 +241,7 @@ __global__ void filter_read_gc_kernel(float *gc_results, float *depth_results, i
} }
// Kernel pour calculer la lecture de profondeur corrigee par la mappabilitee // Kernel to compute depth corrected by mappability
__global__ void calcul_depth_correction_kernel(float *depth_results, float *map_results, int seq_length, int window_size, int step_size, float *depth_correction_results) { __global__ void calcul_depth_correction_kernel(float *depth_results, float *map_results, int seq_length, int window_size, int step_size, float *depth_correction_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -251,14 +250,14 @@ __global__ void calcul_depth_correction_kernel(float *depth_results, float *map_ ...@@ -251,14 +250,14 @@ __global__ void calcul_depth_correction_kernel(float *depth_results, float *map_
float avg_depth = depth_results[idx]; float avg_depth = depth_results[idx];
float avg_map = map_results[idx]; float avg_map = map_results[idx];
// Verification si avg_map est egal a 0 pour eviter la division par 0 // Check if avg_map is equal to 0 to avoid division by 0
float depth_correction = (avg_map != 0.0f) ? (avg_depth / avg_map) : 0.0f; float depth_correction = (avg_map != 0.0f) ? (avg_depth / avg_map) : 0.0f;
depth_correction_results[idx] = depth_correction; depth_correction_results[idx] = depth_correction;
} }
} }
// Kernel pour normaliser la profondeur corrigee // Kernel to normalize corrected depth
__global__ void normalize_depth_kernel(float *depth_correction, float *gc_results, float m, float *gc_to_median, int seq_length, int window_size, int step_size, float *depth_normalize) { __global__ void normalize_depth_kernel(float *depth_correction, float *gc_results, float m, float *gc_to_median, int seq_length, int window_size, int step_size, float *depth_normalize) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -266,13 +265,13 @@ __global__ void normalize_depth_kernel(float *depth_correction, float *gc_result ...@@ -266,13 +265,13 @@ __global__ void normalize_depth_kernel(float *depth_correction, float *gc_result
if (idx < seq_length - window_size + 1) { if (idx < seq_length - window_size + 1) {
float mGC = gc_to_median[(int)gc_results[idx]]; float mGC = gc_to_median[(int)gc_results[idx]];
// Verification si mGC est egal a 0 pour eviter la division par 0 // Check if mGC is equal to 0 to avoid division by 0
float depth_normalize_val = (mGC != 0.0f) ? (depth_correction[idx] * m / mGC) : 0.0f; float depth_normalize_val = (mGC != 0.0f) ? (depth_correction[idx] * m / mGC) : 0.0f;
depth_normalize[idx] = depth_normalize_val; depth_normalize[idx] = depth_normalize_val;
} }
} }
// Kernel pour calculer le ratio par window // Kernel to compute ratio per window
__global__ void ratio_par_window_kernel(float *depth_results, float mean_chr, int seq_length, int window_size, int step_size, float *ratio_par_window_results) { __global__ void ratio_par_window_kernel(float *depth_results, float mean_chr, int seq_length, int window_size, int step_size, float *ratio_par_window_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -283,7 +282,7 @@ __global__ void ratio_par_window_kernel(float *depth_results, float mean_chr, in ...@@ -283,7 +282,7 @@ __global__ void ratio_par_window_kernel(float *depth_results, float mean_chr, in
} }
} }
// Kernel pour calculer le ratio normalise par window // Kernel to compute normalized ratio per window
__global__ void ratio_par_window_norm_kernel(float *normalize_depth_results, float mean_chr_norm, int seq_length, int window_size, int step_size, float *ratio_par_window_norm_results) { __global__ void ratio_par_window_norm_kernel(float *normalize_depth_results, float mean_chr_norm, int seq_length, int window_size, int step_size, float *ratio_par_window_norm_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -294,7 +293,7 @@ __global__ void ratio_par_window_norm_kernel(float *normalize_depth_results, flo ...@@ -294,7 +293,7 @@ __global__ void ratio_par_window_norm_kernel(float *normalize_depth_results, flo
} }
} }
// Kernel pour calculer le z_score // Kernel to compute Z-score
__global__ void z_score_kernel(float *ratio_norm, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) { __global__ void z_score_kernel(float *ratio_norm, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -302,18 +301,18 @@ __global__ void z_score_kernel(float *ratio_norm, float mean_ratio, float std_ra ...@@ -302,18 +301,18 @@ __global__ void z_score_kernel(float *ratio_norm, float mean_ratio, float std_ra
if (idx < seq_length - window_size + 1) { if (idx < seq_length - window_size + 1) {
float value = ratio_norm[idx]; float value = ratio_norm[idx];
// Verification si la valeur est inf ou NaN, et remplacer par 0 si c est le cas // Check if the value is infinite or NaN, and replace it with 0 if so
if (isinf(value) || isnan(value)) { if (isinf(value) || isnan(value)) {
value = 0.0f; value = 0.0f;
} }
// Calcul du Z-score // Compute the Z-score
float z_score = (value - mean_ratio) / std_ratio; float z_score = (value - mean_ratio) / std_ratio;
z_score_results[idx] = z_score; z_score_results[idx] = z_score;
} }
} }
// Kernel pour calculer le ratio divise par le ratio moyen // Kernel to compute the ratio divided by the mean ratio
__global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) { __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -321,12 +320,12 @@ __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio, ...@@ -321,12 +320,12 @@ __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio,
if (idx < seq_length - window_size + 1) { if (idx < seq_length - window_size + 1) {
float value = ratio_norm[idx]; float value = ratio_norm[idx];
// Verification si la valeur est inf ou NaN et remplacement par 0 si c est le cas // Check if the value is infinite or NaN and replace it with 0 if so
if (isinf(value) || isnan(value)) { if (isinf(value) || isnan(value)) {
value = 0.0f; value = 0.0f;
} }
// Calcul du ratio par rapport au ratio moyen // Compute the ratio relative to the mean ratio
float ratio_mean_ratio = value / mean_ratio; float ratio_mean_ratio = value / mean_ratio;
ratio_par_mean_ratio_results[idx] = ratio_mean_ratio; ratio_par_mean_ratio_results[idx] = ratio_mean_ratio;
} }
...@@ -335,7 +334,7 @@ __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio, ...@@ -335,7 +334,7 @@ __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio,
""" """
) )
# Obtention de la fonction de kernel compilée # Retrieving the compiled kernel function
calcul_depth_kernel_cuda = mod.get_function("calcul_depth_kernel") calcul_depth_kernel_cuda = mod.get_function("calcul_depth_kernel")
calcul_gc_kernel_cuda = mod.get_function("calcul_gc_kernel") calcul_gc_kernel_cuda = mod.get_function("calcul_gc_kernel")
calcul_map_kernel_cuda = mod.get_function("calcul_map_kernel") calcul_map_kernel_cuda = mod.get_function("calcul_map_kernel")
...@@ -586,153 +585,28 @@ def calcul_gc_content(seq_length, chr, seq): ...@@ -586,153 +585,28 @@ def calcul_gc_content(seq_length, chr, seq):
return gc_data return gc_data
def calcul_depth_seq(seq_length, bamfile_path, chr):
"""
Calculate the sequencing depth for a given chromosome.
This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
Parameters
----------
seq_length : int
The length of the sequence.
bamfile_path : pysam.AlignmentFile
The path to the BAM file opened with pysam.AlignmentFile.
chr : str
The chromosome for which the depth is calculated.
Returns
-------
numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length.
"""
logging.info(f"Entering calcul_depth_seq for {chr}")
start_time = time.time()
depth_data = np.zeros(seq_length, dtype=np.int32)
for pileupcolumn in bamfile_path.pileup(reference = chr):
pos = pileupcolumn.reference_pos
if pos >= seq_length:
break
depth_data[pos] = pileupcolumn.nsegments
#depth_data = bamfile_path.pileup().nsegments
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data
def calcul_depth_seq_copy(seq_length, bamfile_path, chr):
"""
Calculate the sequencing depth for a given chromosome.
This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
Parameters
----------
seq_length : int
The length of the sequence.
bamfile_path : pysam.AlignmentFile
The path to the BAM file opened with pysam.AlignmentFile.
chr : str
The chromosome for which the depth is calculated.
Returns
-------
numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length.
"""
logging.info(f"Entering calcul_depth_seq for {chr}")
start_time = time.time()
depth_data = np.zeros(seq_length, dtype=np.int32)
i = 0
it = bamfile_path.pileup(reference = chr)
while i < (seq_length - 1):
t = next(it)
pos = t.reference_pos
depth_data[pos] = t.nsegments
print(i)
i += 1
#for pileupcolumn in bamfile_path.pileup():
# pos = pileupcolumn.reference_pos
# if pos >= seq_length:
# break
#depth_data[pos] = pileupcolumn.nsegments
#print(bamfile_path.pileup()) print("########################")
# depth_data = np.array(bamfile_path.pileup().nsegments)
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data
def calcul_depth_seq_count(seq_length, bamfile_path, chr):
"""
Calculate the sequencing depth for a given chromosome.
This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
Parameters
----------
seq_length : int
The length of the sequence.
bamfile_path : pysam.AlignmentFile
The path to the BAM file opened with pysam.AlignmentFile.
chr : str
The chromosome for which the depth is calculated.
Returns
-------
numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length.
"""
logging.info(f"Entering calcul_depth_seq_count for {chr}")
start_time = time.time()
# Use count_coverage to get coverage depth
depth_data_count = bamfile_path.count_coverage(contig = chr)
type(depth_data_count)
#print(depth_data_count[:10056])
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data_count
def calcul_depth_seq_samtools(seq_length, bamfile_path, chr, num_threads=12): def calcul_depth_seq_samtools(seq_length, bamfile_path, chr, num_threads=12):
""" """
Calculate the sequencing depth for a given chromosome using a parallelized bash script. Calculate the sequencing depth for a given chromosome using a parallelized bash script.
This function uses a bash script with multithreading to compute the depth for a specific chromosome
from a BAM file, and returns the depth values as a numpy array.
Parameters Parameters
---------- ----------
seq_length : int seq_length : int
The length of the sequence. The length of the sequence.
bamfile_path : str bamfile_path : str
The path to the BAM file. The path to the BAM file containing sequencing data.
chr : str chr : str
The chromosome for which the depth is calculated. The chromosome for which the depth is calculated.
num_threads : int num_threads : int, optional
The number of threads to use for parallel processing. The number of threads to use for parallel processing (default is 12).
Returns Returns
------- -------
numpy.ndarray numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length. An array of integers representing the sequencing depth for the specified sequence length.
""" """
logging.info(f"Entering calcul_depth_seq for {chr}") logging.info(f"Entering calcul_depth_seq for {chr}")
start_time = time.time() start_time = time.time()
...@@ -771,12 +645,15 @@ def calcul_med_total(depth_correction_results, chr): ...@@ -771,12 +645,15 @@ def calcul_med_total(depth_correction_results, chr):
""" """
Calculate the median of non-zero depth correction results. Calculate the median of non-zero depth correction results.
This function filters out zero values from the depth correction results and computes the median of the remaining values. This function filters out zero values from the depth correction results and computes the median
of the remaining non-zero values.
Parameters Parameters
---------- ----------
depth_correction_results : list or numpy.ndarray depth_correction_results : list or numpy.ndarray
A list or array of depth correction values. A list or array of depth correction values.
chr : str
The chromosome identifier for which the median is calculated.
Returns Returns
------- -------
...@@ -805,7 +682,8 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr): ...@@ -805,7 +682,8 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr):
""" """
Calculate the median depth correction for each unique GC content value. Calculate the median depth correction for each unique GC content value.
This function computes the median depth correction values for each unique GC content value, filtering out zero values. This function computes the median depth correction values for each unique GC content value in `gc_results`,
filtering out zero values from `depth_correction_results`.
Parameters Parameters
---------- ----------
...@@ -813,6 +691,8 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr): ...@@ -813,6 +691,8 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr):
A list or array of GC content values. A list or array of GC content values.
depth_correction_results : list or numpy.ndarray depth_correction_results : list or numpy.ndarray
A list or array of depth correction values. A list or array of depth correction values.
chr : str
The chromosome identifier for which the medians are calculated.
Returns Returns
------- -------
...@@ -857,12 +737,15 @@ def calcul_moy_totale(normalize_depth_results, chr): ...@@ -857,12 +737,15 @@ def calcul_moy_totale(normalize_depth_results, chr):
""" """
Calculate the mean of non-zero normalized depth results. Calculate the mean of non-zero normalized depth results.
This function filters out zero values from the normalized depth results and computes the mean of the remaining values. This function filters out zero values from the normalized depth results and computes the mean
of the remaining non-zero values.
Parameters Parameters
---------- ----------
normalize_depth_results : list or numpy.ndarray normalize_depth_results : list or numpy.ndarray
A list or array of normalized depth values. A list or array of normalized depth values.
chr : str
The chromosome identifier for which the mean is calculated.
Returns Returns
------- -------
...@@ -886,18 +769,19 @@ def calcul_moy_totale(normalize_depth_results, chr): ...@@ -886,18 +769,19 @@ def calcul_moy_totale(normalize_depth_results, chr):
return mean_chr return mean_chr
def calcul_std(normalize_depth_results, chr): def calcul_std(normalize_depth_results, chr):
""" """
Calculate the standard deviation of non-zero normalized depth results. Calculate the standard deviation of non-zero normalized depth results.
This function filters out zero values from the normalized depth results and computes the standard deviation of the remaining values. This function filters out zero values from the normalized depth results and computes the standard deviation
of the remaining non-zero values.
Parameters Parameters
---------- ----------
normalize_depth_results : list or numpy.ndarray normalize_depth_results : list or numpy.ndarray
A list or array of normalized depth values. A list or array of normalized depth values.
chr : str
The chromosome identifier for which the standard deviation is calculated.
Returns Returns
------- -------
...@@ -927,17 +811,21 @@ def compute_mean_std_med(ratio_par_window_results, chr): ...@@ -927,17 +811,21 @@ def compute_mean_std_med(ratio_par_window_results, chr):
""" """
Compute the mean, standard deviation, and median of non-zero ratio results per window. Compute the mean, standard deviation, and median of non-zero ratio results per window.
This function filters out zero and -1 values from the ratio results per window and computes the mean, standard deviation, and median of the remaining values. Values greater than or equal to 5 are capped at 5. This function filters out zero and -1 values from the ratio results per window and computes the mean,
standard deviation, and median of the remaining values. Values greater than or equal to 5 are capped at 5.
Parameters Parameters
---------- ----------
ratio_par_window_results : list or numpy.ndarray ratio_par_window_results : list or numpy.ndarray
A list or array of ratio values per window. A list or array of ratio values per window.
chr : str
The chromosome identifier for which the statistics are calculated.
Returns Returns
------- -------
tuple tuple
A tuple containing the mean, standard deviation, and median of the filtered ratio values. A tuple containing the mean, standard deviation, and median of the filtered ratio values:
(mean_ratio, std_ratio, med_ratio).
""" """
logging.info(f"Entering compute_mean_std_med for {chr}") logging.info(f"Entering compute_mean_std_med for {chr}")
start_time = time.time() start_time = time.time()
...@@ -980,6 +868,8 @@ def cn_level(x, chr): ...@@ -980,6 +868,8 @@ def cn_level(x, chr):
---------- ----------
x : float x : float
The input value used to determine the copy number level. The input value used to determine the copy number level.
chr : str
The chromosome identifier for which the copy number level is calculated.
Returns Returns
------- -------
...@@ -1017,7 +907,6 @@ def get_sample_name(bamfile_path): ...@@ -1017,7 +907,6 @@ def get_sample_name(bamfile_path):
str str
The sample name extracted from the BAM file. If no sample name is found, returns "UnknownSample". The sample name extracted from the BAM file. If no sample name is found, returns "UnknownSample".
""" """
logging.info(f"Entering get_sample_name") logging.info(f"Entering get_sample_name")
start_time = time.time() start_time = time.time()
...@@ -1029,9 +918,31 @@ def get_sample_name(bamfile_path): ...@@ -1029,9 +918,31 @@ def get_sample_name(bamfile_path):
end_time = time.time() end_time = time.time()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
logging.info(f"Leaving get_sample_name (Time taken: {elapsed_time:.4f} seconds)") logging.info(f"Leaving get_sample_name (Time taken: {elapsed_time:.4f} seconds)")
return "UnknownSample" return "UnknownSample"
def filter_read_gc(gc_results, depth_results, chr): def filter_read_gc(gc_results, depth_results, chr):
"""
Filter depth results by GC content categories.
This function groups depth values based on rounded GC content categories (35, 40, 45, 50, 55) and returns
the depth values for each category as separate arrays.
Parameters
----------
gc_results : list or numpy.ndarray
A list or array of GC content values.
depth_results : list or numpy.ndarray
A list or array of depth values corresponding to the GC content values.
chr : str
The chromosome identifier for which the filtering is performed.
Returns
-------
tuple
A tuple containing five numpy arrays, each corresponding to depth values for rounded GC content categories:
(gc_35, gc_40, gc_45, gc_50, gc_55).
"""
logging.info(f"Entering filter_read_gc for {chr}") logging.info(f"Entering filter_read_gc for {chr}")
start_time = time.time() start_time = time.time()
...@@ -1066,6 +977,34 @@ def filter_read_gc(gc_results, depth_results, chr): ...@@ -1066,6 +977,34 @@ def filter_read_gc(gc_results, depth_results, chr):
return gc_35, gc_40, gc_45, gc_50, gc_55 return gc_35, gc_40, gc_45, gc_50, gc_55
def filter_read_gc_gpu(gc_results, d_gc_results, d_depth_results, grid_size, block_size, chr): def filter_read_gc_gpu(gc_results, d_gc_results, d_depth_results, grid_size, block_size, chr):
"""
Filter depth results by GC content categories using GPU computation.
This function utilizes a GPU kernel to group depth values based on rounded GC content categories (35, 40, 45, 50, 55)
and returns the depth values for each category as separate arrays. The function allocates memory on the GPU,
copies the data, executes the kernel, and retrieves the results.
Parameters
----------
gc_results : list or numpy.ndarray
A list or array of GC content values.
d_gc_results : pycuda.gpuarray.GPUArray
A GPU array containing GC content values.
d_depth_results : pycuda.gpuarray.GPUArray
A GPU array containing depth values.
grid_size : int
The grid size for the GPU kernel execution.
block_size : int
The block size for the GPU kernel execution.
chr : str
The chromosome identifier for which the filtering is performed.
Returns
-------
tuple
A tuple containing five numpy arrays, each corresponding to depth values for rounded GC content categories:
(gc_35, gc_40, gc_45, gc_50, gc_55).
"""
logging.info(f"Entering filter_read_gc_gpu for {chr}") logging.info(f"Entering filter_read_gc_gpu for {chr}")
start_time = time.time() start_time = time.time()
...@@ -1126,6 +1065,33 @@ def filter_read_gc_gpu(gc_results, d_gc_results, d_depth_results, grid_size, blo ...@@ -1126,6 +1065,33 @@ def filter_read_gc_gpu(gc_results, d_gc_results, d_depth_results, grid_size, blo
return gc_35, gc_40, gc_45, gc_50, gc_55 return gc_35, gc_40, gc_45, gc_50, gc_55
def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr): def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr):
"""
Calculate polynomial fits for depth arrays grouped by GC content.
This function computes a 3rd-degree polynomial fit for each depth array (gc_35, gc_40, gc_45, gc_50, gc_55).
Each fit is represented as a polynomial object. If a depth array is empty, `None` is returned for that GC category.
Parameters
----------
gc_35 : numpy.ndarray
Array of depth values for GC content category 35.
gc_40 : numpy.ndarray
Array of depth values for GC content category 40.
gc_45 : numpy.ndarray
Array of depth values for GC content category 45.
gc_50 : numpy.ndarray
Array of depth values for GC content category 50.
gc_55 : numpy.ndarray
Array of depth values for GC content category 55.
chr : str
The chromosome identifier for which the polynomial fits are calculated.
Returns
-------
list
A list of polynomial objects (numpy.poly1d) for each GC content category, where `None` is returned
for categories with empty depth arrays.
"""
logging.info(f"Entering calc_polynom for {chr}") logging.info(f"Entering calc_polynom for {chr}")
start_time = time.time() start_time = time.time()
...@@ -1147,6 +1113,24 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr): ...@@ -1147,6 +1113,24 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr):
return polynomials return polynomials
def calc_mean_chr(depth_results, chr): def calc_mean_chr(depth_results, chr):
"""
Calculate the mean of non-zero depth results for a chromosome.
This function filters out zero values from the depth results and computes the mean of the remaining
non-zero values for the specified chromosome.
Parameters
----------
depth_results : list or numpy.ndarray
A list or array of depth values.
chr : str
The chromosome identifier for which the mean depth is calculated.
Returns
-------
float
The mean of the non-zero depth values, or 0 if no non-zero values are present.
"""
logging.info(f"Entering calc_mean_chr for {chr}") logging.info(f"Entering calc_mean_chr for {chr}")
start_time = time.time() start_time = time.time()
...@@ -1165,6 +1149,31 @@ def calc_mean_chr(depth_results, chr): ...@@ -1165,6 +1149,31 @@ def calc_mean_chr(depth_results, chr):
return mean_chr return mean_chr
def find_polynom(ratio_par_window_results, chr, polynomials, depth_results): def find_polynom(ratio_par_window_results, chr, polynomials, depth_results):
"""
Find the best polynomial fit for copy number ratio results.
This function identifies the polynomial from a list of polynomials that best fits the copy number (CN)
values calculated from the ratio per window results. The best fit is determined by minimizing the
difference between normalized polynomial values and unique depth values.
Parameters
----------
ratio_par_window_results : list or numpy.ndarray
A list or array of ratio values per window.
chr : str
The chromosome identifier for which the best polynomial is determined.
polynomials : list of numpy.poly1d or None
A list of polynomial objects, where each polynomial corresponds to a GC content category.
Polynomials may be `None` if no fit is available for that category.
depth_results : list or numpy.ndarray
A list or array of depth values used for comparison with polynomial predictions.
Returns
-------
numpy.poly1d or None
The polynomial that best fits the data based on the minimum difference in normalized values,
or `None` if no suitable polynomial is found.
"""
logging.info(f"Entering find_polynom for {chr}") logging.info(f"Entering find_polynom for {chr}")
start_time = time.time() start_time = time.time()
...@@ -1207,6 +1216,28 @@ def find_polynom(ratio_par_window_results, chr, polynomials, depth_results): ...@@ -1207,6 +1216,28 @@ def find_polynom(ratio_par_window_results, chr, polynomials, depth_results):
return best_polynom return best_polynom
def normalize(depth_results, best_polynom, map_results): def normalize(depth_results, best_polynom, map_results):
"""
Normalize depth results using a polynomial model and mappability results.
This function normalizes depth values by dividing each non-zero depth value by the product of the
corresponding polynomial value and mappability score. If no polynomial is provided, the depth value is
used as-is. Zero values in either depth or mappability results are set to zero in the output.
Parameters
----------
depth_results : list or numpy.ndarray
A list or array of depth values to be normalized.
best_polynom : numpy.poly1d or None
A polynomial model for normalization. If `None`, no polynomial normalization is applied.
map_results : list or numpy.ndarray
A list or array of mappability values corresponding to the depth results.
Returns
-------
numpy.ndarray
An array of normalized depth values, where zero values are retained where either depth or mappability
was zero.
"""
logging.info("Entering normalization") logging.info("Entering normalization")
start_time = time.time() start_time = time.time()
...@@ -1222,7 +1253,6 @@ def normalize(depth_results, best_polynom, map_results): ...@@ -1222,7 +1253,6 @@ def normalize(depth_results, best_polynom, map_results):
#normalize_depth_results_unique = np.unique(normalize_depth_results) #normalize_depth_results_unique = np.unique(normalize_depth_results)
#logging.info(f"normalize_depth_results_unique = {normalize_depth_results_unique}") #logging.info(f"normalize_depth_results_unique = {normalize_depth_results_unique}")
has_nan = np.isnan(normalize_depth_results).any() has_nan = np.isnan(normalize_depth_results).any()
has_inf = np.isinf(normalize_depth_results).any() has_inf = np.isinf(normalize_depth_results).any()
...@@ -1242,11 +1272,12 @@ def create_signal(signal, chr, z_score_results, step_size): ...@@ -1242,11 +1272,12 @@ def create_signal(signal, chr, z_score_results, step_size):
Create a signal dictionary for a specific chromosome based on z-score results. Create a signal dictionary for a specific chromosome based on z-score results.
This function populates a signal dictionary with positions and corresponding z-score results for a given chromosome. This function populates a signal dictionary with positions and corresponding z-score results for a given chromosome.
Each position is calculated based on the step size and is mapped to the corresponding z-score value.
Parameters Parameters
---------- ----------
signal : dict signal : dict
A dictionary to store the signal data. A dictionary to store the signal data, where chromosome names are keys and position-to-z-score mappings are values.
chr : str chr : str
The chromosome for which the signal is created. The chromosome for which the signal is created.
z_score_results : list or numpy.ndarray z_score_results : list or numpy.ndarray
...@@ -1257,7 +1288,7 @@ def create_signal(signal, chr, z_score_results, step_size): ...@@ -1257,7 +1288,7 @@ def create_signal(signal, chr, z_score_results, step_size):
Returns Returns
------- -------
None None
The function modifies the signal dictionary in place. The function modifies the `signal` dictionary in place.
""" """
logging.info(f"Entering create_signal for {chr} (include copy_number_level)") logging.info(f"Entering create_signal for {chr} (include copy_number_level)")
start_time = time.time() start_time = time.time()
...@@ -1334,19 +1365,26 @@ def segmentation(events, segment, chr): ...@@ -1334,19 +1365,26 @@ def segmentation(events, segment, chr):
""" """
Segment the detected events into contiguous regions with the same copy number level. Segment the detected events into contiguous regions with the same copy number level.
This function processes the detected events and groups contiguous regions with the same copy number level into segments. This function processes the detected events and groups contiguous regions with the same copy number level
into segments, modifying the `segment` dictionary in place to store the start, end, and copy number
information for each contiguous region.
Parameters Parameters
---------- ----------
events : dict events : dict
A dictionary of detected events for each chromosome. A dictionary of detected events for each chromosome. The keys are chromosome names, and the values are dictionaries
with positions as keys and copy number levels as values.
segment : dict segment : dict
A dictionary to store the segmented regions. A dictionary to store the segmented regions, with chromosome names as keys. Each chromosome's value is a dictionary
where keys represent segment ranges (e.g., "start-end") and values are dictionaries containing 'start', 'end', and
'cn' (copy number level) for the segment.
chr : str
The chromosome identifier being processed.
Returns Returns
------- -------
None None
The function modifies the segment dictionary in place. The function modifies the `segment` dictionary in place, adding segmented regions.
""" """
logging.info(f"Entering segmentation for {chr}") logging.info(f"Entering segmentation for {chr}")
start_time = time.time() start_time = time.time()
...@@ -1395,7 +1433,10 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr, ...@@ -1395,7 +1433,10 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr,
""" """
Generate a VCF file containing structural variant calls based on segmented regions and signal data. Generate a VCF file containing structural variant calls based on segmented regions and signal data.
This function creates a VCF (Variant Call Format) file containing structural variant calls derived from segmented regions and signal data. The structural variant type (DEL for deletion or DUP for duplication) is determined based on copy number levels and signal values. The resulting VCF file includes information about the chromosome, position, type of structural variant, copy number, and other relevant information. This function creates a VCF (Variant Call Format) file containing structural variant calls derived from
segmented regions and signal data. The structural variant type (DEL for deletion or DUP for duplication)
is determined based on copy number levels and signal values. The resulting VCF file includes information
about the chromosome, position, type of structural variant, copy number, and other relevant information.
Parameters Parameters
---------- ----------
...@@ -1409,6 +1450,10 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr, ...@@ -1409,6 +1450,10 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr,
The minimum length threshold for including variants in the VCF file. The minimum length threshold for including variants in the VCF file.
output_file : str output_file : str
The path to the output VCF file. The path to the output VCF file.
chr : str
The chromosome identifier for which the VCF output is generated.
normalize_depth_results : list or numpy.ndarray
An array or list of normalized depth values used to filter structural variant calls.
Returns Returns
------- -------
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment