Commit be5e1926 authored by Theo Serralta's avatar Theo Serralta

Formatting code and add functions

parent 91be97c9
...@@ -47,19 +47,22 @@ def parse_arguments(): ...@@ -47,19 +47,22 @@ def parse_arguments():
Each element can be None if the corresponding argument was not provided. Each element can be None if the corresponding argument was not provided.
""" """
try: try:
opts, args = getopt.getopt(sys.argv[1:], "b:w:s:z:l:t:o:e:") opts, args = getopt.getopt(sys.argv[1:], "b:c:w:s:z:l:t:o:e:")
( (
bamfile_path, bamfile_path,
num_chr,
window_size, window_size,
step_size, step_size,
zscore_threshold, zscore_threshold,
lengthFilter, lengthFilter,
output_file, output_file,
logfile, logfile,
) = (None, None, None, None, None, None, None) ) = (None, None, None, None, None, None, None, None)
for opt, arg in opts: for opt, arg in opts:
if opt in ("-b"): if opt in ("-b"):
bamfile_path = arg bamfile_path = arg
if opt in ("-c"):
num_chr = arg
if opt in ("-w"): if opt in ("-w"):
window_size = int(arg) window_size = int(arg)
if opt in ("-s"): if opt in ("-s"):
...@@ -74,6 +77,7 @@ def parse_arguments(): ...@@ -74,6 +77,7 @@ def parse_arguments():
logfile = arg logfile = arg
return ( return (
bamfile_path, bamfile_path,
num_chr,
window_size, window_size,
step_size, step_size,
zscore_threshold, zscore_threshold,
...@@ -89,6 +93,7 @@ def parse_arguments(): ...@@ -89,6 +93,7 @@ def parse_arguments():
if __name__ == "__main__": if __name__ == "__main__":
( (
bamfile_path, bamfile_path,
num_chr,
window_size, window_size,
step_size, step_size,
zscore_threshold, zscore_threshold,
...@@ -128,6 +133,24 @@ __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_ ...@@ -128,6 +133,24 @@ __global__ void calcul_depth_kernel(int *depth_data, int seq_length, int window_
} }
} }
__global__ void calcul_depth_count_kernel(int *depth_data_count, int seq_length, int window_size, int step_size, float *depth_results_count) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) {
int pos_start = (idx * step_size) + 1;
int pos_end = pos_start + window_size;
int count_reads = 0;
for (int i = pos_start; i < pos_end; i++) {
count_reads += depth_data_count[i];
}
float avg_depth_count = (float)count_reads / window_size;
depth_results_count[idx] = avg_depth_count;
}
}
//Kernel pour calculer le GC content //Kernel pour calculer le GC content
__global__ void calcul_gc_kernel(char *gc_data, int seq_length, int window_size, int step_size, float *gc_results) { __global__ void calcul_gc_kernel(char *gc_data, int seq_length, int window_size, int step_size, float *gc_results) {
...@@ -245,6 +268,7 @@ __global__ void ratio_par_mean_ratio_kernel(float *ratio, float mean_ratio, int ...@@ -245,6 +268,7 @@ __global__ void ratio_par_mean_ratio_kernel(float *ratio, float mean_ratio, int
# Obtention de la fonction de kernel compilée # Obtention de la fonction de kernel compilée
calcul_depth_kernel_cuda = mod.get_function("calcul_depth_kernel") calcul_depth_kernel_cuda = mod.get_function("calcul_depth_kernel")
calcul_depth_count_kernel_cuda = mod.get_function("calcul_depth_count_kernel")
calcul_gc_kernel_cuda = mod.get_function("calcul_gc_kernel") calcul_gc_kernel_cuda = mod.get_function("calcul_gc_kernel")
calcul_map_kernel_cuda = mod.get_function("calcul_map_kernel") calcul_map_kernel_cuda = mod.get_function("calcul_map_kernel")
calcul_depth_correction_kernel_cuda = mod.get_function("calcul_depth_correction_kernel") calcul_depth_correction_kernel_cuda = mod.get_function("calcul_depth_correction_kernel")
...@@ -280,6 +304,7 @@ def merge_intervals(intervals): ...@@ -280,6 +304,7 @@ def merge_intervals(intervals):
merged.append((start, end, score)) merged.append((start, end, score))
start, end, score = interval start, end, score = interval
merged.append((start, end, score)) merged.append((start, end, score))
return merged return merged
...@@ -301,9 +326,12 @@ def dico_mappabilite(mappability_file): ...@@ -301,9 +326,12 @@ def dico_mappabilite(mappability_file):
A dictionary where keys are chromosome names and values are another dictionary with start positions as keys A dictionary where keys are chromosome names and values are another dictionary with start positions as keys
and mappability scores as values. and mappability scores as values.
""" """
sys.stderr.write("\t Entering dico_mappabilite\n") logging.info("Entering dico_mappability")
mappability_dico = {} mappability_dico = {}
logging.info(" In dico_mappability : Open file and create the dico")
with open(mappability_file, "r") as f: with open(mappability_file, "r") as f:
for line in f: for line in f:
fields = line.strip().split("\t") fields = line.strip().split("\t")
...@@ -320,19 +348,29 @@ def dico_mappabilite(mappability_file): ...@@ -320,19 +348,29 @@ def dico_mappabilite(mappability_file):
mappability_dico[chromosome].append((start_pos, end_pos, score)) mappability_dico[chromosome].append((start_pos, end_pos, score))
logging.info(f"In dico_mappability : Leaving file and create the dico")
# Add position 0 for each chromosome # Add position 0 for each chromosome
logging.info(" In dico_mappability : Add position 0 for each chromosome")
for chromosome, intervals in mappability_dico.items(): for chromosome, intervals in mappability_dico.items():
if intervals[0][0] != 0: if intervals[0][0] != 0:
mappability_dico[chromosome].insert(0, (0, intervals[0][0], 0)) mappability_dico[chromosome].insert(0, (0, intervals[0][0], 0))
logging.info(f"In dico_mappability : Ending add position 0 for each chromosome")
# Merge intervals with the same score # Merge intervals with the same score
logging.info(" In dico_mappability : Merge intervals with the same score")
for chromosome, intervals in mappability_dico.items(): for chromosome, intervals in mappability_dico.items():
merged_intervals = merge_intervals(intervals) merged_intervals = merge_intervals(intervals)
mappability_dico[chromosome] = { mappability_dico[chromosome] = {
start: score for start, _, score in merged_intervals start: score for start, _, score in merged_intervals
} }
logging.info(f"In dico_mappability : Ending merge intervals with the same score")
logging.info(f"Leaving dico_mappability")
sys.stderr.write("\t Leaving dico_mappabilite\n")
return mappability_dico return mappability_dico
...@@ -357,24 +395,33 @@ def calcul_mappability(seq_length, mappability, chr): ...@@ -357,24 +395,33 @@ def calcul_mappability(seq_length, mappability, chr):
numpy.ndarray numpy.ndarray
An array of mappability scores for the given sequence length. An array of mappability scores for the given sequence length.
""" """
sys.stderr.write("\t Entering calcul_mappability \n") logging.info(f"Entering calcul_mappability for {chr}")
map_data = np.zeros(seq_length, dtype=np.float32) map_data = np.zeros(seq_length, dtype=np.float32)
sorted_keys = sorted(mappability[chr].keys()) sorted_keys = sorted(mappability[chr].keys())
prev_bound = 0 prev_bound = 0
prev_mappability = 0 prev_mappability = 0
logging.info(f"In calcul_mappability : Entering for bound in sorted_keys for {chr}")
for bound in sorted_keys: for bound in sorted_keys:
for i in range(prev_bound, min(seq_length, bound)): for i in range(prev_bound, min(seq_length, bound)):
map_data[i] = prev_mappability map_data[i] = prev_mappability
prev_bound = bound prev_bound = bound
prev_mappability = mappability[chr][bound] prev_mappability = mappability[chr][bound]
logging.info(f"In calcul_mappability : Leaving for bound in sorted_keys for {chr}")
# Fill remaining positions if sequence length exceeds last bound # Fill remaining positions if sequence length exceeds last bound
logging.info(f"In calcul_mappability : Entering for i in range(prev_bound, seq_length) for {chr}")
for i in range(prev_bound, seq_length): for i in range(prev_bound, seq_length):
map_data[i] = prev_mappability map_data[i] = prev_mappability
sys.stderr.write("\t Leaving calcul_mappability \n") logging.info(f"In calcul_mappability : Leaving for i in range(prev_bound, seq_length) for {chr}")
logging.info(f"Leaving dico_mappability for {chr}")
return map_data return map_data
...@@ -394,7 +441,8 @@ def parse_fasta(gc_file): ...@@ -394,7 +441,8 @@ def parse_fasta(gc_file):
dict dict
A dictionary where keys are sequence headers and values are the corresponding sequences. A dictionary where keys are sequence headers and values are the corresponding sequences.
""" """
sys.stderr.write("\t Entering parse_fasta\n") logging.info("Entering parse_fasta")
sequences = {} sequences = {}
with open(gc_file, "r") as f: with open(gc_file, "r") as f:
data = f.read().split(">") data = f.read().split(">")
...@@ -404,7 +452,8 @@ def parse_fasta(gc_file): ...@@ -404,7 +452,8 @@ def parse_fasta(gc_file):
sequence = "".join(lines[1:]) sequence = "".join(lines[1:])
sequences[header] = sequence sequences[header] = sequence
sys.stderr.write("\t Leaving parse_fasta\n") logging.info(f"Leaving parse_fasta")
return sequences return sequences
...@@ -428,12 +477,14 @@ def calcul_gc_content(seq_length, chr, seq): ...@@ -428,12 +477,14 @@ def calcul_gc_content(seq_length, chr, seq):
numpy.ndarray numpy.ndarray
An array of bytes ('S' dtype) representing the GC content for the given sequence length. An array of bytes ('S' dtype) representing the GC content for the given sequence length.
""" """
sys.stderr.write("\t Entering calcul_gc_content\n") logging.info(f"Entering calcul_gc_content for {chr}")
gc_data = np.zeros(seq_length, dtype="S") gc_data = np.zeros(seq_length, dtype="S")
for i in range(len(seq[chr])): for i in range(len(seq[chr])):
gc_data[i] = seq[chr][i] gc_data[i] = seq[chr][i]
sys.stderr.write("\t Leaving calcul_gc_content\n") logging.info(f"Leaving calcul_gc_content for {chr}")
return gc_data return gc_data
...@@ -457,18 +508,103 @@ def calcul_depth_seq(seq_length, bamfile_path, chr): ...@@ -457,18 +508,103 @@ def calcul_depth_seq(seq_length, bamfile_path, chr):
numpy.ndarray numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length. An array of integers representing the sequencing depth for the given sequence length.
""" """
sys.stderr.write("\t Entering calcul_depth_seq\n") logging.info(f"Entering calcul_depth_seq for {chr}")
depth_data = np.zeros(seq_length, dtype=np.int32) depth_data = np.zeros(seq_length, dtype=np.int32)
for pileupcolumn in bamfile_path.pileup(): for pileupcolumn in bamfile_path.pileup(reference = chr):
if pileupcolumn.reference_pos >= seq_length: pos = pileupcolumn.reference_pos
if pos >= seq_length:
break break
depth_data[pileupcolumn.reference_pos] = pileupcolumn.nsegments depth_data[pos] = pileupcolumn.nsegments
#depth_data = bamfile_path.pileup().nsegments
logging.info(f"Leaving calcul_depth_seq for {chr}")
return depth_data
def calcul_depth_seq_copy(seq_length, bamfile_path, chr):
"""
Calculate the sequencing depth for a given chromosome.
This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
Parameters
----------
seq_length : int
The length of the sequence.
bamfile_path : pysam.AlignmentFile
The path to the BAM file opened with pysam.AlignmentFile.
chr : str
The chromosome for which the depth is calculated.
Returns
-------
numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length.
"""
logging.info(f"Entering calcul_depth_seq for {chr}")
depth_data = np.zeros(seq_length, dtype=np.int32)
i = 0
it = bamfile_path.pileup(reference = chr)
while i < (seq_length - 1):
t = next(it)
pos = t.reference_pos
depth_data[pos] = t.nsegments
print(i)
i += 1
#for pileupcolumn in bamfile_path.pileup():
# pos = pileupcolumn.reference_pos
# if pos >= seq_length:
# break
#depth_data[pos] = pileupcolumn.nsegments
#print(bamfile_path.pileup()) print("########################")
# depth_data = np.array(bamfile_path.pileup().nsegments)
logging.info(f"Leaving calcul_depth_seq for {chr}")
sys.stderr.write("\t Leaving calcul_depth_seq\n")
return depth_data return depth_data
def calcul_depth_seq_count(seq_length, bamfile_path, chr):
"""
Calculate the sequencing depth for a given chromosome.
This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
Parameters
----------
seq_length : int
The length of the sequence.
bamfile_path : pysam.AlignmentFile
The path to the BAM file opened with pysam.AlignmentFile.
chr : str
The chromosome for which the depth is calculated.
Returns
-------
numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length.
"""
logging.info(f"Entering calcul_depth_seq for {chr}")
depth_data_count = np.zeros(seq_length, dtype=np.int32)
for i in range(seq_length):
depth_data_count[i] = bamfile_path.count(contig = chr, start = i, stop = i+1)
logging.info(f"Leaving calcul_depth_seq for {chr}")
return depth_data_count
def calcul_med_total(depth_correction_results):
def calcul_med_total(depth_correction_results, chr):
""" """
Calculate the median of non-zero depth correction results. Calculate the median of non-zero depth correction results.
...@@ -484,17 +620,22 @@ def calcul_med_total(depth_correction_results): ...@@ -484,17 +620,22 @@ def calcul_med_total(depth_correction_results):
float float
The median of the non-zero depth correction values, or 0 if no non-zero values are present. The median of the non-zero depth correction values, or 0 if no non-zero values are present.
""" """
sys.stderr.write("\t entering calcul_med_total\n") logging.info(f"Entering calcul_med_total for {chr}")
depth_correction_results = np.array(depth_correction_results) depth_correction_results = np.array(depth_correction_results)
# Filter results to remove zero values # Filter results to remove zero values
non_zero_results = depth_correction_results[depth_correction_results != 0] non_zero_results = depth_correction_results[depth_correction_results != 0]
# Calculate the median of non-zero results # Calculate the median of non-zero results
m = np.median(non_zero_results) if non_zero_results.size > 0 else 0 m = np.median(non_zero_results) if non_zero_results.size > 0 else 0
sys.stderr.write("\t Leaving calcul_med_total\n")
sys.stderr.write("Chromosome : %s, med_chr : %s\n" % (chr, m))
logging.info(f"Leaving calcul_med_total for {chr}")
return m return m
def calcul_med_same_gc(gc_results, depth_correction_results): def calcul_med_same_gc(gc_results, depth_correction_results, chr):
""" """
Calculate the median depth correction for each unique GC content value. Calculate the median depth correction for each unique GC content value.
...@@ -512,7 +653,8 @@ def calcul_med_same_gc(gc_results, depth_correction_results): ...@@ -512,7 +653,8 @@ def calcul_med_same_gc(gc_results, depth_correction_results):
dict dict
A dictionary where keys are unique GC content values and values are the median depth correction for those GC values. A dictionary where keys are unique GC content values and values are the median depth correction for those GC values.
""" """
sys.stderr.write("\t entering calcul_med_same_gc\n") logging.info(f"Entering calcul_med_same_gc for {chr}")
mGC = [] mGC = []
depth_correction_results_array = np.array(depth_correction_results) depth_correction_results_array = np.array(depth_correction_results)
unique_gc_values = np.unique(gc_results) unique_gc_values = np.unique(gc_results)
...@@ -537,11 +679,12 @@ def calcul_med_same_gc(gc_results, depth_correction_results): ...@@ -537,11 +679,12 @@ def calcul_med_same_gc(gc_results, depth_correction_results):
gc_to_median = dict(mGC) gc_to_median = dict(mGC)
sys.stderr.write("\t Leaving calcul_med_same_gc\n") logging.info(f"Leaving calcul_med_same_gc for {chr}")
return gc_to_median return gc_to_median
def calcul_moy_totale(normalize_depth_results): def calcul_moy_totale(normalize_depth_results, chr):
""" """
Calculate the mean of non-zero normalized depth results. Calculate the mean of non-zero normalized depth results.
...@@ -557,45 +700,23 @@ def calcul_moy_totale(normalize_depth_results): ...@@ -557,45 +700,23 @@ def calcul_moy_totale(normalize_depth_results):
float float
The mean of the non-zero normalized depth values, or 0 if no non-zero values are present. The mean of the non-zero normalized depth values, or 0 if no non-zero values are present.
""" """
sys.stderr.write("\t entering calcul_moy_totale\n") logging.info(f"Entering calcul_moy_totale for {chr}")
normalize_depth_results = np.array(normalize_depth_results) normalize_depth_results = np.array(normalize_depth_results)
# Filter results to remove zero values # Filter results to remove zero values
non_zero_results = normalize_depth_results[normalize_depth_results != 0] non_zero_results = normalize_depth_results[normalize_depth_results != 0]
# Calculate the mean of non-zero results # Calculate the mean of non-zero results
mean_chr = np.mean(non_zero_results) if non_zero_results.size > 0 else 0 mean_chr = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
print(mean_chr)
sys.stderr.write("\t Leaving calcul_moy_totale\n")
return mean_chr
def calcul_moy_totale_ratio(ratio_par_window_results): sys.stderr.write("Chromosome : %s, mean_chr : %s\n" % (chr, mean_chr))
"""
Calculate the mean of non-zero ratio results per window.
This function filters out zero values from the ratio results per window and computes the mean of the remaining values.
Parameters logging.info(f"Leaving calcul_moy_totale for {chr}")
----------
ratio_par_window_results : list or numpy.ndarray return mean_chr
A list or array of ratio values per window.
Returns
-------
float
The mean of the non-zero ratio values per window, or 0 if no non-zero values are present.
"""
sys.stderr.write("\t entering calcul_moy_totale_ratio\n")
ratio_par_window_results = np.array(ratio_par_window_results)
# Filter results to remove zero values
non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
# Calculate the mean of non-zero results
mean_ratio = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
print(mean_ratio)
sys.stderr.write("\t Leaving calcul_moy_totale_ratio\n")
return mean_ratio
def calcul_std(normalize_depth_results): def calcul_std(normalize_depth_results, chr):
""" """
Calculate the standard deviation of non-zero normalized depth results. Calculate the standard deviation of non-zero normalized depth results.
...@@ -611,45 +732,23 @@ def calcul_std(normalize_depth_results): ...@@ -611,45 +732,23 @@ def calcul_std(normalize_depth_results):
float float
The standard deviation of the non-zero normalized depth values, or 0 if no non-zero values are present. The standard deviation of the non-zero normalized depth values, or 0 if no non-zero values are present.
""" """
sys.stderr.write("\t entering calcul_std\n") logging.info(f"Entering calcul_std for {chr}")
normalize_depth_results = np.array(normalize_depth_results) normalize_depth_results = np.array(normalize_depth_results)
# Filter results to remove zero values # Filter results to remove zero values
non_zero_results = normalize_depth_results[normalize_depth_results != 0] non_zero_results = normalize_depth_results[normalize_depth_results != 0]
# Calculate the standard deviation of non-zero results # Calculate the standard deviation of non-zero results
std_chr = np.std(non_zero_results) if non_zero_results.size > 0 else 0 std_chr = np.std(non_zero_results) if non_zero_results.size > 0 else 0
print(std_chr)
sys.stderr.write("\t Leaving calcul_std\n")
return std_chr
def calcul_std_ratio(ratio_par_window_results):
"""
Calculate the standard deviation of non-zero ratio results per window.
This function filters out zero values from the ratio results per window and computes the standard deviation of the remaining values.
Parameters
----------
ratio_par_window_results : list or numpy.ndarray
A list or array of ratio values per window.
Returns sys.stderr.write("Chromosome : %s, std_chr : %s\n" % (chr, std_chr))
-------
float logging.info(f"Leaving calcul_std for {chr}")
The standard deviation of the non-zero ratio values per window, or 0 if no non-zero values are present.
""" return std_chr
sys.stderr.write("\t entering calcul_std_ratio\n")
ratio_par_window_results = np.array(ratio_par_window_results)
# Filter results to remove zero values
non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
# Calculate the standard deviation of non-zero results
std_ratio = np.std(non_zero_results) if non_zero_results.size > 0 else 0
print(std_ratio)
sys.stderr.write("\t Leaving calcul_std_ratio\n")
return std_ratio
def compute_mean_and_std(ratio_par_window_results): def compute_mean_std_med(ratio_par_window_results, chr):
""" """
Compute the mean, standard deviation, and median of non-zero ratio results per window. Compute the mean, standard deviation, and median of non-zero ratio results per window.
...@@ -665,7 +764,7 @@ def compute_mean_and_std(ratio_par_window_results): ...@@ -665,7 +764,7 @@ def compute_mean_and_std(ratio_par_window_results):
tuple tuple
A tuple containing the mean, standard deviation, and median of the filtered ratio values. A tuple containing the mean, standard deviation, and median of the filtered ratio values.
""" """
sys.stderr.write("Computing stats : \n") logging.info(f"Entering compute_mean_std_med for {chr}")
# Filter results to remove zero and -1 values # Filter results to remove zero and -1 values
ratio_par_window_results = np.array(ratio_par_window_results) ratio_par_window_results = np.array(ratio_par_window_results)
...@@ -685,15 +784,14 @@ def compute_mean_and_std(ratio_par_window_results): ...@@ -685,15 +784,14 @@ def compute_mean_and_std(ratio_par_window_results):
std_ratio = np.std(table) if table else 0 std_ratio = np.std(table) if table else 0
med_ratio = np.median(table) if table else 0 med_ratio = np.median(table) if table else 0
# Display results sys.stderr.write("Chromosome : %s, mean_ratio : %s, std_ratio : %s, med_ratio : %s\n" % (chr, mean_ratio, std_ratio, med_ratio))
print(mean_ratio, std_ratio, med_ratio)
sys.stderr.write("Computation done\n") logging.info(f"Leaving compute_mean_std_med for {chr}")
# Return results # Return results
return mean_ratio, std_ratio, med_ratio return mean_ratio, std_ratio, med_ratio
def cn_level(x): def cn_level(x, chr):
""" """
Determine the copy number level based on the given value. Determine the copy number level based on the given value.
...@@ -744,10 +842,15 @@ def get_sample_name(bamfile_path): ...@@ -744,10 +842,15 @@ def get_sample_name(bamfile_path):
str str
The sample name extracted from the BAM file. If no sample name is found, returns "UnknownSample". The sample name extracted from the BAM file. If no sample name is found, returns "UnknownSample".
""" """
logging.info(f"Entering get_sample_name")
with pysam.AlignmentFile(bamfile_path, "rb") as bamfile: with pysam.AlignmentFile(bamfile_path, "rb") as bamfile:
for read_group in bamfile.header.get("RG", []): for read_group in bamfile.header.get("RG", []):
if "SM" in read_group: if "SM" in read_group:
return read_group["SM"] return read_group["SM"]
logging.info(f"Leaving get_sample_name")
return "UnknownSample" return "UnknownSample"
...@@ -773,12 +876,15 @@ def create_signal(signal, chr, z_score_results, step_size): ...@@ -773,12 +876,15 @@ def create_signal(signal, chr, z_score_results, step_size):
None None
The function modifies the signal dictionary in place. The function modifies the signal dictionary in place.
""" """
logging.info(f"Entering create_signal for {chr} (include copy_number_level)")
if chr not in signal: if chr not in signal:
signal[chr] = {} signal[chr] = {}
for i in range(len(z_score_results)): for i in range(len(z_score_results)):
pos = (i * step_size) + 1 pos = (i * step_size) + 1
signal[chr][pos] = z_score_results[i] signal[chr][pos] = z_score_results[i]
logging.info(f"Leaving create_signal for {chr} (include copy_number_level)")
def detect_events( def detect_events(
z_score_results, z_score_results,
...@@ -813,7 +919,8 @@ def detect_events( ...@@ -813,7 +919,8 @@ def detect_events(
None None
The function modifies the events dictionary in place. The function modifies the events dictionary in place.
""" """
sys.stderr.write("\t starting detect_events\n") logging.info(f"Entering detect_events for {chr}")
c = 0 c = 0
for i, z_score in enumerate(z_score_results): for i, z_score in enumerate(z_score_results):
if (z_score <= -zscore_threshold) or (z_score >= zscore_threshold): if (z_score <= -zscore_threshold) or (z_score >= zscore_threshold):
...@@ -822,7 +929,7 @@ def detect_events( ...@@ -822,7 +929,7 @@ def detect_events(
if med_ratio == 0: if med_ratio == 0:
c = 0 c = 0
else: else:
c = cn_level(float(ratio_par_mean_ratio_results[i])) c = cn_level(float(ratio_par_mean_ratio_results[i]), chr)
if z_score >= zscore_threshold: if z_score >= zscore_threshold:
c = 3 c = 3
...@@ -832,10 +939,10 @@ def detect_events( ...@@ -832,10 +939,10 @@ def detect_events(
pos_end = pos_start + window_size pos_end = pos_start + window_size
events[chr][(pos_start, pos_end)] = c events[chr][(pos_start, pos_end)] = c
sys.stderr.write("\t ending detect_events\n")
logging.info(f"Leaving detect_events for {chr}")
def segmentation(events, segment): def segmentation(events, segment, chr):
""" """
Segment the detected events into contiguous regions with the same copy number level. Segment the detected events into contiguous regions with the same copy number level.
...@@ -853,12 +960,11 @@ def segmentation(events, segment): ...@@ -853,12 +960,11 @@ def segmentation(events, segment):
None None
The function modifies the segment dictionary in place. The function modifies the segment dictionary in place.
""" """
sys.stderr.write("starting segmentation : \n") logging.info(f"Entering segmentation for {chr}")
for k in events.keys(): for k in events.keys():
sys.stderr.write("\tfor chromosome %s\n" % k) sys.stderr.write("\tfor chromosome %s\n" % k)
starts = 0 starts, oldPos, oldLevel = 0, 0, -1
oldPos = 0
oldLevel = -1
for p in sorted(events[k].keys()): for p in sorted(events[k].keys()):
level = events[k][p] level = events[k][p]
# new coordinates # new coordinates
...@@ -871,9 +977,7 @@ def segmentation(events, segment): ...@@ -871,9 +977,7 @@ def segmentation(events, segment):
segment[k][index]["start"] = starts segment[k][index]["start"] = starts
segment[k][index]["end"] = oldPos + window_size segment[k][index]["end"] = oldPos + window_size
segment[k][index]["cn"] = oldLevel segment[k][index]["cn"] = oldLevel
oldPos = p[0] oldPos, starts, oldLevel = p[0], p[0], level
starts = p[0]
oldLevel = level
continue continue
else: else:
starts = p[0] starts = p[0]
...@@ -887,18 +991,15 @@ def segmentation(events, segment): ...@@ -887,18 +991,15 @@ def segmentation(events, segment):
segment[k][index]["start"] = starts segment[k][index]["start"] = starts
segment[k][index]["end"] = oldPos segment[k][index]["end"] = oldPos
segment[k][index]["cn"] = oldLevel segment[k][index]["cn"] = oldLevel
oldPos = p[0] oldPos, starts, oldLevel = p[0], p[0], level
starts = p[0]
oldLevel = level
continue continue
else: else:
oldLevel = level oldLevel = level
oldPos = p[0] oldPos, oldLevel = p[0], level
oldLevel = level
sys.stderr.write("segmentation done\n")
logging.info(f"Leaving segmentation for {chr}")
def display_results_vcf(sample, segment, signal, lengthFilter, output_file): def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr):
""" """
Generate a VCF file containing structural variant calls based on segmented regions and signal data. Generate a VCF file containing structural variant calls based on segmented regions and signal data.
...@@ -923,7 +1024,7 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file): ...@@ -923,7 +1024,7 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file):
This function writes the structural variant calls to the specified output file in VCF format. This function writes the structural variant calls to the specified output file in VCF format.
""" """
global header_written global header_written
sys.stderr.write("starting display results\n") logging.info(f"Entering display_results_vcf for {chr}")
with open(output_file, "a") as f: with open(output_file, "a") as f:
if not header_written: if not header_written:
...@@ -973,11 +1074,10 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file): ...@@ -973,11 +1074,10 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file):
int(segment[k][elt]["cn"]), int(segment[k][elt]["cn"]),
) )
) )
logging.info(f"Leaving display_results_vcf for {chr}")
#################################
###### <---Fonction main--->######
#################################
def main_calcul( def main_calcul(
bamfile_path, bamfile_path,
chr, chr,
...@@ -1031,7 +1131,9 @@ def main_calcul( ...@@ -1031,7 +1131,9 @@ def main_calcul(
map_data = calcul_mappability(seq_length, mappability, chr) map_data = calcul_mappability(seq_length, mappability, chr)
gc_data = calcul_gc_content(seq_length, chr, seq) gc_data = calcul_gc_content(seq_length, chr, seq)
depth_data = calcul_depth_seq(seq_length, bamfile_path, chr) depth_data = calcul_depth_seq(seq_length, bamfile_path, chr)
#depth_data_count = calcul_depth_seq_count(seq_length, bamfile_path, chr)
#depth_data = calcul_depth_seq_copy(seq_length, bamfile_path, chr)
# Transférer le tableau NumPy vers CUDA # Transférer le tableau NumPy vers CUDA
d_depth_data = cuda.mem_alloc(depth_data.nbytes) d_depth_data = cuda.mem_alloc(depth_data.nbytes)
cuda.memcpy_htod(d_depth_data, depth_data) cuda.memcpy_htod(d_depth_data, depth_data)
...@@ -1039,6 +1141,13 @@ def main_calcul( ...@@ -1039,6 +1141,13 @@ def main_calcul(
"\t d_depth_data : %s, %s\n" "\t d_depth_data : %s, %s\n"
% (d_depth_data, d_depth_data.as_buffer(sys.getsizeof(depth_data))) % (d_depth_data, d_depth_data.as_buffer(sys.getsizeof(depth_data)))
) )
#d_depth_data_count = cuda.mem_alloc(depth_data_count.nbytes)
#cuda.memcpy_htod(d_depth_data_count, depth_data_count)
#sys.stderr.write(
# "\t d_depth_data_count : %s, %s\n"
# % (d_depth_data_count, d_depth_data_count.as_buffer(sys.getsizeof(depth_data_count)))
#)
d_gc_data = cuda.mem_alloc(gc_data.nbytes) d_gc_data = cuda.mem_alloc(gc_data.nbytes)
cuda.memcpy_htod(d_gc_data, gc_data) cuda.memcpy_htod(d_gc_data, gc_data)
...@@ -1066,6 +1175,11 @@ def main_calcul( ...@@ -1066,6 +1175,11 @@ def main_calcul(
) )
sys.stderr.write("\t Definition de depth_results\n") sys.stderr.write("\t Definition de depth_results\n")
depth_results_count = np.zeros(
int((seq_length - window_size) / step_size) + 1, dtype=np.float32
)
sys.stderr.write("\t Definition de depth_results_count\n")
gc_results = np.zeros( gc_results = np.zeros(
int((seq_length - window_size) / step_size) + 1, dtype=np.float32 int((seq_length - window_size) / step_size) + 1, dtype=np.float32
) )
...@@ -1109,6 +1223,13 @@ def main_calcul( ...@@ -1109,6 +1223,13 @@ def main_calcul(
) )
sys.stderr.write("\t depth_results.nbytes = %s\n" % depth_results.nbytes) sys.stderr.write("\t depth_results.nbytes = %s\n" % depth_results.nbytes)
d_depth_results_count = cuda.mem_alloc(depth_results_count.nbytes)
sys.stderr.write(
"\t d_depth_results_count = %s\n"
% d_depth_results_count.as_buffer(sys.getsizeof(d_depth_results_count))
)
sys.stderr.write("\t depth_results_count.nbytes = %s\n" % depth_results_count.nbytes)
d_gc_results = cuda.mem_alloc(gc_results.nbytes) d_gc_results = cuda.mem_alloc(gc_results.nbytes)
sys.stderr.write( sys.stderr.write(
"\t d_gc_results = %s\n" % d_gc_results.as_buffer(sys.getsizeof(d_gc_results)) "\t d_gc_results = %s\n" % d_gc_results.as_buffer(sys.getsizeof(d_gc_results))
...@@ -1184,6 +1305,17 @@ def main_calcul( ...@@ -1184,6 +1305,17 @@ def main_calcul(
) )
sys.stderr.write("\t appel fonction calcul_depth_kernel_cuda\n") sys.stderr.write("\t appel fonction calcul_depth_kernel_cuda\n")
#calcul_depth_count_kernel_cuda(
# d_depth_data_count,
# np.int32(seq_length),
# np.int32(window_size),
#np.int32(step_size),
#d_depth_results,
#block=(block_size, 1, 1),
#grid=(grid_size, 1),
#)
#sys.stderr.write("\t appel fonction calcul_depth_count_kernel_cuda\n")
calcul_gc_kernel_cuda( calcul_gc_kernel_cuda(
d_gc_data, d_gc_data,
np.int32(seq_length), np.int32(seq_length),
...@@ -1226,6 +1358,11 @@ def main_calcul( ...@@ -1226,6 +1358,11 @@ def main_calcul(
sys.stderr.write( sys.stderr.write(
"\t Copie les resultats du GPU (d_depth_results) vers le CPU (depth_results)\n" "\t Copie les resultats du GPU (d_depth_results) vers le CPU (depth_results)\n"
) )
#cuda.memcpy_dtoh(depth_results_count, d_depth_results_count)
#sys.stderr.write(
# "\t Copie les resultats du GPU (d_depth_results_count) vers le CPU (depth_results_count)\n"
#)
cuda.memcpy_dtoh(gc_results, d_gc_results) # cuda.memcpy_dtoh(dest, src) cuda.memcpy_dtoh(gc_results, d_gc_results) # cuda.memcpy_dtoh(dest, src)
sys.stderr.write( sys.stderr.write(
...@@ -1248,8 +1385,8 @@ def main_calcul( ...@@ -1248,8 +1385,8 @@ def main_calcul(
# Appel fonctions medianes # Appel fonctions medianes
sys.stderr.write("\t appel fonctions calcul medianes\n") sys.stderr.write("\t appel fonctions calcul medianes\n")
m = calcul_med_total(depth_correction_results) m = calcul_med_total(depth_correction_results, chr)
gc_to_median = calcul_med_same_gc(gc_results, depth_correction_results) gc_to_median = calcul_med_same_gc(gc_results, depth_correction_results, chr)
# Convertir gc_to_median en un tableau NumPy pour le transfert vers CUDA # Convertir gc_to_median en un tableau NumPy pour le transfert vers CUDA
sys.stderr.write("\t Conversion medianes en tableau numpy\n") sys.stderr.write("\t Conversion medianes en tableau numpy\n")
...@@ -1286,7 +1423,7 @@ def main_calcul( ...@@ -1286,7 +1423,7 @@ def main_calcul(
# Appel fonction moyenne # Appel fonction moyenne
sys.stderr.write("\t appel fonction calcul moyenne\n") sys.stderr.write("\t appel fonction calcul moyenne\n")
mean_chr = calcul_moy_totale(normalize_depth_results) mean_chr = calcul_moy_totale(normalize_depth_results, chr)
# Appeler le kernel de ratio # Appeler le kernel de ratio
ratio_par_window_kernel_cuda( ratio_par_window_kernel_cuda(
...@@ -1307,7 +1444,7 @@ def main_calcul( ...@@ -1307,7 +1444,7 @@ def main_calcul(
cuda.memcpy_dtoh(ratio_par_window_results, d_ratio_par_window_results) cuda.memcpy_dtoh(ratio_par_window_results, d_ratio_par_window_results)
# Création table à partir du ratio # Création table à partir du ratio
mean_ratio, std_ratio, med_ratio = compute_mean_and_std(ratio_par_window_results) mean_ratio, std_ratio, med_ratio = compute_mean_std_med(ratio_par_window_results, chr)
# Appeler le kernel de calcule du ratio divisé par le ratio moyen # Appeler le kernel de calcule du ratio divisé par le ratio moyen
ratio_par_mean_ratio_kernel_cuda( ratio_par_mean_ratio_kernel_cuda(
...@@ -1356,11 +1493,18 @@ def main_calcul( ...@@ -1356,11 +1493,18 @@ def main_calcul(
) )
# Appel fonction segmentation # Appel fonction segmentation
segmentation(events, segment) segmentation(events, segment, chr)
# Appel fonction display_results_vcf # Appel fonction display_results_vcf
display_results_vcf(sample, segment, signal, lengthFilter, output_file) display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr)
#Ecrire les résultats dans le fichier de sortie
#with open(output_file, 'a') as f:
# sys.stderr.write("\t ecriture des fichiers\n")
# for i, (depth_data, depth_correction, depth_normalize_val, gc_data, map_data, ratio, ratio_mean_ratio, z_score) in enumerate(zip(depth_results, depth_correction_results, normalize_depth_results, gc_results, map_results, ratio_par_window_results, ratio_par_mean_ratio_results, z_score_results)):
# pos_start = (i * step_size) + 1
# pos_end = pos_start + window_size
# f.write(f"{chr}\t{pos_start}\t{pos_end}\t{depth_data}\t{depth_correction}\t{depth_normalize_val}\t{gc_data}\t{map_data}\t{ratio}\t{ratio_mean_ratio}\t{z_score}\n")
# Programme principal # Programme principal
# Calcul nombre de coeurs max pour le GPU # Calcul nombre de coeurs max pour le GPU
...@@ -1374,13 +1518,41 @@ print("Nombre de CPU: ", multiprocessing.cpu_count()) ...@@ -1374,13 +1518,41 @@ print("Nombre de CPU: ", multiprocessing.cpu_count())
print(f"Nombre de coeurs max GPU: {num_cores}") print(f"Nombre de coeurs max GPU: {num_cores}")
gc_file = "/work/gad/shared/pipeline/grch38/index/grch38_essential.fa" gc_file = "/work/gad/shared/pipeline/grch38/index/grch38_essential.fa"
mappability_file = "/work/gad/shared/analyse/test/cnvGPU/test_scalability/wgEncodeCrgMapabilityAlign100mer_no_uniq.grch38.bedgraph" mappability_file = "/work/gad/shared/pipeline/grch38/cnv/k100.Umap.MultiTrackMappability.bedgraph"
seq = parse_fasta(gc_file) seq = parse_fasta(gc_file)
mappability = dico_mappabilite(mappability_file) mappability = dico_mappabilite(mappability_file)
with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle: if num_chr == "ALL" :
for i, seq_length in enumerate(bamfile_handle.lengths): with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle:
chr = bamfile_handle.references[i] for i, seq_length in enumerate(bamfile_handle.lengths):
chr = bamfile_handle.references[i]
sys.stderr.write("Chromosome : %s, seq length : %s\n" % (chr, seq_length))
# Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome
main_calcul(
bamfile_handle,
chr,
seq_length,
window_size,
step_size,
zscore_threshold,
lengthFilter,
output_file,
sample,
)
logging.basicConfig(
filename="%s" % (logfile),
filemode="a",
level=logging.INFO,
format="%(asctime)s %(levelname)s - %(message)s",
)
logging.info("end")
else :
with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle:
seq_length = bamfile_handle.lengths[int(num_chr) - 1]
chr = bamfile_handle.references[int(num_chr) - 1]
sys.stderr.write("Chromosome : %s, seq length : %s\n" % (chr, seq_length)) sys.stderr.write("Chromosome : %s, seq length : %s\n" % (chr, seq_length))
# Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome # Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome
...@@ -1403,4 +1575,3 @@ with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle: ...@@ -1403,4 +1575,3 @@ with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle:
format="%(asctime)s %(levelname)s - %(message)s", format="%(asctime)s %(levelname)s - %(message)s",
) )
logging.info("end") logging.info("end")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment