Commit 9bb5c514 authored by Theo Serralta's avatar Theo Serralta

Add faster function calcul_depth_seq

parent be5e1926
...@@ -11,6 +11,8 @@ from pycuda.autoinit import context ...@@ -11,6 +11,8 @@ from pycuda.autoinit import context
import multiprocessing import multiprocessing
from multiprocessing import Process, Queue from multiprocessing import Process, Queue
import time import time
import subprocess
import pandas as pd
# Options # Options
...@@ -327,10 +329,12 @@ def dico_mappabilite(mappability_file): ...@@ -327,10 +329,12 @@ def dico_mappabilite(mappability_file):
and mappability scores as values. and mappability scores as values.
""" """
logging.info("Entering dico_mappability") logging.info("Entering dico_mappability")
start_time = time.time()
mappability_dico = {} mappability_dico = {}
logging.info(" In dico_mappability : Open file and create the dico") logging.info(" In dico_mappability : Open file and create the dico")
start_time_1 = time.time()
with open(mappability_file, "r") as f: with open(mappability_file, "r") as f:
for line in f: for line in f:
...@@ -348,19 +352,25 @@ def dico_mappabilite(mappability_file): ...@@ -348,19 +352,25 @@ def dico_mappabilite(mappability_file):
mappability_dico[chromosome].append((start_pos, end_pos, score)) mappability_dico[chromosome].append((start_pos, end_pos, score))
logging.info(f"In dico_mappability : Leaving file and create the dico") end_time_1 = time.time()
elapsed_time_1 = end_time_1 - start_time_1
logging.info(f"In dico_mappability : Leaving file and create the dico (Time taken: {elapsed_time_1:.4f} seconds)")
# Add position 0 for each chromosome # Add position 0 for each chromosome
logging.info(" In dico_mappability : Add position 0 for each chromosome") logging.info(" In dico_mappability : Add position 0 for each chromosome")
start_time_2 = time.time()
for chromosome, intervals in mappability_dico.items(): for chromosome, intervals in mappability_dico.items():
if intervals[0][0] != 0: if intervals[0][0] != 0:
mappability_dico[chromosome].insert(0, (0, intervals[0][0], 0)) mappability_dico[chromosome].insert(0, (0, intervals[0][0], 0))
logging.info(f"In dico_mappability : Ending add position 0 for each chromosome") end_time_2 = time.time()
elapsed_time_2 = end_time_2 - start_time_2
logging.info(f"In dico_mappability : Ending add position 0 for each chromosome (Time taken: {elapsed_time_2:.4f} seconds)")
# Merge intervals with the same score # Merge intervals with the same score
logging.info(" In dico_mappability : Merge intervals with the same score") logging.info(" In dico_mappability : Merge intervals with the same score")
start_time_3 = time.time()
for chromosome, intervals in mappability_dico.items(): for chromosome, intervals in mappability_dico.items():
merged_intervals = merge_intervals(intervals) merged_intervals = merge_intervals(intervals)
...@@ -368,8 +378,13 @@ def dico_mappabilite(mappability_file): ...@@ -368,8 +378,13 @@ def dico_mappabilite(mappability_file):
start: score for start, _, score in merged_intervals start: score for start, _, score in merged_intervals
} }
logging.info(f"In dico_mappability : Ending merge intervals with the same score") end_time_3 = time.time()
logging.info(f"Leaving dico_mappability") elapsed_time_3 = end_time_3 - start_time_3
logging.info(f"In dico_mappability : Ending merge intervals with the same score (Time taken: {elapsed_time_2:.4f} seconds)")
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving dico_mappability (Time taken: {elapsed_time:.4f} seconds)")
return mappability_dico return mappability_dico
...@@ -396,6 +411,7 @@ def calcul_mappability(seq_length, mappability, chr): ...@@ -396,6 +411,7 @@ def calcul_mappability(seq_length, mappability, chr):
An array of mappability scores for the given sequence length. An array of mappability scores for the given sequence length.
""" """
logging.info(f"Entering calcul_mappability for {chr}") logging.info(f"Entering calcul_mappability for {chr}")
start_time = time.time()
map_data = np.zeros(seq_length, dtype=np.float32) map_data = np.zeros(seq_length, dtype=np.float32)
sorted_keys = sorted(mappability[chr].keys()) sorted_keys = sorted(mappability[chr].keys())
...@@ -404,6 +420,7 @@ def calcul_mappability(seq_length, mappability, chr): ...@@ -404,6 +420,7 @@ def calcul_mappability(seq_length, mappability, chr):
prev_mappability = 0 prev_mappability = 0
logging.info(f"In calcul_mappability : Entering for bound in sorted_keys for {chr}") logging.info(f"In calcul_mappability : Entering for bound in sorted_keys for {chr}")
start_time_1 = time.time()
for bound in sorted_keys: for bound in sorted_keys:
for i in range(prev_bound, min(seq_length, bound)): for i in range(prev_bound, min(seq_length, bound)):
...@@ -411,17 +428,25 @@ def calcul_mappability(seq_length, mappability, chr): ...@@ -411,17 +428,25 @@ def calcul_mappability(seq_length, mappability, chr):
prev_bound = bound prev_bound = bound
prev_mappability = mappability[chr][bound] prev_mappability = mappability[chr][bound]
logging.info(f"In calcul_mappability : Leaving for bound in sorted_keys for {chr}") end_time_1 = time.time()
elapsed_time_1 = end_time_1 - start_time_1
logging.info(f"In calcul_mappability : Leaving for bound in sorted_keys for {chr} (Time taken: {elapsed_time_1:.4f} seconds)")
# Fill remaining positions if sequence length exceeds last bound # Fill remaining positions if sequence length exceeds last bound
logging.info(f"In calcul_mappability : Entering for i in range(prev_bound, seq_length) for {chr}") logging.info(f"In calcul_mappability : Entering for i in range(prev_bound, seq_length) for {chr}")
start_time_2 = time.time()
for i in range(prev_bound, seq_length): for i in range(prev_bound, seq_length):
map_data[i] = prev_mappability map_data[i] = prev_mappability
logging.info(f"In calcul_mappability : Leaving for i in range(prev_bound, seq_length) for {chr}") end_time_2 = time.time()
elapsed_time_2 = end_time_2 - start_time_2
logging.info(f"In calcul_mappability : Leaving for i in range(prev_bound, seq_length) for {chr} (Time taken: {elapsed_time_2:.4f} seconds)")
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving dico_mappability for {chr} (Time taken: {elapsed_time:.4f} seconds)")
logging.info(f"Leaving dico_mappability for {chr}")
return map_data return map_data
...@@ -442,6 +467,7 @@ def parse_fasta(gc_file): ...@@ -442,6 +467,7 @@ def parse_fasta(gc_file):
A dictionary where keys are sequence headers and values are the corresponding sequences. A dictionary where keys are sequence headers and values are the corresponding sequences.
""" """
logging.info("Entering parse_fasta") logging.info("Entering parse_fasta")
start_time = time.time()
sequences = {} sequences = {}
with open(gc_file, "r") as f: with open(gc_file, "r") as f:
...@@ -452,7 +478,9 @@ def parse_fasta(gc_file): ...@@ -452,7 +478,9 @@ def parse_fasta(gc_file):
sequence = "".join(lines[1:]) sequence = "".join(lines[1:])
sequences[header] = sequence sequences[header] = sequence
logging.info(f"Leaving parse_fasta") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving parse_fasta (Time taken: {elapsed_time:.4f} seconds)")
return sequences return sequences
...@@ -478,12 +506,15 @@ def calcul_gc_content(seq_length, chr, seq): ...@@ -478,12 +506,15 @@ def calcul_gc_content(seq_length, chr, seq):
An array of bytes ('S' dtype) representing the GC content for the given sequence length. An array of bytes ('S' dtype) representing the GC content for the given sequence length.
""" """
logging.info(f"Entering calcul_gc_content for {chr}") logging.info(f"Entering calcul_gc_content for {chr}")
start_time = time.time()
gc_data = np.zeros(seq_length, dtype="S") gc_data = np.zeros(seq_length, dtype="S")
for i in range(len(seq[chr])): for i in range(len(seq[chr])):
gc_data[i] = seq[chr][i] gc_data[i] = seq[chr][i]
logging.info(f"Leaving calcul_gc_content for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_gc_content for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return gc_data return gc_data
...@@ -509,6 +540,7 @@ def calcul_depth_seq(seq_length, bamfile_path, chr): ...@@ -509,6 +540,7 @@ def calcul_depth_seq(seq_length, bamfile_path, chr):
An array of integers representing the sequencing depth for the given sequence length. An array of integers representing the sequencing depth for the given sequence length.
""" """
logging.info(f"Entering calcul_depth_seq for {chr}") logging.info(f"Entering calcul_depth_seq for {chr}")
start_time = time.time()
depth_data = np.zeros(seq_length, dtype=np.int32) depth_data = np.zeros(seq_length, dtype=np.int32)
for pileupcolumn in bamfile_path.pileup(reference = chr): for pileupcolumn in bamfile_path.pileup(reference = chr):
...@@ -519,7 +551,10 @@ def calcul_depth_seq(seq_length, bamfile_path, chr): ...@@ -519,7 +551,10 @@ def calcul_depth_seq(seq_length, bamfile_path, chr):
#depth_data = bamfile_path.pileup().nsegments #depth_data = bamfile_path.pileup().nsegments
logging.info(f"Leaving calcul_depth_seq for {chr}")
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data return depth_data
...@@ -545,6 +580,7 @@ def calcul_depth_seq_copy(seq_length, bamfile_path, chr): ...@@ -545,6 +580,7 @@ def calcul_depth_seq_copy(seq_length, bamfile_path, chr):
An array of integers representing the sequencing depth for the given sequence length. An array of integers representing the sequencing depth for the given sequence length.
""" """
logging.info(f"Entering calcul_depth_seq for {chr}") logging.info(f"Entering calcul_depth_seq for {chr}")
start_time = time.time()
depth_data = np.zeros(seq_length, dtype=np.int32) depth_data = np.zeros(seq_length, dtype=np.int32)
...@@ -568,7 +604,9 @@ def calcul_depth_seq_copy(seq_length, bamfile_path, chr): ...@@ -568,7 +604,9 @@ def calcul_depth_seq_copy(seq_length, bamfile_path, chr):
# depth_data = np.array(bamfile_path.pileup().nsegments) # depth_data = np.array(bamfile_path.pileup().nsegments)
logging.info(f"Leaving calcul_depth_seq for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data return depth_data
...@@ -593,17 +631,73 @@ def calcul_depth_seq_count(seq_length, bamfile_path, chr): ...@@ -593,17 +631,73 @@ def calcul_depth_seq_count(seq_length, bamfile_path, chr):
numpy.ndarray numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length. An array of integers representing the sequencing depth for the given sequence length.
""" """
logging.info(f"Entering calcul_depth_seq for {chr}") logging.info(f"Entering calcul_depth_seq_count for {chr}")
start_time = time.time()
depth_data_count = np.zeros(seq_length, dtype=np.int32) # Use count_coverage to get coverage depth
for i in range(seq_length): depth_data_count = bamfile_path.count_coverage(contig = chr)
depth_data_count[i] = bamfile_path.count(contig = chr, start = i, stop = i+1) type(depth_data_count)
#print(depth_data_count[:10056])
logging.info(f"Leaving calcul_depth_seq for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data_count return depth_data_count
def calcul_depth_seq_samtools(seq_length, bamfile_path, chr, num_threads=12):
"""
Calculate the sequencing depth for a given chromosome using a parallelized bash script.
Parameters
----------
seq_length : int
The length of the sequence.
bamfile_path : str
The path to the BAM file.
chr : str
The chromosome for which the depth is calculated.
num_threads : int
The number of threads to use for parallel processing.
Returns
-------
numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length.
"""
logging.info(f"Entering calcul_depth_seq for {chr}")
start_time = time.time()
#Define the output file for depth results
output_file_1 = f"/work/gad/shared/analyse/test/cnvGPU/test_scalability/tmp_depth_{chr}.out"
# Run the parallelized bash script to calculate depth
p = subprocess.Popen(["/work/gad/shared/analyse/test/cnvGPU/test_scalability/samtools_test_by_chr_multithreading_log.sh %s %s %s %s" % (bamfile_path, chr, output_file_1, num_threads)], shell = True, executable = "/bin/bash")
p.communicate()
# Create the numpy array for depth data
depth_data = np.zeros(seq_length, dtype=np.int32)
# Read the output file and fill the numpy array using pandas for efficiency
try:
df = pd.read_csv(output_file_1, delim_whitespace=True, header=None, names=['chr', 'pos', 'depth'])
df['pos'] -= 1 # Convert to 0-based index
df = df[df['pos'] < seq_length] # Ensure positions are within sequence length
depth_data[df['pos']] = df['depth']
except Exception as e:
logging.error(f"Error reading depth file: {e}")
# Clean up the output file
subprocess.run(['rm', output_file_1])
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data
def calcul_med_total(depth_correction_results, chr): def calcul_med_total(depth_correction_results, chr):
""" """
Calculate the median of non-zero depth correction results. Calculate the median of non-zero depth correction results.
...@@ -621,6 +715,7 @@ def calcul_med_total(depth_correction_results, chr): ...@@ -621,6 +715,7 @@ def calcul_med_total(depth_correction_results, chr):
The median of the non-zero depth correction values, or 0 if no non-zero values are present. The median of the non-zero depth correction values, or 0 if no non-zero values are present.
""" """
logging.info(f"Entering calcul_med_total for {chr}") logging.info(f"Entering calcul_med_total for {chr}")
start_time = time.time()
depth_correction_results = np.array(depth_correction_results) depth_correction_results = np.array(depth_correction_results)
# Filter results to remove zero values # Filter results to remove zero values
...@@ -630,7 +725,9 @@ def calcul_med_total(depth_correction_results, chr): ...@@ -630,7 +725,9 @@ def calcul_med_total(depth_correction_results, chr):
sys.stderr.write("Chromosome : %s, med_chr : %s\n" % (chr, m)) sys.stderr.write("Chromosome : %s, med_chr : %s\n" % (chr, m))
logging.info(f"Leaving calcul_med_total for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_med_total for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return m return m
...@@ -654,6 +751,7 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr): ...@@ -654,6 +751,7 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr):
A dictionary where keys are unique GC content values and values are the median depth correction for those GC values. A dictionary where keys are unique GC content values and values are the median depth correction for those GC values.
""" """
logging.info(f"Entering calcul_med_same_gc for {chr}") logging.info(f"Entering calcul_med_same_gc for {chr}")
start_time = time.time()
mGC = [] mGC = []
depth_correction_results_array = np.array(depth_correction_results) depth_correction_results_array = np.array(depth_correction_results)
...@@ -679,7 +777,9 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr): ...@@ -679,7 +777,9 @@ def calcul_med_same_gc(gc_results, depth_correction_results, chr):
gc_to_median = dict(mGC) gc_to_median = dict(mGC)
logging.info(f"Leaving calcul_med_same_gc for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_med_same_gc for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return gc_to_median return gc_to_median
...@@ -701,6 +801,7 @@ def calcul_moy_totale(normalize_depth_results, chr): ...@@ -701,6 +801,7 @@ def calcul_moy_totale(normalize_depth_results, chr):
The mean of the non-zero normalized depth values, or 0 if no non-zero values are present. The mean of the non-zero normalized depth values, or 0 if no non-zero values are present.
""" """
logging.info(f"Entering calcul_moy_totale for {chr}") logging.info(f"Entering calcul_moy_totale for {chr}")
start_time = time.time()
normalize_depth_results = np.array(normalize_depth_results) normalize_depth_results = np.array(normalize_depth_results)
# Filter results to remove zero values # Filter results to remove zero values
...@@ -710,7 +811,9 @@ def calcul_moy_totale(normalize_depth_results, chr): ...@@ -710,7 +811,9 @@ def calcul_moy_totale(normalize_depth_results, chr):
sys.stderr.write("Chromosome : %s, mean_chr : %s\n" % (chr, mean_chr)) sys.stderr.write("Chromosome : %s, mean_chr : %s\n" % (chr, mean_chr))
logging.info(f"Leaving calcul_moy_totale for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_moy_totale for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return mean_chr return mean_chr
...@@ -733,6 +836,7 @@ def calcul_std(normalize_depth_results, chr): ...@@ -733,6 +836,7 @@ def calcul_std(normalize_depth_results, chr):
The standard deviation of the non-zero normalized depth values, or 0 if no non-zero values are present. The standard deviation of the non-zero normalized depth values, or 0 if no non-zero values are present.
""" """
logging.info(f"Entering calcul_std for {chr}") logging.info(f"Entering calcul_std for {chr}")
start_time = time.time()
normalize_depth_results = np.array(normalize_depth_results) normalize_depth_results = np.array(normalize_depth_results)
# Filter results to remove zero values # Filter results to remove zero values
...@@ -743,7 +847,9 @@ def calcul_std(normalize_depth_results, chr): ...@@ -743,7 +847,9 @@ def calcul_std(normalize_depth_results, chr):
sys.stderr.write("Chromosome : %s, std_chr : %s\n" % (chr, std_chr)) sys.stderr.write("Chromosome : %s, std_chr : %s\n" % (chr, std_chr))
logging.info(f"Leaving calcul_std for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_std for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return std_chr return std_chr
...@@ -765,6 +871,7 @@ def compute_mean_std_med(ratio_par_window_results, chr): ...@@ -765,6 +871,7 @@ def compute_mean_std_med(ratio_par_window_results, chr):
A tuple containing the mean, standard deviation, and median of the filtered ratio values. A tuple containing the mean, standard deviation, and median of the filtered ratio values.
""" """
logging.info(f"Entering compute_mean_std_med for {chr}") logging.info(f"Entering compute_mean_std_med for {chr}")
start_time = time.time()
# Filter results to remove zero and -1 values # Filter results to remove zero and -1 values
ratio_par_window_results = np.array(ratio_par_window_results) ratio_par_window_results = np.array(ratio_par_window_results)
...@@ -786,7 +893,10 @@ def compute_mean_std_med(ratio_par_window_results, chr): ...@@ -786,7 +893,10 @@ def compute_mean_std_med(ratio_par_window_results, chr):
sys.stderr.write("Chromosome : %s, mean_ratio : %s, std_ratio : %s, med_ratio : %s\n" % (chr, mean_ratio, std_ratio, med_ratio)) sys.stderr.write("Chromosome : %s, mean_ratio : %s, std_ratio : %s, med_ratio : %s\n" % (chr, mean_ratio, std_ratio, med_ratio))
logging.info(f"Leaving compute_mean_std_med for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving compute_mean_std_med for {chr} (Time taken: {elapsed_time:.4f} seconds)")
# Return results # Return results
return mean_ratio, std_ratio, med_ratio return mean_ratio, std_ratio, med_ratio
...@@ -811,19 +921,15 @@ def cn_level(x, chr): ...@@ -811,19 +921,15 @@ def cn_level(x, chr):
- 2 if 0.75 < x < 1 or round(x) == 1 - 2 if 0.75 < x < 1 or round(x) == 1
- round(x) if round(x) > 1 - round(x) if round(x) > 1
""" """
if x < 1: if x < 0.1:
if x <= 0.75:
if x >= 0.1:
return 1
else:
return 0 return 0
else: elif x <= 0.75:
return 1
elif x < 1:
return 2 return 2
else: else:
if round(x) == 1: rounded_x = round(x)
return 2 return 2 if rounded_x == 1 else rounded_x
if round(x) > 1:
return round(x)
def get_sample_name(bamfile_path): def get_sample_name(bamfile_path):
...@@ -844,13 +950,16 @@ def get_sample_name(bamfile_path): ...@@ -844,13 +950,16 @@ def get_sample_name(bamfile_path):
""" """
logging.info(f"Entering get_sample_name") logging.info(f"Entering get_sample_name")
start_time = time.time()
with pysam.AlignmentFile(bamfile_path, "rb") as bamfile: with pysam.AlignmentFile(bamfile_path, "rb") as bamfile:
for read_group in bamfile.header.get("RG", []): for read_group in bamfile.header.get("RG", []):
if "SM" in read_group: if "SM" in read_group:
return read_group["SM"] return read_group["SM"]
logging.info(f"Leaving get_sample_name") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving get_sample_name (Time taken: {elapsed_time:.4f} seconds)")
return "UnknownSample" return "UnknownSample"
...@@ -877,6 +986,7 @@ def create_signal(signal, chr, z_score_results, step_size): ...@@ -877,6 +986,7 @@ def create_signal(signal, chr, z_score_results, step_size):
The function modifies the signal dictionary in place. The function modifies the signal dictionary in place.
""" """
logging.info(f"Entering create_signal for {chr} (include copy_number_level)") logging.info(f"Entering create_signal for {chr} (include copy_number_level)")
start_time = time.time()
if chr not in signal: if chr not in signal:
signal[chr] = {} signal[chr] = {}
...@@ -884,7 +994,10 @@ def create_signal(signal, chr, z_score_results, step_size): ...@@ -884,7 +994,10 @@ def create_signal(signal, chr, z_score_results, step_size):
pos = (i * step_size) + 1 pos = (i * step_size) + 1
signal[chr][pos] = z_score_results[i] signal[chr][pos] = z_score_results[i]
logging.info(f"Leaving create_signal for {chr} (include copy_number_level)") #sys.stderr.write("\t signal %s\n" % signal[chr])
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving create_signal for {chr} (include copy_number_level) (Time taken: {elapsed_time:.4f} seconds)")
def detect_events( def detect_events(
z_score_results, z_score_results,
...@@ -920,27 +1033,28 @@ def detect_events( ...@@ -920,27 +1033,28 @@ def detect_events(
The function modifies the events dictionary in place. The function modifies the events dictionary in place.
""" """
logging.info(f"Entering detect_events for {chr}") logging.info(f"Entering detect_events for {chr}")
start_time = time.time()
c = 0 c = 0
for i, z_score in enumerate(z_score_results):
if (z_score <= -zscore_threshold) or (z_score >= zscore_threshold):
if chr not in events: if chr not in events:
events[chr] = {} events[chr] = {}
if med_ratio == 0: for i, z_score in enumerate(z_score_results):
c = 0 if abs(z_score) >= zscore_threshold:
else: if med_ratio != 0:
c = cn_level(float(ratio_par_mean_ratio_results[i]), chr) c = cn_level(float(ratio_par_mean_ratio_results[i]), chr)
if z_score >= 0:
if z_score >= zscore_threshold:
c = 3 c = 3
elif c == 2 and z_score <= -zscore_threshold: elif c == 2 and z_score < 0:
c = 1 c = 1
pos_start = (i * step_size) + 1 pos_start = (i * step_size) + 1
pos_end = pos_start + window_size pos_end = pos_start + window_size
events[chr][(pos_start, pos_end)] = c events[chr][(pos_start, pos_end)] = c
#sys.stderr.write("\t events %s\n" % events)
logging.info(f"Leaving detect_events for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving detect_events for {chr} (Time taken: {elapsed_time:.4f} seconds)")
def segmentation(events, segment, chr): def segmentation(events, segment, chr):
""" """
...@@ -961,15 +1075,17 @@ def segmentation(events, segment, chr): ...@@ -961,15 +1075,17 @@ def segmentation(events, segment, chr):
The function modifies the segment dictionary in place. The function modifies the segment dictionary in place.
""" """
logging.info(f"Entering segmentation for {chr}") logging.info(f"Entering segmentation for {chr}")
start_time = time.time()
for k in events.keys(): for k in events.keys():
sys.stderr.write("\tfor chromosome %s\n" % k) sys.stderr.write("\tfor chromosome %s\n" % k)
starts, oldPos, oldLevel = 0, 0, -1 starts, oldPos, oldLevel = 1, 1, -1
for p in sorted(events[k].keys()): for p in sorted(events[k].keys()):
level = events[k][p] level = events[k][p]
#sys.stderr.write("\t p %s\n" % p)
# new coordinates # new coordinates
if p[0] > (oldPos + window_size): if p[0] > (oldPos + window_size):
if (starts != 0) and (starts != p[0]): if (starts != 1) and (starts != p[0]):
if k not in segment: if k not in segment:
segment[k] = {} segment[k] = {}
index = str(starts) + "-" + str(oldPos) index = str(starts) + "-" + str(oldPos)
...@@ -997,7 +1113,9 @@ def segmentation(events, segment, chr): ...@@ -997,7 +1113,9 @@ def segmentation(events, segment, chr):
oldLevel = level oldLevel = level
oldPos, oldLevel = p[0], level oldPos, oldLevel = p[0], level
logging.info(f"Leaving segmentation for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving segmentation for {chr} (Time taken: {elapsed_time:.4f} seconds)")
def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr): def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr):
""" """
...@@ -1025,6 +1143,7 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr) ...@@ -1025,6 +1143,7 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr)
""" """
global header_written global header_written
logging.info(f"Entering display_results_vcf for {chr}") logging.info(f"Entering display_results_vcf for {chr}")
start_time = time.time()
with open(output_file, "a") as f: with open(output_file, "a") as f:
if not header_written: if not header_written:
...@@ -1049,11 +1168,14 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr) ...@@ -1049,11 +1168,14 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr)
header_written = True header_written = True
for k in segment.keys(): for k in segment.keys():
#sys.stderr.write("\tfor chromosome %s\n" % k)
for elt in sorted(segment[k].keys()): for elt in sorted(segment[k].keys()):
#sys.stderr.write("\tfor elt %s\n" % elt)
if segment[k][elt]["start"] == segment[k][elt]["end"]: if segment[k][elt]["start"] == segment[k][elt]["end"]:
continue continue
if (segment[k][elt]["end"] - segment[k][elt]["start"]) < lengthFilter: if (segment[k][elt]["end"] - segment[k][elt]["start"]) < lengthFilter:
continue continue
#sys.stderr.write("\t [segment[k][elt] %s\n" % [segment[k][elt]])
if int(signal[k][segment[k][elt]["start"]]) < 0: if int(signal[k][segment[k][elt]["start"]]) < 0:
f.write( f.write(
"%s\t%s\t.\tN\t<DEL>\t.\t.\tSVTYPE=DEL;END=%s;VALUE=%s\tGT:GQ\t./.:0\n" "%s\t%s\t.\tN\t<DEL>\t.\t.\tSVTYPE=DEL;END=%s;VALUE=%s\tGT:GQ\t./.:0\n"
...@@ -1075,7 +1197,9 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr) ...@@ -1075,7 +1197,9 @@ def display_results_vcf(sample, segment, signal, lengthFilter, output_file, chr)
) )
) )
logging.info(f"Leaving display_results_vcf for {chr}") end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving display_results_vcf for {chr} (Time taken: {elapsed_time:.4f} seconds)")
def main_calcul( def main_calcul(
...@@ -1130,9 +1254,10 @@ def main_calcul( ...@@ -1130,9 +1254,10 @@ def main_calcul(
# Appeler les différentes fonctions # Appeler les différentes fonctions
map_data = calcul_mappability(seq_length, mappability, chr) map_data = calcul_mappability(seq_length, mappability, chr)
gc_data = calcul_gc_content(seq_length, chr, seq) gc_data = calcul_gc_content(seq_length, chr, seq)
depth_data = calcul_depth_seq(seq_length, bamfile_path, chr) #depth_data = calcul_depth_seq(seq_length, bamfile_path, chr)
#depth_data_count = calcul_depth_seq_count(seq_length, bamfile_path, chr) #depth_data_count = calcul_depth_seq_count(seq_length, bamfile_path, chr)
#depth_data = calcul_depth_seq_copy(seq_length, bamfile_path, chr) #depth_data = calcul_depth_seq_copy(seq_length, bamfile_path, chr)
depth_data = calcul_depth_seq_samtools(seq_length, bamfile_path, chr)
# Transférer le tableau NumPy vers CUDA # Transférer le tableau NumPy vers CUDA
d_depth_data = cuda.mem_alloc(depth_data.nbytes) d_depth_data = cuda.mem_alloc(depth_data.nbytes)
...@@ -1353,8 +1478,8 @@ def main_calcul( ...@@ -1353,8 +1478,8 @@ def main_calcul(
context.synchronize() context.synchronize()
# Copier les résultats depuis le périphérique CUDA vers l'hôte # Copier les résultats depuis le périphérique CUDA vers l'hôte
# cuda.memcpy_dtoh(dest, src)
cuda.memcpy_dtoh(depth_results, d_depth_results) cuda.memcpy_dtoh(depth_results, d_depth_results) # cuda.memcpy_dtoh(dest, src)
sys.stderr.write( sys.stderr.write(
"\t Copie les resultats du GPU (d_depth_results) vers le CPU (depth_results)\n" "\t Copie les resultats du GPU (d_depth_results) vers le CPU (depth_results)\n"
) )
...@@ -1521,16 +1646,23 @@ gc_file = "/work/gad/shared/pipeline/grch38/index/grch38_essential.fa" ...@@ -1521,16 +1646,23 @@ gc_file = "/work/gad/shared/pipeline/grch38/index/grch38_essential.fa"
mappability_file = "/work/gad/shared/pipeline/grch38/cnv/k100.Umap.MultiTrackMappability.bedgraph" mappability_file = "/work/gad/shared/pipeline/grch38/cnv/k100.Umap.MultiTrackMappability.bedgraph"
seq = parse_fasta(gc_file) seq = parse_fasta(gc_file)
mappability = dico_mappabilite(mappability_file) mappability = dico_mappabilite(mappability_file)
#bamfile_path_2 = "/work/gad/shared/analyse/test/cnvGPU/test_scalability/dijen1000.bam"
if num_chr == "ALL" : if num_chr == "ALL" :
with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle: with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle:
for i, seq_length in enumerate(bamfile_handle.lengths): for i, seq_length in enumerate(bamfile_handle.lengths):
chr = bamfile_handle.references[i] chr = bamfile_handle.references[i]
#Exclude chrM
if chr == "chrM":
continue
sys.stderr.write("Chromosome : %s, seq length : %s\n" % (chr, seq_length)) sys.stderr.write("Chromosome : %s, seq length : %s\n" % (chr, seq_length))
# Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome # Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome
main_calcul( main_calcul(
bamfile_handle, bamfile_path,
chr, chr,
seq_length, seq_length,
window_size, window_size,
...@@ -1557,7 +1689,7 @@ else : ...@@ -1557,7 +1689,7 @@ else :
# Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome # Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome
main_calcul( main_calcul(
bamfile_handle, bamfile_path,
chr, chr,
seq_length, seq_length,
window_size, window_size,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment