Commit c68a9914 authored by Theo Serralta's avatar Theo Serralta

Correct normalisation (don't work)

parent 26feb9eb
...@@ -218,33 +218,44 @@ __global__ void normalize_depth_kernel(float *depth_correction, float *gc_result ...@@ -218,33 +218,44 @@ __global__ void normalize_depth_kernel(float *depth_correction, float *gc_result
// Kernel pour calculer le ratio par window // Kernel pour calculer le ratio par window
__global__ void ratio_par_window_kernel(float *depth_normalize_val, float mean_chr, int seq_length, int window_size, int step_size, float *ratio_par_window_results) { __global__ void ratio_par_window_kernel(float *avg_depth, float mean_chr, int seq_length, int window_size, int step_size, float *ratio_par_window_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) { if (idx < seq_length - window_size + 1) {
float ratio = depth_normalize_val[idx] / mean_chr; float ratio = depth_results[idx] / mean_chr;
ratio_par_window_results[idx] = ratio; ratio_par_window_results[idx] = ratio;
} }
} }
// Kernel pour calculer le ratio normalisé par window
__global__ void ratio_par_window_norm_kernel(float *depth_normalize_val, float mean_chr_norm, int seq_length, int window_size, int step_size, float *ratio_par_window_norm_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) {
float ratio_norm = depth_normalize_val[idx] / mean_chr_norm;
ratio_par_window_norm_results[idx] = ratio_norm;
}
}
// Kernel pour calculer le z_score par window sur le ratio // Kernel pour calculer le z_score par window sur le ratio
__global__ void z_score_kernel(float *ratio, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) { __global__ void z_score_kernel(float *ratio_norm, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) { if (idx < seq_length - window_size + 1) {
float z_score = (ratio[idx] - mean_ratio) / std_ratio; float z_score = (ratio_norm[idx] - mean_ratio) / std_ratio;
z_score_results[idx] = z_score; z_score_results[idx] = z_score;
} }
} }
// Kernel pour calculer le ratio divise par le ratio moyen // Kernel pour calculer le ratio divise par le ratio moyen
__global__ void ratio_par_mean_ratio_kernel(float *ratio, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) { __global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) { if (idx < seq_length - window_size + 1) {
float ratio_mean_ratio = ratio[idx] / mean_ratio; float ratio_mean_ratio = ratio_norm[idx] / mean_ratio;
ratio_par_mean_ratio_results[idx] = ratio_mean_ratio; ratio_par_mean_ratio_results[idx] = ratio_mean_ratio;
} }
} }
...@@ -259,6 +270,7 @@ calcul_map_kernel_cuda = mod.get_function("calcul_map_kernel") ...@@ -259,6 +270,7 @@ calcul_map_kernel_cuda = mod.get_function("calcul_map_kernel")
calcul_depth_correction_kernel_cuda = mod.get_function("calcul_depth_correction_kernel") calcul_depth_correction_kernel_cuda = mod.get_function("calcul_depth_correction_kernel")
normalize_depth_kernel_cuda = mod.get_function("normalize_depth_kernel") normalize_depth_kernel_cuda = mod.get_function("normalize_depth_kernel")
ratio_par_window_kernel_cuda = mod.get_function("ratio_par_window_kernel") ratio_par_window_kernel_cuda = mod.get_function("ratio_par_window_kernel")
ratio_par_window_norm_kernel_cuda = mod.get_function("ratio_par_window_norm_kernel")
z_score_kernel_cuda = mod.get_function("z_score_kernel") z_score_kernel_cuda = mod.get_function("z_score_kernel")
ratio_par_mean_ratio_kernel_cuda = mod.get_function("ratio_par_mean_ratio_kernel") ratio_par_mean_ratio_kernel_cuda = mod.get_function("ratio_par_mean_ratio_kernel")
...@@ -540,59 +552,7 @@ def calcul_depth_seq(seq_length, bamfile_path, chr): ...@@ -540,59 +552,7 @@ def calcul_depth_seq(seq_length, bamfile_path, chr):
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)") logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data return depth_data
def calcul_depth_seq_copy(seq_length, bamfile_path, chr):
"""
Calculate the sequencing depth for a given chromosome.
This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
Parameters
----------
seq_length : int
The length of the sequence.
bamfile_path : pysam.AlignmentFile
The path to the BAM file opened with pysam.AlignmentFile.
chr : str
The chromosome for which the depth is calculated.
Returns
-------
numpy.ndarray
An array of integers representing the sequencing depth for the given sequence length.
"""
logging.info(f"Entering calcul_depth_seq for {chr}")
start_time = time.time()
depth_data = np.zeros(seq_length, dtype=np.int32)
i = 0
it = bamfile_path.pileup(reference = chr)
while i < (seq_length - 1):
t = next(it)
pos = t.reference_pos
depth_data[pos] = t.nsegments
print(i)
i += 1
#for pileupcolumn in bamfile_path.pileup():
# pos = pileupcolumn.reference_pos
# if pos >= seq_length:
# break
#depth_data[pos] = pileupcolumn.nsegments
#print(bamfile_path.pileup()) print("########################")
# depth_data = np.array(bamfile_path.pileup().nsegments)
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return depth_data
def calcul_depth_seq_samtools(seq_length, bamfile_path, chr, num_threads=12): def calcul_depth_seq_samtools(seq_length, bamfile_path, chr, num_threads=12):
""" """
Calculate the sequencing depth for a given chromosome using a parallelized bash script. Calculate the sequencing depth for a given chromosome using a parallelized bash script.
...@@ -754,16 +714,15 @@ def calcul_moy_totale(normalize_depth_results, chr): ...@@ -754,16 +714,15 @@ def calcul_moy_totale(normalize_depth_results, chr):
# Filter results to remove zero values # Filter results to remove zero values
non_zero_results = normalize_depth_results[normalize_depth_results != 0] non_zero_results = normalize_depth_results[normalize_depth_results != 0]
# Calculate the mean of non-zero results # Calculate the mean of non-zero results
mean_chr = np.mean(non_zero_results) if non_zero_results.size > 0 else 0 mean_chr_norm = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
sys.stderr.write("Chromosome : %s, mean_chr : %s\n" % (chr, mean_chr)) sys.stderr.write("Chromosome : %s, mean_chr_norm : %s\n" % (chr, mean_chr_norm))
end_time = time.time() end_time = time.time()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
logging.info(f"Leaving calcul_moy_totale for {chr} (Time taken: {elapsed_time:.4f} seconds)") logging.info(f"Leaving calcul_moy_totale for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return mean_chr return mean_chr_norm
def calcul_std(normalize_depth_results, chr): def calcul_std(normalize_depth_results, chr):
...@@ -801,7 +760,7 @@ def calcul_std(normalize_depth_results, chr): ...@@ -801,7 +760,7 @@ def calcul_std(normalize_depth_results, chr):
return std_chr return std_chr
def compute_mean_std_med(ratio_par_window_results, chr): def compute_mean_std_med(ratio_par_window_norm_results, chr):
""" """
Compute the mean, standard deviation, and median of non-zero ratio results per window. Compute the mean, standard deviation, and median of non-zero ratio results per window.
...@@ -809,7 +768,7 @@ def compute_mean_std_med(ratio_par_window_results, chr): ...@@ -809,7 +768,7 @@ def compute_mean_std_med(ratio_par_window_results, chr):
Parameters Parameters
---------- ----------
ratio_par_window_results : list or numpy.ndarray ratio_par_window_norm_results : list or numpy.ndarray
A list or array of ratio values per window. A list or array of ratio values per window.
Returns Returns
...@@ -821,8 +780,8 @@ def compute_mean_std_med(ratio_par_window_results, chr): ...@@ -821,8 +780,8 @@ def compute_mean_std_med(ratio_par_window_results, chr):
start_time = time.time() start_time = time.time()
# Filter results to remove zero and -1 values # Filter results to remove zero and -1 values
ratio_par_window_results = np.array(ratio_par_window_results) ratio_par_window_norm_results = np.array(ratio_par_window_norm_results)
non_zero_results = ratio_par_window_results[ratio_par_window_results != 0] non_zero_results = ratio_par_window_norm_results[ratio_par_window_norm_results != 0]
# Initialize list for stats computation # Initialize list for stats computation
table = [] table = []
...@@ -1193,7 +1152,7 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3): ...@@ -1193,7 +1152,7 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3):
for depths in depth_arrays: for depths in depth_arrays:
if len(depths) > 0: if len(depths) > 0:
indices = np.arrange(len(depths)) indices = np.arange(len(depths))
p = Polynomial.fit(indices, depths, degree) p = Polynomial.fit(indices, depths, degree)
polynomials.append(p) polynomials.append(p)
else: else:
...@@ -1204,7 +1163,45 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3): ...@@ -1204,7 +1163,45 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3):
logging.info(f"Leaving calc_polynom for {chr} (Time taken: {elapsed_time:.4f} seconds)") logging.info(f"Leaving calc_polynom for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return polynomials return polynomials
def calc_mean_chr(depth_results, chr)
logging.info(f"Entering calc_mean_chr for {chr}")
start_time = time.time()
depth_results = np.array(depth_results)
# Filter results to remove zero values
non_zero_results = depth_results[depth_results != 0]
# Calculate the mean of non-zero results
mean_chr = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
sys.stderr.write("Chromosome : %s, mean_chr : %s\n" % (chr, mean_chr))
end_time = time.time()
elapsed_time = end_time - start_time
logging.info(f"Leaving dinf_polynom for {chr} (Time taken: {elapsed_time:.4f} seconds)")
return mean_chr
def find_polynom(ratio_par_window_results, chr, polynomials
ratio_par_window_results = np.array(ratio_par_window_results)
cn = ratio_par_window_results * 2
cn_to_test = []
for i in cn:
if i >= 1.5 and i < 2.5:
cn_to_test.append(i)
cn_unique = np.unique(cn_to_test)
knorm = np.argmin(cn_unique) * abs(polynomial
def normalisation(
normalize_deth = []
for i in depth_results:
normalize_depth[i] = depth_results[i] / (polynom * map_data[i])
def main_calcul( def main_calcul(
bamfile_path, bamfile_path,
chr, chr,
...@@ -1258,7 +1255,6 @@ def main_calcul( ...@@ -1258,7 +1255,6 @@ def main_calcul(
map_data = calcul_mappability(seq_length, mappability, chr) map_data = calcul_mappability(seq_length, mappability, chr)
gc_data = calcul_gc_content(seq_length, chr, seq) gc_data = calcul_gc_content(seq_length, chr, seq)
#depth_data = calcul_depth_seq(seq_length, bamfile_path, chr) #depth_data = calcul_depth_seq(seq_length, bamfile_path, chr)
#depth_data = calcul_depth_seq_copy(seq_length, bamfile_path, chr)
depth_data = calcul_depth_seq_samtools(seq_length, bamfile_path, chr) depth_data = calcul_depth_seq_samtools(seq_length, bamfile_path, chr)
# Transférer le tableau NumPy vers CUDA # Transférer le tableau NumPy vers CUDA
...@@ -1320,6 +1316,11 @@ def main_calcul( ...@@ -1320,6 +1316,11 @@ def main_calcul(
) )
sys.stderr.write("\t Definition de ratio_par_window\n") sys.stderr.write("\t Definition de ratio_par_window\n")
ratio_par_window_norm_results = np.zeros(
int((seq_length - window_size) / step_size) + 1, dtype=np.float32
)
sys.stderr.write("\t Definition de ratio_par_window_norm\n")
z_score_results = np.zeros( z_score_results = np.zeros(
int((seq_length - window_size) / step_size) + 1, dtype=np.float32 int((seq_length - window_size) / step_size) + 1, dtype=np.float32
) )
...@@ -1382,6 +1383,17 @@ def main_calcul( ...@@ -1382,6 +1383,17 @@ def main_calcul(
"\t ratio_par_window_results.nbytes = %s\n" % ratio_par_window_results.nbytes "\t ratio_par_window_results.nbytes = %s\n" % ratio_par_window_results.nbytes
) )
d_ratio_par_window_norm_results = cuda.mem_alloc(ratio_par_window_norm_results.nbytes)
sys.stderr.write(
"\t d_ratio_par_window_norm_results = %s\n"
% d_ratio_par_window_norm_results.as_buffer(
sys.getsizeof(d_ratio_par_window_norm_results)
)
)
sys.stderr.write(
"\t ratio_par_window_norm_results.nbytes = %s\n" % ratio_par_window_norm_results.nbytes
)
d_z_score_results = cuda.mem_alloc(z_score_results.nbytes) d_z_score_results = cuda.mem_alloc(z_score_results.nbytes)
sys.stderr.write( sys.stderr.write(
"\t d_z_score_results = %s\n" "\t d_z_score_results = %s\n"
...@@ -1482,6 +1494,27 @@ def main_calcul( ...@@ -1482,6 +1494,27 @@ def main_calcul(
#Appel fonction calcul polynomes #Appel fonction calcul polynomes
polynomials = calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3) polynomials = calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3)
#Appel fonction calcul moyenne
mean_chr = calc_mean_chr(depth_results, chr)
# Appeler le kernel de ratio
ratio_par_window_kernel_cuda(
d_depth_results,
np.float32(mean_chr),
np.int32(seq_length),
np.int32(window_size),
np.int32(step_size),
d_ratio_par_window_results,
block=(block_size, 1, 1),
grid=(grid_size, 1),
)
sys.stderr.write("\t appel fonction ratio_par_window_kernel_cuda\n")
context.synchronize()
# Copier les resultats ratio depuis le peripherique CUDA vers l'hote
cuda.memcpy_dtoh(ratio_par_window_results, d_ratio_par_window_results)
# Appel fonctions medianes # Appel fonctions medianes
sys.stderr.write("\t appel fonctions calcul medianes\n") sys.stderr.write("\t appel fonctions calcul medianes\n")
m = calcul_med_total(depth_correction_results, chr) m = calcul_med_total(depth_correction_results, chr)
...@@ -1522,32 +1555,32 @@ def main_calcul( ...@@ -1522,32 +1555,32 @@ def main_calcul(
# Appel fonction moyenne # Appel fonction moyenne
sys.stderr.write("\t appel fonction calcul moyenne\n") sys.stderr.write("\t appel fonction calcul moyenne\n")
mean_chr = calcul_moy_totale(normalize_depth_results, chr) mean_chr_norm = calcul_moy_totale(normalize_depth_results, chr)
# Appeler le kernel de ratio # Appeler le kernel de ratio normalise
ratio_par_window_kernel_cuda( ratio_par_window_norm_kernel_cuda(
d_normalize_depth_results, d_normalize_depth_results,
np.float32(mean_chr), np.float32(mean_chr_norm),
np.int32(seq_length), np.int32(seq_length),
np.int32(window_size), np.int32(window_size),
np.int32(step_size), np.int32(step_size),
d_ratio_par_window_results, d_ratio_par_window_norm_results,
block=(block_size, 1, 1), block=(block_size, 1, 1),
grid=(grid_size, 1), grid=(grid_size, 1),
) )
sys.stderr.write("\t appel fonction ratio_par_window_kernel_cuda\n") sys.stderr.write("\t appel fonction ratio_par_window_norm_kernel_cuda\n")
context.synchronize() context.synchronize()
# Copier les resultats ratio depuis le peripherique CUDA vers l'hote # Copier les resultats ratio depuis le peripherique CUDA vers l'hote
cuda.memcpy_dtoh(ratio_par_window_results, d_ratio_par_window_results) cuda.memcpy_dtoh(ratio_par_window_norm_results, d_ratio_par_window_norm_results)
# Création table à partir du ratio # Création table à partir du ratio
mean_ratio, std_ratio, med_ratio = compute_mean_std_med(ratio_par_window_results, chr) mean_ratio, std_ratio, med_ratio = compute_mean_std_med(ratio_par_window_norm_results, chr)
# Appeler le kernel de calcule du ratio divisé par le ratio moyen # Appeler le kernel de calcule du ratio divisé par le ratio moyen
ratio_par_mean_ratio_kernel_cuda( ratio_par_mean_ratio_kernel_cuda(
d_ratio_par_window_results, d_ratio_par_window_norm_results,
np.float32(mean_ratio), np.float32(mean_ratio),
np.int32(seq_length), np.int32(seq_length),
np.int32(window_size), np.int32(window_size),
...@@ -1560,7 +1593,7 @@ def main_calcul( ...@@ -1560,7 +1593,7 @@ def main_calcul(
# Appeler le kernel de Z-score # Appeler le kernel de Z-score
z_score_kernel_cuda( z_score_kernel_cuda(
d_ratio_par_window_results, d_ratio_par_window_norm_results,
np.float32(mean_ratio), np.float32(mean_ratio),
np.float32(std_ratio), np.float32(std_ratio),
np.int32(seq_length), np.int32(seq_length),
...@@ -1600,10 +1633,10 @@ def main_calcul( ...@@ -1600,10 +1633,10 @@ def main_calcul(
#Ecrire les résultats dans le fichier de sortie #Ecrire les résultats dans le fichier de sortie
#with open(output_file, 'a') as f: #with open(output_file, 'a') as f:
# sys.stderr.write("\t ecriture des fichiers\n") # sys.stderr.write("\t ecriture des fichiers\n")
# for i, (depth_data, depth_correction, depth_normalize_val, gc_data, map_data, ratio, ratio_mean_ratio, z_score) in enumerate(zip(depth_results, depth_correction_results, normalize_depth_results, gc_results, map_results, ratio_par_window_results, ratio_par_mean_ratio_results, z_score_results)): # for i, (depth_data, depth_correction, depth_normalize_val, gc_data, map_data, ratio_norm, ratio_mean_ratio, z_score) in enumerate(zip(depth_results, depth_correction_results, normalize_depth_results, gc_results, map_results, ratio_par_window_norm_results, ratio_par_mean_ratio_results, z_score_results)):
# pos_start = (i * step_size) + 1 # pos_start = (i * step_size) + 1
# pos_end = pos_start + window_size # pos_end = pos_start + window_size
# f.write(f"{chr}\t{pos_start}\t{pos_end}\t{depth_data}\t{depth_correction}\t{depth_normalize_val}\t{gc_data}\t{map_data}\t{ratio}\t{ratio_mean_ratio}\t{z_score}\n") # f.write(f"{chr}\t{pos_start}\t{pos_end}\t{depth_data}\t{depth_correction}\t{depth_normalize_val}\t{gc_data}\t{map_data}\t{ratio_norm}\t{ratio_mean_ratio}\t{z_score}\n")
# Programme principal # Programme principal
# Calcul nombre de coeurs max pour le GPU # Calcul nombre de coeurs max pour le GPU
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment