Correct normalisation (don't work)

c68a9914 · Theo Serralta · 26feb9eb · c68a9914
Commit c68a9914 authored Jul 19, 2024 by Theo Serralta
Hide whitespace changes
Inline Side-by-side

Showing with 113 additions and 80 deletions

test_gpu_time_normalisation.py CNV/test/test_gpu_time_normalisation.py +113 -80

No files found.
--- a/CNV/test/test_gpu_time_normalisation.py
+++ b/CNV/test/test_gpu_time_normalisation.py
@@ -218,33 +218,44 @@ __global__ void normalize_depth_kernel(float *depth_correction, float *gc_result
 // Kernel pour calculer le ratio par window
-__global__ void ratio_par_window_kernel(float *depth_normalize_val, float mean_chr, int seq_length, int window_size, int step_size, float *ratio_par_window_results) {
+__global__ void ratio_par_window_kernel(float *avg_depth, float mean_chr, int seq_length, int window_size, int step_size, float *ratio_par_window_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < seq_length - window_size + 1) {
-        float ratio = depth_normalize_val[idx] / mean_chr;
+        float ratio = depth_results[idx] / mean_chr;
        ratio_par_window_results[idx] = ratio;
    }
 }
+// Kernel pour calculer le ratio normalisé par window
+__global__ void ratio_par_window_norm_kernel(float *depth_normalize_val, float mean_chr_norm, int seq_length, int window_size, int step_size, float *ratio_par_window_norm_results) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx < seq_length - window_size + 1) {
+        float ratio_norm = depth_normalize_val[idx] / mean_chr_norm;
+        ratio_par_window_norm_results[idx] = ratio_norm;
+    }
+}
 // Kernel pour calculer le z_score par window sur le ratio
-__global__ void z_score_kernel(float *ratio, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) {
+__global__ void z_score_kernel(float *ratio_norm, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < seq_length - window_size + 1) {
-        float z_score = (ratio[idx] - mean_ratio) / std_ratio;
+        float z_score = (ratio_norm[idx] - mean_ratio) / std_ratio;
        z_score_results[idx] = z_score;
    }
 }
 // Kernel pour calculer le ratio divise par le ratio moyen
-__global__ void ratio_par_mean_ratio_kernel(float *ratio, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) {
+__global__ void ratio_par_mean_ratio_kernel(float *ratio_norm, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < seq_length - window_size + 1) {
-        float ratio_mean_ratio = ratio[idx] / mean_ratio;
+        float ratio_mean_ratio = ratio_norm[idx] / mean_ratio;
        ratio_par_mean_ratio_results[idx] = ratio_mean_ratio;
    }
 }
@@ -259,6 +270,7 @@ calcul_map_kernel_cuda = mod.get_function("calcul_map_kernel")
 calcul_depth_correction_kernel_cuda = mod.get_function("calcul_depth_correction_kernel")
 normalize_depth_kernel_cuda = mod.get_function("normalize_depth_kernel")
 ratio_par_window_kernel_cuda = mod.get_function("ratio_par_window_kernel")
+ratio_par_window_norm_kernel_cuda = mod.get_function("ratio_par_window_norm_kernel")
 z_score_kernel_cuda = mod.get_function("z_score_kernel")
 ratio_par_mean_ratio_kernel_cuda = mod.get_function("ratio_par_mean_ratio_kernel")
@@ -540,59 +552,7 @@ def calcul_depth_seq(seq_length, bamfile_path, chr):
    logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
    return depth_data
-def calcul_depth_seq_copy(seq_length, bamfile_path, chr):
-    """
-    Calculate the sequencing depth for a given chromosome.
-    This function computes the sequencing depth for a specific chromosome and sequence length using a BAM file.
-    Parameters
-    ----------
-    seq_length : int
-        The length of the sequence.
-    bamfile_path : pysam.AlignmentFile
-        The path to the BAM file opened with pysam.AlignmentFile.
-    chr : str
-        The chromosome for which the depth is calculated.
-    Returns
-    -------
-    numpy.ndarray
-        An array of integers representing the sequencing depth for the given sequence length.
-    """
-    logging.info(f"Entering calcul_depth_seq for {chr}")
-    start_time = time.time()
-    depth_data = np.zeros(seq_length, dtype=np.int32)
-    i = 0
-    it = bamfile_path.pileup(reference = chr)
-    while i < (seq_length - 1):
-        t = next(it)
-        pos = t.reference_pos
-        depth_data[pos] = t.nsegments
-        print(i)
-        i += 1
-    #for pileupcolumn in bamfile_path.pileup():
-      #  pos = pileupcolumn.reference_pos
-     #   if pos >= seq_length:
-       #     break
-        #depth_data[pos] = pileupcolumn.nsegments
-    #print(bamfile_path.pileup())    print("########################")
-#    depth_data = np.array(bamfile_path.pileup().nsegments)
-    end_time = time.time()
-    elapsed_time = end_time - start_time
-    logging.info(f"Leaving calcul_depth_seq for {chr} (Time taken: {elapsed_time:.4f} seconds)")
-    return depth_data
 def calcul_depth_seq_samtools(seq_length, bamfile_path, chr, num_threads=12):
    """
    Calculate the sequencing depth for a given chromosome using a parallelized bash script.
@@ -754,16 +714,15 @@ def calcul_moy_totale(normalize_depth_results, chr):
    # Filter results to remove zero values
    non_zero_results = normalize_depth_results[normalize_depth_results != 0]
    # Calculate the mean of non-zero results
-    mean_chr = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
+    mean_chr_norm = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
-    sys.stderr.write("Chromosome : %s, mean_chr : %s\n" % (chr, mean_chr))
+    sys.stderr.write("Chromosome : %s, mean_chr_norm : %s\n" % (chr, mean_chr_norm))
    end_time = time.time()
    elapsed_time = end_time - start_time
    logging.info(f"Leaving calcul_moy_totale for {chr} (Time taken: {elapsed_time:.4f} seconds)")    
-    return mean_chr
+    return mean_chr_norm
 def calcul_std(normalize_depth_results, chr):
@@ -801,7 +760,7 @@ def calcul_std(normalize_depth_results, chr):
    return std_chr
-def compute_mean_std_med(ratio_par_window_results, chr):
+def compute_mean_std_med(ratio_par_window_norm_results, chr):
    """
    Compute the mean, standard deviation, and median of non-zero ratio results per window.
@@ -809,7 +768,7 @@ def compute_mean_std_med(ratio_par_window_results, chr):
    Parameters
    ----------
-    ratio_par_window_results : list or numpy.ndarray
+    ratio_par_window_norm_results : list or numpy.ndarray
        A list or array of ratio values per window.
    Returns
@@ -821,8 +780,8 @@ def compute_mean_std_med(ratio_par_window_results, chr):
    start_time = time.time()
    # Filter results to remove zero and -1 values
-    ratio_par_window_results = np.array(ratio_par_window_results)
+    ratio_par_window_norm_results = np.array(ratio_par_window_norm_results)
-    non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
+    non_zero_results = ratio_par_window_norm_results[ratio_par_window_norm_results != 0]
    # Initialize list for stats computation
    table = []
@@ -1193,7 +1152,7 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3):
    for depths in depth_arrays:
        if len(depths) > 0:
-            indices = np.arrange(len(depths))
+            indices = np.arange(len(depths))
            p = Polynomial.fit(indices, depths, degree)
            polynomials.append(p)
        else:
@@ -1204,7 +1163,45 @@ def calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3):
    logging.info(f"Leaving calc_polynom for {chr} (Time taken: {elapsed_time:.4f} seconds)") 
    return polynomials
+def calc_mean_chr(depth_results, chr) 
+    logging.info(f"Entering calc_mean_chr for {chr}")
+    start_time = time.time()   
+    depth_results = np.array(depth_results)
+    # Filter results to remove zero values
+    non_zero_results = depth_results[depth_results != 0]
+    # Calculate the mean of non-zero results
+    mean_chr = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
+    sys.stderr.write("Chromosome : %s, mean_chr : %s\n" % (chr, mean_chr))
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    logging.info(f"Leaving dinf_polynom for {chr} (Time taken: {elapsed_time:.4f} seconds)")    
+    return mean_chr
+def find_polynom(ratio_par_window_results, chr, polynomials
+    ratio_par_window_results = np.array(ratio_par_window_results)
+    cn = ratio_par_window_results * 2
+    cn_to_test = []
+    for i in cn:
+        if i >= 1.5 and i < 2.5:
+            cn_to_test.append(i)
+    cn_unique = np.unique(cn_to_test)
+    knorm = np.argmin(cn_unique) * abs(polynomial
+def normalisation(
+    normalize_deth = []
+    for i in depth_results:
+        normalize_depth[i] = depth_results[i] / (polynom * map_data[i])
 def main_calcul(
    bamfile_path,
    chr,
@@ -1258,7 +1255,6 @@ def main_calcul(
    map_data = calcul_mappability(seq_length, mappability, chr)
    gc_data = calcul_gc_content(seq_length, chr, seq)
    #depth_data = calcul_depth_seq(seq_length, bamfile_path, chr)
-    #depth_data = calcul_depth_seq_copy(seq_length, bamfile_path, chr)
    depth_data = calcul_depth_seq_samtools(seq_length, bamfile_path, chr)
    # Transférer le tableau NumPy vers CUDA
@@ -1320,6 +1316,11 @@ def main_calcul(
    )
    sys.stderr.write("\t Definition de ratio_par_window\n")
+    ratio_par_window_norm_results = np.zeros(
+        int((seq_length - window_size) / step_size) + 1, dtype=np.float32
+    )
+    sys.stderr.write("\t Definition de ratio_par_window_norm\n")
    z_score_results = np.zeros(
        int((seq_length - window_size) / step_size) + 1, dtype=np.float32
    )
@@ -1382,6 +1383,17 @@ def main_calcul(
        "\t ratio_par_window_results.nbytes = %s\n" % ratio_par_window_results.nbytes
    )
+    d_ratio_par_window_norm_results = cuda.mem_alloc(ratio_par_window_norm_results.nbytes)
+    sys.stderr.write(
+        "\t d_ratio_par_window_norm_results = %s\n"
+        % d_ratio_par_window_norm_results.as_buffer(
+            sys.getsizeof(d_ratio_par_window_norm_results)
+        )
+    )
+    sys.stderr.write(
+        "\t ratio_par_window_norm_results.nbytes = %s\n" % ratio_par_window_norm_results.nbytes
+    )
    d_z_score_results = cuda.mem_alloc(z_score_results.nbytes)
    sys.stderr.write(
        "\t d_z_score_results = %s\n"
@@ -1482,6 +1494,27 @@ def main_calcul(
    #Appel fonction calcul polynomes
    polynomials = calc_polynom(gc_35, gc_40, gc_45, gc_50, gc_55, chr, degree = 3)
+    #Appel fonction calcul moyenne
+    mean_chr = calc_mean_chr(depth_results, chr)
+    # Appeler le kernel de ratio
+    ratio_par_window_kernel_cuda(
+        d_depth_results,
+        np.float32(mean_chr),
+        np.int32(seq_length),
+        np.int32(window_size),
+        np.int32(step_size),
+        d_ratio_par_window_results,
+        block=(block_size, 1, 1),
+        grid=(grid_size, 1),
+    )
+    sys.stderr.write("\t appel fonction ratio_par_window_kernel_cuda\n")
+    context.synchronize()
+    # Copier les resultats ratio depuis le peripherique CUDA vers l'hote
+    cuda.memcpy_dtoh(ratio_par_window_results, d_ratio_par_window_results)
    # Appel fonctions medianes
    sys.stderr.write("\t appel fonctions calcul medianes\n")
    m = calcul_med_total(depth_correction_results, chr)
@@ -1522,32 +1555,32 @@ def main_calcul(
    # Appel fonction moyenne
    sys.stderr.write("\t appel fonction calcul moyenne\n")
-    mean_chr = calcul_moy_totale(normalize_depth_results, chr)
+    mean_chr_norm = calcul_moy_totale(normalize_depth_results, chr)
-    # Appeler le kernel de ratio
+    # Appeler le kernel de ratio normalise
-    ratio_par_window_kernel_cuda(
+    ratio_par_window_norm_kernel_cuda(
        d_normalize_depth_results,
-        np.float32(mean_chr),
+        np.float32(mean_chr_norm),
        np.int32(seq_length),
        np.int32(window_size),
        np.int32(step_size),
-        d_ratio_par_window_results,
+        d_ratio_par_window_norm_results,
        block=(block_size, 1, 1),
        grid=(grid_size, 1),
    )
-    sys.stderr.write("\t appel fonction ratio_par_window_kernel_cuda\n")
+    sys.stderr.write("\t appel fonction ratio_par_window_norm_kernel_cuda\n")
    context.synchronize()
    # Copier les resultats ratio depuis le peripherique CUDA vers l'hote
-    cuda.memcpy_dtoh(ratio_par_window_results, d_ratio_par_window_results)
+    cuda.memcpy_dtoh(ratio_par_window_norm_results, d_ratio_par_window_norm_results)
    # Création table à partir du ratio
-    mean_ratio, std_ratio, med_ratio = compute_mean_std_med(ratio_par_window_results, chr)
+    mean_ratio, std_ratio, med_ratio = compute_mean_std_med(ratio_par_window_norm_results, chr)
    # Appeler le kernel de calcule du ratio divisé par le ratio moyen
    ratio_par_mean_ratio_kernel_cuda(
-        d_ratio_par_window_results,
+        d_ratio_par_window_norm_results,
        np.float32(mean_ratio),
        np.int32(seq_length),
        np.int32(window_size),
@@ -1560,7 +1593,7 @@ def main_calcul(
    # Appeler le kernel de Z-score
    z_score_kernel_cuda(
-        d_ratio_par_window_results,
+        d_ratio_par_window_norm_results,
        np.float32(mean_ratio),
        np.float32(std_ratio),
        np.int32(seq_length),
@@ -1600,10 +1633,10 @@ def main_calcul(
    #Ecrire les résultats dans le fichier de sortie
    #with open(output_file, 'a') as f:
     #   sys.stderr.write("\t ecriture des fichiers\n")
-      #  for i, (depth_data, depth_correction, depth_normalize_val, gc_data, map_data, ratio, ratio_mean_ratio, z_score) in enumerate(zip(depth_results, depth_correction_results, normalize_depth_results, gc_results, map_results, ratio_par_window_results, ratio_par_mean_ratio_results, z_score_results)):
+      #  for i, (depth_data, depth_correction, depth_normalize_val, gc_data, map_data, ratio_norm, ratio_mean_ratio, z_score) in enumerate(zip(depth_results, depth_correction_results, normalize_depth_results, gc_results, map_results, ratio_par_window_norm_results, ratio_par_mean_ratio_results, z_score_results)):
       #     pos_start = (i * step_size) + 1
        #    pos_end = pos_start + window_size
-         #   f.write(f"{chr}\t{pos_start}\t{pos_end}\t{depth_data}\t{depth_correction}\t{depth_normalize_val}\t{gc_data}\t{map_data}\t{ratio}\t{ratio_mean_ratio}\t{z_score}\n")
+         #   f.write(f"{chr}\t{pos_start}\t{pos_end}\t{depth_data}\t{depth_correction}\t{depth_normalize_val}\t{gc_data}\t{map_data}\t{ratio_norm}\t{ratio_mean_ratio}\t{z_score}\n")
 # Programme principal
 # Calcul nombre de coeurs max pour le GPU