Finishing code to test on genome

59eeb957 · Theo Serralta · 74f8d0ae · 59eeb957
Commit 59eeb957 authored Jun 11, 2024 by Theo Serralta
Hide whitespace changes
Inline Side-by-side

Showing with 265 additions and 55 deletions

test_gpu_mean_depth.py CNV/test_gpu_mean_depth.py +265 -55

No files found.
--- a/CNV/test_gpu_mean_depth.py
+++ b/CNV/test_gpu_mean_depth.py
@@ -10,13 +10,14 @@ import pycuda.gpuarray as gpuarray
 from pycuda.autoinit import context
 import multiprocessing
 from multiprocessing import Process, Queue
+import time

 # Options

 def parse_arguments():
    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'b:w:s:z:t:o:e:')
-        bamfile_path, window_size, step_size, zscore_threshold, output_file, logfile = None, None, None, None, None, None
+        opts, args = getopt.getopt(sys.argv[1:], 'b:w:s:z:l:t:o:e:')
+        bamfile_path, window_size, step_size, zscore_threshold, lengthFilter, output_file, logfile = None, None, None, None, None, None, None
        for opt, arg in opts:
            if opt in ("-b"):
                bamfile_path = arg
@@ -26,17 +27,19 @@ def parse_arguments():
                step_size = int(arg)
            if opt in ("-z"):
                zscore_threshold = float(arg)
+            if opt in ("-l"):
+                lengthFilter = int(arg)
            if opt in ("-o"):
                output_file = arg
            if opt in ("-e"):
                logfile = arg
-        return bamfile_path, window_size, step_size, zscore_threshold, output_file, logfile
+        return bamfile_path, window_size, step_size, zscore_threshold, lengthFilter, output_file, logfile
    except getopt.GetoptError:
        print('Invalid argument')
        sys.exit(1)

 if __name__ == '__main__':
-    bamfile_path, window_size, step_size, zscore_threshold, output_file, logfile = parse_arguments()
+    bamfile_path, window_size, step_size, zscore_threshold, lengthFilter, output_file, logfile = parse_arguments()
    logging.basicConfig(filename='%s' % (logfile), filemode='a', level=logging.INFO, format='%(asctime)s %(levelname)s - %(message)s')
    logging.info('start')
    global seq
@@ -153,16 +156,28 @@ __global__ void ratio_par_window_kernel(float *depth_normalize_val, float mean_c
    }
 }

-// Kernel pour calculer le z_score par window
+// Kernel pour calculer le z_score par window sur le ratio

-__global__ void z_score_kernel(float *depth_normalize_val, float mean_chr, float std_chr, int seq_length, int window_size, int step_size, float *z_score_results) {
+__global__ void z_score_kernel(float *ratio, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < seq_length - window_size + 1) {
-        float z_score = (depth_normalize_val[idx] - mean_chr) / std_chr;
+        float z_score = (ratio[idx] - mean_ratio) / std_ratio;
        z_score_results[idx] = z_score;
    }
 }
+
+// Kernel pour calculer le ratio divise par le ratio moyen
+
+__global__ void ratio_par_mean_ratio_kernel(float *ratio, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) {
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+    if (idx < seq_length - window_size + 1) {
+        float ratio_mean_ratio = ratio[idx] / mean_ratio;
+        ratio_par_mean_ratio_results[idx] = ratio_mean_ratio;
+    }
+}
+
 """)
    
 # Obtention de la fonction de kernel compilée
@@ -173,6 +188,7 @@ calcul_depth_correction_kernel_cuda = mod.get_function("calcul_depth_correction_
 normalize_depth_kernel_cuda = mod.get_function("normalize_depth_kernel")
 ratio_par_window_kernel_cuda = mod.get_function("ratio_par_window_kernel")
 z_score_kernel_cuda = mod.get_function("z_score_kernel")
+ratio_par_mean_ratio_kernel_cuda = mod.get_function("ratio_par_mean_ratio_kernel")

 #############################################
 ######<---Fonctions  mappability--->#########
@@ -341,8 +357,19 @@ def calcul_moy_totale(normalize_depth_results):
    sys.stderr.write("\t Leaving calcul_moy_totale\n")
    return mean_chr

+def calcul_moy_totale_ratio(ratio_par_window_results):
+    sys.stderr.write("\t entering calcul_moy_totale_ratio\n")
+    ratio_par_window_results = np.array(ratio_par_window_results)
+    # Filtrer les résultats pour enlever les valeurs égales à 0
+    non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
+    # Calculer la mpyenne des résultats non nuls
+    mean_ratio = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
+    print(mean_ratio)
+    sys.stderr.write("\t Leaving calcul_moy_totale_ratio\n")
+    return mean_ratio
+
 #############################################
-######<---Fonction calcul std--->#####
+######<---Fonction calcul std--->############
 #############################################
 def calcul_std(normalize_depth_results):
    sys.stderr.write("\t entering calcul_std\n")
@@ -355,13 +382,207 @@ def calcul_std(normalize_depth_results):
    sys.stderr.write("\t Leaving calcul_std\n")
    return std_chr

+
+def calcul_std_ratio(ratio_par_window_results):
+    sys.stderr.write("\t entering calcul_std_ratio\n")
+    ratio_par_window_results = np.array(ratio_par_window_results)
+    # Filtrer les résultats pour enlever les valeurs égales à 0
+    non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
+    # Calculer le std des résultats non nuls
+    std_ratio = np.std(non_zero_results) if non_zero_results.size > 0 else 0
+    print(std_ratio)
+    sys.stderr.write("\t Leaving calcul_std_ratio\n")
+    return std_ratio
+
+##############################################
+######<---Fonction statistiques ratio--->#####
+##############################################
+def compute_mean_and_std(ratio_par_window_results):
+    sys.stderr.write("Computing stats : \n")
+    
+    # Filtrer les résultats pour enlever les valeurs égales à 0
+    ratio_par_window_results = np.array(ratio_par_window_results)
+    non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
+    
+    # Initialiser les listes pour le calcul des stats
+    table = []
+    
+    for value in non_zero_results:
+        if float(value) >= 5:
+            table.append(5)
+        elif float(value) != -1:
+            table.append(float(value))
+    
+    # Calculer la moyenne, l'écart-type et la médiane des valeurs filtrées
+    mean_ratio = np.mean(table) if table else 0
+    std_ratio = np.std(table) if table else 0
+    med_ratio = np.median(table) if table else 0
+    
+    # Créer le dictionnaire de stats
+    #stats = {'mean': mean_ratio, 'std': std_ratio, 'med': med_ratio}
+
+    # Afficher les résultats
+    #print(stats)
+    print(mean_ratio, std_ratio, med_ratio)
+    sys.stderr.write("Computation done\n")
+    
+    # Retourner les résultats
+    return mean_ratio, std_ratio, med_ratio
+    
+##############################################
+######<---Fonction copy number level--->######
+##############################################    
+def cn_level(x):
+    if x < 1 :
+        if x <= 0.75:
+            if x >= 0.1:
+                return 1
+            else:
+                return 0
+        else:
+            return 2
+    else:
+        if round(x) == 1:
+            return 2
+        if round(x) > 1:
+            return round(x)
+
+##############################################
+######<---Fonction get sample--->#############
+##############################################
+def get_sample_name(bamfile_path):
+    with pysam.AlignmentFile(bamfile_path, "rb") as bamfile:
+        for read_group in bamfile.header.get('RG', []):
+            if 'SM' in read_group:
+                return read_group['SM']
+    return "UnknownSample"
+
+##############################################
+######<---Fonction create signal--->##########
+##############################################
+def create_signal(signal, chr, z_score_results, step_size):
+    if chr not in signal:
+        signal[chr] = {}
+    for i in range(len(z_score_results)):
+        pos = (i * step_size) + 1
+        signal[chr][pos] = z_score_results[i]
+
+##############################################
+######<---Fonction detect_events--->##########
+##############################################
+def detect_events(z_score_results, zscore_threshold, events, med_ratio, ratio_par_mean_ratio_results, chr):
+    sys.stderr.write("\t starting detect_events\n")
+    c = 0
+    for i, z_score in enumerate(z_score_results):
+        if (z_score <= -zscore_threshold) or (z_score >= zscore_threshold):
+            if chr not in events:
+                events[chr] = {}
+            if med_ratio == 0:
+                c = 0
+            else:
+                c = cn_level(float(ratio_par_mean_ratio_results[i]))
+            
+            if z_score >= zscore_threshold:
+                c = 3
+            elif c == 2 and z_score <= -zscore_threshold:
+                c = 1
+            pos_start = (i * step_size) + 1
+            pos_end = pos_start + window_size
+            
+            events[chr][(pos_start, pos_end)] = c
+    sys.stderr.write("\t ending detect_events\n")
+    
+##############################################
+######<---Fonction segmentation--->###########
+##############################################
+def segmentation(events, segment):
+    sys.stderr.write("starting segmentation : \n")
+    for k in events.keys():
+        sys.stderr.write("\tfor chromosome %s\n" % k)
+        starts = 0
+        oldPos = 0
+        oldLevel = -1
+        for p in sorted(events[k].keys()):
+            level = events[k][p]
+            # new coordinates
+            if (p[0] > (oldPos + window_size)):
+                if (starts != 0) and (starts != p[0]):
+                    if k not in segment:
+                        segment[k] = {}
+                    index = str(starts) + "-" + str(oldPos)
+                    segment[k][index] = {}
+                    segment[k][index]["start"] = starts
+                    segment[k][index]["end"] = oldPos + window_size
+                    segment[k][index]["cn"] = oldLevel
+                    oldPos = p[0]
+                    starts = p[0]
+                    oldLevel = level
+                    continue
+                else:
+                    starts = p[0]
+            # case where it's contiguous but different level
+            if (level != oldLevel):
+                if oldLevel != -1:
+                    if k not in segment:
+                        segment[k] = {}
+                    index = str(starts) + "-" + str(oldPos)
+                    segment[k][index] = {}
+                    segment[k][index]["start"] = starts
+                    segment[k][index]["end"] = oldPos
+                    segment[k][index]["cn"] = oldLevel
+                    oldPos = p[0]
+                    starts = p[0]
+                    oldLevel = level
+                    continue
+                else:
+                    oldLevel = level
+            oldPos = p[0]
+            oldLevel = level
+    sys.stderr.write("segmentation done\n")
+
+##############################################
+######<---Fonction display results--->########
+##############################################
+def display_results_vcf(sample, segment, signal, lengthFilter, output_file):
+    global header_written
+    sys.stderr.write("starting display results\n")
+
+    with open(output_file, 'a') as f:
+        if not header_written:
+            f.write("##fileformat=VCFv4.2\n")
+            f.write("##source=cnvcaller\n")
+            f.write("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n")
+            f.write("##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">\n")
+            f.write("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">\n")
+            f.write("##INFO=<ID=CN,Number=1,Type=Integer,Description=\"Copy number\">\n")
+            f.write("##ALT=<ID=DEL,Description=\"Deletion\">\n")
+            f.write("##ALT=<ID=DUP,Description=\"Duplication\">\n")
+            f.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
+            f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % (sample))
+            header_written = True
+
+        for k in segment.keys():
+            for elt in sorted(segment[k].keys()):
+                if segment[k][elt]["start"] == segment[k][elt]["end"]:
+                    continue
+                if (segment[k][elt]["end"] - segment[k][elt]["start"]) < lengthFilter:
+                    continue
+                if int(signal[k][segment[k][elt]["start"]]) < 0:
+                    f.write("%s\t%s\t.\tN\t<DEL>\t.\t.\tSVTYPE=DEL;END=%s;VALUE=%s\tGT:GQ\t./.:0\n" % (k, segment[k][elt]["start"], segment[k][elt]["end"], int(segment[k][elt]["cn"])))
+                else:
+                    f.write("%s\t%s\t.\tN\t<DUP>\t.\t.\tSVTYPE=DUP;END=%s;VALUE=%s\tGT:GQ\t./.:0\n" % (k, segment[k][elt]["start"], segment[k][elt]["end"], int(segment[k][elt]["cn"])))
+
+
 #################################
 ######<---Fonction main--->######
 #################################
-def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_threshold, output_file):
+def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_threshold, lengthFilter, output_file, sample):
    sys.stderr.write("\t entering main_calcul\n")
    global seq
-
+    events = {}
+    segment = {}
+    signal = {}
+    
    # Appeler les différentes fonctions
    map_data = calcul_mappability(seq_length, mappability, chr)
    gc_data = calcul_gc_content(seq_length, chr, seq)
@@ -408,6 +629,9 @@ def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_th
    z_score_results = np.zeros(int((seq_length - window_size) / step_size) + 1, dtype=np.float32)
    sys.stderr.write("\t Definition de z_score_results\n")
    
+    ratio_par_mean_ratio_results = np.zeros(int((seq_length - window_size) / step_size) + 1, dtype=np.float32)
+    sys.stderr.write("\t Definition de ratio_par_mean_ratio_results\n")
+    
    # Allouer de la mémoire pour les résultats sur le périphérique CUDA
    d_depth_results = cuda.mem_alloc(depth_results.nbytes)
    sys.stderr.write("\t d_depth_results = %s\n" % d_depth_results.as_buffer(sys.getsizeof(d_depth_results)))
@@ -437,6 +661,10 @@ def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_th
    sys.stderr.write("\t d_z_score_results = %s\n" % d_z_score_results.as_buffer(sys.getsizeof(d_z_score_results)))
    sys.stderr.write("\t z_score_results.nbytes = %s\n" % z_score_results.nbytes)    
    
+    d_ratio_par_mean_ratio_results = cuda.mem_alloc(ratio_par_mean_ratio_results.nbytes)
+    sys.stderr.write("\t d_ratio_par_mean_ratio_results = %s\n" % d_ratio_par_mean_ratio_results.as_buffer(sys.getsizeof(d_ratio_par_mean_ratio_results)))
+    sys.stderr.write("\t ratio_par_mean_ratio_results.nbytes = %s\n" % ratio_par_mean_ratio_results.nbytes) 
+    
    # Appeler la fonction de calcul de profondeur avec CUDA
    calcul_depth_kernel_cuda(d_depth_data, np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_depth_results, block=(block_size, 1, 1), grid=(grid_size, 1))
    sys.stderr.write("\t appel fonction calcul_depth_kernel_cuda\n")
@@ -500,7 +728,7 @@ def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_th
    sys.stderr.write("\t appel fonction calcul moyenne\n")
    mean_chr = calcul_moy_totale(normalize_depth_results)
    
-    # Appeler le kernel de normalisation
+    # Appeler le kernel de ratio
    ratio_par_window_kernel_cuda(d_normalize_depth_results, np.float32(mean_chr), np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_ratio_par_window_results, block=(block_size, 1, 1), grid=(grid_size, 1))
    sys.stderr.write("\t appel fonction ratio_par_window_kernel_cuda\n")
    
@@ -509,62 +737,41 @@ def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_th
    # Copier les resultats ratio depuis le peripherique CUDA vers l'hote
    cuda.memcpy_dtoh(ratio_par_window_results, d_ratio_par_window_results)
    
+    #Création table à partir du ratio
+    mean_ratio, std_ratio, med_ratio = compute_mean_and_std(ratio_par_window_results)
    
-    ###Z-score###
-    
-    #Appel fonction ecart-type
-    sys.stderr.write("\t appel fonction ecart-type\n")
-    std_chr = calcul_std(normalize_depth_results)
-    
-    # Appeler le kernel de normalisation
-    z_score_kernel_cuda(d_normalize_depth_results, np.float32(mean_chr), np.float32(std_chr), np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_z_score_results, block=(block_size, 1, 1), grid=(grid_size, 1))
+    # Appeler le kernel de calcule du ratio divisé par le ratio moyen
+    ratio_par_mean_ratio_kernel_cuda(d_ratio_par_window_results, np.float32(mean_ratio), np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_ratio_par_mean_ratio_results, block=(block_size, 1, 1), grid=(grid_size, 1))
+    sys.stderr.write("\t appel fonction ratio_par_mean_ratio_kernel_cuda\n")
+        
+    # Appeler le kernel de Z-score
+    z_score_kernel_cuda(d_ratio_par_window_results, np.float32(mean_ratio), np.float32(std_ratio), np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_z_score_results, block=(block_size, 1, 1), grid=(grid_size, 1))
    sys.stderr.write("\t appel fonction z_score_kernel_cuda\n")
    
    context.synchronize()
    
    # Copier les resultats ratio depuis le peripherique CUDA vers l'hote
+    cuda.memcpy_dtoh(ratio_par_mean_ratio_results, d_ratio_par_mean_ratio_results)
    cuda.memcpy_dtoh(z_score_results, d_z_score_results)
-        
    
-    # Ecrire les résultats dans le fichier de sortie
-#    with open(output_file, 'a') as f:
- #       sys.stderr.write("\t ecriture des fichiers\n")
-  #      for i, (avg_depth, avg_gc, avg_map, depth_correction, depth_normalize_val) in enumerate(zip(depth_results, gc_results, map_results, depth_correction_results, normalize_depth_results)):
-   #         pos_start = (i * step_size) + 1
-    #        pos_end = pos_start + window_size
-     #       f.write(f"{chr}\t{pos_start}\t{pos_end}\t{avg_depth}\t{avg_gc}\t{avg_map}\t{depth_correction}\t{depth_normalize_val}\n")
-#
- #   with open(output_file, 'a') as f:
-  #      sys.stderr.write("\t ecriture des fichiers\n")
-   #     for i, (depth_normalize_val, ratio, z_score) in enumerate(zip(normalize_depth_results, ratio_par_window_results, z_score_results)):
-    #        pos_start = (i * step_size) + 1
-     #       pos_end = pos_start + window_size
-      #      f.write(f"{chr}\t{pos_start}\t{pos_end}\t{depth_normalize_val}\t{ratio}\t{z_score}\n")
-            
-            
-    # Filtrer les résultats selon le z-score et stocker dans un tableau NumPy
-    max_windows = int((seq_length - window_size) / step_size) + 1
-    filtered_windows = np.zeros((max_windows, 6), dtype=object)
-    count = 0
-    sys.stderr.write("\t  stockage des zscore selon le threshold\n")
-    for i, (depth_normalize_val, ratio, z_score) in enumerate(zip(normalize_depth_results, ratio_par_window_results, z_score_results)):
-        if z_score <= -zscore_threshold or z_score >= zscore_threshold:
-            pos_start = (i * step_size) + 1
-            pos_end = pos_start + window_size
-            filtered_windows[count] = [chr, pos_start, pos_end, depth_normalize_val, ratio, z_score]
-            count += 1
+    #Appel fonction create signal
+    create_signal(signal, chr, z_score_results, step_size)

-    # Redimensionner le tableau pour enlever les lignes inutilisées
-    filtered_windows = filtered_windows[:count]
+    #Appel fonction detect events
+    detect_events(z_score_results, zscore_threshold, events, med_ratio, ratio_par_mean_ratio_results, chr)

-    # Écrire les résultats filtrés dans le fichier de sortie
-    with open(output_file, 'a') as f:
-        sys.stderr.write("\t ecriture des fichiers\n")
-        for window in filtered_windows:
-            f.write(f"{window[0]}\t{window[1]}\t{window[2]}\t{window[3]}\t{window[4]}\t{window[5]}\n")
+    #Appel fonction segmentation
+    segmentation(events, segment)
+    
+    #Appel fonction display_results_vcf
+    display_results_vcf(sample, segment, signal, lengthFilter, output_file)
+    

 # Programme principal
 #Calcul nombre de coeurs max pour le GPU
+header_written = False
+
+sample = get_sample_name(bamfile_path)
 device = cuda.Device(0)
 attributes = device.get_attributes()
 num_cores = attributes[1]
@@ -582,7 +789,10 @@ with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle:
        sys.stderr.write("Chromosome : %s, seq length : %s\n" % (chr, seq_length))

        # Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome
-        main_calcul(bamfile_handle, chr, seq_length, window_size, step_size, zscore_threshold, output_file)
+        main_calcul(bamfile_handle, chr, seq_length, window_size, step_size, zscore_threshold, lengthFilter, output_file, sample)
+
+    logging.basicConfig(filename='%s' % (logfile), filemode='a', level=logging.INFO, format='%(asctime)s %(levelname)s - %(message)s')
+    logging.info('end')