Commit 59eeb957 authored by Theo Serralta's avatar Theo Serralta

Finishing code to test on genome

parent 74f8d0ae
......@@ -10,13 +10,14 @@ import pycuda.gpuarray as gpuarray
from pycuda.autoinit import context
import multiprocessing
from multiprocessing import Process, Queue
import time
# Options
def parse_arguments():
try:
opts, args = getopt.getopt(sys.argv[1:], 'b:w:s:z:t:o:e:')
bamfile_path, window_size, step_size, zscore_threshold, output_file, logfile = None, None, None, None, None, None
opts, args = getopt.getopt(sys.argv[1:], 'b:w:s:z:l:t:o:e:')
bamfile_path, window_size, step_size, zscore_threshold, lengthFilter, output_file, logfile = None, None, None, None, None, None, None
for opt, arg in opts:
if opt in ("-b"):
bamfile_path = arg
......@@ -26,17 +27,19 @@ def parse_arguments():
step_size = int(arg)
if opt in ("-z"):
zscore_threshold = float(arg)
if opt in ("-l"):
lengthFilter = int(arg)
if opt in ("-o"):
output_file = arg
if opt in ("-e"):
logfile = arg
return bamfile_path, window_size, step_size, zscore_threshold, output_file, logfile
return bamfile_path, window_size, step_size, zscore_threshold, lengthFilter, output_file, logfile
except getopt.GetoptError:
print('Invalid argument')
sys.exit(1)
if __name__ == '__main__':
bamfile_path, window_size, step_size, zscore_threshold, output_file, logfile = parse_arguments()
bamfile_path, window_size, step_size, zscore_threshold, lengthFilter, output_file, logfile = parse_arguments()
logging.basicConfig(filename='%s' % (logfile), filemode='a', level=logging.INFO, format='%(asctime)s %(levelname)s - %(message)s')
logging.info('start')
global seq
......@@ -153,16 +156,28 @@ __global__ void ratio_par_window_kernel(float *depth_normalize_val, float mean_c
}
}
// Kernel pour calculer le z_score par window
// Kernel pour calculer le z_score par window sur le ratio
__global__ void z_score_kernel(float *depth_normalize_val, float mean_chr, float std_chr, int seq_length, int window_size, int step_size, float *z_score_results) {
__global__ void z_score_kernel(float *ratio, float mean_ratio, float std_ratio, int seq_length, int window_size, int step_size, float *z_score_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) {
float z_score = (depth_normalize_val[idx] - mean_chr) / std_chr;
float z_score = (ratio[idx] - mean_ratio) / std_ratio;
z_score_results[idx] = z_score;
}
}
// Kernel pour calculer le ratio divise par le ratio moyen
__global__ void ratio_par_mean_ratio_kernel(float *ratio, float mean_ratio, int seq_length, int window_size, int step_size, float *ratio_par_mean_ratio_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) {
float ratio_mean_ratio = ratio[idx] / mean_ratio;
ratio_par_mean_ratio_results[idx] = ratio_mean_ratio;
}
}
""")
# Obtention de la fonction de kernel compilée
......@@ -173,6 +188,7 @@ calcul_depth_correction_kernel_cuda = mod.get_function("calcul_depth_correction_
normalize_depth_kernel_cuda = mod.get_function("normalize_depth_kernel")
ratio_par_window_kernel_cuda = mod.get_function("ratio_par_window_kernel")
z_score_kernel_cuda = mod.get_function("z_score_kernel")
ratio_par_mean_ratio_kernel_cuda = mod.get_function("ratio_par_mean_ratio_kernel")
#############################################
######<---Fonctions mappability--->#########
......@@ -341,8 +357,19 @@ def calcul_moy_totale(normalize_depth_results):
sys.stderr.write("\t Leaving calcul_moy_totale\n")
return mean_chr
def calcul_moy_totale_ratio(ratio_par_window_results):
sys.stderr.write("\t entering calcul_moy_totale_ratio\n")
ratio_par_window_results = np.array(ratio_par_window_results)
# Filtrer les résultats pour enlever les valeurs égales à 0
non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
# Calculer la mpyenne des résultats non nuls
mean_ratio = np.mean(non_zero_results) if non_zero_results.size > 0 else 0
print(mean_ratio)
sys.stderr.write("\t Leaving calcul_moy_totale_ratio\n")
return mean_ratio
#############################################
######<---Fonction calcul std--->#####
######<---Fonction calcul std--->############
#############################################
def calcul_std(normalize_depth_results):
sys.stderr.write("\t entering calcul_std\n")
......@@ -355,13 +382,207 @@ def calcul_std(normalize_depth_results):
sys.stderr.write("\t Leaving calcul_std\n")
return std_chr
def calcul_std_ratio(ratio_par_window_results):
sys.stderr.write("\t entering calcul_std_ratio\n")
ratio_par_window_results = np.array(ratio_par_window_results)
# Filtrer les résultats pour enlever les valeurs égales à 0
non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
# Calculer le std des résultats non nuls
std_ratio = np.std(non_zero_results) if non_zero_results.size > 0 else 0
print(std_ratio)
sys.stderr.write("\t Leaving calcul_std_ratio\n")
return std_ratio
##############################################
######<---Fonction statistiques ratio--->#####
##############################################
def compute_mean_and_std(ratio_par_window_results):
sys.stderr.write("Computing stats : \n")
# Filtrer les résultats pour enlever les valeurs égales à 0
ratio_par_window_results = np.array(ratio_par_window_results)
non_zero_results = ratio_par_window_results[ratio_par_window_results != 0]
# Initialiser les listes pour le calcul des stats
table = []
for value in non_zero_results:
if float(value) >= 5:
table.append(5)
elif float(value) != -1:
table.append(float(value))
# Calculer la moyenne, l'écart-type et la médiane des valeurs filtrées
mean_ratio = np.mean(table) if table else 0
std_ratio = np.std(table) if table else 0
med_ratio = np.median(table) if table else 0
# Créer le dictionnaire de stats
#stats = {'mean': mean_ratio, 'std': std_ratio, 'med': med_ratio}
# Afficher les résultats
#print(stats)
print(mean_ratio, std_ratio, med_ratio)
sys.stderr.write("Computation done\n")
# Retourner les résultats
return mean_ratio, std_ratio, med_ratio
##############################################
######<---Fonction copy number level--->######
##############################################
def cn_level(x):
if x < 1 :
if x <= 0.75:
if x >= 0.1:
return 1
else:
return 0
else:
return 2
else:
if round(x) == 1:
return 2
if round(x) > 1:
return round(x)
##############################################
######<---Fonction get sample--->#############
##############################################
def get_sample_name(bamfile_path):
with pysam.AlignmentFile(bamfile_path, "rb") as bamfile:
for read_group in bamfile.header.get('RG', []):
if 'SM' in read_group:
return read_group['SM']
return "UnknownSample"
##############################################
######<---Fonction create signal--->##########
##############################################
def create_signal(signal, chr, z_score_results, step_size):
if chr not in signal:
signal[chr] = {}
for i in range(len(z_score_results)):
pos = (i * step_size) + 1
signal[chr][pos] = z_score_results[i]
##############################################
######<---Fonction detect_events--->##########
##############################################
def detect_events(z_score_results, zscore_threshold, events, med_ratio, ratio_par_mean_ratio_results, chr):
sys.stderr.write("\t starting detect_events\n")
c = 0
for i, z_score in enumerate(z_score_results):
if (z_score <= -zscore_threshold) or (z_score >= zscore_threshold):
if chr not in events:
events[chr] = {}
if med_ratio == 0:
c = 0
else:
c = cn_level(float(ratio_par_mean_ratio_results[i]))
if z_score >= zscore_threshold:
c = 3
elif c == 2 and z_score <= -zscore_threshold:
c = 1
pos_start = (i * step_size) + 1
pos_end = pos_start + window_size
events[chr][(pos_start, pos_end)] = c
sys.stderr.write("\t ending detect_events\n")
##############################################
######<---Fonction segmentation--->###########
##############################################
def segmentation(events, segment):
sys.stderr.write("starting segmentation : \n")
for k in events.keys():
sys.stderr.write("\tfor chromosome %s\n" % k)
starts = 0
oldPos = 0
oldLevel = -1
for p in sorted(events[k].keys()):
level = events[k][p]
# new coordinates
if (p[0] > (oldPos + window_size)):
if (starts != 0) and (starts != p[0]):
if k not in segment:
segment[k] = {}
index = str(starts) + "-" + str(oldPos)
segment[k][index] = {}
segment[k][index]["start"] = starts
segment[k][index]["end"] = oldPos + window_size
segment[k][index]["cn"] = oldLevel
oldPos = p[0]
starts = p[0]
oldLevel = level
continue
else:
starts = p[0]
# case where it's contiguous but different level
if (level != oldLevel):
if oldLevel != -1:
if k not in segment:
segment[k] = {}
index = str(starts) + "-" + str(oldPos)
segment[k][index] = {}
segment[k][index]["start"] = starts
segment[k][index]["end"] = oldPos
segment[k][index]["cn"] = oldLevel
oldPos = p[0]
starts = p[0]
oldLevel = level
continue
else:
oldLevel = level
oldPos = p[0]
oldLevel = level
sys.stderr.write("segmentation done\n")
##############################################
######<---Fonction display results--->########
##############################################
def display_results_vcf(sample, segment, signal, lengthFilter, output_file):
global header_written
sys.stderr.write("starting display results\n")
with open(output_file, 'a') as f:
if not header_written:
f.write("##fileformat=VCFv4.2\n")
f.write("##source=cnvcaller\n")
f.write("##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">\n")
f.write("##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">\n")
f.write("##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">\n")
f.write("##INFO=<ID=CN,Number=1,Type=Integer,Description=\"Copy number\">\n")
f.write("##ALT=<ID=DEL,Description=\"Deletion\">\n")
f.write("##ALT=<ID=DUP,Description=\"Duplication\">\n")
f.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
f.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % (sample))
header_written = True
for k in segment.keys():
for elt in sorted(segment[k].keys()):
if segment[k][elt]["start"] == segment[k][elt]["end"]:
continue
if (segment[k][elt]["end"] - segment[k][elt]["start"]) < lengthFilter:
continue
if int(signal[k][segment[k][elt]["start"]]) < 0:
f.write("%s\t%s\t.\tN\t<DEL>\t.\t.\tSVTYPE=DEL;END=%s;VALUE=%s\tGT:GQ\t./.:0\n" % (k, segment[k][elt]["start"], segment[k][elt]["end"], int(segment[k][elt]["cn"])))
else:
f.write("%s\t%s\t.\tN\t<DUP>\t.\t.\tSVTYPE=DUP;END=%s;VALUE=%s\tGT:GQ\t./.:0\n" % (k, segment[k][elt]["start"], segment[k][elt]["end"], int(segment[k][elt]["cn"])))
#################################
######<---Fonction main--->######
#################################
def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_threshold, output_file):
def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_threshold, lengthFilter, output_file, sample):
sys.stderr.write("\t entering main_calcul\n")
global seq
events = {}
segment = {}
signal = {}
# Appeler les différentes fonctions
map_data = calcul_mappability(seq_length, mappability, chr)
gc_data = calcul_gc_content(seq_length, chr, seq)
......@@ -408,6 +629,9 @@ def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_th
z_score_results = np.zeros(int((seq_length - window_size) / step_size) + 1, dtype=np.float32)
sys.stderr.write("\t Definition de z_score_results\n")
ratio_par_mean_ratio_results = np.zeros(int((seq_length - window_size) / step_size) + 1, dtype=np.float32)
sys.stderr.write("\t Definition de ratio_par_mean_ratio_results\n")
# Allouer de la mémoire pour les résultats sur le périphérique CUDA
d_depth_results = cuda.mem_alloc(depth_results.nbytes)
sys.stderr.write("\t d_depth_results = %s\n" % d_depth_results.as_buffer(sys.getsizeof(d_depth_results)))
......@@ -437,6 +661,10 @@ def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_th
sys.stderr.write("\t d_z_score_results = %s\n" % d_z_score_results.as_buffer(sys.getsizeof(d_z_score_results)))
sys.stderr.write("\t z_score_results.nbytes = %s\n" % z_score_results.nbytes)
d_ratio_par_mean_ratio_results = cuda.mem_alloc(ratio_par_mean_ratio_results.nbytes)
sys.stderr.write("\t d_ratio_par_mean_ratio_results = %s\n" % d_ratio_par_mean_ratio_results.as_buffer(sys.getsizeof(d_ratio_par_mean_ratio_results)))
sys.stderr.write("\t ratio_par_mean_ratio_results.nbytes = %s\n" % ratio_par_mean_ratio_results.nbytes)
# Appeler la fonction de calcul de profondeur avec CUDA
calcul_depth_kernel_cuda(d_depth_data, np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_depth_results, block=(block_size, 1, 1), grid=(grid_size, 1))
sys.stderr.write("\t appel fonction calcul_depth_kernel_cuda\n")
......@@ -500,7 +728,7 @@ def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_th
sys.stderr.write("\t appel fonction calcul moyenne\n")
mean_chr = calcul_moy_totale(normalize_depth_results)
# Appeler le kernel de normalisation
# Appeler le kernel de ratio
ratio_par_window_kernel_cuda(d_normalize_depth_results, np.float32(mean_chr), np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_ratio_par_window_results, block=(block_size, 1, 1), grid=(grid_size, 1))
sys.stderr.write("\t appel fonction ratio_par_window_kernel_cuda\n")
......@@ -509,62 +737,41 @@ def main_calcul(bamfile_path, chr, seq_length, window_size, step_size, zscore_th
# Copier les resultats ratio depuis le peripherique CUDA vers l'hote
cuda.memcpy_dtoh(ratio_par_window_results, d_ratio_par_window_results)
#Création table à partir du ratio
mean_ratio, std_ratio, med_ratio = compute_mean_and_std(ratio_par_window_results)
###Z-score###
#Appel fonction ecart-type
sys.stderr.write("\t appel fonction ecart-type\n")
std_chr = calcul_std(normalize_depth_results)
# Appeler le kernel de normalisation
z_score_kernel_cuda(d_normalize_depth_results, np.float32(mean_chr), np.float32(std_chr), np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_z_score_results, block=(block_size, 1, 1), grid=(grid_size, 1))
# Appeler le kernel de calcule du ratio divisé par le ratio moyen
ratio_par_mean_ratio_kernel_cuda(d_ratio_par_window_results, np.float32(mean_ratio), np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_ratio_par_mean_ratio_results, block=(block_size, 1, 1), grid=(grid_size, 1))
sys.stderr.write("\t appel fonction ratio_par_mean_ratio_kernel_cuda\n")
# Appeler le kernel de Z-score
z_score_kernel_cuda(d_ratio_par_window_results, np.float32(mean_ratio), np.float32(std_ratio), np.int32(seq_length), np.int32(window_size), np.int32(step_size), d_z_score_results, block=(block_size, 1, 1), grid=(grid_size, 1))
sys.stderr.write("\t appel fonction z_score_kernel_cuda\n")
context.synchronize()
# Copier les resultats ratio depuis le peripherique CUDA vers l'hote
cuda.memcpy_dtoh(ratio_par_mean_ratio_results, d_ratio_par_mean_ratio_results)
cuda.memcpy_dtoh(z_score_results, d_z_score_results)
# Ecrire les résultats dans le fichier de sortie
# with open(output_file, 'a') as f:
# sys.stderr.write("\t ecriture des fichiers\n")
# for i, (avg_depth, avg_gc, avg_map, depth_correction, depth_normalize_val) in enumerate(zip(depth_results, gc_results, map_results, depth_correction_results, normalize_depth_results)):
# pos_start = (i * step_size) + 1
# pos_end = pos_start + window_size
# f.write(f"{chr}\t{pos_start}\t{pos_end}\t{avg_depth}\t{avg_gc}\t{avg_map}\t{depth_correction}\t{depth_normalize_val}\n")
#
# with open(output_file, 'a') as f:
# sys.stderr.write("\t ecriture des fichiers\n")
# for i, (depth_normalize_val, ratio, z_score) in enumerate(zip(normalize_depth_results, ratio_par_window_results, z_score_results)):
# pos_start = (i * step_size) + 1
# pos_end = pos_start + window_size
# f.write(f"{chr}\t{pos_start}\t{pos_end}\t{depth_normalize_val}\t{ratio}\t{z_score}\n")
# Filtrer les résultats selon le z-score et stocker dans un tableau NumPy
max_windows = int((seq_length - window_size) / step_size) + 1
filtered_windows = np.zeros((max_windows, 6), dtype=object)
count = 0
sys.stderr.write("\t stockage des zscore selon le threshold\n")
for i, (depth_normalize_val, ratio, z_score) in enumerate(zip(normalize_depth_results, ratio_par_window_results, z_score_results)):
if z_score <= -zscore_threshold or z_score >= zscore_threshold:
pos_start = (i * step_size) + 1
pos_end = pos_start + window_size
filtered_windows[count] = [chr, pos_start, pos_end, depth_normalize_val, ratio, z_score]
count += 1
#Appel fonction create signal
create_signal(signal, chr, z_score_results, step_size)
# Redimensionner le tableau pour enlever les lignes inutilisées
filtered_windows = filtered_windows[:count]
#Appel fonction detect events
detect_events(z_score_results, zscore_threshold, events, med_ratio, ratio_par_mean_ratio_results, chr)
# Écrire les résultats filtrés dans le fichier de sortie
with open(output_file, 'a') as f:
sys.stderr.write("\t ecriture des fichiers\n")
for window in filtered_windows:
f.write(f"{window[0]}\t{window[1]}\t{window[2]}\t{window[3]}\t{window[4]}\t{window[5]}\n")
#Appel fonction segmentation
segmentation(events, segment)
#Appel fonction display_results_vcf
display_results_vcf(sample, segment, signal, lengthFilter, output_file)
# Programme principal
#Calcul nombre de coeurs max pour le GPU
header_written = False
sample = get_sample_name(bamfile_path)
device = cuda.Device(0)
attributes = device.get_attributes()
num_cores = attributes[1]
......@@ -582,7 +789,10 @@ with pysam.AlignmentFile(bamfile_path, "rb") as bamfile_handle:
sys.stderr.write("Chromosome : %s, seq length : %s\n" % (chr, seq_length))
# Appeler la fonction de calcul de la profondeur moyenne pour ce chromosome
main_calcul(bamfile_handle, chr, seq_length, window_size, step_size, zscore_threshold, output_file)
main_calcul(bamfile_handle, chr, seq_length, window_size, step_size, zscore_threshold, lengthFilter, output_file, sample)
logging.basicConfig(filename='%s' % (logfile), filemode='a', level=logging.INFO, format='%(asctime)s %(levelname)s - %(message)s')
logging.info('end')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment