Commit 26feb9eb authored by Theo Serralta's avatar Theo Serralta

Add functions to find read pairs and split reads

parent b6aa9197
...@@ -1256,7 +1256,7 @@ def stats_distances(distances_data, chr): ...@@ -1256,7 +1256,7 @@ def stats_distances(distances_data, chr):
# Return results # Return results
return MIN_DISTANCE, MAX_DISTANCE return MIN_DISTANCE, MAX_DISTANCE
def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle): def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle, seq_length):
logging.info(f"Entering find_paired_split for {chr}") logging.info(f"Entering find_paired_split for {chr}")
start_time = time.time() start_time = time.time()
...@@ -1266,53 +1266,190 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p ...@@ -1266,53 +1266,190 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
unique_pairs = [] unique_pairs = []
unique_splits = [] unique_splits = []
fragment_size = 1000
num_fragments = (seq_length // fragment_size) + 1
fragment_abnormal_reads_pairs = np.zeros(num_fragments, dtype=int)
fragment_abnormal_reads_splits = np.zeros(num_fragments, dtype=int)
fragment_total_reads = np.zeros(num_fragments, dtype=int)
event_pairs = []
event_splits = []
for read in bamfile_handle.fetch(chr): for read in bamfile_handle.fetch(chr):
if read.is_paired: fragment_index = read.reference_start // fragment_size
fragment_total_reads[fragment_index] += 1
if read.is_paired and (read.mapping_quality > 0):
key = (read.reference_start, read.cigarstring) key = (read.reference_start, read.cigarstring)
#sys.stderr.write("Chromosome : %s, key : %s, coordinates_pairs[key] : %s\n" % (chr, key, coordinates_pairs[key]))
if read.is_unmapped or read.mate_is_unmapped: if read.is_unmapped or read.mate_is_unmapped:
if coordinates_pairs[key] == 0: if coordinates_pairs[key] == 0:
unique_pairs.append(read) fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "Unmapped"))
coordinates_pairs[key] += 1 coordinates_pairs[key] += 1
elif read.is_reverse == read.mate_is_reverse: elif read.is_reverse == read.mate_is_reverse:
if coordinates_pairs[key] == 0: if coordinates_pairs[key] == 0:
unique_pairs.append(read) fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "INV"))
coordinates_pairs[key] += 1 coordinates_pairs[key] += 1
elif abs(read.template_length) < MIN_DISTANCE or abs(read.template_length) > MAX_DISTANCE: elif abs(read.template_length) < MIN_DISTANCE:
if coordinates_pairs[key] == 0: if coordinates_pairs[key] == 0:
unique_pairs.append(read) fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "INS"))
coordinates_pairs[key] += 1
elif abs(read.template_length) > MAX_DISTANCE:
if coordinates_pairs[key] == 0:
fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "DEL"))
coordinates_pairs[key] += 1 coordinates_pairs[key] += 1
elif read.reference_id != read.next_reference_id: elif read.reference_id != read.next_reference_id:
if coordinates_pairs[key] == 0: if coordinates_pairs[key] == 0:
unique_pairs.append(read) fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "TRN"))
coordinates_pairs[key] += 1 coordinates_pairs[key] += 1
if read.cigartuples and any(cigar_op in [2, 3, 4, 5] for cigar_op, _ in read.cigartuples): # Lectures splits
elif read.cigartuples and any(cigar_op in [3, 4, 5] for cigar_op, _ in read.cigartuples):
if not read.has_tag("SA"):
continue
key = (read.reference_start, read.cigarstring)
if coordinates_splits[key] == 0: if coordinates_splits[key] == 0:
unique_splits.append(read) fragment_abnormal_reads_splits[fragment_index] += 1
# Classification des événements split
for cigar_op, cigar_len in read.cigartuples:
if cigar_op == 3: # Skipped region
unique_splits.append((read, "SKIP"))
elif cigar_op == 4: # Soft clipping
unique_splits.append((read, "SOFT_CLIP"))
elif cigar_op == 5: # Hard clipping
unique_splits.append((read, "HARD_CLIP"))
coordinates_splits[key] += 1 coordinates_splits[key] += 1
logging.info("Filtering pairs")
for read, event_type in unique_pairs:
fragment_index = read.reference_start // fragment_size
threshold = fragment_abnormal_reads_pairs[fragment_index] / fragment_total_reads[fragment_index]
if threshold > 0.2:
event_pairs.append((read, event_type))
logging.info("Filtering splits")
for read, event_type in unique_splits:
fragment_index = read.reference_start // fragment_size
threshold = fragment_abnormal_reads_splits[fragment_index] / fragment_total_reads[fragment_index]
if threshold > 0.2:
event_splits.append((read, event_type))
chromosome_coordinate_pairs_count = defaultdict(int)
reads_by_coordinate_pair = defaultdict(list)
tolerance = 400
chromosome_coordinate_splits_count = defaultdict(int)
reads_by_coordinate_splits = defaultdict(list)
logging.info("Merging pairs")
#Paired-reads
for read, event_type in event_pairs:
start_chr = read.reference_name
start_coord = read.reference_start
end_chr = read.next_reference_name
end_coord = read.next_reference_start
found_close_pair = False
for (chr_pair, coord_pair, existing_event_type), count in chromosome_coordinate_pairs_count.items():
(stored_start_chr, stored_end_chr) = chr_pair
(stored_start_coord, stored_end_coord) = coord_pair
if (start_chr == stored_start_chr and end_chr == stored_end_chr and
are_coordinates_close(start_coord, stored_start_coord, tolerance) and
are_coordinates_close(end_coord, stored_end_coord, tolerance) and
event_type == existing_event_type):
chromosome_coordinate_pairs_count[(chr_pair, coord_pair, event_type)] += 1
reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)].append(read)
found_close_pair = True
break
if not found_close_pair:
chr_pair = (start_chr, end_chr)
coord_pair = (start_coord, end_coord)
chromosome_coordinate_pairs_count[(chr_pair, coord_pair, event_type)] += 1
reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)].append(read)
logging.info("Merging splits")
#Split-reads
for read, event_type in event_splits:
start_chr = read.reference_name
start_coord = read.reference_start
end_chr = read.next_reference_name
end_coord = read.next_reference_start
found_close_splits = False
for (chr_splits, coord_splits, existing_event_type), count in chromosome_coordinate_splits_count.items():
(stored_start_chr, stored_end_chr) = chr_splits
(stored_start_coord, stored_end_coord) = coord_splits
if (start_chr == stored_start_chr and end_chr == stored_end_chr and
are_coordinates_close(start_coord, stored_start_coord, tolerance) and
are_coordinates_close(end_coord, stored_end_coord, tolerance) and
event_type == existing_event_type):
existing_reads = reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)]
if any(are_secondary_alignments_same(read, existing_read) for existing_read in existing_reads):
chromosome_coordinate_splits_count[(chr_splits, coord_splits, event_type)] += 1
reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)].append(read)
found_close_splits = True
break
if not found_close_splits:
chr_splits = (start_chr, end_chr)
coord_splits = (start_coord, end_coord)
chromosome_coordinate_splits_count[(chr_splits, coord_splits, event_type)] += 1
reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)].append(read)
logging.info("Writting results for pairs and split") logging.info("Writting results for pairs and split")
with open(output_file_pairs, 'a') as p: with open(output_file_pairs, 'a') as p:
for read in unique_pairs: for (chr_pair, coord_pair, event_type), count in chromosome_coordinate_pairs_count.items():
key = (read.reference_start, read.cigarstring) if count > 1:
threshold = depth_data[read.reference_start] * 0.60 reads_to_merge = reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)]
#sys.stderr.write("Chromosome : %s, threshold : %s, depth_data[read.reference_start] : %s\n" % (chr, threshold, depth_data[read.reference_start])) start_chr, end_chr = chr_pair
if coordinates_pairs[key] > threshold: min_start = min(read.reference_start for read in reads_to_merge)
p.write(read.to_string()+'\n') max_start = max(read.reference_start for read in reads_to_merge)
min_end = min(read.next_reference_start for read in reads_to_merge)
max_end = max(read.next_reference_start for read in reads_to_merge)
read_count = len(reads_to_merge)
p.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\n")
with open(output_file_splits, 'a') as s: with open(output_file_splits, 'a') as s:
for read in unique_splits: for (chr_splits, coord_splits, event_type), count in chromosome_coordinate_splits_count.items():
key = (read.reference_start, read.cigarstring) if count > 1:
threshold = depth_data[read.reference_start] * 0.60 reads_to_merge = reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)]
if coordinates_splits[key] > threshold: start_chr, end_chr = chr_splits
s.write(read.to_string()+'\n') min_start = min(read.reference_start for read in reads_to_merge)
max_start = max(read.reference_start for read in reads_to_merge)
min_end = min(read.next_reference_start for read in reads_to_merge)
max_end = max(read.next_reference_start for read in reads_to_merge)
read_count = len(reads_to_merge)
s.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\n")
end_time = time.time() end_time = time.time()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
logging.info(f"Leaving find_paired_split for {chr} (Time taken: {elapsed_time:.4f} seconds)") logging.info(f"Leaving find_paired_split for {chr} (Time taken: {elapsed_time:.4f} seconds)")
def are_coordinates_close(coord1, coord2, tolerance):
    """Return True when the two genomic coordinates differ by at most `tolerance` bases."""
    gap = coord1 - coord2
    if gap < 0:
        gap = -gap
    return gap <= tolerance
def are_secondary_alignments_same(read1, read2):
    """Return True when both reads carry an identical SA (supplementary alignment) tag.

    NOTE(review): assumes both reads have an "SA" tag — callers are expected to
    have filtered on has_tag("SA") first; get_tag raises KeyError otherwise.
    """
    sa_of_first = read1.get_tag("SA")
    sa_of_second = read2.get_tag("SA")
    return sa_of_first == sa_of_second
def main_calcul( def main_calcul(
bamfile_path, bamfile_path,
bamfile_handle, bamfile_handle,
...@@ -1374,7 +1511,7 @@ def main_calcul( ...@@ -1374,7 +1511,7 @@ def main_calcul(
depth_data = calcul_depth_seq_samtools(seq_length, bamfile_path, chr) depth_data = calcul_depth_seq_samtools(seq_length, bamfile_path, chr)
distances_data = calcul_distance(bamfile_handle, chr, seq_length) distances_data = calcul_distance(bamfile_handle, chr, seq_length)
MIN_DISTANCE, MAX_DISTANCE = stats_distances(distances_data, chr) MIN_DISTANCE, MAX_DISTANCE = stats_distances(distances_data, chr)
find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle) find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle, seq_length)
# Transférer le tableau NumPy vers CUDA # Transférer le tableau NumPy vers CUDA
d_depth_data = cuda.mem_alloc(depth_data.nbytes) d_depth_data = cuda.mem_alloc(depth_data.nbytes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment