Optimisation of some functions

a0f29fab · Theo Serralta · 676b420d · a0f29fab
Commit a0f29fab authored Aug 05, 2024 by Theo Serralta
Hide whitespace changes
Inline Side-by-side

Showing with 28 additions and 31 deletions

test_gpu_time_SV.py CNV/test/test_gpu_time_SV.py +28 -31

No files found.
--- a/CNV/test/test_gpu_time_SV.py
+++ b/CNV/test/test_gpu_time_SV.py
@@ -1260,22 +1260,17 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
    logging.info(f"Entering find_paired_split for {chr}")
    start_time = time.time()
    
-    coordinates_pairs = defaultdict(int)
-    coordinates_splits = defaultdict(int)
+    coordinates_pairs, coordinates_splits = defaultdict(int), defaultdict(int)
+
+    unique_pairs, unique_splits = [], []

-    unique_pairs = []
-    unique_splits = []

    fragment_size = 1000
    num_fragments = (seq_length // fragment_size) + 1
    
-    fragment_abnormal_reads_pairs = np.zeros(num_fragments, dtype=int)
-    fragment_abnormal_reads_splits = np.zeros(num_fragments, dtype=int)
-    
-    fragment_total_reads = np.zeros(num_fragments, dtype=int)
+    fragment_abnormal_reads_pairs, fragment_abnormal_reads_splits, fragment_total_reads = np.zeros(num_fragments, dtype=int), np.zeros(num_fragments, dtype=int), np.zeros(num_fragments, dtype=int)

-    event_pairs = []
-    event_splits = []
+    event_pairs, event_splits = [], []

    for read in bamfile_handle.fetch(chr):
        fragment_index = read.reference_start // fragment_size
@@ -1287,7 +1282,7 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
                    fragment_abnormal_reads_pairs[fragment_index] += 1
                    unique_pairs.append((read, "Unmapped"))
                coordinates_pairs[key] += 1
-            elif read.is_reverse == read.mate_is_reverse:
+            elif read.is_reverse == read.mate_is_reverse and read.reference_id == read.next_reference_id:
                if coordinates_pairs[key] == 0:
                    fragment_abnormal_reads_pairs[fragment_index] += 1
                    unique_pairs.append((read, "INV"))
@@ -1415,29 +1410,31 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p

    with open(output_file_pairs, 'a') as p:
        for (chr_pair, coord_pair, event_type), count in chromosome_coordinate_pairs_count.items():
-            if count > 1:
-                reads_to_merge = reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)]
-                start_chr, end_chr = chr_pair
-                min_start = min(read.reference_start for read in reads_to_merge)
-                max_start = max(read.reference_start for read in reads_to_merge)
-                min_end = min(read.next_reference_start for read in reads_to_merge)
-                max_end = max(read.next_reference_start for read in reads_to_merge)
-                read_count = len(reads_to_merge)
-
-                p.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\n")
+            reads_to_merge = reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)]
+            start_chr, end_chr = chr_pair
+            min_start = min(read.reference_start for read in reads_to_merge)
+            max_start = max(read.reference_start for read in reads_to_merge)
+            min_end = min(read.next_reference_start for read in reads_to_merge)
+            max_end = max(read.next_reference_start for read in reads_to_merge)
+            depth_count = np.mean(depth_data[min_start:max_start+1])
+            read_count = count
+            
+            if read_count > (depth_count * 0.2):
+                p.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\t{depth_count}\n")
                
    with open(output_file_splits, 'a') as s:
        for (chr_splits, coord_splits, event_type), count in chromosome_coordinate_splits_count.items():
-            if count > 1:
-                reads_to_merge = reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)]
-                start_chr, end_chr = chr_splits
-                min_start = min(read.reference_start for read in reads_to_merge)
-                max_start = max(read.reference_start for read in reads_to_merge)
-                min_end = min(read.next_reference_start for read in reads_to_merge)
-                max_end = max(read.next_reference_start for read in reads_to_merge)
-                read_count = len(reads_to_merge)
-
-                s.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\n")
+            reads_to_merge = reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)]
+            start_chr, end_chr = chr_splits
+            min_start = min(read.reference_start for read in reads_to_merge)
+            max_start = max(read.reference_start for read in reads_to_merge)
+            min_end = min(read.next_reference_start for read in reads_to_merge)
+            max_end = max(read.next_reference_start for read in reads_to_merge)
+            depth_count = np.mean(depth_data[min_start:max_start+1])
+            read_count = count
+            
+            if read_count > (depth_count * 0.2):
+                s.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\t{depth_count}\n")

            
    end_time = time.time()