Commit a0f29fab authored by Theo Serralta

Optimisation of some functions

parent 676b420d
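The refactoring below mostly consolidates variable initialisations into tuple assignments. The first behavioural change is in the hunk at old line 1287: a read pair is only flagged as an inversion (INV) candidate when the mate also maps to the same reference. A minimal standalone sketch of that check, assuming the reads are pysam.AlignedSegment objects as in the loop below (the helper name is_inversion_candidate is illustrative, not from the repository):

    import pysam  # assumed: the diff iterates over pysam.AlignedSegment objects

    def is_inversion_candidate(read: pysam.AlignedSegment) -> bool:
        # Read and mate share the same orientation (both forward or both reverse)
        # AND the mate is mapped to the same reference, matching the new condition
        # read.is_reverse == read.mate_is_reverse and read.reference_id == read.next_reference_id.
        return (read.is_reverse == read.mate_is_reverse
                and read.reference_id == read.next_reference_id)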
@@ -1260,22 +1260,17 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
     logging.info(f"Entering find_paired_split for {chr}")
     start_time = time.time()
-    coordinates_pairs = defaultdict(int)
-    coordinates_splits = defaultdict(int)
+    coordinates_pairs, coordinates_splits = defaultdict(int), defaultdict(int)
-    unique_pairs = []
-    unique_splits = []
+    unique_pairs, unique_splits = [], []
     fragment_size = 1000
     num_fragments = (seq_length // fragment_size) + 1
-    fragment_abnormal_reads_pairs = np.zeros(num_fragments, dtype=int)
-    fragment_abnormal_reads_splits = np.zeros(num_fragments, dtype=int)
-    fragment_total_reads = np.zeros(num_fragments, dtype=int)
+    fragment_abnormal_reads_pairs, fragment_abnormal_reads_splits, fragment_total_reads = np.zeros(num_fragments, dtype=int), np.zeros(num_fragments, dtype=int), np.zeros(num_fragments, dtype=int)
-    event_pairs = []
-    event_splits = []
+    event_pairs, event_splits = [], []
     for read in bamfile_handle.fetch(chr):
         fragment_index = read.reference_start // fragment_size
@@ -1287,7 +1282,7 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
                 fragment_abnormal_reads_pairs[fragment_index] += 1
                 unique_pairs.append((read, "Unmapped"))
             coordinates_pairs[key] += 1
-        elif read.is_reverse == read.mate_is_reverse:
+        elif read.is_reverse == read.mate_is_reverse and read.reference_id == read.next_reference_id:
             if coordinates_pairs[key] == 0:
                 fragment_abnormal_reads_pairs[fragment_index] += 1
                 unique_pairs.append((read, "INV"))
@@ -1415,29 +1410,31 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
     with open(output_file_pairs, 'a') as p:
         for (chr_pair, coord_pair, event_type), count in chromosome_coordinate_pairs_count.items():
             if count > 1:
                 reads_to_merge = reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)]
                 start_chr, end_chr = chr_pair
                 min_start = min(read.reference_start for read in reads_to_merge)
                 max_start = max(read.reference_start for read in reads_to_merge)
                 min_end = min(read.next_reference_start for read in reads_to_merge)
                 max_end = max(read.next_reference_start for read in reads_to_merge)
-                read_count = len(reads_to_merge)
+                depth_count = np.mean(depth_data[min_start:max_start+1])
+                read_count = count
-                p.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\n")
+                if read_count > (depth_count * 0.2):
+                    p.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\t{depth_count}\n")
     with open(output_file_splits, 'a') as s:
         for (chr_splits, coord_splits, event_type), count in chromosome_coordinate_splits_count.items():
             if count > 1:
                 reads_to_merge = reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)]
                 start_chr, end_chr = chr_splits
                 min_start = min(read.reference_start for read in reads_to_merge)
                 max_start = max(read.reference_start for read in reads_to_merge)
                 min_end = min(read.next_reference_start for read in reads_to_merge)
                 max_end = max(read.next_reference_start for read in reads_to_merge)
-                read_count = len(reads_to_merge)
+                depth_count = np.mean(depth_data[min_start:max_start+1])
+                read_count = count
-                s.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\n")
+                if read_count > (depth_count * 0.2):
+                    s.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\t{depth_count}\n")
     end_time = time.time()
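The second behavioural change is the depth-aware filter in the last hunk: a merged pair or split event is now written only when its supporting read count exceeds 20% of the mean coverage over the merged start interval. A minimal sketch of that check, assuming depth_data is a per-base NumPy coverage array; the helper name, variable names, and example values are illustrative, not from the repository:

    import numpy as np

    def passes_depth_filter(read_count, depth_data, min_start, max_start, fraction=0.2):
        # Mean coverage over the merged start interval, inclusive of max_start,
        # mirroring np.mean(depth_data[min_start:max_start+1]) in the diff above.
        depth_count = np.mean(depth_data[min_start:max_start + 1])
        # Keep the event only if its supporting reads exceed the coverage fraction.
        return read_count > depth_count * fraction

    # Example: 8 supporting read pairs over a ~30x region pass (8 > 30 * 0.2 = 6).
    coverage = np.full(1000, 30)
    print(passes_depth_filter(8, coverage, 100, 180))  # True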