Commit 26feb9eb authored by Theo Serralta's avatar Theo Serralta

Add functions to find read pairs and split reads

parent b6aa9197
...@@ -1256,7 +1256,7 @@ def stats_distances(distances_data, chr): ...@@ -1256,7 +1256,7 @@ def stats_distances(distances_data, chr):
# Return results # Return results
return MIN_DISTANCE, MAX_DISTANCE return MIN_DISTANCE, MAX_DISTANCE
def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle): def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle, seq_length):
logging.info(f"Entering find_paired_split for {chr}") logging.info(f"Entering find_paired_split for {chr}")
start_time = time.time() start_time = time.time()
...@@ -1266,53 +1266,190 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p ...@@ -1266,53 +1266,190 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
unique_pairs = [] unique_pairs = []
unique_splits = [] unique_splits = []
fragment_size = 1000
num_fragments = (seq_length // fragment_size) + 1
fragment_abnormal_reads_pairs = np.zeros(num_fragments, dtype=int)
fragment_abnormal_reads_splits = np.zeros(num_fragments, dtype=int)
fragment_total_reads = np.zeros(num_fragments, dtype=int)
event_pairs = []
event_splits = []
for read in bamfile_handle.fetch(chr): for read in bamfile_handle.fetch(chr):
if read.is_paired: fragment_index = read.reference_start // fragment_size
fragment_total_reads[fragment_index] += 1
if read.is_paired and (read.mapping_quality > 0):
key = (read.reference_start, read.cigarstring) key = (read.reference_start, read.cigarstring)
#sys.stderr.write("Chromosome : %s, key : %s, coordinates_pairs[key] : %s\n" % (chr, key, coordinates_pairs[key]))
if read.is_unmapped or read.mate_is_unmapped: if read.is_unmapped or read.mate_is_unmapped:
if coordinates_pairs[key] == 0: if coordinates_pairs[key] == 0:
unique_pairs.append(read) fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "Unmapped"))
coordinates_pairs[key] += 1 coordinates_pairs[key] += 1
elif read.is_reverse == read.mate_is_reverse: elif read.is_reverse == read.mate_is_reverse:
if coordinates_pairs[key] == 0: if coordinates_pairs[key] == 0:
unique_pairs.append(read) fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "INV"))
coordinates_pairs[key] += 1 coordinates_pairs[key] += 1
elif abs(read.template_length) < MIN_DISTANCE or abs(read.template_length) > MAX_DISTANCE: elif abs(read.template_length) < MIN_DISTANCE:
if coordinates_pairs[key] == 0: if coordinates_pairs[key] == 0:
unique_pairs.append(read) fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "INS"))
coordinates_pairs[key] += 1
elif abs(read.template_length) > MAX_DISTANCE:
if coordinates_pairs[key] == 0:
fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "DEL"))
coordinates_pairs[key] += 1 coordinates_pairs[key] += 1
elif read.reference_id != read.next_reference_id: elif read.reference_id != read.next_reference_id:
if coordinates_pairs[key] == 0: if coordinates_pairs[key] == 0:
unique_pairs.append(read) fragment_abnormal_reads_pairs[fragment_index] += 1
unique_pairs.append((read, "TRN"))
coordinates_pairs[key] += 1 coordinates_pairs[key] += 1
if read.cigartuples and any(cigar_op in [2, 3, 4, 5] for cigar_op, _ in read.cigartuples): # Lectures splits
elif read.cigartuples and any(cigar_op in [3, 4, 5] for cigar_op, _ in read.cigartuples):
if not read.has_tag("SA"):
continue
key = (read.reference_start, read.cigarstring)
if coordinates_splits[key] == 0: if coordinates_splits[key] == 0:
unique_splits.append(read) fragment_abnormal_reads_splits[fragment_index] += 1
# Classification des événements split
for cigar_op, cigar_len in read.cigartuples:
if cigar_op == 3: # Skipped region
unique_splits.append((read, "SKIP"))
elif cigar_op == 4: # Soft clipping
unique_splits.append((read, "SOFT_CLIP"))
elif cigar_op == 5: # Hard clipping
unique_splits.append((read, "HARD_CLIP"))
coordinates_splits[key] += 1 coordinates_splits[key] += 1
logging.info("Filtering pairs")
for read, event_type in unique_pairs:
fragment_index = read.reference_start // fragment_size
threshold = fragment_abnormal_reads_pairs[fragment_index] / fragment_total_reads[fragment_index]
if threshold > 0.2:
event_pairs.append((read, event_type))
logging.info("Filtering splits")
for read, event_type in unique_splits:
fragment_index = read.reference_start // fragment_size
threshold = fragment_abnormal_reads_splits[fragment_index] / fragment_total_reads[fragment_index]
if threshold > 0.2:
event_splits.append((read, event_type))
chromosome_coordinate_pairs_count = defaultdict(int)
reads_by_coordinate_pair = defaultdict(list)
tolerance = 400
chromosome_coordinate_splits_count = defaultdict(int)
reads_by_coordinate_splits = defaultdict(list)
logging.info("Merging pairs")
#Paired-reads
for read, event_type in event_pairs:
start_chr = read.reference_name
start_coord = read.reference_start
end_chr = read.next_reference_name
end_coord = read.next_reference_start
found_close_pair = False
for (chr_pair, coord_pair, existing_event_type), count in chromosome_coordinate_pairs_count.items():
(stored_start_chr, stored_end_chr) = chr_pair
(stored_start_coord, stored_end_coord) = coord_pair
if (start_chr == stored_start_chr and end_chr == stored_end_chr and
are_coordinates_close(start_coord, stored_start_coord, tolerance) and
are_coordinates_close(end_coord, stored_end_coord, tolerance) and
event_type == existing_event_type):
chromosome_coordinate_pairs_count[(chr_pair, coord_pair, event_type)] += 1
reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)].append(read)
found_close_pair = True
break
if not found_close_pair:
chr_pair = (start_chr, end_chr)
coord_pair = (start_coord, end_coord)
chromosome_coordinate_pairs_count[(chr_pair, coord_pair, event_type)] += 1
reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)].append(read)
logging.info("Merging splits")
#Split-reads
for read, event_type in event_splits:
start_chr = read.reference_name
start_coord = read.reference_start
end_chr = read.next_reference_name
end_coord = read.next_reference_start
found_close_splits = False
for (chr_splits, coord_splits, existing_event_type), count in chromosome_coordinate_splits_count.items():
(stored_start_chr, stored_end_chr) = chr_splits
(stored_start_coord, stored_end_coord) = coord_splits
if (start_chr == stored_start_chr and end_chr == stored_end_chr and
are_coordinates_close(start_coord, stored_start_coord, tolerance) and
are_coordinates_close(end_coord, stored_end_coord, tolerance) and
event_type == existing_event_type):
existing_reads = reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)]
if any(are_secondary_alignments_same(read, existing_read) for existing_read in existing_reads):
chromosome_coordinate_splits_count[(chr_splits, coord_splits, event_type)] += 1
reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)].append(read)
found_close_splits = True
break
if not found_close_splits:
chr_splits = (start_chr, end_chr)
coord_splits = (start_coord, end_coord)
chromosome_coordinate_splits_count[(chr_splits, coord_splits, event_type)] += 1
reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)].append(read)
logging.info("Writting results for pairs and split") logging.info("Writting results for pairs and split")
with open(output_file_pairs, 'a') as p: with open(output_file_pairs, 'a') as p:
for read in unique_pairs: for (chr_pair, coord_pair, event_type), count in chromosome_coordinate_pairs_count.items():
key = (read.reference_start, read.cigarstring) if count > 1:
threshold = depth_data[read.reference_start] * 0.60 reads_to_merge = reads_by_coordinate_pair[(chr_pair, coord_pair, event_type)]
#sys.stderr.write("Chromosome : %s, threshold : %s, depth_data[read.reference_start] : %s\n" % (chr, threshold, depth_data[read.reference_start])) start_chr, end_chr = chr_pair
if coordinates_pairs[key] > threshold: min_start = min(read.reference_start for read in reads_to_merge)
p.write(read.to_string()+'\n') max_start = max(read.reference_start for read in reads_to_merge)
min_end = min(read.next_reference_start for read in reads_to_merge)
max_end = max(read.next_reference_start for read in reads_to_merge)
read_count = len(reads_to_merge)
p.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\n")
with open(output_file_splits, 'a') as s: with open(output_file_splits, 'a') as s:
for read in unique_splits: for (chr_splits, coord_splits, event_type), count in chromosome_coordinate_splits_count.items():
key = (read.reference_start, read.cigarstring) if count > 1:
threshold = depth_data[read.reference_start] * 0.60 reads_to_merge = reads_by_coordinate_splits[(chr_splits, coord_splits, event_type)]
if coordinates_splits[key] > threshold: start_chr, end_chr = chr_splits
s.write(read.to_string()+'\n') min_start = min(read.reference_start for read in reads_to_merge)
max_start = max(read.reference_start for read in reads_to_merge)
min_end = min(read.next_reference_start for read in reads_to_merge)
max_end = max(read.next_reference_start for read in reads_to_merge)
read_count = len(reads_to_merge)
s.write(f"{start_chr}\t{min_start}\t{max_start}\t{end_chr}\t{min_end}\t{max_end}\t{event_type}\t{read_count}\n")
end_time = time.time() end_time = time.time()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
logging.info(f"Leaving find_paired_split for {chr} (Time taken: {elapsed_time:.4f} seconds)") logging.info(f"Leaving find_paired_split for {chr} (Time taken: {elapsed_time:.4f} seconds)")
def are_coordinates_close(coord1, coord2, tolerance):
    """Return True when the two genomic coordinates differ by at most `tolerance` bases."""
    gap = coord1 - coord2
    if gap < 0:
        gap = -gap
    return gap <= tolerance
def are_secondary_alignments_same(read1, read2):
    """Return True when both reads carry an identical SA (supplementary alignment) tag.

    NOTE(review): assumes both reads have an "SA" tag — callers are expected to
    have filtered on has_tag("SA") first; get_tag raises KeyError otherwise.
    """
    sa_of_first = read1.get_tag("SA")
    sa_of_second = read2.get_tag("SA")
    return sa_of_first == sa_of_second
def main_calcul( def main_calcul(
bamfile_path, bamfile_path,
bamfile_handle, bamfile_handle,
...@@ -1374,7 +1511,7 @@ def main_calcul( ...@@ -1374,7 +1511,7 @@ def main_calcul(
depth_data = calcul_depth_seq_samtools(seq_length, bamfile_path, chr) depth_data = calcul_depth_seq_samtools(seq_length, bamfile_path, chr)
distances_data = calcul_distance(bamfile_handle, chr, seq_length) distances_data = calcul_distance(bamfile_handle, chr, seq_length)
MIN_DISTANCE, MAX_DISTANCE = stats_distances(distances_data, chr) MIN_DISTANCE, MAX_DISTANCE = stats_distances(distances_data, chr)
find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle) find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_pairs, output_file_splits, bamfile_handle, seq_length)
# Transférer le tableau NumPy vers CUDA # Transférer le tableau NumPy vers CUDA
d_depth_data = cuda.mem_alloc(depth_data.nbytes) d_depth_data = cuda.mem_alloc(depth_data.nbytes)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment