Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
C
cnvCallerGPU
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gad-public
cnvCallerGPU
Commits
26feb9eb
Commit
26feb9eb
authored
Jul 19, 2024
by
Theo Serralta
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add some function to find pairs and splits read
parent
b6aa9197
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
163 additions
and
26 deletions
+163
-26
test_gpu_time_SV.py
CNV/test/test_gpu_time_SV.py
+163
-26
No files found.
CNV/test/test_gpu_time_SV.py
View file @
26feb9eb
...
...
@@ -1256,7 +1256,7 @@ def stats_distances(distances_data, chr):
# Return results
return
MIN_DISTANCE
,
MAX_DISTANCE
def
find_paired_split
(
chr
,
MIN_DISTANCE
,
MAX_DISTANCE
,
depth_data
,
output_file_pairs
,
output_file_splits
,
bamfile_handle
):
def
find_paired_split
(
chr
,
MIN_DISTANCE
,
MAX_DISTANCE
,
depth_data
,
output_file_pairs
,
output_file_splits
,
bamfile_handle
,
seq_length
):
logging
.
info
(
f
"Entering find_paired_split for {chr}"
)
start_time
=
time
.
time
()
...
...
@@ -1266,53 +1266,190 @@ def find_paired_split(chr, MIN_DISTANCE, MAX_DISTANCE, depth_data, output_file_p
unique_pairs
=
[]
unique_splits
=
[]
fragment_size
=
1000
num_fragments
=
(
seq_length
//
fragment_size
)
+
1
fragment_abnormal_reads_pairs
=
np
.
zeros
(
num_fragments
,
dtype
=
int
)
fragment_abnormal_reads_splits
=
np
.
zeros
(
num_fragments
,
dtype
=
int
)
fragment_total_reads
=
np
.
zeros
(
num_fragments
,
dtype
=
int
)
event_pairs
=
[]
event_splits
=
[]
for
read
in
bamfile_handle
.
fetch
(
chr
):
if
read
.
is_paired
:
fragment_index
=
read
.
reference_start
//
fragment_size
fragment_total_reads
[
fragment_index
]
+=
1
if
read
.
is_paired
and
(
read
.
mapping_quality
>
0
):
key
=
(
read
.
reference_start
,
read
.
cigarstring
)
#sys.stderr.write("Chromosome : %s, key : %s, coordinates_pairs[key] : %s\n" % (chr, key, coordinates_pairs[key]))
if
read
.
is_unmapped
or
read
.
mate_is_unmapped
:
if
coordinates_pairs
[
key
]
==
0
:
unique_pairs
.
append
(
read
)
fragment_abnormal_reads_pairs
[
fragment_index
]
+=
1
unique_pairs
.
append
((
read
,
"Unmapped"
))
coordinates_pairs
[
key
]
+=
1
elif
read
.
is_reverse
==
read
.
mate_is_reverse
:
if
coordinates_pairs
[
key
]
==
0
:
unique_pairs
.
append
(
read
)
fragment_abnormal_reads_pairs
[
fragment_index
]
+=
1
unique_pairs
.
append
((
read
,
"INV"
))
coordinates_pairs
[
key
]
+=
1
elif
abs
(
read
.
template_length
)
<
MIN_DISTANCE
or
abs
(
read
.
template_length
)
>
MAX_DISTANCE
:
elif
abs
(
read
.
template_length
)
<
MIN_DISTANCE
:
if
coordinates_pairs
[
key
]
==
0
:
unique_pairs
.
append
(
read
)
fragment_abnormal_reads_pairs
[
fragment_index
]
+=
1
unique_pairs
.
append
((
read
,
"INS"
))
coordinates_pairs
[
key
]
+=
1
elif
abs
(
read
.
template_length
)
>
MAX_DISTANCE
:
if
coordinates_pairs
[
key
]
==
0
:
fragment_abnormal_reads_pairs
[
fragment_index
]
+=
1
unique_pairs
.
append
((
read
,
"DEL"
))
coordinates_pairs
[
key
]
+=
1
elif
read
.
reference_id
!=
read
.
next_reference_id
:
if
coordinates_pairs
[
key
]
==
0
:
unique_pairs
.
append
(
read
)
fragment_abnormal_reads_pairs
[
fragment_index
]
+=
1
unique_pairs
.
append
((
read
,
"TRN"
))
coordinates_pairs
[
key
]
+=
1
if
read
.
cigartuples
and
any
(
cigar_op
in
[
2
,
3
,
4
,
5
]
for
cigar_op
,
_
in
read
.
cigartuples
):
# Lectures splits
elif
read
.
cigartuples
and
any
(
cigar_op
in
[
3
,
4
,
5
]
for
cigar_op
,
_
in
read
.
cigartuples
):
if
not
read
.
has_tag
(
"SA"
):
continue
key
=
(
read
.
reference_start
,
read
.
cigarstring
)
if
coordinates_splits
[
key
]
==
0
:
unique_splits
.
append
(
read
)
coordinates_splits
[
key
]
+=
1
fragment_abnormal_reads_splits
[
fragment_index
]
+=
1
# Classification des événements split
for
cigar_op
,
cigar_len
in
read
.
cigartuples
:
if
cigar_op
==
3
:
# Skipped region
unique_splits
.
append
((
read
,
"SKIP"
))
elif
cigar_op
==
4
:
# Soft clipping
unique_splits
.
append
((
read
,
"SOFT_CLIP"
))
elif
cigar_op
==
5
:
# Hard clipping
unique_splits
.
append
((
read
,
"HARD_CLIP"
))
coordinates_splits
[
key
]
+=
1
logging
.
info
(
"Filtering pairs"
)
for
read
,
event_type
in
unique_pairs
:
fragment_index
=
read
.
reference_start
//
fragment_size
threshold
=
fragment_abnormal_reads_pairs
[
fragment_index
]
/
fragment_total_reads
[
fragment_index
]
if
threshold
>
0.2
:
event_pairs
.
append
((
read
,
event_type
))
logging
.
info
(
"Filtering splits"
)
for
read
,
event_type
in
unique_splits
:
fragment_index
=
read
.
reference_start
//
fragment_size
threshold
=
fragment_abnormal_reads_splits
[
fragment_index
]
/
fragment_total_reads
[
fragment_index
]
if
threshold
>
0.2
:
event_splits
.
append
((
read
,
event_type
))
chromosome_coordinate_pairs_count
=
defaultdict
(
int
)
reads_by_coordinate_pair
=
defaultdict
(
list
)
tolerance
=
400
chromosome_coordinate_splits_count
=
defaultdict
(
int
)
reads_by_coordinate_splits
=
defaultdict
(
list
)
logging
.
info
(
"Merging pairs"
)
#Paired-reads
for
read
,
event_type
in
event_pairs
:
start_chr
=
read
.
reference_name
start_coord
=
read
.
reference_start
end_chr
=
read
.
next_reference_name
end_coord
=
read
.
next_reference_start
found_close_pair
=
False
for
(
chr_pair
,
coord_pair
,
existing_event_type
),
count
in
chromosome_coordinate_pairs_count
.
items
():
(
stored_start_chr
,
stored_end_chr
)
=
chr_pair
(
stored_start_coord
,
stored_end_coord
)
=
coord_pair
if
(
start_chr
==
stored_start_chr
and
end_chr
==
stored_end_chr
and
are_coordinates_close
(
start_coord
,
stored_start_coord
,
tolerance
)
and
are_coordinates_close
(
end_coord
,
stored_end_coord
,
tolerance
)
and
event_type
==
existing_event_type
):
chromosome_coordinate_pairs_count
[(
chr_pair
,
coord_pair
,
event_type
)]
+=
1
reads_by_coordinate_pair
[(
chr_pair
,
coord_pair
,
event_type
)]
.
append
(
read
)
found_close_pair
=
True
break
if
not
found_close_pair
:
chr_pair
=
(
start_chr
,
end_chr
)
coord_pair
=
(
start_coord
,
end_coord
)
chromosome_coordinate_pairs_count
[(
chr_pair
,
coord_pair
,
event_type
)]
+=
1
reads_by_coordinate_pair
[(
chr_pair
,
coord_pair
,
event_type
)]
.
append
(
read
)
logging
.
info
(
"Merging splits"
)
#Split-reads
for
read
,
event_type
in
event_splits
:
start_chr
=
read
.
reference_name
start_coord
=
read
.
reference_start
end_chr
=
read
.
next_reference_name
end_coord
=
read
.
next_reference_start
found_close_splits
=
False
for
(
chr_splits
,
coord_splits
,
existing_event_type
),
count
in
chromosome_coordinate_splits_count
.
items
():
(
stored_start_chr
,
stored_end_chr
)
=
chr_splits
(
stored_start_coord
,
stored_end_coord
)
=
coord_splits
if
(
start_chr
==
stored_start_chr
and
end_chr
==
stored_end_chr
and
are_coordinates_close
(
start_coord
,
stored_start_coord
,
tolerance
)
and
are_coordinates_close
(
end_coord
,
stored_end_coord
,
tolerance
)
and
event_type
==
existing_event_type
):
existing_reads
=
reads_by_coordinate_splits
[(
chr_splits
,
coord_splits
,
event_type
)]
if
any
(
are_secondary_alignments_same
(
read
,
existing_read
)
for
existing_read
in
existing_reads
):
chromosome_coordinate_splits_count
[(
chr_splits
,
coord_splits
,
event_type
)]
+=
1
reads_by_coordinate_splits
[(
chr_splits
,
coord_splits
,
event_type
)]
.
append
(
read
)
found_close_splits
=
True
break
if
not
found_close_splits
:
chr_splits
=
(
start_chr
,
end_chr
)
coord_splits
=
(
start_coord
,
end_coord
)
chromosome_coordinate_splits_count
[(
chr_splits
,
coord_splits
,
event_type
)]
+=
1
reads_by_coordinate_splits
[(
chr_splits
,
coord_splits
,
event_type
)]
.
append
(
read
)
logging
.
info
(
"Writting results for pairs and split"
)
with
open
(
output_file_pairs
,
'a'
)
as
p
:
for
read
in
unique_pairs
:
key
=
(
read
.
reference_start
,
read
.
cigarstring
)
threshold
=
depth_data
[
read
.
reference_start
]
*
0.60
#sys.stderr.write("Chromosome : %s, threshold : %s, depth_data[read.reference_start] : %s\n" % (chr, threshold, depth_data[read.reference_start]))
if
coordinates_pairs
[
key
]
>
threshold
:
p
.
write
(
read
.
to_string
()
+
'
\n
'
)
for
(
chr_pair
,
coord_pair
,
event_type
),
count
in
chromosome_coordinate_pairs_count
.
items
():
if
count
>
1
:
reads_to_merge
=
reads_by_coordinate_pair
[(
chr_pair
,
coord_pair
,
event_type
)]
start_chr
,
end_chr
=
chr_pair
min_start
=
min
(
read
.
reference_start
for
read
in
reads_to_merge
)
max_start
=
max
(
read
.
reference_start
for
read
in
reads_to_merge
)
min_end
=
min
(
read
.
next_reference_start
for
read
in
reads_to_merge
)
max_end
=
max
(
read
.
next_reference_start
for
read
in
reads_to_merge
)
read_count
=
len
(
reads_to_merge
)
p
.
write
(
f
"{start_chr}
\t
{min_start}
\t
{max_start}
\t
{end_chr}
\t
{min_end}
\t
{max_end}
\t
{event_type}
\t
{read_count}
\n
"
)
with
open
(
output_file_splits
,
'a'
)
as
s
:
for
read
in
unique_splits
:
key
=
(
read
.
reference_start
,
read
.
cigarstring
)
threshold
=
depth_data
[
read
.
reference_start
]
*
0.60
if
coordinates_splits
[
key
]
>
threshold
:
s
.
write
(
read
.
to_string
()
+
'
\n
'
)
for
(
chr_splits
,
coord_splits
,
event_type
),
count
in
chromosome_coordinate_splits_count
.
items
():
if
count
>
1
:
reads_to_merge
=
reads_by_coordinate_splits
[(
chr_splits
,
coord_splits
,
event_type
)]
start_chr
,
end_chr
=
chr_splits
min_start
=
min
(
read
.
reference_start
for
read
in
reads_to_merge
)
max_start
=
max
(
read
.
reference_start
for
read
in
reads_to_merge
)
min_end
=
min
(
read
.
next_reference_start
for
read
in
reads_to_merge
)
max_end
=
max
(
read
.
next_reference_start
for
read
in
reads_to_merge
)
read_count
=
len
(
reads_to_merge
)
s
.
write
(
f
"{start_chr}
\t
{min_start}
\t
{max_start}
\t
{end_chr}
\t
{min_end}
\t
{max_end}
\t
{event_type}
\t
{read_count}
\n
"
)
end_time
=
time
.
time
()
elapsed_time
=
end_time
-
start_time
logging
.
info
(
f
"Leaving find_paired_split for {chr} (Time taken: {elapsed_time:.4f} seconds)"
)
def are_coordinates_close(coord1, coord2, tolerance):
    """Return True when two genomic coordinates lie within `tolerance` bases of each other.

    Args:
        coord1: First coordinate (int).
        coord2: Second coordinate (int).
        tolerance: Maximum allowed absolute difference, inclusive.

    Returns:
        bool: True if |coord1 - coord2| <= tolerance.
    """
    # Compute the absolute gap without calling abs(), then compare inclusively.
    gap = coord1 - coord2
    if gap < 0:
        gap = -gap
    return gap <= tolerance
def are_secondary_alignments_same(read1, read2):
    """Return True when both reads carry an identical supplementary-alignment ("SA") tag.

    Args:
        read1: First aligned read (expected to expose `get_tag`, e.g. a pysam
            AlignedSegment — confirm against callers).
        read2: Second aligned read, same expectation.

    Returns:
        bool: True if the two "SA" tag values compare equal.

    Raises:
        KeyError: If either read lacks the "SA" tag (propagated from `get_tag`).
    """
    sa_first = read1.get_tag("SA")
    sa_second = read2.get_tag("SA")
    return sa_first == sa_second
def
main_calcul
(
bamfile_path
,
bamfile_handle
,
...
...
@@ -1374,7 +1511,7 @@ def main_calcul(
depth_data
=
calcul_depth_seq_samtools
(
seq_length
,
bamfile_path
,
chr
)
distances_data
=
calcul_distance
(
bamfile_handle
,
chr
,
seq_length
)
MIN_DISTANCE
,
MAX_DISTANCE
=
stats_distances
(
distances_data
,
chr
)
find_paired_split
(
chr
,
MIN_DISTANCE
,
MAX_DISTANCE
,
depth_data
,
output_file_pairs
,
output_file_splits
,
bamfile_handle
)
find_paired_split
(
chr
,
MIN_DISTANCE
,
MAX_DISTANCE
,
depth_data
,
output_file_pairs
,
output_file_splits
,
bamfile_handle
,
seq_length
)
# Transférer le tableau NumPy vers CUDA
d_depth_data
=
cuda
.
mem_alloc
(
depth_data
.
nbytes
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment