Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
STR
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Anne-Sophie Denommé-Pichon
STR
Commits
4e810be0
Commit
4e810be0
authored
Dec 18, 2019
by
Anne-Sophie Denommé-Pichon
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add Tredparse and GangSTR to getResults.py
parent
23fe58b7
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
42 additions
and
18 deletions
+42
-18
getResults.py
getResults.py
+42
-18
No files found.
getResults.py
View file @
4e810be0
...
...
@@ -11,10 +11,11 @@
## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Creation Date : 20191215
## last revision date : 2019121
5
## last revision date : 2019121
7
## Known bugs : None
import
glob
import
gzip
import
json
import
logging
import
os
...
...
@@ -25,6 +26,8 @@ variants_catalog = '/work/gad/shared/bin/expansionhunter/ExpansionHunter-v3.1.2-
input_directory
=
'/work/gad/shared/analyse/STR/pipeline/'
output_directory
=
'/work/gad/shared/analyse/STR/results/'
genotype
=
re
.
compile
(
r'<STR([0-9]+)>'
)
def
enumerate_variants
(
catalog_path
):
with
open
(
catalog_path
)
as
catalog_file
:
variants
=
json
.
load
(
catalog_file
)
...
...
@@ -40,28 +43,51 @@ def enumerate_variants(catalog_path):
def
get_eh_results
(
file_path
,
region
):
try
:
with
open
(
file_path
)
as
eh_file
:
for
line
in
eh_file
:
columns
=
re
.
split
(
r'[ \t]+'
,
line
)
# TODO FIXME allow for some difference in position
if
len
(
columns
)
>=
2
and
columns
[
0
]
==
region
[
0
]
and
columns
[
1
]
==
region
[
1
]:
return
re
.
sub
(
'<STR([0-9]+)>'
,
'
\
g<1>'
,
columns
[
4
])
with
open
(
file_path
)
as
file
:
for
line
in
file
:
columns
=
line
.
replace
(
'
\t
'
,
' '
)
.
split
()
if
(
len
(
columns
)
>=
2
and
columns
[
0
]
==
region
[
0
]
and
# same chrom
abs
(
int
(
columns
[
1
])
-
int
(
region
[
1
]))
<=
10
):
# same pos (+/- 10)
return
re
.
sub
(
genotype
,
'
\
g<1>'
,
columns
[
4
])
return
'.'
except
FileNotFoundError
:
logging
.
warn
(
f
'File not found "{file_path}"'
)
return
'nofile'
def
get_tred_results
(
file_path
,
region
):
# gunzip -c $dijen/tredparse/$dijen.tred.vcf.gz |
# awk '$1 == "chrX" && $2 == "146993569" && $3 == "FXS" {print $8}' |
# tr ';' '\n' |
# awk -F '=' '$1 == "RPA" {print $2}'
return
'FIXME'
try
:
with
gzip
.
open
(
file_path
,
'rt'
)
as
file
:
for
line
in
file
:
columns
=
line
.
replace
(
'
\t
'
,
' '
)
.
split
()
if
(
len
(
columns
)
>=
2
and
columns
[
0
]
==
region
[
0
]
and
# same chrom
abs
(
int
(
columns
[
1
])
-
int
(
region
[
1
]))
<=
10
):
# same pos (+/- 10)
for
value
in
columns
[
7
]
.
split
(
';'
):
if
value
.
startswith
(
'RPA='
):
return
value
[
4
:]
return
'.'
except
FileNotFoundError
:
logging
.
warn
(
f
'File not found "{file_path}"'
)
return
'nofile'
def
get_gang_results
(
file_path
,
region
):
# awk '$1 == "chrX" && $2 == "146993569" {print $9 " " $10}' $dijen/gangstr/$dijen.vcf |
# python3 -c 'import sys; format, genotype = sys.stdin.read().split(); print(genotype.split(":")[format.split(":").index("REPCN")]) if genotype != "." else print(".")'
return
'FIXME'
try
:
with
open
(
file_path
)
as
file
:
for
line
in
file
:
columns
=
line
.
replace
(
'
\t
'
,
' '
)
.
split
()
if
(
len
(
columns
)
>=
10
and
columns
[
0
]
==
region
[
0
]
and
# same chrom
abs
(
int
(
columns
[
1
])
-
int
(
region
[
1
]))
<=
10
):
# same pos (+/- 10)
format
,
genotype
=
columns
[
8
],
columns
[
9
]
if
genotype
!=
'.'
:
return
genotype
.
split
(
':'
)[
format
.
split
(
':'
)
.
index
(
'REPCN'
)]
else
:
return
'.'
return
'.'
except
FileNotFoundError
:
logging
.
warn
(
f
'File not found "{file_path}"'
)
return
'nofile'
def
get_results
(
locus
,
region
):
with
open
(
os
.
path
.
join
(
output_directory
,
locus
+
'.tsv'
),
'w'
)
as
result_file
:
...
...
@@ -69,7 +95,7 @@ def get_results(locus, region):
for
file_path
in
sorted
(
glob
.
glob
(
os
.
path
.
join
(
input_directory
,
'dijen*'
))):
file_name
=
file_path
.
split
(
os
.
sep
)[
-
1
]
eh
=
get_eh_results
(
os
.
path
.
join
(
file_path
,
f
'eh/{file_name}.vcf'
),
region
)
tred
=
get_tred_results
(
os
.
path
.
join
(
file_path
,
f
'tredparse/{file_name}.vcf.gz'
),
region
)
tred
=
get_tred_results
(
os
.
path
.
join
(
file_path
,
f
'tredparse/{file_name}.
tred.
vcf.gz'
),
region
)
gang
=
get_gang_results
(
os
.
path
.
join
(
file_path
,
f
'gangstr/{file_name}.vcf'
),
region
)
result_file
.
write
(
f
'{file_name}
\t
{eh}
\t
{tred}
\t
{gang}
\n
'
)
...
...
@@ -77,5 +103,3 @@ if __name__ == '__main__':
os
.
makedirs
(
output_directory
,
exist_ok
=
True
)
for
locus
,
region
in
enumerate_variants
(
variants_catalog
):
get_results
(
locus
,
region
)
break
# TODO remove
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment