Add Tredparse and GangSTR to getResults.py

parent 23fe58b7
......@@ -11,10 +11,11 @@
## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Creation Date : 20191215
## last revision date : 20191215
## last revision date : 20191217
## Known bugs : None
import glob
import gzip
import json
import logging
import os
......@@ -25,6 +26,8 @@ variants_catalog = '/work/gad/shared/bin/expansionhunter/ExpansionHunter-v3.1.2-
input_directory ='/work/gad/shared/analyse/STR/pipeline/'
output_directory ='/work/gad/shared/analyse/STR/results/'
genotype = re.compile(r'<STR([0-9]+)>')
def enumerate_variants(catalog_path):
with open(catalog_path) as catalog_file:
variants = json.load(catalog_file)
......@@ -40,28 +43,51 @@ def enumerate_variants(catalog_path):
def get_eh_results(file_path, region):
try:
with open(file_path) as eh_file:
for line in eh_file:
columns = re.split(r'[ \t]+', line)
# TODO FIXME allow for some difference in position
if len(columns) >= 2 and columns[0] == region[0] and columns[1] == region[1]:
return re.sub('<STR([0-9]+)>', '\g<1>', columns[4])
with open(file_path) as file:
for line in file:
columns = line.replace('\t', ' ').split()
if (len(columns) >= 2 and
columns[0] == region[0] and # same chrom
abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
return re.sub(genotype, '\g<1>', columns[4])
return '.'
except FileNotFoundError:
logging.warn(f'File not found "{file_path}"')
return 'nofile'
def get_tred_results(file_path, region):
# gunzip -c $dijen/tredparse/$dijen.tred.vcf.gz |
# awk '$1 == "chrX" && $2 == "146993569" && $3 == "FXS" {print $8}' |
# tr ';' '\n' |
# awk -F '=' '$1 == "RPA" {print $2}'
return 'FIXME'
try:
with gzip.open(file_path, 'rt') as file:
for line in file:
columns = line.replace('\t', ' ').split()
if (len(columns) >= 2 and
columns[0] == region[0] and # same chrom
abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
for value in columns[7].split(';'):
if value.startswith('RPA='):
return value[4:]
return '.'
except FileNotFoundError:
logging.warn(f'File not found "{file_path}"')
return 'nofile'
def get_gang_results(file_path, region):
# awk '$1 == "chrX" && $2 == "146993569" {print $9 " " $10}' $dijen/gangstr/$dijen.vcf |
# python3 -c 'import sys; format, genotype = sys.stdin.read().split(); print(genotype.split(":")[format.split(":").index("REPCN")]) if genotype != "." else print(".")'
return 'FIXME'
try:
with open(file_path) as file:
for line in file:
columns = line.replace('\t', ' ').split()
if (len(columns) >= 10 and
columns[0] == region[0] and # same chrom
abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
format, genotype = columns[8], columns[9]
if genotype != '.':
return genotype.split(':')[format.split(':').index('REPCN')]
else:
return '.'
return '.'
except FileNotFoundError:
logging.warn(f'File not found "{file_path}"')
return 'nofile'
def get_results(locus, region):
with open(os.path.join(output_directory, locus + '.tsv'), 'w') as result_file:
......@@ -69,7 +95,7 @@ def get_results(locus, region):
for file_path in sorted(glob.glob(os.path.join(input_directory, 'dijen*'))):
file_name = file_path.split(os.sep)[-1]
eh = get_eh_results(os.path.join(file_path, f'eh/{file_name}.vcf'), region)
tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.vcf.gz'), region)
tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.tred.vcf.gz'), region)
gang = get_gang_results(os.path.join(file_path, f'gangstr/{file_name}.vcf'), region)
result_file.write(f'{file_name}\t{eh}\t{tred}\t{gang}\n')
......@@ -77,5 +103,3 @@ if __name__ == '__main__':
os.makedirs(output_directory, exist_ok=True)
for locus, region in enumerate_variants(variants_catalog):
get_results(locus, region)
break # TODO remove
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment