Add Tredparse and GangSTR to getResults.py

4e810be0 · Anne-Sophie Denommé-Pichon · 23fe58b7 · 4e810be0
Commit 4e810be0 authored Dec 18, 2019 by Anne-Sophie Denommé-Pichon
Show whitespace changes
Inline Side-by-side

Showing with 42 additions and 18 deletions

getResults.py getResults.py +42 -18

No files found.
--- a/getResults.py
+++ b/getResults.py
@@ -11,10 +11,11 @@

 ## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
 ## Creation Date : 20191215
-## last revision date : 20191215
+## last revision date : 20191217
 ## Known bugs : None

 import glob
+import gzip
 import json
 import logging
 import os
@@ -25,6 +26,8 @@ variants_catalog = '/work/gad/shared/bin/expansionhunter/ExpansionHunter-v3.1.2-
 input_directory ='/work/gad/shared/analyse/STR/pipeline/'
 output_directory ='/work/gad/shared/analyse/STR/results/'

+genotype = re.compile(r'<STR([0-9]+)>')
+
 def enumerate_variants(catalog_path):
    with open(catalog_path) as catalog_file:
        variants = json.load(catalog_file)
@@ -40,28 +43,51 @@ def enumerate_variants(catalog_path):

 def get_eh_results(file_path, region):
    try:
-        with open(file_path) as eh_file:
-            for line in eh_file:
-                columns = re.split(r'[ \t]+', line)
-                # TODO FIXME allow for some difference in position
-                if len(columns) >= 2 and columns[0] == region[0] and columns[1] == region[1]:
-                    return re.sub('<STR([0-9]+)>', '\g<1>', columns[4])
+        with open(file_path) as file:
+            for line in file:
+                columns = line.replace('\t', ' ').split()
+                if (len(columns) >= 2 and
+                    columns[0] == region[0] and # same chrom
+                    abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
+                    return re.sub(genotype, '\g<1>', columns[4])
        return '.'
    except FileNotFoundError:
        logging.warn(f'File not found "{file_path}"')
        return 'nofile'

 def get_tred_results(file_path, region):
-#             gunzip -c $dijen/tredparse/$dijen.tred.vcf.gz |
-#                 awk '$1 == "chrX" && $2 == "146993569" && $3 == "FXS" {print $8}' |
-#                 tr ';' '\n' |
-#                 awk -F '=' '$1 == "RPA" {print $2}'
-    return 'FIXME'
+    try:
+        with gzip.open(file_path, 'rt') as file:
+            for line in file:
+                columns = line.replace('\t', ' ').split()
+                if (len(columns) >= 2 and
+                    columns[0] == region[0] and # same chrom
+                    abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
+                    for value in columns[7].split(';'):
+                        if value.startswith('RPA='):
+                            return value[4:]
+        return '.'
+    except FileNotFoundError:
+        logging.warn(f'File not found "{file_path}"')
+        return 'nofile'

 def get_gang_results(file_path, region):
-#             awk '$1 == "chrX" && $2 == "146993569" {print $9 " " $10}' $dijen/gangstr/$dijen.vcf |
-#                 python3 -c 'import sys; format, genotype = sys.stdin.read().split(); print(genotype.split(":")[format.split(":").index("REPCN")]) if genotype != "." else print(".")'
-    return 'FIXME'
+    try:
+        with open(file_path) as file:
+            for line in file:
+                columns = line.replace('\t', ' ').split()
+                if (len(columns) >= 10 and
+                    columns[0] == region[0] and # same chrom
+                    abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
+                    format, genotype = columns[8], columns[9]
+                    if genotype != '.':
+                        return genotype.split(':')[format.split(':').index('REPCN')]
+                    else:
+                        return '.'
+        return '.'
+    except FileNotFoundError:
+        logging.warn(f'File not found "{file_path}"')
+        return 'nofile'

 def get_results(locus, region):
    with open(os.path.join(output_directory, locus + '.tsv'), 'w') as result_file:
@@ -69,7 +95,7 @@ def get_results(locus, region):
        for file_path in sorted(glob.glob(os.path.join(input_directory, 'dijen*'))):
            file_name = file_path.split(os.sep)[-1]
            eh = get_eh_results(os.path.join(file_path, f'eh/{file_name}.vcf'), region)
-            tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.vcf.gz'), region)
+            tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.tred.vcf.gz'), region)
            gang = get_gang_results(os.path.join(file_path, f'gangstr/{file_name}.vcf'), region)
            result_file.write(f'{file_name}\t{eh}\t{tred}\t{gang}\n')

@@ -77,5 +103,3 @@ if __name__ == '__main__':
    os.makedirs(output_directory, exist_ok=True)
    for locus, region in enumerate_variants(variants_catalog):
        get_results(locus, region)
-        break # TODO remove
-