getResults.py 5.56 KB
Newer Older
1 2 3 4
#! /usr/bin/env python3

### ASDP PIPELINE ###
## getResults.py
5 6
## Version: 0.0.1
## Licence: AGPLv3
7
## Author: Anne-Sophie Denommé-Pichon
8
## Description: script to get automatically results from pipeline.sh script in a tsv format on all the locus
9

10
import csv
11
import gzip
12 13 14 15 16
import json
import logging
import os
import os.path
import re
17 18 19 20 21 22
import sys

input_directory = None
output_directory = None
variant_catalog = None

23
with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
24 25 26 27
    for line in config:
        if '=' in line:
            variable, value = line.split('=', 1)
            if variable == 'OUTPUTDIR':
28
                input_directory = value.split('#')[0].strip('"\' \n') # strip double quotes, simple quotes and spaces, new line
29
            elif variable == 'RESULTS_OUTPUTDIR':
30
                output_directory = value.split('#')[0].strip('"\' \n') # strip double quotes, simple quotes and spaces, new line
31
            elif variable == 'EH_VARIANT_CATALOG':
32
                variant_catalog = value.split('#')[0].strip('"\' \n') # strip double quotes, simple quotes and spaces, new line
33

34 35 36
if input_directory is None or output_directory is None or variant_catalog is None:
    logging.error('OUTPUTDIR, RESULTS_OUTPUTDIR or EH_VARIANT_CATALOG is missing in config.sh')
    sys.exit(1)
37

38 39
genotype = re.compile(r'<STR([0-9]+)>')

40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
def enumerate_variants(catalog_path):
    with open(catalog_path) as catalog_file:
        variants = json.load(catalog_file)

        for variant in variants:
            region = variant['ReferenceRegion']
            # TODO handle other regions?
            if isinstance(region, list):
                region = region[0]
            chrom, pos = region.split(':')
            pos = pos.split('-')[0]
            yield variant['LocusId'], (chrom, pos)

def get_eh_results(file_path, region):
    try:
55 56 57 58 59 60 61
        with open(file_path) as file:
            for line in file:
                columns = line.replace('\t', ' ').split()
                if (len(columns) >= 2 and
                    columns[0] == region[0] and # same chrom
                    abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
                    return re.sub(genotype, '\g<1>', columns[4])
62 63 64 65 66 67
        return '.'
    except FileNotFoundError:
        logging.warn(f'File not found "{file_path}"')
        return 'nofile'

def get_tred_results(file_path, region):
68 69 70 71 72 73 74 75 76 77 78 79 80 81
    try:
        with gzip.open(file_path, 'rt') as file:
            for line in file:
                columns = line.replace('\t', ' ').split()
                if (len(columns) >= 2 and
                    columns[0] == region[0] and # same chrom
                    abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
                    for value in columns[7].split(';'):
                        if value.startswith('RPA='):
                            return value[4:]
        return '.'
    except FileNotFoundError:
        logging.warn(f'File not found "{file_path}"')
        return 'nofile'
82 83

def get_gang_results(file_path, region):
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
    try:
        with open(file_path) as file:
            for line in file:
                columns = line.replace('\t', ' ').split()
                if (len(columns) >= 10 and
                    columns[0] == region[0] and # same chrom
                    abs(int(columns[1]) - int(region[1])) <= 10): # same pos (+/- 10)
                    format, genotype = columns[8], columns[9]
                    if genotype != '.':
                        return genotype.split(':')[format.split(':').index('REPCN')]
                    else:
                        return '.'
        return '.'
    except FileNotFoundError:
        logging.warn(f'File not found "{file_path}"')
        return 'nofile'
100

101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
def get_results(locus, region, samples):
    has_header = False
    previous_results = set()
    try:
        with open(os.path.join(output_directory, locus + '.tsv'), 'r') as result_file:
            tsvreader = csv.reader(result_file, delimiter='\t')
            try:
                next(tsvreader) # not take into account the header
                has_header = True
                for row in tsvreader:
                    previous_results.add(row[0])
            except StopIteration:
                pass
    except:
        pass
    
        
    with open(os.path.join(output_directory, locus + '.tsv'), 'a') as result_file:
        if not has_header: # If the outputfile has not header (file empty), it adds the header (unlikely to happen)
            result_file.write('samplexxx\tEH\tTred\tGangSTR\n')
        for sample in samples:
            if sample not in previous_results:
                file_path = os.path.join(input_directory, sample)
                print(file_path)
                eh = get_eh_results(os.path.join(file_path, f'eh/{sample}.vcf'), region)
                tred = get_tred_results(os.path.join(file_path, f'tredparse/{sample}.tred.vcf.gz'), region)
                gang = get_gang_results(os.path.join(file_path, f'gangstr/{sample}.vcf'), region)
                result_file.write(f'{sample}\t{eh}\t{tred}\t{gang}\n')
129 130

if __name__ == '__main__':
131 132 133 134 135
    if len(sys.argv) != 2:
        print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <SAMPLESLIST>', file=sys.stderr)
        sys.exit(1)
    with open(sys.argv[1]) as samples_list:
        samples = []
136 137 138 139
        for line in sorted(samples_list.readlines()):
            sample = line.rstrip()
            if sample:
                samples.append(sample)
140
    os.makedirs(output_directory, exist_ok=True)
141
    for locus, region in enumerate_variants(variant_catalog):
142 143
        print(locus, region)
        get_results(locus, region, samples)