str_outliers.py 6.96 KB
Newer Older
1 2 3 4
#! /usr/bin/env python3

### ASDP PIPELINE ###
## Version : 0.0.1
5
## Licence : AGPLv3
6
## Author : Anne-Sophie Denommé-Pichon
7 8 9 10
## Description : script to get automatically outliers from expansion pipeline results from getResults.py

import collections
import csv
11
import logging
12
import math
13
import os
14
import os.path
15 16 17
import scipy.stats
import sys

18 19 20 21
output_directory = None
zscore_threshold = None
percentile_threshold = None

22
with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
23 24 25
    for line in config:
        if '=' in line:
            variable, value = line.split('=', 1)
26
            if variable == 'RESULTS_OUTPUTDIR':
27 28 29 30 31 32
                output_directory = value.split('#')[0].strip('"\' ') # strip double quotes, simple quotes and spaces
            elif variable == 'ZSCORE_THRESHOLD':
                zscore_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces
            elif variable == 'PERCENTILE_THRESHOLD':
                percentile_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces

33
if output_directory is None:
34 35 36
    logging.error('RESULTS_OUTPUTDIR or ZSCORE_THRESHOLD or PERCENTILE_THRESHOLD is missing in config.sh')
    sys.exit(1)

37 38 39
zscore_label = f'Z>={zscore_threshold}'
percentile_label = f'{percentile_threshold}%'

40 41 42 43 44 45 46 47 48 49 50 51 52 53
def load_limits():
    limits = {}
    with open(f'{sys.argv[0].rsplit("/", 1)[0]}{os.sep}patho.csv') as limits_file:
        csvreader = csv.reader(limits_file, delimiter=';')
        try:
            next(csvreader)
            for row in csvreader:
                if row[1] != 'NA':
                    limits[row[0]] = int(row[1])
            return limits
        except StopIteration:
            print('Limits file is empty', file=sys.stderr)
            sys.exit(1)

54
def display_outliers(locus, limits, samples):
55 56 57 58 59
    # results = {
    #     'dijen': {
    #         'tool': {
    #             'Limit': 42,
    #             '5 %': 42,
60 61
    #             'Z score': 42,
    #             '< 3': 2
62 63 64 65 66
    #         }
    #     }
    # }
    results = collections.OrderedDict()
    tools_values = {}
67
    with open(f'{output_directory}{os.sep}{locus}.tsv') as result_file:
68 69 70 71 72
        tsvreader = csv.reader(result_file, delimiter='\t')
        try:
            tools = next(tsvreader)[1:]
            for row in tsvreader:
                dijen = row[0]
73

74 75 76 77 78 79
                results[dijen] = collections.OrderedDict()
                for tool_id, tool in enumerate(tools):
                    tool_value = row[tool_id + 1].replace('nofile', '.').replace('-1', '.')
                    tools_values.setdefault(tool, [])
                    results[dijen][tool] = collections.OrderedDict()
                    results[dijen][tool]['Limit'] = '.'
80 81
                    results[dijen][tool][percentile_label] = tool_value
                    results[dijen][tool][zscore_label] = tool_value
82 83 84
                    results[dijen][tool]['< 3'] = '.'

                    # > upper limit of normality or < 3
85 86 87 88 89
                    if tool_value != '.':
                        # count: number of repeats from the input file
                        for count in tool_value.split(','):
                            if count != '.':
                                tools_values[tool].append(int(count))
90 91
                                if int(count) < 3:
                                    results[dijen][tool]['< 3'] = tool_value
92 93 94 95 96
                                if locus in limits:
                                    if int(count) > limits[locus]:
                                        results[dijen][tool]['Limit'] = tool_value
                                else:
                                    results[dijen][tool]['Limit'] = 'NA'
97
    
98 99 100 101
        except StopIteration:
            print('Input file is empty', file=sys.stderr)
            sys.exit(1)

102
    # outlier threshold (example: 5%)
103
    for tool, tool_values in tools_values.items():
104
        # Test if there is at least one value given by the tool
105
        if tool_values:
106
            tool_percentile_limit = sorted(tool_values)[-math.ceil(len(tool_values) * percentile_threshold / 100):][0]
107
            for dijen, dijen_outliers in results.items():
108
                tool_percentile_outliers = dijen_outliers[tool][percentile_label]
109 110
                actual_outlier = False
                # count: number of repeats from the input file
111
                for count in tool_percentile_outliers.split(','):
112
                    if count != '.':
113
                        if int(count) >= tool_percentile_limit:
114 115 116
                            actual_outlier = True
                            break
                if not actual_outlier:
117
                    dijen_outliers[tool][percentile_label] = '.'
118 119 120 121 122 123 124 125 126 127 128 129

    # Z score
    for tool, tool_values in tools_values.items():
        if tool_values:
            if len(tool_values) > 1:
                zscores = iter(scipy.stats.zscore(tool_values))
            else:
                zscores = iter(['.'])
            for dijen_outliers in results.values():
                actual_outlier = False
                zscore_outliers = []
                # count: number of repeats from the input file                
130
                for count in dijen_outliers[tool][zscore_label].split(','):
131 132 133 134 135 136
                    if count != '.':
                        zscore = next(zscores)
                        if zscore == '.':
                            zscore_outliers.append('.')
                        else:
                            zscore_outliers.append(f'{zscore:.3f}')
137
                            if zscore >= zscore_threshold:
138
                                actual_outlier = True
139
                if actual_outlier:
140
                    dijen_outliers[tool][zscore_label] = ','.join(zscore_outliers)
141
                else:
142
                    dijen_outliers[tool][zscore_label] = '.'
143 144

    # Output
145
    print('sample\tEH\tEH\tEH\tEH\tTred\tTred\tTred\tTred\tGangSTR\tGangSTR\tGangSTR\tGangSTR')
146
    print(f'\tLimit\t{percentile_label}\t{zscore_label}\t< 3' * 3)
147 148 149 150 151
    for dijen, dijen_outliers in results.items():
        all_outliers = [dijen]
        dijen_has_outliers = False
        for tool, tool_outliers in dijen_outliers.items():
            if tool_outliers:
152 153 154
                for tool_outlier_value in tool_outliers.values():
                    if tool_outlier_value != '.':
                        dijen_has_outliers = True
155 156 157
                all_outliers.extend(tool_outliers.values())
            else:
                all_outliers.append('.')
158
        if dijen_has_outliers and dijen in samples:
159 160 161
            print('\t'.join(all_outliers))

if __name__ == '__main__':
162
    if len(sys.argv) != 3:
163
        print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS> <SAMPLES.LIST>', file=sys.stderr)
164
        sys.exit(1)
165 166
    with open(sys.argv[2]) as samples_list:
        samples = set()
167 168 169 170
        for line in samples_list.readlines():
            sample = line.rstrip()
            if sample:
                samples.add(sample)
171
    limits = load_limits()
172
    display_outliers(sys.argv[1], limits, samples)