triplets_outliers.py 5.66 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
#! /usr/bin/env python3

### ASDP PIPELINE ###
## triplets_outliers.py
## Version : 0.0.1
## Licence : FIXME
## Description : script to get automatically outliers from expansion pipeline results from getResults.py
## Usage :
## Output : FIXME
## Requirements : FIXME

## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Creation Date : 20200202
## last revision date : 20200202
## Known bugs : None

import collections
import csv
import os
import scipy.stats
import sys

path = '/work/gad/shared/analyse/STR/results2020-01-09'

def load_limits():
    limits = {}
    with open(f'{sys.argv[0].rsplit("/", 1)[0]}{os.sep}patho.csv') as limits_file:
        csvreader = csv.reader(limits_file, delimiter=';')
        try:
            next(csvreader)
            for row in csvreader:
                if row[1] != 'NA':
                    limits[row[0]] = int(row[1])
            return limits
        except StopIteration:
            print('Limits file is empty', file=sys.stderr)
            sys.exit(1)

def display_outliers(locus, limits):
    # results = {
    #     'dijen': {
    #         'tool': {
    #             'Limit': 42,
    #             '5 %': 42,
45 46
    #             'Z score': 42,
    #             '< 3': 2
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
    #         }
    #     }
    # }
    results = collections.OrderedDict()
    tools_values = {}
    with open(f'{path}{os.sep}{locus}.tsv') as result_file:
        tsvreader = csv.reader(result_file, delimiter='\t')
        try:
            tools = next(tsvreader)[1:]
            for row in tsvreader:
                dijen = row[0]
                results[dijen] = collections.OrderedDict()
                for tool_id, tool in enumerate(tools):
                    tool_value = row[tool_id + 1].replace('nofile', '.').replace('-1', '.')
                    tools_values.setdefault(tool, [])
                    results[dijen][tool] = collections.OrderedDict()
                    results[dijen][tool]['Limit'] = '.'
                    results[dijen][tool]['5 %'] = tool_value
                    results[dijen][tool]['Z score'] = tool_value
66 67 68
                    results[dijen][tool]['< 3'] = '.'

                    # > upper limit of normality or < 3
69 70 71 72 73
                    if tool_value != '.':
                        # count: number of repeats from the input file
                        for count in tool_value.split(','):
                            if count != '.':
                                tools_values[tool].append(int(count))
74 75
                                if int(count) < 3:
                                    results[dijen][tool]['< 3'] = tool_value
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
                                if locus in limits:
                                    if int(count) > limits[locus]:
                                        results[dijen][tool]['Limit'] = tool_value
                                else:
                                    results[dijen][tool]['Limit'] = 'NA'

        except StopIteration:
            print('Input file is empty', file=sys.stderr)
            sys.exit(1)

    # 5 % limit
    for tool, tool_values in tools_values.items():
        if tool_values:
            tool_5p_limit = sorted(tool_values)[-len(tool_values)//20:][0]
            for dijen, dijen_outliers in results.items():
                tool_5p_outliers = dijen_outliers[tool]['5 %']
                actual_outlier = False
                # count: number of repeats from the input file
                for count in tool_5p_outliers.split(','):
                    if count != '.':
                        if int(count) >= tool_5p_limit:
                            actual_outlier = True
                            break
                if not actual_outlier:
                    dijen_outliers[tool]['5 %'] = '.'

    # Z score
    for tool, tool_values in tools_values.items():
        if tool_values:
            if len(tool_values) > 1:
                zscores = iter(scipy.stats.zscore(tool_values))
            else:
                zscores = iter(['.'])
            for dijen_outliers in results.values():
                actual_outlier = False
                zscore_outliers = []
                # count: number of repeats from the input file                
                for count in dijen_outliers[tool]['Z score'].split(','):
                    if count != '.':
                        zscore = next(zscores)
                        if zscore == '.':
                            zscore_outliers.append('.')
                        else:
                            zscore_outliers.append(f'{zscore:.3f}')
120
                            if zscore >= 2.0:
121
                                actual_outlier = True
122 123 124 125 126 127
                if actual_outlier:
                    dijen_outliers[tool]['Z score'] = ','.join(zscore_outliers)
                else:
                    dijen_outliers[tool]['Z score'] = '.'

    # Output
128 129
    print('dijen\tEH\tEH\tEH\tEH\tTred\tTred\tTred\tTred\tGangSTR\tGangSTR\tGangSTR\tGangSTR')
    print('\tLimit\t5 %\tZ score\t< 3' * 3)
130 131 132 133 134
    for dijen, dijen_outliers in results.items():
        all_outliers = [dijen]
        dijen_has_outliers = False
        for tool, tool_outliers in dijen_outliers.items():
            if tool_outliers:
135 136 137
                for tool_outlier_value in tool_outliers.values():
                    if tool_outlier_value != '.':
                        dijen_has_outliers = True
138 139 140 141 142 143 144 145 146 147 148 149
                all_outliers.extend(tool_outliers.values())
            else:
                all_outliers.append('.')
        if dijen_has_outliers:
            print('\t'.join(all_outliers))

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS>', file=sys.stderr)
        sys.exit(1)
    limits = load_limits()
    display_outliers(sys.argv[1], limits)