Create script to identify outliers from STR results

parent 1e5b9d7d
#! /bin/sh
SCRIPT="$(dirname "$(readlink -f "$0")")/triplets_outliers.py"
cd '/work/gad/shared/analyse/STR/results2020-01-09' || exit 1
for locus_tsv in $(ls *.tsv | grep -v outliers); do
locus="$(basename "$locus_tsv" ".tsv")"
echo "Processing $locus" >&2
"$SCRIPT" "$locus" > "$locus.outliers.tsv"
done
Gene;High normal range
AFF2;39
AR;35
ATN1;34
ATXN1;38
ATXN10;20
ATXN2;24
ATXN3;36
ATXN7;35
ATXN8OS;34
C9ORF72;19
CACNA1A;7
CBL;NA
CNBP;26
CSTB;3
DIP2B;NA
DMPK;37
FMR1;45
FXN;32
HTT;34
JPH3;27
NOP56;8
PHOX2B;20
PPP2R2B;45
TBP;42
TCF4;NA
BEAN/TK2;NA
DAB1;NA
SAMD12;NA
ALEA;74
\ No newline at end of file
#! /usr/bin/env python3
### ASDP PIPELINE ###
## triplets_outliers.py
## Version : 0.0.1
## Licence : FIXME
## Description : script to get automatically outliers from expansion pipeline results from getResults.py
## Usage :
## Output : FIXME
## Requirements : FIXME
## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Creation Date : 20200202
## last revision date : 20200202
## Known bugs : None
import collections
import csv
import os
import scipy.stats
import sys
path = '/work/gad/shared/analyse/STR/results2020-01-09'
def load_limits():
limits = {}
with open(f'{sys.argv[0].rsplit("/", 1)[0]}{os.sep}patho.csv') as limits_file:
csvreader = csv.reader(limits_file, delimiter=';')
try:
next(csvreader)
for row in csvreader:
if row[1] != 'NA':
limits[row[0]] = int(row[1])
return limits
except StopIteration:
print('Limits file is empty', file=sys.stderr)
sys.exit(1)
def display_outliers(locus, limits):
# results = {
# 'dijen': {
# 'tool': {
# 'Limit': 42,
# '5 %': 42,
# 'Z score': 42
# }
# }
# }
results = collections.OrderedDict()
tools_values = {}
with open(f'{path}{os.sep}{locus}.tsv') as result_file:
tsvreader = csv.reader(result_file, delimiter='\t')
try:
tools = next(tsvreader)[1:]
for row in tsvreader:
dijen = row[0]
results[dijen] = collections.OrderedDict()
for tool_id, tool in enumerate(tools):
tool_value = row[tool_id + 1].replace('nofile', '.').replace('-1', '.')
tools_values.setdefault(tool, [])
results[dijen][tool] = collections.OrderedDict()
results[dijen][tool]['Limit'] = '.'
results[dijen][tool]['5 %'] = tool_value
results[dijen][tool]['Z score'] = tool_value
if tool_value != '.':
# count: number of repeats from the input file
for count in tool_value.split(','):
if count != '.':
tools_values[tool].append(int(count))
if locus in limits:
if int(count) > limits[locus]:
results[dijen][tool]['Limit'] = tool_value
else:
results[dijen][tool]['Limit'] = 'NA'
except StopIteration:
print('Input file is empty', file=sys.stderr)
sys.exit(1)
# 5 % limit
for tool, tool_values in tools_values.items():
if tool_values:
tool_5p_limit = sorted(tool_values)[-len(tool_values)//20:][0]
for dijen, dijen_outliers in results.items():
tool_5p_outliers = dijen_outliers[tool]['5 %']
actual_outlier = False
# count: number of repeats from the input file
for count in tool_5p_outliers.split(','):
if count != '.':
if int(count) >= tool_5p_limit:
actual_outlier = True
break
if not actual_outlier:
dijen_outliers[tool]['5 %'] = '.'
# Z score
for tool, tool_values in tools_values.items():
if tool_values:
if len(tool_values) > 1:
zscores = iter(scipy.stats.zscore(tool_values))
else:
zscores = iter(['.'])
for dijen_outliers in results.values():
actual_outlier = False
zscore_outliers = []
# count: number of repeats from the input file
for count in dijen_outliers[tool]['Z score'].split(','):
if count != '.':
zscore = next(zscores)
if zscore == '.':
zscore_outliers.append('.')
else:
zscore_outliers.append(f'{zscore:.3f}')
if True: #zscore > 2:
actual_outlier = True
if actual_outlier:
dijen_outliers[tool]['Z score'] = ','.join(zscore_outliers)
else:
dijen_outliers[tool]['Z score'] = '.'
# Output
print('dijen\tEH\tEH\tEH\tTred\tTred\tTred\tGangSTR\tGangSTR\tGangSTR')
print('\tLimit\t5 %\tZ score' * 3)
for dijen, dijen_outliers in results.items():
all_outliers = [dijen]
dijen_has_outliers = False
for tool, tool_outliers in dijen_outliers.items():
if tool_outliers:
dijen_has_outliers = True
all_outliers.extend(tool_outliers.values())
else:
all_outliers.append('.')
if dijen_has_outliers:
print('\t'.join(all_outliers))
if __name__ == '__main__':
if len(sys.argv) != 2:
print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS>', file=sys.stderr)
sys.exit(1)
limits = load_limits()
display_outliers(sys.argv[1], limits)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment