Make results pipeline more generic

parent cc421cc2
......@@ -9,6 +9,7 @@
1. Create `samples.list`
2. Fill the configuration file `config.sh`. Warning, don't overwrite existing files
3. Launch `launch_pipeline.sh` : `nohup ./launch_pipeline.sh samples.list &`. Dependencies :
- `config.sh`
- `samples.list`
- `pipeline.sh`
- `wrapper_delete.sh`
......@@ -17,13 +18,11 @@
- `wrapper_gangstr.sh`
- `wrapper_transfer.sh`
- `wrapper_tredparse.sh`
4. Launch `getResults.py`. Warning, don't overwrite existing files.
5. Specify input directory in `str_plotly.py` and in `launch_str_plotly.sh`.
4. Launch `launch_pipeline_ehdn_outlier.sh` : `nohup ./launch_pipeline_ehdn_outlier.sh &`. Dependencies :
- `config.sh`
- `pipeline_ehdn_outlier.sh`
- `wrapper_ehdn_outlier.sh`
5. Launch `getResults.py`. Warning, don't overwrite existing files.
6. Launch `launch_str_plotly.sh`.
7. Specify input directory in `str_outliers.py` and in `launch_str_outliers.sh`.
8. Change z-score threshold if necessary in `str_outliers.py`.
9. Launch `launch_str_outliers.sh`. Dependency: `patho.csv`.
7. Change z-score threshold if necessary in `config.sh`. Launch `launch_str_outliers.sh`. Dependency: `patho.csv`.
8. Get files (e.g.: `scp 'an1770de@ssh-ccub.u-bourgogne.fr:/work/gad/shared/analyse/STR/results/*' .`)
Dans doc : préciser que le répertoire input doit être différent du répertoire d'output (précaution pour éviter de supprimer données brutes)
\ No newline at end of file
......@@ -3,8 +3,12 @@
- [ ] Ajouter dans la doc que Tred doit être ouvert dans virtual
- [ ] Mode transfert
- [ ] Préciser qu'il ne faut rien ajouter manuellement dans les répertoires de OUTPUTDIR (car EHDN travaille sur tous les répertoires présents dans OUTPUTDIR, uniquement samples)
- [ ] Préciser que le répertoire input doit être différent du répertoire d'output (précaution pour éviter de supprimer les données brutes)
- [ ] Dépendances : Plotly pour str_plotly.py et Scipy pour str_outliers.py, Python 3
- [ ] EHDN
- [ ] Brancher EHDN et outlier
- [ ] Séparer case et contrôles
- [ ] Ajouter EHDN dans GetResults
- [ ] Ajouter le sample dans le nom de fichier de log
- [ ] Changer le répertoire de sortie : /STR/pipeline car pipeline n'est pas explicite (répertoire avec fichiers de sortie des outils de détection de STR)
- [ ] Remove dijen from str_outliers.py
......@@ -8,6 +8,7 @@
INPUTDIR="/archive/gad/shared/bam_new_genome_temp"
OUTPUTDIR="/work/gad/shared/analyse/STR/pipeline"
RESULTS_OUTPUTDIR="/work/gad/shared/analyse/STR/results" # for getResults.py, Plotly and outliers tables
# Valid values: "sge"
INFRA=sge
......@@ -33,9 +34,12 @@ GANGSTR_REGIONS="/work/gad/shared/bin/gangstr/STRregions/hg19_ver13_1.bed"
EHDN="/work/gad/shared/bin/expansionhunterdenovo/ExpansionHunterDenovo-v0.8.0-linux_x86_64/bin/ExpansionHunterDenovo-v0.8.0"
EHDN_OUTLIER="/work/gad/shared/bin/expansionhunterdenovo/ExpansionHunterDenovo-v0.8.0-linux_x86_64/scripts/outlier.py"
REF="/work/gad/shared/pipeline/hg19/index/hg19_essential.fa"
# Outliers
ZSCORE_THRESHOLD=4.0
PERCENTILE_THRESHOLD=1.0
......@@ -2,17 +2,10 @@
### ASDP PIPELINE ###
## getResults.py
## Version : 0.0.1
## Licence : FIXME
## Description : script to get automatically results from pipeline.sh script in a tsv format on all the locus
## Usage :
## Output : FIXME
## Requirements : FIXME
## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Creation Date : 20191215
## last revision date : 20191217
## Known bugs : None
## Version: 0.0.1
## Licence: AGPLv3
## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
## Description: script to automatically collect the results of pipeline.sh in TSV format, one file per locus
import glob
import gzip
......@@ -21,10 +14,26 @@ import logging
import os
import os.path
import re
import sys
input_directory = None
output_directory = None
variant_catalog = None
# Read the settings from the config.sh that sits next to this script.
# config.sh is a shell file: only plain `NAME=value` assignments are understood
# here, shell expansions are not evaluated.
with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
    for line in config:
        if '=' not in line:
            continue
        variable, value = line.split('=', 1)
        variable = variable.strip()
        # Drop a trailing comment, then strip quotes AND whitespace. The
        # end-of-line character must be stripped too, otherwise an assignment
        # without a trailing comment keeps its closing quote and newline.
        value = value.split('#')[0].strip('"\' \t\r\n')
        if variable == 'OUTPUTDIR':
            # The pipeline's output directory is this script's input.
            input_directory = value
        elif variable == 'RESULTS_OUTPUTDIR':
            output_directory = value
        elif variable == 'EH_VARIANT_CATALOG':
            variant_catalog = value
variants_catalog = '/work/gad/shared/bin/expansionhunter/ExpansionHunter-v3.1.2-linux_x86_64/variant_catalog/hg19/variant_catalog.json'
input_directory ='/work/gad/shared/analyse/STR/pipeline/'
output_directory ='/work/gad/shared/analyse/STR/results/'
if input_directory is None or output_directory is None or variant_catalog is None:
logging.error('OUTPUTDIR, RESULTS_OUTPUTDIR or EH_VARIANT_CATALOG is missing in config.sh')
sys.exit(1)
genotype = re.compile(r'<STR([0-9]+)>')
......@@ -91,8 +100,8 @@ def get_gang_results(file_path, region):
def get_results(locus, region):
with open(os.path.join(output_directory, locus + '.tsv'), 'w') as result_file:
result_file.write('dijenxxx\tEH\tTred\tGangSTR\n')
for file_path in sorted(glob.glob(os.path.join(input_directory, 'dijen*'))):
result_file.write('samplexxx\tEH\tTred\tGangSTR\n')
for file_path in sorted(glob.glob(os.path.join(input_directory, '*'))):
file_name = file_path.split(os.sep)[-1]
eh = get_eh_results(os.path.join(file_path, f'eh/{file_name}.vcf'), region)
tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.tred.vcf.gz'), region)
......
......@@ -14,4 +14,4 @@ printf "%s\n" * |
"$PARALLEL" \
--jobs "$PARALLEL_JOB_COUNT" \
--line-buffer \
"$(dirname "$0")/pipeline_ehdn_outlier.sh"
"$(dirname "$(readlink -f "$0")")/pipeline_ehdn_outlier.sh"
#! /bin/sh
### ASDP PIPELINE ###
## Version: 0.0.1
## Licence: AGPLv3
## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Description : script to launch the script to get automatically outliers from expansion pipeline results from getResults.py
# Source configuration file
. "$(dirname "$0")/config.sh"
SCRIPT="$(dirname "$(readlink -f "$0")")/str_outliers.py"
cd '/work/gad/shared/analyse/STR/results' || exit 1
# The per-locus TSV files written by getResults.py are in RESULTS_OUTPUTDIR
# (OUTPUTDIR only holds the raw output of the STR detection tools).
cd "$RESULTS_OUTPUTDIR" || exit 1
for locus_tsv in $(ls *.tsv | grep -v outliers); do
locus="$(basename "$locus_tsv" ".tsv")"
echo "Processing $locus" >&2
......
#! /bin/sh
### ASDP PIPELINE ###
## Version: 0.0.1
## Licence: AGPLv3
## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
## Description: script to launch the script to get automatically graphics from expansion pipeline results from getResults.py with Plotly
# Source configuration file
. "$(dirname "$0")/config.sh"
SCRIPT="$(dirname "$(readlink -f "$0")")/str_plotly.py"
cd '/work/gad/shared/analyse/STR/results' || exit 1
cd "$RESULTS_OUTPUTDIR" || exit 1
for locus_tsv in *.tsv; do
locus="$(basename "$locus_tsv" ".tsv")"
echo "Processing $locus" >&2
......
......@@ -2,31 +2,44 @@
### ASDP PIPELINE ###
## Version : 0.0.1
## Licence : FIXME
## Licence : AGPLv3
## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Description : script to get automatically outliers from expansion pipeline results from getResults.py
## Usage :
## Output : FIXME
## Requirements : FIXME
## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Creation Date : 20200202
## last revision date : 20200216
## Known bugs : None
import collections
import csv
import logging
import math
import os
import os.path
import scipy.stats
import sys
path = '/work/gad/shared/analyse/STR/results'
zscore_threshold = 4
output_directory = None
zscore_threshold = None
percentile_threshold = None
# Read the settings from the config.sh that sits next to this script.
# config.sh is a shell file: only plain `NAME=value` assignments are understood
# here, shell expansions are not evaluated.
with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
    for line in config:
        if '=' not in line:
            continue
        variable, value = line.split('=', 1)
        variable = variable.strip()
        # Drop a trailing comment, then strip quotes AND whitespace. The
        # end-of-line character must be stripped too, otherwise an assignment
        # without a trailing comment keeps its closing quote and newline.
        value = value.split('#')[0].strip('"\' \t\r\n')
        if variable == 'RESULTS_OUTPUTDIR':
            output_directory = value
        elif variable == 'ZSCORE_THRESHOLD':
            zscore_threshold = float(value)
        elif variable == 'PERCENTILE_THRESHOLD':
            percentile_threshold = float(value)

# All three settings are required: stop early with the message below rather
# than continuing with a value that was never set.
if output_directory is None or zscore_threshold is None or percentile_threshold is None:
    logging.error('RESULTS_OUTPUTDIR or ZSCORE_THRESHOLD or PERCENTILE_THRESHOLD is missing in config.sh')
    sys.exit(1)
zscore_label = f'Z>={zscore_threshold}'
percentile_threshold = 1.0
percentile_label = f'{percentile_threshold}%'
def load_limits():
limits = {}
with open(f'{sys.argv[0].rsplit("/", 1)[0]}{os.sep}patho.csv') as limits_file:
......@@ -54,7 +67,7 @@ def display_outliers(locus, limits):
# }
results = collections.OrderedDict()
tools_values = {}
with open(f'{path}{os.sep}{locus}.tsv') as result_file:
with open(f'{output_directory}{os.sep}{locus}.tsv') as result_file:
tsvreader = csv.reader(result_file, delimiter='\t')
try:
tools = next(tsvreader)[1:]
......@@ -131,7 +144,7 @@ def display_outliers(locus, limits):
dijen_outliers[tool][zscore_label] = '.'
# Output
print('dijen\tEH\tEH\tEH\tEH\tTred\tTred\tTred\tTred\tGangSTR\tGangSTR\tGangSTR\tGangSTR')
print('sample\tEH\tEH\tEH\tEH\tTred\tTred\tTred\tTred\tGangSTR\tGangSTR\tGangSTR\tGangSTR')
print(f'\tLimit\t{percentile_label}\t{zscore_label}\t< 3' * 3)
for dijen, dijen_outliers in results.items():
all_outliers = [dijen]
......
#! /usr/bin/env python3
### ASDP PIPELINE ###
## Version : 0.0.1
## Licence : FIXME
## Description : script to get automatically graphics from expansion pipeline results from getResults.py with Plotly
## Usage :
## Output : FIXME
## Requirements : FIXME
## Version: 0.0.1
## Licence: AGPLv3
## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
## Creation Date : 20200202
## last revision date : 20200202
## Known bugs : None
## Description: script to get automatically graphics from expansion pipeline results from getResults.py with Plotly
import collections
import csv
import logging
import os
import os.path
import sys
path = '/work/gad/shared/analyse/STR/results'
output_directory = None
# Read the settings from the config.sh that sits next to this script.
# config.sh is a shell file: only plain `NAME=value` assignments are understood
# here, shell expansions are not evaluated.
with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
    for line in config:
        if '=' not in line:
            continue
        variable, value = line.split('=', 1)
        # Drop a trailing comment, then strip quotes AND whitespace. The
        # end-of-line character must be stripped too, otherwise an assignment
        # without a trailing comment keeps its closing quote and newline.
        if variable.strip() == 'RESULTS_OUTPUTDIR':
            output_directory = value.split('#')[0].strip('"\' \t\r\n')

# The results directory is required to locate the per-locus TSV files.
if output_directory is None:
    logging.error('RESULTS_OUTPUTDIR is missing in config.sh')
    sys.exit(1)
def display_console_graph(title, tools, data):
print(title)
......@@ -54,7 +60,7 @@ def display_html_graph(title, tools, data):
def graph_locus(locus):
title = f'Effectif pour chaque nombre de répétitions au locus {locus}'
data = []
with open(f'{path}{os.sep}{locus}.tsv') as result_file:
with open(f'{output_directory}{os.sep}{locus}.tsv') as result_file:
tsvreader = csv.reader(result_file, delimiter='\t')
try:
tools = next(tsvreader)[1:]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment