Make results pipeline more generic

2b4c77e7 · Anne-Sophie Denommé-Pichon · cc421cc2 · 2b4c77e7 · 2b4c77e7 · 2b4c77e7
Commit 2b4c77e7 authored Jun 07, 2020 by Anne-Sophie Denommé-Pichon
9 changed files
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@
 1. Create `samples.list`
 2. Fill the configuration file `config.sh`. Warning, don't overwrite existing files
 3. Launch `launch_pipeline.sh` : `nohup ./launch_pipeline.sh samples.list &`. Dependencies :
+   - `config.sh`
   - `samples.list`
   - `pipeline.sh`
   - `wrapper_delete.sh`
@@ -17,13 +18,11 @@
   - `wrapper_gangstr.sh`
   - `wrapper_transfer.sh`
   - `wrapper_tredparse.sh`
-4. Launch `getResults.py`. Warning, don't overwrite existing files.
-5. Specify input directory in `str_plotly.py` and in `launch_str_plotly.sh`.
+4. Launch `launch_pipeline_ehdn_outlier.sh` : `nohup ./launch_pipeline_ehdn_outlier.sh &`. Dependencies :
+   - `config.sh`
+   - `pipeline_ehdn_outlier.sh`
+   - `wrapper_ehdn_outlier.sh`
+5. Launch `getResults.py`. Warning, don't overwrite existing files.
 6. Launch `launch_str_plotly.sh`.
-7. Specify input directory in `str_outliers.py` and in `launch_str_outliers.sh`.
-8. Change z-score threshold if necessary in `str_outliers.py`.
-9. Launch `launch_str_outliers.sh`. Dependency: `patho.csv`.
+7. Change z-score threshold if necessary in `config.sh`. Launch `launch_str_outliers.sh`. Dependency: `patho.csv`.
 10. Get files (i.e.: `scp 'an1770de@ssh-ccub.u-bourgogne.fr:/work/gad/shared/analyse/STR/results/*' .`)
-
-
-Dans doc : préciser que le répertoire input doit être différent du répertoire d'output (précaution pour éviter de supprimer données brutes)
\ No newline at end of file
--- a/TODO.org
+++ b/TODO.org
@@ -3,8 +3,12 @@
  - [ ] Ajouter dans la doc que Tred doit être ouvert dans virtual
  - [ ] Mode transfert
  - [ ] Préciser qu'il ne faut rien ajouter manuellement dans les répertoires de OUTPUTDIR (car EHDN travaille sur tous les répertoires présents dans OUTPUTDIR, uniquement samples)
+  - [ ] Préciser que le répertoire input doit être différent du répertoire d'output (précaution pour éviter de supprimer les données brutes)
+  - [ ] Dépendances : Plotly pour str_plotly.py et Scipy pour str_outliers.py, Python 3
 - [ ] EHDN
  - [ ] Brancher EHDN et outlier
  - [ ] Séparer case et contrôles
  - [ ] Ajouter EHDN dans GetResults
 - [ ] Ajouter le sample dans le nom de fichier de log
+- [ ] Changer le répertoire de sortie : /STR/pipeline car pipeline n'est pas explicite (répertoire avec fichiers de sortie des outils de détection de STR)
+- [ ] Remove dijen from str_outliers.py
--- a/config.sh
+++ b/config.sh
@@ -8,6 +8,7 @@

 INPUTDIR="/archive/gad/shared/bam_new_genome_temp"
 OUTPUTDIR="/work/gad/shared/analyse/STR/pipeline"
+RESULTS_OUTPUTDIR="/work/gad/shared/analyse/STR/results" # for getResults.py, Plotly and outliers tables

 # Valid values: "sge"
 INFRA=sge
@@ -33,9 +34,12 @@ GANGSTR_REGIONS="/work/gad/shared/bin/gangstr/STRregions/hg19_ver13_1.bed"
 EHDN="/work/gad/shared/bin/expansionhunterdenovo/ExpansionHunterDenovo-v0.8.0-linux_x86_64/bin/ExpansionHunterDenovo-v0.8.0"
 EHDN_OUTLIER="/work/gad/shared/bin/expansionhunterdenovo/ExpansionHunterDenovo-v0.8.0-linux_x86_64/scripts/outlier.py"

-
 REF="/work/gad/shared/pipeline/hg19/index/hg19_essential.fa"

+# Outliers
+ZSCORE_THRESHOLD=4.0
+PERCENTILE_THRESHOLD=1.0
+



--- a/getResults.py
+++ b/getResults.py
@@ -2,17 +2,10 @@

 ### ASDP PIPELINE ###
 ## getResults.py
-## Version : 0.0.1
-## Licence : FIXME
-## Description : script to get automatically results from pipeline.sh script in a tsv format on all the locus
-## Usage : 
-## Output : FIXME
-## Requirements : FIXME
-
-## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
-## Creation Date : 20191215
-## last revision date : 20191217
-## Known bugs : None
+## Version: 0.0.1
+## Licence: AGPLv3
+## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
+## Description: script to get automatically results from pipeline.sh script in a tsv format on all the locus

 import glob
 import gzip
@@ -21,10 +14,26 @@ import logging
 import os
 import os.path
 import re
+import sys
+
+input_directory = None
+output_directory = None
+variant_catalog = None
+
+with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
+    for line in config:
+        if '=' in line:
+            variable, value = line.split('=', 1)
+            if variable == 'OUTPUTDIR':
+                input_directory = value.split('#')[0].strip('"\' \t\r\n') # drop trailing comment, then strip quotes and whitespace (incl. the newline)
+            elif variable == 'RESULTS_OUTPUTDIR':
+                output_directory = value.split('#')[0].strip('"\' \t\r\n') # drop trailing comment, then strip quotes and whitespace (incl. the newline)
+            elif variable == 'EH_VARIANT_CATALOG':
+                variant_catalog = value.split('#')[0].strip('"\' \t\r\n') # drop trailing comment, then strip quotes and whitespace (incl. the newline)

-variants_catalog = '/work/gad/shared/bin/expansionhunter/ExpansionHunter-v3.1.2-linux_x86_64/variant_catalog/hg19/variant_catalog.json'
-input_directory ='/work/gad/shared/analyse/STR/pipeline/'
-output_directory ='/work/gad/shared/analyse/STR/results/'
+if input_directory is None or output_directory is None or variant_catalog is None:
+    logging.error('OUTPUTDIR, RESULTS_OUTPUTDIR or EH_VARIANT_CATALOG is missing in config.sh')
+    sys.exit(1)

 genotype = re.compile(r'<STR([0-9]+)>')

@@ -91,8 +100,8 @@ def get_gang_results(file_path, region):

 def get_results(locus, region):
    with open(os.path.join(output_directory, locus + '.tsv'), 'w') as result_file:
-        result_file.write('dijenxxx\tEH\tTred\tGangSTR\n')
-        for file_path in sorted(glob.glob(os.path.join(input_directory, 'dijen*'))):
+        result_file.write('samplexxx\tEH\tTred\tGangSTR\n')
+        for file_path in sorted(glob.glob(os.path.join(input_directory, '*'))):
            file_name = file_path.split(os.sep)[-1]
            eh = get_eh_results(os.path.join(file_path, f'eh/{file_name}.vcf'), region)
            tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.tred.vcf.gz'), region)

--- a/launch_pipeline_ehdn_outlier.sh
+++ b/launch_pipeline_ehdn_outlier.sh
@@ -14,4 +14,4 @@ printf "%s\n" * |
    "$PARALLEL" \
 	--jobs "$PARALLEL_JOB_COUNT" \
        --line-buffer \
-	"$(dirname "$0")/pipeline_ehdn_outlier.sh"
+	"$(dirname "$(readlink -f "$0")")/pipeline_ehdn_outlier.sh"
--- a/launch_str_outliers.py
+++ b/launch_str_outliers.py
 #! /bin/sh

+### ASDP PIPELINE ###
+## Version: 0.0.1
+## Licence: AGPLv3
+## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
+## Description : script to launch the script to get automatically outliers from expansion pipeline results from getResults.py
+
+# Source configuration file (provides RESULTS_OUTPUTDIR, where getResults.py writes the per-locus .tsv files)
+. "$(dirname "$0")/config.sh"
+
 SCRIPT="$(dirname "$(readlink -f "$0")")/str_outliers.py"

-cd '/work/gad/shared/analyse/STR/results' || exit 1
+cd "$RESULTS_OUTPUTDIR" || exit 1
 for locus_tsv in $(ls *.tsv | grep -v outliers); do
    locus="$(basename "$locus_tsv" ".tsv")"
    echo "Processing $locus" >&2

--- a/launch_str_plotly.sh
+++ b/launch_str_plotly.sh
 #! /bin/sh

+### ASDP PIPELINE ###
+## Version: 0.0.1
+## Licence: AGPLv3
+## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
+## Description: script to launch the script to get automatically graphics from expansion pipeline results from getResults.py with Plotly
+
+# Source configuration file
+. "$(dirname "$0")/config.sh"
+
 SCRIPT="$(dirname "$(readlink -f "$0")")/str_plotly.py"

-cd '/work/gad/shared/analyse/STR/results' || exit 1
+cd "$RESULTS_OUTPUTDIR" || exit 1
 for locus_tsv in *.tsv; do
    locus="$(basename "$locus_tsv" ".tsv")"
    echo "Processing $locus" >&2

--- a/str_outliers.py
+++ b/str_outliers.py
@@ -2,31 +2,44 @@

 ### ASDP PIPELINE ###
 ## Version : 0.0.1
-## Licence : FIXME
+## Licence : AGPLv3
+## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
 ## Description : script to get automatically outliers from expansion pipeline results from getResults.py
-## Usage :
-## Output : FIXME
-## Requirements : FIXME

-## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
-## Creation Date : 20200202
-## last revision date : 20200216
-## Known bugs : None

 import collections
 import csv
+import logging
 import math
 import os
+import os.path
 import scipy.stats
 import sys

-path = '/work/gad/shared/analyse/STR/results'
-zscore_threshold = 4
+
+
+output_directory = None
+zscore_threshold = None
+percentile_threshold = None
+
+with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
+    for line in config:
+        if '=' in line:
+            variable, value = line.split('=', 1)
+            if variable == 'RESULTS_OUTPUTDIR':
+                output_directory = value.split('#')[0].strip('"\' \t\r\n') # drop trailing comment, then strip quotes and whitespace (incl. the newline)
+            elif variable == 'ZSCORE_THRESHOLD':
+                zscore_threshold = float(value.split('#')[0].strip('"\' \t\r\n')) # drop trailing comment, then strip quotes and whitespace (incl. the newline)
+            elif variable == 'PERCENTILE_THRESHOLD':
+                percentile_threshold = float(value.split('#')[0].strip('"\' \t\r\n')) # drop trailing comment, then strip quotes and whitespace (incl. the newline)
+
+if output_directory is None or zscore_threshold is None or percentile_threshold is None:
+    logging.error('RESULTS_OUTPUTDIR or ZSCORE_THRESHOLD or PERCENTILE_THRESHOLD is missing in config.sh')
+    sys.exit(1)
+
 zscore_label = f'Z>={zscore_threshold}'
-percentile_threshold = 1.0
 percentile_label = f'{percentile_threshold}%'

-
 def load_limits():
    limits = {}
    with open(f'{sys.argv[0].rsplit("/", 1)[0]}{os.sep}patho.csv') as limits_file:
@@ -54,7 +67,7 @@ def display_outliers(locus, limits):
    # }
    results = collections.OrderedDict()
    tools_values = {}
-    with open(f'{path}{os.sep}{locus}.tsv') as result_file:
+    with open(f'{output_directory}{os.sep}{locus}.tsv') as result_file:
        tsvreader = csv.reader(result_file, delimiter='\t')
        try:
            tools = next(tsvreader)[1:]
@@ -131,7 +144,7 @@ def display_outliers(locus, limits):
                    dijen_outliers[tool][zscore_label] = '.'

    # Output
-    print('dijen\tEH\tEH\tEH\tEH\tTred\tTred\tTred\tTred\tGangSTR\tGangSTR\tGangSTR\tGangSTR')
+    print('sample\tEH\tEH\tEH\tEH\tTred\tTred\tTred\tTred\tGangSTR\tGangSTR\tGangSTR\tGangSTR')
    print(f'\tLimit\t{percentile_label}\t{zscore_label}\t< 3' * 3)
    for dijen, dijen_outliers in results.items():
        all_outliers = [dijen]

--- a/str_plotly.py
+++ b/str_plotly.py
 #! /usr/bin/env python3

 ### ASDP PIPELINE ###
-## Version : 0.0.1
-## Licence : FIXME
-## Description : script to get automatically graphics from expansion pipeline results from getResults.py with Plotly
-## Usage :
-## Output : FIXME
-## Requirements : FIXME
-
+## Version: 0.0.1
+## Licence: AGPLv3
 ## Author : anne-sophie.denomme-pichon@u-bourgogne.fr
-## Creation Date : 20200202
-## last revision date : 20200202
-## Known bugs : None
+## Description: script to get automatically graphics from expansion pipeline results from getResults.py with Plotly

 import collections
 import csv
+import logging
 import os
+import os.path
 import sys

-path = '/work/gad/shared/analyse/STR/results'
+output_directory = None
+
+with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
+    for line in config:
+        if '=' in line:
+            variable, value = line.split('=', 1)
+            if variable == 'RESULTS_OUTPUTDIR':
+                output_directory = value.split('#')[0].strip('"\' \t\r\n') # drop trailing comment, then strip quotes and whitespace (incl. the newline)
+
+if output_directory is None:
+    logging.error('RESULTS_OUTPUTDIR is missing in config.sh')
+    sys.exit(1)

 def display_console_graph(title, tools, data):
    print(title)
@@ -54,7 +60,7 @@ def display_html_graph(title, tools, data):
 def graph_locus(locus):
    title = f'Effectif pour chaque nombre de répétitions au locus {locus}'
    data = []
-    with open(f'{path}{os.sep}{locus}.tsv') as result_file:
+    with open(f'{output_directory}{os.sep}{locus}.tsv') as result_file:
        tsvreader = csv.reader(result_file, delimiter='\t')
        try:
            tools = next(tsvreader)[1:]