Add result cache to handle only new samples & group results scripts in one

fcbfaac5 · Anne-Sophie Denommé-Pichon · c5833135 · fcbfaac5 · fcbfaac5 · fcbfaac5
Commit fcbfaac5 authored Jun 21, 2020 by Anne-Sophie Denommé-Pichon
8 changed files
--- a/README.md
+++ b/README.md
@@ -7,8 +7,8 @@
 - Description: How to launch scripts to get STR genotype from genomes on all the locus tested
 1. Create `samples.list`
-2. Fill the configuration file `config.sh`. Warning, don't overwrite existing files
+2. Fill the configuration file `config.sh`.
-3. Launch `launch_pipeline.sh` : `nohup ./launch_pipeline.sh samples.list &`. Dependencies :
+3. Launch `launch_pipeline.sh`: `nohup ./launch_pipeline.sh samples.list &`. Dependencies :
   - `config.sh`
   - `samples.list`
   - `pipeline.sh`
@@ -18,11 +18,17 @@
   - `wrapper_gangstr.sh`
   - `wrapper_transfer.sh`
   - `wrapper_tredparse.sh`
-4. Launch `launch_pipeline_ehdn_outlier.sh` : `nohup ./launch_pipeline_ehdn_outlier.sh &`. Dependencies :
+4. Optional: launch `launch_pipeline_ehdn_outlier.sh`: `nohup ./launch_pipeline_ehdn_outlier.sh &`. Dependencies :
   - `config.sh`
   - `pipeline_ehdn_outlier.sh`
   - `wrapper_ehdn_outlier.sh`
-5. Launch `getResults.py`. Warning, don't overwrite existing files.
+5. Launch `launch_results.sh`: `nohup ./launch_results.sh &`. Dependencies:
-6. Launch `launch_str_plotly.sh`.
+   - `config.sh`
-7. Change z-score threshold if necessary in `config.sh`. Launch `launch_str_outliers.sh`. Dependency: `patho.csv`.
+   - `sample.list`
-10. Get files (i.e.: `scp 'an1770de@ssh-ccub.u-bourgogne.fr:/work/gad/shared/analyse/STR/results/*' .`)
+   - `patho.csv`
+   - `getResults.py`
+   - `launch_str_outliers.sh`
+   - `str_outliers.py`
+6. Optional: launch `launch_str_plotly.sh`.
+7. Get files (i.e.: `scp 'an1770de@ssh-ccub.u-bourgogne.fr:/work/gad/shared/analyse/STR/results/*' .`)
--- a/TODO.org
+++ b/TODO.org
@@ -9,11 +9,14 @@
  - [ ] Brancher EHDN et outlier
  - [ ] Séparer case et contrôles
  - [ ] Ajouter EHDN dans GetResults
- [ ] Ajouter le sample dans le nom de fichier de log pour pieline.sh
+- [ ] Ajouter le sample dans le nom de fichier de log pour pipeline.sh
 - [ ] Changer le répertoire de sortie : /STR/pipeline car pipeline n'est pas explicite (répertoire avec fichiers de sortie des outils de détection de STR)
 - [ ] Remove dijen from str_outliers.py
 - [ ] Prévoir l'installation des dépendances
 - [ ] Faire un launcher pour lancer getResults dans SGE
- [ ] Ajouter la possibilité de faire tourner getResults sur une liste d'échantillons donnée (actuellement, glob *)
+- [X] Ajouter la possibilité de faire tourner getResults sur une liste d'échantillons donnée (actuellement, glob *)
- [ ] Créer le répertoire results automatiquement + avec la date
+- [X] Créer le répertoire results automatiquement (sans la date)
- [ ] Ajouter la possibilité de lancer le pipeline depuis n'importe quel répertoire et pas seulement celui où se trouve les scripts
+- [ ] Ajouter la possibilité de lancer le pipeline depuis n'importe quel répertoire et pas seulement celui où se trouvent les scripts
+- [ ] Créer un script pour faire le ménage après le passage de SGE
+- [ ] Possibilité de choisir les outils à utiliser (par exemple, option pour ne pas utiliser GangSTR)
+- [X] Gérer l'absence de queue transfer
--- a/config.sh
+++ b/config.sh
@@ -8,7 +8,7 @@
 INPUTDIR="/archive/gad/shared/bam_new_genome_temp"
 OUTPUTDIR="/work/gad/shared/analyse/STR/pipeline"
-RESULTS_OUTPUTDIR="/work/gad/shared/analyse/STR/results" # for getResults.py, Plotly and outliers tables
+RESULTS_OUTPUTDIR="/work/gad/shared/analyse/STR/results" # For Plotly and outliers tables
 # Valid values: "sge"
 INFRA=sge
@@ -16,7 +16,7 @@ INFRA=sge
 # Variables specific to SGE
 TRANSFER_QUEUE=transfer
 COMPUTE_QUEUE=batch
-TRANSFER=yes #FIXME attention, changer aussi le delete +++
+TRANSFER=yes
 # Tools
 PARALLEL="/work/gad/shared/bin/parallel/parallel-20150522-1.el7.cern/bin/parallel"

--- a/getResults.py
+++ b/getResults.py
@@ -7,7 +7,7 @@
 ## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
 ## Description: script to get automatically results from pipeline.sh script in a tsv format on all the locus
-import glob
+import csv
 import gzip
 import json
 import logging
@@ -98,17 +98,44 @@ def get_gang_results(file_path, region):
        logging.warn(f'File not found "{file_path}"')
        return 'nofile'
-def get_results(locus, region):
+def get_results(locus, region, samples):
-    with open(os.path.join(output_directory, locus + '.tsv'), 'w') as result_file:
+    has_header = False
+    previous_results = set()
+    try:
+        with open(os.path.join(output_directory, locus + '.tsv'), 'r') as result_file:
+            tsvreader = csv.reader(result_file, delimiter='\t')
+            try:
+                next(tsvreader) # not take into account the header
+                has_header = True
+                for row in tsvreader:
+                    previous_results.add(row[0])
+            except StopIteration:
+                pass
+    except:
+        pass
+    with open(os.path.join(output_directory, locus + '.tsv'), 'a') as result_file:
+        if not has_header: # If the outputfile has not header (file empty), it adds the header (unlikely to happen)
            result_file.write('samplexxx\tEH\tTred\tGangSTR\n')
-        for file_path in sorted(glob.glob(os.path.join(input_directory, '*'))):
+        for sample in samples:
-            file_name = file_path.split(os.sep)[-1]
+            if sample not in previous_results:
-            eh = get_eh_results(os.path.join(file_path, f'eh/{file_name}.vcf'), region)
+                file_path = os.path.join(input_directory, sample)
-            tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.tred.vcf.gz'), region)
+                print(file_path)
-            gang = get_gang_results(os.path.join(file_path, f'gangstr/{file_name}.vcf'), region)
+                eh = get_eh_results(os.path.join(file_path, f'eh/{sample}.vcf'), region)
-            result_file.write(f'{file_name}\t{eh}\t{tred}\t{gang}\n')
+                tred = get_tred_results(os.path.join(file_path, f'tredparse/{sample}.tred.vcf.gz'), region)
+                gang = get_gang_results(os.path.join(file_path, f'gangstr/{sample}.vcf'), region)
+                result_file.write(f'{sample}\t{eh}\t{tred}\t{gang}\n')
 if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <SAMPLESLIST>', file=sys.stderr)
+        sys.exit(1)
+    with open(sys.argv[1]) as samples_list:
+        samples = []
+        for sample in sorted(samples_list.readlines()):
+            samples.append(sample.rstrip())
    os.makedirs(output_directory, exist_ok=True)
    for locus, region in enumerate_variants(variant_catalog):
-        get_results(locus, region)
+        print(locus, region)
+        get_results(locus, region, samples)
--- a/launch_results.sh
+++ b/launch_results.sh
+#! /bin/sh
+### ASDP PIPELINE ###
+## Version: 0.0.1
+## Licence: AGPLV3
+## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
+## Description: script to launch the pipeline for getting STR detection results. Receive multiple samples: one sample per line
+# $1 : first argument in the command line : a list containing one sample per line, for exemple samples.list
+SAMPLES="$1"
+# Check if sample is specified
+if [ -z "$SAMPLES" ]
+then
+    echo "List of samples is not specified"
+    echo "$(date +"%F_%H-%M-%S"): END"
+    exit 1
+fi
+# Source configuration file
+. "$(dirname "$0")/config.sh"
+# Create the results output directory
+mkdir -p "$RESULTS_OUTPUTDIR"
+# Launch getResults.py
+"$(dirname "$0")/getResults.py" "$SAMPLES"
+# Launch launch_str_outliers.sh
+"$(dirname "$0")/launch_str_outliers.sh" "$SAMPLES"
--- a/launch_str_outliers.sh
+++ b/launch_str_outliers.sh
@@ -9,12 +9,25 @@
 # Source configuration file
 . "$(dirname "$0")/config.sh"
+# $1 : first argument in the command line : a list containing one sample per line, for exemple samples.list
+SAMPLES="$1"
+# Check if sample is specified
+if [ -z "$SAMPLES" ]
+then
+    echo "List of samples is not specified"
+    echo "$(date +"%F_%H-%M-%S"): END"
+    exit 1
+fi
 SCRIPT="$(dirname "$(readlink -f "$0")")/str_outliers.py"
+SAMPLES="$(dirname "$(readlink -f "$0")")/$SAMPLES"
+cd "$RESULTS_OUTPUTDIR" || exit 1
-cd "$OUTPUTDIR" || exit 1
+   for locus_tsv in $(ls *.tsv | grep -v outliers); do
-for locus_tsv in $(ls *.tsv | grep -v outliers); do
    locus="$(basename "$locus_tsv" ".tsv")"
    echo "Processing $locus" >&2
-    "$SCRIPT" "$locus" > "$locus.outliers.tsv"
+    "$SCRIPT" "$locus" "$SAMPLES" > "$locus.outliers.tsv"
 done
--- a/pipeline.sh
+++ b/pipeline.sh
@@ -4,7 +4,7 @@
 ## Version: 0.0.1
 ## Licence: AGPLV3
 ## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
-## Description: script to launch the pipeline for STR detection
+## Description: script to launch the pipeline for STR detection. This script handles a single sample.
 # $1 : first argument in the command line : the input file

--- a/str_outliers.py
+++ b/str_outliers.py
@@ -22,18 +22,18 @@ output_directory = None
 zscore_threshold = None
 percentile_threshold = None
-with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh'))) as config:
+with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
    for line in config:
        if '=' in line:
            variable, value = line.split('=', 1)
-            elif variable == 'RESULTS_OUTPUTDIR':
+            if variable == 'RESULTS_OUTPUTDIR':
                output_directory = value.split('#')[0].strip('"\' ') # strip double quotes, simple quotes and spaces
            elif variable == 'ZSCORE_THRESHOLD':
                zscore_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces
            elif variable == 'PERCENTILE_THRESHOLD':
                percentile_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces
-if output_directory is None
+if output_directory is None:
    logging.error('RESULTS_OUTPUTDIR or ZSCORE_THRESHOLD or PERCENTILE_THRESHOLD is missing in config.sh')
    sys.exit(1)
@@ -54,7 +54,7 @@ def load_limits():
            print('Limits file is empty', file=sys.stderr)
            sys.exit(1)
-def display_outliers(locus, limits):
+def display_outliers(locus, limits, samples):
    # results = {
    #     'dijen': {
    #         'tool': {
@@ -73,6 +73,7 @@ def display_outliers(locus, limits):
            tools = next(tsvreader)[1:]
            for row in tsvreader:
                dijen = row[0]
                results[dijen] = collections.OrderedDict()
                for tool_id, tool in enumerate(tools):
                    tool_value = row[tool_id + 1].replace('nofile', '.').replace('-1', '.')
@@ -157,12 +158,16 @@ def display_outliers(locus, limits):
                all_outliers.extend(tool_outliers.values())
            else:
                all_outliers.append('.')
-        if dijen_has_outliers:
+        if dijen_has_outliers and dijen in samples:
            print('\t'.join(all_outliers))
 if __name__ == '__main__':
-    if len(sys.argv) != 2:
+    if len(sys.argv) != 3:
-        print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS>', file=sys.stderr)
+        print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS> <SAMPLE.LIST>', file=sys.stderr)
        sys.exit(1)
+    with open(sys.argv[2]) as samples_list:
+        samples = set()
+        for sample in samples_list.readlines():
+            samples.add(sample.rstrip())
    limits = load_limits()
-    display_outliers(sys.argv[1], limits)
+    display_outliers(sys.argv[1], limits, samples)