Add result cache to handle only new samples & group results scripts in one

parent c5833135
...@@ -7,8 +7,8 @@ ...@@ -7,8 +7,8 @@
- Description: How to launch scripts to get STR genotype from genomes on all the locus tested - Description: How to launch scripts to get STR genotype from genomes on all the locus tested
1. Create `samples.list` 1. Create `samples.list`
2. Fill the configuration file `config.sh`. Warning, don't overwrite existing files 2. Fill the configuration file `config.sh`.
3. Launch `launch_pipeline.sh` : `nohup ./launch_pipeline.sh samples.list &`. Dependencies : 3. Launch `launch_pipeline.sh`: `nohup ./launch_pipeline.sh samples.list &`. Dependencies :
- `config.sh` - `config.sh`
- `samples.list` - `samples.list`
- `pipeline.sh` - `pipeline.sh`
...@@ -18,11 +18,17 @@ ...@@ -18,11 +18,17 @@
- `wrapper_gangstr.sh` - `wrapper_gangstr.sh`
- `wrapper_transfer.sh` - `wrapper_transfer.sh`
- `wrapper_tredparse.sh` - `wrapper_tredparse.sh`
4. Launch `launch_pipeline_ehdn_outlier.sh` : `nohup ./launch_pipeline_ehdn_outlier.sh &`. Dependencies : 4. Optional: launch `launch_pipeline_ehdn_outlier.sh`: `nohup ./launch_pipeline_ehdn_outlier.sh &`. Dependencies :
- `config.sh` - `config.sh`
- `pipeline_ehdn_outlier.sh` - `pipeline_ehdn_outlier.sh`
- `wrapper_ehdn_outlier.sh` - `wrapper_ehdn_outlier.sh`
5. Launch `getResults.py`. Warning, don't overwrite existing files. 5. Launch `launch_results.sh`: `nohup ./launch_results.sh &`. Dependencies:
6. Launch `launch_str_plotly.sh`. - `config.sh`
7. Change z-score threshold if necessary in `config.sh`. Launch `launch_str_outliers.sh`. Dependency: `patho.csv`. - `sample.list`
10. Get files (i.e.: `scp 'an1770de@ssh-ccub.u-bourgogne.fr:/work/gad/shared/analyse/STR/results/*' .`) - `patho.csv`
- `getResults.py`
- `launch_str_outliers.sh`
- `str_outliers.py`
6. Optional: launch `launch_str_plotly.sh`.
7. Get files (i.e.: `scp 'an1770de@ssh-ccub.u-bourgogne.fr:/work/gad/shared/analyse/STR/results/*' .`)
...@@ -9,11 +9,14 @@ ...@@ -9,11 +9,14 @@
- [ ] Brancher EHDN et outlier - [ ] Brancher EHDN et outlier
- [ ] Séparer case et contrôles - [ ] Séparer case et contrôles
- [ ] Ajouter EHDN dans GetResults - [ ] Ajouter EHDN dans GetResults
- [ ] Ajouter le sample dans le nom de fichier de log pour pieline.sh - [ ] Ajouter le sample dans le nom de fichier de log pour pipeline.sh
- [ ] Changer le répertoire de sortie : /STR/pipeline car pipeline n'est pas explicite (répertoire avec fichiers de sortie des outils de détection de STR) - [ ] Changer le répertoire de sortie : /STR/pipeline car pipeline n'est pas explicite (répertoire avec fichiers de sortie des outils de détection de STR)
- [ ] Remove dijen from str_outliers.py - [ ] Remove dijen from str_outliers.py
- [ ] Prévoir l'installation des dépendances - [ ] Prévoir l'installation des dépendances
- [ ] Faire un launcher pour lancer getResults dans SGE - [ ] Faire un launcher pour lancer getResults dans SGE
- [ ] Ajouter la possibilité de faire tourner getResults sur une liste d'échantillons donnée (actuellement, glob *) - [X] Ajouter la possibilité de faire tourner getResults sur une liste d'échantillons donnée (actuellement, glob *)
- [ ] Créer le répertoire results automatiquement + avec la date - [X] Créer le répertoire results automatiquement (sans la date)
- [ ] Ajouter la possibilité de lancer le pipeline depuis n'importe quel répertoire et pas seulement celui où se trouve les scripts - [ ] Ajouter la possibilité de lancer le pipeline depuis n'importe quel répertoire et pas seulement celui où se trouvent les scripts
- [ ] Créer un script pour faire le ménage après le passage de SGE
- [ ] Possibilité de choisir les outils à utiliser (par exemple, option pour ne pas utiliser GangSTR)
- [X] Gérer l'absence de queue transfer
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
INPUTDIR="/archive/gad/shared/bam_new_genome_temp" INPUTDIR="/archive/gad/shared/bam_new_genome_temp"
OUTPUTDIR="/work/gad/shared/analyse/STR/pipeline" OUTPUTDIR="/work/gad/shared/analyse/STR/pipeline"
RESULTS_OUTPUTDIR="/work/gad/shared/analyse/STR/results" # for getResults.py, Plotly and outliers tables RESULTS_OUTPUTDIR="/work/gad/shared/analyse/STR/results" # For Plotly and outliers tables
# Valid values: "sge" # Valid values: "sge"
INFRA=sge INFRA=sge
...@@ -16,7 +16,7 @@ INFRA=sge ...@@ -16,7 +16,7 @@ INFRA=sge
# Variables specific to SGE # Variables specific to SGE
TRANSFER_QUEUE=transfer TRANSFER_QUEUE=transfer
COMPUTE_QUEUE=batch COMPUTE_QUEUE=batch
TRANSFER=yes #FIXME attention, changer aussi le delete +++ TRANSFER=yes
# Tools # Tools
PARALLEL="/work/gad/shared/bin/parallel/parallel-20150522-1.el7.cern/bin/parallel" PARALLEL="/work/gad/shared/bin/parallel/parallel-20150522-1.el7.cern/bin/parallel"
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
## Author: anne-sophie.denomme-pichon@u-bourgogne.fr ## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
## Description: script to get automatically results from pipeline.sh script in a tsv format on all the locus ## Description: script to get automatically results from pipeline.sh script in a tsv format on all the locus
import glob import csv
import gzip import gzip
import json import json
import logging import logging
...@@ -98,17 +98,44 @@ def get_gang_results(file_path, region): ...@@ -98,17 +98,44 @@ def get_gang_results(file_path, region):
logging.warn(f'File not found "{file_path}"') logging.warn(f'File not found "{file_path}"')
return 'nofile' return 'nofile'
def get_results(locus, region): def get_results(locus, region, samples):
with open(os.path.join(output_directory, locus + '.tsv'), 'w') as result_file: has_header = False
previous_results = set()
try:
with open(os.path.join(output_directory, locus + '.tsv'), 'r') as result_file:
tsvreader = csv.reader(result_file, delimiter='\t')
try:
next(tsvreader) # not take into account the header
has_header = True
for row in tsvreader:
previous_results.add(row[0])
except StopIteration:
pass
except:
pass
with open(os.path.join(output_directory, locus + '.tsv'), 'a') as result_file:
if not has_header: # If the outputfile has not header (file empty), it adds the header (unlikely to happen)
result_file.write('samplexxx\tEH\tTred\tGangSTR\n') result_file.write('samplexxx\tEH\tTred\tGangSTR\n')
for file_path in sorted(glob.glob(os.path.join(input_directory, '*'))): for sample in samples:
file_name = file_path.split(os.sep)[-1] if sample not in previous_results:
eh = get_eh_results(os.path.join(file_path, f'eh/{file_name}.vcf'), region) file_path = os.path.join(input_directory, sample)
tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.tred.vcf.gz'), region) print(file_path)
gang = get_gang_results(os.path.join(file_path, f'gangstr/{file_name}.vcf'), region) eh = get_eh_results(os.path.join(file_path, f'eh/{sample}.vcf'), region)
result_file.write(f'{file_name}\t{eh}\t{tred}\t{gang}\n') tred = get_tred_results(os.path.join(file_path, f'tredparse/{sample}.tred.vcf.gz'), region)
gang = get_gang_results(os.path.join(file_path, f'gangstr/{sample}.vcf'), region)
result_file.write(f'{sample}\t{eh}\t{tred}\t{gang}\n')
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) != 2:
print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <SAMPLESLIST>', file=sys.stderr)
sys.exit(1)
with open(sys.argv[1]) as samples_list:
samples = []
for sample in sorted(samples_list.readlines()):
samples.append(sample.rstrip())
os.makedirs(output_directory, exist_ok=True) os.makedirs(output_directory, exist_ok=True)
for locus, region in enumerate_variants(variant_catalog): for locus, region in enumerate_variants(variant_catalog):
get_results(locus, region) print(locus, region)
get_results(locus, region, samples)
#! /bin/sh
### ASDP PIPELINE ###
## Version: 0.0.1
## Licence: AGPLV3
## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
## Description: script to launch the pipeline for getting STR detection results. Receive multiple samples: one sample per line
# $1 : first argument in the command line : a list containing one sample per line, for exemple samples.list
SAMPLES="$1"
# Check if sample is specified
if [ -z "$SAMPLES" ]
then
echo "List of samples is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
# Source configuration file
. "$(dirname "$0")/config.sh"
# Create the results output directory
mkdir -p "$RESULTS_OUTPUTDIR"
# Launch getResults.py
"$(dirname "$0")/getResults.py" "$SAMPLES"
# Launch launch_str_outliers.sh
"$(dirname "$0")/launch_str_outliers.sh" "$SAMPLES"
...@@ -9,12 +9,25 @@ ...@@ -9,12 +9,25 @@
# Source configuration file # Source configuration file
. "$(dirname "$0")/config.sh" . "$(dirname "$0")/config.sh"
# $1 : first argument in the command line : a list containing one sample per line, for exemple samples.list
SAMPLES="$1"
# Check if sample is specified
if [ -z "$SAMPLES" ]
then
echo "List of samples is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
SCRIPT="$(dirname "$(readlink -f "$0")")/str_outliers.py" SCRIPT="$(dirname "$(readlink -f "$0")")/str_outliers.py"
SAMPLES="$(dirname "$(readlink -f "$0")")/$SAMPLES"
cd "$RESULTS_OUTPUTDIR" || exit 1
cd "$OUTPUTDIR" || exit 1 for locus_tsv in $(ls *.tsv | grep -v outliers); do
for locus_tsv in $(ls *.tsv | grep -v outliers); do
locus="$(basename "$locus_tsv" ".tsv")" locus="$(basename "$locus_tsv" ".tsv")"
echo "Processing $locus" >&2 echo "Processing $locus" >&2
"$SCRIPT" "$locus" > "$locus.outliers.tsv" "$SCRIPT" "$locus" "$SAMPLES" > "$locus.outliers.tsv"
done done
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
## Version: 0.0.1 ## Version: 0.0.1
## Licence: AGPLV3 ## Licence: AGPLV3
## Author: anne-sophie.denomme-pichon@u-bourgogne.fr ## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
## Description: script to launch the pipeline for STR detection ## Description: script to launch the pipeline for STR detection. This script handles a single sample.
# $1 : first argument in the command line : the input file # $1 : first argument in the command line : the input file
......
...@@ -22,18 +22,18 @@ output_directory = None ...@@ -22,18 +22,18 @@ output_directory = None
zscore_threshold = None zscore_threshold = None
percentile_threshold = None percentile_threshold = None
with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh'))) as config: with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
for line in config: for line in config:
if '=' in line: if '=' in line:
variable, value = line.split('=', 1) variable, value = line.split('=', 1)
elif variable == 'RESULTS_OUTPUTDIR': if variable == 'RESULTS_OUTPUTDIR':
output_directory = value.split('#')[0].strip('"\' ') # strip double quotes, simple quotes and spaces output_directory = value.split('#')[0].strip('"\' ') # strip double quotes, simple quotes and spaces
elif variable == 'ZSCORE_THRESHOLD': elif variable == 'ZSCORE_THRESHOLD':
zscore_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces zscore_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces
elif variable == 'PERCENTILE_THRESHOLD': elif variable == 'PERCENTILE_THRESHOLD':
percentile_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces percentile_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces
if output_directory is None if output_directory is None:
logging.error('RESULTS_OUTPUTDIR or ZSCORE_THRESHOLD or PERCENTILE_THRESHOLD is missing in config.sh') logging.error('RESULTS_OUTPUTDIR or ZSCORE_THRESHOLD or PERCENTILE_THRESHOLD is missing in config.sh')
sys.exit(1) sys.exit(1)
...@@ -54,7 +54,7 @@ def load_limits(): ...@@ -54,7 +54,7 @@ def load_limits():
print('Limits file is empty', file=sys.stderr) print('Limits file is empty', file=sys.stderr)
sys.exit(1) sys.exit(1)
def display_outliers(locus, limits): def display_outliers(locus, limits, samples):
# results = { # results = {
# 'dijen': { # 'dijen': {
# 'tool': { # 'tool': {
...@@ -73,6 +73,7 @@ def display_outliers(locus, limits): ...@@ -73,6 +73,7 @@ def display_outliers(locus, limits):
tools = next(tsvreader)[1:] tools = next(tsvreader)[1:]
for row in tsvreader: for row in tsvreader:
dijen = row[0] dijen = row[0]
results[dijen] = collections.OrderedDict() results[dijen] = collections.OrderedDict()
for tool_id, tool in enumerate(tools): for tool_id, tool in enumerate(tools):
tool_value = row[tool_id + 1].replace('nofile', '.').replace('-1', '.') tool_value = row[tool_id + 1].replace('nofile', '.').replace('-1', '.')
...@@ -157,12 +158,16 @@ def display_outliers(locus, limits): ...@@ -157,12 +158,16 @@ def display_outliers(locus, limits):
all_outliers.extend(tool_outliers.values()) all_outliers.extend(tool_outliers.values())
else: else:
all_outliers.append('.') all_outliers.append('.')
if dijen_has_outliers: if dijen_has_outliers and dijen in samples:
print('\t'.join(all_outliers)) print('\t'.join(all_outliers))
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) != 2: if len(sys.argv) != 3:
print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS>', file=sys.stderr) print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS> <SAMPLE.LIST>', file=sys.stderr)
sys.exit(1) sys.exit(1)
with open(sys.argv[2]) as samples_list:
samples = set()
for sample in samples_list.readlines():
samples.add(sample.rstrip())
limits = load_limits() limits = load_limits()
display_outliers(sys.argv[1], limits) display_outliers(sys.argv[1], limits, samples)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment