Add result cache to handle only new samples & group results scripts in one

parent c5833135
......@@ -7,8 +7,8 @@
- Description: How to launch scripts to get STR genotype from genomes on all the locus tested
1. Create `samples.list`
2. Fill the configuration file `config.sh`. Warning, don't overwrite existing files
3. Launch `launch_pipeline.sh` : `nohup ./launch_pipeline.sh samples.list &`. Dependencies :
2. Fill the configuration file `config.sh`.
3. Launch `launch_pipeline.sh`: `nohup ./launch_pipeline.sh samples.list &`. Dependencies :
- `config.sh`
- `samples.list`
- `pipeline.sh`
......@@ -18,11 +18,17 @@
- `wrapper_gangstr.sh`
- `wrapper_transfer.sh`
- `wrapper_tredparse.sh`
4. Launch `launch_pipeline_ehdn_outlier.sh` : `nohup ./launch_pipeline_ehdn_outlier.sh &`. Dependencies :
4. Optional: launch `launch_pipeline_ehdn_outlier.sh`: `nohup ./launch_pipeline_ehdn_outlier.sh &`. Dependencies :
- `config.sh`
- `pipeline_ehdn_outlier.sh`
- `wrapper_ehdn_outlier.sh`
5. Launch `getResults.py`. Warning, don't overwrite existing files.
6. Launch `launch_str_plotly.sh`.
7. Change z-score threshold if necessary in `config.sh`. Launch `launch_str_outliers.sh`. Dependency: `patho.csv`.
10. Get files (i.e.: `scp 'an1770de@ssh-ccub.u-bourgogne.fr:/work/gad/shared/analyse/STR/results/*' .`)
5. Launch `launch_results.sh`: `nohup ./launch_results.sh &`. Dependencies:
- `config.sh`
- `sample.list`
- `patho.csv`
- `getResults.py`
- `launch_str_outliers.sh`
- `str_outliers.py`
6. Optional: launch `launch_str_plotly.sh`.
7. Get files (i.e.: `scp 'an1770de@ssh-ccub.u-bourgogne.fr:/work/gad/shared/analyse/STR/results/*' .`)
......@@ -9,11 +9,14 @@
- [ ] Brancher EHDN et outlier
- [ ] Séparer case et contrôles
- [ ] Ajouter EHDN dans GetResults
- [ ] Ajouter le sample dans le nom de fichier de log pour pieline.sh
- [ ] Ajouter le sample dans le nom de fichier de log pour pipeline.sh
- [ ] Changer le répertoire de sortie : /STR/pipeline car pipeline n'est pas explicite (répertoire avec fichiers de sortie des outils de détection de STR)
- [ ] Remove dijen from str_outliers.py
- [ ] Prévoir l'installation des dépendances
- [ ] Faire un launcher pour lancer getResults dans SGE
- [ ] Ajouter la possibilité de faire tourner getResults sur une liste d'échantillons donnée (actuellement, glob *)
- [ ] Créer le répertoire results automatiquement + avec la date
- [ ] Ajouter la possibilité de lancer le pipeline depuis n'importe quel répertoire et pas seulement celui où se trouve les scripts
- [X] Ajouter la possibilité de faire tourner getResults sur une liste d'échantillons donnée (actuellement, glob *)
- [X] Créer le répertoire results automatiquement (sans la date)
- [ ] Ajouter la possibilité de lancer le pipeline depuis n'importe quel répertoire et pas seulement celui où se trouvent les scripts
- [ ] Créer un script pour faire le ménage après le passage de SGE
- [ ] Possibilité de choisir les outils à utiliser (par exemple, option pour ne pas utiliser GangSTR)
- [X] Gérer l'absence de queue transfer
......@@ -8,7 +8,7 @@
INPUTDIR="/archive/gad/shared/bam_new_genome_temp"
OUTPUTDIR="/work/gad/shared/analyse/STR/pipeline"
RESULTS_OUTPUTDIR="/work/gad/shared/analyse/STR/results" # for getResults.py, Plotly and outliers tables
RESULTS_OUTPUTDIR="/work/gad/shared/analyse/STR/results" # For Plotly and outliers tables
# Valid values: "sge"
INFRA=sge
......@@ -16,7 +16,7 @@ INFRA=sge
# Variables specific to SGE
TRANSFER_QUEUE=transfer
COMPUTE_QUEUE=batch
TRANSFER=yes #FIXME attention, changer aussi le delete +++
TRANSFER=yes
# Tools
PARALLEL="/work/gad/shared/bin/parallel/parallel-20150522-1.el7.cern/bin/parallel"
......
......@@ -7,7 +7,7 @@
## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
## Description: script to get automatically results from pipeline.sh script in a tsv format on all the locus
import glob
import csv
import gzip
import json
import logging
......@@ -98,17 +98,44 @@ def get_gang_results(file_path, region):
logging.warn(f'File not found "{file_path}"')
return 'nofile'
def get_results(locus, region):
with open(os.path.join(output_directory, locus + '.tsv'), 'w') as result_file:
result_file.write('samplexxx\tEH\tTred\tGangSTR\n')
for file_path in sorted(glob.glob(os.path.join(input_directory, '*'))):
file_name = file_path.split(os.sep)[-1]
eh = get_eh_results(os.path.join(file_path, f'eh/{file_name}.vcf'), region)
tred = get_tred_results(os.path.join(file_path, f'tredparse/{file_name}.tred.vcf.gz'), region)
gang = get_gang_results(os.path.join(file_path, f'gangstr/{file_name}.vcf'), region)
result_file.write(f'{file_name}\t{eh}\t{tred}\t{gang}\n')
def get_results(locus, region, samples):
has_header = False
previous_results = set()
try:
with open(os.path.join(output_directory, locus + '.tsv'), 'r') as result_file:
tsvreader = csv.reader(result_file, delimiter='\t')
try:
next(tsvreader) # not take into account the header
has_header = True
for row in tsvreader:
previous_results.add(row[0])
except StopIteration:
pass
except:
pass
with open(os.path.join(output_directory, locus + '.tsv'), 'a') as result_file:
if not has_header: # If the outputfile has not header (file empty), it adds the header (unlikely to happen)
result_file.write('samplexxx\tEH\tTred\tGangSTR\n')
for sample in samples:
if sample not in previous_results:
file_path = os.path.join(input_directory, sample)
print(file_path)
eh = get_eh_results(os.path.join(file_path, f'eh/{sample}.vcf'), region)
tred = get_tred_results(os.path.join(file_path, f'tredparse/{sample}.tred.vcf.gz'), region)
gang = get_gang_results(os.path.join(file_path, f'gangstr/{sample}.vcf'), region)
result_file.write(f'{sample}\t{eh}\t{tred}\t{gang}\n')
if __name__ == '__main__':
if len(sys.argv) != 2:
print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <SAMPLESLIST>', file=sys.stderr)
sys.exit(1)
with open(sys.argv[1]) as samples_list:
samples = []
for sample in sorted(samples_list.readlines()):
samples.append(sample.rstrip())
os.makedirs(output_directory, exist_ok=True)
for locus, region in enumerate_variants(variant_catalog):
get_results(locus, region)
print(locus, region)
get_results(locus, region, samples)
#! /bin/sh
### ASDP PIPELINE ###
## Version: 0.0.1
## Licence: AGPLV3
## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
## Description: script to launch the pipeline for getting STR detection results. Receive multiple samples: one sample per line
# $1 : first argument in the command line : a list containing one sample per line, for exemple samples.list
SAMPLES="$1"
# Check if sample is specified
if [ -z "$SAMPLES" ]
then
echo "List of samples is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
# Source configuration file
. "$(dirname "$0")/config.sh"
# Create the results output directory
mkdir -p "$RESULTS_OUTPUTDIR"
# Launch getResults.py
"$(dirname "$0")/getResults.py" "$SAMPLES"
# Launch launch_str_outliers.sh
"$(dirname "$0")/launch_str_outliers.sh" "$SAMPLES"
......@@ -9,12 +9,25 @@
# Source configuration file
. "$(dirname "$0")/config.sh"
# $1 : first argument in the command line : a list containing one sample per line, for exemple samples.list
SAMPLES="$1"
# Check if sample is specified
if [ -z "$SAMPLES" ]
then
echo "List of samples is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
SCRIPT="$(dirname "$(readlink -f "$0")")/str_outliers.py"
SAMPLES="$(dirname "$(readlink -f "$0")")/$SAMPLES"
cd "$RESULTS_OUTPUTDIR" || exit 1
cd "$OUTPUTDIR" || exit 1
for locus_tsv in $(ls *.tsv | grep -v outliers); do
for locus_tsv in $(ls *.tsv | grep -v outliers); do
locus="$(basename "$locus_tsv" ".tsv")"
echo "Processing $locus" >&2
"$SCRIPT" "$locus" > "$locus.outliers.tsv"
"$SCRIPT" "$locus" "$SAMPLES" > "$locus.outliers.tsv"
done
......@@ -4,7 +4,7 @@
## Version: 0.0.1
## Licence: AGPLV3
## Author: anne-sophie.denomme-pichon@u-bourgogne.fr
## Description: script to launch the pipeline for STR detection
## Description: script to launch the pipeline for STR detection. This script handles a single sample.
# $1 : first argument in the command line : the input file
......
......@@ -22,18 +22,18 @@ output_directory = None
zscore_threshold = None
percentile_threshold = None
with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh'))) as config:
with open(os.path.join(os.path.dirname(sys.argv[0]), 'config.sh')) as config:
for line in config:
if '=' in line:
variable, value = line.split('=', 1)
elif variable == 'RESULTS_OUTPUTDIR':
if variable == 'RESULTS_OUTPUTDIR':
output_directory = value.split('#')[0].strip('"\' ') # strip double quotes, simple quotes and spaces
elif variable == 'ZSCORE_THRESHOLD':
zscore_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces
elif variable == 'PERCENTILE_THRESHOLD':
percentile_threshold = float(value.split('#')[0].strip('"\' ')) # strip double quotes, simple quotes and spaces
if output_directory is None
if output_directory is None:
logging.error('RESULTS_OUTPUTDIR or ZSCORE_THRESHOLD or PERCENTILE_THRESHOLD is missing in config.sh')
sys.exit(1)
......@@ -54,7 +54,7 @@ def load_limits():
print('Limits file is empty', file=sys.stderr)
sys.exit(1)
def display_outliers(locus, limits):
def display_outliers(locus, limits, samples):
# results = {
# 'dijen': {
# 'tool': {
......@@ -73,6 +73,7 @@ def display_outliers(locus, limits):
tools = next(tsvreader)[1:]
for row in tsvreader:
dijen = row[0]
results[dijen] = collections.OrderedDict()
for tool_id, tool in enumerate(tools):
tool_value = row[tool_id + 1].replace('nofile', '.').replace('-1', '.')
......@@ -96,7 +97,7 @@ def display_outliers(locus, limits):
results[dijen][tool]['Limit'] = tool_value
else:
results[dijen][tool]['Limit'] = 'NA'
except StopIteration:
print('Input file is empty', file=sys.stderr)
sys.exit(1)
......@@ -157,12 +158,16 @@ def display_outliers(locus, limits):
all_outliers.extend(tool_outliers.values())
else:
all_outliers.append('.')
if dijen_has_outliers:
if dijen_has_outliers and dijen in samples:
print('\t'.join(all_outliers))
if __name__ == '__main__':
if len(sys.argv) != 2:
print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS>', file=sys.stderr)
if len(sys.argv) != 3:
print(f'Usage: {sys.argv[0].split(os.sep)[-1]} <LOCUS> <SAMPLE.LIST>', file=sys.stderr)
sys.exit(1)
with open(sys.argv[2]) as samples_list:
samples = set()
for sample in samples_list.readlines():
samples.add(sample.rstrip())
limits = load_limits()
display_outliers(sys.argv[1], limits)
display_outliers(sys.argv[1], limits, samples)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment