Create scripts for upsampling

parents
#! /usr/bin/env python3
#Anne-Sophie Denommé-Pichon
#AGPLv3
import sys
def upsample(upsampling_read_count, read_id):
"""
upsampling_read_count : number of additional reads
read_id : id of the read you want to repeat
"""
try:
max_lane = 0
max_tile = 0
max_x = 0
max_y = 0
while True: # boucle infinie pour lire les lignes 4 par 4
title_line = next(sys.stdin).rstrip()
sequence_line = next(sys.stdin).rstrip()
plus_line = next(sys.stdin).rstrip()
quality_line = next(sys.stdin).rstrip()
print(title_line)
print(sequence_line)
print(plus_line)
print(quality_line)
lane = int(title_line.split(':')[2])
if lane > max_lane:
max_lane = lane
tile = int(title_line.split(':')[3])
if tile > max_tile:
max_tile = tile
x = int(title_line.split(':')[4])
if x > max_x:
max_x = x
y = str(title_line.split(':')[5])
if title_line == read_id:
title_line_str = title_line
sequence_line_str = sequence_line
plus_line_str = plus_line
quality_line_str = quality_line
except StopIteration: # quand erreur StopIteration, arrêter
pass
for i in range(1, upsampling_read_count + 1):
print(':'.join(title_line_str.split(':')[0:2] + [str(max_lane + i), str(max_tile + i), str(max_x + i), str(y)]))
print(sequence_line_str)
print('+')
print(quality_line_str)
if __name__ == '__main__':
if len(sys.argv) == 3:
upsample(int(sys.argv[1]), sys.argv[2])
else:
print("Usage: fastq_upsampling.py <upsampling_read_count> <read_id> < input.fastq > output.fastq", file=sys.stderr)
sys.exit(1)
#! /bin/sh
### ASDP PIPELINE ###
## Licence: AGPLV3
## Author: Anne-Sophie Denommé-Pichon
## Description: script to launch the wrapper to upsample data
READ_ID="@HISEQ2:C88CYACXX:7:1213:21186:56408"
INPUTFILE="/work/gad/shared/analyse/STR/Data/dijen402/dijen402.R2.fastq"
SAMPLE="$(basename -s .fastq "$INPUTFILE")"
OUTPUTDIR="/work/gad/shared/analyse/STR/Data/upsampling"
COMPUTE_QUEUE="batch"
LOGDIR="$OUTPUTDIR"
for rate in 10 20 30 40 50 100 200 300 400 500 1000
do
qsub -pe smp 1 -o "$LOGDIR" -e "$LOGDIR" -q "$COMPUTE_QUEUE" \
-v INPUTFILE="$INPUTFILE",OUTPUTFILE="$OUTPUTDIR/$SAMPLE.upsampling_$rate.fastq",UPSAMPLING_RATE="$rate",READ_ID="$READ_ID",LOGFILE="$LOGDIR/upsampling_$SAMPLE_$rate.$(date +"%F_%H-%M-%S").log" "$(dirname "$0")/wrapper_upsampling.sh"
done
#! /bin/sh
### ASDP PIPELINE ###
## Licence: AGPLv3
## Author: Anne-Sophie Denommé-Pichon
## Description: a wrapper for FASTQ upsampling
## Usage: qsub -pe smp 1 -v INPUTFILE=<path to the FASTQ file>,OUTPUTFILE=<output file>,UPSAMPLING_RATE=<percentage of reads to add to theFASTQ file>,READ_ID=<header of the read to add>,[LOGFILE=<path to the log file>] wrapper_upsampling.sh
# Log file path option
if [ -z "$LOGFILE" ]
then
LOGFILE=downsampling.$(date +"%F_%H-%M-%S").log
fi
# Logging
exec 1>> "$LOGFILE" 2>&1
echo "$(date +"%F_%H-%M-%S"): START"
# Check if input file exists
if [ ! -f "$INPUTFILE" ]
then
echo "Input file '$INPUTFILE' does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
# Check if output prefix is specified
if [ -z "$OUTPUTFILE" ]
then
echo "Outputfile is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
# Check if upsampling rate is specified
if [ -z "$UPSAMPLING_RATE" ]
then
echo "Upsampling rate is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
# Check if read ID is specified
if [ -z "$READ_ID" ]
then
echo "Read ID is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
# Launch script command and check exit code
/user1/gad/an1770de/Scripts/upsampling/fastq_upsampling.py "$UPSAMPLING_RATE" "$READ_ID" < "$INPUTFILE" > "$OUTPUTFILE"
upsampling_exitcode=$?
echo "upsampling exit code : $upsampling_exitcode"
if [ $upsampling_exitcode != 0 ]
then
echo "$(date +"%F_%H-%M-%S"): END"
exit 1
fi
echo "$(date +"%F_%H-%M-%S"): END"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment