Commit fc81204a authored by simon verdez's avatar simon verdez

first commit public branch

parent 79aedba9
HLASCAN=''
SNPEFF=''
#todo Dowload
#https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
#change star notation by dbsnp notation with CPIC recommandations or PharmVar website
#data test
#wget https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/NA12878/Nebraska_NA12878_HG001_TruSeq_Exome/NIST-hg001-7001-b-gatk.vcf
#HLASCAN
python2.7 $HLASCAN/HLAscan.v1.0.Files/hla-paper/haplo_scan_v4.0-hla.py $BAMFILE $HLASCAN/HLAscan.v1.0.Files/hla-ref-5gene/gene_list HLAscan_result/
#Annotate VCF File
bash -o pipefail -c "java -Xmx4g -jar $SNPEFF/SnpSift.jar annotate pharmgkb_v2.vcf $VCF > ${VCF}_annot"
#create HTML
python2.7 create_html.py -v ${VCF}_annot -h HLAscan_result/Report -b $BAMFILE -t tableau_pharmgkb.tsv -o Report.html
#!/bin/bash
### GAD PIPELINE ###
## Master.sh
## Version : 1.0
## Description : process pharmacoAnnot for exome data
## Usage : qsub -v ANALYSISDIR=<path to the analysis dir>,CONFIGFILE=<path to the config file>,PHARMCONFIG=<path to the config_pharm file> Master.sh
## Output : no standard output
## Requirements : Require pipeline scripts
## Author : Simon.verdez@u-bourgogne.fr
## Creation Date : 20180725
## Last revision date : 20180725
## Known bugs : None.
#$ -N Master
#$ -q batchbm
#$ -pe smp 1
#$ -V
#Logging
logbasename=$(date +"%F_%H-%M-%S")
vcfbasename=$(date +"%F")
reportbasename=$(date +"%F")
if [ -z ${LOGFILE} ]
then
LOGFILE=PharmAnnot.$logbasename.log
fi
exec 1>> $LOGFILE 2>&1
# Mandatory arguments
if [ -z ${ANALYSISDIR} ]
then
echo "ANALYSISDIR was not defined : execution stopped"
exit 1
fi
if [ -z ${CONFIGFILE+x} ]
then
echo "Config file not provided by the user. You need it to run this script. Stopping execution."
exit
fi
if [ -z ${PHARMCONFIG+x} ]
then
echo "Config file not provided by the user. You need it to run this script. Stopping execution."
exit
fi
#Usefull variables
PIPELINEBASE=`grep "pipelinebase" $PHARMCONFIG | cut -f2`
TSVFILE=`grep "pharmagktab" $PHARMCONFIG | cut -f2`
PHARMGKB=`grep "pharmagkbase" $PHARMCONFIG | cut -f2`
OMIM=`grep "omim" $PHARMCONFIG | cut -f2`
HLALIST=`grep "HLAlist" $PHARMCONFIG | cut -f2`
HEADER=`grep "header" $PHARMCONFIG | cut -f2`
REF=`grep "reference" $PHARMCONFIG | cut -f2`
# python path for proper execution of python
PYTHONPATH=$PIPELINEBASE/common:/work/gad/shared/bin/lib/python_2.7/lib/python2.7/site-packages:/work/gad/shared/bin/miniconda2/lib/python2.7/site-packages/
export PYTHONPATH
# Sample list
samples=`find -L $ANALYSISDIR/ -maxdepth 1 -mindepth 1 -type d | xargs -l basename`
# Batch sample list
if [ -e $ANALYSISDIR/batch.list ]
then
rm $ANALYSISDIR/batch.list
fi
find -L $ANALYSISDIR/ -maxdepth 1 -mindepth 1 -type d | xargs -l basename >> $ANALYSISDIR/batch.list
HLAscan:
# HLA genotyping
echo "### Execute HLAscan ###"
echo "Start : $(date +"%F_%H-%M-%S")"
for currentSample in $samples
do
echo "Command : qsub -pe smp 1 -N HLAscan_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v INPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.dedup.bam,OUTPUTFILE=$ANALYSISDIR/$currentSample/HLAscan,GeneList=$GL,LOGFILE=$ANALYSISDIR/$currentSample/logs/process_HLAscan.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/haplo_scan.sh"
# qsub -q batch -pe smp 1 -N HLAscan_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v BAM=$ANALYSISDIR/$currentSample/$currentSample.dedup.bam,SORTIE=$ANALYSISDIR/$currentSample/HLAscan,GeneList=$GL,LOGFILE=$ANALYSISDIR/$currentSample/logs/process_HLAscan.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/haplo_scan.sh
done
echo "End : $(date +"%F_%H-%M-%S")"
varcallSNP:
# Variant calling first step
echo "### Variant calling SNP first step ###"
echo "Start : $(date +"%F_%H-%M-%S")"
for currentSample in $samples
do
echo "Command : qsub -pe smp 1 -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -N varcallSNP_${currentSample} -v INPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.raw.vcf,PHARMGKB=$PHARMGKB,OUTPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.pharmacoAnnot.SNP.vcf,LOGFILE=$ANALYSISDIR/$currentSample/logs/process_varcallSNP.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/annotate_variants_pharmAnnot.sh"
qsub -q batch -pe smp 1 -N varcallSNP_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v INPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.raw.vcf,PHARMGKB=$PHARMGKB,OUTPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.pharmacoAnnot.SNP.vcf,LOGFILE=$ANALYSISDIR/$currentSample/logs/process_varcallSNP.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/annotate_variants_pharmAnnot.sh
done
echo "End : $(date +"%F_%H-%M-%S")"
varcallCNV:
# Variant calling second step
echo "### Variant calling CNV second step ###"
echo "Start : $(date +"%F_%H-%M-%S")"
for currentSample in $samples
do
cnvfile=`ls -lht $ANALYSISDIR/$currentSample/ | grep "\.annot.tsv" | head -1 | awk '{print $9}'`
echo "Command : qsub -pe smp 1 -N varcallCNV_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v INPUTFILE=$ANALYSISDIR/$currentSample/$cnvfile,PHARMGKB=$TSVFILE,OMIM=$OMIM,OUTPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.CNV.vcf,LOGFILE=$ANALYSISDIR/$currentSample/logs/process_varcallCNV.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/wrapper_tsv2vcf.sh"
qsub -q batch -pe smp 1 -N tsv2vcf_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v INPUTFILE=$ANALYSISDIR/$currentSample/$cnvfile,PHARMGKB=$TSVFILE,OMIM=$OMIM,REF=$REF,OUTPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.CNV.vcf,LOGFILE=$ANALYSISDIR/$currentSample/logs/process_varcallCNV.$logbasenameCNV.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/wrapper_tsv2vcf.sh
done
echo "End : $(date +"%F_%H-%M-%S")"
createHTML:
# Generate a HTML report_vcf
echo "### Generate HTML report ###"
echo "Start : $(date +"%F_%H-%M-%S")"
for currentSample in $samples
do
echo "Command : qsub -pe smp 1 -hold_jid varcallCNV_${currentSample},varcallSNP_${currentSample},HLAscan_${currentSample} -N createHTML_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v VCF_SNP=$ANALYSISDIR/$currentSample/$currentSample.pharmacoAnnot.SNP.vcf,VCF_CNV=$ANALYSISDIR/$currentSample/$currentSample.CNV.vcf,REPORTFILE=$ANALYSISDIR/$currentSample/HLAscan/Report,BAMFILE=$ANALYSISDIR/$currentSample/$currentSample.dedup.bam,TSVFILE=$TSVFILE,OUTPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.html,LOGFILE=$ANALYSISDIR/$currentSample/logs/create_html.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/wrapper_create_html.sh"
qsub -q batch -pe smp 1 -hold_jid varcallCNV_${currentSample},varcallSNP_${currentSample},HLAscan_${currentSample} -N createHTML_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v VCF_SNP=$ANALYSISDIR/$currentSample/$currentSample.pharmacoAnnot.SNP.vcf,VCF_CNV=$ANALYSISDIR/$currentSample/$currentSample.CNV.vcf,REPORTFILE=$ANALYSISDIR/$currentSample/HLAscan/Report,BAMFILE=$ANALYSISDIR/$currentSample/$currentSample.dedup.bam,TSVFILE=$TSVFILE,OUTPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.html,LOGFILE=$ANALYSISDIR/$currentSample/logs/create_html.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/wrapper_create_html.sh
done
echo "End : $(date +"%F_%H-%M-%S")"
MergeVCF:
#merge vcf fies
echo "### Merge VCF files"
echo "Start : $(date +"%F_%H-%M-%S")"
for currentSample in $samples
do
echo "Command : qsub -pe smp 1 -hold_jid varcallCNV_${currentSample},varcallSNP_${currentSample} -N mergeVCF_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v INPUTFILE1=$ANALYSISDIR/$currentSample/$currentSample.CNV.vcf,INPUTFILE2=$ANALYSISDIR/$currentSample/$currentSample.pharmacoAnnot.SNP.vcf,REF=$REF,OUTPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.pharmacoAnnot.merge.vcf,LOGFILE=$ANALYSISDIR/$currentSample/logs/merge_file.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/merge_GATK.sh"
qsub -q batch -pe smp 1 -hold_jid varcallCNV_${currentSample},varcallSNP_${currentSample} -N mergeVCF_${currentSample} -o $ANALYSISDIR/$currentSample/logs/ -e $ANALYSISDIR/$currentSample/logs/ -v INPUTFILE1=$ANALYSISDIR/$currentSample/$currentSample.CNV.vcf,INPUTFILE2=$ANALYSISDIR/$currentSample/$currentSample.pharmacoAnnot.SNP.vcf,REF=$REF,OUTPUTFILE=$ANALYSISDIR/$currentSample/$currentSample.pharmacoAnnot.merge.vcf,LOGFILE=$ANALYSISDIR/$currentSample/logs/merge_file.$logbasename.log,CONFIGFILE=$CONFIGFILE $PIPELINEBASE/merge_GATK.sh
done
echo "End : $(date +"%F_%H-%M-%S")"
class VCF_CNV:
def __init__(self):
self.CHROM = 0
self.POS = 0
self.ID = "."
self.REF = "."
self.ALT = "."
self.QUAL = 6
self.FILTER = "PASS"
self.SVTYPE = None
self.END = 0
self.SVLEN = 0
self.CIPOS = "0,0"
self.CIEND = "0,0"
self.GENE = "."
self.FORMAT = "GT:GQ"
self.SAMPLE = ".:0"
def __repr__(self):
vcf = "{}\t{}\t{}\t{}\t{}\t{}\t{}\tSVTYPE={};END={};SVLEN={};CIPOS={};CIEND={};GENE={}\t{}\t{}".format(self.CHROM,self.POS,self.ID,self.REF,self.ALT,self.QUAL,self.FILTER,self.SVTYPE,self.END,self.SVLEN,self.CIPOS,self.CIEND,self.GENE,self.FORMAT,self.SAMPLE)
return vcf
def create_header():
header="""##fileformat=VCFv4.2
##reference=hg19
##INFO=<ID=BKPTID,Number=1,Type=String,Description="ID of the assembled alternate allele in the assembly file">
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints">
##INFO=<ID=HOMSEQ,Number=1,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints">
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
##INFO=<ID=MEINFO,Number=4,Type=String,Description="Mobile element info of the form NAME,START,END,POLARITY">
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=GENE,Number=1,Type=String,Description="Involved genes">
##ALT=<ID=DEL,Description="Deletion">
##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
##ALT=<ID=DEL:ME:L1,Description="Deletion of L1 element">
##ALT=<ID=DUP,Description="Duplication">
##ALT=<ID=DUP:TANDEM,Description="Tandem Duplication">
##ALT=<ID=INS,Description="Insertion of novel sequence">
##ALT=<ID=INS:ME:ALU,Description="Insertion of ALU element">
##ALT=<ID=INS:ME:L1,Description="Insertion of L1 element">
##ALT=<ID=INV,Description="Inversion">
##ALT=<ID=CNV,Description="Copy number variable region">
##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality">
##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">
##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events">
##contig=<ID=chrM,length=16571>
##contig=<ID=chr1,length=249250621>
##contig=<ID=chr2,length=243199373>
##contig=<ID=chr3,length=198022430>
##contig=<ID=chr4,length=191154276>
##contig=<ID=chr5,length=180915260>
##contig=<ID=chr6,length=171115067>
##contig=<ID=chr7,length=159138663>
##contig=<ID=chr8,length=146364022>
##contig=<ID=chr9,length=141213431>
##contig=<ID=chr10,length=135534747>
##contig=<ID=chr11,length=135006516>
##contig=<ID=chr12,length=133851895>
##contig=<ID=chr13,length=115169878>
##contig=<ID=chr14,length=107349540>
##contig=<ID=chr15,length=102531392>
##contig=<ID=chr16,length=90354753>
##contig=<ID=chr17,length=81195210>
##contig=<ID=chr18,length=78077248>
##contig=<ID=chr19,length=59128983>
##contig=<ID=chr20,length=63025520>
##contig=<ID=chr21,length=48129895>
##contig=<ID=chr22,length=51304566>
##contig=<ID=chrX,length=155270560>
##contig=<ID=chrY,length=59373566>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT dijex
"""
return header
pipelinebase /work/gad/shared/pipeline/pharmacogen_test/pharmAnnot
pythonbin /work/gad/shared/bin/miniconda2/bin/python
GATKbase /work/gad/shared/bin/GATK_3.7
javacmd /soft/java/jdk_x64/bin/java
picardbase /work/gad/shared/bin/PICARD_2.4.1
temporary_dir /work/gad/shared/tmp/
reference /work/gad/shared/pipeline/hg19/index/hg19_essential.fa
refdict /work/gad/shared/pipeline/hg19/index/hg19_essential.dict
refdir /work/gad/shared/pipeline/hg19/index
dbsnp /work/gad/shared/pipeline/hg19/clinvar/dbsnp.vcf
mainqueue batch
autosubqueue batchbm
bwacmd /work/gad/shared/bin/bwa-0.7.15
fastqccmd /work/gad/shared/bin/FastQC_0.11.4/fastqc
fastqc_config /work/gad/shared/bin/FastQC_0.11.4/Configuration/limits.txt
trimmomatic /work/gad/shared/bin/Trimmomatic-0.35/trimmomatic-0.35.jar
adapters_file /work/gad/shared/bin/Trimmomatic-0.35/adapters/TruSeq3-PE-All.fa
samtools /work/gad/shared/bin/samtools-1.5/
igvtools /work/gad/shared/bin/IGVTools_2.3.67/igvtools.jar
targetlist /work/gad/shared/pipeline/hg19/targets/
refseqlist /work/gad/shared/pipeline/hg19/refseq/refseq.cds.2bp.hg19.with_genes.interval_list
refseqgenelist /work/gad/shared/pipeline/hg19/refseq/genetrack.hg19.sorted.refseq
controlfile /work/gad/shared/pipeline/samples/control_100_samples.g.vcf
email emilie.tisserant@u-bourgogne.fr
clinvar_vcf /work/gad/shared/pipeline/hg19/clinvar/clinvar.vcf
cosmic_vcf /work/gad/shared/pipeline/hg19/cosmic/CosmicCodingMuts.vcf
omim_list /work/gad/shared/pipeline/hg19/omim/OMIM2.list
acmg_list /work/gad/shared/pipeline/hg19/omim/ACMG.list
common_list /work/gad/shared/pipeline/hg19/clinvar/common.list
ssa_nb_files 10
tabixpath /work/gad/shared/bin/htslib-1.3/bin/tabix
kaviar /work/gad/shared/pipeline/hg19/kaviar/Kaviar-160204-Public-hg19.vcf.gz
regulomedb /work/gad/shared/pipeline/hg19/regulomeDB/RegulomeDB.dbSNP141.txt.gz
targetscan /work/gad/shared/pipeline/hg19/targetscan/Predicted_Targets.hg19.bed
cytobands /work/gad/shared/pipeline/hg19/cnv/cytobands.tsv
dgv_variants /work/gad/shared/pipeline/hg19/cnv/GRCh37_hg19_variants_2015-07-23.txt.gz
cnv_map /work/gad/shared/pipeline/hg19/cnv/Stringent.hg19.2015-02-03.txt.gz
isca_benign_cnv /work/gad/shared/pipeline/hg19/cnv/iscaBenign.txt.gz
isca_pathogenic_cnv /work/gad/shared/pipeline/hg19/cnv/iscaPathogenic.txt.gz
devdelay_control_cnv /work/gad/shared/pipeline/hg19/cnv/cnvDevDelayControl.txt.gz
devdelay_case_cnv /work/gad/shared/pipeline/hg19/cnv/cnvDevDelayCase.txt.gz
clingen_dosage_sensitive_gene /work/gad/shared/pipeline/hg19/cnv/ClinGen_gene_curation_list.tsv
xhmmbin /work/gad/shared/bin/xhmm_1.0/xhmm
xhmmparams /work/gad/shared/pipeline/hg19/cnv/xhmm_params.txt
cnvtarget /work/gad/shared/pipeline/hg19/cnv/cnv_targets_refseq_only_coding_exon_2bp.hg19.list
targetgc /work/gad/shared/pipeline/hg19/cnv/cnv_targets_refseq_only_coding_exon_2bp.hg19_GC.txt
docmatrix /work/gad/shared/pipeline/hg19/cnv/ALL.CNV.RD.txt
extremegc /work/gad/shared/pipeline/hg19/cnv/extreme_gc_targets.txt
lowcomplexity /work/gad/shared/pipeline/hg19/cnv/low_complexity_targets.txt
controlfreec /work/gad/shared/bin/ControlFreeC
gemMappabilityFile /work/gad/shared/pipeline/hg19/cnv/out100m1_hg19.gem
chrLenFile /work/gad/shared/pipeline/hg19/index/hg19.len
chrFiles /work/gad/shared/pipeline/hg19/index/chr/
lumpybase /work/gad/shared/bin/lumpy/lumpy-sv-master/
mapabilitybedgraph /work/gad/shared/pipeline/hg19/cnv/wgEncodeCrgMapabilityAlign100mer_no_uniq.bedgraph
blacklistfile /work/gad/shared/pipeline/hg19/cnv/wgEncodeDacMapabilityConsensusExcludable.txt
repeatmasker /work/gad/shared/pipeline/hg19/cnv/UCSC_rmsk.hg19.sort.bed.gz
bedtools /work/gad/shared/bin/bedtools
nist_bed /work/gad/shared/pipeline/hg19/nist/agilent_CRE.refseq_cds.NA12878_GIAB_highconf.intersect.hg19.bed
nist_vcf /work/gad/shared/pipeline/hg19/nist/NA12878_GIAB_highconf.agilent.refseq.intersect.vcf
hg19gtf /work/gad/shared/pipeline/hg19/refseq/hg19.refseq.with_genes.gtf
star2 /work/gad/shared/bin/STAR-2.5.2b/source
circosconfdir /work/gad/shared/pipeline/hg19/circos/
circosbin /work/gad/shared/bin/circos-0.69-2/bin/circos
rscript /soft/R/3.2.3/intel/13.1.3/bin/Rscript
snpeffbase /work/gad/shared/bin/snpEff
exacvcf /work/gad/shared/pipeline/hg19/ExAC/ExAC.r0.3.1.sites.vep.vcf.gz
exacscores /work/gad/shared/pipeline/hg19/ExAC/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt
dbnsfp /work/gad/shared/bin/snpEff/data/dbNSFP/dbNSFP2.9.txt.gz
gnomadex /work/gad/shared/pipeline/hg19/ExAC/gnomad.exomes.r2.0.2.sites.vcf.gz
gnomadge /work/gad/shared/pipeline/hg19/ExAC/gnomad.genomes.r2.0.2.sites.vcf.gz
gnomadchr /work/gad/shared/pipeline/hg19/ExAC/gnomad.genomes.r2.0.2.sites
hgmd_vcf /work/gad/shared/pipeline/hg19/HGMD/HGMD_PRO_2017.2_hg19.vcf
regulatoryfeatures /work/gad/shared/pipeline/hg19/ensembl_regulation/regulatory_features.bed
motiffeatures /work/gad/shared/pipeline/hg19/ensembl_regulation/motif_features.bed
encodednase /work/gad/shared/pipeline/hg19/Encode/wgEncodeRegDnaseClusteredV3.bed
encodetfbs /work/gad/shared/pipeline/hg19/Encode/wgEncodeRegTfbsClusteredV3.bed
linsight /work/gad/shared/pipeline/hg19/LINSIGHT
fantom /work/gad/shared/pipeline/hg19/FANTOM/fantom_cage_peak.bed
isgenic /work/gad/shared/pipeline/hg19/FANTOM/isGenic.bed
genelists /work/gad/shared/pipeline/hg19/gene_lists/
vtbase /work/gad/shared/bin/vt
#!/bin/ksh
### GAD PIPELINE ###
## annotate_variants_snpeff.sh
## Version : 1.0
## Description : This script allows user to launch SnpEff and SNpSift for basic variant annotation, frequency annotation and score annotation
## Usage : qsub -pe smp <nb thread> -v INPUTFILE=<raw/vcf/file>,OUTPUTFILE=</path/to/the/output/vcf/file>,PHARMGKB=<ref/file/pharmgkb>,[NOCHECK=yes|no],[LOGFILE=/path/to/the/log/file],[CONFIGFILE=/path/to/the/config/file] annotate_variants_pharmAnnot.sh
## Output : .vcf file containing annotations
## Requirements : jdk 1.8.0
## Author : Simon.Verdez@u-bourgogne.fr
## Creation Date : 20180727
## last revision date : 20180727
## Known bugs : None
#$ -q batch
#$ -V
# Log file path option
if [ -z ${LOGFILE} ]
then
LOGFILE=annotate_variants_snpeff.$(date +"%F_%H-%M-%S").log
fi
# Config file path option
if [ -z ${CONFIGFILE} ]
then
CONFIGFILE=analysis_config.tsv
fi
# Logging
exec 1>> $LOGFILE 2>&1
echo "$(date +"%F_%H-%M-%S"): START"
# Check if config file exist
if [ ! -f $CONFIGFILE ]
then
echo "Config file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch annotate_variants_snpeff.failed
exit 1
fi
# Check if input file exist
if [ ! -f $INPUTFILE ]
then
echo "Input file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch annotate_variants_snpeff.failed
exit 1
fi
if [ ! -f $PHARMGKB ]
then
echo "ref pharmgkb file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch annotate_variants_snpeff.failed
exit 1
fi
# Check if output file is specified
if [ -z $OUTPUTFILE ]
then
echo "Output file is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
touch annotate_variants_snpeff.failed
exit 1
fi
JAVA=$(grep "javacmd" $CONFIGFILE | awk '{print $2}')
PATHSCRIPT=$(grep "pipelinebase" $CONFIGFILE | awk '{print $2}')
SNPEFF=$(grep "snpeffbase" $CONFIGFILE | awk '{print $2}')
# Launch Snpeff and SnpSift command and check exit code
echo "Command: $JAVA -Xmx4g -jar $SNPEFF/SnpSift.jar Annotate $PHARMGKB $INPUTFILE > $OUTPUTFILE"
bash -o pipefail -c "$JAVA -Xmx4g -jar $SNPEFF/SnpSift.jar Annotate $PHARMGKB $INPUTFILE > $OUTPUTFILE"
snpeff_exitcode=$?
echo "snpeff exit code : $snpeff_exitcode"
if [ $snpeff_exitcode != 0 ]
then
echo "$(date +"%F_%H-%M-%S"): END"
touch annotate_variants_Annot.failed
exit 1
fi
if [ -z ${NOCHECK} ] || [ ${NOCHECK} == "no" ]
then
chr=`tail -1 $OUTPUTFILE | cut -f1`
if [ $chr != "chrX" ] && [ $chr != "chrY" ] && [ $chr != "X" ] && [ $chr != "Y" ]
then
echo "vcf file does not seem complete"
echo "$(date +"%F_%H-%M-%S"): END"
touch annotate_variants_snpeff.failed
exit 1
fi
fi
echo "$(date +"%F_%H-%M-%S"): END"
This diff is collapsed.
#!/bin/bash
###GAD PIPELINE ###
## haplo_scan.sh
## Version : 1.0.0
## Description : This scprit tke a bam and return a file text with HLA type
## Usage : qsub -v BAM=<path to the bam>,SORTIE=<path to outdir>,GeneList=<path to gene list>,[LOGFILE=/path/to/the/log/file] haplo_scan.sh
## Output : file text
## Requirements : HLAscan
## Author : Simon.Verdez@etu.u-bourgogne.fr
## Creation Date : 20180425
## last revision date : 20180425
## Known bugs : None
#$ -N hlascan
#$ -q batch
#$ -pe smp 2
#$ -V
# Log file path option
if [ -z ${LOGFILE} ]
then
LOGFILE=haplo_scan.$(date +"%F_%H-%M-%S").log
fi
# Logging
exec 1>> $LOGFILE 2>&1
echo "$(date +"%F_%H-%M-%S"): START"
# Check if input file exist
if [ ! -f $BAM ]
then
echo "Bam file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch haplo_scan.failed
exit 1
fi
# Check if gene file exist
if [ ! -f $GeneList ]
then
echo "Gene file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch haplo_scan.failed
exit 1
fi
# Check if output file is specified
if [ -z $SORTIE ]
then
echo "SORTIE file is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
touch haplo_scan.failed
exit 1
fi
SAMPLE=`basename ${BAM} | cut -d . -f1`
RES=${SORTIE}
GL=/work/gad/shared/bin/HLAscan_v1.0/hla-ref-5gene/gene_list
#Launch haplo_scan
/bin/python2.7 /work/gad/shared/bin/HLAscan_v1.0/hla-paper/haplo_scan_v4.0-sv.py $BAM $GL $RES
hla_exitcode=$?
echo "hla_scan exit code : $hla_exitcode"
if [ $hla_exitcode != 0 ]
then
echo "$(date +"%F_%H-%M-%S"): END"
touch hla_scan.failed
exit 1
fi
echo "$(date +"%F_%H-%M-%S"): END"
#!/bin/ksh
### PharmAnnot PIPELINE ###
## merge_GATK.sh
## Version : 1.0.0
## Description : This script merge two vcf file
## Usage : qsub -pe smp <nb thread> -v INPUTFILE1=</path/to/the/input/vcf/file>,INPUTFILE2=</path/to/the/input/vcf/file>,REF=</path/to/the/ref/file>,OUTPUTFILE=</path/to/the/output/vcf/file>,[LOGFILE=/path/to/the/log/file],[CONFIGFILE=/path/to/the/config/file] merge_GATK.sh
## Output : .vcf file containing annotations
## Requirements : jdk 1.8.0
## Author : simon.verdez@chu-dijon.fr
## Creation Date : 20180702
## last revision date : 20180702
## Known bugs : None
#$ -q batch
#$ -V
# Log file path option
if [ -z ${LOGFILE} ]
then
LOGFILE=merge_GATK.$(date +"%F_%H-%M-%S").log
fi
# Config file path option
if [ -z ${CONFIGFILE} ]
then
CONFIGFILE=analysis_config.tsv
fi
# Logging
exec 1>> $LOGFILE 2>&1
echo "$(date +"%F_%H-%M-%S"): START"
# Check if config file exist
if [ ! -f $CONFIGFILE ]
then
echo "Config file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch merge_GATK.failed
exit 1
fi
# Check if input1 file exist
if [ ! -f $INPUTFILE1 ]
then
echo "Input1 file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch merge_GATK.failed
exit 1
fi
# Check if input2 file exist
if [ ! -f $INPUTFILE2 ]
then
echo "Input2 file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch merge_GATK.failed
exit 1
fi
# Check if ref file exist
if [ ! -f $REF ]
then
echo "ref file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch merge_GATK.failed
exit 1
fi
# Check if output file is specified
if [ -z $OUTPUTFILE ]
then
echo "Output file is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
touch merge_GATK.failed
exit 1
fi
#JAVA=$(grep "javacmd" $CONFIGFILE | awk '{print $2}')
module load java64/1.8.0_162
PATHSCRIPT=$(grep "pipelinebase" $CONFIGFILE | awk '{print $2}')
#GATK=$(grep "GATKbase" $CONFIGFILE | awk '{print $2}')
echo "command : $JAVA -jar /work/gad/shared/bin/GATK_3.7/GenomeAnalysisTK.jar -T CombineVariants -R $REF --variant $INPUTFILE1 --variant $INPUTFILE2 -genotypeMergeOptions UNIQUIFY"
java -jar /work/gad/shared/bin/GATK_3.7/GenomeAnalysisTK.jar -T CombineVariants -R $REF --variant $INPUTFILE1 --variant $INPUTFILE2 -o $OUTPUTFILE -genotypeMergeOptions UNIQUIFY
genotypeMerge_exitcode=$?
echo "GenotypeMerge exit code : $genotypeMerge_exitcode"
if [ $genotypeMerge_exitcode != 0 ]
then
echo "$(date +"%F_%H-%M-%S"): END"
touch merge.failed
exit 1
fi
echo "$(date +"%F_%H-%M-%S"): END"
pipelinebase /work/gad/shared/pipeline/pharmacogen_test/pharmAnnot
HLAlist /work/gad/shared/analyse/test_HLA/HLAscan_v1.0/hla-ref-5gene/gene_list
pharmagkbase /work/gad/sv347413/epilepsie/Pipeline/pharmgkb_v2.vcf
pharmagktab /work/gad/sv347413/epilepsie/Pipeline/tableau_pharmgkb.tsv
omim /work/gad/sv347413/epilepsie/Pipeline/tableau_OMIM.tsv
header /work/gad/sv347413/epilepsie/Pipeline/header_CNV.vcf
reference /work/gad/shared/pipeline/hg19/index/hg19_essential.fa
pipelinebase /work/gad/shared/pipeline/pharmacogen_test/pharmAnnot
HLAlist /work/gad/shared/analyse/test_HLA/HLAscan_v1.0/hla-ref-5gene/gene_list
pharmagkbase /work/gad/sv347413/oncologie/pharmgkb_oncologie_v4.vcf
pharmagktab /work/gad/sv347413/oncologie/tableau_oncologie_v2.tsv
omim /work/gad/sv347413/oncologie/tableau_OMIM_oncologie.tsv
header /work/gad/sv347413/epilepsie/Pipeline/header_CNV.vcf
reference /work/gad/shared/pipeline/hg19/index/hg19_essential.fa
#!/usr/bin/python
### PharmAnnot PIPELINE ###
## tsv2vcf.py
## Version : 1.1.0
## Description : Conversion a tsv file in vcf file.
## Usage : python tsv2vcf.py -i <input/tsv/file> -p <input/tsv/file> -m<input/tsv/file> -r <reference/bam/fie> -o <output/vcf/file> -e <log/file>
## Output : vcf file
## Requirements : python 2.7+
## Author : Simon.Verdez@chu-dijon.fr
## Creation Date : 20180709
## last revision date : 20180731
## Known bugs : None
import os
import sys
import vcf
import csv
from VCF_tools import *
import subprocess
from operator import *
import getopt
import logging
#Options
try:
opts, args = getopt.getopt(sys.argv[1:], 'i:p:m:r:o:e:')
for opt, arg in opts:
if opt in ("-i"):
tsvfile = arg
if opt in ("-p"):
pharmgkb = arg
if opt in ("-m"):
OMIM = arg
if opt in ("-r"):
REFERENCE = arg
elif opt in ("-o"):
out_vcf = arg
elif opt in ("-e"):
logfile = arg
except getopt.GetoptError:
print 'usage : '
sys.exit(1)
# Logging
logging.basicConfig(filename = '%s' % (logfile), filemode = 'a', level = logging.INFO, format = '%(asctime)s %(levelname)s - %(message)s')
logging.info('start')
# Manage options
if not vars().has_key('tsvfile'):
logging.exception('No input file given')
sys.exit(1)
if not vars().has_key('pharmgkb'):
logging.exception('No pharmgkb database given')
sys.exit(1)
if not vars().has_key('OMIM'):
logging.exception('No OMIM database given')
sys.exit(1)
if not vars().has_key('REFERENCE'):
logging.exception('No REFERENCE bam given')
sys.exit(1)
# Check files
try:
f = open(tsvfile, 'r')
f.close()
except IOError:
logging.exception('Input file does not exist')
sys.exit(1)
try:
f = open(pharmgkb, 'r')
f.close()
except IOError:
logging.exception('Clinvar file does not exist')
sys.exit(1)
try:
f = open(OMIM, 'r')
f.close()
except IOError:
logging.exception('Hgmd file does not exist')
sys.exit(1)
try:
f = open(REFERENCE, 'r')
f.close()
except IOError:
logging.exception('reference bam file does not exist')
sys.exit(1)
"""Liste contenant les genes d'interets"""
ListeG = []
"""recuperation des genes dans le fichier"""
with open(pharmgkb) as pharmfile:
reader = csv.DictReader(pharmfile, dialect='excel-tab')
for row in reader:
if row['gene'] != "":
if row['gene'] not in ListeG:
ListeG.append(row['gene'])
with open(OMIM) as OMIMfile:
reader = csv.DictReader(OMIMfile, dialect='excel-tab')
for row in reader:
if row['GENE'] != "":
if row['GENE'] not in ListeG:
ListeG.append(row['GENE'])
#print ListeG
"""ecriture des lignes CNV dans le vcf"""
ListeVCF=[]
for gene in ListeG:
with open(tsvfile) as file:
reader = csv.DictReader(file, dialect='excel-tab')
for row in reader:
if gene in row['GeneList'] and gene != "":
line_vcf = VCF_CNV()
line_vcf.CHROM = row["#CHR"]
line_vcf.POS = row["START"]
p = subprocess.Popen(["samtools", 'faidx',REFERENCE, row["#CHR"]+":"+row["START"]+"-"+row["START"]], stdout = subprocess.PIPE)
line_vcf.REF = p.stdout.readlines()[1].strip("\n")
line_vcf.ALT = "<"+row["CNV"]+">"
line_vcf.QUAL = "6"
line_vcf.SVTYPE = row["CNV"]
line_vcf.END = row["END"]
if row["CNV"] == "DEL":
line_vcf.SVLEN = "-"+row["CNVLength"]
else :
line_vcf.SVLEN =row["CNVLength"]
line_vcf.GENE = str(gene)
ListeVCF.append(line_vcf)
ListeVCF = sorted(ListeVCF,key=attrgetter('POS'))
header = create_header()
with open (out_vcf,'w') as vcf_out:
vcf_out.write(header)
for line in ListeVCF:
with open (out_vcf,'a') as vcf_out:
vcf_out.write(repr(line)+"\n")
#!/bin/ksh
### GAD PIPELINE ###
## wrapper_create_html.sh
## Version : 1.0.0
## Description : a wrapper for qsubing create_html.py script
## Usage : qsub -pe smp <nb thread> -v VCF_SNP=</path/to/the/input/vcf/file>,VCF_CNV=</path/to/the/input/vcf/file>,REPORTFILE=<path/to/the/HLAscan/report>,BAMFILE=</path/to/the/bam>,TSVFILE=</path/to/the/tsv>,OUTPUTFILE=</path/to/the/output/vcf/file>,[LOGFILE=/path/to/the/log/file],[CONFIGFILE=/path/to/the/config/file] wrapper_create_html.sh
## Output : create_html.py output
## Requirements : Recquire script create_html.py and VcfTools
## Author : Simon.Verdez@u-bourgogne.fr
## Creation Date : 20180802
## last revision date : 20180802
## Known bugs : None
#$ -q batch
#$ -V
# Log file path option
if [ -z ${LOGFILE} ]
then
LOGFILE=annotate_variants.$(date +"%F_%H-%M-%S").log
fi
# Config file path option
if [ -z ${CONFIGFILE} ]
then
CONFIGFILE=analysis_config.tsv
fi
# Logging
exec 1>> $LOGFILE 2>&1
echo "$(date +"%F_%H-%M-%S"): START"
# Check if config file exist
if [ ! -f $VCF_SNP ]
then
echo "Vcf SNP file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch create_html.failed
exit 1
fi
if [ ! -f $VCF_CNV ]
then
echo "Vcf CNV file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch create_html.failed
exit 1
fi
if [ ! -f $REPORTFILE ]
then
echo "Report HLAscan file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch create_html.failed
exit 1
fi
if [ ! -f $BAMFILE ]
then
echo "BAM file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch create_html.failed
exit 1
fi
if [ ! -f $TSVFILE ]
then
echo "TSV file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch create_html.failed
exit 1
fi
# Check if output file is specified
if [ -z $OUTPUTFILE ]
then
echo "Output file is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
touch create_html.failed
exit 1
fi
PYTHONBIN=`grep pythonbin $CONFIGFILE | cut -f2`
PIPELINEBASE=`grep pipelinebase $CONFIGFILE | cut -f2`
# Launch python script command and check exit code
echo "command : $PIPELINEBASE/create_html.py -v $VCF_SNP -c $VCF_CNV -h $REPORTFILE -b $BAMFILE -t $TSVFILE -o $OUTPUTFILE -e $LOGFILE"
$PYTHONBIN $PIPELINEBASE/create_html.py -v $VCF_SNP -c $VCF_CNV -h $REPORTFILE -b $BAMFILE -t $TSVFILE -o $OUTPUTFILE -e $LOGFILE
create_html_exitcode=$?
echo "create_html exit code : $annotate_variants_exitcode"
if [ $creaete_html_exitcode != 0 ]
then
echo "$(date +"%F_%H-%M-%S"): END"
touch create_html.failed
exit 1
fi
echo "$(date +"%F_%H-%M-%S"): END"
#!/bin/ksh
### GAD PIPELINE ###
## wrapper_tsv2vcf.sh
## Version : 1.0.0
## Description : a wrapper for qsubing tsv2vcf.py script
## Usage : qsub -pe smp <nb thread> -v INPUTFILE=</path/to/the/input/tsv/file>,PHARMGKB=<ref/file/pharmgkb>,OMIM=<ref/file/OMIM>,REF=<ref/bam/file>,OUTPUTFILE=</path/to/the/output/vcf/file>,[LOGFILE=/path/to/the/log/file],[CONFIGFILE=/path/to/the/config/file] wrapper_tsv2vcf.sh
## Output : annotate_variants.py output
## Requirements : Recquire script tsv2vcf.py and VcfTools
## Author : Simon.Verdez@u-bourgogne.fr
## Creation Date : 20180702
## last revision date : 20180702
## Known bugs : None
#$ -q batch
#$ -V
Log file path option
if [ -z ${LOGFILE} ]
then
LOGFILE=annotate_variants.$(date +"%F_%H-%M-%S").log
fi
# Config file path option
if [ -z ${CONFIGFILE} ]
then
CONFIGFILE=analysis_config.tsv
fi
# Logging
exec 1>> $LOGFILE 2>&1
echo "$(date +"%F_%H-%M-%S"): START"
# Check if config file exist
if [ ! -f $CONFIGFILE ]
then
echo "Config file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch tsv2vcf.failed
exit 1
fi
# Check if input file exist
if [ ! -f $INPUTFILE ]
then
echo "Input file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch tsv2vcf.failed
exit 1
fi
if [ ! -f $PHARMGKB ]
then
echo "PHARMGKB reference file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch tsv2vcf.failed
exit 1
fi
if [ ! -f $OMIM ]
then
echo "OMIM reference file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch tsv2vcf.failed
exit 1
fi
if [ ! -f $REF ]
then
echo "reference bam file does not exist"
echo "$(date +"%F_%H-%M-%S"): END"
touch tsv2vcf.failed
exit 1
fi
# Check if output file is specified
if [ -z $OUTPUTFILE ]
then
echo "Output file is not specified"
echo "$(date +"%F_%H-%M-%S"): END"
touch tsv2vcf.failed
exit 1
fi
PYTHONBIN=`grep pythonbin $CONFIGFILE | cut -f2`
PIPELINEBASE=`grep pipelinebase $CONFIGFILE | cut -f2`
# Launch python script command and check exit code
echo "command : $PIPELINEBASE/tsv2vcf.py -i $INPUTFILE -p $PHARMGKB -m $OMIM -r $REF -o $OUTPUTFILE -e $LOGFILE"
$PYTHONBIN $PIPELINEBASE/tsv2vcf.py -i $INPUTFILE -p $PHARMGKB -m $OMIM -r $REF -o $OUTPUTFILE -e $LOGFILE
tsv2vcf_exitcode=$?
echo "tsv2vcf exit code : $tsv2vcf_exitcode"
if [ $tsv2vcf_exitcode != 0 ]
then
echo "$(date +"%F_%H-%M-%S"): END"
touch tsv2vcf.failed
exit 1
fi
echo "$(date +"%F_%H-%M-%S"): END"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment