create the README

769b466c · simon verdez · fc81204a · 769b466c · 769b466c · 769b466c
Commit 769b466c authored Dec 01, 2021 by simon verdez
Hide whitespace changes
Inline Side-by-side

Showing with 166 additions and 13 deletions

Extract_PharmGKB_Variants.sh Extract_PharmGKB_Variants.sh +112 -12

Readme.md Readme.md +53 -0

create_html.py create_html.py +1 -1

No files found.
--- a/Extract_PharmGKB_Variants.sh
+++ b/Extract_PharmGKB_Variants.sh
-HLASCAN=''
-SNPEFF=''

-#todo Dowload
-#https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip
-#change star notation by dbsnp notation with CPIC recommandations or PharmVar website
+#!/bin/bash

-#data test
-#wget https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/NA12878/Nebraska_NA12878_HG001_TruSeq_Exome/NIST-hg001-7001-b-gatk.vcf
+PHARMGKB=""
+HLASCAN=""
+BAM=""
+VCF=""
+OUTPUTDIR=""

-#HLASCAN
-python2.7 $HLASCAN/HLAscan.v1.0.Files/hla-paper/haplo_scan_v4.0-hla.py $BAMFILE $HLASCAN/HLAscan.v1.0.Files/hla-ref-5gene/gene_list HLAscan_result/
+PRINT_HELP=0
+
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        -p|--pharmgkb) PHARMGKB="$2"; shift ;;
+        -s|--hlascan) HLASCAN="$2"; shift ;;
+        -b|--bam) BAM="$2"; shift ;;
+        -v|--vcf) VCF="$2" ; shift ;;
+	-o|--out-directory) OUTPUTDIR="$2" ; shift ;;
+	-h|--help) PRINT_HELP=1 ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+
+if [ ${PRINT_HELP} -eq 1 ]
+then
+
+	echo "Usage:"
+	echo ""
+	echo "bash -p <PharmGKB file> -b <bam file> -s <path to hlascan files> -v <vcf file>  -o <output dir> -h [show this help]"
+	echo ""
+	echo "<Mandatory>: "
+	echo "-p <PharmGKB file>: Path to PharmGKB tsv file download at PharmGKB site."
+	echo "-b <bam file>: Bam file provided by alignement."
+	echo "-s <path to hlascan files>: Path to HLAscan tools download and unzipped."
+	echo "-v <vcf file>: vcf file provided by variant calling"
+	echo "-o <OUTPUT_PATH>: Output path."
+	echo ""
+
+	exit 0
+fi
+
+# Check pharmgkb file is provided and exists
+if [ "${PHARMGKB}" == "" ]
+then
+	1>&2 echo "Missing input pharmgkb tsv file (-p). Abort."
+	exit 2
+
+elif [ ! -f "${PHARMGKB}" ]
+then
+	1>&2 echo "pharmgkb tsv file not found. Abort."
+	exit 3
+fi
+
+# Check  BAM file is provided and exists
+if [ "${BAM}" == "" ]
+then
+	1>&2 echo "Missing input BAMs (-b). Abort."
+	exit 4
+
+elif [ ! -f "${BAM}" ]
+then
+	1>&2 echo "BAM file not found. Abort."
+	exit 5
+fi

-#Annotate VCF File
-bash -o pipefail  -c "java -Xmx4g -jar $SNPEFF/SnpSift.jar annotate pharmgkb_v2.vcf $VCF > ${VCF}_annot"
+
+# Check hlascan dir is provided and exists
+if [ "${HLASCAN}" == "" ]
+then
+	1>&2 echo "Missing input HLAscan path (-s). Abort."
+	exit 6
+
+elif [ ! -d "${HLASCAN}" ]
+then
+	1>&2 echo "HLAscan path not found. Abort."
+	exit 7
+fi
+
+# Check vcf file is provided and exists
+if [ "${VCF}" == "" ]
+then
+	1>&2 echo "Missing input vcf (-v). Abort."
+	exit 8
+
+elif [ ! -f "${VCF}" ]
+then
+	1>&2 echo "vcf file not found. Abort."
+	exit 9
+fi
+
+#  Check output dir is provided and exists
+if [ "${OUTPUTDIR}" == "" ]
+then
+	1>&2 echo "Missing input output dir path (-s). Abort."
+	exit 10
+
+elif [ ! -d "${OUTPUTDIR}" ]
+then
+	1>&2 echo "Output dir path not found. Abort."
+	exit 11
+fi
+
+echo "Input pharmgkb file: $PHARMGKB"
+echo "Path to HLAscan: $HLASCAN"
+echo "HLAscan executable: $HLASCAN/hla-paper/haplo_scan_v4.0-hla.py"
+echo "Gene list for HLAscan: $HLASCAN/hla-ref-5gene/gene_list"
+echo "Input bam file: $BAM"
+echo "Input vcf file: $VCF"
+echo "Output directory: ${OUTPUTDIR}"
+echo "Output HLAscan: ${OUTPUTDIR}/HLAscan_result/"
+echo "Output HTML file: $OUTPUTDIR/Report.html" 
+
+#HLASCAN
+python2.7 $HLASCAN/hla-paper/haplo_scan_v4.0-hla.py $BAM $HLASCAN/hla-ref-5gene/gene_list $OUTPUTDIR/HLAscan_result/ 1>> $OUTPUTDIR/HLAscan_log.txt 2>> $OUTPUTDIR/HLAscan_log.txt

 #create HTML
-python2.7 create_html.py -v ${VCF}_annot -h HLAscan_result/Report -b $BAMFILE -t tableau_pharmgkb.tsv -o Report.html
+python2.7 create_html.py -v $VCF -h $OUTPUTDIR/HLAscan_result/Report -t $PHARMGKB -o $OUTPUTDIR/Report.html

--- a/Readme.md
+++ b/Readme.md
+# Extract PharmGKB variants
+
+### A simple way to extract SNV and HLA alleles from exome data
+
+This script extracts known PharmGKB variants from exome data. It works in two steps: first HLA genotyping with HLAscan, then SNV extraction with the
+pysam library. The output is an HTML file readable with any web browser (Internet Explorer, Firefox or Chrome).
+
+## Table of Contents
+
+* [Requirements](#requirements)
+* [Installation](#installation)
+* [Usage](#usage)
+
+## Requirements
+
+* HLAscan v1.0:
+    * [download zip file](https://github.com/SyntekabioTools/HLAscan/releases/download/v1.0.0/HLAscan.v1.0.Files.zip)
+    * [Publication](https://bmcbioinformatics.biomedcentral.com/track/pdf/10.1186/s12859-017-1671-3)
+
+Reference : Ka, S., Lee, S., Hong, J., Cho, Y., ... & Jung, J. (2017). HLAscan: genotyping of the HLA region using next-generation sequencing data. BMC bioinformatics, 18(1), 258.
+
+* Python2.7:
+    * libraries: pysam, htmltag (requires sphinx)
+    * [download python2.7](https://www.python.org/download/releases/2.7/)
+
+* PharmGKB tsv file
+    * [download zip file](https://api.pharmgkb.org/v1/download/file/data/clinicalAnnotations.zip)
+    * we will use the **clinical_annotations.tsv** file from this archive for the next step
+    * Replace all star alleles with dbSNP variants where possible, for example: CYP2C9*2 becomes rs1799853
+    * Use the [CPIC](https://cpicpgx.org/) recommendations or the [PharmVar](https://www.pharmvar.org/) website for the conversion
+
+## Installation
+
+* First install the required libraries with pip:
+    * pip2.7 install pysam
+    * pip2.7 install sphinx
+    * pip2.7 install htmltag
+
+* Clone this repository
+
+## Usage
+
+### Data test
+
+* NA12878 test data:
+    * You can download the VCF file produced by exome sequencing at this link:
+https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/NA12878/Nebraska_NA12878_HG001_TruSeq_Exome/NIST-hg001-7001-b-gatk.vcf
+
+    * Bam file is provided with HLAscan
+
+### Execution
+
+    bash Extract_PharmGKB_Variants.sh -p <PharmGKB file> -b <bam file> -s <path to hlascan files> -v <vcf file> -o <output dir>
--- a/create_html.py
+++ b/create_html.py
@@ -240,7 +240,7 @@ def main():
        f = open(tsvfile, 'r')
        f.close()
    except IOError:
-        print('Tsv file does not exist')
+        print('PharmGKB tsv file does not exist')
        sys.exit(1)

    #execute