#!/usr/bin/env python # -*- coding: utf-8 -*- # ### GAD development file ### # ## hiv_genotyping.py # ## Version : 1.0 # ## Description : # ## Usage : hiv_genotyping.py # ## Output : # ## Requirements : python 2.7+ - pySAM - scipy # # ## Author: yannis.duffourd@u-bourgogne.fr # ## Creation Date: 2016-10-18 # ## last revision date: 2018-03-29 # ## Known bugs: None # ## TODO: # import os import sys import getopt import logging import math import random import string # pysam requirement try: import pysam except ImportError: sys.stderr.write( "pysam library is required and was not found on your system. Please check PYTHONPATH, or install the library. Exiting.\n" ) sys.exit( 1 ) # scipy requirement try: from scipy import stats except ImportError: sys.stderr.write( "scipy library is required and was not found on your system. Please check PYTHONPATH, or install the library. Exiting.\n" ) sys.exit( 1 ) # plotly optional requirement isGraphicalPossible = False try: import plotly.plotly as py from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot import plotly.graph_objs as go from plotly.graph_objs import Scatter, Figure, Layout isGraphicalPossible = True except ImportError: sys.stderr.write( "plotly library is required and was not found on your system. Please check PYTHONPATH, or install the library. Continuing without graphical possibilities.\n" ) currentThread = 0 integraseFile , samFile , rtFile , proteaseFile , referenceFile , logFile , outputFile , allControlBamList , inRunControlBam , gtfFile , csFile = "" , "" , "" , "" , "" , "" , "" , "" , "" , "" , "" # generate a temp hash for the file result hsh = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(12)) depthThreshold = 200 ratioThreshold = 0 pvalueThreshold = 1.0 annotationIsDefined = False nbThread = 1 countStop = False graphical = False loggingLevel = 1 referenceSequence = {} referenceCodon = {} knownMutationTable = {} genes = {} effectifTable = {} totalAltSequencedBase = {} protCodon = {} bamCodon = {} bamCodonReference = {} # codon table codonTable = { "TTT":"F" , "TTC" : "F" , "TTA" : "L" , "TTG" : "L" , "CTT":"L" , "CTC":"L" , "CTA":"L" , "CTG":"L" , "ATT" : "I" , "ATC":"I" , "ATA":"I" , "ATG":"M" , "GTT":"V" , "GTC":"V" , "GTA":"V" , "GTG":"V" , "TCT":"S" , "TCC":"S" , "TCA":"S" , "TCG":"S" , "CCT":"P" , "CCC":"P" , "CCA":"P" , "CCG":"P", "ACT":"T" , "ACC":"T" , "ACA":"T" , "ACG":"T" , "GCT":"A" , "GCC":"A" , "GCA":"A" , "GCG":"A" , "TAT":"Y", "TAC":"Y" , "TAA":"*" , "TAG":"*" , "CAT":"H" , "CAC":"H" , "CAA":"Q" , "CAG" : "Q" , "AAT":"N" , "AAC":"N" , "AAA":"K" , "AAG":"K" , "GAT":"D" , "GAC":"D" , "GAA":"E" , "GAG":"E", "TGT":"C" , "TGC":"C" , "TGA":"*" , "TGG":"W" , "CGT":"R" , "CGC":"R" , "CGA":"R" , "CGG":"R" , "AGT":"S" , "AGC":"S" , "AGA":"R" , "AGG":"R" , "GGT":"G", "GGC":"G" , "GGA":"G", "GGG":"G" } # we need to store some stuff for further statistical tests some values totalRefSequencedBase = {} opts, args = getopt.getopt(sys.argv[1:], 's:i:r:p:R:D:l:P:o:c:b:a:n:SGe:L:g:C:') for opt, arg in opts: if opt in ("-i"): integraseFile = arg elif opt in ("-s"): samFile = arg elif opt in ("-r"): rtFile = arg elif opt in ("-p"): proteaseFile = arg elif opt in ("-R"): referenceFile = arg elif opt in ("-D"): depthThreshold = int(arg) elif opt in ("-l"): ratioThreshold = int(arg) elif opt in ("-P"): pvalueThreshold = float(arg) elif opt in ("-o"): outputFile = arg elif opt in ("-c"): codonResultFile = arg elif opt in ("-C"): csFile = arg elif opt in ("-b"): inRunControlBam = arg elif opt in ("-g"): gtfFile = arg elif opt in ("-a"): allControlBamList = arg elif opt in ("-n"): nbThread = int( arg ) elif opt in ("-S"): countStop = True sys.stderr.write('Stop codon detection activated.') elif opt in ("-G"): if isGraphicalPossible : sys.stderr.write('Graphical capabilities activated.') graphical = True else: sys.stderr.write('Warning : Unable to use graphical capabilities.') graphical = False elif opt in ("-e") : logFile = arg elif opt in ("-L") : loggingLevel = int( arg ) if logFile == "" : logging.basicConfig(stream = sys.stderr , filemode = 'w', level = logging.INFO, format = '%(asctime)s %(levelname)s - %(message)s') else: logging.basicConfig(filename = logFile , filemode = 'w', level = logging.INFO, format = '%(asctime)s %(levelname)s - %(message)s') logging.info('start') # sample name if samFile.find( "/" ) != -1 : sample = samFile.split( "/" ) [-1] else: sample = samFile sample = sample.replace( ".bam" , "" ) sample = sample.split( "." )[0] logging.info( "Sample name detected : %s" % sample ) # codon output file name if countStop == True : codonResultFile = "codons.stop.tsv" else: codonResultFile = "codons.tsv" def phredToInt( incString ): # convert to ascii score = ord( incString ) # substract 33 to the score score -= 33 return score def igf( lddl , chiStat ): if chiStat < 0.0: return 0.0 sc = 1.0 / lddl sc *= chiStat**lddl sc *= math.exp( -chiStat ) sys.stderr.write( "sc value = %s\n" % ( sc ) ) somme = 1.0 nom = 1.0 denom = 1.0 for i in range( 1 , 200): nom *= chiStat lddl += 1 denom *= lddl somme += (nom / denom) if ( somme * sc ) < 0.000000000000001: sys.stderr.write( "igf value = %s\n" % (0.000000000000001) ) return 0.000000000000001 sys.stderr.write( "igf value = %s\n" % (somme * sc ) ) return somme * sc def gamma( lddl ): RECIP_E = 0.36787944117144232159552377016147 TWOPI = 6.283185307179586476925286766559 D = 1.0 / (10.0 * lddl) D = 1.0 / ((12 * lddl) - D) D = (D + lddl) * RECIP_E D = D**lddl D *= math.sqrt(TWOPI / lddl) sys.stderr.write( "gamma = %s\n" % D ) return D def adequationChiTwo( effectTable , localRef , localAlt ): # compute deviation between obs & theoritical thMean = sum( effectTable ) / len( effectTable ) obsMean = localAlt * 100 / ( float( localRef ) + float( localAlt ) ) if obsMean < thMean: return 1 if obsMean == 100: obsMean = 99.99999 logging.info( "Th mean : %s ; obsMean : %s\n" % ( thMean , obsMean )) refDev = ( ( (100 - obsMean) - (100 - thMean) )**2 ) / (100 - obsMean) altDev = ( ( obsMean - thMean )**2 ) / obsMean ddl = 1 qSquare = refDev + altDev alpha = 5 sys.stderr.write( "X2 statistical value = %s\n" % qSquare ) # with alpha = 5% and a ddl = 1, the confidence interval is [0;5.991] # H0 is rejected if qSquare superior to 5.991 with a 5% error risk # so there is asignificant difference between the position wie observed and the rest of the reference. # need to compute a p-value return pochisq( qSquare , 1 ) def pochisq(x, df) : LOG_SQRT_PI = 0.5723649429247000870717135 I_SQRT_PI = 0.5641895835477562869480795 if x <= 0.0 or df < 1 : return 1.0 s = 2.0 * poz( - math.sqrt( x ) ) sys.stderr.write( " p-value : %s\n" % s ) return s def poz(z) : sys.stderr.write( "Incoming z : %s\n" % z ) y = 0 x = 0 w = 0 Z_MAX = 6.0 if z == 0.0 : x = 0.0 else : y = 0.5 * abs(z) if (y >= (Z_MAX * 0.5)): x = 1.0 elif y < 1.0 : w = y * y x = ((((((((0.000124818987 * w - 0.001075204047) * w + 0.005198775019) * w - 0.019198292004) * w + 0.059054035642) * w - 0.151968751364) * w + 0.319152932694) * w - 0.531923007300) * w + 0.797884560593) * y * 2.0 else : y -= 2.0 x = (((((((((((((-0.000045255659 * y + 0.000152529290) * y - 0.000019538132) * y - 0.000676904986) * y + 0.001390604284) * y - 0.000794620820) * y - 0.002034254874) * y + 0.006549791214) * y - 0.010557625006) * y + 0.011630447319) * y - 0.009279453341) * y + 0.005353579108) * y - 0.002141268741) * y + 0.000535310849) * y + 0.999936657524 if z > 0.0 : sys.stderr.write( "Output : %s\n" % ((x + 1.0) * 0.5) ) return ((x + 1.0) * 0.5) else: sys.stderr.write( "Output : %s\n" % ((1.0 - x) * 0.5) ) return ((1.0 - x) * 0.5) # need the bam already parsed to be used. def getEffectiveTable( ref , excludeList ): localEffectifTable = [] for position in bamCodon[ref].keys(): # pass the position if new as a mutation if position in excludeList: continue totalOnPos = 0 totalAltOnPos = 0 refAA = codonTable[ referenceCodon[ref][position] ] for codon in bamCodon[ref][position].keys(): totalOnPos += bamCodon[ref][position][codon] obsAA = codonTable[ codon ] if obsAA != refAA: totalAltOnPos += bamCodon[ref][position][codon] if totalAltOnPos == 0: localEffectifTable.append( 0 ) #~ sys.stderr.write( "Adding 0 to %s effectives for pos : %s \n" % ( ref , position ) ) else: localEffectifTable.append( totalAltOnPos * 100 / float( totalOnPos ) ) #~ sys.stderr.write( "Adding %s to %s effectives for pos : %s \n" % ( totalAltOnPos * 100 / float( totalOnPos ) , ref , position ) ) return localEffectifTable # compute factoriel def fact(n): """fact(n): calcule la factorielle de n (entier >= 0)""" if n<2: return 1 else: return n*fact(n-1) # compute log factoriel def logFactoriel( inc ): ret = 0 while inc > 0: ret += math.log( inc ) inc -= 1 return ret; # compute log of hypergeometrique for FET def logHypergeometricProb( a , b , c , d ): return logFactoriel( a + b ) + logFactoriel( c + d ) + logFactoriel( a + c ) + logFactoriel( b + d )- logFactoriel( a ) - logFactoriel( b ) - logFactoriel( c ) - logFactoriel( d ) - logFactoriel( a + b + c + d ) # compute pvalue from FET def FETPvalue( a , b , c, d ): #sys.stderr.write( "Computing a p-value from FET ..." ) n = a + b + c + d logpCutOff = logHypergeometricProb( a , b , c , d ) pFraction = 0 logpValue = 0 for x in range( 0 , n ): if( ( a + b - x >= 0 ) and ( a + c - x >= 0 ) and ( d - a + x >= 0 ) ) : l = logHypergeometricProb( x , a + b - x , a + c - x , d - a + x ) if l <= logpCutOff : pFraction += math.exp( l - logpCutOff ) logpValue = logpCutOff + math.log( pFraction ) #sys.stderr.write( " done \n" ) return math.exp(logpValue); # determine if a codon is a stop or not def isStop( incCodon ): stopList = [ "TAG" , "TAA" , "TGA" ] if incCodon in stopList : return True else: return False # compute the complement of a sequence def complement( incCodon ): t = {"A":"T" , "T":"A" , "G":"C" ,"C":"G" } s = "" incCodon = incCodon.upper() for i in incCodon: s += t[i] return s # deal with the ref genome def get_genome(): logging.info( "Starting genome parsing" ) referenceStream = open( referenceFile , "r" ) currentChr = "" i = 1 for line in referenceStream: # name of the sequence line = line.strip() if line.startswith( ">" ): line = line.replace( ">" , "" ) line = line.split( " " )[0] currentChr = line referenceSequence[line] = {} logging.info( "New reference sequence detected : %s." % currentChr ) else: for base in line: referenceSequence[currentChr][i] = base i += 1 referenceStream.close() logging.info( "Genome summary : \n" ) for chr in referenceSequence.keys(): logging.info("\t%s\t%s" % ( chr , len(referenceSequence[chr] )) ) logging.info( "Ending genome parsing." ) # codon table for each genes. def get_genes(): # parse the gtf file to get every cds # warning : need to deal with genes in reverse ... logging.info( "Starting genes parsing." ) for line in open( gtfFile , 'r' ): line = line.strip() if line.split( "\t" )[0] not in referenceSequence.keys() : continue if line.split( "\t" )[2] == "CDS" : a = line.split( "\t" )[8] aTable = a.split( ";" ) for elt in aTable : if elt.startswith( "protein_id" ): g = elt.split( "=" )[1] g = g.replace( '"' , '' ) if g not in genes.keys(): genes[g] = {} referenceCodon[g] = {} genes[g]["start"] = [] genes[g]["end"] = [] genes[g]["strand"] = "" genes[g]["chrom"] = line.split( "\t" )[0] genes[g]["start"].append( int( line.split( "\t" )[3] ) ) genes[g]["end"].append( int( line.split( "\t" )[4] ) ) genes[g]["strand"] = line.split( "\t" )[6] # get codon from reference for g in genes.keys(): logging.info( "Getting codons for gene : %s" % g ) codon = "" chrom = genes[g]["chrom"] logging.info( "\tchrom is %s" % chrom ) if genes[g]["strand"] == "+" : logging.info( "\tstrand is +" ) posList = referenceSequence[chrom].keys() posList.sort() for pos in posList: for start,end in zip(genes[g]["start"],genes[g]["end"]) : if (pos >= start) and (pos <= end) : codon += referenceSequence[chrom][pos] if (len(codon) % 3 == 0) and (len(codon) != 0 ) : codon = codon.upper() referenceCodon[g][(pos-3)+1] = codon # ~ referenceCodon[g][(pos-3)] = codon codon = "" if genes[g]["strand"] == "-" : logging.info( "\tstrand is -" ) posList = referenceSequence[chrom].keys() posList.sort() posList.reverse() for pos in posList : for start,end in zip(genes[g]["start"],genes[g]["end"]) : if (pos >= start) and (pos <= end) : codon += referenceSequence[chrom][pos] if (len(codon) % 3 == 0) and (len(codon) != 0 ) : codon = codon.upper() referenceCodon[g][(pos+3)+1] = complement(codon) # ~ referenceCodon[g][(pos+3)] = complement(codon) codon = "" logging.info( "Genes content : " ) for g in referenceCodon.keys(): posList = referenceCodon[g].keys() posList.sort() for pos in posList: logging.info( "%s\t%s\t%s" % ( g , pos , referenceCodon[g][pos] ) ) # conversion to the protein numbers for g in referenceCodon.keys(): protCodon[g] = {} i = 1 posList = referenceCodon[g].keys() posList.sort() for pos in posList : protCodon[g][pos] = i i += 1 logging.info( "/ conversion : " ) for g in protCodon.keys(): posList = protCodon[g].keys() posList.sort() for pos in posList: logging.info( "%s\t%s\t%s" % ( g , pos , protCodon[g][pos] ) ) logging.info( "Ending genes parsing." ) def display_reference_codon(): w = open( "referenceCodon.tsv" , "w" ) w.write( "##### Ref codons : #####\n" ) for g in referenceCodon.keys(): for pos in sorted(referenceCodon[g].keys()): w.write( "\t%s : %s : %s \n" % (g, pos , referenceCodon[g][pos])) w.close() def display_prot_codon(): w = open( "protCodon.tsv" , "w" ) w.write( "##### Prot codons : #####\n" ) for g in protCodon.keys(): for pos in sorted(protCodon[g].keys()): w.write( "\t%s : %s : %s \n" % (g, pos , protCodon[g][pos])) w.close() def display_bam_codon(): w = open( "bamCodon.tsv" , "w" ) w.write( "###### bam codons ######\n") for a in bamCodon.keys(): for b in sorted(bamCodon[a].keys()): for c in bamCodon[a][b].keys(): w.write( "%s %s %s %s\n" % ( a , b , c , bamCodon[a][b][c] ) ) w.close() def display_bam_ref_codon(): w = open( "bamRefCodon.tsv" , "w" ) w.write( "###### bam codons ######\n") for a in bamCodonReference.keys(): for b in sorted(bamCodonReference[a].keys()): for c in bamCodonReference[a][b].keys(): w.write( "%s %s %s %s\n" % ( a , b , c , bamCodonReference[a][b][c] ) ) w.close() def display_bam_reference_codon(): w = open( "bamCodonReference.tsv" , "w" ) w.write( "###### bam codons ######\n") for a in bamCodonReference.keys(): for b in sorted(bamCodonReference[a].keys()): for c in bamCodonReference[a][b].keys(): w.write( "%s %s %s %s\n" % ( a , b , c , bamCodonReference[a][b][c] ) ) w.close() def get_annotation(): logging.info( "Starting genotype tables parsing" ) logging.info( "Starting %s parsing" % integraseFile ) global annotationIsDefined if integraseFile != "": annotationIsDefined = True refFound = False # parse & store genotyping files integraseStream = open( integraseFile, "r" ) for line in integraseStream: if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or len(line) < 10: continue if line.startswith( "#" ) : temp = line.strip().split( "#" ) ref = temp[1] knownMutationTable[ref] = {} refFound = True continue if not refFound : logging.error( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % integraseFile ) sys.exit( 1 ) lineTable = line.strip().split( "\t" ) index = lineTable[2] + lineTable[0] + lineTable[4] if loggingLevel >= 3: logging.info( "index : %s" % index ) if lineTable[2] == lineTable[4]: continue knownMutationTable[ref][index] = lineTable[5] integraseStream.close() logging.info( "\tINTEGRASE : OK\n" ) else: logging.info( "\tINTEGRASE : no file provided" ) logging.info( "Starting %s parsing" % rtFile ) if rtFile != "": annotationIsDefined = True refFound = False # parse & store genotyping files reverseTranscriptaseStream = open( rtFile, "r" ) for line in reverseTranscriptaseStream: if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or len(line) < 10: continue if line.startswith( "#" ) : temp = line.strip().split( "#" ) ref = temp[1] knownMutationTable[ref] = {} refFound = True continue if not refFound : logging.error( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % rtFile ) sys.exit( 1 ) lineTable = line.strip().split( "\t" ) index = lineTable[2] + lineTable[0] + lineTable[4] if loggingLevel >= 3: logging.info( "index : %s" % index ) if lineTable[2] == lineTable[4]: continue knownMutationTable[ref][index] = lineTable[5] reverseTranscriptaseStream.close() logging.info( "\tREVERSETRANSCRIPTASE : OK\n" ) else: logging.info( "\tREVERSETRANSCRIPTASE : no file provided\n" ) logging.info( "Starting %s parsing" % proteaseFile ) if proteaseFile != "": annotationIsDefined = True refFound = False # parse & store genotyping files proteaseStream = open( proteaseFile, "r" ) for line in proteaseStream: if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or len(line) < 10: continue if line.startswith( "#" ) : temp = line.strip().split( "#" ) ref = temp[1] knownMutationTable[ref] = {} refFound = True continue if not refFound : logging.error( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % proteaseFile ) sys.exit( 1 ) lineTable = line.strip().split( "\t" ) index = lineTable[2] + lineTable[0] + lineTable[4] if loggingLevel >= 3: logging.info( "index : %s" % index ) if lineTable[2] == lineTable[4]: continue knownMutationTable[ref][index] = lineTable[5] proteaseStream.close() logging.info( "\tPROTEASE : OK\n" ) else: logging.info( "\tPROTEASE : no file provided\n" ) logging.info( "Genotype tables parsing done\n" ) resultTable = {} qualityTable = {} resultTable["ReverseTranscriptase"] = {} resultTable["Protease"] = {} resultTable["Integrase"] = {} qualityTable["ReverseTranscriptase"] = {} qualityTable["Protease"] = {} qualityTable["Integrase"] = {} ###### deal with inRunControlBam def read_inRunControl(): global bamCodonReference logging.info( "Starting bam control file parsing : %s" % ( inRunControlBam ) ) bamIterRef = pysam.AlignmentFile( inRunControlBam , "r" ) bamCodonReference = {} # samStream = open( samFile , "r" ) countLine = 0 for line in bamIterRef: if ( countLine != 0 ) and (countLine % 200000 == 0 ): logging.info( "%s aln read\n" % countLine ) if loggingLevel >= 3: logging.info('New read to parse : %s' % str(line) ) # pass bad alignements if (line.is_unmapped == True ) or (line.is_secondary == True ) or (line.is_supplementary == True) or (int(line.mapping_quality) < 30) : if loggingLevel >= 3: logging.info('Passing sequence : bad quality ' ) continue # get position and ref ref = line.reference_name position = int( line.reference_start) + 1 sequence = line.query_sequence quality = line.query_qualities # non aligned reads, theoritically unreachable because tested before if ref == "*" : if loggingLevel >= 3: logging.info('Passing sequence : not aligned : %s ' % ref ) continue # test CIGAR string : if insertion or deletion, we skip the read cigar = line.cigarstring if "I" in cigar or "H" in cigar or "S" in cigar or "D" in cigar: if loggingLevel >= 3: logging.info('Passing sequence : CIGAR contains a special event : %s ' % cigar ) continue currentPosition = position i = 0 countLine += 1 # browse every codons, but need to determine if the first codon is complete or not while ( i < len(sequence) ) and (i + 3 <= len( sequence ) ) : complete = False for g in genes.keys(): for start,end in zip(genes[g]["start"],genes[g]["end"]) : if currentPosition >= start and currentPosition <= end : # manage new chr encounters if not bamCodonReference.has_key( g ): logging.info('New gene found in data : %s' % g ) bamCodonReference[g] = {} if protCodon[g].has_key( currentPosition ): complete = True codon = sequence[i:i + 3] codon = codon.upper() if loggingLevel >= 3: logging.info('Adding a codon on ' + g + ' : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) + ' ; codon : ' + codon + '(' + str(((currentPosition-start)/3)+1 ) + ')' ) qualityCodon = quality[i:i+3] qualityScore = 0 takeIt = 1 for base in qualityCodon: qualityScore += int( base ) if int(base) < 30 : takeIt = 0 if takeIt == 1: if not bamCodonReference[g].has_key(((currentPosition-start)/3)+1 ): bamCodonReference[g][((currentPosition-start)/3)+1] = {} if not bamCodonReference[g][((currentPosition-start)/3)+1].has_key( codon ): bamCodonReference[g][((currentPosition-start)/3)+1][codon] = 0 bamCodonReference[g][((currentPosition-start)/3)+1][codon] += 1 else: complete = False if complete : i += 3 currentPosition += 3 else: # codon is not complete if loggingLevel >= 3: logging.info('Not on a 1st base, passing to next base : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) ) i += 1 currentPosition += 1 logging.info( "Parsing done with %s aln treated\n" % countLine ) def read_sample(): ############ deal with bam file logging.info( "Parsing bam file : %s ..." % ( samFile ) ) # ~ logging.info( "Starting bam file parsing : %s" % ( samFile ) ) bannedReadID = [] countLine = 0 bamIter = pysam.AlignmentFile( samFile , "r" ) global bamCodon #samStream = open( samFile , "r" ) for line in bamIter: if loggingLevel >= 3: logging.info('New read to parse : %s' % str(line) ) # pass bad alignements if (line.is_unmapped == True ) or (line.is_secondary == True ) or (line.is_supplementary == True) or (int(line.mapping_quality) < 30) : if loggingLevel >= 3: logging.info('Passing sequence : bad quality ' ) continue # pass stop pairs if activated if line.query_name in bannedReadID : # ~ sys.stderr.write( 'Passing sequence : stop in pair \n' ) continue # get position and ref ref = line.reference_name position = int( line.reference_start) + 1 sequence = line.query_sequence quality = line.query_qualities # non aligned reads, theoritically unreachable because tested before if ref == "*" : if loggingLevel >= 3: logging.info('Passing sequence : not aligned : %s ' % ref ) continue #~ sys.stderr.write( "%s\n" % refTable ) # test CIGAR string : if insertion or deletion, we skip the read cigar = line.cigarstring if "I" in cigar or "H" in cigar or "S" in cigar or "D" in cigar: if loggingLevel >= 3: logging.info('Passing sequence : CIGAR contains a special event : %s ' % cigar ) continue # manage deletion # ~ positionOfDel = 0 # ~ lengthOfDel = 0 # ~ if "D" in cigar: # ~ #logging.info('Passing sequence : Deletion detected : ' + cigar + " ; sequence : " + sequence ) # ~ # get position of the del in the read and its size # ~ for elt in line.cigartuples: # ~ if elt[0] == 2 : # ~ lengthOfDel = elt[1] # ~ break # ~ if elt[0] == 0: # ~ positionOfDel += elt[1] currentPosition = position # extract codon sequences i = 1 # browse the sequence to look for stop codon if stop analysis detected takeSequence = True ## need a full rewamp of the stop detection #### # ~ if countStop: # ~ while i < len(sequence) : # ~ if i + 3 > len( sequence ): # ~ break # ~ if currentPosition % 3 == 0 or currentPosition == 1: # ~ codon = sequence[i:i + 3] # ~ codon = codon.upper() # ~ if isStop( codon ): # ~ takeSequence = False # ~ bannedReadID.append( line.query_name ) # ~ break # ~ i += 3 # ~ currentPosition += 3 # ~ else: # ~ i += 1 # ~ currentPosition += 1 ################################################# countLine += 1 currentPosition = position # browse every codons, but need to determine if the first codon is complete or not if takeSequence : i = 0 # deal with end of sequence while ( i < len(sequence) ) and (i + 3 <= len( sequence ) ) : # detect the gene complete = False for g in genes.keys(): for start,end in zip(genes[g]["start"],genes[g]["end"]) : if currentPosition >= start and currentPosition <= end : # manage new gene encounters if not bamCodon.has_key( g ): logging.info('New reference found : %s' % g ) bamCodon[g] = {} # codon is complete if protCodon[g].has_key( currentPosition ): complete = True codon = sequence[i:i + 3] codon = codon.upper() if loggingLevel >= 3: logging.info('Adding on ' + g + ' : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) + ' ; codon : ' + codon + '(' + str(((currentPosition-start)/3)+1 ) + ')' ) qualityCodon = quality[i:i+3] qualityScore = 0 takeIt = 1 for base in qualityCodon: qualityScore += int( base ) if int(base) < 30 : takeIt = 0 if takeIt == 1: if not bamCodon[g].has_key(((currentPosition-start)/3)+1 ): bamCodon[g][((currentPosition-start)/3)+1] = {} if not bamCodon[g][((currentPosition-start)/3)+1].has_key( codon ): bamCodon[g][((currentPosition-start)/3)+1][codon] = 0 bamCodon[g][((currentPosition-start)/3)+1][codon] += 1 else: # codon is not complete complete = False if complete : i += 3 currentPosition += 3 else: if loggingLevel >= 3: logging.info('Not on a 1st base, passing to next base : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) ) i += 1 currentPosition += 1 logging.info( " done with %s aln kept\n" % countLine ) logging.info( "%s sequence discarded due to stop codon.\n" % len( bannedReadID ) ) def get_aa( g , ppos ): gpos = 0 # get the genomic position for elt in protCodon[g].keys(): if ( protCodon[g][elt] == ppos ): gpos = elt break # verify if gpos == 0: logging.error( "Error : submitted position not in the reference. Please check data consistency\n" ) # get the codon codon = referenceCodon[g][gpos] # get the aa aa = codonTable[codon] # ~ sys.stderr.write( "Gene : %s ; genomic pos : %s ; proteic pos : %s ; codon : %s ; aa : %s\n" % ( g , gpos , ppos , codon , aa ) ) return aa def get_gpos_from_ppos( g, ppos ): for elt in protCodon[g].keys(): if ( protCodon[g][elt] == ppos ): gpos = elt break # verify if gpos == 0: logging.error( "Error : submitted position not in the reference. Please check data consistency\n" ) return gpos # let's modelize the background noise def modelize_noise(): global bamCodon # count codons wich are not the reference logging.info( "Counting codons ... " ) for ref in bamCodon.keys(): # ~ sys.stderr.write( "\ton ref %s\n" % ref ) outStream = open( sample + "/" + sample + "." + ref + ".percent.data" , "w" ) noiseTable = {} for position in bamCodon[ref].keys(): # ~ sys.stderr.write( "\t\ton p.pos %s\n" % position ) totalOnPos = 0 if not noiseTable.has_key( position ): noiseTable[position] = {} # position is in prot. # We need to convert to a g position refAA = get_aa( ref , position ) for codon in bamCodon[ref][position].keys(): # ~ sys.stderr.write( "\t\t\ton codon %s\n" % codon ) totalOnPos += bamCodon[ref][position][codon] obsAA = codonTable[ codon ] if obsAA != refAA: if not noiseTable[position].has_key( obsAA ): noiseTable[position][obsAA] = 0 noiseTable[position][obsAA] += bamCodon[ref][position][codon] # we have all data for counting for aa in noiseTable[position].keys(): noiseTable[position][aa] = noiseTable[position][aa] * 100 / float(totalOnPos) listofAA = list( set( codonTable.values() ) ) outStream.write( "#position\t%s\n" % "\t".join( listofAA ) ) for position in noiseTable.keys(): outStream.write( "%s" % position ) for a in listofAA : if noiseTable[position].has_key( a ): outStream.write( "\t%s" % noiseTable[position][a] ) else: outStream.write( "\t0" ) outStream.write( "\n" ) outStream.close() sys.stderr.write( " done\n " ) # count codons wich are not the reference logging.info( "Modelizing background noise ... " ) for ref in bamCodon.keys(): totalRefSequencedBase[ref] = 0 totalAltSequencedBase[ref] = 0 bigTotal = 0 noiseTable = {} if not noiseTable.has_key( ref ): noiseTable[ref] = {} outStream = open( sample + "/" + sample + "." + ref + ".value.data" , "w" ) for position in bamCodon[ref].keys(): totalOnPos = 0 totalAltOnPos = 0 if not noiseTable[ref].has_key( position ): noiseTable[ref][position] = {} refAA = get_aa( ref , position ) for codon in bamCodon[ref][position].keys(): totalOnPos += bamCodon[ref][position][codon] bigTotal += bamCodon[ref][position][codon] obsAA = codonTable[ codon ] if obsAA != refAA: if not noiseTable[ref][position].has_key( obsAA ): noiseTable[ref][position][obsAA] = 0 noiseTable[ref][position][obsAA] += bamCodon[ref][position][codon] totalAltSequencedBase[ref] += bamCodon[ref][position][codon] totalAltOnPos += bamCodon[ref][position][codon] #~ sys.stderr.write( "Position : %s , Ref : %s , Alt : %s , Total : %s \n" % ( position , totalOnPos - totalAltOnPos , totalAltOnPos , totalOnPos ) ) totalRefSequencedBase[ref] += bigTotal - totalAltSequencedBase[ref] listofAA = list( set( codonTable.values() ) ) outStream.write( "#position\t%s\n" % "\t".join( listofAA ) ) for position in noiseTable[ref].keys(): outStream.write( "%s" % position ) for a in listofAA : if noiseTable[ref][position].has_key( a ): outStream.write( "\t%s" % noiseTable[ref][position][a] ) else: outStream.write( "\t0" ) outStream.write( "\n" ) outStream.close() logging.info( " done\n " ) # calculate distributions def compute_distribution(): sys.stderr.write( "Computing distribution ... " ) for ref in bamCodon.keys(): sys.stderr.write( "Calculating distribution for %s \n" % ref ) incStream = open( sample + "/" + sample + "." + ref + ".value.data" , "r" ) contingencyTable = {} for line in incStream : #sys.stderr.write( "%s" % line ) if line.startswith( "#position" ): continue myTable = line.split( "\t" ) for elt in myTable[1:]: i_elt = int( elt ) if not contingencyTable.has_key( i_elt ): contingencyTable[i_elt] = 0 contingencyTable[i_elt] += 1 totalSum = 0 for elt in contingencyTable.keys(): totalSum += contingencyTable[elt] # calculate probabilities for elt in contingencyTable.keys(): tmp = contingencyTable[elt] contingencyTable[elt] = tmp / float( totalSum ) outStream = open( sample + "/" + sample + "." + ref+".value.density" , "w" ) outStream.write( "#value\tcount\n" ) for elt in contingencyTable.keys(): outStream.write( "%s\t%s\n" % ( elt , contingencyTable[elt] ) ) outStream.close() sys.stderr.write( " done\n " ) def variant_calling(): global bamCodon global bamCodonReference # detect the potential background noise a = 0 b = 1 turn = 1 positionToDiscard = [] bg = {} while b != 0: bg = {} bgEff = {} b = 0 logging.info( "BG estimation : turn %s\n" % turn ) # first : get data for chisquare test for ref in bamCodon.keys(): if loggingLevel >= 3: logging.info( "\tOn ref : %s\n" % ref ) altCount = [] refCount = [] if not bg.has_key( "ref" ): bg[ref] = [] bgEff[ref] = [] for position in bamCodon[ref].keys(): if loggingLevel >= 3: logging.info( "\t\tOn pos : %s\n" % position ) positionAltCount = 0 positionTotalCount = 0 if position in positionToDiscard: logging.info( "discarded pos : %s Passing \n" % position ) continue for codon in bamCodon[ref][position].keys(): positionTotalCount += bamCodon[ref][position][codon] # ~ sys.stderr.write( "On %s : pos %s , totalD : %s\n" % ( ref , position , positionTotalCount ) ) for codon in bamCodon[ref][position].keys(): if loggingLevel >= 3: logging.info( "\t\t\tOn codon : %s\n" % codon ) gpos = get_gpos_from_ppos( ref, position ) refCodon = referenceCodon[ref][gpos] if codon != refCodon : positionAltCount = bamCodon[ref][position][codon] if loggingLevel >= 3: logging.info( "On %s : pos %s , alt : %s ; %s vs %s \n" % ( ref , position , positionAltCount , refCodon , codon ) ) bg[ref].append( positionAltCount / float( positionTotalCount ) ) if positionAltCount > positionTotalCount : logging.critical( "Critical error : more alt allele than the total on position %s\n" % position ) sys.exit( 1 ) altCount.append( positionAltCount ) refCount.append( positionTotalCount ) if len( altCount ) == 0 : logging.error( "Error : all position (alt) excluded on ref %s\n" % ref ) continue if len( refCount ) == 0 : logging.error( "Error : all position (ref) excluded on ref %s\n" % ref ) continue bgEff[ref].append( sum( altCount ) / len( altCount ) ) bgEff[ref].append( sum( refCount ) / len( refCount ) ) logging.info( "%s : %s , %s\n" % ( ref , bgEff[ref][0] , bgEff[ref][1] ) ) # now redo a loop to find mutations for ref in bamCodon.keys(): for position in bamCodon[ref].keys(): positionAltCount = 0 positionTotalCount = 0 if position in positionToDiscard: continue for codon in bamCodon[ref][position].keys(): positionTotalCount += bamCodon[ref][position][codon] for codon in bamCodon[ref][position].keys(): gpos = get_gpos_from_ppos( ref , position ) refCodon = referenceCodon[ref][gpos] if codon != refCodon : positionAltCount = bamCodon[ref][position][codon] # chisquare test to determine if position is significantly different from ref t,p = stats.chisquare( positionAltCount / float( positionTotalCount ) , bg[ref] ) #~ sys.stderr.write( "Testing bg : %s , %s , %s, %s\n" % ( positionAltCount , positionTotalCount , ",".join( bg[ref] ) , p ) ) if p < 0.01: positionToDiscard.append( position ) b += 1 a += 1 logging.info( "End of turn %s with %s new mutation found\n" % ( turn , b ) ) turn += 1 # background noise is estimated with a control sample. logging.info( "Calling variants ... \n" ) nbMutTotal = 0 resultTable = {} if not os.path.isdir( sample ): os.makedirs( sample ) codonFileResult = open( codonResultFile , "w" ) codonFileResult.write( "#Reference\treferenceCodon\treferenceAA\tposition\tsequencedCodon\tsequencedAA\tcounts\tallelicratio\t\tctrlCounts\tctrlRatio\n" ) mutationFileResult = open( hsh , "w" ) testToPerform = [] for ref in bamCodon.keys(): resultTable[ref] = {} totalOnPos = 0 logging.info( "ref : %s\n" % ref ) for position in bamCodon[ref].keys(): #~ logging.info( "ref : " + ref + " ; position : " + str( position ) ) gpos = get_gpos_from_ppos( ref , position ) refCodon = referenceCodon[ref][gpos] refAA = codonTable[refCodon] totalOnPos = 0 totalOnPosReference = 0 for a in bamCodon[ref][position].keys(): totalOnPos += bamCodon[ref][position][a] if not bamCodonReference.has_key( ref ) : logging.warning( "The in-run control sample hasn't any %s ref : passing \n" % ref ) break if bamCodonReference[ref].has_key( position ): for a in bamCodonReference[ref][position].keys(): totalOnPosReference += bamCodonReference[ref][position][a] for codon in bamCodon[ref][position].keys(): count = bamCodon[ref][position][codon] obsAA = codonTable[codon] if bamCodonReference[ref].has_key( position ): if bamCodonReference[ref][position].has_key( codon ): countCtrl = bamCodonReference[ref][position][codon] else: countCtrl = 0 else: countCtrl = 0 ratioCtrl = 0 ratio = 0 if totalOnPos != 0: ratio = ( count * 100 ) / float(totalOnPos) if totalOnPosReference != 0: ratioCtrl = ( countCtrl * 100 ) / float(totalOnPosReference) codonFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s/%s\t%s\t%s\t%s\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , ratio , countCtrl , ratioCtrl ) ) # output mutation if obsAA != refAA and totalOnPos > 200 and count > 30 and countCtrl < count : logging.info( "Testing position : %s on %s , AA : %s (ref %s ), depth : %s , ratio : %s" % ( position , ref , obsAA , refAA , totalOnPos , ratio ) ) # Fisher exact test to determine if different from the in run control oddsratio,pValue = stats.fisher_exact([[count , totalOnPos] , [countCtrl , totalOnPosReference]] ) logging.info( "\tFET p-value : %s" % pValue ) if pValue < 0.01: # contingency chi 2 test vs background noise of the sample chi2, p, dof, ex = stats.chi2_contingency( [[count , totalOnPos ] , [ bgEff[ref][0] , bgEff[ref][1] ]] , correction=False) logging.info( "\tchi2 p-value : %s " % p ) if p < 0.01: logging.info( "Writing in results %s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , pValue , p ) ) mutationFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , pValue , p )) else: logging.info( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s was different from control but in background noise\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , pValue , p ) ) logging.info( "Done, performing statistical tests ...\n" ) codonFileResult.close() mutationFileResult.close() # stats for graphics def compute_stats_for_graphics(): statsForGraph = {} for ref in bamCodon.keys(): statsForGraph[ref] = {} totalOnPos = 0 logging.info( "ref : " + ref ) for position in bamCodon[ref].keys(): #~ logging.info( "ref : " + ref + " ; position : " + str( position ) ) refCodon = referenceCodon[ref][position] refAA = codonTable[refCodon] statsForGraph[ref][position] = {} totalOnPos = 0 totalOnPosReference = 0 for a in bamCodon[ref][position].keys(): totalOnPos += bamCodon[ref][position][a] if bamCodonReference[ref].has_key( position ) : for a in bamCodonReference[ref][position].keys(): totalOnPosReference += bamCodonReference[ref][position][a] else: totalOnPosReference = 0 for codon in bamCodon[ref][position].keys(): statsForGraph[ref][position][codon] = {} count = bamCodon[ref][position][codon] obsAA = codonTable[codon] if bamCodonReference[ref].has_key( position ): if bamCodonReference[ref][position].has_key( codon ): countCtrl = bamCodonReference[ref][position][codon] else: countCtrl = 0 else: countCtrl = 0 # FET oddsratio,pValue = stats.fisher_exact([[count , totalOnPos] , [countCtrl , totalOnPosReference]] ) statsForGraph[ref][position][codon]['FET'] = pValue # chisquare chi2, p, dof, ex = stats.chi2_contingency( [[count , totalOnPos ] , [ bgEff[ref][0] , bgEff[ref][1] ]] , correction=False) statsForGraph[ref][position][codon]['CHI'] = p statsForGraph[ref][position][codon]['codonDepth'] = count # writing final results after annotation def write_results(): global annotationIsDefined finalFileResult = open( outputFile , "w" ) finalFileResult.write( "#Reference\taa Position\tReference Codon\tReference aa\tAlternative Codon\tAlternative aa\tDepth\tAlternative Allele Ratio\tFET ctrl p-value\tFET bg pValue\tAnnotation\n") mutationFileResult = open( hsh , "r" ) for line in mutationFileResult: lineTable = line.strip().split( "\t" ) ref = lineTable[0] annotation = "No annotation found" if annotationIsDefined : index = lineTable[2] + lineTable[3] + lineTable[5] logging.info( "looking for annotation for %s\n" % index ) if index in knownMutationTable[ref].keys(): annotation = knownMutationTable[ref][index] depth = int( lineTable[7] ) ratio = ( float( lineTable[6] ) *100 ) / depth finalFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s pct\t%s\t%s\t%s\n" % ( ref , lineTable[3] , lineTable[1] , lineTable[2] , lineTable[4] , lineTable[5] , depth , ratio , lineTable[10], lineTable[11] , annotation ) ) mutationFileResult.close() finalFileResult.close() # writing consensus sequence def compute_consensus(): global bamCodon consFile = open( csFile , "w" ) for ref in bamCodon.keys(): consFile.write( ">" + ref + "\n" ) for position in sorted( bamCodon[ref].keys() ) : count = 0 retainedCodon = "" for codon in bamCodon[ref][position].keys(): if bamCodon[ref][position][codon] > count: count = bamCodon[ref][position][codon] retainedCodon = codon consFile.write( retainedCodon ) consFile.write( "\n" ) consFile.close() # main def main( ): #verify_req() get_genome() get_genes() get_annotation() display_reference_codon() display_prot_codon() read_inRunControl() read_sample() display_bam_codon() display_bam_ref_codon() modelize_noise() variant_calling() write_results() compute_consensus() if __name__ == "__main__": main()