Initial commit of the repository

cd11dcf8 · YannisDuffourd · cd11dcf8
Commit cd11dcf8 authored Mar 19, 2018 by YannisDuffourd
Hide whitespace changes
Inline Side-by-side

Showing with 1194 additions and 0 deletions

vihgo.py vihgo.py +1194 -0

No files found.
--- a/vihgo.py
+++ b/vihgo.py
+#  ### GAD development file  ###
+#  ## hiv_genotyping.py
+#  ## Version : 1.0
+#  ## Description : 
+#  ## Usage : hiv_genotyping.py
+#  ## Output : 
+#  ## Requirements : python 2.7+ - pySAM - scipy 
+#  
+#  ## Author: yannis.duffourd@u-bourgogne.fr
+#  ## Creation Date: 2016-10-18
+#  ## last revision date: 2018-03-19
+#  ## Known bugs: None
+#  ## TODO: 
+#  
+
+
+
+
+
+import os
+import sys
+
+sys.path.insert( 0 , "/user1/gad/ya0902du/miniconda2/lib/python2.7/site-packages")
+
+
+print sys.path
+
+
+import getopt
+import logging
+import pysam 
+import logging
+import math
+import threading
+from scipy import stats
+# ~ import matplotlib.pyplot as plt
+
+currentThread = 0
+integraseFile , samFile , rtFile , proteaseFile , referenceFile = "" , "" , "" , "" , ""
+depthThreshold = 200
+ratioThreshold = 0
+pvalueThreshold = 1.0
+annotationIsDefined = False
+inRunControlBam = ""
+allControlBamList = ""
+nbThread = 1
+countStop = False
+
+
+opts, args = getopt.getopt(sys.argv[1:], 's:i:r:p:R:D:l:P:o:c:b:a:n:S')  
+for opt, arg in opts:
+	if opt in ("-i"):
+		integraseFile = arg
+	elif opt in ("-s"):
+		samFile = arg
+	elif opt in ("-r"):
+		rtFile = arg
+	elif opt in ("-p"):
+		proteaseFile = arg
+	elif opt in ("-R"):
+		referenceFile = arg
+	elif opt in ("-D"):
+		depthThreshold = int(arg)
+	elif opt in ("-l"):
+		ratioThreshold = int(arg)
+	elif opt in ("-P"):
+		pvalueThreshold = float(arg)
+	elif opt in ("-o"):
+		outputFile = arg
+	elif opt in ("-c"):
+		codonResultFile = arg
+	elif opt in ("-b"):
+		inRunControlBam = arg
+	elif opt in ("-a"):
+		allControlBamList = arg
+	elif opt in ("-n"):
+		nbThread = int( arg )
+	elif opt in ("-S"):
+		countStop = True
+		sys.stderr.write('Stop codon detection activated.')
+
+sample = samFile.split( "/" ) [-1]
+sample = sample.replace( ".bam" , "" )
+sample = sample.split( "." )[0]
+
+if countStop == True :
+	codonResultFile = "codons.stop.tsv" 
+else:
+	codonResultFile = "codons.tsv"
+
+
+
+sys.stderr.write('start\n')
+
+class myThread (threading.Thread):
+    def __init__( self , ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , nb ) :
+		threading.Thread.__init__(self)
+		self.ref = ref
+		self.refCodon = refCodon
+		self.refAA = refAA
+		self.position = position
+		self.nb = nb
+		self.codon = codon
+		self.obsAA = obsAA
+		self.count = count
+		self.totalOnPos = totalOnPos
+		self.countCtrl = countCtrl
+		self.totalOnPosReference = totalOnPosReference
+		
+		threadLock.acquire()
+		sys.stderr.write( "New thread number %s , launch with params : %s , %s , %s , %s , %s , %s , %s , %s , %s , %s\n" % ( nb , ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference ) )
+		threadLock.release()
+
+    def run( self ):		
+		global currentThread 
+		oddsratio,pValue = stats.fisher_exact([[self.count , self.totalOnPos] , [self.countCtrl , self.totalOnPosReference]] )
+		threadLock.acquire()
+		sys.stderr.write( "p-value : %s \n" % pValue )
+		if pValue < 0.01:
+			mutationFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( self.ref , self.refCodon , self.refAA , self.position , self.codon , self.obsAA , self.count , self.totalOnPos , self.countCtrl , self.totalOnPosReference , pValue ))
+		currentThread -= 1
+		threadLock.release()	
+
+
+
+    
+def phredToInt( incString ):
+    # convert to ascii 
+    score = ord( incString )
+    
+    # substract 33 to the score 
+    score -= 33
+    
+    return score
+
+
+def igf( lddl , chiStat ):
+    if chiStat < 0.0:
+		return 0.0
+    
+    sc = 1.0 / lddl
+    sc *= chiStat**lddl
+    sc *= math.exp( -chiStat )
+    sys.stderr.write( "sc value = %s\n" % ( sc ) )
+    
+    somme = 1.0
+    nom = 1.0
+    denom = 1.0
+    
+    for i in range( 1 , 200):
+		nom *= chiStat
+		lddl += 1
+		denom *= lddl
+		somme += (nom / denom)
+		
+    if ( somme * sc ) < 0.000000000000001:
+		sys.stderr.write( "igf value = %s\n" % (0.000000000000001) )
+		return 0.000000000000001
+    
+    sys.stderr.write( "igf value = %s\n" % (somme * sc ) )
+    return somme * sc
+    
+
+def gamma( lddl ):
+    RECIP_E = 0.36787944117144232159552377016147
+    TWOPI = 6.283185307179586476925286766559
+    D = 1.0 / (10.0 * lddl)
+    D = 1.0 / ((12 * lddl) - D)
+    D = (D + lddl) * RECIP_E
+    D = D**lddl
+    D *= math.sqrt(TWOPI / lddl)
+    
+    
+    sys.stderr.write( "gamma = %s\n" % D )
+    return D
+
+
+def adequationChiTwo( effectTable , localRef , localAlt  ):
+    
+    
+    # compute deviation between obs & theoritical
+    thMean = sum( effectTable ) / len( effectTable )
+    obsMean = localAlt * 100 / ( float( localRef ) + float( localAlt  ) )
+    
+    
+    if obsMean < thMean:
+		return 1
+    
+    if obsMean == 100:
+		obsMean = 99.99999
+    sys.stderr.write( "Th mean : %s ; obsMean : %s\n" % ( thMean , obsMean ))
+    
+    refDev = ( ( (100 - obsMean) - (100 - thMean) )**2 ) /  (100 - obsMean)
+    altDev = ( ( obsMean - thMean )**2 ) / obsMean
+    ddl = 1
+    qSquare = refDev + altDev
+    alpha = 5
+    
+    sys.stderr.write( "X2 statistical value = %s\n" % qSquare )
+    
+    # with alpha = 5% and a ddl = 1, the confidence interval is [0;5.991] 
+    # H0 is rejected if qSquare superior to  5.991 with a 5% error risk
+    # so there is asignificant difference between the position wie observed and the rest of the reference. 
+    # need to compute a p-value
+    
+    return pochisq( qSquare , 1 )
+    
+    #~ pvalue = igf( ddl , qSquare )
+    #~ pvalue /= gamma( ddl )
+    
+    #~ sys.stderr.write( "p-value = %s\n" % (1.0 - pvalue) )
+    #~ return 1.0 - pvalue
+    
+def pochisq(x, df) :
+       
+    LOG_SQRT_PI = 0.5723649429247000870717135
+    I_SQRT_PI = 0.5641895835477562869480795
+
+    if x <= 0.0 or df < 1 :
+	return 1.0
+
+    s = 2.0 * poz( - math.sqrt( x ) )
+    
+    sys.stderr.write( " p-value : %s\n" % s )
+    return s
+ 
+def poz(z) :
+    sys.stderr.write( "Incoming z : %s\n" % z )
+
+    y = 0 
+    x = 0 
+    w = 0
+    Z_MAX = 6.0          
+
+    if z == 0.0 :
+		x = 0.0
+    else :
+		y = 0.5 * abs(z)
+		
+		if (y >= (Z_MAX * 0.5)): 
+			x = 1.0
+		elif y < 1.0 :
+			w = y * y
+			x = ((((((((0.000124818987 * w - 0.001075204047) * w + 0.005198775019) * w - 0.019198292004) * w + 0.059054035642) * w
+				 - 0.151968751364) * w + 0.319152932694) * w - 0.531923007300) * w + 0.797884560593) * y * 2.0
+		else :
+			y -= 2.0
+			x = (((((((((((((-0.000045255659 * y + 0.000152529290) * y - 0.000019538132) * y - 0.000676904986) * y + 0.001390604284) * y -   0.000794620820) * y - 0.002034254874) * y + 0.006549791214) * y - 0.010557625006) * y + 0.011630447319) * y - 0.009279453341) * y + 0.005353579108) * y - 0.002141268741) * y + 0.000535310849) * y + 0.999936657524
+		
+	
+    if z > 0.0 :
+		sys.stderr.write( "Output : %s\n" % ((x + 1.0) * 0.5) )
+		return ((x + 1.0) * 0.5)
+    else:
+		sys.stderr.write( "Output : %s\n" % ((1.0 - x) * 0.5) )
+		return ((1.0 - x) * 0.5)
+
+# need the bam already parsed to be used.
+def getEffectiveTable( ref , excludeList ):
+    localEffectifTable = []
+    for position in bamCodon[ref].keys():
+	
+		# pass the position if new as a mutation
+		if position in excludeList:
+			continue
+			
+		totalOnPos = 0
+		totalAltOnPos = 0
+
+		refAA = codonTable[ referenceCodon[ref][position] ]
+		for codon in bamCodon[ref][position].keys():
+			totalOnPos += bamCodon[ref][position][codon]
+			obsAA = codonTable[ codon ]
+			
+			if obsAA != refAA:
+				totalAltOnPos += bamCodon[ref][position][codon]
+
+		if totalAltOnPos == 0:
+			localEffectifTable.append( 0 )
+			#~ sys.stderr.write( "Adding 0 to %s effectives for pos : %s \n" % ( ref , position ) )
+		else:
+			localEffectifTable.append( totalAltOnPos * 100 / float( totalOnPos ) )
+			#~ sys.stderr.write( "Adding %s to %s effectives for pos : %s \n" % ( totalAltOnPos * 100 / float( totalOnPos ) , ref , position ) )
+			
+    return localEffectifTable
+
+# compute factoriel
+def fact(n):
+    """fact(n): calcule la factorielle de n (entier >= 0)"""
+    if n<2:
+        return 1
+    else:
+        return n*fact(n-1)
+
+# compute log factoriel
+def logFactoriel( inc ):
+    ret = 0
+    while inc > 0:
+		ret += math.log( inc )
+		inc -= 1
+    return ret;
+
+# compute log of hypergeometrique for FET
+def logHypergeometricProb( a ,  b , c  , d ):
+    return logFactoriel( a + b ) + logFactoriel( c + d ) + logFactoriel( a + c ) + logFactoriel( b + d )- logFactoriel( a ) - logFactoriel( b ) - logFactoriel( c ) - logFactoriel( d ) - logFactoriel( a + b + c + d )
+
+# compute pvalue from FET
+def FETPvalue( a , b , c, d ):
+	#sys.stderr.write( "Computing a p-value from FET ..." )
+	n = a + b + c + d 
+	logpCutOff = logHypergeometricProb( a , b , c , d )
+	pFraction = 0
+	logpValue = 0
+	
+	for x in range( 0 , n ):
+		if( ( a + b - x >= 0 ) and ( a + c - x >= 0 ) and ( d - a + x >= 0 ) ) :
+			l = logHypergeometricProb( x , a + b - x , a + c - x , d - a + x )
+			if  l <= logpCutOff :
+				pFraction += math.exp( l - logpCutOff )
+
+	logpValue = logpCutOff + math.log( pFraction )
+	#sys.stderr.write( " done \n" )
+	return math.exp(logpValue);
+
+# determine if a codon is a stop or not
+def isStop( incCodon ):
+	stopList = [ "TAG" , "TAA" , "TGA" ]
+	if incCodon in stopList :
+		return True 
+	else:
+		return False
+
+# deal with the ref genome
+sys.stderr.write( "Parsing reference genome ..." )
+referenceSequence = {}
+referenceStream = open( referenceFile , "r" )
+currentChr = ""
+
+for line in referenceStream:
+    # name of the sequence
+    if line.startswith( ">" ):
+		line = line.strip()
+		line = line.replace( ">" , "" )
+		currentChr = line
+		referenceSequence[line] = {}
+    else:
+		i = 1
+		for base in line:
+			referenceSequence[currentChr][i] = base
+			i += 1
+referenceStream.close()
+
+#~ # debug 
+#~ for i in referenceSequence.keys(): 
+    #~ sys.stderr.write( "%s : %s\n" % (i , referenceSequence[i] )) 
+# end of debug
+
+# codon table. 
+referenceCodon = {}
+for chrom in referenceSequence.keys():
+    referenceCodon[chrom] = {}
+    codon = ""
+    for pos in  referenceSequence[chrom].keys():
+		codon += referenceSequence[chrom][pos]
+		if pos % 3 == 0 :
+			codon = codon.upper()
+			referenceCodon[chrom][((pos-3)/3)+1] = codon
+			codon = ""
+
+# debug 
+#~ for i in referenceCodon.keys(): 
+    #~ sys.stderr.write( "%s : %s\n" % (i , referenceCodon[i] )) 
+# end of debug	    
+
+sys.stderr.write( " done\n" )
+
+knownMutationTable = {}
+
+
+sys.stderr.write( "Parsing genotyping tables ...\n" )
+if integraseFile != "":
+    annotationIsDefined = True
+    refFound = False
+    # parse & store genotyping files
+    
+    integraseStream = open( integraseFile,  "r" )
+    for line in integraseStream:
+		if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or  len(line) < 10:
+			continue
+		
+		if line.startswith( "#" ) :
+			temp = line.strip().split( "#" )
+			ref = temp[1]
+			knownMutationTable[ref] = {}
+			refFound = True
+			continue
+	
+		if not refFound :
+			sys.stderr.write( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % integraseFile )
+			sys.exit( 1 )
+			
+		lineTable = line.strip().split( "\t" )
+		index = lineTable[2] + lineTable[0] + lineTable[4]
+		#~ sys.stderr.write( "index : %s\n" % index )
+		if lineTable[2] == lineTable[4]:
+			continue
+	
+		knownMutationTable[ref][index] = lineTable[5]
+	
+    integraseStream.close()
+    sys.stderr.write( "\tINTEGRASE : OK\n" )
+else:
+    sys.stderr.write( "\tINTEGRASE : no file provided\n" )
+
+if rtFile != "":
+    annotationIsDefined = True
+    refFound = False
+    # parse & store genotyping files
+    
+    reverseTranscriptaseStream = open( rtFile,  "r" )
+    for line in reverseTranscriptaseStream:
+		if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or  len(line) < 10:
+			continue
+		
+		if line.startswith( "#" ) :
+			temp = line.strip().split( "#" )
+			ref = temp[1]
+			knownMutationTable[ref] = {}
+			refFound = True
+			continue
+		
+		if not refFound :
+			sys.stderr.write( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % rtFile )
+			sys.exit( 1 )
+		lineTable = line.strip().split( "\t" )
+		index = lineTable[2] + lineTable[0] + lineTable[4]
+		if lineTable[2] == lineTable[4]:
+			continue
+		knownMutationTable[ref][index] = lineTable[5]
+	
+    reverseTranscriptaseStream.close()
+    sys.stderr.write( "\tREVERSETRANSCRIPTASE : OK\n" )
+else:
+    sys.stderr.write( "\tREVERSETRANSCRIPTASE : no file provided\n" )
+	
+if proteaseFile != "":
+    annotationIsDefined = True
+    refFound = False
+    # parse & store genotyping files
+    
+    proteaseStream = open( proteaseFile,  "r" )
+    for line in proteaseStream:
+		if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or  len(line) < 10:
+			continue
+		
+		if line.startswith( "#" ) :
+			temp = line.strip().split( "#" )
+			ref = temp[1]
+			knownMutationTable[ref] = {}
+			refFound = True
+			continue
+		
+		if not refFound :
+			sys.stderr.write( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % proteaseFile )
+			sys.exit( 1 )
+		lineTable = line.strip().split( "\t" )
+		index = lineTable[2] + lineTable[0] + lineTable[4]
+		if lineTable[2] == lineTable[4]:
+			continue
+		knownMutationTable[ref][index] = lineTable[5]
+	
+    proteaseStream.close()
+    sys.stderr.write( "\tPROTEASE : OK\n" )
+else:
+    sys.stderr.write( "\tPROTEASE : no file provided\n" )
+   
+
+
+sys.stderr.write( " ... done\n" )
+sys.stderr.write( "Parsing bam to genotype ... " )
+
+resultTable = {}
+qualityTable = {}
+
+resultTable["ReverseTranscriptase"] = {}
+resultTable["Protease"] = {}
+resultTable["Integrase"] = {}
+
+qualityTable["ReverseTranscriptase"] = {}
+qualityTable["Protease"] = {}
+qualityTable["Integrase"] = {}
+
+effectifTable = {}
+
+###### deal with inRunControlBam 
+sys.stderr.write( "Parsing bam control file : %s ..." % ( inRunControlBam ) )
+bamIterRef = pysam.AlignmentFile( inRunControlBam , "r" )
+bamCodonReference = {}
+# samStream = open( samFile , "r" )
+
+for line in bamIterRef:
+	#logging.info('########\nNew read to parse : ' + str(line) )
+	# pass bad alignements
+	if (line.is_unmapped == True ) or (line.is_secondary == True ) or (line.is_supplementary == True) or (int(line.mapping_quality) < 30) :
+		#~ logging.info('Passing sequence : bad quality ' )
+		continue
+
+	# get position and ref 
+	ref = line.reference_name
+	position = int( line.reference_start) + 1
+	sequence = line.query_sequence
+	quality = line.query_qualities
+	
+	# non aligned reads, theoritically unreachable because tested before
+	if ref == "*" :
+		#~ logging.info('Passing sequence : not aligned : ' + ref )
+		continue
+		
+	# manage new chr encounters 
+	if not bamCodonReference.has_key( ref ):
+		logging.info('New reference found : ' + ref )
+		bamCodonReference[ref] = {}
+		
+	#~ sys.stderr.write( "%s\n" % refTable )
+	
+	# test CIGAR string : if insertion or deletion, we skip the read
+	cigar = line.cigarstring
+	
+	if "I" in cigar or "H" in cigar or "S" in cigar or "D" in cigar:
+		#logging.info('Passing sequence : CIGAR is bad: ' + cigar )
+		continue
+		
+	# manage deletion 
+	positionOfDel = 0 
+	lengthOfDel = 0
+	if "D" in cigar:
+	#logging.info('Passing sequence : Deletion detected : ' + cigar + " ; sequence : " + sequence )
+	
+		# get position of the del in the read and its size
+		for elt in line.cigartuples:
+			if elt[0] == 2 :
+				lengthOfDel = elt[1]
+				break
+			if elt[0] == 0:
+				positionOfDel += elt[1]
+			
+
+
+	# deal with 0-based bam position 
+	#position += 1
+	currentPosition = position
+	# extract codon sequences
+	i = 1 
+	
+	# browse every codons, but need to determine if the first codon is complete or not
+	while i < len(sequence) :
+		# deal with end of sequence
+		if i + 3 > len( sequence ):
+			break
+		
+		# codon is complete
+		if currentPosition % 3 == 0 or currentPosition == 1:
+			
+			#~ # manage del 
+			#~ if i >= positionOfDel and (i + 3) <= positionOfDel :
+			 #~ test = 1
+			
+			
+			
+			codon = sequence[i:i + 3]
+			codon = codon.upper()
+			#~ logging.info('Adding a codon on ' + ref + ' : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) + ' ; codon : ' + codon + '(' + str((currentPosition/3)+1 ) + ')' )
+			qualityCodon = quality[i:i+3]
+			qualityScore = 0
+			takeIt = 1
+			for base in qualityCodon:
+				qualityScore += int( base )
+				if int(base) < 30 :
+					takeIt = 0
+				
+				if takeIt == 1:
+					if not bamCodonReference[ref].has_key((currentPosition/3)+1 ):
+						bamCodonReference[ref][(currentPosition/3)+1] = {}
+					if not bamCodonReference[ref][(currentPosition/3)+1].has_key( codon ):
+						bamCodonReference[ref][(currentPosition/3)+1][codon] = 0
+					bamCodonReference[ref][(currentPosition/3)+1][codon] += 1
+			i += 3 
+			currentPosition += 3 
+		else:
+			# codon is not complete
+			#logging.info('Not on a 1st base, passing to next base : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) )
+			i += 1
+			currentPosition += 1 
+		
+sys.stderr.write( " done\n" )
+
+
+
+############ deal with bam file
+sys.stderr.write( "Parsing bam file : %s ..." % ( samFile ) )
+
+
+
+bannedReadID = []
+
+bamIter = pysam.AlignmentFile( samFile , "r" )
+bamCodon = {}
+#samStream = open( samFile , "r" )
+
+for line in bamIter:
+	#logging.info('########\nNew read to parse : ' + str(line) )
+	# pass bad alignements
+	if (line.is_unmapped == True ) or (line.is_secondary == True ) or (line.is_supplementary == True) or (int(line.mapping_quality) < 30) :
+		sys.stderr.write('Passing sequence : bad quality\n' )
+		continue
+	
+	# pass stop pairs if activated
+	if line.query_name in bannedReadID :
+		# ~ sys.stderr.write( 'Passing sequence : stop in pair \n' ) 
+		continue
+
+	# get position and ref 
+	ref = line.reference_name
+	position = int( line.reference_start) + 1
+	sequence = line.query_sequence
+	quality = line.query_qualities
+	
+	# non aligned reads, theoritically unreachable because tested before
+	if ref == "*" :
+		sys.stderr.write('Passing sequence : not aligned : %s \n' % ref )
+		continue
+	
+	# manage new chr encounters 
+	if not bamCodon.has_key( ref ):
+		sys.stderr.write('New reference found : %s\n' % ref )
+		bamCodon[ref] = {}
+	
+	#~ sys.stderr.write( "%s\n" % refTable )
+	
+	# test CIGAR string : if insertion or deletion, we skip the read
+	cigar = line.cigarstring
+	
+	if "I" in cigar or "H" in cigar or "S" in cigar or "D" in cigar:
+		#logging.info('Passing sequence : CIGAR is bad: ' + cigar )
+		continue
+	
+	# manage deletion 
+	positionOfDel = 0 
+	lengthOfDel = 0
+	if "D" in cigar:
+		#logging.info('Passing sequence : Deletion detected : ' + cigar + " ; sequence : " + sequence )
+		
+		# get position of the del in the read and its size
+		for elt in line.cigartuples:
+			if elt[0] == 2 :
+				lengthOfDel = elt[1]
+				break
+			if elt[0] == 0:
+				positionOfDel += elt[1]
+		
+
+
+	# deal with 0-based bam position 
+	#position += 1
+	currentPosition = position
+	# extract codon sequences
+	i = 1 
+	
+	# browse every codons, but need to determine if the first codon is complete or not
+	while i < len(sequence) :
+		# deal with end of sequence
+		if i + 3 > len( sequence ):
+			break
+		
+		# codon is complete
+		if currentPosition % 3 == 0 or currentPosition == 1:
+			
+			#~ # manage del 
+			#~ if i >= positionOfDel and (i + 3) <= positionOfDel :
+			 #~ test = 1
+			
+			
+			
+			codon = sequence[i:i + 3]
+			codon = codon.upper()
+			#~ logging.info('Adding a codon on ' + ref + ' : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) + ' ; codon : ' + codon + '(' + str((currentPosition/3)+1 ) + ')' )
+			qualityCodon = quality[i:i+3]
+			qualityScore = 0
+			takeIt = 1
+			for base in qualityCodon:
+				qualityScore += int( base )
+				if int(base) < 30 :
+					takeIt = 0
+			
+			if takeIt == 1:
+				if not bamCodon[ref].has_key((currentPosition/3)+1 ):
+					bamCodon[ref][(currentPosition/3)+1] = {}
+				if not bamCodon[ref][(currentPosition/3)+1].has_key( codon ):
+					bamCodon[ref][(currentPosition/3)+1][codon] = 0
+				bamCodon[ref][(currentPosition/3)+1][codon] += 1
+			i += 3 
+			currentPosition += 3 
+			
+			# break if stop and stop analysis required
+			if countStop:
+				if isStop( codon ):
+					bannedReadID.append( line.query_name )
+					break
+					
+			
+		else:
+			# codon is not complete
+			#logging.info('Not on a 1st base, passing to next base : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) )
+			i += 1
+			currentPosition += 1 
+	    
+	    
+
+sys.stderr.write( " done\n" )
+
+
+sys.stderr.write( "%s sequence passed truncated due to stop codon.\n" % len( bannedReadID ) )
+
+
+# debug 
+#~ for a in bamCodon.keys():
+    #~ for b in bamCodon[a].keys():
+	#~ for c in bamCodon[a][b].keys():
+	    #~ sys.stderr.write( "%s %s %s %s\n" % ( a , b , c , bamCodon[a][b][c] ) )
+
+# debug end 
+
+# codon table 
+codonTable = { "TTT":"F" , "TTC" : "F" , "TTA" : "L" , "TTG" : "L" , "CTT":"L" , "CTC":"L" , "CTA":"L" , "CTG":"L" , "ATT" : "I" , "ATC":"I" , "ATA":"I" , "ATG":"M" , "GTT":"V" , "GTC":"V" , "GTA":"V" , "GTG":"V" , "TCT":"S" , "TCC":"S" , "TCA":"S" , "TCG":"S" , "CCT":"P" , "CCC":"P" , "CCA":"P" , "CCG":"P", "ACT":"T" , "ACC":"T" , "ACA":"T" , "ACG":"T" , "GCT":"A" , "GCC":"A" , "GCA":"A" , "GCG":"A" , "TAT":"Y", "TAC":"Y" , "TAA":"*" , "TAG":"*" , "CAT":"H" , "CAC":"H" , "CAA":"Q" , "CAG" : "Q" , "AAT":"N" , "AAC":"N" , "AAA":"K" , "AAG":"K" , "GAT":"D" , "GAC":"D" , "GAA":"E" , "GAG":"E", "TGT":"C" , "TGC":"C" , "TGA":"*" , "TGG":"W" , "CGT":"R" , "CGC":"R" , "CGA":"R" , "CGG":"R" , "AGT":"S" , "AGC":"S" , "AGA":"R" , "AGG":"R" , "GGT":"G", "GGC":"G" , "GGA":"G", "GGG":"G" }
+
+
+# we need to store some stuff for further statistical tests some values 
+totalRefSequencedBase = {}
+totalAltSequencedBase = {}
+
+
+# let's modelize the background noise : 
+# count codons wich are not the reference
+sys.stderr.write( "Counting codons ... " )
+for ref in bamCodon.keys():
+    outStream = open( sample + "/" + sample + "." + ref + ".percent.data" , "w" )
+    noiseTable = {}
+    for position in bamCodon[ref].keys():
+		totalOnPos = 0
+		if not noiseTable.has_key( position ):
+			noiseTable[position] = {}
+		
+		refAA = codonTable[ referenceCodon[ref][position] ]
+		for codon in bamCodon[ref][position].keys():
+			totalOnPos += bamCodon[ref][position][codon]
+			obsAA = codonTable[ codon ]
+			
+			if obsAA != refAA:
+				if not noiseTable[position].has_key( obsAA ):
+					noiseTable[position][obsAA] = 0
+				noiseTable[position][obsAA] += bamCodon[ref][position][codon]
+			
+    # we have all data for counting
+		for aa in noiseTable[position].keys():
+			noiseTable[position][aa] = noiseTable[position][aa] * 100 / float(totalOnPos)
+
+    listofAA = list( set( codonTable.values() ) )
+    outStream.write( "#position\t%s\n" % "\t".join( listofAA ) )
+    for position in noiseTable.keys():
+		outStream.write( "%s" % position )
+		for a in listofAA : 
+			if noiseTable[position].has_key( a ):
+				outStream.write( "\t%s" % noiseTable[position][a] )
+			else:
+				outStream.write( "\t0" )
+		outStream.write( "\n" )
+    outStream.close()
+sys.stderr.write( " done\n " )
+    
+    
+
+# let's modelize the background noise : 
+# count codons wich are not the reference
+sys.stderr.write( "Modelizing background noise  ... " )
+
+for ref in bamCodon.keys():
+    totalRefSequencedBase[ref] = 0
+    totalAltSequencedBase[ref] = 0
+    
+    bigTotal = 0
+    noiseTable = {}
+    if not noiseTable.has_key( ref ):
+		noiseTable[ref] = {}
+    
+    
+    outStream = open( sample + "/" + sample + "." + ref + ".value.data" , "w" )
+    
+    for position in bamCodon[ref].keys():
+		totalOnPos = 0
+		totalAltOnPos = 0
+		
+		if not noiseTable[ref].has_key( position ):
+			noiseTable[ref][position] = {}
+		
+		refAA = codonTable[ referenceCodon[ref][position] ]
+		for codon in bamCodon[ref][position].keys():
+			totalOnPos += bamCodon[ref][position][codon]
+			bigTotal += bamCodon[ref][position][codon]
+			obsAA = codonTable[ codon ]
+			
+			if obsAA != refAA:
+				if not noiseTable[ref][position].has_key( obsAA ):
+					noiseTable[ref][position][obsAA] = 0
+				noiseTable[ref][position][obsAA] += bamCodon[ref][position][codon]
+				totalAltSequencedBase[ref] += bamCodon[ref][position][codon]
+				totalAltOnPos += bamCodon[ref][position][codon]
+		
+		
+		#~ sys.stderr.write( "Position : %s , Ref : %s ,  Alt : %s , Total : %s \n" % ( position , totalOnPos - totalAltOnPos , totalAltOnPos , totalOnPos ) )
+			
+		
+			
+		totalRefSequencedBase[ref] += bigTotal - totalAltSequencedBase[ref]
+	
+    listofAA = list( set( codonTable.values() ) )
+    outStream.write( "#position\t%s\n" % "\t".join( listofAA ) )
+    for position in noiseTable[ref].keys():
+		outStream.write( "%s" % position )
+		for a in listofAA : 
+			if noiseTable[ref][position].has_key( a ):
+				outStream.write( "\t%s" % noiseTable[ref][position][a] )
+			else:
+				outStream.write( "\t0" )
+		outStream.write( "\n" )
+    outStream.close()
+sys.stderr.write( " done\n " )
+
+
+
+# calculate distributions 
+sys.stderr.write( "Computing distribution  ... " )
+for ref in bamCodon.keys():
+    sys.stderr.write( "Calculating distribution for %s \n" % ref )
+    incStream = open( sample + "/" + sample + "." + ref+".value.data" , "r" )
+    contingencyTable = {}
+    
+    for line in incStream : 
+		#sys.stderr.write( "%s" % line )
+		if line.startswith( "#position" ):
+			continue
+		myTable = line.split( "\t" )
+		
+		for elt in myTable[1:]:
+			i_elt = int( elt )
+			if not contingencyTable.has_key( i_elt ):
+				contingencyTable[i_elt] = 0
+			contingencyTable[i_elt] += 1
+    
+    totalSum = 0
+    for elt in contingencyTable.keys():
+		totalSum += contingencyTable[elt]
+	
+    # calculate probabilities
+    for elt in contingencyTable.keys():
+		tmp = contingencyTable[elt]
+		contingencyTable[elt] = tmp / float( totalSum )
+    
+    
+    outStream = open( sample + "/" + sample + "." + ref+".value.density" , "w" )
+    outStream.write( "#value\tcount\n" )
+    for elt in contingencyTable.keys():
+		outStream.write( "%s\t%s\n" % ( elt , contingencyTable[elt] ) )
+    outStream.close()
+    
+sys.stderr.write( " done\n " )
+
+
+# detect the potential background noise 
+a = 0
+b = 1
+turn = 1
+positionToDiscard = []
+bg = {}
+while b != 0:
+    bg = {}
+    bgEff = {}
+    b = 0
+    
+    
+    
+    sys.stderr.write( "BG estimation : turn %s\n" % turn )
+    
+    # first : get data for chisquare test
+    for ref in bamCodon.keys():
+	altCount = []
+	refCount = []
+	if not bg.has_key( "ref" ):
+	    bg[ref] = []
+	    bgEff[ref] = []
+
+	    
+	for position in bamCodon[ref].keys():
+	    positionAltCount = 0
+	    positionTotalCount = 0
+	    if position in positionToDiscard:
+			sys.stderr.write( "discarded pos : %s Passing \n" % position ) 
+			continue
+	    
+		
+	    for codon in bamCodon[ref][position].keys():
+			positionTotalCount += bamCodon[ref][position][codon]
+	    sys.stderr.write( "On %s : pos %s , totalD : %s\n" % ( ref , position , positionTotalCount ) )
+
+	    for codon in bamCodon[ref][position].keys():
+			refCodon = referenceCodon[ref][position]
+			if codon != refCodon :
+				positionAltCount = bamCodon[ref][position][codon]
+				sys.stderr.write( "On %s : pos %s , alt : %s\n" % ( ref , position , positionAltCount ) )
+				bg[ref].append( positionAltCount / float( positionTotalCount ) )
+				
+	    if positionAltCount > positionTotalCount :
+			sys.stderr.write( "Error : more alt allele than the total on position %s\n" % position )
+			sys.exit( 1 )
+			
+	    altCount.append( positionAltCount )
+	    refCount.append( positionTotalCount )
+	
+	
+	
+	
+	
+	if len( altCount ) == 0 :
+	    sys.stderr.write( "Error : all position excluded on ref %s\n" % ref )
+	    sys.exit( 0 )
+	    
+	if len( refCount ) == 0 :
+	    sys.stderr.write( "Error : all position excluded on ref %s\n" % ref )
+	    sys.exit( 0 )  
+	
+	bgEff[ref].append( sum( altCount ) / len( altCount ) )
+	bgEff[ref].append( sum( refCount ) / len( refCount ) )
+	sys.stderr.write( "%s : %s , %s\n" % ( ref , bgEff[ref][0] , bgEff[ref][1] ) )
+	    
+    # now redo a loop to find mutations
+    for ref in bamCodon.keys():
+		for position in bamCodon[ref].keys():
+			positionAltCount = 0
+			positionTotalCount = 0
+			if position in positionToDiscard:
+				continue
+
+			for codon in bamCodon[ref][position].keys():
+				positionTotalCount += bamCodon[ref][position][codon]
+			
+			
+			for codon in bamCodon[ref][position].keys():
+				refCodon = referenceCodon[ref][position]
+				if codon != refCodon :
+					positionAltCount = bamCodon[ref][position][codon]
+					# chisquare test to determine if position is significantly different from ref
+					t,p = stats.chisquare( positionAltCount / float( positionTotalCount ) , bg[ref] )
+					#~ sys.stderr.write( "Testing bg : %s , %s , %s, %s\n" % ( positionAltCount , positionTotalCount , ",".join( bg[ref] ) , p )  )
+					if p < 0.01: 
+						positionToDiscard.append( position )
+						b += 1
+						a += 1
+
+    sys.stderr.write( "End of turn %s with %s new mutation found\n" % ( turn , b ) ) 
+    turn += 1
+    
+
+
+# background noise is estimated with a control sample.
+sys.stderr.write( "Calling variants ... \n" )
+
+nbMutTotal = 0
+resultTable = {}
+
+codonFileResult = open( sample + "/" + sample + "." + codonResultFile , "w" )
+codonFileResult.write( "#Reference\treferenceCodon\treferenceAA\tposition\tsequencedCodon\tsequencedAA\tcounts\tallelicratio\t\tctrlCounts\tctrlRatio\n" )
+
+mutationFileResult = open( sample + "/" + sample + ".mutation.temp.tsv" , "w" )
+
+
+
+testToPerform = []
+
+for ref in bamCodon.keys():
+    
+    resultTable[ref] = {}    
+    totalOnPos = 0
+    sys.stderr.write( "ref : %s\n" % ref )
+    for position in bamCodon[ref].keys():
+		#~ logging.info( "ref : " + ref + " ; position : " + str( position ) )
+		refCodon = referenceCodon[ref][position]
+		refAA = codonTable[refCodon]
+		
+		totalOnPos = 0
+		totalOnPosReference = 0
+		for a in bamCodon[ref][position].keys():
+			totalOnPos += bamCodon[ref][position][a]
+		
+		if bamCodonReference[ref].has_key( position ):
+			for a in bamCodonReference[ref][position].keys():
+				totalOnPosReference += bamCodonReference[ref][position][a]
+		
+			
+		
+		for codon in bamCodon[ref][position].keys():
+			count = bamCodon[ref][position][codon]
+			obsAA = codonTable[codon]
+			
+			if bamCodonReference[ref].has_key( position ):
+				if bamCodonReference[ref][position].has_key( codon ):
+					countCtrl = bamCodonReference[ref][position][codon]
+				else:
+					countCtrl = 0
+			else:
+				countCtrl = 0
+			
+			ratioCtrl = 0
+			ratio = 0
+			if totalOnPos != 0:
+				ratio = ( count * 100 ) / float(totalOnPos)
+			
+			if totalOnPosReference != 0:
+				ratioCtrl = ( countCtrl * 100 ) / float(totalOnPosReference)
+			
+			codonFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s/%s\t%s\t%s\t%s\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , ratio , countCtrl , ratioCtrl))
+			 
+			# output mutation		
+			if obsAA != refAA and totalOnPos > 200 and count > 30  and  countCtrl < count :
+				sys.stderr.write( "Testing position : %s on %s , AA : %s (ref %s ), depth : %s , ratio : %s , " % ( position , ref , obsAA , refAA , totalOnPos , ratio  ) ) 
+				# perform statistical test  )
+				#testToPerform.append( [ ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference] )
+				oddsratio,pValue = stats.fisher_exact([[count , totalOnPos] , [countCtrl , totalOnPosReference]] )
+				sys.stderr.write( "p-value : %s \n" % pValue )
+				if pValue < 0.01:  
+					chi2, p, dof, ex = stats.chi2_contingency( [[count , totalOnPos ] , [ bgEff[ref][0] , bgEff[ref][1] ]] , correction=False)
+					# test for bg noise
+					if p < 0.01:
+						mutationFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , pValue , p ))
+					else:
+						sys.stderr.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s was different from control but in background noise\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , pValue , p ) )
+sys.stderr.write( "Done, performing statistical tests ...\n" )
+
+
+codonFileResult.close()
+mutationFileResult.close()
+
+# stats for graphics
+statsForGraph = {}
+for ref in bamCodon.keys():
+	
+	statsForGraph[ref] = {}	
+	totalOnPos = 0
+	logging.info( "ref : " + ref )
+	for position in bamCodon[ref].keys():
+		#~ logging.info( "ref : " + ref + " ; position : " + str( position ) )
+		refCodon = referenceCodon[ref][position]
+		refAA = codonTable[refCodon]
+		statsForGraph[ref][position] = {}
+		
+		
+		totalOnPos = 0
+		totalOnPosReference = 0
+		for a in bamCodon[ref][position].keys():
+			totalOnPos += bamCodon[ref][position][a]
+			
+		if bamCodonReference[ref].has_key( position ) :
+			for a in bamCodonReference[ref][position].keys():
+				totalOnPosReference += bamCodonReference[ref][position][a]
+		else:
+			totalOnPosReference = 0
+		
+		
+		for codon in bamCodon[ref][position].keys():
+			statsForGraph[ref][position][codon] = {}
+			
+			count = bamCodon[ref][position][codon]
+			obsAA = codonTable[codon]
+			
+			if bamCodonReference[ref].has_key( position ):
+				if bamCodonReference[ref][position].has_key( codon ):
+					countCtrl = bamCodonReference[ref][position][codon]
+				else:
+					countCtrl = 0
+			else:
+				countCtrl = 0
+			
+			# FET
+			oddsratio,pValue = stats.fisher_exact([[count , totalOnPos] , [countCtrl , totalOnPosReference]] )
+			statsForGraph[ref][position][codon]['FET'] = pValue
+			
+			# chisquare
+			chi2, p, dof, ex = stats.chi2_contingency( [[count , totalOnPos ] , [ bgEff[ref][0] , bgEff[ref][1] ]] , correction=False)
+			statsForGraph[ref][position][codon]['CHI'] = p
+			statsForGraph[ref][position][codon]['codonDepth'] = count
+
+
+
+
+# ~ # graphics
+# ~ for ref in statsForGraph.keys():
+    # ~ plt.figure( figsize=( 12,10 ) ) 
+    # ~ title = "Analysis for " + str( ref )
+    # ~ gName = str( ref ) + "_analysis.png" 
+    # ~ plt.title( title )
+    # ~ plt.ylabel( "-log( pValue )" )
+    # ~ plt.xlabel( "Position" )
+
+    # ~ x = []
+    # ~ y = []
+    # ~ z = []
+    # ~ ac = []
+    # ~ b = []
+    
+    # ~ for position in statsForGraph[ref].keys():
+	# ~ x.append( position )
+	# ~ totalOnPos = 0
+	# ~ for a in bamCodon[ref][position].keys():
+	    # ~ totalOnPos += bamCodon[ref][position][a]
+	# ~ ac.append( totalOnPos )
+    
+	# ~ maxPValFET = -1
+	# ~ maxPValCHI = -1
+	# ~ temp  = []
+	# ~ for codon in statsForGraph[ref][position].keys():
+	    # ~ sys.stderr.write( "fet : %s ; chi : %s \n" % ( statsForGraph[ref][position][codon]['FET'] , statsForGraph[ref][position][codon]['CHI'] ) )
+	    
+	    
+	    # ~ if statsForGraph[ref][position][codon]['FET'] == 0:
+		# ~ fetPValue = 500
+	    # ~ else:
+		# ~ fetPValue = - math.log10( statsForGraph[ref][position][codon]['FET'] )
+	    # ~ if fetPValue > maxPValFET:
+		# ~ maxPValFET = fetPValue
+	    
+	    # ~ if statsForGraph[ref][position][codon]['CHI'] == 0:
+		# ~ chiPValue = 500
+	    # ~ else:
+		# ~ chiPValue = - math.log10( statsForGraph[ref][position][codon]['CHI'] )
+
+	    # ~ if chiPValue > maxPValCHI:
+		# ~ maxPValCHI = chiPValue
+	    # ~ temp.append( statsForGraph[ref][position][codon]['codonDepth'] )
+	
+	# ~ y.append( maxPValFET )
+	# ~ z.append( maxPValCHI )
+	# ~ b.append( temp )
+	
+    # ~ plt.plot( x, y, linewidth=2.0 , color = 'r' )
+    # ~ plt.plot( x , z , linewidth=2.0 , color = 'b' )
+    # ~ plt.savefig( gName )
+
+# writing final results after annotation
+
+finalFileResult = open( outputFile , "w" )
+finalFileResult.write( "#Reference\taa Position\tReference Codon\tReference aa\tAlternative Codon\tAlternative aa\tDepth\tAlternative Allele Ratio\tFET ctrl p-value\tFET bg pValue\tAnnotation\n")
+mutationFileResult = open( sample + "/" + sample + ".mutation.temp.tsv" , "r" )
+
+for line in mutationFileResult:
+    lineTable = line.strip().split( "\t" )
+    ref = lineTable[0]
+    annotation = "No annotation found"
+    
+    if annotationIsDefined :
+		index = lineTable[2] + lineTable[3] + lineTable[5]
+		if index in knownMutationTable[ref].keys():
+			annotation = knownMutationTable[ref][index]
+    
+    depth = int( lineTable[7] )
+    ratio = ( float( lineTable[6] ) *100 ) / depth 
+    
+    finalFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s pct\t%s\t%s\t%s\n" % ( ref , lineTable[3] , lineTable[1] , lineTable[2] , lineTable[4] , lineTable[5] , depth , ratio , lineTable[10], lineTable[11] ,  annotation ) ) 
+mutationFileResult.close()
+finalFileResult.close()
+
+
+# writing consensus sequence
+consFile = open( sample + "/" + sample + ".cs" , "w" )
+for ref in bamCodon.keys():
+    consFile.write( ">" + ref + "\n" )
+    for position in sorted( bamCodon[ref].keys() ) :
+		count = 0
+		retainedCodon = ""
+		for codon in bamCodon[ref][position].keys():
+			if bamCodon[ref][position][codon] > count:
+				count = bamCodon[ref][position][codon]
+				retainedCodon = codon
+		consFile.write( retainedCodon )
+    consFile.write( "\n" )
+consFile.close()
+