Commit cd11dcf8 authored by YannisDuffourd's avatar YannisDuffourd

Initial commit of the repository

parents
# ### GAD development file ###
# ## hiv_genotyping.py
# ## Version : 1.0
# ## Description :
# ## Usage : hiv_genotyping.py
# ## Output :
# ## Requirements : python 2.7+ - pySAM - scipy
#
# ## Author: yannis.duffourd@u-bourgogne.fr
# ## Creation Date: 2016-10-18
# ## last revision date: 2018-03-19
# ## Known bugs: None
# ## TODO:
#
import os
import sys
sys.path.insert( 0 , "/user1/gad/ya0902du/miniconda2/lib/python2.7/site-packages")
print sys.path
import getopt
import logging
import pysam
import logging
import math
import threading
from scipy import stats
# ~ import matplotlib.pyplot as plt
currentThread = 0
integraseFile , samFile , rtFile , proteaseFile , referenceFile = "" , "" , "" , "" , ""
depthThreshold = 200
ratioThreshold = 0
pvalueThreshold = 1.0
annotationIsDefined = False
inRunControlBam = ""
allControlBamList = ""
nbThread = 1
countStop = False
opts, args = getopt.getopt(sys.argv[1:], 's:i:r:p:R:D:l:P:o:c:b:a:n:S')
for opt, arg in opts:
if opt in ("-i"):
integraseFile = arg
elif opt in ("-s"):
samFile = arg
elif opt in ("-r"):
rtFile = arg
elif opt in ("-p"):
proteaseFile = arg
elif opt in ("-R"):
referenceFile = arg
elif opt in ("-D"):
depthThreshold = int(arg)
elif opt in ("-l"):
ratioThreshold = int(arg)
elif opt in ("-P"):
pvalueThreshold = float(arg)
elif opt in ("-o"):
outputFile = arg
elif opt in ("-c"):
codonResultFile = arg
elif opt in ("-b"):
inRunControlBam = arg
elif opt in ("-a"):
allControlBamList = arg
elif opt in ("-n"):
nbThread = int( arg )
elif opt in ("-S"):
countStop = True
sys.stderr.write('Stop codon detection activated.')
sample = samFile.split( "/" ) [-1]
sample = sample.replace( ".bam" , "" )
sample = sample.split( "." )[0]
if countStop == True :
codonResultFile = "codons.stop.tsv"
else:
codonResultFile = "codons.tsv"
sys.stderr.write('start\n')
class myThread (threading.Thread):
def __init__( self , ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , nb ) :
threading.Thread.__init__(self)
self.ref = ref
self.refCodon = refCodon
self.refAA = refAA
self.position = position
self.nb = nb
self.codon = codon
self.obsAA = obsAA
self.count = count
self.totalOnPos = totalOnPos
self.countCtrl = countCtrl
self.totalOnPosReference = totalOnPosReference
threadLock.acquire()
sys.stderr.write( "New thread number %s , launch with params : %s , %s , %s , %s , %s , %s , %s , %s , %s , %s\n" % ( nb , ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference ) )
threadLock.release()
def run( self ):
global currentThread
oddsratio,pValue = stats.fisher_exact([[self.count , self.totalOnPos] , [self.countCtrl , self.totalOnPosReference]] )
threadLock.acquire()
sys.stderr.write( "p-value : %s \n" % pValue )
if pValue < 0.01:
mutationFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( self.ref , self.refCodon , self.refAA , self.position , self.codon , self.obsAA , self.count , self.totalOnPos , self.countCtrl , self.totalOnPosReference , pValue ))
currentThread -= 1
threadLock.release()
def phredToInt( incString ):
# convert to ascii
score = ord( incString )
# substract 33 to the score
score -= 33
return score
def igf( lddl , chiStat ):
if chiStat < 0.0:
return 0.0
sc = 1.0 / lddl
sc *= chiStat**lddl
sc *= math.exp( -chiStat )
sys.stderr.write( "sc value = %s\n" % ( sc ) )
somme = 1.0
nom = 1.0
denom = 1.0
for i in range( 1 , 200):
nom *= chiStat
lddl += 1
denom *= lddl
somme += (nom / denom)
if ( somme * sc ) < 0.000000000000001:
sys.stderr.write( "igf value = %s\n" % (0.000000000000001) )
return 0.000000000000001
sys.stderr.write( "igf value = %s\n" % (somme * sc ) )
return somme * sc
def gamma( lddl ):
RECIP_E = 0.36787944117144232159552377016147
TWOPI = 6.283185307179586476925286766559
D = 1.0 / (10.0 * lddl)
D = 1.0 / ((12 * lddl) - D)
D = (D + lddl) * RECIP_E
D = D**lddl
D *= math.sqrt(TWOPI / lddl)
sys.stderr.write( "gamma = %s\n" % D )
return D
def adequationChiTwo( effectTable , localRef , localAlt ):
# compute deviation between obs & theoritical
thMean = sum( effectTable ) / len( effectTable )
obsMean = localAlt * 100 / ( float( localRef ) + float( localAlt ) )
if obsMean < thMean:
return 1
if obsMean == 100:
obsMean = 99.99999
sys.stderr.write( "Th mean : %s ; obsMean : %s\n" % ( thMean , obsMean ))
refDev = ( ( (100 - obsMean) - (100 - thMean) )**2 ) / (100 - obsMean)
altDev = ( ( obsMean - thMean )**2 ) / obsMean
ddl = 1
qSquare = refDev + altDev
alpha = 5
sys.stderr.write( "X2 statistical value = %s\n" % qSquare )
# with alpha = 5% and a ddl = 1, the confidence interval is [0;5.991]
# H0 is rejected if qSquare superior to 5.991 with a 5% error risk
# so there is asignificant difference between the position wie observed and the rest of the reference.
# need to compute a p-value
return pochisq( qSquare , 1 )
#~ pvalue = igf( ddl , qSquare )
#~ pvalue /= gamma( ddl )
#~ sys.stderr.write( "p-value = %s\n" % (1.0 - pvalue) )
#~ return 1.0 - pvalue
def pochisq(x, df) :
LOG_SQRT_PI = 0.5723649429247000870717135
I_SQRT_PI = 0.5641895835477562869480795
if x <= 0.0 or df < 1 :
return 1.0
s = 2.0 * poz( - math.sqrt( x ) )
sys.stderr.write( " p-value : %s\n" % s )
return s
def poz(z) :
sys.stderr.write( "Incoming z : %s\n" % z )
y = 0
x = 0
w = 0
Z_MAX = 6.0
if z == 0.0 :
x = 0.0
else :
y = 0.5 * abs(z)
if (y >= (Z_MAX * 0.5)):
x = 1.0
elif y < 1.0 :
w = y * y
x = ((((((((0.000124818987 * w - 0.001075204047) * w + 0.005198775019) * w - 0.019198292004) * w + 0.059054035642) * w
- 0.151968751364) * w + 0.319152932694) * w - 0.531923007300) * w + 0.797884560593) * y * 2.0
else :
y -= 2.0
x = (((((((((((((-0.000045255659 * y + 0.000152529290) * y - 0.000019538132) * y - 0.000676904986) * y + 0.001390604284) * y - 0.000794620820) * y - 0.002034254874) * y + 0.006549791214) * y - 0.010557625006) * y + 0.011630447319) * y - 0.009279453341) * y + 0.005353579108) * y - 0.002141268741) * y + 0.000535310849) * y + 0.999936657524
if z > 0.0 :
sys.stderr.write( "Output : %s\n" % ((x + 1.0) * 0.5) )
return ((x + 1.0) * 0.5)
else:
sys.stderr.write( "Output : %s\n" % ((1.0 - x) * 0.5) )
return ((1.0 - x) * 0.5)
# need the bam already parsed to be used.
def getEffectiveTable( ref , excludeList ):
localEffectifTable = []
for position in bamCodon[ref].keys():
# pass the position if new as a mutation
if position in excludeList:
continue
totalOnPos = 0
totalAltOnPos = 0
refAA = codonTable[ referenceCodon[ref][position] ]
for codon in bamCodon[ref][position].keys():
totalOnPos += bamCodon[ref][position][codon]
obsAA = codonTable[ codon ]
if obsAA != refAA:
totalAltOnPos += bamCodon[ref][position][codon]
if totalAltOnPos == 0:
localEffectifTable.append( 0 )
#~ sys.stderr.write( "Adding 0 to %s effectives for pos : %s \n" % ( ref , position ) )
else:
localEffectifTable.append( totalAltOnPos * 100 / float( totalOnPos ) )
#~ sys.stderr.write( "Adding %s to %s effectives for pos : %s \n" % ( totalAltOnPos * 100 / float( totalOnPos ) , ref , position ) )
return localEffectifTable
# compute factoriel
def fact(n):
"""fact(n): calcule la factorielle de n (entier >= 0)"""
if n<2:
return 1
else:
return n*fact(n-1)
# compute log factoriel
def logFactoriel( inc ):
ret = 0
while inc > 0:
ret += math.log( inc )
inc -= 1
return ret;
# compute log of hypergeometrique for FET
def logHypergeometricProb( a , b , c , d ):
return logFactoriel( a + b ) + logFactoriel( c + d ) + logFactoriel( a + c ) + logFactoriel( b + d )- logFactoriel( a ) - logFactoriel( b ) - logFactoriel( c ) - logFactoriel( d ) - logFactoriel( a + b + c + d )
# compute pvalue from FET
def FETPvalue( a , b , c, d ):
#sys.stderr.write( "Computing a p-value from FET ..." )
n = a + b + c + d
logpCutOff = logHypergeometricProb( a , b , c , d )
pFraction = 0
logpValue = 0
for x in range( 0 , n ):
if( ( a + b - x >= 0 ) and ( a + c - x >= 0 ) and ( d - a + x >= 0 ) ) :
l = logHypergeometricProb( x , a + b - x , a + c - x , d - a + x )
if l <= logpCutOff :
pFraction += math.exp( l - logpCutOff )
logpValue = logpCutOff + math.log( pFraction )
#sys.stderr.write( " done \n" )
return math.exp(logpValue);
# determine if a codon is a stop or not
def isStop( incCodon ):
stopList = [ "TAG" , "TAA" , "TGA" ]
if incCodon in stopList :
return True
else:
return False
# deal with the ref genome
sys.stderr.write( "Parsing reference genome ..." )
referenceSequence = {}
referenceStream = open( referenceFile , "r" )
currentChr = ""
for line in referenceStream:
# name of the sequence
if line.startswith( ">" ):
line = line.strip()
line = line.replace( ">" , "" )
currentChr = line
referenceSequence[line] = {}
else:
i = 1
for base in line:
referenceSequence[currentChr][i] = base
i += 1
referenceStream.close()
#~ # debug
#~ for i in referenceSequence.keys():
#~ sys.stderr.write( "%s : %s\n" % (i , referenceSequence[i] ))
# end of debug
# codon table.
referenceCodon = {}
for chrom in referenceSequence.keys():
referenceCodon[chrom] = {}
codon = ""
for pos in referenceSequence[chrom].keys():
codon += referenceSequence[chrom][pos]
if pos % 3 == 0 :
codon = codon.upper()
referenceCodon[chrom][((pos-3)/3)+1] = codon
codon = ""
# debug
#~ for i in referenceCodon.keys():
#~ sys.stderr.write( "%s : %s\n" % (i , referenceCodon[i] ))
# end of debug
sys.stderr.write( " done\n" )
knownMutationTable = {}
sys.stderr.write( "Parsing genotyping tables ...\n" )
if integraseFile != "":
annotationIsDefined = True
refFound = False
# parse & store genotyping files
integraseStream = open( integraseFile, "r" )
for line in integraseStream:
if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or len(line) < 10:
continue
if line.startswith( "#" ) :
temp = line.strip().split( "#" )
ref = temp[1]
knownMutationTable[ref] = {}
refFound = True
continue
if not refFound :
sys.stderr.write( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % integraseFile )
sys.exit( 1 )
lineTable = line.strip().split( "\t" )
index = lineTable[2] + lineTable[0] + lineTable[4]
#~ sys.stderr.write( "index : %s\n" % index )
if lineTable[2] == lineTable[4]:
continue
knownMutationTable[ref][index] = lineTable[5]
integraseStream.close()
sys.stderr.write( "\tINTEGRASE : OK\n" )
else:
sys.stderr.write( "\tINTEGRASE : no file provided\n" )
if rtFile != "":
annotationIsDefined = True
refFound = False
# parse & store genotyping files
reverseTranscriptaseStream = open( rtFile, "r" )
for line in reverseTranscriptaseStream:
if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or len(line) < 10:
continue
if line.startswith( "#" ) :
temp = line.strip().split( "#" )
ref = temp[1]
knownMutationTable[ref] = {}
refFound = True
continue
if not refFound :
sys.stderr.write( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % rtFile )
sys.exit( 1 )
lineTable = line.strip().split( "\t" )
index = lineTable[2] + lineTable[0] + lineTable[4]
if lineTable[2] == lineTable[4]:
continue
knownMutationTable[ref][index] = lineTable[5]
reverseTranscriptaseStream.close()
sys.stderr.write( "\tREVERSETRANSCRIPTASE : OK\n" )
else:
sys.stderr.write( "\tREVERSETRANSCRIPTASE : no file provided\n" )
if proteaseFile != "":
annotationIsDefined = True
refFound = False
# parse & store genotyping files
proteaseStream = open( proteaseFile, "r" )
for line in proteaseStream:
if line.startswith( " " ) or line.startswith( "Position" ) or line.startswith( "AA" ) or len(line) < 10:
continue
if line.startswith( "#" ) :
temp = line.strip().split( "#" )
ref = temp[1]
knownMutationTable[ref] = {}
refFound = True
continue
if not refFound :
sys.stderr.write( "Error : %s file is malformed. Please redo the analysis without it or correct the file (see doc at http://blabla.html)\n" % proteaseFile )
sys.exit( 1 )
lineTable = line.strip().split( "\t" )
index = lineTable[2] + lineTable[0] + lineTable[4]
if lineTable[2] == lineTable[4]:
continue
knownMutationTable[ref][index] = lineTable[5]
proteaseStream.close()
sys.stderr.write( "\tPROTEASE : OK\n" )
else:
sys.stderr.write( "\tPROTEASE : no file provided\n" )
sys.stderr.write( " ... done\n" )
sys.stderr.write( "Parsing bam to genotype ... " )
resultTable = {}
qualityTable = {}
resultTable["ReverseTranscriptase"] = {}
resultTable["Protease"] = {}
resultTable["Integrase"] = {}
qualityTable["ReverseTranscriptase"] = {}
qualityTable["Protease"] = {}
qualityTable["Integrase"] = {}
effectifTable = {}
###### deal with inRunControlBam
sys.stderr.write( "Parsing bam control file : %s ..." % ( inRunControlBam ) )
bamIterRef = pysam.AlignmentFile( inRunControlBam , "r" )
bamCodonReference = {}
# samStream = open( samFile , "r" )
for line in bamIterRef:
#logging.info('########\nNew read to parse : ' + str(line) )
# pass bad alignements
if (line.is_unmapped == True ) or (line.is_secondary == True ) or (line.is_supplementary == True) or (int(line.mapping_quality) < 30) :
#~ logging.info('Passing sequence : bad quality ' )
continue
# get position and ref
ref = line.reference_name
position = int( line.reference_start) + 1
sequence = line.query_sequence
quality = line.query_qualities
# non aligned reads, theoritically unreachable because tested before
if ref == "*" :
#~ logging.info('Passing sequence : not aligned : ' + ref )
continue
# manage new chr encounters
if not bamCodonReference.has_key( ref ):
logging.info('New reference found : ' + ref )
bamCodonReference[ref] = {}
#~ sys.stderr.write( "%s\n" % refTable )
# test CIGAR string : if insertion or deletion, we skip the read
cigar = line.cigarstring
if "I" in cigar or "H" in cigar or "S" in cigar or "D" in cigar:
#logging.info('Passing sequence : CIGAR is bad: ' + cigar )
continue
# manage deletion
positionOfDel = 0
lengthOfDel = 0
if "D" in cigar:
#logging.info('Passing sequence : Deletion detected : ' + cigar + " ; sequence : " + sequence )
# get position of the del in the read and its size
for elt in line.cigartuples:
if elt[0] == 2 :
lengthOfDel = elt[1]
break
if elt[0] == 0:
positionOfDel += elt[1]
# deal with 0-based bam position
#position += 1
currentPosition = position
# extract codon sequences
i = 1
# browse every codons, but need to determine if the first codon is complete or not
while i < len(sequence) :
# deal with end of sequence
if i + 3 > len( sequence ):
break
# codon is complete
if currentPosition % 3 == 0 or currentPosition == 1:
#~ # manage del
#~ if i >= positionOfDel and (i + 3) <= positionOfDel :
#~ test = 1
codon = sequence[i:i + 3]
codon = codon.upper()
#~ logging.info('Adding a codon on ' + ref + ' : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) + ' ; codon : ' + codon + '(' + str((currentPosition/3)+1 ) + ')' )
qualityCodon = quality[i:i+3]
qualityScore = 0
takeIt = 1
for base in qualityCodon:
qualityScore += int( base )
if int(base) < 30 :
takeIt = 0
if takeIt == 1:
if not bamCodonReference[ref].has_key((currentPosition/3)+1 ):
bamCodonReference[ref][(currentPosition/3)+1] = {}
if not bamCodonReference[ref][(currentPosition/3)+1].has_key( codon ):
bamCodonReference[ref][(currentPosition/3)+1][codon] = 0
bamCodonReference[ref][(currentPosition/3)+1][codon] += 1
i += 3
currentPosition += 3
else:
# codon is not complete
#logging.info('Not on a 1st base, passing to next base : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) )
i += 1
currentPosition += 1
sys.stderr.write( " done\n" )
############ deal with bam file
sys.stderr.write( "Parsing bam file : %s ..." % ( samFile ) )
bannedReadID = []
bamIter = pysam.AlignmentFile( samFile , "r" )
bamCodon = {}
#samStream = open( samFile , "r" )
for line in bamIter:
#logging.info('########\nNew read to parse : ' + str(line) )
# pass bad alignements
if (line.is_unmapped == True ) or (line.is_secondary == True ) or (line.is_supplementary == True) or (int(line.mapping_quality) < 30) :
sys.stderr.write('Passing sequence : bad quality\n' )
continue
# pass stop pairs if activated
if line.query_name in bannedReadID :
# ~ sys.stderr.write( 'Passing sequence : stop in pair \n' )
continue
# get position and ref
ref = line.reference_name
position = int( line.reference_start) + 1
sequence = line.query_sequence
quality = line.query_qualities
# non aligned reads, theoritically unreachable because tested before
if ref == "*" :
sys.stderr.write('Passing sequence : not aligned : %s \n' % ref )
continue
# manage new chr encounters
if not bamCodon.has_key( ref ):
sys.stderr.write('New reference found : %s\n' % ref )
bamCodon[ref] = {}
#~ sys.stderr.write( "%s\n" % refTable )
# test CIGAR string : if insertion or deletion, we skip the read
cigar = line.cigarstring
if "I" in cigar or "H" in cigar or "S" in cigar or "D" in cigar:
#logging.info('Passing sequence : CIGAR is bad: ' + cigar )
continue
# manage deletion
positionOfDel = 0
lengthOfDel = 0
if "D" in cigar:
#logging.info('Passing sequence : Deletion detected : ' + cigar + " ; sequence : " + sequence )
# get position of the del in the read and its size
for elt in line.cigartuples:
if elt[0] == 2 :
lengthOfDel = elt[1]
break
if elt[0] == 0:
positionOfDel += elt[1]
# deal with 0-based bam position
#position += 1
currentPosition = position
# extract codon sequences
i = 1
# browse every codons, but need to determine if the first codon is complete or not
while i < len(sequence) :
# deal with end of sequence
if i + 3 > len( sequence ):
break
# codon is complete
if currentPosition % 3 == 0 or currentPosition == 1:
#~ # manage del
#~ if i >= positionOfDel and (i + 3) <= positionOfDel :
#~ test = 1
codon = sequence[i:i + 3]
codon = codon.upper()
#~ logging.info('Adding a codon on ' + ref + ' : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) + ' ; codon : ' + codon + '(' + str((currentPosition/3)+1 ) + ')' )
qualityCodon = quality[i:i+3]
qualityScore = 0
takeIt = 1
for base in qualityCodon:
qualityScore += int( base )
if int(base) < 30 :
takeIt = 0
if takeIt == 1:
if not bamCodon[ref].has_key((currentPosition/3)+1 ):
bamCodon[ref][(currentPosition/3)+1] = {}
if not bamCodon[ref][(currentPosition/3)+1].has_key( codon ):
bamCodon[ref][(currentPosition/3)+1][codon] = 0
bamCodon[ref][(currentPosition/3)+1][codon] += 1
i += 3
currentPosition += 3
# break if stop and stop analysis required
if countStop:
if isStop( codon ):
bannedReadID.append( line.query_name )
break
else:
# codon is not complete
#logging.info('Not on a 1st base, passing to next base : pos on read : ' + str(i) + ' ; pos on ref : ' + str( currentPosition ) )
i += 1
currentPosition += 1
sys.stderr.write( " done\n" )
sys.stderr.write( "%s sequence passed truncated due to stop codon.\n" % len( bannedReadID ) )
# debug
#~ for a in bamCodon.keys():
#~ for b in bamCodon[a].keys():
#~ for c in bamCodon[a][b].keys():
#~ sys.stderr.write( "%s %s %s %s\n" % ( a , b , c , bamCodon[a][b][c] ) )
# debug end
# codon table
codonTable = { "TTT":"F" , "TTC" : "F" , "TTA" : "L" , "TTG" : "L" , "CTT":"L" , "CTC":"L" , "CTA":"L" , "CTG":"L" , "ATT" : "I" , "ATC":"I" , "ATA":"I" , "ATG":"M" , "GTT":"V" , "GTC":"V" , "GTA":"V" , "GTG":"V" , "TCT":"S" , "TCC":"S" , "TCA":"S" , "TCG":"S" , "CCT":"P" , "CCC":"P" , "CCA":"P" , "CCG":"P", "ACT":"T" , "ACC":"T" , "ACA":"T" , "ACG":"T" , "GCT":"A" , "GCC":"A" , "GCA":"A" , "GCG":"A" , "TAT":"Y", "TAC":"Y" , "TAA":"*" , "TAG":"*" , "CAT":"H" , "CAC":"H" , "CAA":"Q" , "CAG" : "Q" , "AAT":"N" , "AAC":"N" , "AAA":"K" , "AAG":"K" , "GAT":"D" , "GAC":"D" , "GAA":"E" , "GAG":"E", "TGT":"C" , "TGC":"C" , "TGA":"*" , "TGG":"W" , "CGT":"R" , "CGC":"R" , "CGA":"R" , "CGG":"R" , "AGT":"S" , "AGC":"S" , "AGA":"R" , "AGG":"R" , "GGT":"G", "GGC":"G" , "GGA":"G", "GGG":"G" }
# we need to store some stuff for further statistical tests some values
totalRefSequencedBase = {}
totalAltSequencedBase = {}
# let's modelize the background noise :
# count codons wich are not the reference
sys.stderr.write( "Counting codons ... " )
for ref in bamCodon.keys():
outStream = open( sample + "/" + sample + "." + ref + ".percent.data" , "w" )
noiseTable = {}
for position in bamCodon[ref].keys():
totalOnPos = 0
if not noiseTable.has_key( position ):
noiseTable[position] = {}
refAA = codonTable[ referenceCodon[ref][position] ]
for codon in bamCodon[ref][position].keys():
totalOnPos += bamCodon[ref][position][codon]
obsAA = codonTable[ codon ]
if obsAA != refAA:
if not noiseTable[position].has_key( obsAA ):
noiseTable[position][obsAA] = 0
noiseTable[position][obsAA] += bamCodon[ref][position][codon]
# we have all data for counting
for aa in noiseTable[position].keys():
noiseTable[position][aa] = noiseTable[position][aa] * 100 / float(totalOnPos)
listofAA = list( set( codonTable.values() ) )
outStream.write( "#position\t%s\n" % "\t".join( listofAA ) )
for position in noiseTable.keys():
outStream.write( "%s" % position )
for a in listofAA :
if noiseTable[position].has_key( a ):
outStream.write( "\t%s" % noiseTable[position][a] )
else:
outStream.write( "\t0" )
outStream.write( "\n" )
outStream.close()
sys.stderr.write( " done\n " )
# let's modelize the background noise :
# count codons wich are not the reference
sys.stderr.write( "Modelizing background noise ... " )
for ref in bamCodon.keys():
totalRefSequencedBase[ref] = 0
totalAltSequencedBase[ref] = 0
bigTotal = 0
noiseTable = {}
if not noiseTable.has_key( ref ):
noiseTable[ref] = {}
outStream = open( sample + "/" + sample + "." + ref + ".value.data" , "w" )
for position in bamCodon[ref].keys():
totalOnPos = 0
totalAltOnPos = 0
if not noiseTable[ref].has_key( position ):
noiseTable[ref][position] = {}
refAA = codonTable[ referenceCodon[ref][position] ]
for codon in bamCodon[ref][position].keys():
totalOnPos += bamCodon[ref][position][codon]
bigTotal += bamCodon[ref][position][codon]
obsAA = codonTable[ codon ]
if obsAA != refAA:
if not noiseTable[ref][position].has_key( obsAA ):
noiseTable[ref][position][obsAA] = 0
noiseTable[ref][position][obsAA] += bamCodon[ref][position][codon]
totalAltSequencedBase[ref] += bamCodon[ref][position][codon]
totalAltOnPos += bamCodon[ref][position][codon]
#~ sys.stderr.write( "Position : %s , Ref : %s , Alt : %s , Total : %s \n" % ( position , totalOnPos - totalAltOnPos , totalAltOnPos , totalOnPos ) )
totalRefSequencedBase[ref] += bigTotal - totalAltSequencedBase[ref]
listofAA = list( set( codonTable.values() ) )
outStream.write( "#position\t%s\n" % "\t".join( listofAA ) )
for position in noiseTable[ref].keys():
outStream.write( "%s" % position )
for a in listofAA :
if noiseTable[ref][position].has_key( a ):
outStream.write( "\t%s" % noiseTable[ref][position][a] )
else:
outStream.write( "\t0" )
outStream.write( "\n" )
outStream.close()
sys.stderr.write( " done\n " )
# calculate distributions
sys.stderr.write( "Computing distribution ... " )
for ref in bamCodon.keys():
sys.stderr.write( "Calculating distribution for %s \n" % ref )
incStream = open( sample + "/" + sample + "." + ref+".value.data" , "r" )
contingencyTable = {}
for line in incStream :
#sys.stderr.write( "%s" % line )
if line.startswith( "#position" ):
continue
myTable = line.split( "\t" )
for elt in myTable[1:]:
i_elt = int( elt )
if not contingencyTable.has_key( i_elt ):
contingencyTable[i_elt] = 0
contingencyTable[i_elt] += 1
totalSum = 0
for elt in contingencyTable.keys():
totalSum += contingencyTable[elt]
# calculate probabilities
for elt in contingencyTable.keys():
tmp = contingencyTable[elt]
contingencyTable[elt] = tmp / float( totalSum )
outStream = open( sample + "/" + sample + "." + ref+".value.density" , "w" )
outStream.write( "#value\tcount\n" )
for elt in contingencyTable.keys():
outStream.write( "%s\t%s\n" % ( elt , contingencyTable[elt] ) )
outStream.close()
sys.stderr.write( " done\n " )
# detect the potential background noise
a = 0
b = 1
turn = 1
positionToDiscard = []
bg = {}
while b != 0:
bg = {}
bgEff = {}
b = 0
sys.stderr.write( "BG estimation : turn %s\n" % turn )
# first : get data for chisquare test
for ref in bamCodon.keys():
altCount = []
refCount = []
if not bg.has_key( "ref" ):
bg[ref] = []
bgEff[ref] = []
for position in bamCodon[ref].keys():
positionAltCount = 0
positionTotalCount = 0
if position in positionToDiscard:
sys.stderr.write( "discarded pos : %s Passing \n" % position )
continue
for codon in bamCodon[ref][position].keys():
positionTotalCount += bamCodon[ref][position][codon]
sys.stderr.write( "On %s : pos %s , totalD : %s\n" % ( ref , position , positionTotalCount ) )
for codon in bamCodon[ref][position].keys():
refCodon = referenceCodon[ref][position]
if codon != refCodon :
positionAltCount = bamCodon[ref][position][codon]
sys.stderr.write( "On %s : pos %s , alt : %s\n" % ( ref , position , positionAltCount ) )
bg[ref].append( positionAltCount / float( positionTotalCount ) )
if positionAltCount > positionTotalCount :
sys.stderr.write( "Error : more alt allele than the total on position %s\n" % position )
sys.exit( 1 )
altCount.append( positionAltCount )
refCount.append( positionTotalCount )
if len( altCount ) == 0 :
sys.stderr.write( "Error : all position excluded on ref %s\n" % ref )
sys.exit( 0 )
if len( refCount ) == 0 :
sys.stderr.write( "Error : all position excluded on ref %s\n" % ref )
sys.exit( 0 )
bgEff[ref].append( sum( altCount ) / len( altCount ) )
bgEff[ref].append( sum( refCount ) / len( refCount ) )
sys.stderr.write( "%s : %s , %s\n" % ( ref , bgEff[ref][0] , bgEff[ref][1] ) )
# now redo a loop to find mutations
for ref in bamCodon.keys():
for position in bamCodon[ref].keys():
positionAltCount = 0
positionTotalCount = 0
if position in positionToDiscard:
continue
for codon in bamCodon[ref][position].keys():
positionTotalCount += bamCodon[ref][position][codon]
for codon in bamCodon[ref][position].keys():
refCodon = referenceCodon[ref][position]
if codon != refCodon :
positionAltCount = bamCodon[ref][position][codon]
# chisquare test to determine if position is significantly different from ref
t,p = stats.chisquare( positionAltCount / float( positionTotalCount ) , bg[ref] )
#~ sys.stderr.write( "Testing bg : %s , %s , %s, %s\n" % ( positionAltCount , positionTotalCount , ",".join( bg[ref] ) , p ) )
if p < 0.01:
positionToDiscard.append( position )
b += 1
a += 1
sys.stderr.write( "End of turn %s with %s new mutation found\n" % ( turn , b ) )
turn += 1
# background noise is estimated with a control sample.
sys.stderr.write( "Calling variants ... \n" )
nbMutTotal = 0
resultTable = {}
codonFileResult = open( sample + "/" + sample + "." + codonResultFile , "w" )
codonFileResult.write( "#Reference\treferenceCodon\treferenceAA\tposition\tsequencedCodon\tsequencedAA\tcounts\tallelicratio\t\tctrlCounts\tctrlRatio\n" )
mutationFileResult = open( sample + "/" + sample + ".mutation.temp.tsv" , "w" )
testToPerform = []
for ref in bamCodon.keys():
resultTable[ref] = {}
totalOnPos = 0
sys.stderr.write( "ref : %s\n" % ref )
for position in bamCodon[ref].keys():
#~ logging.info( "ref : " + ref + " ; position : " + str( position ) )
refCodon = referenceCodon[ref][position]
refAA = codonTable[refCodon]
totalOnPos = 0
totalOnPosReference = 0
for a in bamCodon[ref][position].keys():
totalOnPos += bamCodon[ref][position][a]
if bamCodonReference[ref].has_key( position ):
for a in bamCodonReference[ref][position].keys():
totalOnPosReference += bamCodonReference[ref][position][a]
for codon in bamCodon[ref][position].keys():
count = bamCodon[ref][position][codon]
obsAA = codonTable[codon]
if bamCodonReference[ref].has_key( position ):
if bamCodonReference[ref][position].has_key( codon ):
countCtrl = bamCodonReference[ref][position][codon]
else:
countCtrl = 0
else:
countCtrl = 0
ratioCtrl = 0
ratio = 0
if totalOnPos != 0:
ratio = ( count * 100 ) / float(totalOnPos)
if totalOnPosReference != 0:
ratioCtrl = ( countCtrl * 100 ) / float(totalOnPosReference)
codonFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s/%s\t%s\t%s\t%s\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , ratio , countCtrl , ratioCtrl))
# output mutation
if obsAA != refAA and totalOnPos > 200 and count > 30 and countCtrl < count :
sys.stderr.write( "Testing position : %s on %s , AA : %s (ref %s ), depth : %s , ratio : %s , " % ( position , ref , obsAA , refAA , totalOnPos , ratio ) )
# perform statistical test )
#testToPerform.append( [ ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference] )
oddsratio,pValue = stats.fisher_exact([[count , totalOnPos] , [countCtrl , totalOnPosReference]] )
sys.stderr.write( "p-value : %s \n" % pValue )
if pValue < 0.01:
chi2, p, dof, ex = stats.chi2_contingency( [[count , totalOnPos ] , [ bgEff[ref][0] , bgEff[ref][1] ]] , correction=False)
# test for bg noise
if p < 0.01:
mutationFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , pValue , p ))
else:
sys.stderr.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s was different from control but in background noise\n" % ( ref , refCodon , refAA , position , codon , obsAA , count , totalOnPos , countCtrl , totalOnPosReference , pValue , p ) )
sys.stderr.write( "Done, performing statistical tests ...\n" )
codonFileResult.close()
mutationFileResult.close()
# stats for graphics
statsForGraph = {}
for ref in bamCodon.keys():
statsForGraph[ref] = {}
totalOnPos = 0
logging.info( "ref : " + ref )
for position in bamCodon[ref].keys():
#~ logging.info( "ref : " + ref + " ; position : " + str( position ) )
refCodon = referenceCodon[ref][position]
refAA = codonTable[refCodon]
statsForGraph[ref][position] = {}
totalOnPos = 0
totalOnPosReference = 0
for a in bamCodon[ref][position].keys():
totalOnPos += bamCodon[ref][position][a]
if bamCodonReference[ref].has_key( position ) :
for a in bamCodonReference[ref][position].keys():
totalOnPosReference += bamCodonReference[ref][position][a]
else:
totalOnPosReference = 0
for codon in bamCodon[ref][position].keys():
statsForGraph[ref][position][codon] = {}
count = bamCodon[ref][position][codon]
obsAA = codonTable[codon]
if bamCodonReference[ref].has_key( position ):
if bamCodonReference[ref][position].has_key( codon ):
countCtrl = bamCodonReference[ref][position][codon]
else:
countCtrl = 0
else:
countCtrl = 0
# FET
oddsratio,pValue = stats.fisher_exact([[count , totalOnPos] , [countCtrl , totalOnPosReference]] )
statsForGraph[ref][position][codon]['FET'] = pValue
# chisquare
chi2, p, dof, ex = stats.chi2_contingency( [[count , totalOnPos ] , [ bgEff[ref][0] , bgEff[ref][1] ]] , correction=False)
statsForGraph[ref][position][codon]['CHI'] = p
statsForGraph[ref][position][codon]['codonDepth'] = count
# ~ # graphics
# ~ for ref in statsForGraph.keys():
# ~ plt.figure( figsize=( 12,10 ) )
# ~ title = "Analysis for " + str( ref )
# ~ gName = str( ref ) + "_analysis.png"
# ~ plt.title( title )
# ~ plt.ylabel( "-log( pValue )" )
# ~ plt.xlabel( "Position" )
# ~ x = []
# ~ y = []
# ~ z = []
# ~ ac = []
# ~ b = []
# ~ for position in statsForGraph[ref].keys():
# ~ x.append( position )
# ~ totalOnPos = 0
# ~ for a in bamCodon[ref][position].keys():
# ~ totalOnPos += bamCodon[ref][position][a]
# ~ ac.append( totalOnPos )
# ~ maxPValFET = -1
# ~ maxPValCHI = -1
# ~ temp = []
# ~ for codon in statsForGraph[ref][position].keys():
# ~ sys.stderr.write( "fet : %s ; chi : %s \n" % ( statsForGraph[ref][position][codon]['FET'] , statsForGraph[ref][position][codon]['CHI'] ) )
# ~ if statsForGraph[ref][position][codon]['FET'] == 0:
# ~ fetPValue = 500
# ~ else:
# ~ fetPValue = - math.log10( statsForGraph[ref][position][codon]['FET'] )
# ~ if fetPValue > maxPValFET:
# ~ maxPValFET = fetPValue
# ~ if statsForGraph[ref][position][codon]['CHI'] == 0:
# ~ chiPValue = 500
# ~ else:
# ~ chiPValue = - math.log10( statsForGraph[ref][position][codon]['CHI'] )
# ~ if chiPValue > maxPValCHI:
# ~ maxPValCHI = chiPValue
# ~ temp.append( statsForGraph[ref][position][codon]['codonDepth'] )
# ~ y.append( maxPValFET )
# ~ z.append( maxPValCHI )
# ~ b.append( temp )
# ~ plt.plot( x, y, linewidth=2.0 , color = 'r' )
# ~ plt.plot( x , z , linewidth=2.0 , color = 'b' )
# ~ plt.savefig( gName )
# writing final results after annotation
finalFileResult = open( outputFile , "w" )
finalFileResult.write( "#Reference\taa Position\tReference Codon\tReference aa\tAlternative Codon\tAlternative aa\tDepth\tAlternative Allele Ratio\tFET ctrl p-value\tFET bg pValue\tAnnotation\n")
mutationFileResult = open( sample + "/" + sample + ".mutation.temp.tsv" , "r" )
for line in mutationFileResult:
lineTable = line.strip().split( "\t" )
ref = lineTable[0]
annotation = "No annotation found"
if annotationIsDefined :
index = lineTable[2] + lineTable[3] + lineTable[5]
if index in knownMutationTable[ref].keys():
annotation = knownMutationTable[ref][index]
depth = int( lineTable[7] )
ratio = ( float( lineTable[6] ) *100 ) / depth
finalFileResult.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s pct\t%s\t%s\t%s\n" % ( ref , lineTable[3] , lineTable[1] , lineTable[2] , lineTable[4] , lineTable[5] , depth , ratio , lineTable[10], lineTable[11] , annotation ) )
mutationFileResult.close()
finalFileResult.close()
# writing consensus sequence
consFile = open( sample + "/" + sample + ".cs" , "w" )
for ref in bamCodon.keys():
consFile.write( ">" + ref + "\n" )
for position in sorted( bamCodon[ref].keys() ) :
count = 0
retainedCodon = ""
for codon in bamCodon[ref][position].keys():
if bamCodon[ref][position][codon] > count:
count = bamCodon[ref][position][codon]
retainedCodon = codon
consFile.write( retainedCodon )
consFile.write( "\n" )
consFile.close()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment