Commit 08c0c724 authored by Yannis Duffourd's avatar Yannis Duffourd

release v0.3 : Adding features : read in run control optionnaly, cs generation,…

release v0.3 : Adding features : read in run control optionnaly, cs generation, variant calling by codon, new codon file format, annotation are now deactivated, backgroupnd noise is computed on sample itself avoiding issues if there is no control, hard & soft clipping management, bug correction regarding missing data in controls
parent 9a6ba918
// C++ std libs
#include <iostream>
// #include <iomanip>
#include <vector>
#include <map>
#include <string>
......@@ -13,6 +14,10 @@
// C++ Boost
#include <boost/program_options.hpp>
// htslib
#include <htslib/sam.h>
// utils
#include "utils.h"
......@@ -21,6 +26,7 @@
using namespace std;
Vihgo::Vihgo()
{
//
......@@ -44,16 +50,88 @@ Vihgo::Vihgo( string iBF , string gF, string rtF , string intF, string protF, st
this->pvalueThreshold = pvT;
this->countStop = countStop;
this->codonTable.insert( pair<string, string>("TTT","F") );
this->codonTable.insert( pair<string, string>("TTC","F") );
this->codonTable.insert( pair<string, string>("TTA","L") );
this->codonTable.insert( pair<string, string>("TTG","L") );
this->codonTable.insert( pair<string, string>("CTT","L") );
this->codonTable.insert( pair<string, string>("CTC","L") );
this->codonTable.insert( pair<string, string>("CTA","L") );
this->codonTable.insert( pair<string, string>("CTG","L") );
this->codonTable.insert( pair<string, string>("ATT","I") );
this->codonTable.insert( pair<string, string>("ATC","I") );
this->codonTable.insert( pair<string, string>("ATA","I") );
this->codonTable.insert( pair<string, string>("ATG","M") );
this->codonTable.insert( pair<string, string>("GTT","V") );
this->codonTable.insert( pair<string, string>("GTC","V") );
this->codonTable.insert( pair<string, string>("GTA","V") );
this->codonTable.insert( pair<string, string>("GTG","V") );
this->codonTable.insert( pair<string, string>("TCT","S") );
this->codonTable.insert( pair<string, string>("TCC","S") );
this->codonTable.insert( pair<string, string>("TCA","S") );
this->codonTable.insert( pair<string, string>("TCG","S") );
this->codonTable.insert( pair<string, string>("CCT","P") );
this->codonTable.insert( pair<string, string>("CCC","P") );
this->codonTable.insert( pair<string, string>("CCA","P") );
this->codonTable.insert( pair<string, string>("CCG","P") );
this->codonTable.insert( pair<string, string>("ACT","T") );
this->codonTable.insert( pair<string, string>("ACC","T") );
this->codonTable.insert( pair<string, string>("ACA","T") );
this->codonTable.insert( pair<string, string>("ACG","T") );
this->codonTable.insert( pair<string, string>("GCT","A") );
this->codonTable.insert( pair<string, string>("GCC","A") );
this->codonTable.insert( pair<string, string>("GCA","A") );
this->codonTable.insert( pair<string, string>("GCG","A") );
this->codonTable.insert( pair<string, string>("TAT","Y") );
this->codonTable.insert( pair<string, string>("TAC","Y") );
this->codonTable.insert( pair<string, string>("TAA","*") );
this->codonTable.insert( pair<string, string>("TAG","*") );
this->codonTable.insert( pair<string, string>("CAT","H") );
this->codonTable.insert( pair<string, string>("CAC","H") );
this->codonTable.insert( pair<string, string>("CAA","Q") );
this->codonTable.insert( pair<string, string>("CAG","Q") );
this->codonTable.insert( pair<string, string>("AAT","N") );
this->codonTable.insert( pair<string, string>("AAC","N") );
this->codonTable.insert( pair<string, string>("AAA","K") );
this->codonTable.insert( pair<string, string>("AAG","K") );
this->codonTable.insert( pair<string, string>("GAT","D") );
this->codonTable.insert( pair<string, string>("GAC","D") );
this->codonTable.insert( pair<string, string>("GAA","E") );
this->codonTable.insert( pair<string, string>("GAG","E") );
this->codonTable.insert( pair<string, string>("TGT","C") );
this->codonTable.insert( pair<string, string>("TGC","C") );
this->codonTable.insert( pair<string, string>("TGA","*") );
this->codonTable.insert( pair<string, string>("TGG","W") );
this->codonTable.insert( pair<string, string>("CGT","R") );
this->codonTable.insert( pair<string, string>("CGC","R") );
this->codonTable.insert( pair<string, string>("CGA","R") );
this->codonTable.insert( pair<string, string>("CGG","R") );
this->codonTable.insert( pair<string, string>("AGT","S") );
this->codonTable.insert( pair<string, string>("AGC","S") );
this->codonTable.insert( pair<string, string>("AGA","R") );
this->codonTable.insert( pair<string, string>("AGG","R") );
this->codonTable.insert( pair<string, string>("GGT","G") );
this->codonTable.insert( pair<string, string>("GGC","G") );
this->codonTable.insert( pair<string, string>("GGA","G") );
this->codonTable.insert( pair<string, string>("GGG","G") );
}
int Vihgo::mainLoop()
{
this->getGenome();
//this->displayGenome();
// this->displayGenome();
this->getGenes();
this->displayGenes();
this->getAnnotation();
//this->displayGenes();
// this->getAnnotation();
//this->displayCodons();
// this->displayReferenceCodons();
this->readRunControl();
this->readSample();
// this->displayData( "control" );
// this->displayData( "sample" );
this->variantCalling();
return 0;
......@@ -65,7 +143,7 @@ int Vihgo::getGenome( )
string ligne;
char mot;
string header = ">";
int i = 0;
int i = 1;
string currentChr;
// fasta parsing
......@@ -195,11 +273,11 @@ int Vihgo::getGenes()
// test if gene is already known
if (this->genes.count( subMot ) == 0 )
{
cerr << "Creating new gene : " << subMot << endl;
// cerr << "Creating new gene : " << subMot << endl;
map<string , string> tmpMap;
map<int, string> tmpMapT;
this->genes[subMot] = tmpMap;
this->referenceCodon[subMot] = tmpMapT;
// this->referenceCodon[subMot] = tmpMapT;
this->genes[subMot]["start"] = "";
this->genes[subMot]["end"] = "";
this->genes[subMot]["strand"] = "";
......@@ -229,23 +307,38 @@ int Vihgo::getGenes()
}
}
this->displayGenes();
map< string , map< string , string > >::iterator myIterC;
for( myIterC = genes.begin() ; myIterC != genes.end() ; myIterC ++ )
{
string currentGene = myIterC->first;
cerr << "getting codons for gene " << currentGene << endl;
// cerr << "getting codons for gene " << currentGene << endl;
chromosome =myIterC->second["chrom"];
chromosome = myIterC->second["chrom"];
vector<string> starts = parseOnSep( myIterC->second["start"] , "," );
vector<string> ends = parseOnSep( myIterC->second["end"] , "," );
vector<string> strands = parseOnSep( myIterC->second["strand"] , "," );
// create gene references in prot relative tables
map< int , string > tmpMap;
map< int , int > tmpMapSecond;
this->protCodon[currentGene] = tmpMap;
this->refProtCorresp[currentGene] = tmpMapSecond;
int protPos = 1;
int geneSubPart = 0;
for( geneSubPart = 0 ; geneSubPart < starts.size() ; geneSubPart ++ )
{
map< int , char > posList;
posList = this->referenceSequence[chromosome];
// sort position in the map
map< int , char>::iterator myIterE;
vector<int> pos;
......@@ -257,11 +350,11 @@ int Vihgo::getGenes()
if ( strands[geneSubPart] == "+" )
{
cerr << "\tStrand is + " << endl;
// cerr << "\tStrand is + " << endl;
}
else
{
cerr << "\tStrand is _ " << endl;
// cerr << "\tStrand is - " << endl;
reverse(pos.begin(),pos.end());
}
......@@ -285,38 +378,28 @@ int Vihgo::getGenes()
map <int, string > tmpMap;
this->referenceCodon[chromosome] = tmpMap;
}
// fill objects
// reference genomic codon
this->referenceCodon[chromosome][*myIterF - 2] = codon;
cerr << "\tAdded " << codon << " at position " << *myIterF - 2 << endl;
// protein aa
this->protCodon[currentGene][protPos] = this->codonTable[ codon ];
// corresp between genomic and proteic numerotation
this->refProtCorresp[currentGene][*myIterF - 2] = protPos;
// logs for test purposes
// cerr << "\tAdded " << codon << " : " << this->codonTable[ codon ] << " at g. pos " << *myIterF - 2 << " ; p. pos " << protPos << " in gene " << currentGene << endl;
codon = "";
protPos ++;
}
}
}
}
}
// convertion to protein aa
map< string , map< int , string > >::iterator myIterG;
for( myIterG = this->referenceCodon.begin() ; myIterG != this->referenceCodon.end() ; myIterG ++)
{
map< int , string > tmpMap;
this->protCodon[myIterG->first] = tmpMap;
// sort position in the map
map< int , string>::iterator myIterE;
vector<int> pos;
for( myIterE = (myIterG->second).begin() ; myIterE != (myIterG->second).end() ; myIterE ++ )
{
pos.push_back( myIterE->first );
}
sort( pos.begin(), pos.begin()+pos.size() );
vector< int >::iterator myIterH;
int i = 1;
for( myIterH = pos.begin(); myIterH != pos.end() ; myIterH ++)
{
this->referenceCodon[myIterG->first][*myIterH] = i;
i++;
}
}
return 0;
}
......@@ -330,10 +413,89 @@ void Vihgo::displayGenes()
map< string , string> tmpMap;
tmpMap = myIterB->second;
cerr << "chr : " << tmpMap["chrom"] << " ; start : " << tmpMap["start"] << " ; end : " << tmpMap["end"] << " ; strand : " << tmpMap["strand"] << endl;
cerr << "\tchr : " << tmpMap["chrom"] << " ; start : " << tmpMap["start"] << " ; end : " << tmpMap["end"] << " ; strand : " << tmpMap["strand"] << endl;
}
}
void Vihgo::displayCodons()
{
cerr << "Displaying prot codon list : " << endl;
//std::map<std::string, std::map< int , std::string> > protCodon;
map< string, map< int , string> >::iterator myIterA;
for( myIterA = this->protCodon.begin() ;myIterA != this->protCodon.end() ; myIterA ++ )
{
cerr << "Gene : " << myIterA->first << " : " << endl;
map< int , string> tmpMap;
tmpMap = myIterA->second;
map<int , string>::iterator myIterB;
for(myIterB = tmpMap.begin() ; myIterB != tmpMap.end() ; myIterB ++)
{
cerr << "\tPos : " << myIterB->first << " ; AA : " << myIterB->second << endl;
}
}
}
void Vihgo::displayReferenceCodons()
{
// std::map<std::string, std::map< int , std::string> > referenceCodon;
map<string , map<int , string> >::iterator myIterA;
map<int , string>::iterator myIterB;
cerr << "Reference codons : " << endl;
for (myIterA = this->referenceCodon.begin() ; myIterA != this->referenceCodon.end() ; myIterA ++ )
{
cerr << "\t" << myIterA->first << " : " << endl;
for( myIterB = (myIterA->second).begin() ; myIterB != (myIterA->second).end() ; myIterB ++)
{
cerr << "\t\t" << myIterB->first << " : " << myIterB->second << endl;
}
}
}
void Vihgo::displayData( string incType )
{
map<string , map<int , map< string , int> > >::iterator myIterA;
map<int , map< string , int> >::iterator myIterB;
map< string , int>::iterator myIterC;
if ( incType == "control" )
{
for( myIterA = this->controlData.begin() ; myIterA != this->controlData.end() ; myIterA ++ )
{
cerr << myIterA->first << " : " << endl;
for( myIterB = (myIterA->second).begin() ; myIterB != (myIterA->second).end() ; myIterB ++ )
{
cerr << "\t" << myIterB->first << " : " << endl;
for( myIterC = (myIterB->second).begin() ; myIterC != (myIterB->second).end() ; myIterC ++ )
{
cerr << "\t\t" << myIterC->first << " : " << myIterC->second << endl;
}
}
}
}
if ( incType == "sample" )
{
for( myIterA = this->sampleData.begin() ; myIterA != this->sampleData.end() ; myIterA ++ )
{
cerr << myIterA->first << " : " << endl;
for( myIterB = (myIterA->second).begin() ; myIterB != (myIterA->second).end() ; myIterB ++ )
{
cerr << "\t" << myIterB->first << " : " << endl;
for( myIterC = (myIterB->second).begin() ; myIterC != (myIterB->second).end() ; myIterC ++ )
{
cerr << "\t\t" << myIterC->first << " : " << myIterC->second << endl;
}
}
}
}
}
int Vihgo::getAnnotation()
{
vector<string> fileToParse;
......@@ -456,19 +618,609 @@ bool Vihgo::isAnnotationDefined()
return false;
}
// parse the control bam file to get informations on background noise of the experiment
int Vihgo::readRunControl()
{
long int nbLineRead = 0;
int readPassedAln = 0;
int readPassedSecondary = 0;
int readPassedSupplementary = 0;
int readPassedMQ = 0;
int readPassedCIGAROp = 0;
cerr << "Reading control BAM file..." << endl;
bam1_t *aln = bam_init1();
samFile *fp_in = hts_open( (this->getControlBamFile()).c_str() , "r" );
bam_hdr_t *bamHdr = sam_hdr_read( fp_in);
while(sam_read1(fp_in , bamHdr , aln) > 0)
{
nbLineRead ++;
if ((nbLineRead != 0 ) && ((nbLineRead % 200000) == 0) )
{
cerr << "\t" << nbLineRead << " aln read ; \n\t\tSkipped : " << readPassedAln << " unaligned ; " << readPassedSecondary << " secondary alignements ; " << readPassedSupplementary << " supplementary alignements ; " << readPassedMQ << " bad aln quality ; " << readPassedCIGAROp << " special CIGAR operation " << endl;
}
// QC check
// if ( )
// {
//
// }
// skip alignement if is unmmapped
if ( (aln->core.flag & BAM_FUNMAP) == BAM_FUNMAP )
{
// cerr << "\tSkipping read : unaligned" << endl;
readPassedAln ++;
continue;
}
// skip alignement if secondary alignement
if ( (aln->core.flag & BAM_FSECONDARY) == BAM_FSECONDARY)
{
// cerr << "\tSkipping read : secondary" << endl;
readPassedSecondary ++;
continue;
}
// skip alignement if supplmentary alignement
if ( (aln->core.flag & BAM_FSUPPLEMENTARY) == BAM_FSUPPLEMENTARY)
{
// cerr << "\tSkipping read : supplementary" << endl;
readPassedSupplementary ++;
continue;
}
// Skip alignment if quality score is less than 30
if (aln->core.qual < 30)
{
// cerr << "\tSkipping read : bad alignement quality" << endl;
readPassedMQ ++;
continue;
}
// get pos
int32_t pos = aln->core.pos + 1;
// get chr and its name
string chr = bamHdr->target_name[aln->core.tid];
// Deal with unknown chromosome
if (!( this->isInGeneList( chr ) ))
{
cerr << "Read is mapped on a reference not known. Please verify the reference provided." << endl;
continue;
}
// Check for each gene if the position is in the gene
// vector<string> g = parseOnSep( isInGene( chr , pos ) , ",");
// vector<string>::iterator myIterA;
// for ( myIterA = g.begin() ; myIterA != g.end() ; myIterA ++ )
// {
// cerr << "Read is in a known gene : " << *myIterA << endl;
// }
// cigar string event detection ; need to be improved to deal with insertion / deletion
// pass the read if cigar op is in : Del, Ins, CLIP, SKIP, PADD, BACK
bool readNeedToBeSkipped = false;
for (int k = 0; k < aln->core.n_cigar; k++)
{
const int op = bam_cigar_op( bam_get_cigar(aln)[k]);
const int ol = bam_cigar_oplen( bam_get_cigar(aln)[k]);
if ( op == BAM_CREF_SKIP || op == BAM_CINS || op == BAM_CDEL || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD || op == BAM_CBACK )
{
readNeedToBeSkipped = true;
}
}
if (readNeedToBeSkipped )
{
// cerr << "\tSkipping read : CIGAR OP issue" << endl;
readPassedCIGAROp ++;
continue;
}
const uint8_t *sequence = bam_get_seq(aln);
const uint8_t *quality = bam_get_qual(aln);
// we also need to check if the quality string (and if the sequence ha non null length) has the same length than the sequence itself, otherwise the quality check should be skipped
// query name
// cerr << "\tRead : " << bam_get_qname( aln ) << " : " << chr << ":" << pos << endl;
for( int i = 0 ; i < aln->core.l_qseq ; i ++ )
{
// check if we are at a codon position
int currentGenomicPosition = pos + i;
string codon;
if( (referenceCodon[chr]).count(currentGenomicPosition) > 0 )
{
// cerr << "\t\tcurrent pos in read : " << currentGenomicPosition << endl;
for ( int x = 0 ; x < 3 ; x++)
{
int a = bam_seqi( sequence , i+x );
int q = bam_seqi( quality , i+x );
char base;
switch( a )
{
case 1 :
base = 'A';
break;
case 2 :
base = 'C';
break;
case 4 :
base = 'G';
break;
case 8 :
base = 'T';
break;
case 15 :
base = 'N';
break;
}
// if( q < 0 )
// {
// cerr << "q" << q << endl;
// }
codon += base;
}
// cerr << "\t\t" << codon << endl;
i += 2;
// insert the codon in the data
if( !( this->controlData.count( chr ) > 0) )
{
map<int , map< string , int> > tmpMap;
this->controlData[chr] = tmpMap;
}
if( !(this->controlData[chr].count( currentGenomicPosition ) > 0 ) )
{
map< string , int> tmpMap;
this->controlData[chr][currentGenomicPosition] = tmpMap;
}
if( !(this->controlData[chr][currentGenomicPosition].count( codon ) > 0 ) )
{
this->controlData[chr][currentGenomicPosition][codon] = 0;
}
this->controlData[chr][currentGenomicPosition][codon] ++;
}
else
{
// cerr << "\t\tNot in referenceCodon table : skipping" << endl;
continue;
}
}
}
// destruct objects
bam_hdr_destroy( bamHdr );
sam_close( fp_in );
bam_destroy1( aln );
return 0;
}
int Vihgo::variantCalling()
{
// estimate BG noise on control data
// we need to eliminate "real" variants by testing their vaf to all the sample VAF
// If a variant is detected, a new turn begin while excluding detected variation.
// First compute all VAF
map<string , map<int , map< string , int> > >::iterator myIterA;
map<int , map< string , int> >::iterator myIterB;
map< string , int>::iterator myIterC;
map<string , map<int , vector<double> > > VAFControlTable;
// map<string , map<int , vector<double> > > VAFControlTable;
for( myIterA = this->controlData.begin() ; myIterA != this->controlData.end() ; myIterA ++ )
{
// cerr << myIterA->first << " : " << endl;
for( myIterB = (myIterA->second).begin() ; myIterB != (myIterA->second).end() ; myIterB ++ )
{
// cerr << "\t" << myIterB->first << " : " << endl;
double totalDepth = 0.0;
double altNumber = 0.0;
string refCodon = this->referenceCodon[myIterA->first][myIterB->first];
string aa = codonTable[refCodon];
cerr << "Analysing codon " << myIterA->first << ":" << myIterB->first << " ; Ref is : " << refCodon << " : " << aa << endl;
for( myIterC = (myIterB->second).begin() ; myIterC != (myIterB->second).end() ; myIterC ++ )
{
string currentCodon = myIterC->first;
string currentAa = codonTable[currentCodon];
cerr << "\tin control : " << currentCodon << " : " << currentAa << " : " << myIterC->second << endl;
if (!( currentAa == aa ))
{
altNumber += myIterC->second;
}
totalDepth += myIterC->second;
}
cerr << "\t\tRepresenting in total : " << altNumber << "/" << totalDepth << endl;
if (!(VAFControlTable.count( myIterA->first ) > 0 ) )
{
map<int , vector<double> > tmpMap;
VAFControlTable[myIterA->first] = tmpMap;
}
double x = (altNumber / totalDepth);
vector<double> tmpVect;
tmpVect.push_back( altNumber );
tmpVect.push_back( totalDepth );
// VAFControlTable[myIterA->first][myIterB->first] = x * 100;
VAFControlTable[myIterA->first][myIterB->first] = tmpVect;
// cerr << myIterA->first << " : " << myIterB->first << " : " << VAFControlTable[myIterA->first][myIterB->first] << endl;
}
}
// looping while we find a variation
// a variation is a position where the ratio is different from the whole BG noise from the control sample
int numberOfVariation = 0;
// we'll store variation position in this map and exclude the position found in it at each turn
map<string , int> positionToExclude;
while (numberOfVariation > 0)
{
vector<double> t;
t.push_back( 0.0 );
t.push_back( 0.0 );
vector<double> a;
a.push_back( 0.0 );
a.push_back( 0.0 );
map<string , map<int , vector<double> > >::iterator myIterD;
map<int , vector<double> >::iterator myIterE;
// compute a
for ( myIterD = VAFControlTable.begin(); myIterD != VAFControlTable.end(); myIterD ++)
{
for ( myIterE = (myIterD->second).begin() ; myIterE != (myIterD->second).end() ; myIterE++)
{
a[0] += VAFControlTable[myIterD->first][myIterE->first][0];
a[1] += VAFControlTable[myIterD->first][myIterE->first][1];
}
}
// test each position
for ( myIterD = VAFControlTable.begin(); myIterD != VAFControlTable.end(); myIterD ++)
{
for ( myIterE = (myIterD->second).begin() ; myIterE != (myIterD->second).end() ; myIterE++)
{
t[0] = VAFControlTable[myIterD->first][myIterE->first][0];
t[1] = VAFControlTable[myIterD->first][myIterE->first][1];
double p = chisquare( t , a );
cerr << "testing : " << t[0] << " : " << t[1] << " vs " << a[0] << " : " << a[1] << " ; Pval = " << p << endl;
}
}
}
return 0;
}
// parse the bam file to get informations on data of the experiment
int Vihgo::readSample()
{
long int nbLineRead = 0;
int readPassedAln = 0;
int readPassedSecondary = 0;
int readPassedSupplementary = 0;
int readPassedMQ = 0;
int readPassedCIGAROp = 0;
cerr << "Reading BAM file..." << endl;
bam1_t *aln = bam_init1();
samFile *fp_in = hts_open( (this->getBamFile()).c_str() , "r" );
bam_hdr_t *bamHdr = sam_hdr_read( fp_in);
while(sam_read1(fp_in , bamHdr , aln) > 0)
{
nbLineRead ++;
if ((nbLineRead != 0 ) && ((nbLineRead % 200000) == 0) )
{
cerr << "\t" << nbLineRead << " aln read ; \n\t\tSkipped : " << readPassedAln << " unaligned ; " << readPassedSecondary << " secondary alignements ; " << readPassedSupplementary << " supplementary alignements ; " << readPassedMQ << " bad aln quality ; " << readPassedCIGAROp << " special CIGAR operation " << endl;
}
// skip alignement if is unmmapped
if ( (aln->core.flag & BAM_FUNMAP) == BAM_FUNMAP )
{
readPassedAln ++;
continue;
}
// skip alignement if secondary alignement
if ( (aln->core.flag & BAM_FSECONDARY) == BAM_FSECONDARY)
{
readPassedSecondary ++;
continue;
}
// skip alignement if supplmentary alignement
if ( (aln->core.flag & BAM_FSUPPLEMENTARY) == BAM_FSUPPLEMENTARY)
{
readPassedSupplementary ++;
continue;
}
// Skip alignment if quality score is less than 30
if (aln->core.qual < 30)
{
readPassedMQ ++;
continue;
}
// get pos
int32_t pos = aln->core.pos + 1;
// get chr and its name
string chr = bamHdr->target_name[aln->core.tid];
// Deal with unknown chromosome
if (!( this->isInGeneList( chr ) ))
{
cerr << "Read is mapped on a reference not known. Please verify the reference provided." << endl;
continue;
}
// Check for each gene if the position is in the gene
// vector<string> g = parseOnSep( isInGene( chr , pos ) , ",");
// vector<string>::iterator myIterA;
// for ( myIterA = g.begin() ; myIterA != g.end() ; myIterA ++ )
// {
// cerr << "Read is in a known gene : " << *myIterA << endl;
// }
// cigar string event detection ; need to be improved to deal with insertion / deletion
// pass the read if cigar op is in : Del, Ins, CLIP, SKIP, PADD, BACK
bool readNeedToBeSkipped = false;
for (int k = 0; k < aln->core.n_cigar; k++)
{
const int op = bam_cigar_op( bam_get_cigar(aln)[k]);
const int ol = bam_cigar_oplen( bam_get_cigar(aln)[k]);
if ( op == BAM_CREF_SKIP || op == BAM_CINS || op == BAM_CDEL || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD || op == BAM_CBACK )
{
readNeedToBeSkipped = true;
}
}
if (readNeedToBeSkipped )
{
readPassedCIGAROp ++;
continue;
}
const uint8_t *sequence = bam_get_seq(aln);
// string sequence = bam1_seq( aln );
// string quality = bam1_qual( aln );
for( int i = 0 ; i < aln->core.l_qseq ; i ++ )
{
// check if we are at a codon position
int currentGenomicPosition = pos + i;
string codon;
if( (referenceCodon[chr]).count(currentGenomicPosition) > 0 )
{
for ( int x = 0 ; x < 3 ; x++)
{
int a = bam_seqi( sequence , i+x );
char base;
switch( a )
{
case 1 :
base = 'A';
break;
case 2 :
base = 'C';
break;
case 4 :
base = 'G';
break;
case 8 :
base = 'T';
break;
case 15 :
base = 'N';
break;
}
codon += base;
}
i += 2;
// insert the codon in the data
if( !( this->sampleData.count( chr ) > 0) )
{
map<int , map< string , int> > tmpMap;
this->sampleData[chr] = tmpMap;
// cerr << "\tCreating chromosome " << chr << " in control data " << endl;
}
if( !(this->sampleData[chr].count( currentGenomicPosition ) > 0 ) )
{
map< string , int> tmpMap;
this->sampleData[chr][currentGenomicPosition] = tmpMap;
// cerr << "\tCreating Position " << currentGenomicPosition << " on chromosome " << chr << " in control data " << endl;
}
if( !(this->sampleData[chr][currentGenomicPosition].count( codon ) > 0 ) )
{
this->sampleData[chr][currentGenomicPosition][codon] = 0;
// cerr << "\tCreating codon " << codon << " on position " << currentGenomicPosition << " on chromosome " << chr << " in control data " << endl;
}
this->sampleData[chr][currentGenomicPosition][codon] ++;
// cerr << "\tAdding codon " << codon << " on position " << currentGenomicPosition << " on chromosome " << chr << " in control data " << endl;
}
else
{
continue;
}
}
}
// destruct objects
bam_hdr_destroy( bamHdr );
sam_close( fp_in );
bam_destroy1( aln );
return 0;
}
// Method to verify the existence of a gene in the reference provided
bool Vihgo::isInGeneList( string incName )
{
map<string, map<int, char> >::iterator myIterA ;
for( myIterA = this->referenceSequence.begin() ; myIterA != this->referenceSequence.end() ; myIterA ++ )
{
if ( myIterA->first == incName )
{
return true;
}
}
return false;
}
// Method to determine if a position is in a gene
// return a gene name if possible, if several genes are present : return the genes separated with comma
// or return "none" if no gene is at the position
string Vihgo::isInGene( string incChrom , int incPos )
{
// cerr << "Entering isInGene method with incoming arguments : " << incChrom << " and " << incPos << endl;
string ret = "";
bool posFound = false;
map< string , map< string , string > >::iterator myIterA;
for (myIterA = genes.begin() ; myIterA != genes.end() ; myIterA ++)
{
string allChr = genes[myIterA->first]["chrom"];
vector<string> allChrVec = parseOnSep( allChr , "," );
vector<string>::iterator myIterB;
for( myIterB = allChrVec.begin() ; myIterB != allChrVec.end() ; myIterB ++ )
{
if (*myIterB == incChrom )
{
vector<string> allStart;
allStart = parseOnSep( genes[myIterA->first]["start"] , "," );
vector<string> allEnd;
allEnd = parseOnSep( genes[myIterA->first]["end"] , "," );
vector<string>::iterator myIterC;
int i = 0;
for( myIterC = allStart.begin() ; myIterC != allStart.end() ; myIterC ++)
{
int s = string_to_int( *myIterC );
if( incPos >= s )
{
if( incPos <= string_to_int( allEnd[i] ) )
{
if( ret.length() > 0 )
{
ret.append( "," );
}
ret += myIterA->first;
}
}
i ++;
}
}
else
{
continue;
}
}
}
if( ret.length() > 0)
{
// cerr << "\tReturning " << ret << endl;
return ret ;
}
else
{
// cerr << "\tReturning none" << endl;
return "none";
}
}
// getters
vector<string> Vihgo::getAnnotationFiles()
vector<string> Vihgo::getAnnotationFiles()
{
vector<string> res;
res.push_back( this->integraseFile );
......@@ -477,5 +1229,16 @@ int Vihgo::readRunControl()
return res;
}
string Vihgo::getControlBamFile()
{
return this->inRunControlBam;
}
string Vihgo::getBamFile()
{
return this->inputBamFile;
}
// setters
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment