Commit e3c16446 authored by Yannis Duffourd's avatar Yannis Duffourd

Renaming into Galacs

parent 51d0f376
// C++ std libs
#include <iostream>
// #include <iomanip>
#include <algorithm>
#include <fstream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <sys/time.h>
#include <sys/types.h>
#include <vector>
// C++ Boost
#include <boost/program_options.hpp>
// htslib
#include <htslib/sam.h>
// utils
#include "utils.h"
// Classes
#include "Galacs.h"
using namespace std;
Galacs::Galacs() {
}
Galacs::Galacs(std::string incBamFile, std::string incReferenceFile, std::string incOutputFile, std::string incOutputFilePairs , std::string incOutputFileSplits, int incWindow, int incStep, double incZscore, int incNbthread, int incLoggingLevel) {
this->bamFile = incBamFile;
this->referenceFile = incReferenceFile;
this->outputFile = incOutputFile;
this->outputFilePairs = incOutputFilePairs;
this->outputFileSplits = incOutputFileSplits;
this->window = incWindow;
this->step = incStep;
this->zscore = incZscore;
this->nbThread = incNbthread;
this->loggingLevel = incLoggingLevel;
}
int Galacs::mainLoop() {
this->getGenome();
this->displayGenome();
this->bamParsingLauncher();
return 0;
}
int Galacs::getGenome() {
struct timeval tbegin, tend;
string ligne;
char mot;
string header = ">";
int i = 1;
string currentChr;
map<int32_t, map<int32_t, int16_t> > windows;
for (int32_t a = start ; a < end ; a += this->getStepSize()) {
map<int32_t, int16_t> tmpMap;
tmpMap[start + this->getWindowSize()] = 0;
windows[start] = tmpMap;
}
// fasta parsing
ifstream fastaStream(this->referenceFile.c_str());
if (fastaStream) {
while (getline(fastaStream, ligne)) {
if (ligne.find(header) == 0) {
ligne.replace(0, 1, "");
// Deal with headers including spaces, etc...
if (ligne.find(" ") != string::npos) {
ligne = parseOnSep(ligne, " ")[0];
}
currentChr = ligne;
map<int, char> tmpMap;
this->referenceSequence[currentChr] = tmpMap;
} else {
for (unsigned int k = 0; k < ligne.length(); k++) {
mot = ligne[k];
mot = checkBase(mot);
this->referenceSequence[currentChr][i] = mot;
i++;
}
}
}
}
return 0;
}
void Galacs::displayGenome() {
map<string, map<int, char>>::iterator myIterA;
for (myIterA = this->referenceSequence.begin();
myIterA != this->referenceSequence.end(); myIterA++) {
cerr << "Chr " << myIterA->first << " : " << endl;
map<int, char> tmpMap;
tmpMap = myIterA->second;
map<int, char>::iterator myIterB;
for (myIterB = tmpMap.begin(); myIterB != tmpMap.end(); myIterB++) {
cerr << myIterB->first << " : " << myIterB->second << " ; ";
}
cerr << endl;
}
}
// bam parsing launcher
int Galacs::bamParsingLauncher() {
BOOST_LOG_TRIVIAL(trace) << "Entering Galacs::bamParsingLauncher";
boost::asio::io_service ioService;
boost::thread_group threadpool;
boost::asio::io_service::work work(ioService);
// create the number of require threads
for (int i = 0; i <= this->getNbThread() - 1; i++) {
threadpool.create_thread(
boost::bind(&boost::asio::io_service::run, &ioService));
}
// Take care of the bam
map<string, map<int32_t,int32_t>>::iterator myIterA;
for (myIterA = (this->refDict).begin(); myIterA != (this->refDict).end(); myIterA++) {
map<int32_t,int32_t>::iterator myIterB;
myIterB = myIterA->second;
BOOST_LOG_TRIVIAL(info) << "Assigning new task for : " << myIterA->first << ":" << myIterB->first << "-" << myIterB->second;
ioService.post(boost::bind(&Galacs::readBam, this, myIterA->first, myIterB->first, myIterB->second));
}
threadpool.join_all();
BOOST_LOG_TRIVIAL(info) << "Thread are joined";
// after this you stop the processing loop, any tasks added after will not be
// executed
// ioService.stop();
// compute and display enrichment in motifs
//this->ComputeEnrichment();
BOOST_LOG_TRIVIAL(trace) << "Leaving Galacs::bamParsingLauncher";
return 0;
}
// parse the bam file to get informations on data of the experiment
int Galacs::readBam(string chr, int32_t start, int32_t end) {
BOOST_LOG_TRIVIAL(debug) << "Start of Reader for chr" << chr << " : " << start << "-" << end;
// map<int32_t, map<string, short int>> positionCount;
map<int32_t, Position *> baseDict;
map<int32_t, map<int32_t, int16_t> > windows;
for (int32_t a = start ; a < end ; a += this->getStepSize()) {
map<int32_t, int16_t> tmpMap;
tmpMap[start + this->getWindowSize()] = 0;
windows[start] = tmpMap;
}
// map<int32_t, string> localRefBamDict;
string s_currentChr;
samFile *fp = NULL;
char *c_BamFile = NULL;
char *region = NULL;
bam_hdr_t *header = NULL;
bam1_t *b = NULL;
bam1_core_t *c = NULL;
hts_idx_t *idx = NULL;
hts_itr_t *iter = NULL;
int ret, nbReads = 0, nbAlignementDup = 0, nbUnmappedReads = 0, nbBadMapReadQual = 0, nbPassingReads = 0;
int32_t lastChr, currentChr, currentStartPos, lastStartPos;
map<int32_t, string> refMapDict;
lastChr = -1;
lastStartPos = 0;
// open the bam file and verify
BOOST_LOG_TRIVIAL(debug) << "Opening BAM file " << " in thread " << this->getBamFile() << ":" << chr;
c_BamFile = (char *)(this->getBamFile()).c_str();
BOOST_LOG_TRIVIAL(debug) << "C_str bam file : " << c_BamFile;
fp = hts_open(c_BamFile, "rb");
BOOST_LOG_TRIVIAL(trace) << "Pointer to fp : " << fp;
if (fp == NULL) {
BOOST_LOG_TRIVIAL(error) << "Cannot open input file " << this->getBamFile() << " in thread : " << chr << ":" << start << "-" << end;
return 1;
}
BOOST_LOG_TRIVIAL(info) << "BAM is open in thread " << this->getBamFile() << ":" << chr << ":" << start << "-" << end;
// load index & verify
BOOST_LOG_TRIVIAL(debug) << "Opening index file in thread " << this->getBamFile() << ":" << chr ":" << start << "-" << end;
idx = sam_index_load2(fp, c_BamFile, NULL );
if (idx == NULL) {
BOOST_LOG_TRIVIAL(error) << "Cannot open bam index for input file " << this->getBamFile() << " in thread : " << << chr ":" << start << "-" << end;
return 1;
}
BOOST_LOG_TRIVIAL(info) << "Index file is open in thread " << this->getBamFile() << " in thread : " << << chr ":" << start << "-" << end;
// manage header, need to convert to a string corresponding to our reference
header = sam_hdr_read(fp);
for (int32_t i = 0; i < header->n_targets; i++) {
string chr;
stringstream ss;
ss << header->target_name[i];
ss >> chr;
refBamDict[i] = chr;
BOOST_LOG_TRIVIAL(trace) << "Ref idx " << i << " : " << header->target_name[i];
}
// perform the query
BOOST_LOG_TRIVIAL(info) << "Performing query in thread " << this->getBamFile() << " in thread : " << chr ":" << start << "-" << end;
string tmpRegion = chr + ":" + int_to_string(start) + "-" + int_to_string(end)
region = (char *)tmpRegion.c_str();
BOOST_LOG_TRIVIAL(debug) << "Region defined";
iter = sam_itr_querys(idx, header, region);
BOOST_LOG_TRIVIAL(debug) << "iter result passed";
if (iter == NULL) {
BOOST_LOG_TRIVIAL(info) << "No read found in thread " << this->getBamFile() << " in thread : " << chr ":" << start << "-" << end;
return;
}
// init reader
b = bam_init1();
BOOST_LOG_TRIVIAL(debug) << "Bam init passed";
struct timeval tbegin, tend;
gettimeofday(&tbegin, NULL);
// browse the data from the query
while ((ret = sam_itr_next(fp, iter, b)) >= 0) {
BOOST_LOG_TRIVIAL(trace) << "Reading aln in thread "
<< this->getBamFile() << ":" << chr
<< ": & ret : " << ret;
bool take_the_read = true;
nbReads++;
currentChr = b->core.tid;
currentStartPos = b->core.pos;
s_currentChr = this->refBamDict[currentChr];
BOOST_LOG_TRIVIAL(debug) << "\taln in thread "
<< this->getBamFile() << ":" << chr
<< " => " << currentChr
<< " & " << currentStartPos << " & "
<< s_currentChr;
// Check bam is sorted
if (currentStartPos < lastStartPos) {
// cerr << "current pos : " << currentStartPos << " < last pos : " <<
// lastStartPos << endl;
return 1;
}
c = &b->core;
// check if the read is not a pcr optical duplicate
if ((c->flag & BAM_FDUP) and (this->GetPCRDupStatus() == false)) {
take_the_read = false;
nbAlignementDup++;
BOOST_LOG_TRIVIAL(debug) << currentChr << ":" << currentStartPos << " = PCR Dup ";
}
// - it's aligned
if ((c->flag & BAM_FUNMAP) and (c->flag & BAM_FPROPER_PAIR)) {
take_the_read = false;
nbUnmappedReads++;
BOOST_LOG_TRIVIAL(debug) << currentChr << ":" << currentStartPos << " = unmapped ";
}
// - the mapping quality >= treshold
if (c->qual < this->GetAlnQThreshold()) {
take_the_read = false;
nbBadMapReadQual++;
BOOST_LOG_TRIVIAL(debug) << currentChr << ":" << currentStartPos + 1
<< " = MAPQ=" << int_to_string(c->qual) << " vs "
<< this->GetAlnQThreshold()
<< " : Discarding read.";
} else {
BOOST_LOG_TRIVIAL(trace) << currentChr << ":" << currentStartPos + 1 << " = MAPQ=" << int_to_string(c->qual) << " vs " << this->GetAlnQThreshold() << " : Keeping read.";
}
// treat read if ok
if (take_the_read) {
BOOST_LOG_TRIVIAL(debug) << "\taln in thread " << this->getBamFile() << ":" <<chr ":" << start << "-" << end << " : Taking the read";
map<int32_t, map<int32_t, int16_t> >::iterator iter;
map<int32_t, int16_t>::iterator iterBis;
for (iter = windows.lower_bound(b->core.pos) ; iter != windows.upper_bound(b->core.pos + b->core.l_qseq) ; iter ++) {
iterBis = iter->second;
windows[iter->first][iterBis->first] ++;
}
}
BOOST_LOG_TRIVIAL(debug) << "\taln in thread " << this->getBamFile() << " in thread : " << chr ":" << start << "-" << end << " : End of read parsing";
BOOST_LOG_TRIVIAL(trace) << "End of turn";
if ((nbReads % 200000) == 0) {
BOOST_LOG_TRIVIAL(info) << "#" << this->getBamFile() << " : " << chr
<< " : Total reads : " << nbReads << " ; Passing reads : "
<< nbPassingReads
<< " ; Unmapped reads : " << nbUnmappedReads
<< " ; PCR Duplicate reads : " << nbAlignementDup
<< " ; bad aln quality reads : " << nbBadMapReadQual
<< " in thread " << chr ":" << start << "-" << end;
gettimeofday(&tend, NULL);
BOOST_LOG_TRIVIAL(info) << ExecMeasure(tbegin, tend, "+200 000 read treated");
gettimeofday(&tbegin, NULL);
}
}
BOOST_LOG_TRIVIAL(trace) << "End of while iter";
bam_destroy1(b);
BOOST_LOG_TRIVIAL(trace) << "bam destroy passed";
BOOST_LOG_TRIVIAL(info) <<
"#" << this->getBamFile() << " : " << chr
<< " : Total reads : " << nbReads << " ; Passing reads : "
<< nbPassingReads
<< " ; Unmapped reads : " << nbUnmappedReads
<< " ; PCR Duplicate reads : " << nbAlignementDup
<< " ; bad aln quality reads : " << nbBadMapReadQual
<< " in thread " << chr ":" << start << "-" << end;
hts_itr_destroy(iter);
bam_hdr_destroy(header);
// Merging this thread data to the global container
// probably need a mutex
BOOST_LOG_TRIVIAL(info) << "Merging thread data in final data in thread " << chr ":" << start << "-" << end;
map<int32_t, map<int32_t, int32_t> >::iterator myIterA;
for (myIterA = windows.begin(); myIterA != windows.end(); myIterA++) {
map<int32_t, int16_t>::iterator myIterB;
myIterB = myIterA->second;
rawCounts[chr][myIterA->first][myIterB->first] = myIterB->second;
}
// delete fp;
// delete c_BamFile;
// delete region;
// delete header;
// delete iter;
BOOST_LOG_TRIVIAL(debug) << "End of Reader for indiv in thread " << chr ":" << start << "-" << end;
return 0;
}
// cnvCaller.cu
// Version : 1.1.0
// Description : C++/CUDA kernels to compute information needed by cnvcallerGPU
// Author: Theo.Serralta@u-bourgogne.fr ; yannis.duffourd@u-bourgogne.fr ; anthony.auclair@u-bourgogne.fr
#include <cuda_runtime.h>
// Kernel to compute raw average depth
__global__ void computeDepthKernel(int *depth_data, int seq_length, int window_size, int step_size, float *depth_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) {
int pos_start = (idx * step_size);
int pos_end = pos_start + window_size;
int count_reads = 0;
for (int i = pos_start; i < pos_end; i++) {
count_reads += depth_data[i];
}
depth_results[idx] = static_cast<float>(count_reads) / window_size;
}
}
// NEED TO OPTIMIZE WITH 3RD DEGREE POLYNOME
// Kernel to normalize depth using GC content and mappability
__global__ void normalizeDepthKernel(float *depth_correction, float *gc_results, float *map_results,
float mean_depth, float *gc_to_median, int seq_length,
int window_size, int step_size, float *normalized_depth) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) {
float gc_value = gc_results[idx];
float map_value = map_results[idx];
float depth_value = depth_correction[idx];
// Récupérer la médiane du GC content
float gc_median = gc_to_median[(int)gc_value];
// Vérifier les valeurs pour éviter des erreurs (division par zéro)
if (gc_median != 0.0f && map_value != 0.0f) {
normalized_depth[idx] = (depth_value / map_value) * (mean_depth / gc_median);
} else {
normalized_depth[idx] = 0.0f;
}
}
}
// Kernel to compute GC content in a sliding window
__global__ void computeGCKernel(char *seq_data, int seq_length, int window_size, int step_size, float *gc_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) {
int pos_start = (idx * step_size);
int pos_end = pos_start + window_size;
int gc_count = 0;
for (int i = pos_start; i < pos_end; ++i) {
char base = seq_data[i];
if (base == 'G' || base == 'C' || base == 'g' || base == 'c') {
gc_count++;
}
}
gc_results[idx] = (static_cast<float>(gc_count) / window_size) * 100.0f;
}
}
// Kernel to compute weighted average mappability
__global__ void computeMappabilityKernel(float *map_data, int seq_length, int window_size, int step_size, float *map_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length - window_size + 1) {
int pos_start = (idx * step_size);
int pos_end = pos_start + window_size;
float weighted_sum = 0.0f;
float total_weight = 0.0f;
for (int i = pos_start; i < pos_end; ++i) {
float weight = map_data[i];
weighted_sum += weight;
total_weight += 1.0f;
}
map_results[idx] = (total_weight > 0.0f) ? (weighted_sum / total_weight) : 0.0f;
}
}
// Kernel to compute Z-score
__global__ void computeZScoreKernel(float *normalized_depth, float mean_ratio, float std_ratio, int seq_length, float *z_score_results) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < seq_length) {
float value = normalized_depth[idx];
if (isinf(value) || isnan(value)) {
value = 0.0f;
}
z_score_results[idx] = (value - mean_ratio) / std_ratio;
}
}
\ No newline at end of file
#ifndef _CNVCALLERGPU_H
#define _CNVCALLERGPU_H
#include <vector>
#include <map>
#include <string>
#include <htslib/sam.h>
class cnvCallerGPU
{
public:
cnvCallerGPU();
cnvCallerGPU(std::string, std::string, std::string, std::string, std::string, int, int, double, int, int);
int mainLoop();
int getNbThread();
int getWindowSize();
int getStepSize();
double getZscore();
std::string getOutputFilePairs();
std::string getOutputFileSplit();
std::string getOutputFile();
std::string getReferenceFile();
std::string getBamFile();
private :
std::string sampleName;
std::string inputBamFile;
std::string referenceFile;
std::string outputFile;
std::string outputFilePairs;
std::string outputFileSplit;
std::map<std::string, std::map<int32_t, std::map<int32_t, int32_t>>> rawCounts;
std::map<std::string, std::map<int32_t, std::map<int32_t, float>>> gcContent;
std::map<std::string, std::map<int32_t,int32_t>> refDict;
int depthThreshold;
int nbThread;
int loggingLevel;
int window;
int step;
double zscore;
// method
int getGenome();
void displayGenome();
int readBam(std::string, int32_t, int32_t);
int bamParsingLauncher();
};
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment