Skip to content

Commit 1fed127

Browse files
adding sam-related programs
1 parent 34289a7 commit 1fed127

File tree

5 files changed

+486
-1
lines changed

5 files changed

+486
-1
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
SOURCES = $(wildcard *.cpp)
2121
OBJECTS = $(patsubst %.cpp, %.o, $(SOURCES))
2222
HEADERS = $(wildcard *.hpp)
23-
REQUIRES_HTSLIB = htslib_wrapper_deprecated.o
23+
REQUIRES_HTSLIB = htslib_wrapper_deprecated.o htslib_wrapper.o
2424

2525
ifndef HAVE_HTSLIB
2626
NO_HTSLIB := $(filter-out $(REQUIRES_HTSLIB), $(OBJECTS))

htslib_wrapper.cpp

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/* Part of SMITHLAB_CPP software
2+
*
3+
* Copyright (C) 2013-2019 University of Southern California and
4+
* Andrew D. Smith
5+
*
6+
* Authors: Meng Zhou, Qiang Song, Andrew Smith
7+
*
8+
* This program is free software: you can redistribute it and/or
9+
* modify it under the terms of the GNU General Public License as
10+
* published by the Free Software Foundation, either version 3 of the
11+
* License, or (at your option) any later version.
12+
*
13+
* This program is distributed in the hope that it will be useful, but
14+
* WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16+
* General Public License for more details.
17+
*/
18+
19+
#include <string>
20+
#include <vector>
21+
#include <fstream>
22+
#include <iostream>
23+
24+
#include "htslib_wrapper.hpp"
25+
#include "smithlab_utils.hpp"
26+
#include "MappedRead.hpp"
27+
28+
using std::string;
29+
using std::vector;
30+
using std::cerr;
31+
using std::endl;
32+
using std::runtime_error;
33+
34+
char check_htslib_wrapper() {return 1;}
35+
36+
SAMReader::SAMReader(const string &fn) :
37+
filename(fn), good(true) {
38+
39+
if (!(hts = hts_open(filename.c_str(), "r")))
40+
throw runtime_error("cannot open file: " + filename);
41+
42+
if (hts_get_format(hts)->category != sequence_data)
43+
throw runtime_error("file format appears wrong: " + filename);
44+
45+
if (!(hdr = sam_hdr_read(hts)))
46+
throw runtime_error("failed to read header from file: " + filename);
47+
48+
if (!(b = bam_init1()))
49+
throw runtime_error("failed to read record from file: " + filename);
50+
}
51+
52+
SAMReader::~SAMReader() {
53+
if (hdr) {
54+
bam_hdr_destroy(hdr);
55+
hdr = 0;
56+
}
57+
if (b) {
58+
bam_destroy1(b);
59+
b = 0;
60+
}
61+
if (hts) {
62+
assert(hts_close(hts) >= 0);
63+
hts = 0;
64+
}
65+
good = false;
66+
}
67+
68+
69+
SAMReader&
70+
operator>>(SAMReader &reader, sam_rec &aln) {
71+
reader.get_sam_record(aln);
72+
return reader;
73+
}
74+
75+
/////////////////////////////////////////////
76+
//// general facility for SAM format
77+
/////////////////////////////////////////////
78+
79+
bool
80+
SAMReader::get_sam_record(sam_rec &sr) {
81+
int rd_ret = 0;
82+
if ((rd_ret = sam_read1(hts, hdr, b)) >= 0) {
83+
int fmt_ret = 0;
84+
if ((fmt_ret = sam_format1(hdr, b, &hts->line)) <= 0)
85+
throw runtime_error("failed reading record from: " + filename);
86+
sr = sam_rec(hts->line.s);
87+
good = true; //reader.get_sam_record(reader.hts->line.s, sr);
88+
// ADS: line below seems to be needed when the file format is SAM
89+
// because the hts_getline for parsing SAM format files within
90+
// sam_read1 only get called when "(fp->line.l == 0)". For BAM
91+
// format, it does not seem to matter.
92+
hts->line.l = 0;
93+
}
94+
else if (rd_ret == -1)
95+
good = false;
96+
else // rd_ret < -1
97+
throw runtime_error("failed to read record from file: " + filename);
98+
return good;
99+
}
100+
101+
string
102+
SAMReader::get_header() const {
103+
return hdr->text; // includes newline
104+
}

htslib_wrapper.hpp

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* Part of SMITHLAB_CPP software
3+
*
4+
* Copyright (C) 2019 Meng Zhou and Andrew Smith
5+
*
6+
* Authors: Meng Zhou and Andrew Smith
7+
*
8+
* This is free software: you can redistribute it and/or modify it
9+
* under the terms of the GNU General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* (at your option) any later version.
12+
*
13+
* This software is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16+
* General Public License for more details.
17+
*/
18+
19+
#ifndef HTSLIB_WRAPPER_HPP
20+
#define HTSLIB_WRAPPER_HPP
21+
22+
#include "smithlab_utils.hpp"
23+
#include "sam_record.hpp"
24+
25+
#include <string>
26+
#include <vector>
27+
#include <fstream>
28+
29+
extern "C" {
30+
#include <htslib/sam.h>
31+
#include <htslib/hts.h>
32+
}
33+
34+
extern "C" {char check_htslib_wrapper();}
35+
36+
class SAMReader {
37+
public:
38+
SAMReader(const std::string &filename);
39+
~SAMReader();
40+
41+
operator bool() const {return good;}
42+
43+
bool get_sam_record(sam_rec &sr);
44+
45+
std::string get_header() const;
46+
47+
private:
48+
49+
// data
50+
std::string filename;
51+
bool good;
52+
53+
htsFile* hts;
54+
bam_hdr_t *hdr;
55+
bam1_t *b;
56+
};
57+
58+
SAMReader &
59+
operator>>(SAMReader& sam_stream, sam_rec &samr);
60+
61+
#endif

sam_record.cpp

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
/* Part of Smith Lab software
2+
*
3+
* Copyright (C) 2020 University of Southern California and
4+
* Guilherme De Sena Brandine and Andrew D. Smith
5+
*
6+
* Authors: Guilherme De Sena Brandine and Andrew D. Smith
7+
*
8+
* This program is free software: you can redistribute it and/or modify
9+
* it under the terms of the GNU General Public License as published by
10+
* the Free Software Foundation, either version 3 of the License, or
11+
* (at your option) any later version.
12+
*
13+
* This program is distributed in the hope that it will be useful, but
14+
* WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16+
* General Public License for more details.
17+
*/
18+
19+
#include "sam_record.hpp"
20+
21+
#include <regex>
22+
#include <sstream>
23+
24+
#include "cigar_utils.hpp"
25+
#include "bisulfite_utils.hpp"
26+
#include "smithlab_utils.hpp"
27+
28+
using std::string;
29+
using std::istringstream;
30+
using std::runtime_error;
31+
using std::regex;
32+
using std::istream;
33+
using std::ostream;
34+
using std::begin;
35+
using std::end;
36+
37+
// ADS: this is for debugging purposes
38+
string
39+
format_sam_flags(const uint16_t the_flags) {
40+
std::ostringstream oss;
41+
using samflags::check;
42+
oss << "read_paired: " << check(the_flags, samflags::read_paired) << '\n'
43+
<< "read_pair_mapped: "
44+
<< check(the_flags, samflags::read_pair_mapped) << '\n'
45+
<< "read_unmapped: " << check(the_flags, samflags::read_unmapped) << '\n'
46+
<< "mate_unmapped: " << check(the_flags, samflags::mate_unmapped) << '\n'
47+
<< "read_rc: " << check(the_flags, samflags::read_rc) << '\n'
48+
<< "mate_rc: " << check(the_flags, samflags::mate_rc) << '\n'
49+
<< "template_first: "
50+
<< check(the_flags, samflags::template_first) << '\n'
51+
<< "template_last: " << check(the_flags, samflags::template_last) << '\n'
52+
<< "secondary_aln: " << check(the_flags, samflags::secondary_aln) << '\n'
53+
<< "below_quality: " << check(the_flags, samflags::below_quality) << '\n'
54+
<< "pcr_duplicate: " << check(the_flags, samflags::pcr_duplicate) << '\n'
55+
<< "supplementary_aln: "
56+
<< check(the_flags, samflags::supplementary_aln);
57+
return oss.str();
58+
}
59+
60+
static bool
61+
valid_cigar(const string &cigar, const string &seq) {
62+
return cigar_qseq_ops(cigar) == seq.size();
63+
}
64+
65+
static bool
66+
valid_seq(const string &read) {
67+
return regex_match(read, regex("\\*|[A-Za-z=.]+"));
68+
}
69+
70+
static bool
71+
valid_qual(const string &qual) {
72+
return regex_match(qual, regex("[!-~]+"));
73+
}
74+
75+
76+
ostream &
77+
operator<<(std::ostream &the_stream, const sam_rec &r) {
78+
the_stream << r.qname << '\t'
79+
<< r.flags << '\t'
80+
<< r.rname << '\t'
81+
<< r.pos << '\t'
82+
<< static_cast<unsigned>(r.mapq) << '\t'
83+
<< r.cigar << '\t'
84+
<< r.rnext << '\t'
85+
<< r.pnext << '\t'
86+
<< r.tlen << '\t'
87+
<< r.seq << '\t'
88+
<< r.qual;
89+
90+
for (auto it(begin(r.tags)); it != end(r.tags); ++it)
91+
the_stream << '\t' << *it;
92+
return the_stream;
93+
}
94+
95+
sam_rec::sam_rec(const string &line) {
96+
97+
string tmp;
98+
istringstream iss(line); // ADS: change to set the buffer from "line"
99+
iss.rdbuf()->pubsetbuf(const_cast<char*>(line.c_str()), line.size());
100+
uint32_t will_become_mapq = 0; // to not read mapq as character
101+
// since it's uint8_t
102+
if (!(iss >>
103+
qname >> flags >> rname >> pos >> will_become_mapq >>
104+
cigar >> rnext >> pnext >> tlen >> seq >> qual))
105+
throw runtime_error("incorrect SAM record:\n" + line);
106+
if (mapq > 255)
107+
throw runtime_error("invalid mapq in SAM record: " + line);
108+
mapq = static_cast<uint8_t>(will_become_mapq);
109+
110+
if (!valid_cigar(cigar, seq))
111+
throw runtime_error("invalid cigar in SAM record: " + line);
112+
113+
if (!valid_seq(seq))
114+
throw runtime_error("invalid read in SAM record: " + line);
115+
116+
if (!valid_qual(qual))
117+
throw runtime_error("invalid qual in SAM record: " + line);
118+
while (iss >> tmp)
119+
tags.push_back(tmp);
120+
}
121+
122+
// // constructor for "general" sam
123+
// sam_rec::sam_rec(const std::string &_qname,
124+
125+
// // to make sam flags
126+
// const sam_record_type type,
127+
// const bool is_rc,
128+
129+
// const std::string &_rname,
130+
// // zero-based!
131+
// const uint32_t start,
132+
// const uint8_t _mapq,
133+
// const std::string &_cigar,
134+
// const std::string &_seq,
135+
// const std::string &_qual,
136+
137+
// // not used in SE
138+
// const uint32_t _pnext,
139+
// const uint32_t end,
140+
141+
// // not used in non-WGBS
142+
// const bool is_a_rich) {
143+
144+
// // direct copies of entries
145+
// qname = _qname;
146+
// rname = _rname;
147+
// pos = start + 1; // sam is one-based
148+
// seq = is_rc ? revcomp(seq) : _seq;
149+
// qual = _qual;
150+
// mapq = _mapq;
151+
// cigar = _cigar;
152+
// pnext = _pnext + 1; // one-based!
153+
154+
// // infer from non-sam arguments
155+
// flags = (!is_a_rich) * bsflags::read_is_t_rich |
156+
// (type != single) * (samflags::read_paired |
157+
// samflags::read_pair_mapped) |
158+
// (is_rc) * samflags::read_rc |
159+
// (!is_rc) * samflags::mate_rc |
160+
// (type == pe_first_mate) * samflags::template_first |
161+
// (type == pe_second_mate) * samflags::template_last;
162+
163+
// rnext = (type == single) ? "*" : "=";
164+
// tlen = is_rc ? (_pnext - end) : (end - start);
165+
// }
166+
167+
168+
169+
void
170+
inflate_with_cigar(const sam_rec &sr, string &to_inflate,
171+
const char inflation_symbol) {
172+
apply_cigar(sr.cigar, to_inflate, inflation_symbol);
173+
}

0 commit comments

Comments
 (0)