Skip to content

Commit b6e0815

Browse files
fixing merge conflict
2 parents 78e4c67 + 65198f8 commit b6e0815

File tree

6 files changed

+121
-108
lines changed

6 files changed

+121
-108
lines changed

Makefile.am

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,17 @@ lib_LIBRARIES = libsmithlab_cpp.a
2525
libsmithlab_cpp_a_SOURCES = GenomicRegion.cpp MappedRead.cpp \
2626
OptionParser.cpp QualityScore.cpp bisulfite_utils.cpp \
2727
chromosome_utils.cpp sim_utils.cpp smithlab_os.cpp smithlab_utils.cpp \
28-
zlib_wrapper.cpp dna_four_bit.cpp cigar_utils.cpp
28+
zlib_wrapper.cpp dna_four_bit.cpp cigar_utils.cpp sam_record.cpp
2929

3030
if ENABLE_HTS
31-
libsmithlab_cpp_a_SOURCES += htslib_wrapper_deprecated.cpp sam_record.cpp
31+
libsmithlab_cpp_a_SOURCES += htslib_wrapper_deprecated.cpp htslib_wrapper.cpp
3232
endif
3333

3434
include_HEADERS = GenomicRegion.hpp MappedRead.hpp OptionParser.hpp \
3535
QualityScore.hpp bisulfite_utils.hpp chromosome_utils.hpp \
3636
sim_utils.hpp smithlab_os.hpp smithlab_utils.hpp zlib_wrapper.hpp \
37-
dna_four_bit.hpp cigar_utils.hpp htslib_wrapper_deprecated.hpp \
38-
sam_record.hpp
37+
dna_four_bit.hpp cigar_utils.hpp sam_record.hpp
38+
39+
if ENABLE_HTS
40+
include_HEADERS += htslib_wrapper.hpp htslib_wrapper_deprecated.hpp
41+
endif

cigar_utils.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,7 @@ cigar_qseq_ops(InputItr itr, const InputItr last) {
6464
size_t op_count = 0;
6565
while (itr != last) {
6666
const size_t curr_val = extract_op_count(itr, last);
67-
if (consumes_query(*itr++))
68-
op_count += curr_val;
67+
op_count += curr_val*consumes_query(*itr++);
6968
}
7069
return op_count;
7170
}
@@ -82,8 +81,7 @@ cigar_rseq_ops(InputItr itr, const InputItr last) {
8281
size_t op_count = 0;
8382
while (itr != last) {
8483
const size_t curr_val = extract_op_count(itr, last);
85-
if (consumes_reference(*itr++))
86-
op_count += curr_val;
84+
op_count += curr_val*consumes_reference(*itr++);
8785
}
8886
return op_count;
8987
}

dna_four_bit.hpp

Lines changed: 45 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -27,40 +27,24 @@ extern char dna_four_bit_decoding[16];
2727

2828
template <typename uint_type> constexpr
2929
uint_type
30-
get_low_nibble(const uint_type x) {return x & 15u;}
31-
32-
template <typename uint_type> constexpr
33-
uint_type
34-
get_high_nibble(const uint_type x) {return (x >> 4) & 15u;}
35-
36-
template <typename uint_type> constexpr
37-
char
38-
decode_dna_four_bit_low(const uint_type x) {
39-
return dna_four_bit_decoding[get_low_nibble(x)];
40-
}
41-
42-
template <typename uint_type> constexpr
43-
char
44-
decode_dna_four_bit_high(const uint_type x) {
45-
return dna_four_bit_decoding[get_high_nibble(x)];
30+
get_nibble(const uint_type x, const size_t offset) {
31+
return (x >> (4*offset)) & 15ul;
4632
}
4733

4834
template <typename uint_type> constexpr
4935
char
5036
decode_dna_four_bit(const uint_type x,
51-
const base_in_byte b = base_in_byte::left) {
52-
return b == base_in_byte::left ?
53-
decode_dna_four_bit_low(x) :
54-
decode_dna_four_bit_high(x);
37+
const size_t offset) {
38+
return dna_four_bit_decoding[get_nibble(x, offset)];
5539
}
5640

5741
template<class InputItr, class OutputIt>
5842
OutputIt
5943
decode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) {
6044
// ADS: assume destination has enough space
6145
while (first != last) {
62-
*d_first++ = decode_dna_four_bit(*first, base_in_byte::left);
63-
*d_first++ = decode_dna_four_bit(*first, base_in_byte::right);
46+
for (size_t offset = 0; offset < 16; ++offset)
47+
*d_first++ = decode_dna_four_bit(*first, offset);
6448
++first;
6549
}
6650
// if original sequence length is odd and encoding not padded at the front,
@@ -73,95 +57,88 @@ void
7357
decode_dna_four_bit(const InCtr &source, OutCtr &dest) {
7458
// expand out the bytes as pairs (do this backwards in case source == dest)
7559
const size_t source_size = source.size();
76-
dest.resize(2*source_size);
60+
dest.resize(16*source_size);
7761
size_t i = source_size;
7862
size_t j = dest.size();
7963
while (i > 0) {
8064
dest[--j] = source[--i];
8165
dest[--j] = source[i];
8266
}
83-
for (i = 0; i < dest.size(); i += 2) {
84-
dest[i] = decode_dna_four_bit(dest[i], base_in_byte::left);
85-
dest[i+1] = decode_dna_four_bit(dest[i+1], base_in_byte::right);
67+
for (i = 0; i < dest.size(); i += 16) {
68+
for (size_t offset = 0; offset < 16; ++offset)
69+
dest[i + offset] = decode_dna_four_bit(dest[i], offset);
8670
}
8771
}
8872

8973
extern uint8_t dna_four_bit_encoding[128];
90-
91-
template <typename uint_type> constexpr
92-
uint8_t
93-
encode_dna_four_bit_low(const uint_type x) {
94-
return dna_four_bit_encoding[static_cast<unsigned>(x)];
95-
}
96-
9774
template <typename uint_type> constexpr
98-
uint8_t
99-
encode_dna_four_bit_high(const uint_type x) {
100-
return dna_four_bit_encoding[static_cast<unsigned>(x)] << 4;
101-
}
102-
103-
template <typename uint_type> constexpr
104-
uint8_t
75+
size_t
10576
encode_dna_four_bit(const uint_type x,
106-
const base_in_byte b = base_in_byte::left) {
107-
return b == base_in_byte::left ?
108-
encode_dna_four_bit_low(x) :
109-
encode_dna_four_bit_high(x);
77+
const size_t offset) {
78+
return (static_cast<size_t>(
79+
dna_four_bit_encoding[static_cast<unsigned>(x)])
80+
) << (4*offset);
11081
}
11182

11283
template<class InputItr, class OutputIt>
11384
OutputIt
11485
encode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) {
11586
while (first != last) {
116-
*d_first = encode_dna_four_bit(*first++, base_in_byte::left);
117-
*d_first |= (first == last ? 0 :
118-
encode_dna_four_bit(*first++, base_in_byte::right));
87+
*d_first = 0;
88+
for (size_t i = 0; i < 16 && first != last; ++i)
89+
*d_first |= encode_dna_four_bit(*first++, i);
11990
++d_first;
12091
}
12192
return d_first;
12293
}
12394

124-
// ADS: indented to be used as pointer to 4-bit encoding of DNA within a vector
125-
// of uint8_t values
95+
// GS: intended to be used as pointer to 4-bit encoding of DNA within a vector
96+
// of size_t values
12697
struct genome_four_bit_itr {
127-
genome_four_bit_itr(const std::vector<uint8_t>::const_iterator itr_,
128-
const bool odd_ = false) : itr(itr_), itr_odd(odd_) {}
98+
genome_four_bit_itr(const std::vector<size_t>::const_iterator itr_,
99+
const int off_ = 0) : itr(itr_), offset(off_) {}
129100

130-
uint8_t operator*() const {
131-
return (!itr_odd ? *itr : (*itr >> 4)) & 15;
101+
size_t operator*() const {
102+
return (*itr >> (offset << 2)) & 15ul;
132103
}
133104
genome_four_bit_itr& operator++() {
134-
itr += itr_odd;
135-
itr_odd ^= 1ul;
105+
offset = (offset + 1) & 15ul;
106+
itr += (offset == 0);
136107
return *this;
137108
}
138109
genome_four_bit_itr operator++(int) {
139110
genome_four_bit_itr tmp(*this);
140-
itr += itr_odd;
141-
itr_odd ^= 1ul;
111+
offset = (offset + 1) & 15ul;
112+
itr += (offset == 0);
142113
return tmp;
143114
}
144115
genome_four_bit_itr& operator--() {
145-
itr -= !itr_odd;
146-
itr_odd ^= 1ul;
116+
itr -= (offset == 0);
117+
118+
// GS: will underflow on 0 but it's ok?
119+
offset = (offset - 1) & 15ul;
147120
return *this;
148121
}
149122
genome_four_bit_itr operator--(int) {
150123
genome_four_bit_itr tmp(*this);
151-
itr -= !itr_odd;
152-
itr_odd ^= 1ul;
124+
itr -= (offset == 0);
125+
offset = (offset - 1) & 15ul;
153126
return tmp;
154127
}
155-
genome_four_bit_itr operator+(const size_t offset) const {
156-
const size_t offset_odd = offset & 1ul;
157-
return genome_four_bit_itr(itr + offset/2 + (itr_odd & offset_odd),
158-
itr_odd != offset_odd);
128+
genome_four_bit_itr operator+(const size_t step) const {
129+
// whether the sum of offsets is >= 16
130+
const bool shift_one_pos =
131+
(((offset + (static_cast<int>(step) & 15)) & 16) >> 4);
132+
133+
const int new_offset = (offset + step) & 15;
134+
return genome_four_bit_itr(itr + step/16 + shift_one_pos,
135+
new_offset);
159136
}
160137
bool operator!=(const genome_four_bit_itr &rhs) const {
161-
return itr != rhs.itr || itr_odd != rhs.itr_odd;
138+
return itr != rhs.itr || offset != rhs.offset;
162139
}
163-
std::vector<uint8_t>::const_iterator itr;
164-
size_t itr_odd;
140+
std::vector<size_t>::const_iterator itr;
141+
int offset;
165142
};
166143

167144
#endif

sam_record.cpp

Lines changed: 27 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ using std::ostream;
3434
using std::begin;
3535
using std::end;
3636
using std::ostringstream;
37+
using std::to_string;
3738

3839
// ADS: this is for debugging purposes
3940
string
@@ -73,49 +74,44 @@ format_sam_flags(const uint16_t the_flags) {
7374
// return regex_match(qual, regex("[!-~]+"));
7475
// }
7576

77+
size_t
78+
sam_rec::estimate_line_size() const {
79+
static const size_t all_field_estimates = 100;
80+
return qname.size() + rname.size() + qual.size() + all_field_estimates;
81+
}
82+
7683
string
7784
sam_rec::tostring() const {
78-
ostringstream oss;
79-
oss << qname << '\t'
80-
<< flags << '\t'
81-
<< rname << '\t'
82-
<< pos << '\t'
83-
<< static_cast<unsigned>(mapq) << '\t'
84-
<< cigar << '\t'
85-
<< rnext << '\t'
86-
<< pnext << '\t'
87-
<< tlen << '\t'
88-
<< seq << '\t'
89-
<< qual;
90-
85+
string out;
86+
out.reserve(estimate_line_size());
87+
out.append(qname + "\t" +
88+
to_string(flags) + "\t" +
89+
rname + "\t" +
90+
to_string(pos) + "\t" +
91+
to_string(static_cast<unsigned>(mapq)) + "\t" +
92+
cigar + "\t" +
93+
rnext + "\t" +
94+
to_string(pnext) + "\t" +
95+
to_string(tlen) + "\t" +
96+
seq + "\t" +
97+
qual);
9198
for (auto it(begin(tags)); it != end(tags); ++it)
92-
oss << '\t' << *it;
93-
return oss.str() + "\n";
99+
out.append("\t" + *it);
100+
101+
return out;
94102
}
95103

96104
ostream &
97105
operator<<(std::ostream &the_stream, const sam_rec &r) {
98-
the_stream << r.qname << '\t'
99-
<< r.flags << '\t'
100-
<< r.rname << '\t'
101-
<< r.pos << '\t'
102-
<< static_cast<unsigned>(r.mapq) << '\t'
103-
<< r.cigar << '\t'
104-
<< r.rnext << '\t'
105-
<< r.pnext << '\t'
106-
<< r.tlen << '\t'
107-
<< r.seq << '\t'
108-
<< r.qual;
109-
110-
for (auto it(begin(r.tags)); it != end(r.tags); ++it)
111-
the_stream << '\t' << *it;
106+
the_stream << r.tostring();
112107
return the_stream;
113108
}
114109

115110
sam_rec::sam_rec(const string &line) {
116-
111+
/*
117112
istringstream iss; // ADS: change to set the buffer from "line"
118-
iss.rdbuf()->pubsetbuf(const_cast<char*>(line.c_str()), line.size());
113+
iss.rdbuf()->pubsetbuf(const_cast<char*>(line.c_str()), line.size());*/
114+
istringstream iss(line);
119115
uint32_t will_become_mapq = 0; // to not read mapq as character
120116
// since it's uint8_t
121117
if (!(iss >>

sam_record.hpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include <string>
2323
#include <vector>
2424
#include <iostream>
25+
#include <sstream>
26+
#include <iterator>
2527

2628
// from 30 April 2020 SAM documentation
2729
// 1 0x1 template having multiple segments in sequencing
@@ -118,6 +120,7 @@ class sam_rec {
118120
qual(_qual) {}
119121
void add_tag(const std::string &the_tag) {tags.push_back(the_tag);}
120122
bool empty() const;
123+
size_t estimate_line_size() const;
121124
std::string tostring() const;
122125
};
123126

@@ -146,4 +149,36 @@ void
146149
inflate_with_cigar(const sam_rec &sr, std::string &to_inflate,
147150
const char inflation_symbol = 'N');
148151

152+
template<typename T>
153+
static void
154+
write_sam_header(const std::vector<std::string> &chrom_names,
155+
const std::vector<T> &chrom_starts,
156+
const std::string program_name,
157+
const std::string program_version,
158+
const int argc, const char **argv,
159+
std::ostream &out) {
160+
static const std::string SAM_VERSION = "1.0";
161+
162+
// sam version
163+
out <<"@HD" << '\t' << "VN:" << SAM_VERSION << '\n'; // sam version
164+
165+
// chromosome sizes
166+
const size_t n_chroms = chrom_names.size() - 1;
167+
for (size_t i = 1; i < n_chroms; ++i) {
168+
out << "@SQ" << '\t'
169+
<< "SN:" << chrom_names[i] << '\t'
170+
<< "LN:" << chrom_starts[i+1] - chrom_starts[i] << '\n';
171+
}
172+
173+
// program details
174+
out << "@PG" << '\t'
175+
<< "ID:" << program_name << '\t'
176+
<< "VN:" << program_version << '\t';
177+
178+
// how the program was run
179+
std::ostringstream the_command;
180+
copy(argv, argv + argc, std::ostream_iterator<const char*>(the_command, " "));
181+
out << "CL:\"" << the_command.str() << "\"" << std::endl;
182+
}
183+
149184
#endif

smithlab_os.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,11 @@ read_fasta_file_short_names(const string &filename,
390390
string line;
391391
while (getline(in, line)) {
392392
if (line[0] == '>') {
393-
names.push_back(string(begin(line) + 1,
393+
const auto first_space = line.find_first_of(" \t", 1);
394+
if (first_space == string::npos)
395+
names.push_back(line.substr(1));
396+
else
397+
names.push_back(string(begin(line) + 1,
394398
begin(line) + line.find_first_of(" \t", 1)));
395399
sequences.push_back(string());
396400
}

0 commit comments

Comments
 (0)