45 #include <unordered_map>
88 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
89 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
91 27, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
92 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
95 27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13,
98 14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27,
101 27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13,
104 14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27,
109 struct OPENMS_DLLAPI
Hit {
112 Hit(
T needle_index,
T needle_length,
T query_pos) : needle_index(needle_index), needle_length(needle_length), query_pos(query_pos) {};
124 struct OPENMS_DLLAPI
AA
127 constexpr
AA() : aa_(
AA(
'?').aa_)
147 return aa_ == other.
aa_;
153 return aa_ <= other.
aa_;
159 return aa_ >=
AA(
'B').
aa_;
165 return aa_ !=
AA(
'?').
aa_;
171 return aa_ <=
AA(
'X').
aa_;
178 assert(aa_ <=
AA(
'?').aa_);
187 assert(aa_ <=
AA(
'?').aa_);
231 T i_ = std::numeric_limits<T>::max();
236 template<>
struct std::hash<
OpenMS::Index>
240 return std::hash<OpenMS::Index::T> {}(s());
254 ACNode(
const AA label,
const uint8_t depth) : edge(label)
256 depth_and_hits.depth = depth;
263 memset(
this, 0,
sizeof *
this);
290 ACSpawn(std::string::const_iterator query_pos,
Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss);
301 uint8_t max_aaa_leftover {0};
302 uint8_t max_mm_leftover {0};
303 uint8_t max_prefix_loss_leftover {0};
308 OPENMS_DLLAPI
AA nextValidAA(
const std::string::const_iterator end, std::string::const_iterator& it_q);
351 ACTrie(uint32_t max_aaa = 0, uint32_t max_mm = 0);
424 bool addHits_(
Index i,
const size_t text_pos, std::vector<Hit>& hits)
const;
466 uint32_t needle_count_ {0};
467 uint32_t max_aaa_ {0};
468 uint32_t max_mm_ {0};
An Aho Corasick trie (a set of nodes with suffix links mainly)
Definition: AhoCorasickAmbiguous.h:343
Index stepMaster_(const Index i, const AA edge, ACTrieState &state) const
bool stepSpawn_(ACSpawn &spawn, ACTrieState &state) const
void setMaxMMCount(const uint32_t max_mm)
uint32_t getMaxMMCount() const
Maximum number of mismatches allowed during search.
ACTrie(uint32_t max_aaa=0, uint32_t max_mm=0)
Default C'tor which just creates a root node.
void addNeedlesAndCompress(const std::vector< std::string > &needles)
bool nextHitsNoClear_(ACTrieState &state) const
std::vector< ACNode > trie_
the trie, in either naive structure or BFS order (after compressTrie)
Definition: AhoCorasickAmbiguous.h:465
Index add_(const Index from, const AA edge)
void addNeedle(const std::string &needle)
void setMaxAAACount(const uint32_t max_aaa)
std::unordered_map< Index, std::vector< uint32_t > > umap_index2needles_
maps a node to which needles end there (valid for both naive and BFS tree)
Definition: AhoCorasickAmbiguous.h:471
void addNeedles(const std::vector< std::string > &needles)
bool addHitsSpawn_(Index i, const ACSpawn &spawn, const size_t text_pos, std::vector< Hit > &hits, const int current_spawn_depths) const
same as addHits_, but only follows the suffix chain until the spawn loses its prefix
Index follow_(const Index i, const AA edge) const
void getAllHits(ACTrieState &state) const
Index findChildBFS_(const Index parent, const AA child_label) const
After compression (BFS trie), obtain the child with edge child_label from parent; if it does not exis...
bool addHits_(Index i, const size_t text_pos, std::vector< Hit > &hits) const
Add all hits occurring in node i (including all its suffix hits)
void createSpawns_(const Index i, const AA fromAA, const AA toAA, ACTrieState &state, const uint32_t aaa_left, const uint32_t mm_left) const
bool followSpawn_(ACSpawn &spawn, const AA edge, ACTrieState &state) const
uint32_t getMaxAAACount() const
Maximum number of ambiguous amino acids allowed during search.
Index findChildNaive_(Index parent, AA child_label)
During needle addition (naive trie), obtain the child with edge child_label from parent; if it does n...
size_t getNeedleCount() const
How many needles were added to the trie?
void compressTrie()
Traverses the trie in BFS order and makes it more compact and efficient to traverse.
void createSubSpawns_(const ACSpawn &prototype, const AA fromAA, const AA toAA, ACTrieState &state) const
Create spawns from a spawn with an AAA or MM, using prototype as template, following edges in range f...
void createMMSpawns_(const Index i, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState &state, const uint32_t aaa_left, const uint32_t mm_left) const
Same as createSpawns_, but instantiate all possible AA's except for those in the range from except_fr...
std::unordered_map< Index, std::vector< Index > > umap_index2children_naive_
maps the child nodes of a node for the naive tree; only needed during naive trie construction; storin...
Definition: AhoCorasickAmbiguous.h:473
void createMMSubSpawns_(const ACSpawn &prototype, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState &state) const
Same as createSubSpawns_, but instantiate all possible AA's except for those in the range from except...
bool nextHits(ACTrieState &state) const
Definition: AhoCorasickAmbiguous.h:206
bool operator==(const Index other) const
equality operator
uint32_t T
Definition: AhoCorasickAmbiguous.h:208
bool isInvalid() const
is this Index invalid, i.e. should not be dereferenced
Index()=default
default C'tor; creates an invalid index
Index(T val)
C'tor from T.
Definition: AhoCorasickAmbiguous.h:213
bool isValid() const
is this Index valid, i.e. an actual index into a vector?
T operator()() const
convert to a number (might be invalid, check with .isValid() first)
T & pos()
allows to set the index, using `index.pos() = 3;` or simply read its value
const double c
Definition: Constants.h:209
static String suffix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:156
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
AA nextValidAA(const std::string::const_iterator end, std::string::const_iterator &it_q)
constexpr char const AAtoChar[28]
Definition: AhoCorasickAmbiguous.h:52
constexpr char const CharToAA[128]
Conversion table from 7-bit ASCII char to internal value representation for an amino acid (AA)
Definition: AhoCorasickAmbiguous.h:86
Definition: AhoCorasickAmbiguous.h:125
constexpr bool isValid() const
is AA a letter or '$' ?
Definition: AhoCorasickAmbiguous.h:163
constexpr AA(const char c)
Definition: AhoCorasickAmbiguous.h:134
constexpr bool isAmbiguous() const
is AA a 'B', 'J', 'Z', 'X', or '$' ?
Definition: AhoCorasickAmbiguous.h:157
constexpr bool isValidForPeptide() const
is the AA a letter, i.e. A-Z or a-z?
Definition: AhoCorasickAmbiguous.h:169
constexpr AA operator++(int)
Post-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:183
uint8_t aa_
internal representation as 1-byte integer
Definition: AhoCorasickAmbiguous.h:200
constexpr AA & operator++()
Pre-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:175
constexpr bool operator<=(const AA other) const
less-or-equal operator
Definition: AhoCorasickAmbiguous.h:151
constexpr uint8_t operator()() const
yields the internal 8bit representation
Definition: AhoCorasickAmbiguous.h:139
constexpr AA operator-(const AA rhs) const
Decrement operator.
Definition: AhoCorasickAmbiguous.h:192
constexpr AA()
Default C'tor; creates an invalid AA.
Definition: AhoCorasickAmbiguous.h:127
constexpr bool operator==(const AA other) const
equality operator
Definition: AhoCorasickAmbiguous.h:145
internal struct to steal one bit from depth to use as hit indicator
Definition: AhoCorasickAmbiguous.h:260
DepthHits()
Definition: AhoCorasickAmbiguous.h:261
uint8_t depth
depth of node in the trie
Definition: AhoCorasickAmbiguous.h:267
uint8_t has_hit
does a pattern end here (or when following suffix links)?
Definition: AhoCorasickAmbiguous.h:264
Definition: AhoCorasickAmbiguous.h:249
DepthHits depth_and_hits
depth of node in the tree and one bit if a needle ends in this node or any of its suffices
Definition: AhoCorasickAmbiguous.h:277
ACNode(const AA label, const uint8_t depth)
C'tor from an edge label (from parent to this node) and a depth in the tree.
Definition: AhoCorasickAmbiguous.h:254
uint8_t ChildCountType
Definition: AhoCorasickAmbiguous.h:270
ACNode()
Default C'tor.
Definition: AhoCorasickAmbiguous.h:251
a spin-off search path through the trie, which can deal with ambiguous AAs and mismatches
Definition: AhoCorasickAmbiguous.h:285
AA nextValidAA(const ACTrieState &state)
std::string::const_iterator it_query
position in query
Definition: AhoCorasickAmbiguous.h:299
Index tree_pos
position in trie
Definition: AhoCorasickAmbiguous.h:300
size_t textPos(const ACTrieState &state) const
Where in the text are we currently?
ACSpawn()=delete
No default C'tor.
ACSpawn(std::string::const_iterator query_pos, Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss)
C'tor with arguments.
Definition: AhoCorasickAmbiguous.h:313
std::vector< Hit > hits
current hits found
Definition: AhoCorasickAmbiguous.h:332
void setQuery(const std::string &haystack)
const std::string & getQuery() const
The current query.
std::string query_
current query ( = haystack = text)
Definition: AhoCorasickAmbiguous.h:337
std::string::const_iterator it_q_
position in query
Definition: AhoCorasickAmbiguous.h:338
size_t textPos() const
Where in the text are we currently?
friend ACSpawn
Definition: AhoCorasickAmbiguous.h:314
Index tree_pos
position in trie (for the master)
Definition: AhoCorasickAmbiguous.h:333
std::queue< ACSpawn > spawns
initial spawn points which are currently active and need processing
Definition: AhoCorasickAmbiguous.h:334
std::string::const_iterator textPosIt() const
Where in the text are we currently?
Definition: AhoCorasickAmbiguous.h:109
uint32_t T
Definition: AhoCorasickAmbiguous.h:110
T query_pos
Definition: AhoCorasickAmbiguous.h:115
T needle_index
Definition: AhoCorasickAmbiguous.h:112
T needle_length
Definition: AhoCorasickAmbiguous.h:114
bool operator==(const Hit &rhs) const
Definition: AhoCorasickAmbiguous.h:116
Hit(T needle_index, T needle_length, T query_pos)
Definition: AhoCorasickAmbiguous.h:112
std::size_t operator()(OpenMS::Index const &s) const noexcept
Definition: AhoCorasickAmbiguous.h:238