OpenMS
Loading...
Searching...
No Matches
FASTAContainer.h
Go to the documentation of this file.
1// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2// SPDX-License-Identifier: BSD-3-Clause
3//
4// --------------------------------------------------------------------------
5// $Maintainer: Chris Bielow $
6// $Authors: Chris Bielow $
7// --------------------------------------------------------------------------
8
9#pragma once
10
16
17#include <functional>
18#include <fstream>
19#include <unordered_map>
20#include <memory>
21#include <utility>
22#include <vector>
23
24#include <boost/regex.hpp>
25
26namespace OpenMS
27{
28
29 struct TFI_File;
30 struct TFI_Vector;
31
54template<typename TBackend>
55class FASTAContainer; // prototype
56
65template<>
66class FASTAContainer<TFI_File>
67{
68public:
69 FASTAContainer() = delete;
70
72 FASTAContainer(const std::string& FASTA_file)
73 : f_(),
74 offsets_(),
75 data_fg_(),
76 data_bg_(),
77 chunk_offset_(0),
78 filename_(FASTA_file)
79 {
80 f_.readStart(FASTA_file);
81 }
82
84 size_t getChunkOffset() const
85 {
86 return chunk_offset_;
87 }
88
96 {
97 chunk_offset_ += data_fg_.size();
98 data_fg_.swap(data_bg_);
99 data_bg_.clear(); // just in case someone calls activateCache() multiple times...
100 return !data_fg_.empty();
101 }
102
109 bool cacheChunk(int suggested_size)
110 {
111 data_bg_.clear();
112 data_bg_.reserve(suggested_size);
114 for (int i = 0; i < suggested_size; ++i)
115 {
116 std::streampos spos = f_.position();
117 if (!f_.readNext(p)) break;
118 data_bg_.push_back(std::move(p));
119 offsets_.push_back(spos);
120 }
121 return !data_bg_.empty();
122 }
123
125 size_t chunkSize() const
126 {
127 return data_fg_.size();
128 }
129
137 const FASTAFile::FASTAEntry& chunkAt(size_t pos) const
138 {
139 return data_fg_[pos];
140 }
141
155 bool readAt(FASTAFile::FASTAEntry& protein, size_t pos)
156 {
157 // check if position is currently cached...
158 if (chunk_offset_ <= pos && pos < chunk_offset_ + chunkSize())
159 {
160 protein = data_fg_[pos - chunk_offset_];
161 return true;
162 }
163 // read anew from disk...
164 if (pos >= offsets_.size())
165 {
166 throw Exception::IndexOverflow(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, pos, offsets_.size());
167 }
168 std::streampos spos = f_.position(); // save old position
169 if (!f_.setPosition(offsets_[pos])) return false;
170 bool r = f_.readNext(protein);
171 f_.setPosition(spos); // restore old position
172 return r;
173 }
174
176 bool empty()
177 { // trusting the FASTA file can be read...
178 return f_.atEnd() && offsets_.empty();
179 }
180
182 void reset()
183 {
184 offsets_.clear();
185 data_fg_.clear();
186 data_bg_.clear();
187 chunk_offset_ = 0;
188 f_.readStart(filename_);
189 }
190
191
197 size_t size() const
198 {
199 return offsets_.size();
200 }
201
202private:
204 std::vector<std::streampos> offsets_;
205 std::vector<FASTAFile::FASTAEntry> data_fg_;
206 std::vector<FASTAFile::FASTAEntry> data_bg_;
208 std::string filename_;
209};
210
217template<>
218class FASTAContainer<TFI_Vector>
219{
220public:
221 FASTAContainer() = delete;
222
228 FASTAContainer(const std::vector<FASTAFile::FASTAEntry>& data)
229 : data_(data)
230 {
231 }
232
234 size_t getChunkOffset() const
235 {
236 return 0;
237 }
238
244 {
245 if (!activate_count_)
246 {
247 activate_count_ = 1;
248 return true;
249 }
250 return false;
251 }
252
256 bool cacheChunk(int /*suggested_size*/)
257 {
258 if (!cache_count_)
259 {
260 cache_count_ = 1;
261 return true;
262 }
263 return false;
264 }
265
270 size_t chunkSize() const
271 {
272 return data_.size();
273 }
274
276 const FASTAFile::FASTAEntry& chunkAt(size_t pos) const
277 {
278 return data_[pos];
279 }
280
282 bool readAt(FASTAFile::FASTAEntry& protein, size_t pos) const
283 {
284 protein = data_[pos];
285 return true;
286 }
287
289 bool empty() const
290 {
291 return data_.empty();
292 }
293
295 size_t size() const
296 {
297 return data_.size();
298 }
299
301 void reset()
302 {
303 activate_count_ = 0;
304 cache_count_ = 0;
305 }
306
307private:
308 const std::vector<FASTAFile::FASTAEntry>& data_;
309 int activate_count_ = 0;
310 int cache_count_ = 0;
311};
312
317{
318public:
319 struct Result
320 {
321 bool success;
322 std::string name;
324
325 bool operator==(const Result& rhs) const
326 {
327 return success == rhs.success
328 && name == rhs.name
329 && is_prefix == rhs.is_prefix;
330 }
331 };
332
337 {
338 std::unordered_map<std::string, std::pair<Size, Size>> decoy_count;
339 std::unordered_map<std::string, std::string> decoy_case_sensitive;
343
352 };
353
354 // decoy strings
355 inline static const std::vector<std::string> affixes = { "decoy", "dec", "reverse", "rev", "reversed", "__id_decoy", "xxx", "shuffled", "shuffle", "pseudo", "random" };
356
357 // setup prefix- and suffix regex strings
358 inline static const std::string regexstr_prefix = std::string("^(") + ListUtils::concatenate<std::string>(affixes, "_*|") + "_*)";
359 inline static const std::string regexstr_suffix = std::string("(_") + ListUtils::concatenate<std::string>(affixes, "*|_") + ")$";
360
374 template<typename T>
375 static Result findDecoyString(FASTAContainer<T>& proteins, bool quiet = false)
376 {
377 // calls function to search for decoys in input data
378 DecoyStatistics decoy_stats = countDecoys(proteins);
379
380 // DEBUG ONLY: print counts of found decoys
381 for (const auto &a : decoy_stats.decoy_count)
382 {
383 OPENMS_LOG_DEBUG << a.first << "\t" << a.second.first << "\t" << a.second.second << std::endl;
384 }
385
386 // less than 40% of proteins are decoys -> won't be able to determine a decoy string and its position
387 // return default values
388 if (static_cast<double>(decoy_stats.all_prefix_occur + decoy_stats.all_suffix_occur) < 0.4 * static_cast<double>(decoy_stats.all_proteins_count))
389 {
390 if (!quiet) OPENMS_LOG_ERROR << "Unable to determine decoy string (not enough occurrences; <40%)!" << std::endl;
391 return {false, "?", true};
392 }
393
394 if (decoy_stats.all_prefix_occur == decoy_stats.all_suffix_occur)
395 {
396 if (!quiet) OPENMS_LOG_ERROR << "Unable to determine decoy string (prefix and suffix occur equally often)!" << std::endl;
397 return {false, "?", true};
398 }
399
400 // Decoy prefix occurred at least 80% of all prefixes + observed in at least 40% of all proteins -> set it as prefix decoy
401 for (const auto& pair : decoy_stats.decoy_count)
402 {
403 const std::string & case_insensitive_decoy_string = pair.first;
404 const std::pair<Size, Size>& prefix_suffix_counts = pair.second;
405 double freq_prefix = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(decoy_stats.all_prefix_occur);
406 double freq_prefix_in_proteins = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(decoy_stats.all_proteins_count);
407
408 if (freq_prefix >= 0.8 && freq_prefix_in_proteins >= 0.4)
409 {
410 if (!quiet && prefix_suffix_counts.first != decoy_stats.all_prefix_occur)
411 {
412 OPENMS_LOG_WARN << "More than one decoy prefix observed!" << std::endl;
413 OPENMS_LOG_WARN << "Using most frequent decoy prefix (" << (int)(freq_prefix * 100) << "%)" << std::endl;
414 }
415
416 return { true, decoy_stats.decoy_case_sensitive[case_insensitive_decoy_string], true};
417 }
418 }
419
420 // Decoy suffix occurred at least 80% of all suffixes + observed in at least 40% of all proteins -> set it as suffix decoy
421 for (const auto& pair : decoy_stats.decoy_count)
422 {
423 const std::string& case_insensitive_decoy_string = pair.first;
424 const std::pair<Size, Size>& prefix_suffix_counts = pair.second;
425 double freq_suffix = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(decoy_stats.all_suffix_occur);
426 double freq_suffix_in_proteins = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(decoy_stats.all_proteins_count);
427
428 if (freq_suffix >= 0.8 && freq_suffix_in_proteins >= 0.4)
429 {
430 if (!quiet && prefix_suffix_counts.second != decoy_stats.all_suffix_occur)
431 {
432 OPENMS_LOG_WARN << "More than one decoy suffix observed!" << std::endl;
433 OPENMS_LOG_WARN << "Using most frequent decoy suffix (" << (int)(freq_suffix * 100) << "%)" << std::endl;
434 }
435
436 return { true, decoy_stats.decoy_case_sensitive[case_insensitive_decoy_string], false};
437 }
438 }
439
440 if (!quiet) OPENMS_LOG_ERROR << "Unable to determine decoy string and its position. Please provide a decoy string and its position as parameters." << std::endl;
441 return {false, "?", true};
442 }
443
450 template<typename T>
452 {
453 // common decoy strings in FASTA files
454 // note: decoy prefixes/suffices must be provided in lower case
455
457
458 // setup regexes
459 const boost::regex pattern_prefix(regexstr_prefix);
460 const boost::regex pattern_suffix(regexstr_suffix);
461
462 constexpr size_t PROTEIN_CACHE_SIZE = 4e5;
463
464 while (true)
465 {
466 proteins.cacheChunk(PROTEIN_CACHE_SIZE);
467 if (!proteins.activateCache()) break;
468
469 auto prot_count = (SignedSize)proteins.chunkSize();
470 ds.all_proteins_count += prot_count;
471
472 boost::smatch sm;
473 for (SignedSize i = 0; i < prot_count; ++i)
474 {
475 std::string seq = proteins.chunkAt(i).identifier;
476
477 std::string seq_lower = seq;
478 StringUtils::toLower(seq_lower);
479
480 // search for prefix
481 bool found_prefix = boost::regex_search(seq_lower, sm, pattern_prefix);
482 if (found_prefix)
483 {
484 std::string match = sm[0];
485 ds.all_prefix_occur++;
486
487 // increase count of observed prefix
488 ds.decoy_count[match].first++;
489
490 // store observed (case sensitive and with special characters)
491 std::string seq_decoy = StringUtils::prefix(seq, match.length());
492 ds.decoy_case_sensitive[match] = seq_decoy;
493 }
494
495 // search for suffix
496 bool found_suffix = boost::regex_search(seq_lower, sm, pattern_suffix);
497 if (found_suffix)
498 {
499 std::string match = sm[0];
500 ds.all_suffix_occur++;
501
502 // increase count of observed suffix
503 ds.decoy_count[match].second++;
504
505 // store observed (case sensitive and with special characters)
506 std::string seq_decoy = StringUtils::suffix(seq, match.length());
507 ds.decoy_case_sensitive[match] = seq_decoy;
508 }
509 }
510 }
511 return ds;
512 }
513
514private:
515 using DecoyStringToAffixCount = std::unordered_map<std::string, std::pair<Size, Size>>;
516 using CaseInsensitiveToCaseSensitiveDecoy = std::unordered_map<std::string, std::string>;
517};
518
519} // namespace OpenMS
520
#define OPENMS_LOG_DEBUG
Macro for debug information - includes file and line info.
Definition LogStream.h:591
#define OPENMS_LOG_WARN
Macro for warnings.
Definition LogStream.h:583
#define OPENMS_LOG_ERROR
Macro for non-fatal errors (processing continues)
Definition LogStream.h:579
Helper class for calculations on decoy proteins.
Definition FASTAContainer.h:317
std::unordered_map< std::string, std::pair< Size, Size > > DecoyStringToAffixCount
Definition FASTAContainer.h:515
std::unordered_map< std::string, std::string > CaseInsensitiveToCaseSensitiveDecoy
Definition FASTAContainer.h:516
static Result findDecoyString(FASTAContainer< T > &proteins, bool quiet=false)
Heuristic to determine the decoy string given a set of protein names.
Definition FASTAContainer.h:375
static const std::string regexstr_prefix
Definition FASTAContainer.h:358
static DecoyStatistics countDecoys(FASTAContainer< T > &proteins)
Function to count the occurrences of decoy strings in a given set of protein names.
Definition FASTAContainer.h:451
static const std::vector< std::string > affixes
Definition FASTAContainer.h:355
static const std::string regexstr_suffix
Definition FASTAContainer.h:359
Int overflow exception.
Definition Exception.h:211
std::vector< FASTAFile::FASTAEntry > data_bg_
prefetched (background) data; will become the next active data
Definition FASTAContainer.h:206
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos)
Retrieve a FASTA entry at global position pos (must not be behind the currently active chunk,...
Definition FASTAContainer.h:155
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
Retrieve a FASTA entry at cache position pos (fast)
Definition FASTAContainer.h:137
std::string filename_
FASTA file name.
Definition FASTAContainer.h:208
size_t size() const
NOT the number of entries in the FASTA file, but merely the number of already read entries (since we ...
Definition FASTAContainer.h:197
bool empty()
is the FASTA file empty?
Definition FASTAContainer.h:176
FASTAContainer(const std::string &FASTA_file)
C'tor with FASTA filename.
Definition FASTAContainer.h:72
std::vector< std::streampos > offsets_
internal byte offsets into FASTA file for random access reading of previous entries.
Definition FASTAContainer.h:204
bool activateCache()
Swaps in the background cache of entries, read previously via cacheChunk()
Definition FASTAContainer.h:95
size_t chunk_offset_
number of entries before the current chunk
Definition FASTAContainer.h:207
bool cacheChunk(int suggested_size)
Prefetch a new cache in the background, with up to suggested_size entries (or fewer upon reaching end...
Definition FASTAContainer.h:109
FASTAFile f_
FASTA file connection.
Definition FASTAContainer.h:203
std::vector< FASTAFile::FASTAEntry > data_fg_
active (foreground) data
Definition FASTAContainer.h:205
size_t chunkSize() const
number of entries in active cache
Definition FASTAContainer.h:125
void reset()
resets reading of the FASTA file, enables fresh reading of the FASTA from the beginning
Definition FASTAContainer.h:182
size_t getChunkOffset() const
how many entries were read and got swapped out already
Definition FASTAContainer.h:84
FASTAContainer(const std::vector< FASTAFile::FASTAEntry > &data)
C'tor for already existing data (by reference).
Definition FASTAContainer.h:228
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
fast access to chunked (i.e. all) entries
Definition FASTAContainer.h:276
size_t size() const
calls size() on underlying vector
Definition FASTAContainer.h:295
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos) const
fast access to an entry
Definition FASTAContainer.h:282
bool empty() const
calls empty() on underlying vector
Definition FASTAContainer.h:289
bool cacheChunk(int)
no-op (since data is already fully available as vector)
Definition FASTAContainer.h:256
bool activateCache()
no-op (since data is already fully available as vector)
Definition FASTAContainer.h:243
const std::vector< FASTAFile::FASTAEntry > & data_
reference to existing data
Definition FASTAContainer.h:308
size_t chunkSize() const
active data spans the full range, i.e. size of container
Definition FASTAContainer.h:270
void reset()
required for template parameters!
Definition FASTAContainer.h:301
size_t getChunkOffset() const
always 0, since this specialization requires/supports no chunking
Definition FASTAContainer.h:234
This class serves for reading in and writing FASTA files If the protein/gene sequence contains unusua...
Definition FASTAFile.h:35
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition Types.h:104
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition Types.h:97
std::string suffix(const std::string &s, size_t length)
Definition StringUtils.h:482
std::string prefix(const std::string &s, size_t length)
Definition StringUtils.h:475
std::string & toLower(std::string &s)
Definition StringUtils.h:584
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
template parameter for vector-based FASTA access
Definition FASTAContainer.h:55
struct for intermediate results needed for calculations on decoy proteins
Definition FASTAContainer.h:337
std::unordered_map< std::string, std::pair< Size, Size > > decoy_count
map decoys to counts of occurrences as prefix/suffix
Definition FASTAContainer.h:338
Size all_proteins_count
number of all checked proteins
Definition FASTAContainer.h:342
Size all_suffix_occur
number of proteins with found decoy_suffix
Definition FASTAContainer.h:341
std::unordered_map< std::string, std::string > decoy_case_sensitive
map case insensitive strings back to original case (as used in fasta)
Definition FASTAContainer.h:339
Size all_prefix_occur
number of proteins with found decoy_prefix
Definition FASTAContainer.h:340
bool operator==(const DecoyStatistics &rhs) const
Definition FASTAContainer.h:344
Definition FASTAContainer.h:320
bool operator==(const Result &rhs) const
Definition FASTAContainer.h:325
bool is_prefix
on success, was it a prefix or suffix
Definition FASTAContainer.h:323
bool success
did more than 40% of proteins have the same prefix or suffix
Definition FASTAContainer.h:321
std::string name
on success, what was the decoy string?
Definition FASTAContainer.h:322
FASTA entry type (identifier, description and sequence) The first std::string corresponds to the iden...
Definition FASTAFile.h:46