OpenMS
FASTAContainer.h
Go to the documentation of this file.
1 // Copyright (c) 2002-2023, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Chris Bielow $
6 // $Authors: Chris Bielow $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
17 
18 #include <functional>
19 #include <fstream>
20 #include <unordered_map>
21 #include <memory>
22 #include <utility>
23 #include <vector>
24 
25 #include <boost/regex.hpp>
26 
27 namespace OpenMS
28 {
29 
30  struct TFI_File;
31  struct TFI_Vector;
32 
55 template<typename TBackend>
56 class FASTAContainer; // prototype
57 
66 template<>
67 class FASTAContainer<TFI_File>
68 {
69 public:
70  FASTAContainer() = delete;
71 
73  FASTAContainer(const String& FASTA_file)
74  : f_(),
75  offsets_(),
76  data_fg_(),
77  data_bg_(),
78  chunk_offset_(0),
79  filename_(FASTA_file)
80  {
81  f_.readStart(FASTA_file);
82  }
83 
85  size_t getChunkOffset() const
86  {
87  return chunk_offset_;
88  }
89 
97  {
98  chunk_offset_ += data_fg_.size();
99  data_fg_.swap(data_bg_);
100  data_bg_.clear(); // just in case someone calls activateCache() multiple times...
101  return !data_fg_.empty();
102  }
103 
110  bool cacheChunk(int suggested_size)
111  {
112  data_bg_.clear();
113  data_bg_.reserve(suggested_size);
115  for (int i = 0; i < suggested_size; ++i)
116  {
117  std::streampos spos = f_.position();
118  if (!f_.readNext(p)) break;
119  data_bg_.push_back(std::move(p));
120  offsets_.push_back(spos);
121  }
122  return !data_bg_.empty();
123  }
124 
126  size_t chunkSize() const
127  {
128  return data_fg_.size();
129  }
130 
138  const FASTAFile::FASTAEntry& chunkAt(size_t pos) const
139  {
140  return data_fg_[pos];
141  }
142 
156  bool readAt(FASTAFile::FASTAEntry& protein, size_t pos)
157  {
158  // check if position is currently cached...
159  if (chunk_offset_ <= pos && pos < chunk_offset_ + chunkSize())
160  {
161  protein = data_fg_[pos - chunk_offset_];
162  return true;
163  }
164  // read anew from disk...
165  if (pos >= offsets_.size())
166  {
167  throw Exception::IndexOverflow(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, pos, offsets_.size());
168  }
169  std::streampos spos = f_.position(); // save old position
170  if (!f_.setPosition(offsets_[pos])) return false;
171  bool r = f_.readNext(protein);
172  f_.setPosition(spos); // restore old position
173  return r;
174  }
175 
177  bool empty()
178  { // trusting the FASTA file can be read...
179  return f_.atEnd() && offsets_.empty();
180  }
181 
183  void reset()
184  {
185  offsets_.clear();
186  data_fg_.clear();
187  data_bg_.clear();
188  chunk_offset_ = 0;
189  f_.readStart(filename_);
190  }
191 
192 
198  size_t size() const
199  {
200  return offsets_.size();
201  }
202 
203 private:
205  std::vector<std::streampos> offsets_;
206  std::vector<FASTAFile::FASTAEntry> data_fg_;
207  std::vector<FASTAFile::FASTAEntry> data_bg_;
208  size_t chunk_offset_;
209  std::string filename_;
210 };
211 
218 template<>
219 class FASTAContainer<TFI_Vector>
220 {
221 public:
222  FASTAContainer() = delete;
223 
229  FASTAContainer(const std::vector<FASTAFile::FASTAEntry>& data)
230  : data_(data)
231  {
232  }
233 
235  size_t getChunkOffset() const
236  {
237  return 0;
238  }
239 
245  {
246  if (!activate_count_)
247  {
248  activate_count_ = 1;
249  return true;
250  }
251  return false;
252  }
253 
257  bool cacheChunk(int /*suggested_size*/)
258  {
259  if (!cache_count_)
260  {
261  cache_count_ = 1;
262  return true;
263  }
264  return false;
265  }
266 
271  size_t chunkSize() const
272  {
273  return data_.size();
274  }
275 
277  const FASTAFile::FASTAEntry& chunkAt(size_t pos) const
278  {
279  return data_[pos];
280  }
281 
283  bool readAt(FASTAFile::FASTAEntry& protein, size_t pos) const
284  {
285  protein = data_[pos];
286  return true;
287  }
288 
290  bool empty() const
291  {
292  return data_.empty();
293  }
294 
296  size_t size() const
297  {
298  return data_.size();
299  }
300 
302  void reset()
303  {
304  activate_count_ = 0;
305  cache_count_ = 0;
306  }
307 
308 private:
309  const std::vector<FASTAFile::FASTAEntry>& data_;
310  int activate_count_ = 0;
311  int cache_count_ = 0;
312 };
313 
318 {
319 public:
320  struct Result
321  {
322  bool success;
324  bool is_prefix;
325 
326  bool operator==(const Result& rhs) const
327  {
328  return success == rhs.success
329  && name == rhs.name
330  && is_prefix == rhs.is_prefix;
331  }
332  };
333 
338  {
339  std::unordered_map<std::string, std::pair<Size, Size>> decoy_count;
340  std::unordered_map<std::string, std::string> decoy_case_sensitive;
344 
345  bool operator==(const DecoyStatistics& rhs) const
346  {
347  return decoy_count == rhs.decoy_count
352  }
353  };
354 
355  // decoy strings
356  inline static const std::vector<std::string> affixes = { "decoy", "dec", "reverse", "rev", "reversed", "__id_decoy", "xxx", "shuffled", "shuffle", "pseudo", "random" };
357 
358  // setup prefix- and suffix regex strings
359  inline static const std::string regexstr_prefix = std::string("^(") + ListUtils::concatenate<std::string>(affixes, "_*|") + "_*)";
360  inline static const std::string regexstr_suffix = std::string("(_") + ListUtils::concatenate<std::string>(affixes, "*|_") + ")$";
361 
369  template<typename T>
371  {
372  // calls function to search for decoys in input data
373  DecoyStatistics decoy_stats = countDecoys(proteins);
374 
375  // DEBUG ONLY: print counts of found decoys
376  for (const auto &a : decoy_stats.decoy_count)
377  {
378  OPENMS_LOG_DEBUG << a.first << "\t" << a.second.first << "\t" << a.second.second << std::endl;
379  }
380 
381  // less than 40% of proteins are decoys -> won't be able to determine a decoy string and its position
382  // return default values
383  if (static_cast<double>(decoy_stats.all_prefix_occur + decoy_stats.all_suffix_occur) < 0.4 * static_cast<double>(decoy_stats.all_proteins_count))
384  {
385  OPENMS_LOG_ERROR << "Unable to determine decoy string (not enough occurrences; <40%)!" << std::endl;
386  return {false, "?", true};
387  }
388 
389  if (decoy_stats.all_prefix_occur == decoy_stats.all_suffix_occur)
390  {
391  OPENMS_LOG_ERROR << "Unable to determine decoy string (prefix and suffix occur equally often)!" << std::endl;
392  return {false, "?", true};
393  }
394 
395  // Decoy prefix occurred at least 80% of all prefixes + observed in at least 40% of all proteins -> set it as prefix decoy
396  for (const auto& pair : decoy_stats.decoy_count)
397  {
398  const std::string & case_insensitive_decoy_string = pair.first;
399  const std::pair<Size, Size>& prefix_suffix_counts = pair.second;
400  double freq_prefix = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(decoy_stats.all_prefix_occur);
401  double freq_prefix_in_proteins = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(decoy_stats.all_proteins_count);
402 
403  if (freq_prefix >= 0.8 && freq_prefix_in_proteins >= 0.4)
404  {
405  if (prefix_suffix_counts.first != decoy_stats.all_prefix_occur)
406  {
407  OPENMS_LOG_WARN << "More than one decoy prefix observed!" << std::endl;
408  OPENMS_LOG_WARN << "Using most frequent decoy prefix (" << (int)(freq_prefix * 100) << "%)" << std::endl;
409  }
410 
411  return { true, decoy_stats.decoy_case_sensitive[case_insensitive_decoy_string], true};
412  }
413  }
414 
415  // Decoy suffix occurred at least 80% of all suffixes + observed in at least 40% of all proteins -> set it as suffix decoy
416  for (const auto& pair : decoy_stats.decoy_count)
417  {
418  const std::string& case_insensitive_decoy_string = pair.first;
419  const std::pair<Size, Size>& prefix_suffix_counts = pair.second;
420  double freq_suffix = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(decoy_stats.all_suffix_occur);
421  double freq_suffix_in_proteins = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(decoy_stats.all_proteins_count);
422 
423  if (freq_suffix >= 0.8 && freq_suffix_in_proteins >= 0.4)
424  {
425  if (prefix_suffix_counts.second != decoy_stats.all_suffix_occur)
426  {
427  OPENMS_LOG_WARN << "More than one decoy suffix observed!" << std::endl;
428  OPENMS_LOG_WARN << "Using most frequent decoy suffix (" << (int)(freq_suffix * 100) << "%)" << std::endl;
429  }
430 
431  return { true, decoy_stats.decoy_case_sensitive[case_insensitive_decoy_string], false};
432  }
433  }
434 
435  OPENMS_LOG_ERROR << "Unable to determine decoy string and its position. Please provide a decoy string and its position as parameters." << std::endl;
436  return {false, "?", true};
437  }
438 
445  template<typename T>
447  {
448  // common decoy strings in FASTA files
449  // note: decoy prefixes/suffices must be provided in lower case
450 
451  DecoyStatistics ds;
452 
453  // setup regexes
454  const boost::regex pattern_prefix(regexstr_prefix);
455  const boost::regex pattern_suffix(regexstr_suffix);
456 
457  constexpr size_t PROTEIN_CACHE_SIZE = 4e5;
458 
459  while (true)
460  {
461  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
462  if (!proteins.activateCache()) break;
463 
464  auto prot_count = (SignedSize)proteins.chunkSize();
465  ds.all_proteins_count += prot_count;
466 
467  boost::smatch sm;
468  for (SignedSize i = 0; i < prot_count; ++i)
469  {
470  String seq = proteins.chunkAt(i).identifier;
471 
472  String seq_lower = seq;
473  seq_lower.toLower();
474 
475  // search for prefix
476  bool found_prefix = boost::regex_search(seq_lower, sm, pattern_prefix);
477  if (found_prefix)
478  {
479  std::string match = sm[0];
480  ds.all_prefix_occur++;
481 
482  // increase count of observed prefix
483  ds.decoy_count[match].first++;
484 
485  // store observed (case sensitive and with special characters)
486  std::string seq_decoy = StringUtils::prefix(seq, match.length());
487  ds.decoy_case_sensitive[match] = seq_decoy;
488  }
489 
490  // search for suffix
491  bool found_suffix = boost::regex_search(seq_lower, sm, pattern_suffix);
492  if (found_suffix)
493  {
494  std::string match = sm[0];
495  ds.all_suffix_occur++;
496 
497  // increase count of observed suffix
498  ds.decoy_count[match].second++;
499 
500  // store observed (case sensitive and with special characters)
501  std::string seq_decoy = StringUtils::suffix(seq, match.length());
502  ds.decoy_case_sensitive[match] = seq_decoy;
503  }
504  }
505  }
506  return ds;
507  }
508 
509 private:
510  using DecoyStringToAffixCount = std::unordered_map<std::string, std::pair<Size, Size>>;
511  using CaseInsensitiveToCaseSensitiveDecoy = std::unordered_map<std::string, std::string>;
512 };
513 
514 } // namespace OpenMS
515 
#define OPENMS_LOG_DEBUG
Macro for general debugging information.
Definition: LogStream.h:454
#define OPENMS_LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged.
Definition: LogStream.h:444
#define OPENMS_LOG_ERROR
Macro to be used if non-fatal error are reported (processing continues)
Definition: LogStream.h:439
Helper class for calculations on decoy proteins.
Definition: FASTAContainer.h:318
std::unordered_map< std::string, std::pair< Size, Size > > DecoyStringToAffixCount
Definition: FASTAContainer.h:510
std::unordered_map< std::string, std::string > CaseInsensitiveToCaseSensitiveDecoy
Definition: FASTAContainer.h:511
static Result findDecoyString(FASTAContainer< T > &proteins)
Heuristic to determine the decoy string given a set of protein names.
Definition: FASTAContainer.h:370
static const std::string regexstr_prefix
Definition: FASTAContainer.h:359
static DecoyStatistics countDecoys(FASTAContainer< T > &proteins)
Function to count the occurrences of decoy strings in a given set of protein names.
Definition: FASTAContainer.h:446
static const std::vector< std::string > affixes
Definition: FASTAContainer.h:356
static const std::string regexstr_suffix
Definition: FASTAContainer.h:360
Int overflow exception.
Definition: Exception.h:221
std::vector< FASTAFile::FASTAEntry > data_bg_
prefetched (background) data; will become the next active data
Definition: FASTAContainer.h:207
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos)
Retrieve a FASTA entry at global position pos (must not be behind the currently active chunk,...
Definition: FASTAContainer.h:156
std::string filename_
FASTA file name.
Definition: FASTAContainer.h:209
size_t size() const
NOT the number of entries in the FASTA file, but merely the number of already read entries (since we ...
Definition: FASTAContainer.h:198
bool empty()
is the FASTA file empty?
Definition: FASTAContainer.h:177
std::vector< std::streampos > offsets_
internal byte offsets into FASTA file for random access reading of previous entries.
Definition: FASTAContainer.h:205
bool activateCache()
Swaps in the background cache of entries, read previously via cacheChunk()
Definition: FASTAContainer.h:96
FASTAContainer(const String &FASTA_file)
C'tor with FASTA filename.
Definition: FASTAContainer.h:73
size_t chunk_offset_
number of entries before the current chunk
Definition: FASTAContainer.h:208
bool cacheChunk(int suggested_size)
Prefetch a new cache in the background, with up to suggested_size entries (or fewer upon reaching end...
Definition: FASTAContainer.h:110
FASTAFile f_
FASTA file connection.
Definition: FASTAContainer.h:204
std::vector< FASTAFile::FASTAEntry > data_fg_
active (foreground) data
Definition: FASTAContainer.h:206
size_t chunkSize() const
number of entries in active cache
Definition: FASTAContainer.h:126
void reset()
resets reading of the FASTA file, enables fresh reading of the FASTA from the beginning
Definition: FASTAContainer.h:183
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
Retrieve a FASTA entry at cache position pos (fast)
Definition: FASTAContainer.h:138
size_t getChunkOffset() const
how many entries were read and got swapped out already
Definition: FASTAContainer.h:85
FASTAContainer(const std::vector< FASTAFile::FASTAEntry > &data)
C'tor for already existing data (by reference).
Definition: FASTAContainer.h:229
size_t size() const
calls size() on underlying vector
Definition: FASTAContainer.h:296
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos) const
fast access to an entry
Definition: FASTAContainer.h:283
bool empty() const
calls empty() on underlying vector
Definition: FASTAContainer.h:290
bool cacheChunk(int)
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:257
bool activateCache()
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:244
const std::vector< FASTAFile::FASTAEntry > & data_
reference to existing data
Definition: FASTAContainer.h:309
size_t chunkSize() const
active data spans the full range, i.e. size of container
Definition: FASTAContainer.h:271
void reset()
required for template parameters!
Definition: FASTAContainer.h:302
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
fast access to chunked (i.e. all) entries
Definition: FASTAContainer.h:277
size_t getChunkOffset() const
always 0, since this specialization requires/supports no chunking
Definition: FASTAContainer.h:235
This class serves for reading in and writing FASTA files If the protein/gene sequence contains unusua...
Definition: FASTAFile.h:35
A more convenient string class.
Definition: String.h:34
String & toLower()
Converts the string to lowercase.
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:108
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:101
static String suffix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:131
static String prefix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:122
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:22
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:56
struct for intermediate results needed for calculations on decoy proteins
Definition: FASTAContainer.h:338
std::unordered_map< std::string, std::pair< Size, Size > > decoy_count
map decoys to counts of occurrences as prefix/suffix
Definition: FASTAContainer.h:339
Size all_proteins_count
number of all checked proteins
Definition: FASTAContainer.h:343
Size all_suffix_occur
number of proteins with found decoy_suffix
Definition: FASTAContainer.h:342
std::unordered_map< std::string, std::string > decoy_case_sensitive
map case insensitive strings back to original case (as used in fasta)
Definition: FASTAContainer.h:340
Size all_prefix_occur
number of proteins with found decoy_prefix
Definition: FASTAContainer.h:341
bool operator==(const DecoyStatistics &rhs) const
Definition: FASTAContainer.h:345
Definition: FASTAContainer.h:321
bool operator==(const Result &rhs) const
Definition: FASTAContainer.h:326
bool is_prefix
on success, was it a prefix or suffix
Definition: FASTAContainer.h:324
bool success
did more than 40% of proteins have the *same* prefix or suffix
Definition: FASTAContainer.h:322
String name
on success, what was the decoy string?
Definition: FASTAContainer.h:323
FASTA entry type (identifier, description and sequence) The first String corresponds to the identifie...
Definition: FASTAFile.h:46