46 #include <unordered_map>
51 #include <boost/regex.hpp>
81 template<
typename TBackend>
105 filename_(FASTA_file)
107 f_.readStart(FASTA_file);
113 return chunk_offset_;
124 chunk_offset_ += data_fg_.size();
125 data_fg_.swap(data_bg_);
127 return !data_fg_.empty();
139 data_bg_.reserve(suggested_size);
141 for (
int i = 0; i < suggested_size; ++i)
143 std::streampos spos = f_.position();
144 if (!f_.readNext(p))
break;
145 data_bg_.push_back(std::move(p));
146 offsets_.push_back(spos);
148 return !data_bg_.empty();
154 return data_fg_.size();
166 return data_fg_[pos];
185 if (chunk_offset_ <= pos && pos < chunk_offset_ + chunkSize())
187 protein = data_fg_[pos - chunk_offset_];
191 if (pos >= offsets_.size())
195 std::streampos spos = f_.position();
196 if (!f_.setPosition(offsets_[pos]))
return false;
197 bool r = f_.readNext(protein);
198 f_.setPosition(spos);
205 return f_.atEnd() && offsets_.empty();
215 f_.readStart(filename_);
226 return offsets_.size();
272 if (!activate_count_)
311 protein = data_[pos];
318 return data_.empty();
335 const std::vector<FASTAFile::FASTAEntry>&
data_;
336 int activate_count_ = 0;
337 int cache_count_ = 0;
354 inline static const std::vector<std::string>
affixes = {
"decoy",
"dec",
"reverse",
"rev",
"reversed",
"__id_decoy",
"xxx",
"shuffled",
"shuffle",
"pseudo",
"random" };
357 inline static const std::string
regexstr_prefix = std::string(
"^(") + ListUtils::concatenate<std::string>(
affixes,
"_*|") +
"_*)";
358 inline static const std::string
regexstr_suffix = std::string(
"(_") + ListUtils::concatenate<std::string>(
affixes,
"*|_") +
")$";
382 Size all_prefix_occur(0), all_suffix_occur(0), all_proteins_count(0);
384 constexpr
size_t PROTEIN_CACHE_SIZE = 4e5;
388 proteins.cacheChunk(PROTEIN_CACHE_SIZE);
389 if (!proteins.activateCache())
break;
391 auto prot_count = (
SignedSize)proteins.chunkSize();
392 all_proteins_count += prot_count;
397 String seq = proteins.chunkAt(i).identifier;
403 bool found_prefix = boost::regex_search(seq_lower, sm, pattern_prefix);
406 std::string match = sm[0];
410 decoy_count[match].first++;
414 decoy_case_sensitive[match] = seq_decoy;
418 bool found_suffix = boost::regex_search(seq_lower, sm, pattern_suffix);
421 std::string match = sm[0];
425 decoy_count[match].second++;
429 decoy_case_sensitive[match] = seq_decoy;
435 for (
auto &a : decoy_count)
437 OPENMS_LOG_DEBUG << a.first <<
"\t" << a.second.first <<
"\t" << a.second.second << std::endl;
442 if (
static_cast<double>(all_prefix_occur + all_suffix_occur) < 0.4 *
static_cast<double>(all_proteins_count))
444 OPENMS_LOG_ERROR <<
"Unable to determine decoy string (not enough occurrences; <40%)!" << std::endl;
445 return {
false,
"?",
true};
448 if (all_prefix_occur == all_suffix_occur)
450 OPENMS_LOG_ERROR <<
"Unable to determine decoy string (prefix and suffix occur equally often)!" << std::endl;
451 return {
false,
"?",
true};
455 for (
const auto& pair : decoy_count)
457 const std::string & case_insensitive_decoy_string = pair.first;
458 const std::pair<Size, Size>& prefix_suffix_counts = pair.second;
459 double freq_prefix =
static_cast<double>(prefix_suffix_counts.first) /
static_cast<double>(all_prefix_occur);
460 double freq_prefix_in_proteins =
static_cast<double>(prefix_suffix_counts.first) /
static_cast<double>(all_proteins_count);
462 if (freq_prefix >= 0.8 && freq_prefix_in_proteins >= 0.4)
464 if (prefix_suffix_counts.first != all_prefix_occur)
466 OPENMS_LOG_WARN <<
"More than one decoy prefix observed!" << std::endl;
467 OPENMS_LOG_WARN <<
"Using most frequent decoy prefix (" << (int)(freq_prefix * 100) <<
"%)" << std::endl;
470 return {
true, decoy_case_sensitive[case_insensitive_decoy_string],
true};
475 for (
const auto& pair : decoy_count)
477 const std::string& case_insensitive_decoy_string = pair.first;
478 const std::pair<Size, Size>& prefix_suffix_counts = pair.second;
479 double freq_suffix =
static_cast<double>(prefix_suffix_counts.second) /
static_cast<double>(all_suffix_occur);
480 double freq_suffix_in_proteins =
static_cast<double>(prefix_suffix_counts.second) /
static_cast<double>(all_proteins_count);
482 if (freq_suffix >= 0.8 && freq_suffix_in_proteins >= 0.4)
484 if (prefix_suffix_counts.second != all_suffix_occur)
486 OPENMS_LOG_WARN <<
"More than one decoy suffix observed!" << std::endl;
487 OPENMS_LOG_WARN <<
"Using most frequent decoy suffix (" << (int)(freq_suffix * 100) <<
"%)" << std::endl;
490 return {
true, decoy_case_sensitive[case_insensitive_decoy_string],
false};
494 OPENMS_LOG_ERROR <<
"Unable to determine decoy string and its position. Please provide a decoy string and its position as parameters." << std::endl;
495 return {
false,
"?",
true};
#define OPENMS_LOG_DEBUG
Macro for general debugging information.
Definition: LogStream.h:470
#define OPENMS_LOG_WARN
Macro to be used if a warning (a piece of information which should be read by the user) should be logged.
Definition: LogStream.h:460
#define OPENMS_LOG_ERROR
Macro to be used if non-fatal errors are reported (processing continues)
Definition: LogStream.h:455
Helper class for calculations on decoy proteins.
Definition: FASTAContainer.h:344
std::unordered_map< std::string, std::pair< Size, Size > > DecoyStringToAffixCount
Definition: FASTAContainer.h:499
std::unordered_map< std::string, std::string > CaseInsensitiveToCaseSensitiveDecoy
Definition: FASTAContainer.h:500
bool is_prefix
on success, was it a prefix or suffix
Definition: FASTAContainer.h:350
static Result findDecoyString(FASTAContainer< T > &proteins)
Heuristic to determine the decoy string given a set of protein names.
Definition: FASTAContainer.h:368
static const std::string regexstr_prefix
Definition: FASTAContainer.h:357
bool success
did more than 40% of proteins have the *same* prefix or suffix?
Definition: FASTAContainer.h:348
String name
on success, what was the decoy string?
Definition: FASTAContainer.h:349
static const std::vector< std::string > affixes
Definition: FASTAContainer.h:354
static const std::string regexstr_suffix
Definition: FASTAContainer.h:358
Definition: FASTAContainer.h:347
Int overflow exception.
Definition: Exception.h:247
std::vector< FASTAFile::FASTAEntry > data_bg_
prefetched (background) data; will become the next active data
Definition: FASTAContainer.h:233
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos)
Retrieve a FASTA entry at global position pos (must not be behind the currently active chunk,...
Definition: FASTAContainer.h:182
std::string filename_
FASTA file name.
Definition: FASTAContainer.h:235
size_t size() const
NOT the number of entries in the FASTA file, but merely the number of already read entries (since we ...
Definition: FASTAContainer.h:224
bool empty()
is the FASTA file empty?
Definition: FASTAContainer.h:203
std::vector< std::streampos > offsets_
internal byte offsets into FASTA file for random access reading of previous entries.
Definition: FASTAContainer.h:231
bool activateCache()
Swaps in the background cache of entries, read previously via cacheChunk()
Definition: FASTAContainer.h:122
FASTAContainer(const String &FASTA_file)
C'tor with FASTA filename.
Definition: FASTAContainer.h:99
size_t chunk_offset_
number of entries before the current chunk
Definition: FASTAContainer.h:234
bool cacheChunk(int suggested_size)
Prefetch a new cache in the background, with up to suggested_size entries (or fewer upon reaching end...
Definition: FASTAContainer.h:136
FASTAFile f_
FASTA file connection.
Definition: FASTAContainer.h:230
std::vector< FASTAFile::FASTAEntry > data_fg_
active (foreground) data
Definition: FASTAContainer.h:232
size_t chunkSize() const
number of entries in active cache
Definition: FASTAContainer.h:152
void reset()
resets reading of the FASTA file, enables fresh reading of the FASTA from the beginning
Definition: FASTAContainer.h:209
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
Retrieve a FASTA entry at cache position pos (fast)
Definition: FASTAContainer.h:164
size_t getChunkOffset() const
how many entries were read and got swapped out already
Definition: FASTAContainer.h:111
FASTAContainer(const std::vector< FASTAFile::FASTAEntry > &data)
C'tor for already existing data (by reference).
Definition: FASTAContainer.h:255
size_t size() const
calls size() on underlying vector
Definition: FASTAContainer.h:322
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos) const
fast access to an entry
Definition: FASTAContainer.h:309
bool empty() const
calls empty() on underlying vector
Definition: FASTAContainer.h:316
bool cacheChunk(int)
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:283
bool activateCache()
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:270
const std::vector< FASTAFile::FASTAEntry > & data_
reference to existing data
Definition: FASTAContainer.h:335
size_t chunkSize() const
active data spans the full range, i.e. size of container
Definition: FASTAContainer.h:297
void reset()
required for template parameters!
Definition: FASTAContainer.h:328
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
fast access to chunked (i.e. all) entries
Definition: FASTAContainer.h:303
size_t getChunkOffset() const
always 0, since this specialization requires/supports no chunking
Definition: FASTAContainer.h:261
This class serves for reading in and writing FASTA files. If the protein/gene sequence contains unusua...
Definition: FASTAFile.h:61
A more convenient string class.
Definition: String.h:60
String & toLower()
Converts the string to lowercase.
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
static String suffix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:156
static String prefix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:147
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:82
FASTA entry type (identifier, description and sequence). The first String corresponds to the identifie...
Definition: FASTAFile.h:72