OpenMS  2.6.0
PeptideIndexing.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2020.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Andreas Bertsch, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 
54 #include <OpenMS/SYSTEM/SysInfo.h>
55 
56 #include <atomic>
57 #include <algorithm>
58 #include <fstream>
59 
60 
61 namespace OpenMS
62 {
63 
126  class OPENMS_DLLAPI PeptideIndexing :
127  public DefaultParamHandler, public ProgressLogger
128  {
129 public:
130 
132  static char const* const AUTO_MODE; /* = 'auto' */
133 
136  {
141  UNEXPECTED_RESULT
142  };
143 
145  enum class Unmatched
146  {
147  IS_ERROR,
148  WARN,
149  REMOVE,
150  SIZE_OF_UNMATCHED
151  };
152  static const std::array<std::string, (Size)Unmatched::SIZE_OF_UNMATCHED> names_of_unmatched;
153 
154  enum class MissingDecoy
155  {
156  IS_ERROR,
157  WARN,
158  SILENT,
159  SIZE_OF_MISSING_DECOY
160  };
161  static const std::array<std::string, (Size)MissingDecoy::SIZE_OF_MISSING_DECOY> names_of_missing_decoy;
162 
164  PeptideIndexing();
165 
167  ~PeptideIndexing() override;
168 
169 
171  inline ExitCodes run(std::vector<FASTAFile::FASTAEntry>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
172  {
173  FASTAContainer<TFI_Vector> protein_container(proteins);
174  return run<TFI_Vector>(protein_container, prot_ids, pep_ids);
175  }
176 
212  template<typename T>
213  ExitCodes run(FASTAContainer<T>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
214  {
215  // no decoy string provided? try to deduce from data
216  if (decoy_string_.empty())
217  {
218  auto r = DecoyHelper::findDecoyString(proteins);
219  proteins.reset();
220  if (!r.success)
221  {
222  r.is_prefix = true;
223  r.name = "DECOY_";
224  OPENMS_LOG_WARN << "Unable to determine decoy string automatically (not enough decoys were detected)! Using default " << (r.is_prefix ? "prefix" : "suffix") << " decoy string '" << r.name << "'\n"
225  << "If you think that this is incorrect, please provide a decoy_string and its position manually!" << std::endl;
226  }
227  prefix_ = r.is_prefix;
228  decoy_string_ = r.name;
229  // decoy string and position was extracted successfully
230  OPENMS_LOG_INFO << "Using " << (prefix_ ? "prefix" : "suffix") << " decoy string '" << decoy_string_ << "'" << std::endl;
231  }
232 
233  //---------------------------------------------------------------
234  // parsing parameters, correcting xtandem and MSGFPlus parameters
235  //---------------------------------------------------------------
236  ProteaseDigestion enzyme;
237  if (!enzyme_name_.empty() && (enzyme_name_.compare(AUTO_MODE) != 0))
238  { // use param (not empty, not 'auto')
239  enzyme.setEnzyme(enzyme_name_);
240  }
241  else if (!prot_ids.empty() && prot_ids[0].getSearchParameters().digestion_enzyme.getName() != "unknown_enzyme")
242  { // take from meta (this assumes all runs used the same enzyme)
243  OPENMS_LOG_INFO << "Info: using '" << prot_ids[0].getSearchParameters().digestion_enzyme.getName() << "' as enzyme (obtained from idXML) for digestion." << std::endl;
244  enzyme.setEnzyme(&prot_ids[0].getSearchParameters().digestion_enzyme);
245  }
246  else
247  { // fallback
248  OPENMS_LOG_WARN << "Warning: Enzyme name neither given nor deduceable from input. Defaulting to Trypsin!" << std::endl;
249  enzyme.setEnzyme("Trypsin");
250  }
251 
252  bool xtandem_fix_parameters = true;
253  bool msgfplus_fix_parameters = true;
254 
255  // determine if search engine is solely xtandem or MSGFPlus
256  for (const auto& prot_id : prot_ids)
257  {
258  String search_engine = prot_id.getOriginalSearchEngineName();
259  StringUtils::toUpper(search_engine);
260  OPENMS_LOG_INFO << "Peptide identification engine: " << search_engine << std::endl;
261  if (search_engine != "XTANDEM") { xtandem_fix_parameters = false; }
262  if (!(search_engine == "MSGFPLUS" || search_engine == "MS-GF+")) { msgfplus_fix_parameters = false; }
263  }
264 
265  // solely MSGFPlus -> Trypsin/P as enzyme
266  if (msgfplus_fix_parameters && enzyme.getEnzymeName() == "Trypsin")
267  {
268  OPENMS_LOG_WARN << "MSGFPlus detected but enzyme cutting rules were set to Trypsin. Correcting to Trypsin/P to copy with special cutting rule in MSGFPlus." << std::endl;
269  enzyme.setEnzyme("Trypsin/P");
270  }
271 
272  OPENMS_LOG_INFO << "Enzyme: " << enzyme.getEnzymeName() << std::endl;
273 
274  if (!enzyme_specificity_.empty() && (enzyme_specificity_.compare(AUTO_MODE) != 0))
275  { // use param (not empty and not 'auto')
276  enzyme.setSpecificity(ProteaseDigestion::getSpecificityByName(enzyme_specificity_));
277  }
278  else if (!prot_ids.empty() && prot_ids[0].getSearchParameters().enzyme_term_specificity != ProteaseDigestion::SPEC_UNKNOWN)
279  { // deduce from data ('auto')
280  enzyme.setSpecificity(prot_ids[0].getSearchParameters().enzyme_term_specificity);
281  OPENMS_LOG_INFO << "Info: using '" << EnzymaticDigestion::NamesOfSpecificity[prot_ids[0].getSearchParameters().enzyme_term_specificity] << "' as enzyme specificity (obtained from idXML) for digestion." << std::endl;
282  }
283  else
284  { // fallback
285  OPENMS_LOG_WARN << "Warning: Enzyme specificity neither given nor present in the input file. Defaulting to 'full'!" << std::endl;
287  }
288 
289  //-------------------------------------------------------------
290  // calculations
291  //-------------------------------------------------------------
292  // cache the first proteins
293  const size_t PROTEIN_CACHE_SIZE = 4e5; // 400k should be enough for most DB's and is not too hard on memory either (~200 MB FASTA)
294 
295  this->startProgress(0, 1, "Load first DB chunk");
296  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
297  this->endProgress();
298 
299  if (proteins.empty()) // we do not allow an empty database
300  {
301  OPENMS_LOG_ERROR << "Error: An empty database was provided. Mapping makes no sense. Aborting..." << std::endl;
302  return DATABASE_EMPTY;
303  }
304 
305  if (pep_ids.empty()) // Aho-Corasick requires non-empty input; but we allow this case, since the TOPP tool should not crash when encountering a bad raw file (with no PSMs)
306  {
307  OPENMS_LOG_WARN << "Warning: An empty set of peptide identifications was provided. Output will be empty as well." << std::endl;
308  if (!keep_unreferenced_proteins_)
309  {
310  // delete only protein hits, not whole ID runs incl. meta data:
311  for (std::vector<ProteinIdentification>::iterator it = prot_ids.begin();
312  it != prot_ids.end(); ++it)
313  {
314  it->getHits().clear();
315  }
316  }
317  return PEPTIDE_IDS_EMPTY;
318  }
319 
320  FoundProteinFunctor func(enzyme, xtandem_fix_parameters); // store the matches
321  Map<String, Size> acc_to_prot; // map: accessions --> FASTA protein index
322  std::vector<bool> protein_is_decoy; // protein index -> is decoy?
323  std::vector<std::string> protein_accessions; // protein index -> accession
324 
325  bool invalid_protein_sequence = false; // check for proteins with modifications, i.e. '[' or '(', and throw an exception
326 
327  { // new scope - forget data after search
328 
329  /*
330  BUILD Peptide DB
331  */
332  bool has_illegal_AAs(false);
334  for (std::vector<PeptideIdentification>::const_iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
335  {
336  //String run_id = it1->getIdentifier();
337  const std::vector<PeptideHit>& hits = it1->getHits();
338  for (std::vector<PeptideHit>::const_iterator it2 = hits.begin(); it2 != hits.end(); ++it2)
339  {
340  //
341  // Warning:
342  // do not skip over peptides here, since the results are iterated in the same way
343  //
344  String seq = it2->getSequence().toUnmodifiedString().remove('*'); // make a copy, i.e. do NOT change the peptide sequence!
345  if (seqan::isAmbiguous(seqan::AAString(seq.c_str())))
346  { // do not quit here, to show the user all sequences .. only quit after loop
347  OPENMS_LOG_ERROR << "Peptide sequence '" << it2->getSequence() << "' contains one or more ambiguous amino acids (B|J|Z|X).\n";
348  has_illegal_AAs = true;
349  }
350  if (IL_equivalent_) // convert L to I;
351  {
352  seq.substitute('L', 'I');
353  }
354  appendValue(pep_DB, seq.c_str());
355  }
356  }
357  if (has_illegal_AAs)
358  {
359  OPENMS_LOG_ERROR << "One or more peptides contained illegal amino acids. This is not allowed!"
360  << "\nPlease either remove the peptide or replace it with one of the unambiguous ones (while allowing for ambiguous AA's to match the protein)." << std::endl;;
361  }
362 
363  OPENMS_LOG_INFO << "Mapping " << length(pep_DB) << " peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ? "? (unknown number of)" : String(proteins.size())) << " proteins." << std::endl;
364 
365  if (length(pep_DB) == 0)
366  { // Aho-Corasick will crash if given empty needles as input
367  OPENMS_LOG_WARN << "Warning: Peptide identifications have no hits inside! Output will be empty as well." << std::endl;
368  return PEPTIDE_IDS_EMPTY;
369  }
370 
371  /*
372  Aho Corasick (fast)
373  */
374  OPENMS_LOG_INFO << "Searching with up to " << aaa_max_ << " ambiguous amino acid(s) and " << mm_max_ << " mismatch(es)!" << std::endl;
376  OPENMS_LOG_INFO << "Building trie ...";
377  StopWatch s;
378  s.start();
380  AhoCorasickAmbiguous::initPattern(pep_DB, aaa_max_, mm_max_, pattern);
381  s.stop();
382  OPENMS_LOG_INFO << " done (" << int(s.getClockTime()) << "s)" << std::endl;
383  s.reset();
384 
385  uint16_t count_j_proteins(0);
386  bool has_active_data = true; // becomes false if end of FASTA file is reached
387  const std::string jumpX(aaa_max_ + mm_max_ + 1, 'X'); // jump over stretches of 'X' which cost a lot of time; +1 because AXXA is a valid hit for aaa_max == 2 (cannot split it)
388  // use very large target value for progress if DB size is unknown (did not fit into first chunk)
389  this->startProgress(0, proteins.size() == PROTEIN_CACHE_SIZE ? std::numeric_limits<SignedSize>::max() : proteins.size(), "Aho-Corasick");
390  std::atomic<int> progress_prots(0);
391 #ifdef _OPENMP
392 #pragma omp parallel
393 #endif
394  {
395  FoundProteinFunctor func_threads(enzyme, xtandem_fix_parameters);
396  Map<String, Size> acc_to_prot_thread; // map: accessions --> FASTA protein index
397  AhoCorasickAmbiguous fuzzyAC;
398  String prot;
399 
400  while (true)
401  {
402  #pragma omp barrier // all threads need to be here, since we are about to swap protein data
403  #pragma omp single
404  {
405  DEBUG_ONLY std::cerr << " activating cache ...\n";
406  has_active_data = proteins.activateCache(); // swap in last cache
407  protein_accessions.resize(proteins.getChunkOffset() + proteins.chunkSize());
408  } // implicit barrier here
409 
410  if (!has_active_data) break; // leave while-loop
411  SignedSize prot_count = (SignedSize)proteins.chunkSize();
412 
413  #pragma omp master
414  {
415  DEBUG_ONLY std::cerr << "Filling Protein Cache ...";
416  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
417  protein_is_decoy.resize(proteins.getChunkOffset() + prot_count);
418  for (SignedSize i = 0; i < prot_count; ++i)
419  { // do this in master only, to avoid false sharing
420  const String& seq = proteins.chunkAt(i).identifier;
421  protein_is_decoy[i + proteins.getChunkOffset()] = (prefix_ ? seq.hasPrefix(decoy_string_) : seq.hasSuffix(decoy_string_));
422  }
423  DEBUG_ONLY std::cerr << " done" << std::endl;
424  }
425  DEBUG_ONLY std::cerr << " starting for loop \n";
426  // search all peptides in each protein
427  #pragma omp for schedule(dynamic, 100) nowait
428  for (SignedSize i = 0; i < prot_count; ++i)
429  {
430  ++progress_prots; // atomic
431  if (omp_get_thread_num() == 0)
432  {
433  this->setProgress(progress_prots);
434  }
435 
436  prot = proteins.chunkAt(i).sequence;
437  prot.remove('*');
438 
439  // check for invalid sequences with modifications
440  if (prot.has('[') || prot.has('('))
441  {
442  invalid_protein_sequence = true; // not omp-critical because its write-only
443  // we cannot throw an exception here, since we'd need to catch it within the parallel region
444  }
445 
446  // convert L/J to I; also replace 'J' in proteins
447  if (IL_equivalent_)
448  {
449  prot.substitute('L', 'I');
450  prot.substitute('J', 'I');
451  }
452  else
453  { // warn if 'J' is found (it eats into aaa_max)
454  if (prot.has('J'))
455  {
456  #pragma omp atomic
457  ++count_j_proteins;
458  }
459  }
460 
461  Size prot_idx = i + proteins.getChunkOffset();
462 
463  // test if protein was a hit
464  Size hits_total = func_threads.filter_passed + func_threads.filter_rejected;
465 
466  // check if there are stretches of 'X'
467  if (prot.has('X'))
468  {
469  // create chunks of the protein (splitting it at stretches of 'X..X') and feed them to AC one by one
470  size_t offset = -1, start = 0;
471  while ((offset = prot.find(jumpX, offset + 1)) != std::string::npos)
472  {
473  //std::cout << "found X..X at " << offset << " in protein " << proteins[i].identifier << "\n";
474  addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start, offset + jumpX.size() - start), prot, prot_idx, (int)start, func_threads);
475  // skip ahead while we encounter more X...
476  while (offset + jumpX.size() < prot.size() && prot[offset + jumpX.size()] == 'X') ++offset;
477  start = offset;
478  //std::cout << " new start: " << start << "\n";
479  }
480  // last chunk
481  if (start < prot.size())
482  {
483  addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start), prot, prot_idx, (int)start, func_threads);
484  }
485  }
486  else
487  {
488  addHits_(fuzzyAC, pattern, pep_DB, prot, prot, prot_idx, 0, func_threads);
489  }
490  // was protein found?
491  if (hits_total < func_threads.filter_passed + func_threads.filter_rejected)
492  {
493  protein_accessions[prot_idx] = proteins.chunkAt(i).identifier;
494  acc_to_prot_thread[protein_accessions[prot_idx]] = prot_idx;
495  }
496  } // end parallel FOR
497 
498  // join results again
499  DEBUG_ONLY std::cerr << " critical now \n";
500  #ifdef _OPENMP
501  #pragma omp critical(PeptideIndexer_joinAC)
502  #endif
503  {
504  s.start();
505  // hits
506  func.merge(func_threads);
507  // accession -> index
508  acc_to_prot.insert(acc_to_prot_thread.begin(), acc_to_prot_thread.end());
509  acc_to_prot_thread.clear();
510  s.stop();
511  } // OMP end critical
512  } // end readChunk
513  } // OMP end parallel
514  this->endProgress();
515  std::cout << "Merge took: " << s.toString() << "\n";
516  mu.after();
517  std::cout << mu.delta("Aho-Corasick") << "\n\n";
518 
519  OPENMS_LOG_INFO << "\nAho-Corasick done:\n found " << func.filter_passed << " hits for " << func.pep_to_prot.size() << " of " << length(pep_DB) << " peptides.\n";
520 
521  // write some stats
522  OPENMS_LOG_INFO << "Peptide hits passing enzyme filter: " << func.filter_passed << "\n"
523  << " ... rejected by enzyme filter: " << func.filter_rejected << std::endl;
524 
525  if (count_j_proteins)
526  {
527  OPENMS_LOG_WARN << "PeptideIndexer found " << count_j_proteins << " protein sequences in your database containing the amino acid 'J'."
528  << "To match 'J' in a protein, an ambiguous amino acid placeholder for I/L will be used.\n"
529  << "This costs runtime and eats into the 'aaa_max' limit, leaving less opportunity for B/Z/X matches.\n"
530  << "If you want 'J' to be treated as unambiguous, enable '-IL_equivalent'!" << std::endl;
531  }
532 
533  } // end local scope
534 
535  //
536  // do mapping
537  //
538  // index existing proteins
539  Map<String, Size> runid_to_runidx; // identifier to index
540  for (Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
541  {
542  runid_to_runidx[prot_ids[run_idx].getIdentifier()] = run_idx;
543  }
544 
545  // for peptides --> proteins
546  Size stats_matched_unique(0);
547  Size stats_matched_multi(0);
548  Size stats_unmatched(0); // no match to DB
549  Size stats_count_m_t(0); // match to Target DB
550  Size stats_count_m_d(0); // match to Decoy DB
551  Size stats_count_m_td(0); // match to T+D DB
552 
553  Map<Size, std::set<Size> > runidx_to_protidx; // in which protID do appear which proteins (according to mapped peptides)
554 
555  Size pep_idx(0);
556  for (std::vector<PeptideIdentification>::iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
557  {
558  // which ProteinIdentification does the peptide belong to?
559  Size run_idx = runid_to_runidx[it1->getIdentifier()];
560 
561  std::vector<PeptideHit>& hits = it1->getHits();
562 
563  for (std::vector<PeptideHit>::iterator it_hit = hits.begin(); it_hit != hits.end(); /* no increase here! we might need to skip it; see below */)
564  {
565  // clear protein accessions
566  it_hit->setPeptideEvidences(std::vector<PeptideEvidence>());
567 
568  //
569  // is this a decoy hit?
570  //
571  bool matches_target(false);
572  bool matches_decoy(false);
573 
574  std::set<Size> prot_indices;
575  // add new protein references
576  for (std::set<PeptideProteinMatchInformation>::const_iterator it_i = func.pep_to_prot[pep_idx].begin();
577  it_i != func.pep_to_prot[pep_idx].end(); ++it_i)
578  {
579  prot_indices.insert(it_i->protein_index);
580  const String& accession = protein_accessions[it_i->protein_index];
581  PeptideEvidence pe(accession, it_i->position, it_i->position + (int)it_hit->getSequence().size() - 1, it_i->AABefore, it_i->AAAfter);
582  it_hit->addPeptideEvidence(pe);
583 
584  runidx_to_protidx[run_idx].insert(it_i->protein_index); // fill protein hits
585 
586  if (protein_is_decoy[it_i->protein_index])
587  {
588  matches_decoy = true;
589  }
590  else
591  {
592  matches_target = true;
593  }
594  }
595  ++pep_idx; // next hit
596 
597  if (matches_decoy && matches_target)
598  {
599  it_hit->setMetaValue("target_decoy", "target+decoy");
600  ++stats_count_m_td;
601  }
602  else if (matches_target)
603  {
604  it_hit->setMetaValue("target_decoy", "target");
605  ++stats_count_m_t;
606  }
607  else if (matches_decoy)
608  {
609  it_hit->setMetaValue("target_decoy", "decoy");
610  ++stats_count_m_d;
611  } // else: could match to no protein (i.e. both are false)
612  //else ... // not required (handled below; see stats_unmatched);
613 
614  if (prot_indices.size() == 1)
615  {
616  it_hit->setMetaValue("protein_references", "unique");
617  ++stats_matched_unique;
618  }
619  else if (prot_indices.size() > 1)
620  {
621  it_hit->setMetaValue("protein_references", "non-unique");
622  ++stats_matched_multi;
623  }
624  else
625  {
626  ++stats_unmatched;
627  if (stats_unmatched < 15) OPENMS_LOG_INFO << "Unmatched peptide: " << it_hit->getSequence() << "\n";
628  else if (stats_unmatched == 15) OPENMS_LOG_INFO << "Unmatched peptide: ...\n";
629  if (unmatched_action_ == Unmatched::REMOVE)
630  {
631  it_hit = hits.erase(it_hit);
632  continue; // already points to the next hit
633  }
634  else
635  {
636  it_hit->setMetaValue("protein_references", "unmatched");
637  }
638  }
639 
640  ++it_hit; // next hit
641  } // all hits
642 
643  } // next PepID
644 
645  Size total_peptides = stats_count_m_t + stats_count_m_d + stats_count_m_td + stats_unmatched;
646  OPENMS_LOG_INFO << "-----------------------------------\n";
647  OPENMS_LOG_INFO << "Peptide statistics\n";
648  OPENMS_LOG_INFO << "\n";
649  OPENMS_LOG_INFO << " unmatched : " << stats_unmatched << " (" << stats_unmatched * 100 / total_peptides << " %)\n";
650  OPENMS_LOG_INFO << " target/decoy:\n";
651  OPENMS_LOG_INFO << " match to target DB only: " << stats_count_m_t << " (" << stats_count_m_t * 100 / total_peptides << " %)\n";
652  OPENMS_LOG_INFO << " match to decoy DB only : " << stats_count_m_d << " (" << stats_count_m_d * 100 / total_peptides << " %)\n";
653  OPENMS_LOG_INFO << " match to both : " << stats_count_m_td << " (" << stats_count_m_td * 100 / total_peptides << " %)\n";
654  OPENMS_LOG_INFO << "\n";
655  OPENMS_LOG_INFO << " mapping to proteins:\n";
656  OPENMS_LOG_INFO << " no match (to 0 protein) : " << stats_unmatched << "\n";
657  OPENMS_LOG_INFO << " unique match (to 1 protein) : " << stats_matched_unique << "\n";
658  OPENMS_LOG_INFO << " non-unique match (to >1 protein): " << stats_matched_multi << std::endl;
659 
661  Size stats_matched_proteins(0), stats_matched_new_proteins(0), stats_orphaned_proteins(0), stats_proteins_target(0), stats_proteins_decoy(0);
662 
663  // all peptides contain the correct protein hit references, now update the protein hits
664  for (Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
665  {
666  std::set<Size> masterset = runidx_to_protidx[run_idx]; // all protein matches from above
667 
668  std::vector<ProteinHit>& phits = prot_ids[run_idx].getHits();
669  {
670  // go through existing protein hits and count orphaned proteins (with no peptide hits)
671  std::vector<ProteinHit> orphaned_hits;
672  for (std::vector<ProteinHit>::iterator p_hit = phits.begin(); p_hit != phits.end(); ++p_hit)
673  {
674  const String& acc = p_hit->getAccession();
675  if (!acc_to_prot.has(acc)) // acc_to_prot only contains found proteins from current run
676  { // old hit is orphaned
677  ++stats_orphaned_proteins;
678  if (keep_unreferenced_proteins_)
679  {
680  p_hit->setMetaValue("target_decoy", "");
681  orphaned_hits.push_back(*p_hit);
682  }
683  }
684  }
685  // only keep orphaned hits (if any)
686  phits = orphaned_hits;
687  }
688 
689  // add new protein hits
691  phits.reserve(phits.size() + masterset.size());
692  for (std::set<Size>::const_iterator it = masterset.begin(); it != masterset.end(); ++it)
693  {
694  ProteinHit hit;
695  hit.setAccession(protein_accessions[*it]);
696 
697  if (write_protein_sequence_ || write_protein_description_)
698  {
699  proteins.readAt(fe, *it);
700  if (write_protein_sequence_)
701  {
702  hit.setSequence(fe.sequence);
703  } // no else, since sequence is empty by default
704  if (write_protein_description_)
705  {
706  hit.setDescription(fe.description);
707  } // no else, since description is empty by default
708  }
709  if (protein_is_decoy[*it])
710  {
711  hit.setMetaValue("target_decoy", "decoy");
712  ++stats_proteins_decoy;
713  }
714  else
715  {
716  hit.setMetaValue("target_decoy", "target");
717  ++stats_proteins_target;
718  }
719  phits.push_back(hit);
720  ++stats_matched_new_proteins;
721  }
722  stats_matched_proteins += phits.size();
723  }
724 
725 
726  OPENMS_LOG_INFO << "-----------------------------------\n";
727  OPENMS_LOG_INFO << "Protein statistics\n";
728  OPENMS_LOG_INFO << "\n";
729  OPENMS_LOG_INFO << " total proteins searched: " << proteins.size() << "\n";
730  OPENMS_LOG_INFO << " matched proteins : " << stats_matched_proteins << " (" << stats_matched_new_proteins << " new)\n";
731  if (stats_matched_proteins)
732  { // prevent Division-by-0 Exception
733  OPENMS_LOG_INFO << " matched target proteins: " << stats_proteins_target << " (" << stats_proteins_target * 100 / stats_matched_proteins << " %)\n";
734  OPENMS_LOG_INFO << " matched decoy proteins : " << stats_proteins_decoy << " (" << stats_proteins_decoy * 100 / stats_matched_proteins << " %)\n";
735  }
736  OPENMS_LOG_INFO << " orphaned proteins : " << stats_orphaned_proteins << (keep_unreferenced_proteins_ ? " (all kept)" : " (all removed)\n");
737  OPENMS_LOG_INFO << "-----------------------------------" << std::endl;
738 
739 
741  bool has_error = false;
742 
743  if (invalid_protein_sequence)
744  {
745  OPENMS_LOG_ERROR << "Error: One or more protein sequences contained the characters '[' or '(', which are illegal in protein sequences."
746  << "\nPeptide hits might be masked by these characters (which usually indicate presence of modifications).\n";
747  has_error = true;
748  }
749 
750  if ((stats_count_m_d + stats_count_m_td) == 0)
751  {
752  String msg("No peptides were matched to the decoy portion of the database! Did you provide the correct concatenated database? Are your 'decoy_string' (=" + String(decoy_string_) + ") and 'decoy_string_position' (=" + String(param_.getValue("decoy_string_position")) + ") settings correct?");
753  if (missing_decoy_action_ == MissingDecoy::IS_ERROR)
754  {
755  OPENMS_LOG_ERROR << "Error: " << msg << "\nSet 'missing_decoy_action' to 'warn' if you are sure this is ok!\nAborting ..." << std::endl;
756  has_error = true;
757  }
758  else if (missing_decoy_action_ == MissingDecoy::WARN)
759  {
760  OPENMS_LOG_WARN << "Warn: " << msg << "\nSet 'missing_decoy_action' to 'error' if you want to elevate this to an error!" << std::endl;
761  }
762  else // silent
763  {
764  }
765  }
766 
767  if (stats_unmatched > 0)
768  {
769  OPENMS_LOG_ERROR << "PeptideIndexer found unmatched peptides, which could not be associated to a protein.\n";
770  if (unmatched_action_ == Unmatched::IS_ERROR)
771  {
773  << "Potential solutions:\n"
774  << " - check your FASTA database is identical to the search DB (or use 'auto')\n"
775  << " - set 'enzyme:specificity' and 'enzyme:name' to 'auto' to match the parameters of the search engine\n"
776  << " - increase 'aaa_max' to allow more ambiguous amino acids\n"
777  << " - as a last resort: use the 'unmatched_action' option to accept or even remove unmatched peptides\n"
778  << " (note that unmatched peptides cannot be used for FDR calculation or quantification)\n";
779  has_error = true;
780  }
781  else if (unmatched_action_ == Unmatched::WARN)
782  {
783  OPENMS_LOG_ERROR << " Warning: " << stats_unmatched << " unmatched hits have been found, but were not removed!\n"
784  << "These are not annotated with target/decoy information and might lead to issues with downstream tools (such as FDR).\n"
785  << "Switch to '" << names_of_unmatched[(Size)Unmatched::REMOVE] << "' if you want to avoid these problems.\n";
786  }
787  else if (unmatched_action_ == Unmatched::REMOVE)
788  {
789  OPENMS_LOG_ERROR << " Warning: " << stats_unmatched <<" unmatched hits have been removed!\n"
790  << "Make sure that these hits are actually a violation of the cutting rules by inspecting the database!\n";
791  if (xtandem_fix_parameters) OPENMS_LOG_ERROR << "Since the results are from X!Tandem, this is probably ok (check anyways).\n";
792  }
793  else
794  {
795  throw Exception::NotImplemented(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
796  }
797  }
798 
799 
800  if (has_error)
801  {
802  OPENMS_LOG_ERROR << "Result files will be written, but PeptideIndexer will exit with an error code." << std::endl;
803  return UNEXPECTED_RESULT;
804  }
805  return EXECUTION_OK;
806  }
807 
808  const String& getDecoyString() const;
809 
810  bool isPrefix() const;
811 
812  protected:
813 
815  {
816  OpenMS::Size protein_index; //< index of the protein the peptide is contained in
817  OpenMS::Int position; //< the position of the peptide in the protein
818  char AABefore; //< the amino acid after the peptide in the protein
819  char AAAfter; //< the amino acid before the peptide in the protein
820 
821  const std::tuple<const Size&, const Int&, const char&, const char&> tie() const
822  {
823  return std::tie(protein_index, position, AABefore, AAAfter);
824  }
825  bool operator<(const PeptideProteinMatchInformation& other) const
826  {
827  return tie() < other.tie();
828  }
830  {
831  return tie() == other.tie();
832  }
833  };
834 
836  {
837  public:
838  typedef std::map<OpenMS::Size, std::set<PeptideProteinMatchInformation> > MapType;
839  MapType pep_to_prot; //< peptide index --> protein indices
840  OpenMS::Size filter_passed; //< number of accepted hits (passing addHit() constraints)
841  OpenMS::Size filter_rejected; //< number of rejected hits (not passing addHit())
842 
843  private:
845  bool xtandem_; //< are we checking xtandem cleavage rules?
846 
847  public:
848  explicit FoundProteinFunctor(const ProteaseDigestion& enzyme, bool xtandem) :
849  pep_to_prot(), filter_passed(0), filter_rejected(0), enzyme_(enzyme), xtandem_(xtandem)
850  {
851  }
852 
854  {
855  if (pep_to_prot.empty())
856  { // first merge is easy
857  pep_to_prot.swap(other.pep_to_prot);
858  }
859  else
860  {
861  for (FoundProteinFunctor::MapType::const_iterator it = other.pep_to_prot.begin(); it != other.pep_to_prot.end(); ++it)
862  { // augment set
863  this->pep_to_prot[it->first].insert(other.pep_to_prot[it->first].begin(), other.pep_to_prot[it->first].end());
864  }
865  other.pep_to_prot.clear();
866  }
867  // cheap members
868  this->filter_passed += other.filter_passed;
869  other.filter_passed = 0;
870  this->filter_rejected += other.filter_rejected;
871  other.filter_rejected = 0;
872  }
873 
874  void addHit(const OpenMS::Size idx_pep,
875  const OpenMS::Size idx_prot,
876  const OpenMS::Size len_pep,
877  const OpenMS::String& seq_prot,
879  {
880  //TODO we could read and double-check missed cleavages as well
881  if (enzyme_.isValidProduct(seq_prot, position, len_pep, true, true, xtandem_))
882  {
884  {
885  idx_prot,
886  position,
887  (position == 0) ? PeptideEvidence::N_TERMINAL_AA : seq_prot[position - 1],
888  (position + len_pep >= seq_prot.size()) ?
890  seq_prot[position + len_pep]
891  };
892  pep_to_prot[idx_pep].insert(match);
893  ++filter_passed;
894  }
895  else
896  {
897  //std::cerr << "REJECTED Peptide " << seq_pep << " with hit to protein "
898  // << seq_prot << " at position " << position << std::endl;
899  ++filter_rejected;
900  }
901  }
902 
903  };
904 
905  inline void addHits_(AhoCorasickAmbiguous& fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern& pattern, const AhoCorasickAmbiguous::PeptideDB& pep_DB, const String& prot, const String& full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor& func_threads) const
906  {
907  fuzzyAC.setProtein(prot);
908  while (fuzzyAC.findNext(pattern))
909  {
910  const seqan::Peptide& tmp_pep = pep_DB[fuzzyAC.getHitDBIndex()];
911  func_threads.addHit(fuzzyAC.getHitDBIndex(), idx_prot, length(tmp_pep), full_prot, fuzzyAC.getHitProteinPosition() + offset);
912  }
913  }
914 
915  void updateMembers_() override;
916 
917  String decoy_string_{};
918  bool prefix_{ false };
919  MissingDecoy missing_decoy_action_ = MissingDecoy::IS_ERROR;
920  String enzyme_name_{};
921  String enzyme_specificity_{};
922 
923  bool write_protein_sequence_{ false };
924  bool write_protein_description_{ false };
925  bool keep_unreferenced_proteins_{ false };
926  Unmatched unmatched_action_ = Unmatched::IS_ERROR;
927  bool IL_equivalent_{ false };
928 
929  Int aaa_max_{0};
930  Int mm_max_{0};
931  };
932 }
933 
LogStream.h
DefaultParamHandler.h
OpenMS::PeptideIndexing::MissingDecoy
MissingDecoy
Definition: PeptideIndexing.h:154
OpenMS::TOPPBase
Base class for TOPP applications.
Definition: TOPPBase.h:144
FileHandler.h
OpenMS::EnzymaticDigestion::NamesOfSpecificity
static const std::string NamesOfSpecificity[SIZE_OF_SPECIFICITY]
Names of the Specificity.
Definition: EnzymaticDigestion.h:77
OpenMS::PeptideIndexing::addHits_
void addHits_(AhoCorasickAmbiguous &fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern &pattern, const AhoCorasickAmbiguous::PeptideDB &pep_DB, const String &prot, const String &full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor &func_threads) const
Definition: PeptideIndexing.h:905
OpenMS::PeptideIndexing::PEPTIDE_IDS_EMPTY
Definition: PeptideIndexing.h:139
OpenMS::SysInfo::MemUsage
A convenience class to report either absolute or delta (between two timepoints) RAM usage.
Definition: SysInfo.h:83
OpenMS::ProteinHit::setDescription
void setDescription(const String &description)
sets the description of the protein
OpenMS::EnzymaticDigestion::SPEC_UNKNOWN
Definition: EnzymaticDigestion.h:71
OpenMS::IdXMLFile::store
void store(const String &filename, const std::vector< ProteinIdentification > &protein_ids, const std::vector< PeptideIdentification > &peptide_ids, const String &document_id="")
Stores the data in an idXML file.
OpenMS::ProteinHit::setSequence
void setSequence(const String &sequence)
sets the protein sequence
OpenMS::ProteaseDigestion::setEnzyme
void setEnzyme(const String &name)
Sets the enzyme for the digestion (by name)
OpenMS::PeptideIndexing::FoundProteinFunctor::filter_passed
OpenMS::Size filter_passed
Definition: PeptideIndexing.h:840
OpenMS::AhoCorasickAmbiguous::findNext
bool findNext(const FuzzyACPattern &pattern)
Enumerate hits.
Definition: AhoCorasickAmbiguous.h:1037
StopWatch.h
OpenMS::AhoCorasickAmbiguous::setProtein
void setProtein(const String &protein_sequence)
Reset to new protein sequence. All previous data is forgotten.
Definition: AhoCorasickAmbiguous.h:1024
OpenMS::PeptideIndexing::FoundProteinFunctor::enzyme_
ProteaseDigestion enzyme_
Definition: PeptideIndexing.h:844
OpenMS::String::substitute
String & substitute(char from, char to)
Replaces all occurrences of the character from by the character to.
OpenMS::Map::has
bool has(const Key &key) const
Test whether the map contains the given key.
Definition: Map.h:108
OpenMS::FASTAFile::FASTAEntry::sequence
String sequence
Definition: FASTAFile.h:80
OpenMS::String
A more convenient string class.
Definition: String.h:59
OpenMS::PeptideIndexing::PeptideProteinMatchInformation::operator==
bool operator==(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:829
PeptideEvidence.h
FASTAContainer.h
OpenMS::PeptideIndexing::FoundProteinFunctor::addHit
void addHit(const OpenMS::Size idx_pep, const OpenMS::Size idx_prot, const OpenMS::Size len_pep, const OpenMS::String &seq_prot, OpenMS::Int position)
Definition: PeptideIndexing.h:874
OpenMS::PeptideIndexing::Unmatched
Unmatched
Action to take when peptide hits could not be matched.
Definition: PeptideIndexing.h:145
OpenMS::FASTAFile::FASTAEntry::description
String description
Definition: FASTAFile.h:79
OpenMS::Size
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
IdXMLFile.h
OpenMS::ProteinHit
Representation of a protein hit.
Definition: ProteinHit.h:58
OpenMS::AhoCorasickAmbiguous::getHitProteinPosition
Int getHitProteinPosition()
Offset into protein sequence where hit was found.
Definition: AhoCorasickAmbiguous.h:1057
seqan::AAString
String< AAcid, Alloc< void > > AAString
Definition: AhoCorasickAmbiguous.h:206
OpenMS::AhoCorasickAmbiguous
Extended Aho-Corasick algorithm capable of matching ambiguous amino acids in the pattern (i....
Definition: AhoCorasickAmbiguous.h:970
OpenMS::PeptideIndexing::PeptideProteinMatchInformation
Definition: PeptideIndexing.h:814
OpenMS::Param::getValue
const DataValue & getValue(const String &key) const
Returns a value of a parameter.
OpenMS::PeptideIndexing::run
ExitCodes run(std::vector< FASTAFile::FASTAEntry > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
forward for old interface and pyOpenMS; use run<T>() for more control
Definition: PeptideIndexing.h:171
OpenMS::SysInfo::MemUsage::delta
String delta(const String &event="delta")
DEBUG_ONLY
#define DEBUG_ONLY
Definition: AhoCorasickAmbiguous.h:46
OpenMS::AhoCorasickAmbiguous::initPattern
static void initPattern(const PeptideDB &pep_db, const int aaa_max, const int mm_max, FuzzyACPattern &pattern)
Construct a trie from a set of peptide sequences (which are to be found in a protein).
Definition: AhoCorasickAmbiguous.h:991
OPENMS_LOG_WARN
#define OPENMS_LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged.
Definition: LogStream.h:460
OpenMS::PeptideIndexing::PeptideProteinMatchInformation::protein_index
OpenMS::Size protein_index
Definition: PeptideIndexing.h:816
OpenMS::EnzymaticDigestion::getSpecificityByName
static Specificity getSpecificityByName(const String &name)
OpenMS::PeptideIndexing::FoundProteinFunctor::filter_rejected
OpenMS::Size filter_rejected
Definition: PeptideIndexing.h:841
OpenMS::FASTAFile::FASTAEntry
FASTA entry type (identifier, description and sequence)
Definition: FASTAFile.h:76
OpenMS::PeptideIndexing::UNEXPECTED_RESULT
Definition: PeptideIndexing.h:141
OpenMS::String::hasPrefix
bool hasPrefix(const String &string) const
true if String begins with string, false otherwise
OpenMS::PeptideIndexing::FoundProteinFunctor::pep_to_prot
MapType pep_to_prot
Definition: PeptideIndexing.h:839
ListUtils.h
OpenMS::EnzymaticDigestion::SPEC_FULL
fully enzyme specific, e.g., tryptic (ends with KR, AA-before is KR), or peptide is at protein termin...
Definition: EnzymaticDigestion.h:70
OpenMS::DefaultParamHandler
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:92
OpenMS::IdXMLFile::load
void load(const String &filename, std::vector< ProteinIdentification > &protein_ids, std::vector< PeptideIdentification > &peptide_ids)
Loads the identifications of an idXML file without identifier.
OpenMS::Exception::InvalidParameter
Exception indicating that an invalid parameter was handed over to an algorithm.
Definition: Exception.h:347
OpenMS::ProteinHit::setAccession
void setAccession(const String &accession)
sets the accession of the protein
OpenMS::PeptideIndexing::PeptideProteinMatchInformation::operator<
bool operator<(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:825
OpenMS::PeptideEvidence::N_TERMINAL_AA
static const char N_TERMINAL_AA
Definition: PeptideEvidence.h:60
OpenMS::PeptideIndexing::run
ExitCodes run(FASTAContainer< T > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
Re-index peptide identifications honoring enzyme cutting rules, ambiguous amino acids and target/deco...
Definition: PeptideIndexing.h:213
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
OpenMS::DataValue::toBool
bool toBool() const
Conversion to bool.
OpenMS::AhoCorasickAmbiguous::getHitDBIndex
Size getHitDBIndex()
Get index of hit into peptide database of the pattern.
Definition: AhoCorasickAmbiguous.h:1047
ProteaseDigestion.h
OpenMS::PeptideIndexing::FoundProteinFunctor::merge
void merge(FoundProteinFunctor &other)
Definition: PeptideIndexing.h:853
OpenMS::PeptideIndexing::FoundProteinFunctor::FoundProteinFunctor
FoundProteinFunctor(const ProteaseDigestion &enzyme, bool xtandem)
Definition: PeptideIndexing.h:848
OpenMS::ProgressLogger
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:54
ProteaseDB.h
OpenMS::MetaInfoInterface::setMetaValue
void setMetaValue(const String &name, const DataValue &value)
Sets the DataValue corresponding to a name.
ProgressLogger.h
FASTAFile.h
OpenMS::FASTAContainer< TFI_Vector >
FASTAContainer<TFI_Vector> simply takes an existing vector of FASTAEntries and provides the same inte...
Definition: FASTAContainer.h:243
OpenMS::StopWatch::getClockTime
double getClockTime() const
OpenMS::StopWatch::reset
void reset()
Clear the stop watch but keep running.
int
OpenMS::String::has
bool has(Byte byte) const
true if String contains the byte, false otherwise
ProteinIdentification.h
OpenMS::PeptideEvidence
Representation of a peptide evidence.
Definition: PeptideEvidence.h:50
OpenMS::StopWatch::start
void start()
Start the stop watch.
OpenMS::AhoCorasickAmbiguous::FuzzyACPattern
::seqan::Pattern< PeptideDB, ::seqan::FuzzyAC > FuzzyACPattern
Definition: AhoCorasickAmbiguous.h:974
OpenMS::StopWatch::stop
void stop()
Stop the stop watch (can be resumed later). If the stop watch was not running an exception is thrown.
OpenMS::DefaultParamHandler::setParameters
void setParameters(const Param &param)
Sets the parameters.
OpenMS::String::hasSuffix
bool hasSuffix(const String &string) const
true if String ends with string, false otherwise
AhoCorasickAmbiguous.h
OpenMS::DefaultParamHandler::getParameters
const Param & getParameters() const
Non-mutable access to the parameters.
OpenMS::Exception::NotImplemented
Not implemented exception.
Definition: Exception.h:436
OpenMS::AhoCorasickAmbiguous::PeptideDB
::seqan::StringSet<::seqan::AAString > PeptideDB
Definition: AhoCorasickAmbiguous.h:973
OpenMS::StopWatch::toString
String toString() const
get a compact representation of the current time status.
OpenMS::PeptideIndexing::ExitCodes
ExitCodes
Exit codes.
Definition: PeptideIndexing.h:135
SysInfo.h
OpenMS::SysInfo::MemUsage::after
void after()
record data for the second timepoint
OpenMS::StopWatch
This class is used to determine the current process' CPU (user and/or kernel) and wall time.
Definition: StopWatch.h:65
OpenMS::ProteaseDigestion::isValidProduct
bool isValidProduct(const String &protein, int pep_pos, int pep_length, bool ignore_missed_cleavages=true, bool allow_nterm_protein_cleavage=false, bool allow_random_asp_pro_cleavage=false) const
Variant of EnzymaticDigestion::isValidProduct() with support for n-term protein cleavage and random D...
StringUtils.h
main
int main(int argc, const char **argv)
Definition: INIFileEditor.cpp:73
OpenMS::PeptideIndexing::FoundProteinFunctor::MapType
std::map< OpenMS::Size, std::set< PeptideProteinMatchInformation > > MapType
Definition: PeptideIndexing.h:838
OpenMS::Param::update
bool update(const Param &p_outdated, const bool add_unknown=false)
Rescue parameter values from p_outdated to current param.
OpenMS::SignedSize
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
OpenMS::EnzymaticDigestion::getEnzymeName
String getEnzymeName() const
Returns the enzyme for the digestion.
OpenMS::PeptideIndexing::AUTO_MODE
static char const *const AUTO_MODE
name of enzyme/specificity which signals that the enzyme/specificity should be taken from meta inform...
Definition: PeptideIndexing.h:132
OPENMS_LOG_ERROR
#define OPENMS_LOG_ERROR
Macro to be used if non-fatal error are reported (processing continues)
Definition: LogStream.h:455
OpenMS::OpenMS_Log_debug
Logger::LogStream OpenMS_Log_debug
Global static instance of a LogStream to capture messages classified as debug output....
OpenMS::PeptideIndexing::PeptideProteinMatchInformation::AAAfter
char AAAfter
Definition: PeptideIndexing.h:819
OpenMS::String::substr
String substr(size_t pos=0, size_t n=npos) const
Wrapper for the STL substr() method. Returns a String object with its contents initialized to a subst...
OpenMS::ProteaseDigestion
Class for the enzymatic digestion of proteins.
Definition: ProteaseDigestion.h:60
SeqanIncludeWrapper.h
OpenMS::File::findDatabase
static String findDatabase(const String &db_name)
OpenMS::PeptideIndexing::DATABASE_EMPTY
Definition: PeptideIndexing.h:138
OpenMS::PeptideIndexing::PeptideProteinMatchInformation::AABefore
char AABefore
Definition: PeptideIndexing.h:818
OpenMS::PeptideIndexing
Refreshes the protein references for all peptide hits in a vector of PeptideIdentifications and adds ...
Definition: PeptideIndexing.h:126
OpenMS::File::readable
static bool readable(const String &file)
Return true if the file exists and is readable.
OpenMS::PeptideEvidence::C_TERMINAL_AA
static const char C_TERMINAL_AA
Definition: PeptideEvidence.h:61
PeptideIndexing.h
OpenMS::PeptideIndexing::FoundProteinFunctor::xtandem_
bool xtandem_
Definition: PeptideIndexing.h:845
OpenMS::Param
Management and storage of parameters / INI files.
Definition: Param.h:73
OpenMS::PeptideIndexing::FoundProteinFunctor
Definition: PeptideIndexing.h:835
OpenMS::String::remove
String & remove(char what)
Remove all occurrences of the character what.
seqan::isAmbiguous
bool isAmbiguous(AAcid c)
Definition: AhoCorasickAmbiguous.h:578
OpenMS::Map
Map class based on the STL map (containing several convenience functions)
Definition: Map.h:50
OPENMS_LOG_INFO
#define OPENMS_LOG_INFO
Macro if a information, e.g. a status should be reported.
Definition: LogStream.h:465
OpenMS::EnzymaticDigestion::setSpecificity
void setSpecificity(Specificity spec)
Sets the specificity for the digestion (default is SPEC_FULL).
OpenMS::StringUtils::toUpper
static String & toUpper(String &this_s)
Definition: StringUtils.h:874
OpenMS::PeptideIndexing::EXECUTION_OK
Definition: PeptideIndexing.h:137
PeptideIdentification.h
OpenMS::PeptideIndexing::ILLEGAL_PARAMETERS
Definition: PeptideIndexing.h:140
OpenMS::PeptideIndexing::PeptideProteinMatchInformation::position
OpenMS::Int position
Definition: PeptideIndexing.h:817
OpenMS::PeptideIndexing::PeptideProteinMatchInformation::tie
const std::tuple< const Size &, const Int &, const char &, const char & > tie() const
Definition: PeptideIndexing.h:821
seqan::position
Size< TNeedle >::Type position(const PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:561
StandardTypes.h
File.h
OpenMS::DecoyHelper::findDecoyString
static Result findDecoyString(FASTAContainer< T > &proteins)
Heuristic to determine the decoy string given a set of protein names.
Definition: FASTAContainer.h:359
OpenMS::ProgressLogger::setLogType
void setLogType(LogType type) const
Sets the progress log that should be used. The default type is NONE!
OpenMS::FASTAContainer
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:82
TOPPBase.h
OpenMS::IdXMLFile
Used to load and store idXML files.
Definition: IdXMLFile.h:63
OpenMS::FASTAContainer< TFI_File >
FASTAContainer<TFI_File> will make FASTA entries available chunk-wise from start to end by loading it...
Definition: FASTAContainer.h:93