OpenMS
Loading...
Searching...
No Matches
ProSEAlgorithm.h
Go to the documentation of this file.
1// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin
2// SPDX-License-Identifier: BSD-3-Clause
3//
4// --------------------------------------------------------------------------
5// $Maintainer: $
6// $Authors: Raphael Förster $
7// --------------------------------------------------------------------------
8
9#pragma once
10
13
21
22#include <algorithm> // std::min (used by inline computeModMatchTolerance_)
23#include <iosfwd> // std::ostream (renderRunSummary / renderModificationSummary)
24#include <map>
25#include <string> // std::string (renderRunSummaryYaml return / manifest)
26#include <utility> // std::pair (renderRunSummaryYaml manifest)
27#include <vector>
28
29namespace OpenMS
30{
31
32class TheoreticalSpectrumGenerator;
33
48class OPENMS_DLLAPI ProSEAlgorithm :
50 public ProgressLogger
51{
52 public:
54
/// Exit codes.
/// Returned by the search() overloads; the exit_code member declared further
/// down in this class is default-initialised to EXECUTION_OK.
56 enum class ExitCodes
57 {
58 EXECUTION_OK,
59 INPUT_FILE_EMPTY,
60 UNEXPECTED_RESULT,
61 UNKNOWN_ERROR,
62 ILLEGAL_PARAMETERS
63 };
64
75 {
76 std::string input_file;
77 Size ms2_spectra = 0;
78 Size matched_spectra = 0;
79 Size target_psms = 0;
80 Size decoy_psms = 0;
81 bool fdr_applied = false;
82 double achieved_psm_fdr = -1.0;
83 Size unique_peptides = 0;
84 Size unique_proteins = 0;
85 std::map<Int, Size> charge_histogram;
86 std::map<Size, Size> missed_cleavage_histogram;
87 bool score_stats_valid = false;
88 double hyperscore_min = 0.0;
89 double hyperscore_median = 0.0;
90 double hyperscore_max = 0.0;
91 bool prec_tol_valid = false;
92 double prec_err_median = 0.0, prec_err_mad = 0.0, prec_err_recommended = 0.0;
93 bool frag_tol_valid = false;
94 double frag_err_median = 0.0, frag_err_mad = 0.0, frag_err_recommended = 0.0;
95 double seconds_search = 0.0;
96 double seconds_calibration = 0.0;
97 double seconds_fdr = 0.0;
98 };
99
109 {
110 std::string database_file;
111 std::string enzyme;
112 double precursor_tol_lower = 0.0, precursor_tol_upper = 0.0;
114 double fragment_tol = 0.0;
115 std::string fragment_tol_unit;
116 Int min_charge = 0, max_charge = 0;
117 Size missed_cleavages = 0;
118 std::vector<std::string> fixed_mods, variable_mods, ion_series;
119 bool open_search = false;
120 bool calibration_enabled = false;
121 bool snes_mode = false;
122 bool chunked = false;
123 std::string decoy_mode;
124 double psm_fdr_threshold = 0.0, protein_fdr_threshold = 0.0;
125 Size db_target_proteins = 0;
126 Size db_decoy_proteins = 0;
127 Size indexed_peptides = 0;
128 Size indexed_fragments = 0;
129 double seconds_index_build = 0.0;
130 double seconds_total = 0.0;
131 };
132
143 {
144 ExitCodes exit_code = ExitCodes::EXECUTION_OK;
145 std::vector<ProteinIdentification> protein_ids;
148 bool is_open_search = false;
150 };
151
177 {
178 std::vector<SearchResult> per_file;
180
184 std::string decoy_string;
186 bool decoy_is_prefix = true;
188 bool have_decoys = false;
189
193 };
194
204 {
205 std::vector<FASTAFile::FASTAEntry> db;
207
217 bool release_fragment_index_after_scoring = false;
218
223 std::string decoy_string;
225 bool decoy_is_prefix = true;
228 bool have_decoys = false;
229 };
230
/// Search spectra in a spectrum file (mzML or Bruker .d) against a protein database.
/// @param in_spectra spectrum file to search
/// @param in_db protein database; NOTE(review): presumably a FASTA file path - confirm
/// @param prot_ids non-const reference; presumably receives the protein identifications - confirm
/// @param pep_ids non-const reference; presumably receives the peptide identifications - confirm
/// @return an ExitCodes value
252 ExitCodes search(const std::string& in_spectra,
253 const std::string& in_db,
254 std::vector<ProteinIdentification>& prot_ids,
255 PeptideIdentificationList& pep_ids) const;
256
/// Finalize protein-level FDR on a COMPLETE protein set (e.g. a single input file).
/// Both identification containers are taken by non-const reference.
/// @param protein_ids protein identifications of the complete set
/// @param peptide_ids peptide identifications belonging to @p protein_ids
/// @param decoy_string decoy marker; NOTE(review): presumably matched against accessions - confirm
/// @param decoy_is_prefix presumably true if @p decoy_string is a prefix, false if a suffix - confirm
/// @param protein_fdr protein-level FDR value; NOTE(review): valid range not visible here - confirm
281 static void applyCompleteSetProteinFDR(std::vector<ProteinIdentification>& protein_ids,
282 PeptideIdentificationList& peptide_ids,
283 const std::string& decoy_string,
284 bool decoy_is_prefix,
285 double protein_fdr);
286
/// Search with comprehensive results including modification analysis tables.
/// @param in_spectra spectrum file to search
/// @param in_db protein database; NOTE(review): presumably a FASTA file path - confirm
/// @param output_base_name defaults to the empty string; NOTE(review): presumably
///        a base name for written output, with "" meaning none - confirm
/// @return a SearchResult
332 SearchResult searchWithModificationAnalysis(const std::string& in_spectra,
333 const std::string& in_db,
334 const std::string& output_base_name = "") const;
335
354 const std::vector<FASTAFile::FASTAEntry>& fasta_db,
355 std::vector<ProteinIdentification>& prot_ids,
356 PeptideIdentificationList& pep_ids) const;
357
/// Build a SearchContext (decoy-augmented database + FragmentIndex) for reuse.
/// @param fasta_db protein database entries the context is built from
/// @return the prepared SearchContext (returned by value)
376 SearchContext prepareContext(const std::vector<FASTAFile::FASTAEntry>& fasta_db) const;
377
399 SearchContext& ctx,
400 std::vector<ProteinIdentification>& prot_ids,
401 PeptideIdentificationList& pep_ids) const;
402
414 const std::vector<FASTAFile::FASTAEntry>& fasta_db,
415 const std::string& output_base_name = "") const;
416
444 const std::vector<std::string>& in_spectra_files,
445 const std::vector<FASTAFile::FASTAEntry>& fasta_db,
446 const std::vector<std::string>& output_base_names = {},
447 const std::string& aggregate_base_name = "") const;
448
460 const std::vector<std::string>& in_spectra_files,
461 const std::string& in_db,
462 const std::vector<std::string>& output_base_names = {},
463 const std::string& aggregate_base_name = "") const;
464
465 protected:
/// Updates extra member variables at the end of the setParameters() method (override).
466 void updateMembers_() override;
467
470 {
472 /*
473 std::string_view sequence;
474 SignedSize peptide_mod_index; ///< enumeration index of the non-RNA peptide modification
475 */
476 // Layout: doubles first, then floats, then int, then uint16_t — minimizes padding (40 bytes excluding AASequence)
477 double score = 0;
478 double delta_mass = 0.0;
479 float prefix_fraction = 0;
480 float suffix_fraction = 0;
481 float mean_error = 0.0f;
482 int isotope_error = 0;
483 uint16_t applied_charge = 0;
484 uint16_t matched_prefix_ions = 0;
485 uint16_t matched_suffix_ions = 0;
486
/// Ordering predicate: returns true when hit @p a should rank before hit @p b.
/// The hit with the higher score ranks first; when the scores compare exactly
/// equal, the hit whose sequence compares smaller (operator<) ranks first.
487 static bool hasBetterScore(const AnnotatedHit_& a, const AnnotatedHit_& b)
488 {
// different scores: higher score wins
489 if (a.score != b.score) return a.score > b.score;
// equal scores: tie-break on the sequence
490 return a.sequence < b.sequence;
491 }
492 };
493
/// Filter, deisotope, decharge spectra.
/// @param exp spectra to preprocess (non-const reference; presumably modified in place - confirm)
/// @param fragment_mass_tolerance fragment mass tolerance
/// @param fragment_mass_tolerance_unit_ppm presumably true if the tolerance is in ppm - confirm
/// @param deisotope_requested whether deisotoping was requested
/// @param peaks_keep_n NOTE(review): presumably a peak-count limit; meaning of 0 not visible here - confirm
/// @param peaks_window_top NOTE(review): presumably peaks kept per window - confirm
495 static void preprocessSpectra_(PeakMap& exp, double fragment_mass_tolerance, bool fragment_mass_tolerance_unit_ppm, bool deisotope_requested, Size peaks_keep_n, Int peaks_window_top);
496
/// How decoys are obtained/recognised for a search (parameter "decoys").
/// The decoy_mode_ member of this class is default-initialised to AUTO.
498 enum class DecoyMode_
499 {
500 AUTO,
501 GENERATE,
// NOTE(review): IGNORE is also a preprocessor macro in the Windows SDK
// (winbase.h). If that header is seen before this one, this enumerator is
// macro-expanded and will not compile unless IGNORE is #undef'd first -
// confirm how the Windows build handles this.
502 IGNORE
503 };
504
513 {
514 bool generate{false};
515 bool strip_existing{false};
516 bool have_decoys{false};
517 std::string decoy_string;
518 bool is_prefix{true};
519 std::string strip_string;
520 bool strip_is_prefix{true};
521 };
522
532 const std::vector<FASTAFile::FASTAEntry>& db) const;
533
544
/// Build the searched database according to @p strategy.
/// @param fasta_db input database entries (not modified; taken by const reference)
/// @param strategy resolved decoy handling for this database
/// @return the database to search, returned by value
552 std::vector<FASTAFile::FASTAEntry> buildDecoyAugmentedDB_(
553 const std::vector<FASTAFile::FASTAEntry>& fasta_db,
554 const DecoyStrategy_& strategy) const;
555
/// Build a strided protein sample for chunked calibration.
/// @param full_db complete database the sample is drawn from (not modified)
/// @return the sampled entries, returned by value
/// NOTE(review): the stride is presumably derived from calibration_subset_ratio_ - confirm
569 std::vector<FASTAFile::FASTAEntry> buildCalibrationSample_(
570 const std::vector<FASTAFile::FASTAEntry>& full_db) const;
571
586 PeakMap& spectra,
587 std::vector<FASTAFile::FASTAEntry>& full_db,
588 const DecoyStrategy_& strategy,
589 std::vector<ProteinIdentification>& protein_ids,
590 PeptideIdentificationList& peptide_ids) const;
591
601 const PeakMap& spectra,
602 FragmentIndex& fi,
603 const std::vector<FASTAFile::FASTAEntry>& db,
604 const TheoreticalSpectrumGenerator& spectrum_generator,
605 double effective_fragment_tol,
606 bool fragment_mass_tolerance_unit_ppm,
607 bool open_search_mode,
608 std::vector<std::vector<AnnotatedHit_>>& annotated_hits,
609 const std::string& progress_label) const;
610
/// Filter and annotate search results.
/// @param exp spectra the hits were scored against (read-only)
/// @param annotated_hits one vector of candidate hits per entry (non-const reference)
/// @param protein_ids non-const reference; presumably receives the protein identifications - confirm
/// @param peptide_ids non-const reference; presumably receives the peptide identifications - confirm
/// @param top_hits NOTE(review): presumably the number of best hits kept per spectrum - confirm
/// The remaining parameters carry search settings (modifications, missed
/// cleavages, tolerances and units, charge range, enzyme, database name).
/// NOTE(review): precursor_mass_tolerance_unit_ppm and
/// fragment_mass_tolerance_unit_ppm are std::string here, although the "_ppm"
/// suffix reads like a flag and preprocessSpectra_ declares a bool parameter
/// with the same name - confirm callers pass the unit string.
635 void postProcessHits_(const PeakMap& exp,
636 std::vector<std::vector<ProSEAlgorithm::AnnotatedHit_> >& annotated_hits,
637 std::vector<ProteinIdentification>& protein_ids,
638 PeptideIdentificationList& peptide_ids,
639 Size top_hits,
640 const StringList& modifications_fixed,
641 const StringList& modifications_variable,
642 Int peptide_missed_cleavages,
643 double precursor_mass_tolerance,
644 double fragment_mass_tolerance,
645 const std::string& precursor_mass_tolerance_unit_ppm,
646 const std::string& fragment_mass_tolerance_unit_ppm,
647 const Int precursor_min_charge,
648 const Int precursor_max_charge,
649 const std::string& enzyme,
650 const std::string& database_name) const;
651
655 mutable double precursor_mass_tolerance_lower_{10.0};
656 mutable double precursor_mass_tolerance_upper_{10.0};
657 std::string precursor_mass_tolerance_unit_{"ppm"};
658
661
663
665
667
672 bool deisotope_requested_{true};
673 Size peaks_keep_n_{0};
674 Int peaks_window_top_{20};
675
677
679
681
682 std::string enzyme_;
683
684 DecoyMode_ decoy_mode_{DecoyMode_::AUTO};
685 std::string decoy_prefix_;
686
687 double fdr_psm_{0.0};
688 double fdr_protein_{0.0};
689
691
695 EnzymaticDigestion::Specificity peptide_enzyme_specificity_{EnzymaticDigestion::SPEC_FULL};
696
697 std::string peptide_motif_;
698
700
701 bool add_a_ions_{false};
702 bool add_b_ions_{true};
703 bool add_c_ions_{false};
704 bool add_x_ions_{false};
705 bool add_y_ions_{true};
706 bool add_z_ions_{false};
707
708 Size database_chunk_size_{0};
709
710 bool calibration_enabled_{false};
711 double calibration_subset_ratio_{0.1};
712 Size calibration_min_psms_{50};
713
722 {
723 double precursor_shift{0};
724 double precursor_spread{0};
725 double cal_lower{0};
726 double cal_upper{0};
727 double fragment_tolerance{0};
728 double fragment_shift{0};
729 bool extreme_bias{false};
730 bool success{false};
731 };
732
737
744
752 mutable double last_mod_match_tolerance_used_{-1.0};
753
765 {
766 if (precursor_mass_tolerance_lower_ <= 0.0) return precursor_mass_tolerance_upper_;
767 if (precursor_mass_tolerance_upper_ <= 0.0) return precursor_mass_tolerance_lower_;
768 return std::min(precursor_mass_tolerance_lower_, precursor_mass_tolerance_upper_);
769 }
770
784 FragmentIndex& fragment_index,
785 const std::vector<FASTAFile::FASTAEntry>& db) const;
786
/// NOTE(review): no description is visible for this helper. Judging by the
/// signature it presumably reports whether @p accession carries @p marker,
/// tested as a prefix when @p is_prefix is true and as a suffix otherwise -
/// confirm against the implementation.
789 static bool accessionHasDecoyMarker_(const std::string& accession,
790 const std::string& marker, bool is_prefix);
791
/// Reads @p peptide_ids (const) and writes into @p stats (non-const reference).
/// NOTE(review): presumably records statistics before FDR filtering is
/// applied; which RunStatistics fields are set is not visible here - confirm.
798 static void capturePreFdrStats_(const PeptideIdentificationList& peptide_ids,
799 RunStatistics& stats);
800
/// NOTE(review): presumably returns the largest score among the hits in
/// @p peptide_ids; the result for an empty list is not visible here - confirm.
804 static double maxRetainedScore_(const PeptideIdentificationList& peptide_ids);
805
/// Reads the spectra and identifications (all const) and writes into @p stats.
/// NOTE(review): which RunStatistics fields are filled is not visible here - confirm.
811 void collectRunStatistics_(const PeakMap& spectra,
812 const std::vector<ProteinIdentification>& protein_ids,
813 const PeptideIdentificationList& peptide_ids,
814 RunStatistics& stats) const;
815
816 public:
/// Updates @p stats (non-const reference) from the given peptide identifications.
/// @param stats statistics record to update
/// @param peptide_ids peptide identifications to read (const)
/// @param enzyme enzyme name; NOTE(review): presumably used for the missed-cleavage counts - confirm
/// @param fdr_applied NOTE(review): presumably stored in RunStatistics::fdr_applied - confirm
826 static void updateFinalStats(RunStatistics& stats,
827 const PeptideIdentificationList& peptide_ids,
828 const std::string& enzyme,
829 bool fdr_applied);
830
835 static void renderRunSummary(const RunStatistics& stats,
836 const SharedSearchStats& shared,
838 bool is_open_search,
839 std::ostream& os);
840
844 std::ostream& os);
845
/// Renders a run summary for a multi-file search and returns it as a string.
/// NOTE(review): judging by the name the output is presumably YAML - confirm.
/// @param mfres multi-file search result to summarise
/// @param manifest list of (string, list of strings) pairs; NOTE(review): the
///        meaning of the pair members is not visible here - confirm
/// @param files_failed NOTE(review): presumably the number of input files that failed - confirm
/// @param files_total NOTE(review): presumably the total number of input files - confirm
852 static std::string renderRunSummaryYaml(
853 const MultiFileSearchResult& mfres,
854 const std::vector<std::pair<std::string, std::vector<std::string>>>& manifest,
855 Size files_failed,
856 Size files_total);
857
858 private:
859
/// Helper function to determine if open search should be used based on tolerance.
/// Delegates to FragmentIndex::isOpenSearchMode, passing the lower and upper
/// precursor mass tolerances and a flag that is true when the precursor
/// tolerance unit string equals "ppm".
861 bool isOpenSearchMode_() const
862 {
863 return FragmentIndex::isOpenSearchMode(precursor_mass_tolerance_lower_,
864 precursor_mass_tolerance_upper_,
865 precursor_mass_tolerance_unit_ == "ppm");
866 }
867};
868
869} // namespace
Representation of a peptide/protein sequence.
Definition AASequence.h:88
A base class for all classes handling default parameters.
Definition DefaultParamHandler.h:66
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide end...
Definition EnzymaticDigestion.h:42
Generates from a set of Fasta files a 2D-datastructure which stores all theoretical masses of all b a...
Definition FragmentIndex.h:35
In-Memory representation of a mass spectrometry run.
Definition MSExperiment.h:49
Combined result of open search modification analysis.
Definition OpenSearchModificationAnalysis.h:104
Management and storage of parameters / INI files.
Definition Param.h:46
Container for peptide identifications from multiple spectra.
Definition PeptideIdentificationList.h:66
Fragment-index-based peptide database search algorithm (experimental).
Definition ProSEAlgorithm.h:51
ExitCodes search(PeakMap &spectra, const std::vector< FASTAFile::FASTAEntry > &fasta_db, std::vector< ProteinIdentification > &prot_ids, PeptideIdentificationList &pep_ids) const
In-memory search: search spectra against a protein database without file I/O.
std::string enzyme_
Definition ProSEAlgorithm.h:682
static std::string renderRunSummaryYaml(const MultiFileSearchResult &mfres, const std::vector< std::pair< std::string, std::vector< std::string > > > &manifest, Size files_failed, Size files_total)
FragmentIndex fragment_index
Definition ProSEAlgorithm.h:206
std::string fragment_tol_unit
Definition ProSEAlgorithm.h:115
Size peptide_max_size_
Definition ProSEAlgorithm.h:693
static void updateFinalStats(RunStatistics &stats, const PeptideIdentificationList &peptide_ids, const std::string &enzyme, bool fdr_applied)
SearchResult searchWithModificationAnalysis(const std::string &in_spectra, const std::string &in_db, const std::string &output_base_name="") const
Search with comprehensive results including modification analysis tables.
Size precursor_max_charge_
Definition ProSEAlgorithm.h:660
Size precursor_min_charge_
Definition ProSEAlgorithm.h:659
std::string decoy_prefix_
Definition ProSEAlgorithm.h:685
RunStatistics stats
Definition ProSEAlgorithm.h:149
Size report_top_hits_
Definition ProSEAlgorithm.h:699
Size modifications_max_variable_mods_per_peptide_
Definition ProSEAlgorithm.h:680
static void renderModificationSummary(const OpenSearchModificationAnalysis::OpenSearchAnalysisResult &mod_analysis, std::ostream &os)
SearchResult searchWithModificationAnalysis(PeakMap &spectra, const std::vector< FASTAFile::FASTAEntry > &fasta_db, const std::string &output_base_name="") const
In-memory search with modification analysis: no file I/O required.
std::string peptide_motif_
Definition ProSEAlgorithm.h:697
std::string input_file
spectrum file this run searched (basename or path)
Definition ProSEAlgorithm.h:76
std::map< Int, Size > charge_histogram
precursor charge -> PSM count
Definition ProSEAlgorithm.h:85
StringList modifications_fixed_
Definition ProSEAlgorithm.h:676
CalibrationResult_ runCalibrationPass_(PeakMap &spectra, FragmentIndex &fragment_index, const std::vector< FASTAFile::FASTAEntry > &db) const
Run a fast calibration pass on a subset of spectra to estimate mass accuracy.
static void capturePreFdrStats_(const PeptideIdentificationList &peptide_ids, RunStatistics &stats)
Param fragmentIndexParameters_() const
ProSE parameters made safe to hand to a FragmentIndex.
void postProcessHits_(const PeakMap &exp, std::vector< std::vector< ProSEAlgorithm::AnnotatedHit_ > > &annotated_hits, std::vector< ProteinIdentification > &protein_ids, PeptideIdentificationList &peptide_ids, Size top_hits, const StringList &modifications_fixed, const StringList &modifications_variable, Int peptide_missed_cleavages, double precursor_mass_tolerance, double fragment_mass_tolerance, const std::string &precursor_mass_tolerance_unit_ppm, const std::string &fragment_mass_tolerance_unit_ppm, const Int precursor_min_charge, const Int precursor_max_charge, const std::string &enzyme, const std::string &database_name) const
Filter and annotate search results.
std::vector< ProteinIdentification > protein_ids
Definition ProSEAlgorithm.h:145
Size peptide_min_size_
Definition ProSEAlgorithm.h:692
std::vector< FASTAFile::FASTAEntry > buildDecoyAugmentedDB_(const std::vector< FASTAFile::FASTAEntry > &fasta_db, const DecoyStrategy_ &strategy) const
Build the searched database according to strategy.
SearchResult aggregate
Definition ProSEAlgorithm.h:179
IntList precursor_isotopes_
Definition ProSEAlgorithm.h:662
std::string database_file
FASTA path (empty for in-memory db)
Definition ProSEAlgorithm.h:110
MultiFileSearchResult searchWithModificationAnalysis(const std::vector< std::string > &in_spectra_files, const std::string &in_db, const std::vector< std::string > &output_base_names={}, const std::string &aggregate_base_name="") const
Multi-file search with modification analysis (FASTA file path).
std::string strip_string
marker of pre-existing decoys to strip
Definition ProSEAlgorithm.h:519
MultiFileSearchResult searchWithModificationAnalysis(const std::vector< std::string > &in_spectra_files, const std::vector< FASTAFile::FASTAEntry > &fasta_db, const std::vector< std::string > &output_base_names={}, const std::string &aggregate_base_name="") const
Multi-file search with modification analysis (in-memory FASTA).
std::map< Size, Size > missed_cleavage_histogram
missed cleavages -> PSM count
Definition ProSEAlgorithm.h:86
RunStatistics last_run_stats_
Definition ProSEAlgorithm.h:743
std::string fragment_mass_tolerance_unit_
Definition ProSEAlgorithm.h:666
StringList annotate_psm_
Definition ProSEAlgorithm.h:690
static void applyCompleteSetProteinFDR(std::vector< ProteinIdentification > &protein_ids, PeptideIdentificationList &peptide_ids, const std::string &decoy_string, bool decoy_is_prefix, double protein_fdr)
Finalize protein-level FDR on a COMPLETE protein set (a single input file, or a merged cross-file agg...
OpenSearchModificationAnalysis::OpenSearchAnalysisResult modification_analysis
Definition ProSEAlgorithm.h:147
ExitCodes search(const std::string &in_spectra, const std::string &in_db, std::vector< ProteinIdentification > &prot_ids, PeptideIdentificationList &pep_ids) const
Search spectra in a spectrum file (mzML or Bruker .d) against a protein database using an FI-backed w...
std::vector< FASTAFile::FASTAEntry > db
Definition ProSEAlgorithm.h:205
static bool accessionHasDecoyMarker_(const std::string &accession, const std::string &marker, bool is_prefix)
static void renderRunSummary(const RunStatistics &stats, const SharedSearchStats &shared, const OpenSearchModificationAnalysis::OpenSearchAnalysisResult &mod_analysis, bool is_open_search, std::ostream &os)
ExitCodes search(PeakMap &spectra, SearchContext &ctx, std::vector< ProteinIdentification > &prot_ids, PeptideIdentificationList &pep_ids) const
In-memory search using a pre-built SearchContext.
SharedSearchStats shared
Definition ProSEAlgorithm.h:192
bool isOpenSearchMode_() const
Helper function to determine if open search should be used based on tolerance.
Definition ProSEAlgorithm.h:861
ExitCodes searchChunked_(PeakMap &spectra, std::vector< FASTAFile::FASTAEntry > &full_db, const DecoyStrategy_ &strategy, std::vector< ProteinIdentification > &protein_ids, PeptideIdentificationList &peptide_ids) const
Chunked database search implementation.
std::string decoy_string
Definition ProSEAlgorithm.h:184
static void preprocessSpectra_(PeakMap &exp, double fragment_mass_tolerance, bool fragment_mass_tolerance_unit_ppm, bool deisotope_requested, Size peaks_keep_n, Int peaks_window_top)
filter, deisotope, decharge spectra
DecoyStrategy_ resolveDecoyStrategy_(const std::vector< FASTAFile::FASTAEntry > &db) const
Decide how to obtain/recognise decoys for db.
std::vector< FASTAFile::FASTAEntry > buildCalibrationSample_(const std::vector< FASTAFile::FASTAEntry > &full_db) const
Build a strided protein sample for chunked calibration.
std::string decoy_mode
"generated" | "external" | "none (target-only)"
Definition ProSEAlgorithm.h:123
PeptideIdentificationList peptide_ids
Definition ProSEAlgorithm.h:146
std::string enzyme
Definition ProSEAlgorithm.h:111
void updateMembers_() override
This method is used to update extra member variables at the end of the setParameters() method.
ExitCodes
Exit codes.
Definition ProSEAlgorithm.h:57
StringList modifications_variable_
Definition ProSEAlgorithm.h:678
static double maxRetainedScore_(const PeptideIdentificationList &peptide_ids)
std::vector< SearchResult > per_file
Definition ProSEAlgorithm.h:178
double fragment_mass_tolerance_
Definition ProSEAlgorithm.h:664
SearchContext prepareContext(const std::vector< FASTAFile::FASTAEntry > &fasta_db) const
Build a SearchContext (decoy-augmented database + FragmentIndex) for reuse.
void scoreSpectraAgainstIndex_(const PeakMap &spectra, FragmentIndex &fi, const std::vector< FASTAFile::FASTAEntry > &db, const TheoreticalSpectrumGenerator &spectrum_generator, double effective_fragment_tol, bool fragment_mass_tolerance_unit_ppm, bool open_search_mode, std::vector< std::vector< AnnotatedHit_ > > &annotated_hits, const std::string &progress_label) const
Score all spectra against one FragmentIndex.
Size peptide_missed_cleavages_
Definition ProSEAlgorithm.h:694
CalibrationResult_ last_calibration_result_
Definition ProSEAlgorithm.h:736
double computeModMatchTolerance_() const
Definition ProSEAlgorithm.h:764
void collectRunStatistics_(const PeakMap &spectra, const std::vector< ProteinIdentification > &protein_ids, const PeptideIdentificationList &peptide_ids, RunStatistics &stats) const
std::vector< std::string > fixed_mods
Definition ProSEAlgorithm.h:118
std::string precursor_tol_unit
Definition ProSEAlgorithm.h:113
DecoyMode_
How decoys are obtained/recognised for a search (parameter "decoys").
Definition ProSEAlgorithm.h:499
Result of a calibration pass.
Definition ProSEAlgorithm.h:722
Resolved decoy handling for one concrete input database.
Definition ProSEAlgorithm.h:513
Multi-file search result bundle.
Definition ProSEAlgorithm.h:177
Per-run identification statistics for the end-of-search report.
Definition ProSEAlgorithm.h:75
Prepared per-database state shared across multiple spectrum files.
Definition ProSEAlgorithm.h:204
Comprehensive search result including modification analysis.
Definition ProSEAlgorithm.h:143
Configuration, database and fragment-index facts shared across all input files of one ProSE invocatio...
Definition ProSEAlgorithm.h:109
Base class for all classes that want to report their progress.
Definition ProgressLogger.h:27
Generates theoretical spectra for peptides with various options.
Definition TheoreticalSpectrumGenerator.h:45
int Int
Signed integer type.
Definition Types.h:72
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition Types.h:97
std::vector< Int > IntList
Vector of signed integers.
Definition ListUtils.h:29
std::vector< std::string > StringList
Vector of String.
Definition ListUtils.h:44
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
Slimmer structure as storing all scored candidates in PeptideHit objects takes too much space.
Definition ProSEAlgorithm.h:470
static bool hasBetterScore(const AnnotatedHit_ &a, const AnnotatedHit_ &b)
Definition ProSEAlgorithm.h:487
double score
main score
Definition ProSEAlgorithm.h:477
AASequence sequence
Definition ProSEAlgorithm.h:471