OpenMS
ProteinIdentification.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Chris Bielow $
6 // $Authors: Nico Pfeifer, Chris Bielow $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
19 
20 #include <set>
21 
22 namespace OpenMS
23 {
24  class MSExperiment;
25  class PeptideIdentification;
26  class PeptideEvidence;
27  class ConsensusMap;
28 
48  class OPENMS_DLLAPI ProteinIdentification :
49  public MetaInfoInterface
50  {
51 public:
54 
56  struct Mapping
57  {
58  std::map<String, StringList> identifier_to_msrunpath;
59  std::map<StringList, String> runpath_to_identifier;
60 
61  Mapping() = default;
62 
63  explicit Mapping(const std::vector<ProteinIdentification>& prot_ids)
64  {
65  create(prot_ids);
66  }
67 
68  void create(const std::vector<ProteinIdentification>& prot_ids)
69  {
70  identifier_to_msrunpath.clear();
71  runpath_to_identifier.clear();
72  StringList filenames;
73  for (const ProteinIdentification& prot_id : prot_ids)
74  {
75  prot_id.getPrimaryMSRunPath(filenames);
76  if (filenames.empty())
77  {
78  throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No MS run path annotated in ProteinIdentification.");
79  }
80  identifier_to_msrunpath[prot_id.getIdentifier()] = filenames;
81  const auto& it = runpath_to_identifier.find(filenames);
82  if (it != runpath_to_identifier.end())
83  {
84  throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
85  "Multiple protein identifications with the same ms-run-path in Consensus/FeatureXML. Check input!\n",
86  ListUtils::concatenate(filenames, ","));
87  }
88  runpath_to_identifier[filenames] = prot_id.getIdentifier();
89  }
90  }
91 
93  {
94  // if a merge index n is annotated, we use the filename annotated at index n in the protein identification, otherwise the one at index 0
95  size_t merge_index = pepid.getMetaValue(Constants::UserParam::ID_MERGE_INDEX, 0);
96  const auto& filenames = identifier_to_msrunpath.at(pepid.getIdentifier());
97  return (merge_index < filenames.size()) ? filenames[merge_index] : ""; // return filename or empty string if missing
98  }
99  };
100 
104  class OPENMS_DLLAPI ProteinGroup
105  {
106  public:
109  typedef std::vector<FloatDataArray> FloatDataArrays;
112  typedef std::vector<StringDataArray> StringDataArrays;
115  typedef std::vector<IntegerDataArray> IntegerDataArrays;
116 
118  double probability;
119 
121  std::vector<String> accessions;
122 
124 
126  bool operator==(const ProteinGroup& rhs) const;
127 
128  /*
129  @brief Comparison operator (for sorting)
130 
131  This operator is intended for sorting protein groups in a "best first"
132  manner. That means higher probabilities are "less" than lower
133  probabilities (!); smaller groups are "less" than larger groups;
134  everything else being equal, accessions are compared lexicographically.
135  */
136  bool operator<(const ProteinGroup& rhs) const;
137 
139 
152 
155  {
156  return float_data_arrays_;
157  }
158 
161 
164 
167 
170 
173 
176 
179 
182  {
183  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
184  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
185  }
186 
189  {
190  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
191  [&name](const StringDataArray& da) { return da.getName() == name; } );
192  }
193 
196  {
197  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
198  [&name](const FloatDataArray& da) { return da.getName() == name; } );
199  }
200 
203  {
204  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
205  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
206  }
207 
210  {
211  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
212  [&name](const StringDataArray& da) { return da.getName() == name; } );
213  }
214 
217  {
218  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
219  [&name](const FloatDataArray& da) { return da.getName() == name; } );
220  }
221 
222  private:
225 
228 
231  };
232 
/// Peak mass type
enum PeakMassType
{
  MONOISOTOPIC,
  AVERAGE,
  SIZE_OF_PEAKMASSTYPE ///< number of real enumerators; used as the length of NamesOfPeakMassType
};
240 
242  static const std::string NamesOfPeakMassType[SIZE_OF_PEAKMASSTYPE];
243 
245  struct OPENMS_DLLAPI SearchParameters :
246  public MetaInfoInterface
247  {
253  std::vector<String> fixed_modifications;
254  std::vector<String> variable_modifications;
262 
269  ~SearchParameters() = default;
270 
275 
276  bool operator==(const SearchParameters& rhs) const;
277 
278  bool operator!=(const SearchParameters& rhs) const;
279 
281  std::pair<int,int> getChargeRange() const;
282 
287  bool mergeable(const ProteinIdentification::SearchParameters& sp, const String& experiment_type) const;
288 
289  private:
290  int getChargeValue_(String& charge_str) const;
291  };
292 
303 
308 
310  bool operator==(const ProteinIdentification& rhs) const;
312  bool operator!=(const ProteinIdentification& rhs) const;
314 
316 
317  const std::vector<ProteinHit>& getHits() const;
320  std::vector<ProteinHit>& getHits();
322  void insertHit(const ProteinHit& input);
324  void insertHit(ProteinHit&& input);
325 
331  void setHits(const std::vector<ProteinHit>& hits);
332 
334  std::vector<ProteinHit>::iterator findHit(const String& accession);
335 
337  const std::vector<ProteinGroup>& getProteinGroups() const;
339  std::vector<ProteinGroup>& getProteinGroups();
341  void insertProteinGroup(const ProteinGroup& group);
342 
344  const std::vector<ProteinGroup>& getIndistinguishableProteins() const;
346  std::vector<ProteinGroup>& getIndistinguishableProteins();
351 
353  double getSignificanceThreshold() const;
355  void setSignificanceThreshold(double value);
357  const String& getScoreType() const;
359  void setScoreType(const String& type);
361  bool isHigherScoreBetter() const;
363  void setHigherScoreBetter(bool higher_is_better);
365  void sort();
367  void assignRanks();
375  void computeCoverage(const std::vector<PeptideIdentification>& pep_ids);
376  void computeCoverage(const ConsensusMap& cmap, bool use_unassigned_ids);
378 
386  const std::vector<PeptideIdentification>& pep_ids,
387  const StringList& skip_modifications);
389  const ConsensusMap& cmap,
390  const StringList& skip_modifications,
391  bool use_unassigned_ids);
392 
393 
395 
396  const DateTime& getDateTime() const;
399  void setDateTime(const DateTime& date);
401  void setSearchEngine(const String& search_engine);
403  const String& getSearchEngine() const;
407  void setSearchEngineVersion(const String& search_engine_version);
411  void setInferenceEngine(const String& search_engine);
413  const String getInferenceEngine() const;
415  void setInferenceEngineVersion(const String& inference_engine_version);
419  void setSearchParameters(const SearchParameters& search_parameters);
421  void setSearchParameters(SearchParameters&& search_parameters);
427  const String& getIdentifier() const;
429  void setIdentifier(const String& id);
436  void setPrimaryMSRunPath(const StringList& s, bool raw = false);
437 
440  void addPrimaryMSRunPath(const String& s, bool raw = false);
441  void addPrimaryMSRunPath(const StringList& s, bool raw = false);
442 
449  void getPrimaryMSRunPath(StringList& output, bool raw = false) const;
450 
452  Size nrPrimaryMSRunPaths(bool raw = false) const;
453 
456  bool hasInferenceData() const;
457 
460 
464  bool peptideIDsMergeable(const ProteinIdentification& id_run, const String& experiment_type) const;
465 
468  std::vector<std::pair<String,String>> getSearchEngineSettingsAsPairs(const String& se = "") const;
469 
471 
474 protected:
476 
483 
485 
488  std::vector<ProteinHit> protein_hits_;
489  std::vector<ProteinGroup> protein_groups_;
491  std::vector<ProteinGroup> indistinguishable_proteins_;
494 
495  private:
496  void computeCoverageFromEvidenceMapping_(const std::unordered_map<String, std::set<PeptideEvidence>>& map);
497  void fillEvidenceMapping_(std::unordered_map<String, std::set<PeptideEvidence> >& map_acc_2_evidence,
498  const std::vector<PeptideIdentification>& pep_ids) const;
499 
500  void fillModMapping_(const std::vector<PeptideIdentification>& pep_ids, const StringList& skip_modifications,
501  std::unordered_map<String, std::set<std::pair<Size, ResidueModification>>>& prot2mod) const;
502  };
503 
504 
505 } //namespace OpenMS
A container for consensus elements.
Definition: ConsensusMap.h:66
Float data array class.
Definition: DataArrays.h:22
Integer data array class.
Definition: DataArrays.h:30
String data array class.
Definition: DataArrays.h:38
DateTime Class.
Definition: DateTime.h:33
Representation of a digestion enzyme for proteins (protease)
Definition: DigestionEnzymeProtein.h:24
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide end...
Definition: EnzymaticDigestion.h:42
Invalid value exception.
Definition: Exception.h:305
Not all required information provided.
Definition: Exception.h:155
static String concatenate(const std::vector< T > &container, const String &glue="")
Concatenates all elements of the container and puts the glue string between elements.
Definition: ListUtils.h:184
In-Memory representation of a mass spectrometry run.
Definition: MSExperiment.h:46
Interface for classes that can store arbitrary meta information (Type-Name-Value tuples).
Definition: MetaInfoInterface.h:35
const DataValue & getMetaValue(const String &name) const
Returns the value corresponding to a string, or DataValue::EMPTY if not found.
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:39
const String & getIdentifier() const
Returns the identifier which links this PI to its corresponding ProteinIdentification.
Representation of a protein hit.
Definition: ProteinHit.h:34
Bundles multiple (e.g. indistinguishable) proteins in a group.
Definition: ProteinIdentification.h:105
void setIntegerDataArrays(const IntegerDataArrays &ida)
Sets the integer meta data arrays.
FloatDataArrays & getFloatDataArrays()
Returns a mutable reference to the float meta data arrays.
Definition: ProteinIdentification.h:154
double probability
Probability of this group.
Definition: ProteinIdentification.h:118
bool operator<(const ProteinGroup &rhs) const
IntegerDataArray & getIntegerDataArrayByName(String name)
Returns a mutable reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:181
std::vector< StringDataArray > StringDataArrays
Definition: ProteinIdentification.h:112
OpenMS::DataArrays::FloatDataArray FloatDataArray
Float data array vector type.
Definition: ProteinIdentification.h:108
StringDataArrays & getStringDataArrays()
Returns a mutable reference to the string meta data arrays.
const IntegerDataArray & getIntegerDataArrayByName(String name) const
Returns a const reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:202
std::vector< String > accessions
Accessions of (indistinguishable) proteins that belong to the same group.
Definition: ProteinIdentification.h:121
FloatDataArray & getFloatDataArrayByName(String name)
Returns a mutable reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:195
StringDataArrays string_data_arrays_
String data arrays.
Definition: ProteinIdentification.h:227
StringDataArray & getStringDataArrayByName(String name)
Returns a mutable reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:188
const StringDataArray & getStringDataArrayByName(String name) const
Returns a const reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:209
const IntegerDataArrays & getIntegerDataArrays() const
Returns a const reference to the integer meta data arrays.
IntegerDataArrays integer_data_arrays_
Integer data arrays.
Definition: ProteinIdentification.h:230
OpenMS::DataArrays::StringDataArray StringDataArray
String data array vector type.
Definition: ProteinIdentification.h:111
FloatDataArrays float_data_arrays_
Float data arrays.
Definition: ProteinIdentification.h:224
const FloatDataArray & getFloatDataArrayByName(String name) const
Returns a const reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:216
const FloatDataArrays & getFloatDataArrays() const
Returns a const reference to the float meta data arrays.
void setStringDataArrays(const StringDataArrays &sda)
Sets the string meta data arrays.
const StringDataArrays & getStringDataArrays() const
Returns a const reference to the string meta data arrays.
std::vector< FloatDataArray > FloatDataArrays
Definition: ProteinIdentification.h:109
OpenMS::DataArrays::IntegerDataArray IntegerDataArray
Integer data array vector type.
Definition: ProteinIdentification.h:114
bool operator==(const ProteinGroup &rhs) const
Equality operator.
std::vector< IntegerDataArray > IntegerDataArrays
Definition: ProteinIdentification.h:115
IntegerDataArrays & getIntegerDataArrays()
Returns a mutable reference to the integer meta data arrays.
void setFloatDataArrays(const FloatDataArrays &fda)
Sets the float meta data arrays.
Representation of a protein identification run.
Definition: ProteinIdentification.h:50
void setIdentifier(const String &id)
Sets the identifier.
void computeModifications(const std::vector< PeptideIdentification > &pep_ids, const StringList &skip_modifications)
Compute the modifications of all ProteinHits given PeptideHits.
ProteinIdentification(const ProteinIdentification &)=default
Copy constructor.
const String & getIdentifier() const
Returns the identifier.
void insertProteinGroup(const ProteinGroup &group)
Appends a new protein group.
void insertHit(const ProteinHit &input)
Appends a protein hit.
const String getInferenceEngineVersion() const
Returns the inference engine version.
void setSearchEngine(const String &search_engine)
Sets the search engine type.
void addPrimaryMSRunPath(const String &s, bool raw=false)
SearchParameters & getSearchParameters()
Returns the search parameters (mutable)
const String getOriginalSearchEngineName() const
Return the type of search engine that was first applied (e.g., before percolator or consensusID) or "...
bool hasInferenceEngineAsSearchEngine() const
Checks if the search engine name matches an inference engine known to OpenMS.
void computeCoverageFromEvidenceMapping_(const std::unordered_map< String, std::set< PeptideEvidence >> &map)
void setHigherScoreBetter(bool higher_is_better)
Sets the orientation of the score (is higher better?)
const std::vector< ProteinGroup > & getIndistinguishableProteins() const
Returns the indistinguishable proteins.
Size nrPrimaryMSRunPaths(bool raw=false) const
Returns the number of primary MS runs involved in this ID run.
const String & getSearchEngineVersion() const
Returns the search engine version.
double getSignificanceThreshold() const
Returns the protein significance threshold value.
const String getInferenceEngine() const
Returns the type of inference engine used.
void sort()
Sorts the protein hits according to their score.
void insertIndistinguishableProteins(const ProteinGroup &group)
Appends new indistinguishable proteins.
String search_engine_
Definition: ProteinIdentification.h:478
std::vector< std::pair< String, String > > getSearchEngineSettingsAsPairs(const String &se="") const
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:235
@ AVERAGE
Definition: ProteinIdentification.h:237
@ MONOISOTOPIC
Definition: ProteinIdentification.h:236
ProteinIdentification()
Default constructor.
const std::vector< ProteinGroup > & getProteinGroups() const
Returns the protein groups.
void setSignificanceThreshold(double value)
Sets the protein significance threshold value.
void fillModMapping_(const std::vector< PeptideIdentification > &pep_ids, const StringList &skip_modifications, std::unordered_map< String, std::set< std::pair< Size, ResidueModification >>> &prot2mod) const
std::vector< ProteinHit > protein_hits_
Definition: ProteinIdentification.h:488
const SearchParameters & getSearchParameters() const
Returns the search parameters.
void setInferenceEngineVersion(const String &inference_engine_version)
Sets the inference engine version.
bool operator!=(const ProteinIdentification &rhs) const
Inequality operator.
ProteinHit HitType
Hit type definition.
Definition: ProteinIdentification.h:53
String search_engine_version_
Definition: ProteinIdentification.h:479
void setSearchEngineVersion(const String &search_engine_version)
Sets the search engine version.
ProteinIdentification & operator=(const ProteinIdentification &)=default
Assignment operator.
void setHits(const std::vector< ProteinHit > &hits)
Sets the protein hits.
void computeCoverage(const ConsensusMap &cmap, bool use_unassigned_ids)
void getPrimaryMSRunPath(StringList &output, bool raw=false) const
double protein_significance_threshold_
Definition: ProteinIdentification.h:492
SearchParameters search_parameters_
Definition: ProteinIdentification.h:480
void fillIndistinguishableGroupsWithSingletons()
Appends singleton groups (with the current score) for every yet ungrouped protein hit.
void setScoreType(const String &type)
Sets the protein score type.
String protein_score_type_
Definition: ProteinIdentification.h:486
bool higher_score_better_
Definition: ProteinIdentification.h:487
std::vector< ProteinGroup > & getIndistinguishableProteins()
Returns the indistinguishable proteins (mutable)
void setPrimaryMSRunPath(const StringList &s, bool raw=false)
void computeModifications(const ConsensusMap &cmap, const StringList &skip_modifications, bool use_unassigned_ids)
void assignRanks()
Sorts the protein hits by score and assigns ranks (best score has rank 1)
void fillEvidenceMapping_(std::unordered_map< String, std::set< PeptideEvidence > > &map_acc_2_evidence, const std::vector< PeptideIdentification > &pep_ids) const
DateTime date_
Definition: ProteinIdentification.h:481
std::vector< ProteinHit >::iterator findHit(const String &accession)
Finds a protein hit by accession (returns past-the-end iterator if not found)
void setInferenceEngine(const String &search_engine)
Sets the inference engine type.
void copyMetaDataOnly(const ProteinIdentification &)
Copies only metadata (no protein hits or protein groups)
void computeCoverage(const std::vector< PeptideIdentification > &pep_ids)
Compute the coverage (in percent) of all ProteinHits given PeptideHits.
std::vector< ProteinHit > & getHits()
Returns the protein hits (mutable)
bool peptideIDsMergeable(const ProteinIdentification &id_run, const String &experiment_type) const
bool isHigherScoreBetter() const
Returns true if a higher score represents a better score.
ProteinIdentification(ProteinIdentification &&)=default
Move constructor.
std::vector< ProteinGroup > indistinguishable_proteins_
Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
Definition: ProteinIdentification.h:491
ProteinIdentification & operator=(ProteinIdentification &&)=default
Move assignment operator.
std::vector< ProteinGroup > protein_groups_
Definition: ProteinIdentification.h:489
virtual ~ProteinIdentification()
Destructor.
void setDateTime(const DateTime &date)
Sets the date of the protein identification run.
void setSearchParameters(SearchParameters &&search_parameters)
Sets the search parameters (move)
String id_
Definition: ProteinIdentification.h:477
bool operator==(const ProteinIdentification &rhs) const
Equality operator.
void insertHit(ProteinHit &&input)
Appends a protein hit.
const String & getSearchEngine() const
Returns the type of search engine used.
const String & getScoreType() const
Returns the protein score type.
void setPrimaryMSRunPath(const StringList &s, MSExperiment &e)
set the file path to the primary MS run but try to use the mzML annotated in the MSExperiment.
void addPrimaryMSRunPath(const StringList &s, bool raw=false)
void setSearchParameters(const SearchParameters &search_parameters)
Sets the search parameters.
std::vector< ProteinGroup > & getProteinGroups()
Returns the protein groups (mutable)
A more convenient string class.
Definition: String.h:34
unsigned int UInt
Unsigned integer type.
Definition: Types.h:64
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:97
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:44
const std::string ID_MERGE_INDEX
Definition: Constants.h:307
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
two way mapping from ms-run-path to protID|pepID-identifier
Definition: ProteinIdentification.h:57
void create(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:68
std::map< StringList, String > runpath_to_identifier
Definition: ProteinIdentification.h:59
Mapping(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:63
String getPrimaryMSRunPath(const PeptideIdentification &pepid) const
Definition: ProteinIdentification.h:92
std::map< String, StringList > identifier_to_msrunpath
Definition: ProteinIdentification.h:58
Search parameters of the DB search.
Definition: ProteinIdentification.h:247
String db_version
The database version.
Definition: ProteinIdentification.h:249
std::pair< int, int > getChargeRange() const
returns the charge range from the search engine settings as a pair of ints
bool operator!=(const SearchParameters &rhs) const
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:257
EnzymaticDigestion::Specificity enzyme_term_specificity
The number of required cutting-rule matching termini during search (none=0, semi=1,...
Definition: ProteinIdentification.h:261
String taxonomy
The taxonomy restriction.
Definition: ProteinIdentification.h:250
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:253
SearchParameters & operator=(const SearchParameters &)=default
Assignment operator.
SearchParameters(const SearchParameters &)=default
Copy constructor.
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:251
Protease digestion_enzyme
The cleavage site information in details (from ProteaseDB)
Definition: ProteinIdentification.h:260
bool operator==(const SearchParameters &rhs) const
bool mergeable(const ProteinIdentification::SearchParameters &sp, const String &experiment_type) const
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:256
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:259
SearchParameters(SearchParameters &&)=default
Move constructor.
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:258
SearchParameters & operator=(SearchParameters &&) &=default
Move assignment operator.
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:254
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:252
String db
The used database.
Definition: ProteinIdentification.h:248
int getChargeValue_(String &charge_str) const
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:255