OpenMS  2.6.0
ProteinIdentification.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2020.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Nico Pfeifer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
43 
44 #include <set>
45 
46 namespace OpenMS
47 {
48  class MSExperiment;
49  class PeptideIdentification;
50 
70  class OPENMS_DLLAPI ProteinIdentification :
71  public MetaInfoInterface
72  {
73 public:
76 
78  struct Mapping
79  {
80  std::map<String, StringList> identifier_to_msrunpath;
81  std::map<StringList, String> runpath_to_identifier;
82 
83  Mapping() = default;
84 
85  explicit Mapping(const std::vector<ProteinIdentification>& prot_ids)
86  {
87  create(prot_ids);
88  }
89  void create(const std::vector<ProteinIdentification>& prot_ids)
90  {
91  identifier_to_msrunpath.clear();
92  runpath_to_identifier.clear();
93  StringList filenames;
94  for (const ProteinIdentification& prot_id : prot_ids)
95  {
96  prot_id.getPrimaryMSRunPath(filenames);
97  if (filenames.empty())
98  {
99  throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No MS run path annotated in ProteinIdentification.");
100  }
101  identifier_to_msrunpath[prot_id.getIdentifier()] = filenames;
102  const auto& it = runpath_to_identifier.find(filenames);
103  if (it != runpath_to_identifier.end())
104  {
105  throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION,
106  "Multiple protein identifications with the same ms-run-path in Consensus/FeatureXML. Check input!\n",
107  ListUtils::concatenate(filenames, ","));
108  }
109  runpath_to_identifier[filenames] = prot_id.getIdentifier();
110  }
111  }
112  };
113 
117  class OPENMS_DLLAPI ProteinGroup
118  {
119  public:
122  typedef std::vector<FloatDataArray> FloatDataArrays;
125  typedef std::vector<StringDataArray> StringDataArrays;
128  typedef std::vector<IntegerDataArray> IntegerDataArrays;
129 
131  double probability;
132 
134  std::vector<String> accessions;
135 
136  ProteinGroup();
137 
139  bool operator==(const ProteinGroup& rhs) const;
140 
141  /*
142  @brief Comparison operator (for sorting)
143 
144  This operator is intended for sorting protein groups in a "best first"
145  manner. That means higher probabilities are "less" than lower
146  probabilities (!); smaller groups are "less" than larger groups;
147  everything else being equal, accessions are compared lexicographically.
148  */
149  bool operator<(const ProteinGroup& rhs) const;
150 
152 
163  const FloatDataArrays& getFloatDataArrays() const;
165 
168  {
169  return float_data_arrays_;
170  }
171 
173  void setFloatDataArrays(const FloatDataArrays& fda);
174 
176  const StringDataArrays& getStringDataArrays() const;
177 
179  StringDataArrays& getStringDataArrays();
180 
182  void setStringDataArrays(const StringDataArrays& sda);
183 
185  const IntegerDataArrays& getIntegerDataArrays() const;
186 
188  IntegerDataArrays& getIntegerDataArrays();
189 
191  void setIntegerDataArrays(const IntegerDataArrays& ida);
192 
195  {
196  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
197  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
198  }
199 
202  {
203  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
204  [&name](const StringDataArray& da) { return da.getName() == name; } );
205  }
206 
209  {
210  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
211  [&name](const FloatDataArray& da) { return da.getName() == name; } );
212  }
213 
216  {
217  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
218  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
219  }
220 
223  {
224  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
225  [&name](const StringDataArray& da) { return da.getName() == name; } );
226  }
227 
230  {
231  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
232  [&name](const FloatDataArray& da) { return da.getName() == name; } );
233  }
234 
235  private:
238 
241 
244  };
245 
/// Peak mass type
// NOTE(review): the enum header and the first two enumerators were missing in
// this copy and are restored from the symbol index of the same page; confirm
// against upstream that neither enumerator carries an explicit initializer.
enum PeakMassType
{
  MONOISOTOPIC,
  AVERAGE,
  SIZE_OF_PEAKMASSTYPE ///< number of entries; used as the array size below
};

/// One name per PeakMassType value (array length is SIZE_OF_PEAKMASSTYPE)
static const std::string NamesOfPeakMassType[SIZE_OF_PEAKMASSTYPE];
256 
258  struct OPENMS_DLLAPI SearchParameters :
259  public MetaInfoInterface
260  {
266  std::vector<String> fixed_modifications;
267  std::vector<String> variable_modifications;
275 
278  SearchParameters(const SearchParameters&) = default;
280  SearchParameters(SearchParameters&&) = default;
282  ~SearchParameters() = default;
283 
285  SearchParameters& operator=(const SearchParameters&) = default;
287  SearchParameters& operator=(SearchParameters&&)& = default;
288 
289  bool operator==(const SearchParameters& rhs) const;
290 
291  bool operator!=(const SearchParameters& rhs) const;
292 
294  std::pair<int,int> getChargeRange() const;
295 
300  bool mergeable(const ProteinIdentification::SearchParameters& sp, const String& experiment_type) const;
301 
302  private:
303  int getChargeValue_(String& charge_str) const;
304  };
305 
// NOTE(review): the embedded numbering jumps from 305 to 315 right before this
// point; the constructor declarations were presumably lost when this copy was
// extracted - confirm against the upstream header.
// Destructor (declared virtual).
315  virtual ~ProteinIdentification();
316 
// Copy and move assignment, both compiler-generated.
318  ProteinIdentification& operator=(const ProteinIdentification&) = default;
320  ProteinIdentification& operator=(ProteinIdentification&&) = default;
321 
// Equality / inequality comparison with another identification run.
323  bool operator==(const ProteinIdentification& rhs) const;
325  bool operator!=(const ProteinIdentification& rhs) const;
327 
329 
// --- Protein hits and protein groups ---
// Read-only and mutable access to the protein hits.
330  const std::vector<ProteinHit>& getHits() const;
333  std::vector<ProteinHit>& getHits();
// Adds a single protein hit; overloads for copying and for moving from an rvalue.
335  void insertHit(const ProteinHit& input);
337  void insertHit(ProteinHit&& input);
338 
// Sets the protein hits from the given vector.
344  void setHits(const std::vector<ProteinHit>& hits);
345 
// Looks up a hit by accession and returns an iterator into the hit vector.
// The result for an unknown accession is not visible from this header
// (presumably the end iterator - confirm in the implementation).
347  std::vector<ProteinHit>::iterator findHit(const String& accession);
348 
// Read-only and mutable access to the protein groups, plus insertion of one group.
350  const std::vector<ProteinGroup>& getProteinGroups() const;
352  std::vector<ProteinGroup>& getProteinGroups();
354  void insertProteinGroup(const ProteinGroup& group);
355 
// Access to the groups of indistinguishable proteins. The index of this page
// documents the underlying member as: accessions[0] is the "group leader",
// probability is meaningless.
357  const std::vector<ProteinGroup>& getIndistinguishableProteins() const;
359  std::vector<ProteinGroup>& getIndistinguishableProteins();
361  void insertIndistinguishableProteins(const ProteinGroup& group);
// NOTE(review): judging by the name, adds one-member groups for proteins that
// are not yet part of any indistinguishable group - confirm in the implementation.
363  void fillIndistinguishableGroupsWithSingletons();
364 
// Protein significance threshold (getter / setter).
366  double getSignificanceThreshold() const;
368  void setSignificanceThreshold(double value);
// Protein score type (getter / setter).
370  const String& getScoreType() const;
372  void setScoreType(const String& type);
// Score orientation flag (getter / setter).
374  bool isHigherScoreBetter() const;
376  void setHigherScoreBetter(bool higher_is_better);
// Sorting and rank assignment; ordering criteria are defined in the implementation file.
378  void sort();
380  void assignRanks();
// Computes coverage from the given peptide identifications.
388  void computeCoverage(const std::vector<PeptideIdentification>& pep_ids);
390 
// Computes modifications from the given peptide identifications, taking a
// list of modifications to skip.
397  void computeModifications(
398  const std::vector<PeptideIdentification>& pep_ids,
399  const StringList& skip_modifications);
400 
401 
403 
// --- General information about the identification run ---
// Date of the run (getter / setter).
404  const DateTime& getDateTime() const;
407  void setDateTime(const DateTime& date);
// Search engine name and version. Note that getOriginalSearchEngineName()
// returns a String by value, whereas getSearchEngine() returns a reference.
409  void setSearchEngine(const String& search_engine);
411  const String& getSearchEngine() const;
413  const String getOriginalSearchEngineName() const;
415  void setSearchEngineVersion(const String& search_engine_version);
417  const String& getSearchEngineVersion() const;
// Inference engine name and version; both getters return by value.
// NOTE(review): the parameter of setInferenceEngine() is named 'search_engine',
// which looks like a copy/paste leftover - confirm the intended name.
419  void setInferenceEngine(const String& search_engine);
421  const String getInferenceEngine() const;
423  void setInferenceEngineVersion(const String& inference_engine_version);
425  const String getInferenceEngineVersion() const;
// Search parameters: copying and moving setters, read-only and mutable getters.
427  void setSearchParameters(const SearchParameters& search_parameters);
429  void setSearchParameters(SearchParameters&& search_parameters);
431  const SearchParameters& getSearchParameters() const;
433  SearchParameters& getSearchParameters();
// Identifier of this run; Mapping uses it as the key that is paired with the MS run paths.
435  const String& getIdentifier() const;
437  void setIdentifier(const String& id);
// Sets the primary MS run path(s). The meaning of the 'raw' flag is not
// visible from this header - see the implementation.
443  void setPrimaryMSRunPath(const StringList& s, bool raw = false);
444 
// Overload that additionally takes an MSExperiment by non-const reference.
446  void setPrimaryMSRunPath(const StringList& s, MSExperiment& e);
// Adds one path or a list of paths to the primary MS run paths.
447  void addPrimaryMSRunPath(const String& s, bool raw = false);
448  void addPrimaryMSRunPath(const StringList& s, bool raw = false);
449 
// Writes the primary MS run path(s) into 'output'. Mapping::create() treats
// an empty result as "No MS run path annotated".
455  void getPrimaryMSRunPath(StringList& output, bool raw = false) const;
456 
// Number of primary MS run paths.
458  Size nrPrimaryMSRunPaths(bool raw = false) const;
459 
// Predicates concerning inference data and the inference engine.
462  bool hasInferenceData() const;
463 
465  bool hasInferenceEngineAsSearchEngine() const;
466 
// Tests whether peptide IDs of 'id_run' can be merged with this run for the
// given experiment type (criteria are defined in the implementation file).
470  bool peptideIDsMergeable(const ProteinIdentification& id_run, const String& experiment_type) const;
471 
// Returns search engine settings as pairs of strings; 'se' defaults to the empty string.
474  std::vector<std::pair<String,String>> getSearchEngineSettingsAsPairs(const String& se = "") const;
475 
477 
478 protected:
480 
487 
489 
492  std::vector<ProteinHit> protein_hits_;
493  std::vector<ProteinGroup> protein_groups_;
495  std::vector<ProteinGroup> indistinguishable_proteins_;
498  };
499 
500 } //namespace OpenMS
OpenMS::EnzymaticDigestion::Specificity
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide end...
Definition: EnzymaticDigestion.h:66
OpenMS::DataArrays::IntegerDataArray
Integer data array class.
Definition: DataArrays.h:52
OpenMS::ProteinIdentification::ProteinGroup::StringDataArrays
std::vector< StringDataArray > StringDataArrays
Definition: ProteinIdentification.h:125
OpenMS::ProteinIdentification::SearchParameters::digestion_enzyme
Protease digestion_enzyme
The cleavage site information in details (from ProteaseDB)
Definition: ProteinIdentification.h:273
DataArrays.h
OpenMS::ProteinIdentification::Mapping::Mapping
Mapping(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:85
OpenMS::ProteinIdentification::ProteinGroup::FloatDataArrays
std::vector< FloatDataArray > FloatDataArrays
Definition: ProteinIdentification.h:122
OpenMS::ProteinIdentification::ProteinGroup::getStringDataArrayByName
const StringDataArray & getStringDataArrayByName(String name) const
Returns a const reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:222
EnzymaticDigestion.h
OpenMS::ProteinIdentification::PeakMassType
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:247
OpenMS::Exception::InvalidValue
Invalid value exception.
Definition: Exception.h:335
OpenMS::ProteinIdentification::Mapping::identifier_to_msrunpath
std::map< String, StringList > identifier_to_msrunpath
Definition: ProteinIdentification.h:80
DateTime.h
OpenMS::ProteinIdentification::SearchParameters::missed_cleavages
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:268
OpenMS::String
A more convenient string class.
Definition: String.h:59
OpenMS::ProteinIdentification::ProteinGroup::accessions
std::vector< String > accessions
Accessions of (indistinguishable) proteins that belong to the same group.
Definition: ProteinIdentification.h:134
KDTree::operator!=
bool operator!=(_Iterator< _Val, _Ref, _Ptr > const &, _Iterator< _Val, _Ref, _Ptr > const &)
Definition: KDTree.h:824
OpenMS::MSExperiment
In-Memory representation of a mass spectrometry experiment.
Definition: MSExperiment.h:77
OpenMS::Size
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
OpenMS::ProteinHit
Representation of a protein hit.
Definition: ProteinHit.h:58
OpenMS::ProteinIdentification::ProteinGroup::integer_data_arrays_
IntegerDataArrays integer_data_arrays_
Integer data arrays.
Definition: ProteinIdentification.h:243
OpenMS::ProteinIdentification::MONOISOTOPIC
Definition: ProteinIdentification.h:249
OpenMS::ProteinIdentification::ProteinGroup::getIntegerDataArrayByName
const IntegerDataArray & getIntegerDataArrayByName(String name) const
Returns a const reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:215
OpenMS::ProteinIdentification::ProteinGroup::float_data_arrays_
FloatDataArrays float_data_arrays_
Float data arrays.
Definition: ProteinIdentification.h:237
OpenMS::operator<
bool operator<(const QTCluster &q1, const QTCluster &q2)
OpenMS::ProteinIdentification::Mapping::runpath_to_identifier
std::map< StringList, String > runpath_to_identifier
Definition: ProteinIdentification.h:81
OpenMS::ProteinIdentification
Representation of a protein identification run.
Definition: ProteinIdentification.h:70
OpenMS::ProteinIdentification::ProteinGroup::FloatDataArray
OpenMS::DataArrays::FloatDataArray FloatDataArray
Float data array vector type.
Definition: ProteinIdentification.h:121
OpenMS::ProteinIdentification::Mapping
two way mapping from ms-run-path to protID|pepID-identifier
Definition: ProteinIdentification.h:78
OpenMS::ProteinIdentification::search_engine_
String search_engine_
Definition: ProteinIdentification.h:482
OpenMS::ProteinIdentification::SearchParameters::fragment_mass_tolerance_ppm
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:270
OpenMS::ListUtils::concatenate
static String concatenate(const std::vector< T > &container, const String &glue="")
Concatenates all elements of the container and puts the glue string between elements.
Definition: ListUtils.h:193
OpenMS::ProteinIdentification::protein_groups_
std::vector< ProteinGroup > protein_groups_
Definition: ProteinIdentification.h:493
OpenMS::ProteinIdentification::SearchParameters::charges
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:264
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
OpenMS::ProteinIdentification::AVERAGE
Definition: ProteinIdentification.h:250
DigestionEnzymeProtein.h
OpenMS::ProteinIdentification::protein_score_type_
String protein_score_type_
Definition: ProteinIdentification.h:490
OpenMS::ProteinIdentification::SearchParameters::variable_modifications
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:267
OpenMS::ProteinIdentification::SearchParameters::mass_type
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:265
OpenMS::ProteinIdentification::ProteinGroup::getFloatDataArrayByName
FloatDataArray & getFloatDataArrayByName(String name)
Returns a mutable reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:208
OpenMS::ProteinIdentification::protein_significance_threshold_
double protein_significance_threshold_
Definition: ProteinIdentification.h:496
OpenMS::MetaInfoInterface
Interface for classes that can store arbitrary meta information (Type-Name-Value tuples).
Definition: MetaInfoInterface.h:60
ProteinHit.h
OpenMS::ProteinIdentification::SearchParameters::db
String db
The used database.
Definition: ProteinIdentification.h:261
OpenMS::DigestionEnzymeProtein
Representation of a digestion enzyme for proteins (protease)
Definition: DigestionEnzymeProtein.h:48
OpenMS::ProteinIdentification::SearchParameters::db_version
String db_version
The database version.
Definition: ProteinIdentification.h:262
OpenMS::ProteinIdentification::ProteinGroup::getFloatDataArrays
FloatDataArrays & getFloatDataArrays()
Returns a mutable reference to the float meta data arrays.
Definition: ProteinIdentification.h:167
OpenMS::DataArrays::StringDataArray
String data array class.
Definition: DataArrays.h:59
OpenMS::ProteinIdentification::SearchParameters::fragment_mass_tolerance
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:269
OpenMS::StringList
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
OpenMS::ProteinIdentification::search_parameters_
SearchParameters search_parameters_
Definition: ProteinIdentification.h:484
OpenMS::UInt
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
OpenMS::ProteinIdentification::SearchParameters::fixed_modifications
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:266
OpenMS::ProteinIdentification::Mapping::create
void create(const std::vector< ProteinIdentification > &prot_ids)
Definition: ProteinIdentification.h:89
OpenMS::ProteinIdentification::SearchParameters::precursor_mass_tolerance
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:271
OpenMS::ProteinIdentification::indistinguishable_proteins_
std::vector< ProteinGroup > indistinguishable_proteins_
Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
Definition: ProteinIdentification.h:495
OpenMS::Internal::operator==
bool operator==(const IDBoostGraph::ProteinGroup &lhs, const IDBoostGraph::ProteinGroup &rhs)
OpenMS::ProteinIdentification::ProteinGroup::StringDataArray
OpenMS::DataArrays::StringDataArray StringDataArray
String data array vector type.
Definition: ProteinIdentification.h:124
MetaInfoInterface.h
OpenMS::ProteinIdentification::ProteinGroup::string_data_arrays_
StringDataArrays string_data_arrays_
String data arrays.
Definition: ProteinIdentification.h:240
OpenMS::ProteinIdentification::ProteinGroup::getStringDataArrayByName
StringDataArray & getStringDataArrayByName(String name)
Returns a mutable reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:201
OpenMS::ProteinIdentification::SearchParameters::precursor_mass_tolerance_ppm
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:272
OpenMS::ProteinIdentification::date_
DateTime date_
Definition: ProteinIdentification.h:485
OpenMS::Exception::MissingInformation
Not all required information provided.
Definition: Exception.h:195
OpenMS::ProteinIdentification::ProteinGroup::getIntegerDataArrayByName
IntegerDataArray & getIntegerDataArrayByName(String name)
Returns a mutable reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:194
OpenMS::ProteinIdentification::ProteinGroup::probability
double probability
Probability of this group.
Definition: ProteinIdentification.h:131
OpenMS::ProteinIdentification::SearchParameters
Search parameters of the DB search.
Definition: ProteinIdentification.h:258
OpenMS::ProteinIdentification::ProteinGroup::IntegerDataArrays
std::vector< IntegerDataArray > IntegerDataArrays
Definition: ProteinIdentification.h:128
OpenMS::ProteinIdentification::SearchParameters::enzyme_term_specificity
EnzymaticDigestion::Specificity enzyme_term_specificity
The number of required cutting-rule matching termini during search (none=0, semi=1,...
Definition: ProteinIdentification.h:274
OpenMS::DataArrays::FloatDataArray
Float data array class.
Definition: DataArrays.h:45
OpenMS::ProteinIdentification::protein_hits_
std::vector< ProteinHit > protein_hits_
Definition: ProteinIdentification.h:492
OpenMS::ProteinIdentification::SearchParameters::taxonomy
String taxonomy
The taxonomy restriction.
Definition: ProteinIdentification.h:263
OpenMS::ProteinIdentification::ProteinGroup::getFloatDataArrayByName
const FloatDataArray & getFloatDataArrayByName(String name) const
Returns a const reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:229
OpenMS::DateTime
DateTime Class.
Definition: DateTime.h:54
OpenMS::ProteinIdentification::search_engine_version_
String search_engine_version_
Definition: ProteinIdentification.h:483
OpenMS::ProteinIdentification::HitType
ProteinHit HitType
Hit type definition.
Definition: ProteinIdentification.h:75
OpenMS::ProteinIdentification::ProteinGroup::IntegerDataArray
OpenMS::DataArrays::IntegerDataArray IntegerDataArray
Integer data array vector type.
Definition: ProteinIdentification.h:127
OpenMS::ProteinIdentification::higher_score_better_
bool higher_score_better_
Definition: ProteinIdentification.h:491
OpenMS::ProteinIdentification::id_
String id_
Definition: ProteinIdentification.h:481
OpenMS::ProteinIdentification::ProteinGroup
Bundles multiple (e.g. indistinguishable) proteins in a group.
Definition: ProteinIdentification.h:117