OpenMS  2.5.0
ProteinIdentification.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2020.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Nico Pfeifer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
42 
43 
44 
45 #include <set>
46 
47 namespace OpenMS
48 {
49  class MSExperiment;
50  class PeptideIdentification;
51 
71  class OPENMS_DLLAPI ProteinIdentification :
72  public MetaInfoInterface
73  {
74 public:
77 
81  class OPENMS_DLLAPI ProteinGroup
82  {
83  public:
86  typedef std::vector<FloatDataArray> FloatDataArrays;
89  typedef std::vector<StringDataArray> StringDataArrays;
92  typedef std::vector<IntegerDataArray> IntegerDataArrays;
93 
95  double probability; ///< Probability of this group.
96 
98  std::vector<String> accessions; ///< Accessions of (indistinguishable) proteins that belong to the same group.
99 
100  ProteinGroup();
101 
103  bool operator==(const ProteinGroup& rhs) const;
104 
105  /**
106  @brief Comparison operator (for sorting)
107 
108  This operator is intended for sorting protein groups in a "best first"
109  manner. That means higher probabilities are "less" than lower
110  probabilities (!); smaller groups are "less" than larger groups;
111  everything else being equal, accessions are compared lexicographically.
112  */
113  bool operator<(const ProteinGroup& rhs) const;
114 
116 
127  const FloatDataArrays& getFloatDataArrays() const;
129 
132  {
  // Body of the non-const getFloatDataArrays() overload (its signature, original line 131,
  // is missing from this listing; the symbol index gives it as `FloatDataArrays & getFloatDataArrays()`).
  // Hands back the float_data_arrays_ member itself rather than a copy.
133  return float_data_arrays_;
134  }
135 
137  void setFloatDataArrays(const FloatDataArrays& fda);
138 
140  const StringDataArrays& getStringDataArrays() const;
141 
143  StringDataArrays& getStringDataArrays();
144 
146  void setStringDataArrays(const StringDataArrays& sda);
147 
149  const IntegerDataArrays& getIntegerDataArrays() const;
150 
152  IntegerDataArrays& getIntegerDataArrays();
153 
155  void setIntegerDataArrays(const IntegerDataArrays& ida);
156 
159  {
  // Body of the mutable getIntegerDataArrayByName(String name) overload (signature line 158
  // is missing from this listing; see the symbol index). Yields the first array whose
  // getName() equals `name`.
  // NOTE(review): the std::find_if result is dereferenced without comparing it against
  // integer_data_arrays_.end(); when no array carries that name this dereferences the end
  // iterator, which is undefined behavior.
160  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
161  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
162  }
163 
166  {
  // Body of the mutable getStringDataArrayByName(String name) overload (signature line 165
  // is missing from this listing; see the symbol index). Yields the first array whose
  // getName() equals `name`.
  // NOTE(review): the std::find_if result is dereferenced without comparing it against
  // string_data_arrays_.end(); when no array carries that name this dereferences the end
  // iterator, which is undefined behavior.
167  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
168  [&name](const StringDataArray& da) { return da.getName() == name; } );
169  }
170 
173  {
  // Body of the mutable getFloatDataArrayByName(String name) overload (signature line 172
  // is missing from this listing; see the symbol index). Yields the first array whose
  // getName() equals `name`.
  // NOTE(review): the std::find_if result is dereferenced without comparing it against
  // float_data_arrays_.end(); when no array carries that name this dereferences the end
  // iterator, which is undefined behavior.
174  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
175  [&name](const FloatDataArray& da) { return da.getName() == name; } );
176  }
177 
180  {
  // Body of the const getIntegerDataArrayByName(String name) overload (signature line 179
  // is missing from this listing; see the symbol index). Yields the first array whose
  // getName() equals `name`.
  // NOTE(review): the std::find_if result is dereferenced without comparing it against
  // integer_data_arrays_.end(); when no array carries that name this dereferences the end
  // iterator, which is undefined behavior.
181  return *std::find_if(integer_data_arrays_.begin(), integer_data_arrays_.end(),
182  [&name](const IntegerDataArray& da) { return da.getName() == name; } );
183  }
184 
187  {
  // Body of the const getStringDataArrayByName(String name) overload (signature line 186
  // is missing from this listing; see the symbol index). Yields the first array whose
  // getName() equals `name`.
  // NOTE(review): the std::find_if result is dereferenced without comparing it against
  // string_data_arrays_.end(); when no array carries that name this dereferences the end
  // iterator, which is undefined behavior.
188  return *std::find_if(string_data_arrays_.begin(), string_data_arrays_.end(),
189  [&name](const StringDataArray& da) { return da.getName() == name; } );
190  }
191 
194  {
  // Body of the const getFloatDataArrayByName(String name) overload (signature line 193
  // is missing from this listing; see the symbol index). Yields the first array whose
  // getName() equals `name`.
  // NOTE(review): the std::find_if result is dereferenced without comparing it against
  // float_data_arrays_.end(); when no array carries that name this dereferences the end
  // iterator, which is undefined behavior.
195  return *std::find_if(float_data_arrays_.begin(), float_data_arrays_.end(),
196  [&name](const FloatDataArray& da) { return da.getName() == name; } );
197  }
198 
199  private:
202 
205 
208  };
209 
212  {
215  SIZE_OF_PEAKMASSTYPE
216  };
218  static const std::string NamesOfPeakMassType[SIZE_OF_PEAKMASSTYPE];
219 
221  struct OPENMS_DLLAPI SearchParameters :
222  public MetaInfoInterface
223  {
  // Search parameters of the DB search.
  // NOTE(review): original lines 224-228 and 231-236 are absent from this listing; the
  // symbol index further down attributes db, db_version, taxonomy, charges, mass_type,
  // missed_cleavages, fragment_mass_tolerance(_ppm), precursor_mass_tolerance(_ppm) and
  // digestion_enzyme to those lines.
229  std::vector<String> fixed_modifications; ///< Used fixed modifications.
230  std::vector<String> variable_modifications; ///< Allowed variable modifications.
237 
  // Copy/move construction, destruction and both assignments are explicitly defaulted.
240  SearchParameters(const SearchParameters&) = default;
242  SearchParameters(SearchParameters&&) = default;
244  ~SearchParameters() = default;
245 
247  SearchParameters& operator=(const SearchParameters&) = default;
  // The trailing '&' ref-qualifier restricts move assignment to lvalue targets;
  // the copy assignment above carries no such qualifier.
249  SearchParameters& operator=(SearchParameters&&)& = default;
250 
251  bool operator==(const SearchParameters& rhs) const;
252 
253  bool operator!=(const SearchParameters& rhs) const;
254 
  // Returns a pair of ints; presumably (min, max) charge — order not visible here, confirm in the definition.
256  std::pair<int,int> getChargeRange() const;
257 
262  bool mergeable(const ProteinIdentification::SearchParameters& sp, const String& experiment_type) const;
263 
264  private:
  // Takes the charge string by non-const reference; whether it is modified is not visible in this header.
265  int getChargeValue_(String& charge_str) const;
266  };
267 
277  virtual ~ProteinIdentification();
278 
280  ProteinIdentification& operator=(const ProteinIdentification&) = default;
282  ProteinIdentification& operator=(ProteinIdentification&&) = default;
283 
285  bool operator==(const ProteinIdentification& rhs) const;
287  bool operator!=(const ProteinIdentification& rhs) const;
289 
291 
292  const std::vector<ProteinHit>& getHits() const;
295  std::vector<ProteinHit>& getHits();
297  void insertHit(const ProteinHit& input);
299  void insertHit(ProteinHit&& input);
300 
306  void setHits(const std::vector<ProteinHit>& hits);
307 
309  std::vector<ProteinHit>::iterator findHit(const String& accession);
310 
312  const std::vector<ProteinGroup>& getProteinGroups() const;
314  std::vector<ProteinGroup>& getProteinGroups();
316  void insertProteinGroup(const ProteinGroup& group);
317 
319  const std::vector<ProteinGroup>& getIndistinguishableProteins() const;
321  std::vector<ProteinGroup>& getIndistinguishableProteins();
323  void insertIndistinguishableProteins(const ProteinGroup& group);
325  void fillIndistinguishableGroupsWithSingletons();
326 
328  double getSignificanceThreshold() const;
330  void setSignificanceThreshold(double value);
332  const String& getScoreType() const;
334  void setScoreType(const String& type);
336  bool isHigherScoreBetter() const;
338  void setHigherScoreBetter(bool higher_is_better);
340  void sort();
342  void assignRanks();
350  void computeCoverage(const std::vector<PeptideIdentification>& pep_ids);
352 
359  void computeModifications(
360  const std::vector<PeptideIdentification>& pep_ids,
361  const StringList& skip_modifications);
362 
363 
365 
366  const DateTime& getDateTime() const;
369  void setDateTime(const DateTime& date);
371  void setSearchEngine(const String& search_engine);
373  const String& getSearchEngine() const;
375  void setSearchEngineVersion(const String& search_engine_version);
377  const String& getSearchEngineVersion() const;
379  void setInferenceEngine(const String& search_engine);
  // NOTE(review): the parameter is called `search_engine` although this is the inference-engine
  // setter (setSearchEngine at 371 uses the same parameter name, setInferenceEngineVersion at 383
  // uses `inference_engine_version`) — looks like a copy-paste name; confirm against the definition.
381  const String getInferenceEngine() const;
  // NOTE(review): returns a String by value with a top-level const, whereas getSearchEngine() (373)
  // returns `const String&`. The const on a by-value return prevents callers from moving out of
  // the returned temporary; why a copy is returned is not visible in this header.
383  void setInferenceEngineVersion(const String& inference_engine_version);
385  const String getInferenceEngineVersion() const;
  // NOTE(review): returns a String by value with a top-level const, whereas getSearchEngineVersion()
  // (377) returns `const String&`. The const on a by-value return prevents callers from moving out
  // of the returned temporary; why a copy is returned is not visible in this header.
387  void setSearchParameters(const SearchParameters& search_parameters);
389  void setSearchParameters(SearchParameters&& search_parameters);
391  const SearchParameters& getSearchParameters() const;
393  SearchParameters& getSearchParameters();
395  const String& getIdentifier() const;
397  void setIdentifier(const String& id);
403  void setPrimaryMSRunPath(const StringList& s, bool raw = false);
404 
406  void setPrimaryMSRunPath(const StringList& s, MSExperiment& e);
407  void addPrimaryMSRunPath(const String& s, bool raw = false);
408  void addPrimaryMSRunPath(const StringList& s, bool raw = false);
409 
415  void getPrimaryMSRunPath(StringList& output, bool raw = false) const;
416 
419  bool hasInferenceData() const;
420 
422  bool hasInferenceEngineAsSearchEngine() const;
423 
427  bool peptideIDsMergeable(const ProteinIdentification& id_run, const String& experiment_type) const;
429 
430 protected:
432 
439 
441 
444  std::vector<ProteinHit> protein_hits_;
445  std::vector<ProteinGroup> protein_groups_;
447  std::vector<ProteinGroup> indistinguishable_proteins_; ///< Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
450  };
451 
452 } //namespace OpenMS
OpenMS::DataArrays::IntegerDataArray
Integer data array class.
Definition: DataArrays.h:52
OpenMS::ProteinIdentification::ProteinGroup::StringDataArrays
std::vector< StringDataArray > StringDataArrays
Definition: ProteinIdentification.h:89
OpenMS::ProteinIdentification::SearchParameters::digestion_enzyme
Protease digestion_enzyme
The cleavage site information in detail (from ProteaseDB)
Definition: ProteinIdentification.h:236
DataArrays.h
OpenMS::ProteinIdentification::ProteinGroup::FloatDataArrays
std::vector< FloatDataArray > FloatDataArrays
Definition: ProteinIdentification.h:86
OpenMS::ProteinIdentification::ProteinGroup::getStringDataArrayByName
const StringDataArray & getStringDataArrayByName(String name) const
Returns a const reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:186
OpenMS::ProteinIdentification::PeakMassType
PeakMassType
Peak mass type.
Definition: ProteinIdentification.h:211
DateTime.h
OpenMS::ProteinIdentification::SearchParameters::missed_cleavages
UInt missed_cleavages
The number of allowed missed cleavages.
Definition: ProteinIdentification.h:231
OpenMS::String
A more convenient string class.
Definition: String.h:58
OpenMS::ProteinIdentification::ProteinGroup::accessions
std::vector< String > accessions
Accessions of (indistinguishable) proteins that belong to the same group.
Definition: ProteinIdentification.h:98
KDTree::operator!=
bool operator!=(_Iterator< _Val, _Ref, _Ptr > const &, _Iterator< _Val, _Ref, _Ptr > const &)
Definition: KDTree.h:824
OpenMS::MSExperiment
In-Memory representation of a mass spectrometry experiment.
Definition: MSExperiment.h:77
OpenMS::ProteinHit
Representation of a protein hit.
Definition: ProteinHit.h:57
OpenMS::ProteinIdentification::ProteinGroup::integer_data_arrays_
IntegerDataArrays integer_data_arrays_
Integer data arrays.
Definition: ProteinIdentification.h:207
OpenMS::ProteinIdentification::MONOISOTOPIC
Definition: ProteinIdentification.h:213
OpenMS::ProteinIdentification::ProteinGroup::getIntegerDataArrayByName
const IntegerDataArray & getIntegerDataArrayByName(String name) const
Returns a const reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:179
OpenMS::ProteinIdentification::ProteinGroup::float_data_arrays_
FloatDataArrays float_data_arrays_
Float data arrays.
Definition: ProteinIdentification.h:201
OpenMS::operator<
bool operator<(const MultiplexDeltaMasses &dm1, const MultiplexDeltaMasses &dm2)
OpenMS::ProteinIdentification
Representation of a protein identification run.
Definition: ProteinIdentification.h:71
OpenMS::ProteinIdentification::ProteinGroup::FloatDataArray
OpenMS::DataArrays::FloatDataArray FloatDataArray
Float data array type (element type of FloatDataArrays).
Definition: ProteinIdentification.h:85
OpenMS::ProteinIdentification::search_engine_
String search_engine_
Definition: ProteinIdentification.h:434
OpenMS::ProteinIdentification::SearchParameters::fragment_mass_tolerance_ppm
bool fragment_mass_tolerance_ppm
Mass tolerance unit of fragment ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:233
OpenMS::ProteinIdentification::protein_groups_
std::vector< ProteinGroup > protein_groups_
Definition: ProteinIdentification.h:445
OpenMS::ProteinIdentification::SearchParameters::charges
String charges
The allowed charges for the search.
Definition: ProteinIdentification.h:227
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
OpenMS::ProteinIdentification::AVERAGE
Definition: ProteinIdentification.h:214
DigestionEnzymeProtein.h
OpenMS::ProteinIdentification::protein_score_type_
String protein_score_type_
Definition: ProteinIdentification.h:442
OpenMS::ProteinIdentification::SearchParameters::variable_modifications
std::vector< String > variable_modifications
Allowed variable modifications.
Definition: ProteinIdentification.h:230
OpenMS::ProteinIdentification::SearchParameters::mass_type
PeakMassType mass_type
Mass type of the peaks.
Definition: ProteinIdentification.h:228
OpenMS::ProteinIdentification::ProteinGroup::getFloatDataArrayByName
FloatDataArray & getFloatDataArrayByName(String name)
Returns a mutable reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:172
OpenMS::ProteinIdentification::protein_significance_threshold_
double protein_significance_threshold_
Definition: ProteinIdentification.h:448
OpenMS::MetaInfoInterface
Interface for classes that can store arbitrary meta information (Type-Name-Value tuples).
Definition: MetaInfoInterface.h:60
ProteinHit.h
OpenMS::ProteinIdentification::SearchParameters::db
String db
The used database.
Definition: ProteinIdentification.h:224
OpenMS::DigestionEnzymeProtein
Representation of a digestion enzyme for proteins (protease)
Definition: DigestionEnzymeProtein.h:48
OpenMS::ProteinIdentification::SearchParameters::db_version
String db_version
The database version.
Definition: ProteinIdentification.h:225
OpenMS::ProteinIdentification::ProteinGroup::getFloatDataArrays
FloatDataArrays & getFloatDataArrays()
Returns a mutable reference to the float meta data arrays.
Definition: ProteinIdentification.h:131
OpenMS::DataArrays::StringDataArray
String data array class.
Definition: DataArrays.h:59
OpenMS::ProteinIdentification::SearchParameters::fragment_mass_tolerance
double fragment_mass_tolerance
Mass tolerance of fragment ions (Dalton or ppm)
Definition: ProteinIdentification.h:232
OpenMS::StringList
std::vector< String > StringList
Vector of String.
Definition: ListUtils.h:70
OpenMS::ProteinIdentification::search_parameters_
SearchParameters search_parameters_
Definition: ProteinIdentification.h:436
OpenMS::UInt
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
OpenMS::ProteinIdentification::SearchParameters::fixed_modifications
std::vector< String > fixed_modifications
Used fixed modifications.
Definition: ProteinIdentification.h:229
OpenMS::ProteinIdentification::SearchParameters::precursor_mass_tolerance
double precursor_mass_tolerance
Mass tolerance of precursor ions (Dalton or ppm)
Definition: ProteinIdentification.h:234
KDTree::operator==
bool operator==(_Iterator< _Val, _Ref, _Ptr > const &, _Iterator< _Val, _Ref, _Ptr > const &)
Definition: KDTree.h:806
OpenMS::ProteinIdentification::indistinguishable_proteins_
std::vector< ProteinGroup > indistinguishable_proteins_
Indistinguishable proteins: accessions[0] is "group leader", probability is meaningless.
Definition: ProteinIdentification.h:447
OpenMS::ProteinIdentification::ProteinGroup::StringDataArray
OpenMS::DataArrays::StringDataArray StringDataArray
String data array type (element type of StringDataArrays).
Definition: ProteinIdentification.h:88
MetaInfoInterface.h
OpenMS::ProteinIdentification::ProteinGroup::string_data_arrays_
StringDataArrays string_data_arrays_
String data arrays.
Definition: ProteinIdentification.h:204
OpenMS::ProteinIdentification::ProteinGroup::getStringDataArrayByName
StringDataArray & getStringDataArrayByName(String name)
Returns a mutable reference to the first string meta data array with the given name.
Definition: ProteinIdentification.h:165
OpenMS::ProteinIdentification::SearchParameters::precursor_mass_tolerance_ppm
bool precursor_mass_tolerance_ppm
Mass tolerance unit of precursor ions (true: ppm, false: Dalton)
Definition: ProteinIdentification.h:235
OpenMS::ProteinIdentification::date_
DateTime date_
Definition: ProteinIdentification.h:437
OpenMS::ProteinIdentification::ProteinGroup::getIntegerDataArrayByName
IntegerDataArray & getIntegerDataArrayByName(String name)
Returns a mutable reference to the first integer meta data array with the given name.
Definition: ProteinIdentification.h:158
OpenMS::ProteinIdentification::ProteinGroup::probability
double probability
Probability of this group.
Definition: ProteinIdentification.h:95
OpenMS::ProteinIdentification::SearchParameters
Search parameters of the DB search.
Definition: ProteinIdentification.h:221
OpenMS::ProteinIdentification::ProteinGroup::IntegerDataArrays
std::vector< IntegerDataArray > IntegerDataArrays
Definition: ProteinIdentification.h:92
OpenMS::DataArrays::FloatDataArray
Float data array class.
Definition: DataArrays.h:45
OpenMS::ProteinIdentification::protein_hits_
std::vector< ProteinHit > protein_hits_
Definition: ProteinIdentification.h:444
OpenMS::ProteinIdentification::SearchParameters::taxonomy
String taxonomy
The taxonomy restriction.
Definition: ProteinIdentification.h:226
OpenMS::ProteinIdentification::ProteinGroup::getFloatDataArrayByName
const FloatDataArray & getFloatDataArrayByName(String name) const
Returns a const reference to the first float meta data array with the given name.
Definition: ProteinIdentification.h:193
OpenMS::DateTime
DateTime Class.
Definition: DateTime.h:54
OpenMS::ProteinIdentification::search_engine_version_
String search_engine_version_
Definition: ProteinIdentification.h:435
OpenMS::ProteinIdentification::HitType
ProteinHit HitType
Hit type definition.
Definition: ProteinIdentification.h:76
OpenMS::ProteinIdentification::ProteinGroup::IntegerDataArray
OpenMS::DataArrays::IntegerDataArray IntegerDataArray
Integer data array type (element type of IntegerDataArrays).
Definition: ProteinIdentification.h:91
OpenMS::ProteinIdentification::higher_score_better_
bool higher_score_better_
Definition: ProteinIdentification.h:443
OpenMS::ProteinIdentification::id_
String id_
Definition: ProteinIdentification.h:433
OpenMS::ProteinIdentification::ProteinGroup
Bundles multiple (e.g. indistinguishable) proteins in a group.
Definition: ProteinIdentification.h:81