OpenMS  2.7.0
IdentificationDataConverter.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Hendrik Weisser $
32 // $Authors: Hendrik Weisser $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
#include <OpenMS/METADATA/ID/IdentificationData.h>
#include <OpenMS/FORMAT/FASTAFile.h>
#include <OpenMS/FORMAT/MzTab.h>
#include <OpenMS/METADATA/PeptideIdentification.h>
#include <OpenMS/METADATA/ProteinIdentification.h>
42 
43 namespace OpenMS
44 {
45  class OPENMS_DLLAPI IdentificationDataConverter
46  {
47  public:
48 
50  static void importIDs(IdentificationData& id_data,
51  const std::vector<ProteinIdentification>& proteins,
52  const std::vector<PeptideIdentification>& peptides);
53 
55  static void exportIDs(const IdentificationData& id_data,
56  std::vector<ProteinIdentification>& proteins,
57  std::vector<PeptideIdentification>& peptides,
58  bool export_oligonucleotides = false);
59 
61  static MzTab exportMzTab(const IdentificationData& id_data);
62 
64  static void importSequences(IdentificationData& id_data,
65  const std::vector<FASTAFile::FASTAEntry>& fasta,
68  const String& decoy_pattern = "");
69 
70  protected:
71 
73  template <typename MzTabSectionRow>
76  std::vector<MzTabSectionRow>& output,
77  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
78  {
79  MzTabSectionRow row;
80  row.accession.set(parent.accession);
81  exportStepsAndScoresToMzTab_(parent.steps_and_scores, row.search_engine,
82  row.best_search_engine_score, score_map);
83  row.description.set(parent.description);
84  row.coverage.set(parent.coverage);
85  if (!parent.sequence.empty())
86  {
88  opt_seq.first = "opt_sequence";
89  opt_seq.second.set(parent.sequence);
90  row.opt_.push_back(opt_seq);
91  }
92  output.push_back(row);
93  }
94 
96  template <typename MzTabSectionRow, typename IdentSeq>
98  const IdentSeq& identified, std::vector<MzTabSectionRow>& output,
99  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
100  {
101  MzTabSectionRow row;
102  // @TODO: handle modifications properly
103  row.sequence.set(identified.sequence.toString());
104  exportStepsAndScoresToMzTab_(identified.steps_and_scores,
105  row.search_engine,
106  row.best_search_engine_score, score_map);
107  if (identified.parent_matches.empty()) // no parent information given
108  {
109  // row.unique.set(false); // leave this unset?
110  output.push_back(row);
111  }
112  else // generate entries (with duplicated data) for every accession
113  {
114  // in mzTab, "unique" means "peptide is unique for this protein"
115  row.unique.set(identified.parent_matches.size() == 1);
116  for (const auto& match_pair : identified.parent_matches)
117  {
118  row.accession.set(match_pair.first->accession);
119  for (const IdentificationData::MoleculeParentMatch& match :
120  match_pair.second)
121  {
122  MzTabSectionRow copy = row;
123  addMzTabMoleculeParentContext_(match, copy);
124  output.push_back(copy);
125  }
126  }
127  }
128  }
129 
131  template <typename MzTabSectionRow>
133  const String& sequence,
134  const IdentificationData::MoleculeQueryMatch& match, double calc_mass,
135  std::vector<MzTabSectionRow>& output,
136  std::map<IdentificationData::ScoreTypeRef, Size>& score_map,
137  std::map<IdentificationData::InputFileRef, Size>& file_map)
138  {
139  MzTabSectionRow xsm; // PSM or OSM
140  // @TODO: handle modifications properly
141  xsm.sequence.set(sequence);
142  exportStepsAndScoresToMzTab_(match.steps_and_scores, xsm.search_engine,
143  xsm.search_engine_score, score_map);
144  const IdentificationData::DataQuery& query = *match.data_query_ref;
145  std::vector<MzTabDouble> rts(1);
146  rts[0].set(query.rt);
147  xsm.retention_time.set(rts);
148  xsm.charge.set(match.charge);
149  xsm.exp_mass_to_charge.set(query.mz);
150  xsm.calc_mass_to_charge.set(calc_mass / abs(match.charge));
151  if (query.input_file_opt)
152  {
153  xsm.spectra_ref.setMSFile(file_map[*query.input_file_opt]);
154  }
155  xsm.spectra_ref.setSpecRef(query.data_id);
156  // @TODO: find a way of passing in the names of relevant meta values
157  // (e.g. from NucleicAcidSearchEngine), instead of hard-coding them here
158  static const std::vector<String> meta_out({"adduct", "isotope_offset"});
159  for (const String& meta : meta_out)
160  {
161  if (match.metaValueExists(meta))
162  {
163  MzTabOptionalColumnEntry opt_meta;
164  opt_meta.first = "opt_" + meta;
165  opt_meta.second.set(match.getMetaValue(meta));
166  xsm.opt_.push_back(opt_meta);
167  }
168  }
169  // don't repeat data from the peptide section (e.g. accessions)
170  // why are "pre"/"post"/"start"/"end" not in the peptide section?!
171  output.push_back(xsm);
172  }
173 
176  const IdentificationData::AppliedProcessingSteps& steps_and_scores,
177  MzTabParameterList& steps_out, std::map<Size, MzTabDouble>& scores_out,
178  std::map<IdentificationData::ScoreTypeRef, Size>& score_map);
179 
181  static void addMzTabSEScores_(
182  const std::map<IdentificationData::ScoreTypeRef, Size>& scores,
183  std::map<Size, MzTabParameter>& output);
184 
189 
194 
198  IdentificationData& id_data);
199 
203 
207  ProteinIdentification& protein);
208  };
209 }
Definition: IdentificationDataConverter.h:46
static void addMzTabSEScores_(const std::map< IdentificationData::ScoreTypeRef, Size > &scores, std::map< Size, MzTabParameter > &output)
Helper function to add search engine score entries to MzTab's meta data section.
static void importSequences(IdentificationData &id_data, const std::vector< FASTAFile::FASTAEntry > &fasta, IdentificationData::MoleculeType type=IdentificationData::MoleculeType::PROTEIN, const String &decoy_pattern="")
Import FASTA sequences as parent molecules.
static void exportIDs(const IdentificationData &id_data, std::vector< ProteinIdentification > &proteins, std::vector< PeptideIdentification > &peptides, bool export_oligonucleotides=false)
Export to legacy peptide/protein identifications.
static void addMzTabMoleculeParentContext_(const IdentificationData::MoleculeParentMatch &match, MzTabOligonucleotideSectionRow &row)
Helper function for exportPeptideOrOligoToMzTab_() - oligonucleotide variant.
static void importIDs(IdentificationData &id_data, const std::vector< ProteinIdentification > &proteins, const std::vector< PeptideIdentification > &peptides)
Import from legacy peptide/protein identifications.
static IdentificationData::SearchParamRef importDBSearchParameters_(const ProteinIdentification::SearchParameters &pisp, IdentificationData &id_data)
Helper function to import DB search parameters from legacy format.
static ProteinIdentification::SearchParameters exportDBSearchParameters_(IdentificationData::SearchParamRef ref)
Helper function to export DB search parameters to legacy format.
static void exportPeptideOrOligoToMzTab_(const IdentSeq &identified, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export an identified sequence (peptide or oligonucleotide, but not small molecule/compound) to mzTab.
Definition: IdentificationDataConverter.h:97
static MzTab exportMzTab(const IdentificationData &id_data)
Export to mzTab format.
static void exportParentMoleculeToMzTab_(const IdentificationData::ParentMolecule &parent, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export a parent molecule (protein or nucleic acid) to mzTab.
Definition: IdentificationDataConverter.h:74
static void addMzTabMoleculeParentContext_(const IdentificationData::MoleculeParentMatch &match, MzTabPeptideSectionRow &row)
Helper function for exportPeptideOrOligoToMzTab_() - peptide variant.
static void exportStepsAndScoresToMzTab_(const IdentificationData::AppliedProcessingSteps &steps_and_scores, MzTabParameterList &steps_out, std::map< Size, MzTabDouble > &scores_out, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Helper function to add processing steps (search engines) and their scores to MzTab.
static void exportMSRunInformation_(IdentificationData::ProcessingStepRef step_ref, ProteinIdentification &protein)
Helper function to export (primary) MS run information to legacy format.
static void exportQueryMatchToMzTab_(const String &sequence, const IdentificationData::MoleculeQueryMatch &match, double calc_mass, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map, std::map< IdentificationData::InputFileRef, Size > &file_map)
Export a molecule-query match (peptide- or oligonucleotide-spectrum match) to mzTab.
Definition: IdentificationDataConverter.h:132
Representation of spectrum identification results and associated data.
Definition: IdentificationData.h:90
IdentificationDataInternal::AppliedProcessingSteps AppliedProcessingSteps
Definition: IdentificationData.h:123
bool metaValueExists(const String &name) const
Returns whether an entry with the given name exists.
const DataValue & getMetaValue(const String &name, const DataValue &default_value=DataValue::EMPTY) const
Returns the value corresponding to a string, or a default value (default: DataValue::EMPTY) if not found.
Definition: MzTab.h:271
Data model of MzTab files. Please see the official MzTab specification at https://code.google.com/p/mztab/
Definition: MzTab.h:809
Representation of a protein identification run.
Definition: ProteinIdentification.h:72
A more convenient string class.
Definition: String.h:61
MoleculeType
Definition: MetaData.h:64
@ PROTEIN
Definition: MetaData.h:65
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
std::pair< String, MzTabString > MzTabOptionalColumnEntry
Definition: MzTab.h:531
OLI - Oligonucleotide section (table-based)
Definition: MzTab.h:728
PEP - Peptide section (Table based)
Definition: MzTab.h:574
Search query, e.g. spectrum or feature.
Definition: DataQuery.h:48
boost::optional< InputFileRef > input_file_opt
Definition: DataQuery.h:53
double mz
Definition: DataQuery.h:55
String data_id
spectrum or feature ID (from the file referenced by "input_file_opt")
Definition: DataQuery.h:50
double rt
Definition: DataQuery.h:55
Wrapper that adds operator< to iterators, so they can be used as (part of) keys in maps/sets or multi_index_containers.
Definition: MetaData.h:44
Meta data for the association between an identified molecule (e.g. peptide) and a parent molecule (e.g. protein).
Definition: MoleculeParentMatch.h:46
Meta data for a search hit (e.g. peptide-spectrum match).
Definition: MoleculeQueryMatch.h:62
DataQueryRef data_query_ref
Definition: MoleculeQueryMatch.h:65
Int charge
Definition: MoleculeQueryMatch.h:67
Representation of a parent molecule that is identified only indirectly (e.g. a protein).
Definition: ParentMolecule.h:50
String sequence
Definition: ParentMolecule.h:57
String description
Definition: ParentMolecule.h:59
double coverage
sequence coverage as a fraction between 0 and 1
Definition: ParentMolecule.h:61
String accession
Definition: ParentMolecule.h:51
AppliedProcessingSteps steps_and_scores
Definition: ScoredProcessingResult.h:46
Search parameters of the DB search.
Definition: ProteinIdentification.h:260