OpenMS
IdentificationDataConverter.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Hendrik Weisser $
6 // $Authors: Hendrik Weisser $
7 // --------------------------------------------------------------------------
8 
9 #pragma once
10 
13 #include <OpenMS/FORMAT/MzTab.h>
16 
17 namespace OpenMS
18 {
19  class FeatureMap;
20 
21  class OPENMS_DLLAPI IdentificationDataConverter
22  {
23  public:
24 
/// Import from legacy peptide/protein identifications.
/// @param id_data   target container, taken by non-const reference
/// @param proteins  legacy protein identification runs (read-only)
/// @param peptides  legacy peptide identifications (read-only)
26  static void importIDs(IdentificationData& id_data,
27  const std::vector<ProteinIdentification>& proteins,
28  const std::vector<PeptideIdentification>& peptides);
29 
/// Export to legacy peptide/protein identifications.
/// @param id_data   source data (read-only)
/// @param proteins  output vector of legacy protein identification runs
/// @param peptides  output vector of legacy peptide identifications
/// @param export_ids_wo_scores  defaults to false.
///        NOTE(review): presumably "also export IDs that have no scores" - confirm against the implementation.
35  static void exportIDs(const IdentificationData& id_data,
36  std::vector<ProteinIdentification>& proteins,
37  std::vector<PeptideIdentification>& peptides,
38  bool export_ids_wo_scores = false);
39 
/// Export to mzTab format.
/// @param id_data  source data (read-only)
/// @return the resulting MzTab object, returned by value
41  static MzTab exportMzTab(const IdentificationData& id_data);
42 
/// Import FASTA sequences as parent sequences.
/// NOTE(review): this listing omits source lines 46-47. According to the signature shown in the
/// cross-reference index of this page, a parameter
/// `IdentificationData::MoleculeType type = IdentificationData::MoleculeType::PROTEIN`
/// sits between `fasta` and `decoy_pattern`.
/// @param id_data        target container, taken by non-const reference
/// @param fasta          FASTA entries to import (read-only)
/// @param decoy_pattern  defaults to the empty string.
///        NOTE(review): presumably a pattern for recognising decoy entries - confirm against the implementation.
44  static void importSequences(IdentificationData& id_data,
45  const std::vector<FASTAFile::FASTAEntry>& fasta,
48  const String& decoy_pattern = "");
49 
/// Convert parent matches to peptide evidences.
/// @param parent_matches  parent matches to convert (read-only)
/// @param hit             peptide hit, taken by non-const reference
51  static void exportParentMatches(
52  const IdentificationData::ParentMatches& parent_matches, PeptideHit& hit);
53 
/// Convert IDs from legacy peptide/protein identifications in a feature map.
/// @param features        feature map, modified in place (non-const reference)
/// @param clear_original  defaults to true.
///        NOTE(review): presumably removes the original (legacy) IDs after conversion - confirm.
60  static void importFeatureIDs(FeatureMap& features, bool clear_original = true);
61 
/// Convert IDs in a feature map to legacy peptide/protein identifications.
/// @param features        feature map, modified in place (non-const reference)
/// @param clear_original  defaults to true.
///        NOTE(review): presumably removes the original (non-legacy) IDs after conversion - confirm.
68  static void exportFeatureIDs(FeatureMap& features, bool clear_original = true);
69 
/// Convert IDs from legacy peptide/protein identifications in a consensus map.
/// @param consensus       consensus map, modified in place (non-const reference)
/// @param clear_original  defaults to true.
///        NOTE(review): presumably removes the original (legacy) IDs after conversion - confirm.
76  static void importConsensusIDs(ConsensusMap& consensus, bool clear_original = true);
77 
/// Convert IDs in a consensus map to legacy peptide/protein identifications.
/// @param consensus       consensus map, modified in place (non-const reference)
/// @param clear_original  defaults to true.
///        NOTE(review): presumably removes the original (non-legacy) IDs after conversion - confirm.
84  static void exportConsensusIDs(ConsensusMap& consensus, bool clear_original = true);
85 
86  protected:
87 
/// Optional reference to a processing step; an empty optional means "no step".
88  using StepOpt = std::optional<IdentificationData::ProcessingStepRef>;
89 
/// Functor for ordering StepOpt (by date of the steps, if available).
/// NOTE(review): the `struct` header (source line 91) is missing from this listing,
/// so the functor's name is not visible here.
92  {
     /// "Less than" for optional step references.
     /// An empty optional orders before any non-empty one, and two empty optionals are
     /// equivalent (neither is less than the other).
93  bool operator()(const StepOpt& left, const StepOpt& right) const
94  {
95  // @TODO: should runs without associated step go first or last?
     // left is empty: it is "less" only if right holds a step
96  if (!left) return bool(right);
     // left holds a step but right is empty: left is not "less"
97  if (!right) return false;
     // both set: unwrap the optional, dereference the step reference, compare the steps with '<'
98  return **left < **right;
99  }
100  };
101 
/// Functor for ordering peptide IDs by RT and m/z (if available).
/// NOTE(review): the `struct` header (source line 103) and the first line of operator()
/// (source line 105) are missing from this listing. The cross-reference index gives
/// `bool operator()(const PeptideIdentification& left, const PeptideIdentification& right) const`.
///
/// Ordering implemented below:
///  - an ID without RT orders before an ID with RT;
///  - IDs that both have RT are ordered by RT;
///  - if the RTs are equal, or neither ID has an RT, m/z decides by the same scheme
///    (an ID without m/z orders before an ID with m/z);
///  - if nothing distinguishes the two IDs, the result is false.
104  {
106  const PeptideIdentification& right) const
107  {
108  // @TODO: should IDs without RT go first or last?
109  if (left.hasRT())
110  {
111  if (right.hasRT())
112  {
      // both have an RT: it decides unless the two values are exactly equal
113  if (right.getRT() != left.getRT())
114  {
115  return left.getRT() < right.getRT();
116  } // else: compare by m/z (below)
117  }
118  else
119  {
      // only left has an RT: left is not "less" (the ID without RT goes first)
120  return false;
121  }
122  }
123  else if (right.hasRT())
124  {
      // only right has an RT: left (without RT) goes first
125  return true;
126  }
127  // no RTs or same RTs -> try to compare by m/z:
128  if (left.hasMZ())
129  {
130  if (right.hasMZ())
131  {
132  return left.getMZ() < right.getMZ();
133  }
134  else
135  {
      // only left has an m/z: left is not "less"
136  return false;
137  }
138  }
139  // if both PI's have nothing, return false (to ensure 'x < x' is false for strict weak ordering)
      // left has no m/z here, so left is "less" exactly when right has one
140  return right.hasMZ();
141  }
142  };
143 
/// Export a parent sequence (protein or nucleic acid) to mzTab.
/// NOTE(review): source lines 146-147 (function name and first parameter) are missing from
/// this listing. The cross-reference index gives
/// `static void exportParentSequenceToMzTab_(const IdentificationData::ParentSequence& parent, ...)`.
///
/// Builds one row of type MzTabSectionRow from @p parent and appends it to @p output.
/// @p score_map is passed through to exportStepsAndScoresToMzTab_().
145  template <typename MzTabSectionRow>
148  std::vector<MzTabSectionRow>& output,
149  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
150  {
151  MzTabSectionRow row;
152  row.accession.set(parent.accession);
      // fills the row's search engine list and best search engine scores
153  exportStepsAndScoresToMzTab_(parent.steps_and_scores, row.search_engine,
154  row.best_search_engine_score, score_map);
155  row.description.set(parent.description);
156  row.coverage.set(parent.coverage);
      // the sequence goes into an optional column "opt_sequence", only when non-empty
157  if (!parent.sequence.empty())
158  {
159  MzTabOptionalColumnEntry opt_seq;
160  opt_seq.first = "opt_sequence";
161  opt_seq.second.set(parent.sequence);
162  row.opt_.push_back(opt_seq);
163  }
164  output.push_back(row);
165  }
166 
/// Export an identified sequence (peptide or oligonucleotide, but not small molecule/compound) to mzTab.
/// NOTE(review): source line 169 (the function name line) is missing from this listing. The
/// cross-reference index gives `static void exportPeptideOrOligoToMzTab_(const IdentSeq& identified, ...)`.
///
/// Appends to @p output either a single row (when @p identified has no parent matches) or
/// one row per (parent, match) combination.
/// @p score_map is passed through to exportStepsAndScoresToMzTab_().
168  template <typename MzTabSectionRow, typename IdentSeq>
170  const IdentSeq& identified, std::vector<MzTabSectionRow>& output,
171  std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
172  {
173  MzTabSectionRow row;
174  // @TODO: handle modifications properly
175  row.sequence.set(identified.sequence.toString());
176  exportStepsAndScoresToMzTab_(identified.steps_and_scores,
177  row.search_engine,
178  row.best_search_engine_score, score_map);
179  if (identified.parent_matches.empty()) // no parent information given
180  {
181  // row.unique.set(false); // leave this unset?
182  output.push_back(row);
183  }
184  else // generate entries (with duplicated data) for every accession
185  {
186  // in mzTab, "unique" means "peptide is unique for this protein"
      // "unique" is true only when parent_matches has exactly one entry
187  row.unique.set(identified.parent_matches.size() == 1);
188  for (const auto& match_pair : identified.parent_matches)
189  {
      // match_pair.first refers to the parent; match_pair.second holds its matches
190  row.accession.set(match_pair.first->accession);
191  for (const IdentificationData::ParentMatch& match :
192  match_pair.second)
193  {
      // copy the shared row, then let the helper add the data specific to this match
194  MzTabSectionRow copy = row;
195  addMzTabMoleculeParentContext_(match, copy);
196  output.push_back(copy);
197  }
198  }
199  }
200  }
201 
/// Export an input match (peptide- or oligonucleotide-spectrum match) to mzTab.
/// NOTE(review): source line 204 (the function name line) is missing from this listing. The
/// cross-reference index gives `static void exportObservationMatchToMzTab_(const String& sequence, ...)`.
///
/// Builds one row from @p sequence, @p match and the observation that @p match refers to,
/// and appends it to @p output.
/// @param calc_mass  divided by the absolute charge to give the row's calc_mass_to_charge
/// @param score_map  passed through to exportStepsAndScoresToMzTab_()
/// @param file_map   maps input file references to the number written into spectra_ref;
///                   accessed with operator[] (see note in the body)
203  template <typename MzTabSectionRow>
205  const String& sequence,
206  const IdentificationData::ObservationMatch& match, double calc_mass,
207  std::vector<MzTabSectionRow>& output,
208  std::map<IdentificationData::ScoreTypeRef, Size>& score_map,
209  std::map<IdentificationData::InputFileRef, Size>& file_map)
210  {
211  MzTabSectionRow xsm; // PSM or OSM
212  // @TODO: handle modifications properly
213  xsm.sequence.set(sequence);
214  exportStepsAndScoresToMzTab_(match.steps_and_scores, xsm.search_engine,
215  xsm.search_engine_score, score_map);
216  const IdentificationData::Observation& query = *match.observation_ref;
      // retention_time takes a list; exactly one value (the observation's RT) is stored
217  std::vector<MzTabDouble> rts(1);
218  rts[0].set(query.rt);
219  xsm.retention_time.set(rts);
220  xsm.charge.set(match.charge);
221  xsm.exp_mass_to_charge.set(query.mz);
      // NOTE(review): a charge of 0 makes this a floating-point division by zero - confirm
      // that callers never pass uncharged matches
222  xsm.calc_mass_to_charge.set(calc_mass / abs(match.charge));
      // NOTE(review): std::map::operator[] inserts a value-initialized entry (0) when the
      // input file is not yet in file_map - confirm callers pre-populate the map
223  xsm.spectra_ref.setMSFile(file_map[query.input_file]);
224  xsm.spectra_ref.setSpecRef(query.data_id);
225  // optional column for adduct:
226  if (match.adduct_opt)
227  {
228  MzTabOptionalColumnEntry opt_adduct;
229  opt_adduct.first = "opt_adduct";
230  opt_adduct.second.set((*match.adduct_opt)->getName());
231  xsm.opt_.push_back(opt_adduct);
232  }
233  // optional columns for isotope offset:
234  // @TODO: find a way of passing in the names of relevant meta values
235  // (e.g. from NucleicAcidSearchEngine), instead of hard-coding them here
236  if (match.metaValueExists("isotope_offset"))
237  {
238  MzTabOptionalColumnEntry opt_meta;
239  opt_meta.first = "opt_isotope_offset";
240  opt_meta.second.set(match.getMetaValue("isotope_offset"));
241  xsm.opt_.push_back(opt_meta);
242  }
243  // don't repeat data from the peptide section (e.g. accessions)
244  // why are "pre"/"post"/"start"/"end" not in the peptide section?!
245  output.push_back(xsm);
246  }
247 
/// Helper function to add processing steps (search engines) and their scores to MzTab.
/// NOTE(review): source line 249 (`static void exportStepsAndScoresToMzTab_(`, per the
/// cross-reference index) is missing from this listing; only the parameter list is shown.
/// @param steps_and_scores  applied processing steps with their scores (read-only)
/// @param steps_out         output parameter list (non-const reference)
/// @param scores_out        output scores keyed by a Size index (non-const reference)
/// @param score_map         score type -> Size mapping (non-const reference)
250  const IdentificationData::AppliedProcessingSteps& steps_and_scores,
251  MzTabParameterList& steps_out, std::map<Size, MzTabDouble>& scores_out,
252  std::map<IdentificationData::ScoreTypeRef, Size>& score_map);
253 
/// Helper function to add search engine score entries to MzTab's meta data section.
/// @param scores  score type -> Size mapping (read-only)
/// @param output  MzTab parameters keyed by a Size index, filled by this function
255  static void addMzTabSEScores_(
256  const std::map<IdentificationData::ScoreTypeRef, Size>& scores,
257  std::map<Size, MzTabParameter>& output);
258 
/// Fragment of one of the two addMzTabMoleculeParentContext_() overloads (helpers for
/// exportPeptideOrOligoToMzTab_(), per the cross-reference index: a peptide variant taking
/// MzTabPeptideSectionRow& and an oligonucleotide variant taking MzTabOligonucleotideSectionRow&).
/// NOTE(review): source lines 260 and 262 (name and row parameter) are missing from this
/// listing, so which variant this is cannot be told from here.
261  const IdentificationData::ParentMatch& match,
263 
/// Fragment of the other addMzTabMoleculeParentContext_() overload (helper for
/// exportPeptideOrOligoToMzTab_(); peptide or oligonucleotide variant).
/// NOTE(review): source lines 265 and 267 (name and row parameter) are missing from this
/// listing, so which variant this is cannot be told from here.
266  const IdentificationData::ParentMatch& match,
268 
/// Helper function to import DB search parameters from legacy format.
/// NOTE(review): only the last parameter is shown; the preceding source lines are missing
/// from this listing. The cross-reference index gives
/// `static IdentificationData::SearchParamRef importDBSearchParameters_(
///    const ProteinIdentification::SearchParameters& pisp, IdentificationData& id_data)`.
272  IdentificationData& id_data);
273 
277 
/// Helper function to export (primary) MS run information to legacy format.
/// NOTE(review): only the last parameter is shown; the preceding source lines are missing
/// from this listing. The cross-reference index gives
/// `static void exportMSRunInformation_(IdentificationData::ProcessingStepRef step_ref,
///    ProteinIdentification& protein)`.
281  ProteinIdentification& protein);
282 
/// Undocumented helper (the cross-reference index lists no description).
/// NOTE(review): presumably the per-feature worker behind importFeatureIDs() - confirm in the .cpp.
/// @param feature         feature, taken by non-const reference
/// @param indexes         list of signed integer indexes (read-only); meaning not visible here
/// @param peptides        vector of legacy peptide identifications (non-const reference)
/// @param id_counter      counter, taken by non-const reference
/// @param clear_original  no default at this level
283  static void handleFeatureImport_(Feature& feature, const IntList& indexes,
284  std::vector<PeptideIdentification>& peptides,
285  Size& id_counter, bool clear_original);
286 
/// Undocumented helper (the cross-reference index lists no description).
/// NOTE(review): presumably the per-feature worker behind exportFeatureIDs() - confirm in the .cpp.
/// @param feature     feature, taken by non-const reference
/// @param indexes     list of signed integer indexes (read-only); meaning not visible here
/// @param id_data     identification data container (non-const reference)
/// @param id_counter  counter, taken by non-const reference
287  static void handleFeatureExport_(Feature& feature, const IntList& indexes,
288  IdentificationData& id_data, Size& id_counter);
289  };
290 }
A container for consensus elements.
Definition: ConsensusMap.h:66
A container for features.
Definition: FeatureMap.h:80
An LC-MS feature.
Definition: Feature.h:46
Definition: IdentificationDataConverter.h:22
static void addMzTabSEScores_(const std::map< IdentificationData::ScoreTypeRef, Size > &scores, std::map< Size, MzTabParameter > &output)
Helper function to add search engine score entries to MzTab's meta data section.
static void exportFeatureIDs(FeatureMap &features, bool clear_original=true)
Convert IDs in a feature map to legacy peptide/protein identifications.
static void importSequences(IdentificationData &id_data, const std::vector< FASTAFile::FASTAEntry > &fasta, IdentificationData::MoleculeType type=IdentificationData::MoleculeType::PROTEIN, const String &decoy_pattern="")
Import FASTA sequences as parent sequences.
static void exportObservationMatchToMzTab_(const String &sequence, const IdentificationData::ObservationMatch &match, double calc_mass, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map, std::map< IdentificationData::InputFileRef, Size > &file_map)
Export an input match (peptide- or oligonucleotide-spectrum match) to mzTab.
Definition: IdentificationDataConverter.h:204
static void exportIDs(const IdentificationData &id_data, std::vector< ProteinIdentification > &proteins, std::vector< PeptideIdentification > &peptides, bool export_ids_wo_scores=false)
Export to legacy peptide/protein identifications.
static void importIDs(IdentificationData &id_data, const std::vector< ProteinIdentification > &proteins, const std::vector< PeptideIdentification > &peptides)
Import from legacy peptide/protein identifications.
static IdentificationData::SearchParamRef importDBSearchParameters_(const ProteinIdentification::SearchParameters &pisp, IdentificationData &id_data)
Helper function to import DB search parameters from legacy format.
static void addMzTabMoleculeParentContext_(const IdentificationData::ParentMatch &match, MzTabOligonucleotideSectionRow &row)
Helper function for exportPeptideOrOligoToMzTab_() - oligonucleotide variant.
static ProteinIdentification::SearchParameters exportDBSearchParameters_(IdentificationData::SearchParamRef ref)
Helper function to export DB search parameters to legacy format.
static void exportPeptideOrOligoToMzTab_(const IdentSeq &identified, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export an identified sequence (peptide or oligonucleotide, but not small molecule/compound) to mzTab.
Definition: IdentificationDataConverter.h:169
static void handleFeatureImport_(Feature &feature, const IntList &indexes, std::vector< PeptideIdentification > &peptides, Size &id_counter, bool clear_original)
static MzTab exportMzTab(const IdentificationData &id_data)
Export to mzTab format.
static void handleFeatureExport_(Feature &feature, const IntList &indexes, IdentificationData &id_data, Size &id_counter)
std::optional< IdentificationData::ProcessingStepRef > StepOpt
Definition: IdentificationDataConverter.h:88
static void exportParentSequenceToMzTab_(const IdentificationData::ParentSequence &parent, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export a parent sequence (protein or nucleic acid) to mzTab.
Definition: IdentificationDataConverter.h:146
static void exportStepsAndScoresToMzTab_(const IdentificationData::AppliedProcessingSteps &steps_and_scores, MzTabParameterList &steps_out, std::map< Size, MzTabDouble > &scores_out, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Helper function to add processing steps (search engines) and their scores to MzTab.
static void exportMSRunInformation_(IdentificationData::ProcessingStepRef step_ref, ProteinIdentification &protein)
Helper function to export (primary) MS run information to legacy format.
static void importFeatureIDs(FeatureMap &features, bool clear_original=true)
Convert IDs from legacy peptide/protein identifications in a feature map.
static void exportParentMatches(const IdentificationData::ParentMatches &parent_matches, PeptideHit &hit)
Convert parent matches to peptide evidences.
static void exportConsensusIDs(ConsensusMap &consensus, bool clear_original=true)
Convert IDs in a consensus map to legacy peptide/protein identifications.
static void addMzTabMoleculeParentContext_(const IdentificationData::ParentMatch &match, MzTabPeptideSectionRow &row)
Helper function for exportPeptideOrOligoToMzTab_() - peptide variant.
static void importConsensusIDs(ConsensusMap &consensus, bool clear_original=true)
Convert IDs from legacy peptide/protein identifications in a consensus map.
Definition: IdentificationData.h:87
IdentificationDataInternal::ParentMatches ParentMatches
Definition: IdentificationData.h:138
IdentificationDataInternal::AppliedProcessingSteps AppliedProcessingSteps
Definition: IdentificationData.h:127
bool metaValueExists(const String &name) const
Returns whether an entry with the given name exists.
const DataValue & getMetaValue(const String &name) const
Returns the value corresponding to a string, or DataValue::EMPTY if not found.
Definition: MzTabBase.h:243
Data model of MzTab files. Please see the official MzTab specification at https://code....
Definition: MzTab.h:452
Representation of a peptide hit.
Definition: PeptideHit.h:31
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:39
double getRT() const
returns the RT of the MS2 spectrum where the identification occurred
bool hasMZ() const
shortcut for !isnan(getMZ())
bool hasRT() const
shortcut for !isnan(getRT())
double getMZ() const
returns the MZ of the MS2 spectrum
Representation of a protein identification run.
Definition: ProteinIdentification.h:50
A more convenient string class.
Definition: String.h:34
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:97
std::vector< Int > IntList
Vector of signed integers.
Definition: ListUtils.h:29
MoleculeType
Definition: MetaData.h:40
@ PROTEIN
Definition: MetaData.h:41
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
std::pair< String, MzTabString > MzTabOptionalColumnEntry
Definition: MzTabBase.h:203
OLI - Oligonucleotide section (table-based)
Definition: MzTab.h:371
PEP - Peptide section (Table based)
Definition: MzTab.h:217
Functor for ordering peptide IDs by RT and m/z (if available)
Definition: IdentificationDataConverter.h:104
bool operator()(const PeptideIdentification &left, const PeptideIdentification &right) const
Definition: IdentificationDataConverter.h:105
Functor for ordering StepOpt (by date of the steps, if available):
Definition: IdentificationDataConverter.h:92
bool operator()(const StepOpt &left, const StepOpt &right) const
Definition: IdentificationDataConverter.h:93
Wrapper that adds operator< to iterators, so they can be used as (part of) keys in maps/sets or multi...
Definition: MetaData.h:20
Representation of a search hit (e.g. peptide-spectrum match).
Definition: ObservationMatch.h:48
AdductOpt adduct_opt
optional reference to adduct
Definition: ObservationMatch.h:55
Int charge
Definition: ObservationMatch.h:53
ObservationRef observation_ref
Definition: ObservationMatch.h:51
Representation of an observation, e.g. a spectrum or feature, in an input data file.
Definition: Observation.h:28
double mz
Definition: Observation.h:35
InputFileRef input_file
Reference to the input file.
Definition: Observation.h:33
String data_id
Spectrum or feature ID (from the file referenced by input_file)
Definition: Observation.h:30
double rt
Definition: Observation.h:35
Meta data for the association between an identified molecule (e.g. peptide) and a parent sequence (e....
Definition: ParentMatch.h:20
Representation of a parent sequence that is identified only indirectly (e.g. a protein).
Definition: ParentSequence.h:24
String sequence
Definition: ParentSequence.h:31
String description
Definition: ParentSequence.h:33
double coverage
sequence coverage as a fraction between 0 and 1
Definition: ParentSequence.h:35
String accession
Definition: ParentSequence.h:25
AppliedProcessingSteps steps_and_scores
Definition: ScoredProcessingResult.h:20
Search parameters of the DB search.
Definition: ProteinIdentification.h:247