OpenMS
Loading...
Searching...
No Matches
IdentificationDataConverter.h
Go to the documentation of this file.
1// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2// SPDX-License-Identifier: BSD-3-Clause
3//
4// --------------------------------------------------------------------------
5// $Maintainer: Hendrik Weisser $
6// $Authors: Hendrik Weisser $
7// --------------------------------------------------------------------------
8
9#pragma once
10
13#include <OpenMS/FORMAT/MzTab.h>
17
18namespace OpenMS
19{
20 class FeatureMap;
21
22 class OPENMS_DLLAPI IdentificationDataConverter
23 {
24 public:
25
/// @brief Import from legacy peptide/protein identifications.
///
/// @param id_data Identification data object, taken by non-const reference (the import target).
/// @param proteins Legacy protein identification runs (read-only).
/// @param peptides Legacy peptide identifications (read-only).
27 static void importIDs(IdentificationData& id_data,
28 const std::vector<ProteinIdentification>& proteins,
29 const PeptideIdentificationList& peptides);
30
/// @brief Export to legacy peptide/protein identifications.
///
/// @param id_data Identification data to export (read-only).
/// @param proteins Receives the legacy protein identifications (non-const reference).
/// @param export_ids_wo_scores Defaults to false; presumably controls whether IDs
///        without scores are exported -- confirm against the implementation.
36 static void exportIDs(const IdentificationData& id_data,
37 std::vector<ProteinIdentification>& proteins,
// NOTE(review): this listing omits original line 38. Per the cross-reference
// for exportIDs, the parameter `PeptideIdentificationList &peptides` sits
// between `proteins` and `export_ids_wo_scores`.
39 bool export_ids_wo_scores = false);
40
/// @brief Export to mzTab format.
///
/// @param id_data Identification data to export (read-only).
/// @return The data as an MzTab object, returned by value.
42 static MzTab exportMzTab(const IdentificationData& id_data);
43
/// @brief Import FASTA sequences as parent sequences.
///
/// @param id_data Identification data object, taken by non-const reference (the import target).
/// @param fasta FASTA entries to import (read-only).
/// @param decoy_pattern Defaults to the empty string; presumably a pattern used to
///        recognise decoy entries -- confirm against the implementation.
45 static void importSequences(IdentificationData& id_data,
46 const std::vector<FASTAFile::FASTAEntry>& fasta,
// NOTE(review): this listing omits original line 47. Per the cross-reference
// for importSequences it reads `IdentificationData::MoleculeType type =`,
// so the next line is the default value of the `type` parameter.
48 IdentificationData::MoleculeType::PROTEIN,
49 const String& decoy_pattern = "");
50
/// @brief Convert parent matches to peptide evidences.
///
/// @param parent_matches Parent matches to convert (read-only).
/// @param hit Peptide hit, taken by non-const reference (the conversion target).
// NOTE(review): this listing omits original line 52. Per the cross-reference
// the declaration begins `static void exportParentMatches(`.
53 const IdentificationData::ParentMatches& parent_matches, PeptideHit& hit);
54
/// @brief Convert IDs from legacy peptide/protein identifications in a feature map.
///
/// @param features Feature map, modified in place.
/// @param clear_original Defaults to true; presumably removes the original (legacy)
///        IDs after conversion -- confirm against the implementation.
61 static void importFeatureIDs(FeatureMap& features, bool clear_original = true);
62
/// @brief Convert IDs in a feature map to legacy peptide/protein identifications.
///
/// @param features Feature map, modified in place.
/// @param clear_original Defaults to true; presumably removes the original IDs
///        after conversion -- confirm against the implementation.
69 static void exportFeatureIDs(FeatureMap& features, bool clear_original = true);
70
/// @brief Convert IDs from legacy peptide/protein identifications in a consensus map.
///
/// @param consensus Consensus map, modified in place.
/// @param clear_original Defaults to true; presumably removes the original (legacy)
///        IDs after conversion -- confirm against the implementation.
77 static void importConsensusIDs(ConsensusMap& consensus, bool clear_original = true);
78
/// @brief Convert IDs in a consensus map to legacy peptide/protein identifications.
///
/// @param consensus Consensus map, modified in place.
/// @param clear_original Defaults to true; presumably removes the original IDs
///        after conversion -- confirm against the implementation.
85 static void exportConsensusIDs(ConsensusMap& consensus, bool clear_original = true);
86
87 protected:
88
/// Optional reference to a processing step; empty when no step is associated.
89 using StepOpt = std::optional<IdentificationData::ProcessingStepRef>;
90
/// Functor for ordering StepOpt (by date of the steps, if available).
///
/// An empty optional orders before any optional that holds a step; two empty
/// optionals compare as equivalent.
// NOTE(review): this listing omits original line 92, which holds the `struct`
// header of this functor; its name is not visible here.
93 {
/// @return true if @p left orders strictly before @p right.
94 bool operator()(const StepOpt& left, const StepOpt& right) const
95 {
96 // @TODO: should runs without associated step go first or last?
// left is empty: it precedes right only if right holds a step
97 if (!left) return bool(right);
// left holds a step but right does not: right goes first
98 if (!right) return false;
// both hold a step: dereference the optional, then the step reference, and compare
99 return **left < **right;
100 }
101 };
102
/// Functor for ordering peptide IDs by RT and m/z (if available).
///
/// Resulting order, as implemented below:
///  - IDs without RT come before IDs with RT;
///  - IDs with different RTs are ordered by RT;
///  - for equal or absent RTs, IDs without m/z come before IDs with m/z,
///    and IDs that both have m/z are ordered by m/z.
// NOTE(review): this listing omits original line 104 (the `struct` header;
// its name is not visible here) and line 106. Per the cross-reference the
// call operator begins `bool operator()(const PeptideIdentification &left,`.
105 {
107 const PeptideIdentification& right) const
108 {
109 // @TODO: should IDs without RT go first or last?
110 if (left.hasRT())
111 {
112 if (right.hasRT())
113 {
114 if (right.getRT() != left.getRT())
115 {
116 return left.getRT() < right.getRT();
117 } // else: compare by m/z (below)
118 }
119 else
120 {
// only left has an RT: right (without RT) goes first
121 return false;
122 }
123 }
124 else if (right.hasRT())
125 {
// only right has an RT: left (without RT) goes first
126 return true;
127 }
128 // no RTs or same RTs -> try to compare by m/z:
129 if (left.hasMZ())
130 {
131 if (right.hasMZ())
132 {
133 return left.getMZ() < right.getMZ();
134 }
135 else
136 {
// only left has an m/z: right (without m/z) goes first
137 return false;
138 }
139 }
140 // if both PI's have nothing, return false (to ensure 'x < x' is false for strict weak ordering)
// left has no m/z here, so it precedes right exactly when right has one
141 return right.hasMZ();
142 }
143 };
144
/// @brief Export a parent sequence (protein or nucleic acid) to mzTab.
///
/// Builds one row from @p parent (accession, search engines and best scores,
/// description, coverage) and appends it to @p output. If the parent has a
/// non-empty sequence, it is added as the optional column "opt_sequence".
///
/// @tparam MzTabSectionRow Row type of the mzTab section being filled.
/// @param output Rows of the section; the new row is appended.
/// @param score_map Passed through to exportStepsAndScoresToMzTab_().
146 template <typename MzTabSectionRow>
// NOTE(review): this listing omits original lines 147-148. Per the
// cross-reference they read `static void exportParentSequenceToMzTab_(` and
// `const IdentificationData::ParentSequence &parent,`.
149 std::vector<MzTabSectionRow>& output,
150 std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
151 {
152 MzTabSectionRow row;
153 row.accession.set(parent.accession);
154 exportStepsAndScoresToMzTab_(parent.steps_and_scores, row.search_engine,
155 row.best_search_engine_score, score_map);
156 row.description.set(parent.description);
157 row.coverage.set(parent.coverage);
// the sequence is exported only when present, as an optional column
158 if (!parent.sequence.empty())
159 {
// NOTE(review): this listing omits original line 160, which must declare
// `opt_seq` -- presumably as MzTabOptionalColumnEntry; confirm.
161 opt_seq.first = "opt_sequence";
162 opt_seq.second.set(parent.sequence);
163 row.opt_.push_back(opt_seq);
164 }
165 output.push_back(row);
166 }
167
/// @brief Export an identified sequence (peptide or oligonucleotide, but not
///        small molecule/compound) to mzTab.
///
/// Without parent matches, a single row is appended to @p output. Otherwise
/// one row is appended per parent match of every matched parent, each row
/// being a copy of the shared data plus the match-specific context.
///
/// @tparam MzTabSectionRow Row type of the mzTab section being filled.
/// @tparam IdentSeq Type of the identified sequence; the body reads its
///         `sequence`, `steps_and_scores` and `parent_matches` members.
/// @param identified Identified sequence to export (read-only).
/// @param output Rows of the section; new rows are appended.
/// @param score_map Passed through to exportStepsAndScoresToMzTab_().
169 template <typename MzTabSectionRow, typename IdentSeq>
// NOTE(review): this listing omits original line 170. Per the cross-reference
// it reads `static void exportPeptideOrOligoToMzTab_(`.
171 const IdentSeq& identified, std::vector<MzTabSectionRow>& output,
172 std::map<IdentificationData::ScoreTypeRef, Size>& score_map)
173 {
174 MzTabSectionRow row;
175 // @TODO: handle modifications properly
176 row.sequence.set(identified.sequence.toString());
177 exportStepsAndScoresToMzTab_(identified.steps_and_scores,
178 row.search_engine,
179 row.best_search_engine_score, score_map);
180 if (identified.parent_matches.empty()) // no parent information given
181 {
182 // row.unique.set(false); // leave this unset?
183 output.push_back(row);
184 }
185 else // generate entries (with duplicated data) for every accession
186 {
187 // in mzTab, "unique" means "peptide is unique for this protein"
188 row.unique.set(identified.parent_matches.size() == 1);
189 for (const auto& match_pair : identified.parent_matches)
190 {
// the accession is shared by all matches to this parent
191 row.accession.set(match_pair.first->accession);
192 for (const IdentificationData::ParentMatch& match :
193 match_pair.second)
194 {
// work on a copy so match-specific context does not carry over to the next row
195 MzTabSectionRow copy = row;
196 addMzTabMoleculeParentContext_(match, copy);
197 output.push_back(copy);
198 }
199 }
200 }
201 }
202
/// @brief Export an input match (peptide- or oligonucleotide-spectrum match) to mzTab.
///
/// Builds one PSM/OSM row and appends it to @p output. The row carries the
/// sequence, search engines and scores, retention time, charge, experimental
/// and calculated m/z, and the spectrum reference. The optional columns
/// "opt_adduct" and "opt_isotope_offset" are added when available.
///
/// @tparam MzTabSectionRow Row type of the mzTab section being filled.
/// @param sequence Sequence string written to the row.
/// @param match Observation match to export (read-only).
/// @param calc_mass Divided by the absolute charge to obtain the calculated m/z.
/// @param output Rows of the section; the new row is appended.
/// @param score_map Passed through to exportStepsAndScoresToMzTab_().
/// @param file_map Maps input files to the index stored as the row's MS file reference.
204 template <typename MzTabSectionRow>
// NOTE(review): this listing omits original line 205. Per the cross-reference
// it reads `static void exportObservationMatchToMzTab_(`.
206 const String& sequence,
207 const IdentificationData::ObservationMatch& match, double calc_mass,
208 std::vector<MzTabSectionRow>& output,
209 std::map<IdentificationData::ScoreTypeRef, Size>& score_map,
210 std::map<IdentificationData::InputFileRef, Size>& file_map)
211 {
212 MzTabSectionRow xsm; // PSM or OSM
213 // @TODO: handle modifications properly
214 xsm.sequence.set(sequence);
215 exportStepsAndScoresToMzTab_(match.steps_and_scores, xsm.search_engine,
216 xsm.search_engine_score, score_map);
// NOTE(review): this listing omits original line 217, which must declare
// `query` -- presumably the observation referenced by match.observation_ref; confirm.
// the row's retention time is set from a one-element list
218 std::vector<MzTabDouble> rts(1);
219 rts[0].set(query.rt);
220 xsm.retention_time.set(rts);
221 xsm.charge.set(match.charge);
222 xsm.exp_mass_to_charge.set(query.mz);
// NOTE(review): a charge of 0 makes this a floating-point division by zero
223 xsm.calc_mass_to_charge.set(calc_mass / abs(match.charge));
// std::map::operator[] default-inserts an entry if the input file is not yet in file_map
224 xsm.spectra_ref.setMSFile(file_map[query.input_file]);
225 xsm.spectra_ref.setSpecRef(query.data_id);
226 // optional column for adduct:
227 if (match.adduct_opt)
228 {
229 MzTabOptionalColumnEntry opt_adduct;
230 opt_adduct.first = "opt_adduct";
231 opt_adduct.second.set((*match.adduct_opt)->getName());
232 xsm.opt_.push_back(opt_adduct);
233 }
234 // optional columns for isotope offset:
235 // @TODO: find a way of passing in the names of relevant meta values
236 // (e.g. from NucleicAcidSearchEngine), instead of hard-coding them here
237 if (match.metaValueExists("isotope_offset"))
238 {
// NOTE(review): this listing omits original line 239, which must declare
// `opt_meta` -- presumably as MzTabOptionalColumnEntry; confirm.
240 opt_meta.first = "opt_isotope_offset";
241 opt_meta.second.set(match.getMetaValue("isotope_offset"));
242 xsm.opt_.push_back(opt_meta);
243 }
244 // don't repeat data from the peptide section (e.g. accessions)
245 // why are "pre"/"post"/"start"/"end" not in the peptide section?!
246 output.push_back(xsm);
247 }
248
/// @brief Helper function to add processing steps (search engines) and their scores to MzTab.
///
/// @param steps_and_scores Applied processing steps with their scores (read-only).
/// @param steps_out Receives the processing steps as an mzTab parameter list.
/// @param scores_out Receives the scores, keyed by a Size index.
/// @param score_map Maps score types to Size indexes; taken by non-const reference.
// NOTE(review): this listing omits original line 250. Per the cross-reference
// it reads `static void exportStepsAndScoresToMzTab_(`.
251 const IdentificationData::AppliedProcessingSteps& steps_and_scores,
252 MzTabParameterList& steps_out, std::map<Size, MzTabDouble>& scores_out,
253 std::map<IdentificationData::ScoreTypeRef, Size>& score_map);
254
/// @brief Helper function to add search engine score entries to MzTab's meta data section.
///
/// @param scores Score types with their associated Size indexes (read-only).
/// @param output Receives the resulting mzTab parameters, keyed by a Size index.
256 static void addMzTabSEScores_(
257 const std::map<IdentificationData::ScoreTypeRef, Size>& scores,
258 std::map<Size, MzTabParameter>& output);
259
264
269
// NOTE(review): fragment -- this listing omits the preceding original lines.
// This is presumably the last parameter of importDBSearchParameters_()
// ("Helper function to import DB search parameters from legacy format"),
// whose cross-referenced signature ends in `IdentificationData &id_data)`; confirm.
273 IdentificationData& id_data);
274
278
// NOTE(review): fragment -- this listing omits the preceding original lines.
// This is presumably the last parameter of exportMSRunInformation_()
// ("Helper function to export (primary) MS run information to legacy format"),
// whose cross-referenced signature ends in `ProteinIdentification &protein)`; confirm.
282 ProteinIdentification& protein);
283
/// Per-feature helper; presumably used by importFeatureIDs() -- confirm against
/// the implementation.
///
/// @param feature Feature to process, taken by non-const reference.
/// @param indexes List of signed integer indexes (read-only); meaning not visible here.
/// @param id_counter Counter taken by non-const reference.
/// @param clear_original No default value at this level.
284 static void handleFeatureImport_(Feature& feature, const IntList& indexes,
// NOTE(review): this listing omits original line 285. Per the cross-reference
// it carries the parameter `PeptideIdentificationList &peptides`.
286 Size& id_counter, bool clear_original);
287
/// Per-feature helper; presumably used by exportFeatureIDs() -- confirm against
/// the implementation.
///
/// @param feature Feature to process, taken by non-const reference.
/// @param indexes List of signed integer indexes (read-only); meaning not visible here.
/// @param id_data Identification data object, taken by non-const reference.
/// @param id_counter Counter taken by non-const reference.
288 static void handleFeatureExport_(Feature& feature, const IntList& indexes,
289 IdentificationData& id_data, Size& id_counter);
290 };
291}
A container for consensus elements.
Definition ConsensusMap.h:68
A container for features.
Definition FeatureMap.h:82
An LC-MS feature.
Definition Feature.h:46
Definition IdentificationDataConverter.h:23
static void addMzTabSEScores_(const std::map< IdentificationData::ScoreTypeRef, Size > &scores, std::map< Size, MzTabParameter > &output)
Helper function to add search engine score entries to MzTab's meta data section.
static void exportFeatureIDs(FeatureMap &features, bool clear_original=true)
Convert IDs in a feature map to legacy peptide/protein identifications.
static void importSequences(IdentificationData &id_data, const std::vector< FASTAFile::FASTAEntry > &fasta, IdentificationData::MoleculeType type=IdentificationData::MoleculeType::PROTEIN, const String &decoy_pattern="")
Import FASTA sequences as parent sequences.
static void exportObservationMatchToMzTab_(const String &sequence, const IdentificationData::ObservationMatch &match, double calc_mass, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map, std::map< IdentificationData::InputFileRef, Size > &file_map)
Export an input match (peptide- or oligonucleotide-spectrum match) to mzTab.
Definition IdentificationDataConverter.h:205
static IdentificationData::SearchParamRef importDBSearchParameters_(const ProteinIdentification::SearchParameters &pisp, IdentificationData &id_data)
Helper function to import DB search parameters from legacy format.
static void addMzTabMoleculeParentContext_(const IdentificationData::ParentMatch &match, MzTabOligonucleotideSectionRow &row)
Helper function for exportPeptideOrOligoToMzTab_() - oligonucleotide variant.
static ProteinIdentification::SearchParameters exportDBSearchParameters_(IdentificationData::SearchParamRef ref)
Helper function to export DB search parameters to legacy format.
static void exportPeptideOrOligoToMzTab_(const IdentSeq &identified, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export an identified sequence (peptide or oligonucleotide, but not small molecule/compound) to mzTab.
Definition IdentificationDataConverter.h:170
static void handleFeatureImport_(Feature &feature, const IntList &indexes, PeptideIdentificationList &peptides, Size &id_counter, bool clear_original)
static void importIDs(IdentificationData &id_data, const std::vector< ProteinIdentification > &proteins, const PeptideIdentificationList &peptides)
Import from legacy peptide/protein identifications.
static MzTab exportMzTab(const IdentificationData &id_data)
Export to mzTab format.
static void exportIDs(const IdentificationData &id_data, std::vector< ProteinIdentification > &proteins, PeptideIdentificationList &peptides, bool export_ids_wo_scores=false)
Export to legacy peptide/protein identifications.
static void handleFeatureExport_(Feature &feature, const IntList &indexes, IdentificationData &id_data, Size &id_counter)
std::optional< IdentificationData::ProcessingStepRef > StepOpt
Definition IdentificationDataConverter.h:89
static void exportParentSequenceToMzTab_(const IdentificationData::ParentSequence &parent, std::vector< MzTabSectionRow > &output, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Export a parent sequence (protein or nucleic acid) to mzTab.
Definition IdentificationDataConverter.h:147
static void exportStepsAndScoresToMzTab_(const IdentificationData::AppliedProcessingSteps &steps_and_scores, MzTabParameterList &steps_out, std::map< Size, MzTabDouble > &scores_out, std::map< IdentificationData::ScoreTypeRef, Size > &score_map)
Helper function to add processing steps (search engines) and their scores to MzTab.
static void exportMSRunInformation_(IdentificationData::ProcessingStepRef step_ref, ProteinIdentification &protein)
Helper function to export (primary) MS run information to legacy format.
static void importFeatureIDs(FeatureMap &features, bool clear_original=true)
Convert IDs from legacy peptide/protein identifications in a feature map.
static void exportParentMatches(const IdentificationData::ParentMatches &parent_matches, PeptideHit &hit)
Convert parent matches to peptide evidences.
static void exportConsensusIDs(ConsensusMap &consensus, bool clear_original=true)
Convert IDs in a consensus map to legacy peptide/protein identifications.
static void addMzTabMoleculeParentContext_(const IdentificationData::ParentMatch &match, MzTabPeptideSectionRow &row)
Helper function for exportPeptideOrOligoToMzTab_() - peptide variant.
static void importConsensusIDs(ConsensusMap &consensus, bool clear_original=true)
Convert IDs from legacy peptide/protein identifications in a consensus map.
Definition IdentificationData.h:87
IdentificationDataInternal::ParentMatches ParentMatches
Definition IdentificationData.h:138
IdentificationDataInternal::AppliedProcessingSteps AppliedProcessingSteps
Definition IdentificationData.h:127
bool metaValueExists(const String &name) const
Returns whether an entry with the given name exists.
const DataValue & getMetaValue(const String &name) const
Returns the value corresponding to a string, or DataValue::EMPTY if not found.
Definition MzTabBase.h:243
Data model of MzTab files. Please see the official MzTab specification at https://code....
Definition MzTab.h:455
Represents a single spectrum match (candidate) for a specific tandem mass spectrum (MS/MS).
Definition PeptideHit.h:52
Container for peptide identifications from multiple spectra.
Definition PeptideIdentificationList.h:66
Represents the set of candidates (SpectrumMatches) identified for a single precursor spectrum.
Definition PeptideIdentification.h:64
double getRT() const
returns the RT of the MS2 spectrum where the identification occurred
bool hasMZ() const
shortcut for !isnan(getMZ())
bool hasRT() const
shortcut for !isnan(getRT())
double getMZ() const
returns the MZ of the MS2 spectrum
Representation of a protein identification run.
Definition ProteinIdentification.h:54
A more convenient string class.
Definition String.h:34
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition Types.h:97
std::vector< Int > IntList
Vector of signed integers.
Definition ListUtils.h:29
MoleculeType
Definition MetaData.h:40
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
std::pair< String, MzTabString > MzTabOptionalColumnEntry
Definition MzTabBase.h:203
OLI - Oligonucleotide section (table-based)
Definition MzTab.h:374
PEP - Peptide section (Table based)
Definition MzTab.h:220
Functor for ordering peptide IDs by RT and m/z (if available)
Definition IdentificationDataConverter.h:105
bool operator()(const PeptideIdentification &left, const PeptideIdentification &right) const
Definition IdentificationDataConverter.h:106
Functor for ordering StepOpt (by date of the steps, if available):
Definition IdentificationDataConverter.h:93
bool operator()(const StepOpt &left, const StepOpt &right) const
Definition IdentificationDataConverter.h:94
Wrapper that adds operator< to iterators, so they can be used as (part of) keys in maps/sets or multi...
Definition MetaData.h:20
Representation of a search hit (e.g. peptide-spectrum match).
Definition ObservationMatch.h:48
AdductOpt adduct_opt
optional reference to adduct
Definition ObservationMatch.h:55
Int charge
Definition ObservationMatch.h:53
ObservationRef observation_ref
Definition ObservationMatch.h:51
Representation of an observation, e.g. a spectrum or feature, in an input data file.
Definition Observation.h:28
double mz
Definition Observation.h:35
InputFileRef input_file
Reference to the input file.
Definition Observation.h:33
String data_id
Spectrum or feature ID (from the file referenced by input_file)
Definition Observation.h:30
double rt
Definition Observation.h:35
Meta data for the association between an identified molecule (e.g. peptide) and a parent sequence (e....
Definition ParentMatch.h:20
Representation of a parent sequence that is identified only indirectly (e.g. a protein).
Definition ParentSequence.h:24
String sequence
Definition ParentSequence.h:31
String description
Definition ParentSequence.h:33
double coverage
sequence coverage as a fraction between 0 and 1
Definition ParentSequence.h:35
String accession
Definition ParentSequence.h:25
AppliedProcessingSteps steps_and_scores
Definition ScoredProcessingResult.h:20
Search parameters of the DB search.
Definition ProteinIdentification.h:254