OpenMS
Loading...
Searching...
No Matches
MzIdentMLDOMHandler.h
Go to the documentation of this file.
1// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2// SPDX-License-Identifier: BSD-3-Clause
3//
4// --------------------------------------------------------------------------
5// $Maintainer: Mathias Walzer $
6// $Authors: Mathias Walzer$
7// --------------------------------------------------------------------------
8
9#pragma once
10
21
22#include <xercesc/dom/DOM.hpp>
23#include <xercesc/dom/DOMDocument.hpp>
24#include <xercesc/dom/DOMDocumentType.hpp>
25#include <xercesc/dom/DOMElement.hpp>
26#include <xercesc/dom/DOMImplementation.hpp>
27#include <xercesc/dom/DOMImplementationLS.hpp>
28#include <xercesc/dom/DOMNodeIterator.hpp>
29#include <xercesc/dom/DOMNodeList.hpp>
30#include <xercesc/dom/DOMText.hpp>
31#include <xercesc/framework/LocalFileFormatTarget.hpp>
32#include <xercesc/framework/psvi/XSValue.hpp>
33#include <xercesc/parsers/XercesDOMParser.hpp>
34#include <xercesc/util/OutOfMemoryException.hpp>
35#include <xercesc/util/PlatformUtils.hpp>
36#include <xercesc/util/XMLString.hpp>
37#include <xercesc/util/XMLUni.hpp>
38
39#include <list>
40#include <map>
41#include <set>
42#include <stdexcept>
43#include <string>
44#include <vector>
45
46// Error codes
47//enum {
48// ERROR_ARGS = 1,
49// ERROR_XERCES_INIT,
50// ERROR_PARSE,
51// ERROR_EMPTY_DOCUMENT
52//};
53
54namespace OpenMS
55{
56 class ProgressLogger;
57
58 namespace Internal
59 {
/// XML DOM handler for MzIdentMLFile.
// NOTE: this listing omits the documented lines of the original header (e.g. struct
// declaration lines, the destructor, the copy constructor/assignment and several members).
// The notes below name the omitted members where their line numbers are known.
71 class OPENMS_DLLAPI MzIdentMLDOMHandler
72 {
73public:
/// Constructor for a write-only handler for internal identification structures.
77 MzIdentMLDOMHandler(const std::vector<ProteinIdentification>& pro_id, const PeptideIdentificationList& pep_id, const std::string& version, const ProgressLogger& logger);
78
/// Constructor for a read-only handler for internal identification structures.
80 MzIdentMLDOMHandler(std::vector<ProteinIdentification>& pro_id, PeptideIdentificationList& pep_id, const std::string& version, const ProgressLogger& logger);
81
85
/// Provides the functionality of reading a mzid with a handler object.
87 void readMzIdentMLFile(const std::string& mzid_file);
/// Provides the functionality to write a mzid with a handler object.
89 void writeMzIdentMLFile(const std::string& mzid_file);
90
91protected:
// NOTE: logger_ (line 93, progress logger), cv_ (line 96, psi-ms controlled vocabulary) and
// unimod_ (line 98, controlled vocabulary for modifications) are declared in this gap.
94
99
// NOTE(review): the mutable and const pointer pairs below presumably correspond to the
// non-const and const constructors above — confirm against the implementation.
101 std::vector<ProteinIdentification>* pro_id_ = nullptr;
103 PeptideIdentificationList* pep_id_ = nullptr;
104
106 const std::vector<ProteinIdentification>* cpro_id_ = nullptr;
108 const PeptideIdentificationList* cpep_id_ = nullptr;
109
111 const std::string schema_version_; ///< Internal version keeping.
112
/// Looks up a child CV term of parent_accession with the name name.
114 ControlledVocabulary::CVTerm getChildWithName_(const std::string& parent_accession, const std::string& name) const;
115
118
121
// Parsing helpers; each takes the xercesc DOM node list or element it operates on.
/// First: CVparams, Second: userParams (independent of each other)
123 std::pair<CVTermList, std::map<std::string, DataValue> > parseParamGroup_(xercesc::DOMNodeList* paramGroup);
124 CVTerm parseCvParam_(xercesc::DOMElement* param);
125 std::pair<std::string, DataValue> parseUserParam_(xercesc::DOMElement* param);
126 void parseAnalysisSoftwareList_(xercesc::DOMNodeList* analysisSoftwareElements);
127 void parseDBSequenceElements_(xercesc::DOMNodeList* dbSequenceElements);
128 void parsePeptideElements_(xercesc::DOMNodeList* peptideElements);
129 //AASequence parsePeptideSiblings_(xercesc::DOMNodeList* peptideSiblings);
130 AASequence parsePeptideSiblings_(xercesc::DOMElement* peptide);
131 void parsePeptideEvidenceElements_(xercesc::DOMNodeList* peptideEvidenceElements);
132 void parseSpectrumIdentificationElements_(xercesc::DOMNodeList* spectrumIdentificationElements);
133 void parseSpectrumIdentificationProtocolElements_(xercesc::DOMNodeList* spectrumIdentificationProtocolElements);
134 void parseInputElements_(xercesc::DOMNodeList* inputElements);
135 void parseSpectrumIdentificationListElements_(xercesc::DOMNodeList* spectrumIdentificationListElements);
136 void parseSpectrumIdentificationItemSetXLMS(std::set<std::string>::const_iterator set_it, std::multimap<std::string, int> xl_val_map, xercesc::DOMElement* element_res, const std::string& spectrumID);
137 void parseSpectrumIdentificationItemElement_(xercesc::DOMElement* spectrumIdentificationItemElement, PeptideIdentification& spectrum_identification, std::string& spectrumIdentificationList_ref);
138 void parseProteinDetectionHypothesisElement_(xercesc::DOMElement* proteinDetectionHypothesisElement, ProteinIdentification& protein_identification);
139 void parseProteinAmbiguityGroupElement_(xercesc::DOMElement* proteinAmbiguityGroupElement, ProteinIdentification& protein_identification);
140 void parseProteinDetectionListElements_(xercesc::DOMNodeList* proteinDetectionListElements);
141 static ProteinIdentification::SearchParameters findSearchParameters_(std::pair<CVTermList, std::map<std::string, DataValue> > as_params);
143
// Builder helpers; each takes the xercesc DOM element it operates on.
145 void buildCvList_(xercesc::DOMElement* cvElements);
146 void buildAnalysisSoftwareList_(xercesc::DOMElement* analysisSoftwareElements);
147 void buildSequenceCollection_(xercesc::DOMElement* sequenceCollectionElements);
148 void buildAnalysisCollection_(xercesc::DOMElement* analysisCollectionElements);
149 void buildAnalysisProtocolCollection_(xercesc::DOMElement* protocolElements);
150 void buildInputDataCollection_(xercesc::DOMElement* inputElements);
151 void buildEnclosedCV_(xercesc::DOMElement* parentElement, const std::string& encel, const std::string& acc, const std::string& name, const std::string& cvref);
152 void buildAnalysisDataCollection_(xercesc::DOMElement* analysisElements);
154
155
156private:
160
/// Struct to hold the used analysis software for that file.
// (declaration line omitted from this listing; presumably the AnalysisSoftware type used by as_map_ — confirm)
163 {
164 std::string name;
165 std::string version;
166 };
/// Struct to hold the PeptideEvidence information.
169 {
170 int start;
171 int stop;
172 char pre;
173 char post;
174 bool idec;
175 };
/// Struct to hold the information from the DBSequence xml tag.
// NOTE: member `CVTermList cvs` (line 182) is omitted from this listing.
178 {
179 std::string sequence;
180 std::string database_ref;
181 std::string accession;
183 };
// NOTE: two structs are entirely omitted from this listing here: the one holding the
// SpectrumIdentification xml tag information (line 186) and the one holding the
// ModificationParam xml tag information (line 194).
/// Struct to hold the information from the SpectrumIdentificationProtocol xml tag.
// NOTE: members parameter_cvs (206), modification_parameter (209), precursor_tolerance (210),
// fragment_tolerance (211) and threshold_cvs (212) are omitted from this listing.
203 {
205 std::string enzyme;
207 std::map<std::string, DataValue> parameter_ups;
208// std::vector<ModificationParam> modification_parameter;
213 std::map<std::string, DataValue> threshold_ups;
214 };
/// Struct to hold the information from the DatabaseInput xml tag.
// NOTE: member `DateTime date` (line 221) is omitted from this listing.
217 {
218 std::string name;
219 std::string location;
220 std::string version;
222 };
223
// NOTE: XMLCh* members xml_root_tag_ptr_ (224), xml_cvparam_tag_ptr_ (225) and
// xml_name_attr_ptr_ (226) are omitted from this listing.
227
228 xercesc::XercesDOMParser mzid_parser_;
229
230 std::unique_ptr<XMLHandler> xml_handler_ = nullptr;
231
232 //from AnalysisSoftware
233 std::string search_engine_;
// NOTE: search_engine_version_ (line 234) is omitted from this listing.
235 //mapping from AnalysisSoftware
236 std::map<std::string, AnalysisSoftware> as_map_; ///< mapping AnalysisSoftware id -> AnalysisSoftware
237
238 //mapping from DataCollection Inputs
239 std::map<std::string, std::string> sr_map_; ///< mapping sourcefile id -> sourcefile location
240 std::map<std::string, std::string> sd_map_; ///< mapping spectradata id -> spectradata location
241 std::map<std::string, DatabaseInput> db_map_; ///< mapping database id -> DatabaseInput
242
243 //mapping from SpectrumIdentification - SpectrumIdentification will be the new IdentificationRuns
244 std::map<std::string, SpectrumIdentification> si_map_; ///< mapping SpectrumIdentification id -> SpectrumIdentification (id refs)
245 std::map<std::string, size_t> si_pro_map_; ///< mapping SpectrumIdentificationList id -> index to ProteinIdentification in pro_id_
246
247 //mapping from SpectrumIdentificationProtocol
248 std::map<std::string, SpectrumIdentificationProtocol> sp_map_; ///< mapping SpectrumIdentificationProtocol id -> SpectrumIdentificationProtocol
249
250 //mapping from SequenceCollection
251 std::map<std::string, AASequence> pep_map_; ///< mapping Peptide id -> Sequence
252 std::map<std::string, PeptideEvidence> pe_ev_map_; ///< mapping PeptideEvidence id -> PeptideEvidence
253 std::map<std::string, std::string> pv_db_map_; ///< mapping PeptideEvidence id -> DBSequence id
254 std::multimap<std::string, std::string> p_pv_map_; ///< mapping Peptide id -> PeptideEvidence id, multiple PeptideEvidences can have equivalent Peptides.
255 std::map<std::string, DBSequence> db_sq_map_; ///< mapping DBSequence id -> Sequence
256
257 std::list<std::list<std::string> > hit_pev_; ///< writing help only
258
// NOTE: `bool xl_ms_search_` (line 259; true when reading a file containing Cross-Linking MS
// search results) is omitted from this listing.
260 std::map<std::string, std::string> xl_id_donor_map_; ///< mapping Peptide id -> crosslink donor value
261 //std::map<std::string, std::string> xl_id_acceptor_map_; ///< mapping Peptide id -> crosslink acceptor value
262 std::map<std::string, std::string> xl_id_acceptor_map_; ///< mapping peptide id of acceptor peptide -> crosslink acceptor value
263 std::map<std::string, SignedSize> xl_donor_pos_map_; ///< mapping donor value -> cross-link modification location
264 std::map<std::string, SignedSize> xl_acceptor_pos_map_; ///< mapping acceptor value -> cross-link modification location
265 std::map<std::string, double> xl_mass_map_; ///< mapping Peptide id -> cross-link mass
266 std::map<std::string, std::string> xl_mod_map_; ///< mapping peptide id -> cross-linking reagent name
267
269 std::set<std::string> q_score_child_terms_; ///< cached CV child term sets (computed once, reused per PSM)
270 std::set<std::string> e_score_child_terms_;
271 std::set<std::string> specific_score_child_terms_;
272 };
273 } // namespace Internal
274} // namespace OpenMS
275
char16_t XMLCh
Definition ClassTest.h:30
Representation of a peptide/protein sequence.
Definition AASequence.h:88
Representation of controlled vocabulary term list.
Definition CVTermList.h:29
Representation of controlled vocabulary term.
Definition CVTerm.h:28
Definition ControlledVocabulary.h:29
DateTime Class.
Definition DateTime.h:31
XML DOM handler for MzIdentMLFile.
Definition MzIdentMLDOMHandler.h:72
ControlledVocabulary::CVTerm getChildWithName_(const std::string &parent_accession, const std::string &name) const
Looks up a child CV term of parent_accession with the name name. If no such term is found,...
void readMzIdentMLFile(const std::string &mzid_file)
Provides the functionality of reading a mzid with a handler object.
std::map< std::string, DataValue > threshold_ups
Definition MzIdentMLDOMHandler.h:213
MzIdentMLDOMHandler & operator=(const MzIdentMLDOMHandler &rhs)
void buildAnalysisSoftwareList_(xercesc::DOMElement *analysisSoftwareElements)
std::map< std::string, PeptideEvidence > pe_ev_map_
mapping PeptideEvidence id -> PeptideEvidence
Definition MzIdentMLDOMHandler.h:252
static ProteinIdentification::SearchParameters findSearchParameters_(std::pair< CVTermList, std::map< std::string, DataValue > > as_params)
CVTermList parameter_cvs
Definition MzIdentMLDOMHandler.h:206
bool xl_ms_search_
is true when reading a file containing Cross-Linking MS search results
Definition MzIdentMLDOMHandler.h:259
std::map< std::string, DBSequence > db_sq_map_
mapping DBSequence id -> Sequence
Definition MzIdentMLDOMHandler.h:255
DateTime date
Definition MzIdentMLDOMHandler.h:221
void parseSpectrumIdentificationItemElement_(xercesc::DOMElement *spectrumIdentificationItemElement, PeptideIdentification &spectrum_identification, std::string &spectrumIdentificationList_ref)
CVTerm parseCvParam_(xercesc::DOMElement *param)
std::map< std::string, std::string > sr_map_
mapping sourcefile id -> sourcefile location
Definition MzIdentMLDOMHandler.h:239
XMLCh * xml_root_tag_ptr_
Definition MzIdentMLDOMHandler.h:224
std::string spectrum_identification_list_ref
Definition MzIdentMLDOMHandler.h:190
int start
Definition MzIdentMLDOMHandler.h:170
std::string spectrum_identification_protocol_ref
Definition MzIdentMLDOMHandler.h:189
std::string residues
Definition MzIdentMLDOMHandler.h:197
std::map< std::string, SpectrumIdentificationProtocol > sp_map_
mapping SpectrumIdentificationProtocol id -> SpectrumIdentificationProtocol
Definition MzIdentMLDOMHandler.h:248
void parseProteinDetectionListElements_(xercesc::DOMNodeList *proteinDetectionListElements)
long double precursor_tolerance
Definition MzIdentMLDOMHandler.h:210
void parseInputElements_(xercesc::DOMNodeList *inputElements)
std::map< std::string, std::string > xl_mod_map_
mapping peptide id -> cross-linking reagent name
Definition MzIdentMLDOMHandler.h:266
void parseProteinDetectionHypothesisElement_(xercesc::DOMElement *proteinDetectionHypothesisElement, ProteinIdentification &protein_identification)
const ProgressLogger & logger_
Progress logger.
Definition MzIdentMLDOMHandler.h:93
ControlledVocabulary cv_
Controlled vocabulary (psi-ms from OpenMS/share/OpenMS/CV/psi-ms.obo)
Definition MzIdentMLDOMHandler.h:96
std::map< std::string, SignedSize > xl_acceptor_pos_map_
mapping acceptor value -> cross-link modification location
Definition MzIdentMLDOMHandler.h:264
void buildAnalysisCollection_(xercesc::DOMElement *analysisCollectionElements)
std::string search_engine_
Definition MzIdentMLDOMHandler.h:233
std::map< std::string, std::string > xl_id_acceptor_map_
mapping peptide id of acceptor peptide -> crosslink acceptor value
Definition MzIdentMLDOMHandler.h:262
std::string search_engine_version_
Definition MzIdentMLDOMHandler.h:234
std::pair< std::string, DataValue > parseUserParam_(xercesc::DOMElement *param)
long double mass_delta
Definition MzIdentMLDOMHandler.h:196
std::map< std::string, double > xl_mass_map_
mapping Peptide id -> cross-link mass
Definition MzIdentMLDOMHandler.h:265
CVTermList modification_param_cvs
Definition MzIdentMLDOMHandler.h:198
XMLCh * xml_cvparam_tag_ptr_
Definition MzIdentMLDOMHandler.h:225
long double fragment_tolerance
Definition MzIdentMLDOMHandler.h:211
int stop
Definition MzIdentMLDOMHandler.h:171
std::map< std::string, std::string > pv_db_map_
mapping PeptideEvidence id -> DBSequence id
Definition MzIdentMLDOMHandler.h:253
std::map< std::string, AASequence > pep_map_
mapping Peptide id -> Sequence
Definition MzIdentMLDOMHandler.h:251
std::map< std::string, AnalysisSoftware > as_map_
mapping AnalysisSoftware id -> AnalysisSoftware
Definition MzIdentMLDOMHandler.h:236
std::string version
Definition MzIdentMLDOMHandler.h:165
std::map< std::string, SpectrumIdentification > si_map_
mapping SpectrumIdentification id -> SpectrumIdentification (id refs)
Definition MzIdentMLDOMHandler.h:244
void parseProteinAmbiguityGroupElement_(xercesc::DOMElement *proteinAmbiguityGroupElement, ProteinIdentification &protein_identification)
void parseSpectrumIdentificationElements_(xercesc::DOMNodeList *spectrumIdentificationElements)
void parsePeptideEvidenceElements_(xercesc::DOMNodeList *peptideEvidenceElements)
void buildAnalysisDataCollection_(xercesc::DOMElement *analysisElements)
std::pair< CVTermList, std::map< std::string, DataValue > > parseParamGroup_(xercesc::DOMNodeList *paramGroup)
First: CVparams, Second: userParams (independent of each other)
bool idec
Definition MzIdentMLDOMHandler.h:174
ControlledVocabulary unimod_
Controlled vocabulary for modifications (unimod from OpenMS/share/OpenMS/CV/unimod.obo)
Definition MzIdentMLDOMHandler.h:98
void parseSpectrumIdentificationProtocolElements_(xercesc::DOMNodeList *spectrumIdentificationProtocolElements)
std::map< std::string, std::string > xl_id_donor_map_
mapping Peptide id -> crosslink donor value
Definition MzIdentMLDOMHandler.h:260
std::map< std::string, SignedSize > xl_donor_pos_map_
mapping donor value -> cross-link modification location
Definition MzIdentMLDOMHandler.h:263
AASequence parsePeptideSiblings_(xercesc::DOMElement *peptide)
std::map< std::string, DatabaseInput > db_map_
mapping database id -> DatabaseInput
Definition MzIdentMLDOMHandler.h:241
std::string sequence
Definition MzIdentMLDOMHandler.h:179
std::string name
Definition MzIdentMLDOMHandler.h:164
void buildInputDataCollection_(xercesc::DOMElement *inputElements)
void writeMzIdentMLFile(const std::string &mzid_file)
Provides the functionality to write a mzid with a handler object.
xercesc::XercesDOMParser mzid_parser_
Definition MzIdentMLDOMHandler.h:228
CVTermList threshold_cvs
Definition MzIdentMLDOMHandler.h:212
std::string database_ref
Definition MzIdentMLDOMHandler.h:180
CVTermList specificities
Definition MzIdentMLDOMHandler.h:199
MzIdentMLDOMHandler(std::vector< ProteinIdentification > &pro_id, PeptideIdentificationList &pep_id, const std::string &version, const ProgressLogger &logger)
Constructor for a read-only handler for internal identification structures.
void initScoreTermCaches_()
Precompute the CV child-term sets used per PSM (constant across a file); shared by both constructors.
char pre
Definition MzIdentMLDOMHandler.h:172
std::string spectra_data_ref
Definition MzIdentMLDOMHandler.h:187
std::string enzyme
Definition MzIdentMLDOMHandler.h:205
std::set< std::string > specific_score_child_terms_
Definition MzIdentMLDOMHandler.h:271
CVTermList cvs
Definition MzIdentMLDOMHandler.h:182
std::string location
Definition MzIdentMLDOMHandler.h:219
const std::string schema_version_
Internal version keeping.
Definition MzIdentMLDOMHandler.h:111
void parseAnalysisSoftwareList_(xercesc::DOMNodeList *analysisSoftwareElements)
void buildEnclosedCV_(xercesc::DOMElement *parentElement, const std::string &encel, const std::string &acc, const std::string &name, const std::string &cvref)
std::map< std::string, std::string > sd_map_
mapping spectradata id -> spectradata location
Definition MzIdentMLDOMHandler.h:240
MzIdentMLDOMHandler(const MzIdentMLDOMHandler &rhs)
CVTermList modification_parameter
Definition MzIdentMLDOMHandler.h:209
void buildCvList_(xercesc::DOMElement *cvElements)
std::list< std::list< std::string > > hit_pev_
writing help only
Definition MzIdentMLDOMHandler.h:257
void buildSequenceCollection_(xercesc::DOMElement *sequenceCollectionElements)
void parseSpectrumIdentificationListElements_(xercesc::DOMNodeList *spectrumIdentificationListElements)
std::string accession
Definition MzIdentMLDOMHandler.h:181
void parseSpectrumIdentificationItemSetXLMS(std::set< std::string >::const_iterator set_it, std::multimap< std::string, int > xl_val_map, xercesc::DOMElement *element_res, const std::string &spectrumID)
std::set< std::string > e_score_child_terms_
Definition MzIdentMLDOMHandler.h:270
std::string fixed_mod
Definition MzIdentMLDOMHandler.h:195
void parseDBSequenceElements_(xercesc::DOMNodeList *dbSequenceElements)
std::map< std::string, DataValue > parameter_ups
Definition MzIdentMLDOMHandler.h:207
char post
Definition MzIdentMLDOMHandler.h:173
std::string search_database_ref
Definition MzIdentMLDOMHandler.h:188
void buildAnalysisProtocolCollection_(xercesc::DOMElement *protocolElements)
MzIdentMLDOMHandler(const std::vector< ProteinIdentification > &pro_id, const PeptideIdentificationList &pep_id, const std::string &version, const ProgressLogger &logger)
Constructor for a write-only handler for internal identification structures.
std::map< std::string, size_t > si_pro_map_
mapping SpectrumIdentificationList id -> index to ProteinIdentification in pro_id_
Definition MzIdentMLDOMHandler.h:245
XMLCh * xml_name_attr_ptr_
Definition MzIdentMLDOMHandler.h:226
std::set< std::string > q_score_child_terms_
cached CV child term sets (computed once, reused per PSM)
Definition MzIdentMLDOMHandler.h:269
std::multimap< std::string, std::string > p_pv_map_
mapping Peptide id -> PeptideEvidence id, multiple PeptideEvidences can have equivalent Peptides.
Definition MzIdentMLDOMHandler.h:254
virtual ~MzIdentMLDOMHandler()
Destructor.
void parsePeptideElements_(xercesc::DOMNodeList *peptideElements)
Struct to hold the used analysis software for that file.
Definition MzIdentMLDOMHandler.h:163
Struct to hold the information from the DBSequence xml tag.
Definition MzIdentMLDOMHandler.h:178
Struct to hold the information from the DatabaseInput xml tag.
Definition MzIdentMLDOMHandler.h:217
Struct to hold the information from the ModificationParam xml tag.
Definition MzIdentMLDOMHandler.h:194
Struct to hold the PeptideEvidence information.
Definition MzIdentMLDOMHandler.h:169
Struct to hold the information from the SpectrumIdentification xml tag.
Definition MzIdentMLDOMHandler.h:186
Struct to hold the information from the SpectrumIdentificationProtocol xml tag.
Definition MzIdentMLDOMHandler.h:203
Container for peptide identifications from multiple spectra.
Definition PeptideIdentificationList.h:66
Represents the set of candidates (SpectrumMatches) identified for a single precursor spectrum.
Definition PeptideIdentification.h:66
Base class for all classes that want to report their progress.
Definition ProgressLogger.h:27
Representation of a protein identification run.
Definition ProteinIdentification.h:55
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
Representation of a CV term.
Definition ControlledVocabulary.h:50
Search parameters of the DB search.
Definition ProteinIdentification.h:248