OpenMS
Loading...
Searching...
No Matches
FileInfo.h
Go to the documentation of this file.
// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
// SPDX-License-Identifier: BSD-3-Clause
//
// --------------------------------------------------------------------------
// $Maintainer: Timo Sachsenberg $
// $Authors: Timo Sachsenberg $
// --------------------------------------------------------------------------

#pragma once

#include <OpenMS/config.h>
#include <OpenMS/CONCEPT/Types.h> // Int / UInt / UInt64
#include <OpenMS/FORMAT/FileTypes.h> // FileTypes::Type
#include <OpenMS/MATH/StatisticFunctions.h> // Math::SummaryStatistics

#include <cstdint>
#include <iosfwd>
#include <map>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

25namespace OpenMS
26{
47 class OPENMS_DLLAPI FileInfo
48 {
49 public:
50 FileInfo() = default;
51 ~FileInfo() = default;
52
53 // ---------------------------------------------------------------------
54 // nested value types (all pyOpenMS-bindable: plain structs / vector / map)
55 // ---------------------------------------------------------------------
56
58 struct OPENMS_DLLAPI Range
59 {
60 bool present = false;
61 double min = 0.0;
62 double max = 0.0;
63 };
64
66 struct OPENMS_DLLAPI RangeSet
67 {
72 bool has_mobility = false;
73 };
74
76 struct OPENMS_DLLAPI Ranges
77 {
80 std::map<UInt, RangeSet> per_ms_level;
82 bool is_experiment = false;
83 };
84
86 struct OPENMS_DLLAPI FileMeta
87 {
88 std::string file_name;
89 FileTypes::Type file_type = FileTypes::UNKNOWN;
90 std::string file_type_name;
91 };
92
94 struct OPENMS_DLLAPI ExperimentMeta
95 {
96 bool present = false;
97 std::string document_id;
98 std::string date;
99 // sample
100 std::string sample_name, sample_organism, sample_comment;
101 // instrument
102 std::string instrument_name, instrument_model, instrument_vendor;
103 std::vector<std::string> ion_sources, mass_analyzers, detectors;
104 // contacts
105 struct OPENMS_DLLAPI Contact { std::string first_name, last_name, email; };
106 std::vector<Contact> contacts;
107 };
108
110 struct OPENMS_DLLAPI ProcessingStep
111 {
112 std::string software_name, software_version, completion_time;
113 std::vector<std::string> actions;
114 };
115
117 struct OPENMS_DLLAPI NamedStats
118 {
119 std::string title;
121 };
122
124 struct OPENMS_DLLAPI PeakInfo
125 {
126 std::string instrument_name;
127 std::vector<std::pair<std::string, double>> mass_analyzers;
128 std::vector<Int> ms_levels;
129 UInt64 total_peaks = 0;
130 UInt64 num_spectra = 0;
131 std::map<Int, UInt64> spectra_per_ms_level;
132 std::map<Int, std::string> peak_type_per_ms_level;
134 std::map<std::pair<Int, std::string>, UInt64> activation_methods;
135 std::map<Int, UInt64> precursor_charges;
136 std::map<std::string, UInt64> float_arrays, int_arrays, string_arrays;
137 std::vector<double> faims_cvs;
138 UInt64 num_chromatograms = 0;
139 UInt64 num_chrom_peaks = 0;
140 std::map<std::string, UInt64> chromatogram_types;
141
143 std::vector<std::tuple<Int, std::string, UInt64>> activationMethodsFlat() const;
144 };
145
147 struct OPENMS_DLLAPI FeatureInfo
148 {
149 bool is_consensus = false;
150 UInt64 num_features = 0;
151 double tic = 0.0;
152 std::map<Int, UInt64> charges;
153 std::map<UInt64, UInt64> ids_per_element;
154 UInt64 assigned_ids = 0, unassigned_ids = 0;
155 // consensus only:
156 std::map<UInt64, UInt64> size_distribution;
157 struct OPENMS_DLLAPI MapColumn { std::string filename, identifier, label; UInt64 size = 0; };
158 std::vector<MapColumn> map_columns;
159 };
160
162 struct OPENMS_DLLAPI IdentInfo
163 {
164 std::string db_name, db_version, taxonomy;
165 std::vector<std::string> search_engines;
166 UInt64 num_runs = 0, protein_hits = 0, non_redundant_protein_hits = 0;
167 UInt64 matched_spectra = 0, peptide_hits = 0;
168 double psms_per_spectrum = 0.0, avg_peptide_length = 0.0;
169 UInt64 non_redundant_peptides = 0, modified_tophits = 0;
170 std::map<std::string, UInt64> modification_counts;
171 };
172
174 struct OPENMS_DLLAPI FastaInfo
175 {
176 UInt64 num_sequences = 0, total_residues = 0;
177 bool is_nucleic_acid = false;
179 std::map<char, UInt64> residue_counts;
180 UInt64 seq_with_ambiguous = 0, dup_headers = 0, dup_sequences = 0;
181 std::map<std::string, UInt64> ambiguity_counts;
182 };
183
185 struct OPENMS_DLLAPI MzTabInfo
186 {
187 std::string version, mode, type;
188 UInt64 psms = 0, peptides = 0, proteins = 0, oligonucleotides = 0, osms = 0,
189 small_molecules = 0, nucleic_acids = 0;
190 };
191
193 struct OPENMS_DLLAPI ValidationInfo
194 {
195 bool performed = false, supported = true, valid = false;
196 std::string schema_version;
197 std::string detail;
198 std::vector<std::string> warnings, errors;
199 // indexed-mzML (-i)
200 bool index_checked = false, index_valid = false;
201 UInt64 indexed_spectra = 0, indexed_chromatograms = 0;
202 };
203
205 struct OPENMS_DLLAPI CorruptionInfo
206 {
207 bool performed = false;
208 std::vector<std::string> errors, warnings;
209 };
210
212 struct OPENMS_DLLAPI DetailInfo
213 {
214 bool performed = false;
215 std::vector<std::string> lines;
216 };
217
219 struct OPENMS_DLLAPI Result
220 {
223 std::optional<PeakInfo> peak;
224 std::optional<FeatureInfo> feature;
225 std::optional<IdentInfo> ident;
226 std::optional<FastaInfo> fasta;
227 std::optional<MzTabInfo> mztab;
228 std::optional<ExperimentMeta> experiment_meta;
229 std::vector<ProcessingStep> processing;
230 std::vector<NamedStats> statistics;
235 std::string targeted_summary;
236
238 std::string text;
240 std::string tsv;
241 };
242
244 struct OPENMS_DLLAPI Options
245 {
246 FileTypes::Type forced_type = FileTypes::UNKNOWN;
247 bool meta = false;
248 bool processing = false;
249 bool statistics = false;
250 bool detailed = false;
251 bool check_corrupt = false;
252 bool validate = false;
253 bool check_index = false;
254 };
255
256 // ---------------------------------------------------------------------
257 // public API (all pyOpenMS-bindable)
258 // ---------------------------------------------------------------------
259
266 Result run(const std::string& filename, const Options& options);
267
269 Result run(const std::string& filename) { return run(filename, Options()); }
270
272 Result runAll(const std::string& filename);
273
276 static std::string toText(const Result& r, const Options& options);
277
279 static std::string toText(const Result& r) { return toText(r, Options()); }
280
283 static std::string toTSV(const Result& r, const Options& options);
284
286 static std::string toTSV(const Result& r) { return toTSV(r, Options()); }
287
288 private:
292 void report_(const std::string& in, FileTypes::Type in_type, const Options& o,
293 std::ostream& os, std::ostream& os_tsv, Result& r);
294 };
295} // namespace OpenMS
Library-level equivalent of the FileInfo tool.
Definition FileInfo.h:48
std::string document_id
Definition FileInfo.h:97
std::string transformation_summary
trafoXML: model + printSummary
Definition FileInfo.h:234
std::string detail
captured validator output (re-emitted verbatim)
Definition FileInfo.h:197
std::vector< ProcessingStep > processing
Definition FileInfo.h:229
DetailInfo detail
Definition FileInfo.h:233
std::string text
Human-readable rendering, identical to the FileInfo CLI -out output (filled by run()).
Definition FileInfo.h:238
void report_(const std::string &in, FileTypes::Type in_type, const Options &o, std::ostream &os, std::ostream &os_tsv, Result &r)
std::map< std::string, UInt64 > modification_counts
Definition FileInfo.h:170
std::vector< std::string > actions
Definition FileInfo.h:113
std::vector< std::string > detectors
Definition FileInfo.h:103
std::map< UInt, RangeSet > per_ms_level
MSExperiment only, keyed by MS level.
Definition FileInfo.h:80
RangeSet chromatograms
MSExperiment only.
Definition FileInfo.h:81
std::string email
Definition FileInfo.h:105
std::vector< std::string > lines
Definition FileInfo.h:215
RangeSet combined
spectra + chromatograms (or the whole map)
Definition FileInfo.h:78
std::optional< ExperimentMeta > experiment_meta
Definition FileInfo.h:228
std::string file_type_name
FileTypes::typeToName(file_type)
Definition FileInfo.h:90
FileMeta meta
Definition FileInfo.h:221
~FileInfo()=default
std::string targeted_summary
PQP: getSummary()
Definition FileInfo.h:235
std::optional< MzTabInfo > mztab
Definition FileInfo.h:227
std::string file_name
Definition FileInfo.h:88
Math::SummaryStatistics< std::vector< double > > length_stats
Definition FileInfo.h:178
std::string db_name
Definition FileInfo.h:164
std::string mode
Definition FileInfo.h:187
std::map< UInt64, UInt64 > ids_per_element
number of IDs -> number of elements
Definition FileInfo.h:153
Result runAll(const std::string &filename)
Convenience: compute all content metrics (meta/processing/statistics on; no v/i/d/c).
RangeSet spectra_overall
MSExperiment only.
Definition FileInfo.h:79
std::optional< IdentInfo > ident
Definition FileInfo.h:225
std::string tsv
TSV rendering, identical to the FileInfo CLI -out_tsv output (filled by run()).
Definition FileInfo.h:240
std::string date
Definition FileInfo.h:98
std::vector< std::string > search_engines
"engine (version)"
Definition FileInfo.h:165
Range rt
retention time (seconds)
Definition FileInfo.h:68
Range mobility
ion mobility (only filled when the source carries it)
Definition FileInfo.h:70
std::optional< FastaInfo > fasta
Definition FileInfo.h:226
std::string completion_time
Definition FileInfo.h:112
Ranges ranges
filled for peak / feature / consensus
Definition FileInfo.h:222
ValidationInfo validation
Definition FileInfo.h:231
std::vector< std::string > errors
semantic validation
Definition FileInfo.h:198
std::string sample_comment
Definition FileInfo.h:100
std::vector< Contact > contacts
Definition FileInfo.h:106
static std::string toTSV(const Result &r)
Return the TSV rendering produced by run() (== the FileInfo CLI -out_tsv output, cached in r).
Definition FileInfo.h:286
std::optional< FeatureInfo > feature
Definition FileInfo.h:224
std::string title
Definition FileInfo.h:119
std::optional< PeakInfo > peak
Definition FileInfo.h:223
std::map< std::string, UInt64 > ambiguity_counts
buckets depend on is_nucleic_acid
Definition FileInfo.h:181
std::vector< MapColumn > map_columns
Definition FileInfo.h:158
std::string instrument_model
Definition FileInfo.h:102
Range mz
mass-to-charge
Definition FileInfo.h:69
Result run(const std::string &filename)
Load filename with default options (no extra flags).
Definition FileInfo.h:269
std::vector< NamedStats > statistics
Definition FileInfo.h:230
Range intensity
Definition FileInfo.h:71
std::map< char, UInt64 > residue_counts
Definition FileInfo.h:179
FileInfo()=default
static std::string toText(const Result &r)
Return the human-readable rendering produced by run() (== the FileInfo CLI -out output,...
Definition FileInfo.h:279
std::string schema_version
Definition FileInfo.h:196
Result run(const std::string &filename, const Options &options)
Load filename (type auto-detected unless Options::forced_type set) and compute everything requested.
std::string filename
Definition FileInfo.h:157
std::map< Int, UInt64 > charges
charge -> count
Definition FileInfo.h:152
static std::string toTSV(const Result &r, const Options &options)
static std::string toText(const Result &r, const Options &options)
Math::SummaryStatistics< std::vector< double > > stats
Definition FileInfo.h:120
CorruptionInfo corruption
Definition FileInfo.h:232
std::map< UInt64, UInt64 > size_distribution
consensus-feature size -> count
Definition FileInfo.h:156
the -c (corrupt data) block
Definition FileInfo.h:206
detailed per-spectrum listing (the -d block); kept as pre-rendered lines
Definition FileInfo.h:213
experiment / instrument / sample / contact metadata (the -m block, peak files)
Definition FileInfo.h:95
FASTA specifics.
Definition FileInfo.h:175
feature / consensus specifics
Definition FileInfo.h:148
general header block (always populated)
Definition FileInfo.h:87
identification specifics (idXML / mzIdentML)
Definition FileInfo.h:163
mzTab specifics
Definition FileInfo.h:186
one named SummaryStatistics block; title is the label used in both renderers
Definition FileInfo.h:118
what to compute (mirrors the CLI flags; lets callers opt into expensive work)
Definition FileInfo.h:245
one data-processing step (the -p block)
Definition FileInfo.h:111
one [min,max] interval for a single dimension; present==false => "<none>"
Definition FileInfo.h:59
the four range dimensions FileInfo reports for a map / spectrum group
Definition FileInfo.h:67
MSExperiment carries four range categories; non-MSExperiment maps fill only combined.
Definition FileInfo.h:77
the master result aggregate
Definition FileInfo.h:220
the -v (schema/semantic validation) and -i (indexed mzML) blocks
Definition FileInfo.h:194
uint64_t UInt64
Unsigned integer type (64bit)
Definition Types.h:47
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
peak-file (MSExperiment) specifics
Definition FileInfo.h:125
std::map< std::string, UInt64 > float_arrays
Definition FileInfo.h:136
std::map< std::string, UInt64 > chromatogram_types
type name -> count
Definition FileInfo.h:140
std::vector< double > faims_cvs
Definition FileInfo.h:137
std::vector< std::pair< std::string, double > > mass_analyzers
(type, resolution)
Definition FileInfo.h:127
std::vector< std::tuple< Int, std::string, UInt64 > > activationMethodsFlat() const
flattened (ms_level, method, count) view for ergonomic binding (mirrors the TSV columns)
std::vector< Int > ms_levels
Definition FileInfo.h:128
std::map< Int, UInt64 > spectra_per_ms_level
Definition FileInfo.h:131
std::string instrument_name
Definition FileInfo.h:126
std::map< Int, UInt64 > precursor_charges
charge -> count
Definition FileInfo.h:135
std::map< std::pair< Int, std::string >, UInt64 > activation_methods
activation methods: (ms_level, method_full_name) -> count
Definition FileInfo.h:134
std::map< Int, std::string > peak_type_per_ms_level
Definition FileInfo.h:132
Type
Actual file types enum.
Definition FileTypes.h:31
Helper class to gather (and dump) some statistics from a e.g. vector<double>.
Definition StatisticFunctions.h:922