OpenMS
Loading...
Searching...
No Matches
FileInfo.h
Go to the documentation of this file.
1// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2// SPDX-License-Identifier: BSD-3-Clause
3//
4// --------------------------------------------------------------------------
5// $Maintainer: Timo Sachsenberg $
6// $Authors: Timo Sachsenberg $
7// --------------------------------------------------------------------------
8
9#pragma once
10
11#include <OpenMS/config.h>
12#include <OpenMS/CONCEPT/ProgressLogger.h> // ProgressLogger::LogType (nested enum; cannot be forward-declared)
13#include <OpenMS/CONCEPT/Types.h> // Int / UInt / UInt64
15#include <OpenMS/MATH/StatisticFunctions.h> // Math::SummaryStatistics
16
17#include <cstdint>
18#include <iosfwd>
19#include <map>
20#include <optional>
21#include <string>
22#include <tuple>
23#include <utility>
24#include <vector>
25
26namespace OpenMS
27{
/// Library-level equivalent of the FileInfo tool.
48 class OPENMS_DLLAPI FileInfo
49 {
50 public:
51 FileInfo() = default; // compiler-generated default constructor
52 ~FileInfo() = default; // compiler-generated destructor
53
54 // ---------------------------------------------------------------------
55 // nested value types (all pyOpenMS-bindable: plain structs / vector / map)
56 // ---------------------------------------------------------------------
57
/// one [min,max] interval for a single dimension; present==false => "<none>"
59 struct OPENMS_DLLAPI Range
60 {
61 bool present = false; ///< false (the default) means there is no interval to report
62 double min = 0.0; ///< lower end of the interval
63 double max = 0.0; ///< upper end of the interval
64 };
65
/// the four range dimensions FileInfo reports for a map / spectrum group
67 struct OPENMS_DLLAPI RangeSet
68 {
// NOTE(review): this listing skips lines 69-72. The definition index of this page places
// `Range rt` (retention time, seconds), `Range mz` (mass-to-charge), `Range mobility`
// (ion mobility, only filled when the source carries it) and `Range intensity` there.
73 bool has_mobility = false; // presumably true when `mobility` has been filled -- TODO confirm
74 };
75
/// MSExperiment carries four range categories; non-MSExperiment maps fill only combined.
77 struct OPENMS_DLLAPI Ranges
78 {
// NOTE(review): lines 79, 80 and 82 are not shown in this listing. The definition index
// of this page places `RangeSet combined` (spectra + chromatograms, or the whole map),
// `RangeSet spectra_overall` (MSExperiment only) and `RangeSet chromatograms`
// (MSExperiment only) there.
81 std::map<UInt, RangeSet> per_ms_level; ///< MSExperiment only, keyed by MS level.
83 bool is_experiment = false; // presumably marks that the MSExperiment-only members are filled -- TODO confirm
84 };
85
/// general header block (always populated)
87 struct OPENMS_DLLAPI FileMeta
88 {
89 std::string file_name;
90 FileTypes::Type file_type = FileTypes::UNKNOWN; ///< starts out as FileTypes::UNKNOWN
91 std::string file_type_name; ///< FileTypes::typeToName(file_type)
92 };
93
/// experiment / instrument / sample / contact metadata (the -m block, peak files)
95 struct OPENMS_DLLAPI ExperimentMeta
96 {
97 bool present = false; // defaults to false; presumably set once metadata was collected -- TODO confirm
98 std::string document_id;
99 std::string date;
100 // sample
101 std::string sample_name, sample_organism, sample_comment;
102 // instrument
103 std::string instrument_name, instrument_model, instrument_vendor;
104 std::vector<std::string> ion_sources, mass_analyzers, detectors;
105 // contacts
106 struct OPENMS_DLLAPI Contact { std::string first_name, last_name, email; }; ///< one contact entry (three plain strings)
107 std::vector<Contact> contacts; ///< list of Contact entries
108 };
109
/// one data-processing step (the -p block)
111 struct OPENMS_DLLAPI ProcessingStep
112 {
113 std::string software_name, software_version, completion_time; ///< all three are kept as plain strings
114 std::vector<std::string> actions; ///< string list attached to this step
115 };
116
/// one named SummaryStatistics block; title is the label used in both renderers
118 struct OPENMS_DLLAPI NamedStats
119 {
120 std::string title; ///< label used in both renderers
// NOTE(review): line 121 is not shown in this listing. The definition index of this page
// places `Math::SummaryStatistics< std::vector< double > > stats` there.
122 };
123
/// peak-file (MSExperiment) specifics
125 struct OPENMS_DLLAPI PeakInfo
126 {
127 std::string instrument_name;
128 std::vector<std::pair<std::string, double>> mass_analyzers; ///< (type, resolution)
129 std::vector<Int> ms_levels;
130 UInt64 total_peaks = 0;
131 UInt64 num_spectra = 0;
132 std::map<Int, UInt64> spectra_per_ms_level; // keyed by Int, presumably the MS level -- TODO confirm
133 std::map<Int, std::string> peak_type_per_ms_level; // keyed by Int, presumably the MS level -- TODO confirm
135 std::map<std::pair<Int, std::string>, UInt64> activation_methods; ///< activation methods: (ms_level, method_full_name) -> count
136 std::map<Int, UInt64> precursor_charges; ///< charge -> count
137 std::map<std::string, UInt64> float_arrays, int_arrays, string_arrays; // three string -> UInt64 maps; presumably array name -> count -- TODO confirm
138 std::vector<double> faims_cvs;
139 UInt64 num_chromatograms = 0;
140 UInt64 num_chrom_peaks = 0;
141 std::map<std::string, UInt64> chromatogram_types; ///< type name -> count
142
/// flattened (ms_level, method, count) view for ergonomic binding (mirrors the TSV columns)
144 std::vector<std::tuple<Int, std::string, UInt64>> activationMethodsFlat() const;
145 };
146
/// feature / consensus specifics
148 struct OPENMS_DLLAPI FeatureInfo
149 {
150 bool is_consensus = false; // the members under "consensus only" below apply when this is true (presumably) -- TODO confirm
151 UInt64 num_features = 0;
152 double tic = 0.0;
153 std::map<Int, UInt64> charges; ///< charge -> count
154 std::map<UInt64, UInt64> ids_per_element; ///< number of IDs -> number of elements
155 UInt64 assigned_ids = 0, unassigned_ids = 0;
156 // consensus only:
157 std::map<UInt64, UInt64> size_distribution; ///< consensus-feature size -> count
158 struct OPENMS_DLLAPI MapColumn { std::string filename, identifier, label; UInt64 size = 0; }; ///< one entry of map_columns
159 std::vector<MapColumn> map_columns;
160 };
161
/// identification specifics (idXML / mzIdentML)
163 struct OPENMS_DLLAPI IdentInfo
164 {
165 std::string db_name, db_version, taxonomy;
166 std::vector<std::string> search_engines; ///< "engine (version)"
167 UInt64 num_runs = 0, protein_hits = 0, non_redundant_protein_hits = 0;
168 UInt64 matched_spectra = 0, peptide_hits = 0;
169 double psms_per_spectrum = 0.0, avg_peptide_length = 0.0;
170 UInt64 non_redundant_peptides = 0, modified_tophits = 0;
171 std::map<std::string, UInt64> modification_counts; // string -> UInt64; presumably modification name -> count -- TODO confirm
172 };
173
/// FASTA specifics.
175 struct OPENMS_DLLAPI FastaInfo
176 {
177 UInt64 num_sequences = 0, total_residues = 0;
178 bool is_nucleic_acid = false; ///< decides which buckets ambiguity_counts uses
// NOTE(review): line 179 is not shown in this listing. The definition index of this page
// places `Math::SummaryStatistics< std::vector< double > > length_stats` there.
180 std::map<char, UInt64> residue_counts; // char -> UInt64; presumably residue letter -> count -- TODO confirm
181 UInt64 seq_with_ambiguous = 0, dup_headers = 0, dup_sequences = 0;
182 std::map<std::string, UInt64> ambiguity_counts; ///< buckets depend on is_nucleic_acid
183 };
184
/// mzTab specifics
186 struct OPENMS_DLLAPI MzTabInfo
187 {
188 std::string version, mode, type;
// all counters below start at 0
189 UInt64 psms = 0, peptides = 0, proteins = 0, oligonucleotides = 0, osms = 0,
190 small_molecules = 0, nucleic_acids = 0;
191 };
192
/// the -v (schema/semantic validation) and -i (indexed mzML) blocks
194 struct OPENMS_DLLAPI ValidationInfo
195 {
196 bool performed = false, supported = true, valid = false; // note: `supported` is the only flag here that defaults to true
197 std::string schema_version;
198 std::string detail; ///< captured validator output (re-emitted verbatim)
199 std::vector<std::string> warnings, errors; ///< semantic validation
200 // indexed-mzML (-i)
201 bool index_checked = false, index_valid = false;
202 UInt64 indexed_spectra = 0, indexed_chromatograms = 0;
203 };
204
/// the -c (corrupt data) block
206 struct OPENMS_DLLAPI CorruptionInfo
207 {
208 bool performed = false; // defaults to false; presumably set when the check was run -- TODO confirm
209 std::vector<std::string> errors, warnings; // messages kept as plain strings
210 };
211
/// detailed per-spectrum listing (the -d block); kept as pre-rendered lines
213 struct OPENMS_DLLAPI DetailInfo
214 {
215 bool performed = false; // defaults to false; presumably set when the listing was produced -- TODO confirm
216 std::vector<std::string> lines; ///< the pre-rendered lines
217 };
218
/// the master result aggregate
220 struct OPENMS_DLLAPI Result
221 {
// NOTE(review): lines 222-223 are not shown in this listing. The definition index of this
// page places `FileMeta meta` and `Ranges ranges` (filled for peak / feature / consensus) there.
// The optional members below are empty unless a value has been assigned.
224 std::optional<PeakInfo> peak;
225 std::optional<FeatureInfo> feature;
226 std::optional<IdentInfo> ident;
227 std::optional<FastaInfo> fasta;
228 std::optional<MzTabInfo> mztab;
229 std::optional<ExperimentMeta> experiment_meta;
230 std::vector<ProcessingStep> processing;
231 std::vector<NamedStats> statistics;
// NOTE(review): lines 232-235 are not shown in this listing. The definition index of this
// page places `ValidationInfo validation`, `CorruptionInfo corruption`, `DetailInfo detail`
// and `std::string transformation_summary` (trafoXML: model + printSummary) there.
236 std::string targeted_summary; ///< PQP: getSummary()
237
239 std::string text; ///< Human-readable rendering, identical to the FileInfo CLI -out output (filled by run()).
241 std::string tsv; ///< TSV rendering, identical to the FileInfo CLI -out_tsv output (filled by run()).
242 };
243
/// what to compute (mirrors the CLI flags; lets callers opt into expensive work)
245 struct OPENMS_DLLAPI Options
246 {
247 FileTypes::Type forced_type = FileTypes::UNKNOWN; ///< when left at UNKNOWN the file type is auto-detected by run()
// Every switch below is off by default.
248 bool meta = false; // presumably the -m block -- TODO confirm
249 bool processing = false; // presumably the -p block -- TODO confirm
250 bool statistics = false;
251 bool detailed = false; // presumably the -d block -- TODO confirm
252 bool check_corrupt = false; // presumably the -c block -- TODO confirm
253 bool validate = false; // presumably the -v block -- TODO confirm
254 bool check_index = false; // presumably the -i block -- TODO confirm
256 ProgressLogger::LogType log_type = ProgressLogger::NONE; ///< defaults to ProgressLogger::NONE
257 };
258
259 // ---------------------------------------------------------------------
260 // public API (all pyOpenMS-bindable)
261 // ---------------------------------------------------------------------
262
/// Load filename (type auto-detected unless Options::forced_type set) and compute everything requested.
269 Result run(const std::string& filename, const Options& options);
270
/// Load filename with default options (no extra flags); forwards to run(filename, Options()).
272 Result run(const std::string& filename) { return run(filename, Options()); }
273
/// Convenience: compute all content metrics (meta/processing/statistics on; no v/i/d/c).
275 Result runAll(const std::string& filename);
276
/// Two-argument form of toText(); the single-argument overload calls it with a default-constructed Options.
/// How `options` influences the returned string is not visible in this header.
279 static std::string toText(const Result& r, const Options& options);
280
/// Return the human-readable rendering produced by run() (== the FileInfo CLI -out output); forwards to toText(r, Options()).
282 static std::string toText(const Result& r) { return toText(r, Options()); }
283
/// Two-argument form of toTSV(); the single-argument overload calls it with a default-constructed Options.
/// How `options` influences the returned string is not visible in this header.
286 static std::string toTSV(const Result& r, const Options& options);
287
/// Return the TSV rendering produced by run() (== the FileInfo CLI -out_tsv output, cached in r); forwards to toTSV(r, Options()).
289 static std::string toTSV(const Result& r) { return toTSV(r, Options()); }
290
291 private:
// Private helper. Receives the input name `in`, its file type, the options, two output
// streams and the Result `r` by non-const reference.
// NOTE(review): presumably writes the text rendering to `os`, the TSV rendering to `os_tsv`
// and fills `r` -- confirm against the implementation file.
295 void report_(const std::string& in, FileTypes::Type in_type, const Options& o,
296 std::ostream& os, std::ostream& os_tsv, Result& r);
297 };
298} // namespace OpenMS
Library-level equivalent of the FileInfo tool.
Definition FileInfo.h:49
std::string document_id
Definition FileInfo.h:98
std::string transformation_summary
trafoXML: model + printSummary
Definition FileInfo.h:235
std::string detail
captured validator output (re-emitted verbatim)
Definition FileInfo.h:198
std::vector< ProcessingStep > processing
Definition FileInfo.h:230
DetailInfo detail
Definition FileInfo.h:234
std::string text
Human-readable rendering, identical to the FileInfo CLI -out output (filled by run()).
Definition FileInfo.h:239
void report_(const std::string &in, FileTypes::Type in_type, const Options &o, std::ostream &os, std::ostream &os_tsv, Result &r)
std::map< std::string, UInt64 > modification_counts
Definition FileInfo.h:171
std::vector< std::string > actions
Definition FileInfo.h:114
std::vector< std::string > detectors
Definition FileInfo.h:104
std::map< UInt, RangeSet > per_ms_level
MSExperiment only, keyed by MS level.
Definition FileInfo.h:81
RangeSet chromatograms
MSExperiment only.
Definition FileInfo.h:82
std::string email
Definition FileInfo.h:106
std::vector< std::string > lines
Definition FileInfo.h:216
RangeSet combined
spectra + chromatograms (or the whole map)
Definition FileInfo.h:79
std::optional< ExperimentMeta > experiment_meta
Definition FileInfo.h:229
std::string file_type_name
FileTypes::typeToName(file_type)
Definition FileInfo.h:91
FileMeta meta
Definition FileInfo.h:222
~FileInfo()=default
std::string targeted_summary
PQP: getSummary()
Definition FileInfo.h:236
std::optional< MzTabInfo > mztab
Definition FileInfo.h:228
std::string file_name
Definition FileInfo.h:89
Math::SummaryStatistics< std::vector< double > > length_stats
Definition FileInfo.h:179
std::string db_name
Definition FileInfo.h:165
std::string mode
Definition FileInfo.h:188
std::map< UInt64, UInt64 > ids_per_element
number of IDs -> number of elements
Definition FileInfo.h:154
Result runAll(const std::string &filename)
Convenience: compute all content metrics (meta/processing/statistics on; no v/i/d/c).
RangeSet spectra_overall
MSExperiment only.
Definition FileInfo.h:80
std::optional< IdentInfo > ident
Definition FileInfo.h:226
std::string tsv
TSV rendering, identical to the FileInfo CLI -out_tsv output (filled by run()).
Definition FileInfo.h:241
std::string date
Definition FileInfo.h:99
std::vector< std::string > search_engines
"engine (version)"
Definition FileInfo.h:166
Range rt
retention time (seconds)
Definition FileInfo.h:69
Range mobility
ion mobility (only filled when the source carries it)
Definition FileInfo.h:71
std::optional< FastaInfo > fasta
Definition FileInfo.h:227
std::string completion_time
Definition FileInfo.h:113
Ranges ranges
filled for peak / feature / consensus
Definition FileInfo.h:223
ValidationInfo validation
Definition FileInfo.h:232
std::vector< std::string > errors
semantic validation
Definition FileInfo.h:199
std::string sample_comment
Definition FileInfo.h:101
std::vector< Contact > contacts
Definition FileInfo.h:107
static std::string toTSV(const Result &r)
Return the TSV rendering produced by run() (== the FileInfo CLI -out_tsv output, cached in r).
Definition FileInfo.h:289
std::optional< FeatureInfo > feature
Definition FileInfo.h:225
std::string title
Definition FileInfo.h:120
std::optional< PeakInfo > peak
Definition FileInfo.h:224
std::map< std::string, UInt64 > ambiguity_counts
buckets depend on is_nucleic_acid
Definition FileInfo.h:182
std::vector< MapColumn > map_columns
Definition FileInfo.h:159
std::string instrument_model
Definition FileInfo.h:103
Range mz
mass-to-charge
Definition FileInfo.h:70
Result run(const std::string &filename)
Load filename with default options (no extra flags).
Definition FileInfo.h:272
std::vector< NamedStats > statistics
Definition FileInfo.h:231
Range intensity
Definition FileInfo.h:72
std::map< char, UInt64 > residue_counts
Definition FileInfo.h:180
FileInfo()=default
static std::string toText(const Result &r)
Return the human-readable rendering produced by run() (== the FileInfo CLI -out output, cached in r).
Definition FileInfo.h:282
std::string schema_version
Definition FileInfo.h:197
Result run(const std::string &filename, const Options &options)
Load filename (type auto-detected unless Options::forced_type set) and compute everything requested.
std::string filename
Definition FileInfo.h:158
std::map< Int, UInt64 > charges
charge -> count
Definition FileInfo.h:153
static std::string toTSV(const Result &r, const Options &options)
static std::string toText(const Result &r, const Options &options)
Math::SummaryStatistics< std::vector< double > > stats
Definition FileInfo.h:121
CorruptionInfo corruption
Definition FileInfo.h:233
std::map< UInt64, UInt64 > size_distribution
consensus-feature size -> count
Definition FileInfo.h:157
the -c (corrupt data) block
Definition FileInfo.h:207
detailed per-spectrum listing (the -d block); kept as pre-rendered lines
Definition FileInfo.h:214
experiment / instrument / sample / contact metadata (the -m block, peak files)
Definition FileInfo.h:96
FASTA specifics.
Definition FileInfo.h:176
feature / consensus specifics
Definition FileInfo.h:149
general header block (always populated)
Definition FileInfo.h:88
identification specifics (idXML / mzIdentML)
Definition FileInfo.h:164
mzTab specifics
Definition FileInfo.h:187
one named SummaryStatistics block; title is the label used in both renderers
Definition FileInfo.h:119
what to compute (mirrors the CLI flags; lets callers opt into expensive work)
Definition FileInfo.h:246
one data-processing step (the -p block)
Definition FileInfo.h:112
one [min,max] interval for a single dimension; present==false => "<none>"
Definition FileInfo.h:60
the four range dimensions FileInfo reports for a map / spectrum group
Definition FileInfo.h:68
MSExperiment carries four range categories; non-MSExperiment maps fill only combined.
Definition FileInfo.h:78
the master result aggregate
Definition FileInfo.h:221
the -v (schema/semantic validation) and -i (indexed mzML) blocks
Definition FileInfo.h:195
LogType
Possible log types.
Definition ProgressLogger.h:43
uint64_t UInt64
Unsigned integer type (64bit)
Definition Types.h:47
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
peak-file (MSExperiment) specifics
Definition FileInfo.h:126
std::map< std::string, UInt64 > float_arrays
Definition FileInfo.h:137
std::map< std::string, UInt64 > chromatogram_types
type name -> count
Definition FileInfo.h:141
std::vector< double > faims_cvs
Definition FileInfo.h:138
std::vector< std::pair< std::string, double > > mass_analyzers
(type, resolution)
Definition FileInfo.h:128
std::vector< std::tuple< Int, std::string, UInt64 > > activationMethodsFlat() const
flattened (ms_level, method, count) view for ergonomic binding (mirrors the TSV columns)
std::vector< Int > ms_levels
Definition FileInfo.h:129
std::map< Int, UInt64 > spectra_per_ms_level
Definition FileInfo.h:132
std::string instrument_name
Definition FileInfo.h:127
std::map< Int, UInt64 > precursor_charges
charge -> count
Definition FileInfo.h:136
std::map< std::pair< Int, std::string >, UInt64 > activation_methods
activation methods: (ms_level, method_full_name) -> count
Definition FileInfo.h:135
std::map< Int, std::string > peak_type_per_ms_level
Definition FileInfo.h:133
Type
Actual file types enum.
Definition FileTypes.h:31
Helper class to gather (and dump) some statistics from, e.g., a vector<double>.
Definition StatisticFunctions.h:934