37 UInt annotation_id{std::numeric_limits<UInt>::max()};
39 enum class Type { PSI_MOD, UNIMOD, GENERIC };
44 : position(pos), accession(acc), name(n), optional_tag(tag), annotation_id(aid)
59 name == rhs.
name && type == rhs.
type &&
72 char variant_aa{
'\0'};
74 UInt annotation_id{std::numeric_limits<UInt>::max()};
78 : position(pos), variant_aa(aa), optional_tag(tag), annotation_id(aid) {}
98 UInt annotation_id{std::numeric_limits<UInt>::max()};
102 : start_position(start), end_position(end), replacement(repl), optional_tag(tag), annotation_id(aid) {}
124 UInt annotation_id{std::numeric_limits<UInt>::max()};
128 : start_position(start), end_position(end), accession(acc), name(n), optional_tag(tag), annotation_id(aid) {}
148 UInt annotation_id{std::numeric_limits<UInt>::max()};
152 : id1(i1), id2(i2), optional_tag(tag), annotation_id(aid) {}
213 return prefix == rhs.
prefix &&
268 std::vector<std::string>& descriptions,
269 std::vector<AASequence>& sequences,
270 bool include_complex =
false)
const;
300 std::vector<std::string>& descriptions,
301 std::vector<AASequence>& sequences,
303 Size max_length = 40,
304 bool include_reference =
true,
305 bool include_variants =
true,
306 bool include_modifications =
false)
const;
327 std::vector<std::string>& descriptions,
328 std::vector<AASequence>& sequences,
329 const std::vector<std::string>& fixed_mods = {},
330 const std::vector<std::string>& variable_mods = {},
331 Size max_variable_mods_per_peptide = 2,
333 Size max_length = 40,
334 bool include_reference =
true,
335 bool include_peff_variants =
true,
336 bool include_peff_modifications =
true)
const;
353 const std::vector<std::pair<Size, const PEFFModification*>>& peff_mods,
354 const String& base_description);
388 bool is_decoy{
false};
402 bool has_annotation_identifiers{
false};
403 bool is_proteoform_db{
false};
411 return version == rhs.
version &&
466 std::vector<PEFFEntry>& entries,
467 std::vector<PEFFDatabaseMetadata>& headers)
const;
479 const std::vector<PEFFEntry>& entries,
496 const std::vector<PEFFEntry>& entries,
497 const std::vector<PEFFDatabaseMetadata>& headers)
const;
615 bool readEntry_(std::string&
id, std::string& description, std::string& seq);
621 std::streampos fileSize_{0};
Representation of a peptide/protein sequence.
Definition AASequence.h:88
This class serves for reading and writing PEFF (PSI Extended FASTA Format) files.
Definition PEFFFile.h:447
void writeEnd()
Closes the output file (called automatically in destructor)
void parseAnnotations_(const String &description, PEFFEntry &entry)
Parse annotations from the description line.
PEFFFile()=default
Default constructor.
PEFFVariantComplex parseVariantComplex_(const String &tuple)
Parse a complex variant tuple.
bool atEnd() const
Returns true if the end of the file has been reached.
std::string seq_
Current sequence buffer.
Definition PEFFFile.h:622
PEFFDisulfideBond parseDisulfideBond_(const String &tuple)
Parse a disulfide bond tuple.
String formatHeader_(const std::vector< PEFFDatabaseMetadata > &headers) const
Format the header section for output (multiple database blocks)
~PEFFFile() override=default
Destructor.
PEFFProcessedRegion parseProcessedRegion_(const String &tuple)
Parse a processed region tuple.
void readStart(const String &filename)
Prepares a PEFF file for streamed reading using readNext().
void store(const String &filename, const std::vector< PEFFEntry > &entries, const std::vector< PEFFDatabaseMetadata > &headers) const
Stores entries to a PEFF file with multiple database headers.
String formatEntry_(const PEFFEntry &entry) const
Format a single entry for output.
PEFFVariantSimple parseVariantSimple_(const String &tuple)
Parse a simple variant tuple.
void writeStart(const String &filename, const std::vector< PEFFDatabaseMetadata > &headers)
Prepares a PEFF file for streamed writing using writeNext(), with multiple headers.
String formatHeader_(const PEFFDatabaseMetadata &header) const
Format the header section for output.
bool readNext(PEFFEntry &entry)
Reads the next PEFF entry from the file.
std::ofstream outfile_
Output file stream.
Definition PEFFFile.h:618
void parseHeaderLine_(const String &line, PEFFDatabaseMetadata &header, bool &new_db)
Parse a header line (# Key=Value or # //)
void writeStart(const String &filename, const PEFFDatabaseMetadata &header)
Prepares a PEFF file for streamed writing using writeNext().
void writeNext(const PEFFEntry &entry)
Writes the next PEFF entry to the file.
void load(const String &filename, std::vector< PEFFEntry > &entries, std::vector< PEFFDatabaseMetadata > &headers) const
Loads a PEFF file and stores entries and headers.
const std::vector< PEFFDatabaseMetadata > & getHeaders() const
Returns the headers parsed during readStart().
static bool isPEFFFile(const String &filename)
Checks if a file appears to be a PEFF file (by checking for # PEFF header).
bool readEntry_(std::string &id, std::string &description, std::string &seq)
Read entry data (identifier, description, sequence)
static String toProForma(const PEFFEntry &entry)
Converts a PEFF entry to ProForma notation.
std::string id_
Current identifier buffer.
Definition PEFFFile.h:623
std::vector< String > parseParenList_(const String &value)
Parse a parenthesized list of values.
std::vector< PEFFDatabaseMetadata > headers_
Parsed headers.
Definition PEFFFile.h:619
std::fstream infile_
Input file stream.
Definition PEFFFile.h:617
PEFFModification parseModification_(const String &tuple)
Parse a single modification tuple.
std::string description_
Current description buffer.
Definition PEFFFile.h:624
void store(const String &filename, const std::vector< PEFFEntry > &entries, const PEFFDatabaseMetadata &header) const
Stores entries to a PEFF file with the given header.
Base class for all classes that want to report their progress.
Definition ProgressLogger.h:27
Class for the enzymatic digestion of proteins represented as AASequence or String.
Definition ProteaseDigestion.h:32
A more convenient string class.
Definition String.h:34
bool hasPrefix(const String &string) const
true if String begins with string, false otherwise
int Int
Signed integer type.
Definition Types.h:72
unsigned int UInt
Unsigned integer type.
Definition Types.h:64
size_t Size
Size type, e.g. used for variables that hold the result of size()
Definition Types.h:97
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
Definition AhoCorasickAmbiguous.h:101
FASTA entry type (identifier, description and sequence) The first String corresponds to the identifie...
Definition FASTAFile.h:46
Represents a custom key definition from the PEFF header.
Definition PEFFFile.h:361
String key_name
Definition PEFFFile.h:362
bool operator==(const PEFFCustomKeyDef &rhs) const
Definition PEFFFile.h:369
String description
Definition PEFFFile.h:363
std::vector< String > field_names
Definition PEFFFile.h:366
String regexp
Definition PEFFFile.h:365
String concept_curie
Definition PEFFFile.h:364
std::vector< String > field_types
Definition PEFFFile.h:367
Represents a disulfide bond annotation in PEFF.
Definition PEFFFile.h:144
UInt annotation_id
Optional annotation identifier, max() = not set.
Definition PEFFFile.h:148
bool operator==(const PEFFDisulfideBond &rhs) const
Definition PEFFFile.h:154
String id1
First cysteine reference (AnnotationIdentifier of the cysteine residue)
Definition PEFFFile.h:145
String id2
Second cysteine reference (AnnotationIdentifier of the cysteine residue)
Definition PEFFFile.h:146
PEFFDisulfideBond(const String &i1, const String &i2, const String &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:151
PEFFDisulfideBond()=default
String optional_tag
Optional tag (e.g., "between chains")
Definition PEFFFile.h:147
Represents a single entry in a PEFF file with all annotations.
Definition PEFFFile.h:174
String sequence_version
\SV
Definition PEFFFile.h:186
static std::vector< std::pair< String, AASequence > > enumeratePEFFModifications_(const AASequence &peptide, const std::vector< std::pair< Size, const PEFFModification * > > &peff_mods, const String &base_description)
Apply PEFF modifications at specific positions to a peptide.
void generatePeptides(const ProteaseDigestion &digestor, std::vector< std::string > &descriptions, std::vector< AASequence > &sequences, const std::vector< std::string > &fixed_mods={}, const std::vector< std::string > &variable_mods={}, Size max_variable_mods_per_peptide=2, Size min_length=6, Size max_length=40, bool include_reference=true, bool include_peff_variants=true, bool include_peff_modifications=true) const
Generate peptides with PEFF annotations and optional sample handling modifications.
String taxonomy_name
\TaxName
Definition PEFFFile.h:184
bool operator==(const PEFFEntry &rhs) const
Definition PEFFFile.h:211
Size sequence_length
\Length
Definition PEFFFile.h:185
PEFFEntry(const PEFFEntry &rhs)=default
String prefix
Database prefix from description line (e.g., "sp" from ">sp:P12345")
Definition PEFFFile.h:176
Int protein_existence
\PE (1-5)
Definition PEFFFile.h:188
AASequence getModifiedSequence() const
Get an AASequence with all annotated modifications applied.
std::vector< PEFFModification > modifications
Definition PEFFFile.h:194
std::map< String, String > custom_annotations
Definition PEFFFile.h:202
String sequence
Definition PEFFFile.h:178
std::vector< String > proteoforms
ProForma notation.
Definition PEFFFile.h:199
std::vector< PEFFProcessedRegion > processed_regions
Definition PEFFFile.h:197
void getVariantSequences(std::vector< std::string > &descriptions, std::vector< AASequence > &sequences, bool include_complex=false) const
Get all variant sequences (each variant applied individually).
PEFFEntry(PEFFEntry &&rhs) noexcept=default
AASequence getSequence() const
Get the base AASequence for this entry (unmodified sequence).
std::vector< PEFFVariantSimple > simple_variants
Definition PEFFFile.h:195
Int ncbi_tax_id
\NcbiTaxId or \OX
Definition PEFFFile.h:183
std::vector< PEFFDisulfideBond > disulfide_bonds
\DisulfideBond
Definition PEFFFile.h:198
void digestWithVariants(const ProteaseDigestion &digestor, std::vector< std::string > &descriptions, std::vector< AASequence > &sequences, Size min_length=6, Size max_length=40, bool include_reference=true, bool include_variants=true, bool include_modifications=false) const
Generate all variant and/or modification peptides by digesting with a given protease.
PEFFEntry & operator=(PEFFEntry &&rhs) noexcept=default
static PEFFEntry fromFASTAEntry(const FASTAFile::FASTAEntry &fasta)
Create a PEFFEntry from a FASTAEntry (basic fields only)
std::vector< PEFFVariantComplex > complex_variants
Definition PEFFFile.h:196
String entry_version
\EV
Definition PEFFFile.h:187
FASTAFile::FASTAEntry toFASTAEntry() const
Convert to a FASTAFile::FASTAEntry (loses PEFF-specific annotations)
PEFFEntry & operator=(const PEFFEntry &rhs)=default
String identifier
Definition PEFFFile.h:177
String entry_id
\ID (e.g., NPM_HUMAN)
Definition PEFFFile.h:190
std::vector< String > alt_accessions
\AltAC - alternative accessions
Definition PEFFFile.h:191
AASequence getProcessedSequence(const String &region_accession="PEFF:0001021") const
Get processed sequence (e.g., mature protein without signal peptide).
std::vector< String > protein_names
\PName - may have multiple names
Definition PEFFFile.h:181
String db_unique_id
\DbUniqueId
Definition PEFFFile.h:189
String gene_name
\GName
Definition PEFFFile.h:182
Represents a PEFF modification annotation.
Definition PEFFFile.h:32
bool operator==(const PEFFModification &rhs) const
Definition PEFFFile.h:56
PEFFModification(Size pos, const String &acc, const String &n, const String &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:43
PEFFModification()=default
UInt annotation_id
Optional annotation identifier (when HasAnnotationIdentifiers=true), max() = not set.
Definition PEFFFile.h:37
Type
Definition PEFFFile.h:39
String name
Human-readable name.
Definition PEFFFile.h:35
String accession
"MOD:00046", "UNIMOD:35", or custom
Definition PEFFFile.h:34
Type type
Definition PEFFFile.h:40
Size position
1-based position, 0 = unknown position (?)
Definition PEFFFile.h:33
String optional_tag
Optional tag (last component of annotation tuple)
Definition PEFFFile.h:36
Represents a PEFF processed region (signal peptide, transit peptide, etc.).
Definition PEFFFile.h:118
UInt annotation_id
Optional annotation identifier, max() = not set.
Definition PEFFFile.h:124
PEFFProcessedRegion(Size start, Size end, const String &acc, const String &n="", const String &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:127
PEFFProcessedRegion()=default
String name
Optional name (e.g., "signal peptide")
Definition PEFFFile.h:122
String accession
PEFF CV accession (e.g., "PEFF:0001021")
Definition PEFFFile.h:121
Size end_position
1-based end position
Definition PEFFFile.h:120
bool operator==(const PEFFProcessedRegion &rhs) const
Definition PEFFFile.h:130
Size start_position
1-based start position
Definition PEFFFile.h:119
String optional_tag
Optional tag (last component of annotation tuple)
Definition PEFFFile.h:123
Represents a complex PEFF variant (insertion, deletion, or substitution of multiple amino acids).
Definition PEFFFile.h:93
bool operator==(const PEFFVariantComplex &rhs) const
Definition PEFFFile.h:104
UInt annotation_id
Optional annotation identifier, max() = not set.
Definition PEFFFile.h:98
PEFFVariantComplex(Size start, Size end, const String &repl, const String &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:101
String replacement
Replacement sequence (empty = deletion)
Definition PEFFFile.h:96
Size end_position
1-based end position
Definition PEFFFile.h:95
PEFFVariantComplex()=default
Size start_position
1-based start position
Definition PEFFFile.h:94
String optional_tag
Optional tag (last component of annotation tuple)
Definition PEFFFile.h:97
Represents a simple PEFF variant (single amino acid substitution).
Definition PEFFFile.h:70
UInt annotation_id
Optional annotation identifier, max() = not set.
Definition PEFFFile.h:74
bool operator==(const PEFFVariantSimple &rhs) const
Definition PEFFFile.h:80
PEFFVariantSimple(Size pos, char aa, const String &tag="", UInt aid=std::numeric_limits< UInt >::max())
Definition PEFFFile.h:77
PEFFVariantSimple()=default
char variant_aa
Variant amino acid.
Definition PEFFFile.h:72
Size position
1-based position
Definition PEFFFile.h:71
String optional_tag
Optional tag (last component of annotation tuple)
Definition PEFFFile.h:73