OpenMS  2.8.0
LibSVMEncoder.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Nico Pfeifer $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
40 
41 #include <vector>
42 #include <utility>
43 
44 struct svm_problem;
45 struct svm_parameter;
46 struct svm_model;
47 
48 namespace OpenMS
49 {
58  class OPENMS_DLLAPI LibSVMEncoder
59  {
60 public:
62  LibSVMEncoder() = default;
64  ~LibSVMEncoder() = default;
65 
75  void encodeCompositionVector(const String & sequence, std::vector<std::pair<Int, double> > & encoded_vector, const String & allowed_characters = "ACDEFGHIKLMNPQRSTVWY");
76 
86  void encodeCompositionVectors(const std::vector<String> & sequences, const String & allowed_characters, std::vector<std::vector<std::pair<Int, double> > > & composition_vectors);
88  svm_node * encodeLibSVMVector(const std::vector<std::pair<Int, double> > & feature_vector);
89 
91  void encodeLibSVMVectors(const std::vector<std::vector<std::pair<Int, double> > > & feature_vectors, std::vector<svm_node *> & libsvm_vectors);
92 
94  svm_problem * encodeLibSVMProblem(const std::vector<svm_node *> & vectors,
95  std::vector<double> & labels);
96 
98  svm_problem * encodeLibSVMProblemWithCompositionVectors(const std::vector<String> & sequences,
99  std::vector<double> & labels,
100  const String & allowed_characters);
101 
107  svm_problem * encodeLibSVMProblemWithCompositionAndLengthVectors(const std::vector<String> & sequences,
108  std::vector<double> & labels,
109  const String & allowed_characters,
110  UInt maximum_sequence_length);
111 
117  svm_problem * encodeLibSVMProblemWithCompositionLengthAndWeightVectors(const std::vector<String> & sequences,
118  std::vector<double> & labels,
119  const String & allowed_characters);
120 
122  bool storeLibSVMProblem(const String & filename, const svm_problem * problem) const;
123 
125  svm_problem * loadLibSVMProblem(const String & filename);
126 
128  void encodeOligoBorders(String sequence,
129  UInt k_mer_length,
130  const String & allowed_characters,
131  UInt border_length,
132  std::vector<std::pair<Int, double> > & libsvm_vector,
133  bool strict = false,
134  bool unpaired = false,
135  bool length_encoding = false);
136 
138  svm_problem * encodeLibSVMProblemWithOligoBorderVectors(const std::vector<String> & sequences,
139  std::vector<double> & labels,
140  UInt k_mer_length,
141  const String & allowed_characters,
142  UInt border_length,
143  bool strict = false,
144  bool unpaired = false,
145  bool length_encoding = false);
146 
148  void encodeProblemWithOligoBorderVectors(const std::vector<AASequence> & sequences,
149  UInt k_mer_length,
150  const String & allowed_characters,
151  UInt border_length,
152  std::vector<std::vector<std::pair<Int, double> > > & vectors);
153 
160  void libSVMVectorToString(svm_node * vector, String & output);
161 
168  void libSVMVectorsToString(svm_problem * vector, String & output);
169 
176  void encodeOligo(const AASequence & sequence,
177  UInt k_mer_length,
178  const String & allowed_characters,
179  std::vector<std::pair<Int, double> > & values,
180  bool is_right_border = false);
181 
187  static void destroyProblem(svm_problem * problem);
188 
189  static std::vector<double> predictPeptideRT(const std::vector<String> & sequences,
190  SVMWrapper& svm,
191  const String & allowed_characters = "ACDEFGHIKLMNPQRSTVWY",
192  UInt maximum_sequence_length = 50)
193  {
194  std::vector<double> predicted_retention_times;
195 
196  LibSVMEncoder encoder;
197  std::vector<double> temp_rts;
198  temp_rts.resize(sequences.size(), 0);
199  svm_problem * prediction_data =
201  temp_rts,
202  allowed_characters,
203  maximum_sequence_length);
204  svm.predict(prediction_data, predicted_retention_times);
205  LibSVMEncoder::destroyProblem(prediction_data);
206  return predicted_retention_times;
207  }
208 
209 private:
211  static bool cmpOligos_(std::pair<Int, double> a,
212  std::pair<Int, double> b);
213 
214  };
215 
216 } // namespace OpenMS
217 
Representation of a peptide/protein sequence.
Definition: AASequence.h:112
Serves for encoding sequences into feature vectors.
Definition: LibSVMEncoder.h:59
svm_problem * loadLibSVMProblem(const String &filename)
loads the LibSVM-encoded data stored in 'filename'
void encodeOligoBorders(String sequence, UInt k_mer_length, const String &allowed_characters, UInt border_length, std::vector< std::pair< Int, double > > &libsvm_vector, bool strict=false, bool unpaired=false, bool length_encoding=false)
encodes the borders of the sequence as k_mer oligos and stores them in 'libsvm_vector'
void encodeCompositionVectors(const std::vector< String > &sequences, const String &allowed_characters, std::vector< std::vector< std::pair< Int, double > > > &composition_vectors)
stores composition vectors of the sequences given by 'sequences' in 'composition_vectors'
svm_problem * encodeLibSVMProblemWithCompositionVectors(const std::vector< String > &sequences, std::vector< double > &labels, const String &allowed_characters)
creates composition vectors for 'sequences' and stores them in LibSVM compliant format
svm_node * encodeLibSVMVector(const std::vector< std::pair< Int, double > > &feature_vector)
encodes the feature vector in LibSVM compliant format
svm_problem * encodeLibSVMProblemWithCompositionLengthAndWeightVectors(const std::vector< String > &sequences, std::vector< double > &labels, const String &allowed_characters)
creates composition vectors with additional length and average weight information for 'sequences' and...
svm_problem * encodeLibSVMProblemWithCompositionAndLengthVectors(const std::vector< String > &sequences, std::vector< double > &labels, const String &allowed_characters, UInt maximum_sequence_length)
creates composition vectors with additional length information for 'sequences' and stores them in Lib...
static void destroyProblem(svm_problem *problem)
frees all the memory of the svm_problem instance
void libSVMVectorsToString(svm_problem *vector, String &output)
stores a string representation of the encoded sequences in 'vector' in 'output'
svm_problem * encodeLibSVMProblemWithOligoBorderVectors(const std::vector< String > &sequences, std::vector< double > &labels, UInt k_mer_length, const String &allowed_characters, UInt border_length, bool strict=false, bool unpaired=false, bool length_encoding=false)
creates oligo border vectors for 'sequences' and stores them in LibSVM compliant format
void encodeCompositionVector(const String &sequence, std::vector< std::pair< Int, double > > &encoded_vector, const String &allowed_characters="ACDEFGHIKLMNPQRSTVWY")
stores a composition vector of 'sequence' in 'encoded_vector'
static std::vector< double > predictPeptideRT(const std::vector< String > &sequences, SVMWrapper &svm, const String &allowed_characters="ACDEFGHIKLMNPQRSTVWY", UInt maximum_sequence_length=50)
Definition: LibSVMEncoder.h:189
bool storeLibSVMProblem(const String &filename, const svm_problem *problem) const
stores the LibSVM-encoded data in a text file that can be used by the LibSVM applications (svm-scale,...
void encodeOligo(const AASequence &sequence, UInt k_mer_length, const String &allowed_characters, std::vector< std::pair< Int, double > > &values, bool is_right_border=false)
encodes an AASequence instance in oligo encoding
LibSVMEncoder()=default
Constructor.
svm_problem * encodeLibSVMProblem(const std::vector< svm_node * > &vectors, std::vector< double > &labels)
encodes the LibSVM compliant vectors into a LibSVM compliant structure
~LibSVMEncoder()=default
Destructor.
static bool cmpOligos_(std::pair< Int, double > a, std::pair< Int, double > b)
comparator for oligos encoded by encodeOligo
void encodeLibSVMVectors(const std::vector< std::vector< std::pair< Int, double > > > &feature_vectors, std::vector< svm_node * > &libsvm_vectors)
encodes the feature vectors in LibSVM compliant format
void libSVMVectorToString(svm_node *vector, String &output)
stores a string representation of the encoded sequence 'vector' in 'output'
void encodeProblemWithOligoBorderVectors(const std::vector< AASequence > &sequences, UInt k_mer_length, const String &allowed_characters, UInt border_length, std::vector< std::vector< std::pair< Int, double > > > &vectors)
creates oligo border vectors for 'sequences' and stores them in 'vectors'
Serves as a wrapper for the libsvm.
Definition: SVMWrapper.h:85
void predict(struct svm_problem *problem, std::vector< double > &predicted_labels)
predicts the labels using the trained model
A more convenient string class.
Definition: String.h:60
unsigned int UInt
Unsigned integer type.
Definition: Types.h:94
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47