QuickRank  v2.0
QuickRank: A C++ suite of Learning to Rank algorithms
mart.h
Go to the documentation of this file.
1 /*
2  * QuickRank - A C++ suite of Learning to Rank algorithms
3  * Webpage: http://quickrank.isti.cnr.it/
4  * Contact: quickrank@isti.cnr.it
5  *
6  * Unless explicitly acquired and licensed from Licensor under another
7  * license, the contents of this file are subject to the Reciprocal Public
8  * License ("RPL") Version 1.5, or subsequent versions as allowed by the RPL,
9  * and You may not copy or use this file in either source code or executable
10  * form, except in compliance with the terms and conditions of the RPL.
11  *
12  * All software distributed under the RPL is provided strictly on an "AS
13  * IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, AND
14  * LICENSOR HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
15  * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
16  * PURPOSE, QUIET ENJOYMENT, OR NON-INFRINGEMENT. See the RPL for specific
17  * language governing rights and limitations under the RPL.
18  *
19  * Contributor:
20  * HPC. Laboratory - ISTI - CNR - http://hpc.isti.cnr.it/
21  */
22 #pragma once
23 
24 #include "types.h"
25 #include "learning/ltr_algorithm.h"
26 #include "learning/tree/rt.h"
27 #include "learning/tree/ensemble.h"
28 
29 namespace quickrank {
30 namespace learning {
31 namespace forests {
32 
33 class Mart: public LTR_Algorithm {
34  public:
35  /// Initializes a new Mart instance with the given learning parameters.
36  ///
37  /// \param ntrees Maximum number of trees.
38  /// \param shrinkage Learning rate.
39  /// \param nthresholds Number of bins in discretization. 0 means no discretization.
40  /// \param ntreeleaves Maximum number of leaves in each tree.
41  /// \param minleafsupport Minimum number of instances in each leaf.
42  /// \param valid_iterations Early stopping if no improvement after \esr iterations
43  /// on the validation set.
44  Mart(size_t ntrees, double shrinkage, size_t nthresholds,
45  size_t ntreeleaves, size_t minleafsupport,
46  size_t valid_iterations)
47  : ntrees_(ntrees),
48  shrinkage_(shrinkage),
49  nthresholds_(nthresholds),
50  nleaves_(ntreeleaves),
51  minleafsupport_(minleafsupport),
52  valid_iterations_(valid_iterations) {
53  }
54 
55  /// Generates a LTR_Algorithm instance from a previously saved XML model.
56  Mart(const pugi::xml_document &model);
57 
58  virtual ~Mart() {
59  }
60 
61  /// Start the learning process.
62  virtual void learn(std::shared_ptr<data::Dataset> training_dataset,
63  std::shared_ptr<data::Dataset> validation_dataset,
64  std::shared_ptr<metric::ir::Metric> training_metric,
65  size_t partial_save,
66  const std::string output_basename);
67 
68  /// Returns the score by the current ranker
69  ///
70  /// \param d Document to be scored.
71  virtual Score score_document(const Feature *d) const {
72  return ensemble_model_.score_instance(d, 1);
73  }
74 
75  /// Returns the partial scores of a given document, tree.
76  /// \param d is a pointer to the document to be evaluated
77  /// \param next_fx_offset The offset to the next feature in the data representation.
78  /// \note Each algorithm has a different implementation.
79  virtual std::shared_ptr<std::vector<Score>> partial_scores_document(
80  const Feature *d) const {
81  // TODO: remove the following code...
82  if (!ensemble_model_.get_size()) {
83  std::cerr << "Zero alberi nell'ensemble..." << std::endl;
84  exit(EXIT_FAILURE);
85  }
87  }
88 
89  /// Print additional statistics.
90  ///
91  /// At the moment this include only number of comparisons for tree-based algorithms.
92  virtual void print_additional_stats(void) const;
93 
94  /// Returns the name of the ranker.
95  virtual std::string name() const {
96  return NAME_;
97  }
98 
99  virtual bool update_weights(std::shared_ptr<std::vector<double>> weights);
100 
101  virtual std::shared_ptr<std::vector<double>> get_weights() const {
102  return ensemble_model_.get_weights();
103  }
104 
105  static const std::string NAME_;
106 
107  protected:
108 
109  /// Prepares private data structures before training takes place.
110  virtual void init(std::shared_ptr<data::VerticalDataset> training_dataset);
111 
112  /// De-allocates private data structure after training has taken place.
113  virtual void clear(size_t num_features);
114 
115  /// Computes pseudo responses.
116  ///
117  /// \param training_dataset The training data.
118  /// \param metric The metric to be optimized.
119  virtual void compute_pseudoresponses(
120  std::shared_ptr<data::VerticalDataset> training_dataset,
121  metric::ir::Metric *metric);
122 
123  /// Fits a regression tree on the gradient given by the pseudo residuals
124  ///
125  /// \param training_dataset The dataset used for training
126  virtual std::unique_ptr<RegressionTree> fit_regressor_on_gradient(
127  std::shared_ptr<data::VerticalDataset> training_dataset);
128 
129  /// Updates scores with the last learnt regression tree.
130  ///
131  /// \param dataset Dataset to be scored.
132  /// \param scores Scores vector to be updated.
133  /// \param tree Last regression tree leartn.
134  virtual void update_modelscores(std::shared_ptr<data::Dataset> dataset,
135  Score *scores, RegressionTree *tree);
136  virtual void
137  update_modelscores(std::shared_ptr<data::VerticalDataset> dataset,
138  Score *scores, RegressionTree *tree);
139 
140  virtual pugi::xml_document *get_xml_model() const;
141 
142  protected:
143  float **thresholds_ = NULL;
144  size_t *thresholds_size_ = NULL;
145  double *scores_on_training_ = NULL; //[0..nentries-1]
146  quickrank::Score *scores_on_validation_ = NULL; //[0..nentries-1]
148  double *pseudoresponses_ = NULL; //[0..nentries-1]
150 
151  size_t ntrees_; //>0
152  double shrinkage_; //>0.0f
153  size_t nthresholds_; //if ==0 then no. of thresholds is not limited
154  size_t nleaves_; //>0
155  size_t minleafsupport_; //>0
156  size_t
157  valid_iterations_; //If no performance gain on validation data is observed in 'esr' rounds, stop the training process right away (if esr==0 feature is disabled).
158 
159  size_t **sortedsid_ = NULL;
160  size_t sortedsize_ = 0;
162 
163  private:
164  /// The output stream operator.
165  friend std::ostream &operator<<(std::ostream &os, const Mart &a) {
166  return a.put(os);
167  }
168 
169  /// Prints the description of Algorithm, including its parameters.
170  virtual std::ostream &put(std::ostream &os) const;
171 
172 };
173 
174 } // namespace forests
175 } // namespace learning
176 } // namespace quickrank
177 
size_t sortedsize_
Definition: mart.h:160
size_t ntrees_
Definition: mart.h:151
Definition: dataset.cc:28
virtual std::string name() const
Returns the name of the ranker.
Definition: mart.h:95
virtual std::shared_ptr< std::vector< double > > get_weights() const
Return the weights for the ensemble models (only).
Definition: mart.h:101
virtual pugi::xml_document * get_xml_model() const
Return the xml model representing the current object.
Definition: mart.cc:355
virtual std::unique_ptr< RegressionTree > fit_regressor_on_gradient(std::shared_ptr< data::VerticalDataset > training_dataset)
Fits a regression tree on the gradient given by the pseudo residuals.
Definition: mart.cc:309
This class implements the basic functionalities of an IR evaluation metric.
Definition: metric.h:43
RTRootHistogram * hist_
Definition: mart.h:161
virtual void print_additional_stats(void) const
Print additional statistics.
Definition: mart.cc:378
size_t get_size() const
Definition: ensemble.h:36
size_t validation_bestmodel_
Definition: mart.h:147
friend std::ostream & operator<<(std::ostream &os, const Mart &a)
The output stream operator.
Definition: mart.h:165
Definition: ltr_algorithm.h:33
float Feature
data type for instance feature values
Definition: types.h:31
double shrinkage_
Definition: mart.h:152
double * pseudoresponses_
Definition: mart.h:148
virtual void init(std::shared_ptr< data::VerticalDataset > training_dataset)
Prepares private data structures before training takes place.
Definition: mart.cc:93
quickrank::Score * scores_on_validation_
Definition: mart.h:146
virtual void learn(std::shared_ptr< data::Dataset > training_dataset, std::shared_ptr< data::Dataset > validation_dataset, std::shared_ptr< metric::ir::Metric > training_metric, size_t partial_save, const std::string output_basename)
Start the learning process.
Definition: mart.cc:176
float ** thresholds_
Definition: mart.h:143
static const std::string NAME_
Definition: mart.h:105
virtual ~Mart()
Definition: mart.h:58
size_t nthresholds_
Definition: mart.h:153
virtual void update_modelscores(std::shared_ptr< data::Dataset > dataset, Score *scores, RegressionTree *tree)
Updates scores with the last learnt regression tree.
Definition: mart.cc:323
size_t * thresholds_size_
Definition: mart.h:144
Mart(size_t ntrees, double shrinkage, size_t nthresholds, size_t ntreeleaves, size_t minleafsupport, size_t valid_iterations)
Initializes a new Mart instance with the given learning parameters.
Definition: mart.h:44
Definition: ensemble.h:28
Definition: rtnode_histogram.h:53
size_t valid_iterations_
Definition: mart.h:157
double * scores_on_training_
Definition: mart.h:145
double Score
data type for instance predicted score
Definition: types.h:30
virtual std::shared_ptr< std::vector< quickrank::Score > > partial_scores_instance(const quickrank::Feature *d, const size_t offset=1) const
Definition: ensemble.cc:61
virtual void clear(size_t num_features)
De-allocates private data structure after training has taken place.
Definition: mart.cc:157
Definition: rt.h:44
virtual Score score_document(const Feature *d) const
Returns the score by the current ranker.
Definition: mart.h:71
size_t nleaves_
Definition: mart.h:154
virtual bool update_weights(std::shared_ptr< std::vector< double >> weights)
Update the weights for the ensemble models (only).
Definition: mart.cc:374
virtual quickrank::Score score_instance(const quickrank::Feature *d, const size_t offset=1) const
Definition: ensemble.cc:51
virtual std::ostream & put(std::ostream &os) const
Prints the description of Algorithm, including its parameters.
Definition: mart.cc:76
size_t minleafsupport_
Definition: mart.h:155
virtual void compute_pseudoresponses(std::shared_ptr< data::VerticalDataset > training_dataset, metric::ir::Metric *metric)
Computes pseudo responses.
Definition: mart.cc:300
virtual std::shared_ptr< std::vector< Score > > partial_scores_document(const Feature *d) const
Returns the partial scores of a given document, tree.
Definition: mart.h:79
Ensemble ensemble_model_
Definition: mart.h:149
virtual std::shared_ptr< std::vector< double > > get_weights() const
Definition: ensemble.cc:104
size_t ** sortedsid_
Definition: mart.h:159