QuickRank  v2.0
QuickRank: A C++ suite of Learning to Rank algorithms
dataset.h
Go to the documentation of this file.
1 /*
2  * QuickRank - A C++ suite of Learning to Rank algorithms
3  * Webpage: http://quickrank.isti.cnr.it/
4  * Contact: quickrank@isti.cnr.it
5  *
6  * Unless explicitly acquired and licensed from Licensor under another
7  * license, the contents of this file are subject to the Reciprocal Public
8  * License ("RPL") Version 1.5, or subsequent versions as allowed by the RPL,
9  * and You may not copy or use this file in either source code or executable
10  * form, except in compliance with the terms and conditions of the RPL.
11  *
12  * All software distributed under the RPL is provided strictly on an "AS
13  * IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, AND
14  * LICENSOR HEREBY DISCLAIMS ALL SUCH WARRANTIES, INCLUDING WITHOUT
15  * LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
16  * PURPOSE, QUIET ENJOYMENT, OR NON-INFRINGEMENT. See the RPL for specific
17  * language governing rights and limitations under the RPL.
18  *
19  * Contributor:
20  * HPC. Laboratory - ISTI - CNR - http://hpc.isti.cnr.it/
21  */
22 #pragma once
23 
24 #include <iostream>
25 #include <memory>
26 #include <vector>
27 
28 #include "types.h"
29 #include "data/queryresults.h"
30 
31 namespace quickrank {
32 namespace data {
33 
34 /**
35  * This class implements a Dataset to be used for a Learning-to-Rank (L-t-R) task.
36  *
37  * The internal representation is quite simple: a row-major array
38  * of size \a num_instances() x \a num_features(),
39  * where each training instance is a document.
40  * Direct access to the internal representation
41  * is allowed through the function \a at()
42  * to support fast access and custom high performance implementations.
43  * The internal representation is horizontal (instances x features).
44  */
45 class Dataset {
46  public:
47 
48  /// Allocates an empty Dataset of given size in horizontal format.
49  ///
50  /// \param n_instances The number of training instances (lines) in the dataset.
51  /// \param n_features The number of features.
52  Dataset(size_t n_instances, size_t n_features);
53  virtual ~Dataset();
54 
55  /// Copy construction is disabled to avoid inefficient copies.
56  Dataset(const Dataset &other) = delete;
57  /// Copy assignment is disabled to avoid inefficient copies.
58  Dataset &operator=(const Dataset &) = delete;
59 
60  /// Returns a pointer to a specific data item.
61  ///
62  /// \param document_id The document of interest.
63  /// \param feature_id The feature of interest.
64  /// \returns A pointer to the requested feature value of the given document id.
65  quickrank::Feature *at(size_t document_id, size_t feature_id) {
66  return data_ + document_id * num_features_ + feature_id;  // row-major offset; no bounds check here
67  }
68 
69  /// Returns the relevance label of the document with the given id.
70  Label getLabel(size_t document_id) {
71  return labels_[document_id];  // plain array indexing; no bounds check here
72  }
73 
74  /// Returns the offset in the internal data structure of the i-th query
75  /// results list.
76  ///
77  /// \param i The i-th query results list of interest.
78  /// \returns The offset of the first document in the i-th query results list.
79  /// This can be used to later invoke the \a at() function.
80  size_t offset(size_t i) const {
81  return offsets_[i];
82  }
83 
84  /// Returns the i-th QueryResults in the dataset.
85  ///
86  /// \param i The i-th query results list of interest.
87  /// \returns The requested QueryResults, owned by the caller.
88  std::unique_ptr<QueryResults> getQueryResults(size_t i) const;
89 
90  /// Adds a new training instance, i.e., a labeled document, to the dataset.
91  ///
92  /// \warning Currently the addition works only when data is in HORIZ format.
93  /// \param q_id The query ID.
94  /// \param i_label The relevance label of the result.
95  /// \param i_features The feature vector of the document (taken by value).
96  void addInstance(QueryID q_id, Label i_label,
97  std::vector<Feature> i_features);
98 
99  /// Returns the number of features used to represent a document.
100  size_t num_features() const {
101  return num_features_;
102  }
103  /// Returns the number of queries in the dataset.
104  size_t num_queries() const {
105  return num_queries_;
106  }
107  /// Returns the number of documents in the dataset.
108  size_t num_instances() const {
109  return num_instances_;
110  }
111 
112  // - support normalization
113  // - support discretisation, or simply provide discretised thresholds
114  // - support horizontal and vertical sampling
115 
116  private:
117  // NOTE(review): the declarations at lines 118, 120, 122, 123, 126 and 127 (num_features_, num_instances_, data_, labels_, last_instance_id_, max_instances_) are missing from this listing; they appear only in the member index below.
119  size_t num_queries_;
121 
124  std::vector<size_t> offsets_;
125 
128 
129  /// The output stream operator.
130  /// Delegates to the virtual put() method.
131  friend std::ostream &operator<<(std::ostream &os, const Dataset &me) {
132  return me.put(os);
133  }
134 
135  /// Prints the data reading time stats
136  virtual std::ostream &put(std::ostream &os) const;
137 
138 };
139 
140 } // namespace data
141 } // namespace quickrank
142 
size_t offset(size_t i) const
Returns the offset in the internal data structure of the i-th query results list. ...
Definition: dataset.h:80
Definition: dataset.cc:28
size_t last_instance_id_
Definition: dataset.h:126
std::unique_ptr< QueryResults > getQueryResults(size_t i) const
Returns the i-th QueryResults in the dataset.
Definition: dataset.cc:89
Dataset & operator=(const Dataset &)=delete
Avoid inefficient copy assignment.
size_t max_instances_
Definition: dataset.h:127
quickrank::data::Dataset
This class implements a Dataset to be used for a L-t-R task.
Definition: dataset.h:45
virtual ~Dataset()
Definition: dataset.cc:57
quickrank::Feature * at(size_t document_id, size_t feature_id)
Returns a pointer to a specific data item.
Definition: dataset.h:65
size_t num_features_
Definition: dataset.h:118
Label getLabel(size_t document_id)
Returns the value of the i-th relevance label.
Definition: dataset.h:70
float Feature
data type for instance feature
Definition: types.h:31
quickrank::Feature * data_
Definition: dataset.h:122
std::vector< size_t > offsets_
Definition: dataset.h:124
size_t num_instances_
Definition: dataset.h:120
unsigned int QueryID
data type for query identifiers
Definition: types.h:32
size_t num_queries() const
Returns the number of queries in the dataset.
Definition: dataset.h:104
size_t num_queries_
Definition: dataset.h:119
friend std::ostream & operator<<(std::ostream &os, const Dataset &me)
The output stream operator.
Definition: dataset.h:131
void addInstance(QueryID q_id, Label i_label, std::vector< Feature > i_features)
Add a new training instance, i.e., a labeled document, to the dataset.
Definition: dataset.cc:64
Dataset(size_t n_instances, size_t n_features)
Allocates an empty Dataset of given size in horizontal format.
Definition: dataset.cc:31
size_t num_instances() const
Returns the number of documents in the dataset.
Definition: dataset.h:108
quickrank::Label * labels_
Definition: dataset.h:123
size_t num_features() const
Returns the number of features used to represent a document.
Definition: dataset.h:100
virtual std::ostream & put(std::ostream &os) const
Prints the data reading time stats.
Definition: dataset.cc:99
float Label
Definition: types.h:29