pktools 2.6.7
Processing Kernel for geospatial data
FeatureSelector.h
1/**********************************************************************
2FeatureSelector.h: select features, typical use: feature selection for classification
3Copyright (C) 2008-2012 Pieter Kempeneers
4
5This file is part of pktools
6
7pktools is free software: you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
11
12pktools is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with pktools. If not, see <http://www.gnu.org/licenses/>.
19***********************************************************************/
20#ifndef _FEATURESELECTOR_H_
21#define _FEATURESELECTOR_H_
22
23#include <math.h>
24#include <vector>
25#include <list>
26#include <algorithm>
27#include <iostream>
28#include <iomanip>
29#include "ConfusionMatrix.h"
30#include "base/IndexValue.h"
31#include "base/Vector2d.h"
32#include "gsl/gsl_combination.h"
33#include "CostFactory.h"
34
36{
37 public:
40 template<class T> double forward(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures=0, short verbose=0);
41 template<class T> double backward(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int minFeatures, short verbose=0);
42 template<class T> double floating(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures=0, double epsilon=0.001, short verbose=0);
43 template<class T> double bruteForce(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures=0, short verbose=0);
44
45 private:
46 template<class T> double addFeature(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, short verbose=0);
47 template<class T> double removeFeature(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int& r, short verbose=0);
48 template<class T> double forwardUnivariate(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures=0, short verbose=0);
49};
50
51//sequential forward selection Univariate (N single best features)
52template<class T> double FeatureSelector::forwardUnivariate(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures, short verbose){
53 int maxLevels=v[0][0].size();
54 if(!maxFeatures)
55 maxFeatures=maxLevels;
56 int k=subset.size();
57 if(k>=maxFeatures)
58 return -1;
59 std::vector<IndexValue> cost(maxLevels);
60 std::list<int> tmpset=subset;//temporary set of selected features (levels)
61 std::vector< Vector2d<T> > tmp(v.size());
62 for(int ilevel=0;ilevel<maxLevels;++ilevel){
63 if(find(tmpset.begin(),tmpset.end(),ilevel)==tmpset.end()){
64 tmpset.push_back(ilevel);
65 for(int iclass=0;iclass<v.size();++iclass){
66// tmp[iclass].resize(v[iclass].size());
67 v[iclass].selectCols(tmpset,tmp[iclass]);
68 }
69 try{
70 IndexValue pv;
71 pv.position=ilevel;
72 pv.value=theCostFactory.getCost(tmp);
73 cost[ilevel]=pv;
74 }
75 catch(...){
76 IndexValue pv;
77 pv.position=ilevel;
78 pv.value=-1;
79 cost[ilevel]=pv;
80 }
81 tmpset.pop_back();
82 }
83 }
84 sort(cost.begin(),cost.end(),Compare_IndexValue());//decreasing order
85 int ilevel=0;
86 while((subset.size()<maxFeatures)&&(ilevel<maxLevels)){
87 if(cost[ilevel].value>0)
88 subset.push_back(cost[ilevel].position);
89 if(verbose)
90 std::cout << "feature " << subset.back() << " has cost: " << cost[ilevel].value << std::endl;
91 ++ilevel;
92 }
93 double maxCost=-1;
94 while(subset.size()){
95 for(int iclass=0;iclass<v.size();++iclass){
96// tmp[iclass].resize(v[iclass].size());
97 v[iclass].selectCols(subset,tmp[iclass]);
98 }
99 try{
100 maxCost=theCostFactory.getCost(tmp);
101 }
102 catch(...){
103 subset.pop_back();
104 continue;
105 }
106 return maxCost;
107 }
108}
109
110//sequential forward selection Multivariate (Combination of N best features)
111template<class T> double FeatureSelector::forward(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures, short verbose){
112 //Select feature with the best value (get maximal cost for 1 feature)
113 double maxCost=0;
114 int maxLevels=v[0][0].size();
115 if(!maxFeatures)
116 maxFeatures=maxLevels;
117 while(subset.size()<maxFeatures){
118 maxCost=addFeature(v,theCostFactory,subset,verbose);
119 if(verbose){
120 for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
121 std::cout << *lit << " ";
122 std::cout << std::endl;
123 // std::cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << maxCost << std::endl;
124 }
125 }//while
126 return maxCost;
127}
128
129//sequential backward selection
130template<class T> double FeatureSelector::backward(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int minFeatures, short verbose){
131 //Select features with least effect on cost when removed (obtain minFeatures eventually)
132 double maxCost=0;
133 int removedFeature;
134 if(subset.empty()){
135 for(int iFeature=0;iFeature<v[0][0].size();++iFeature)
136 subset.push_back(iFeature);
137 }
138 if(subset.size()==minFeatures)
139 maxCost=theCostFactory.getCost(v);
140 while(subset.size()>minFeatures){
141 maxCost=removeFeature(v,theCostFactory,subset,removedFeature,verbose);
142 if(verbose){
143 for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
144 std::cout << *lit << " ";
145 std::cout << std::endl;
146 // std::cout << "removed " << removedFeature << ", " << subset.size() << "/" << minFeatures << " features remain with cost: " << maxCost << std::endl;
147 }
148 }//while
149 return maxCost;
150}
151
152//floating search
153template<class T> double FeatureSelector::floating(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures, double epsilon, short verbose){
154 std::vector<T> cost;
155 int maxLevels=v[0][0].size();
156 if(maxFeatures<1)
157 maxFeatures=maxLevels;
158 int k=subset.size();
159 if(k>=maxFeatures)
160 return -1;
161 while(cost.size()<subset.size())
162 cost.push_back(1);//init original features as cost 1
163 cost.push_back(addFeature(v,theCostFactory,subset,verbose));
164 ++k;
165 if(verbose>1)
166 std::cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << cost.back() << std::endl;
167 else if(verbose){
168 for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
169 std::cout << *lit << " ";
170 std::cout << std::endl;
171 }
172 while(k<maxFeatures){
173 cost.push_back(addFeature(v,theCostFactory,subset,verbose));
174 ++k;
175 if(verbose>1)
176 std::cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << cost.back() << std::endl;
177 else if(verbose){
178 for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
179 std::cout << *lit << " ";
180 std::cout << " (cost: " << cost.back() << ")" << std::endl;
181 }
182
183 while(k>1){
184 int x_r;
185 double cost_r=removeFeature(v,theCostFactory,subset,x_r,verbose);
186 if(cost_r>cost[k-1]+epsilon){
187 --k;
188 cost[k]=cost_r;
189 cost.pop_back();
190 if(verbose>1)
191 std::cout << "removed " << x_r << ", " << subset.size() << "/" << maxFeatures << " features remain with cost: " << cost_r << std::endl;
192 else if(verbose){
193 for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
194 std::cout << *lit << " ";
195 std::cout << " (cost: " << cost.back() << ")" << std::endl;
196 }
197 continue;
198 }
199 else if(cost_r>=0){
200 subset.push_back(x_r);
201 break;
202 }
203 else if(verbose)
204 std::cout << "could not remove any feature" << std::endl;
205 cost.pop_back();
206 }
207 }
208 return cost.back();
209}
210
211//brute force search (search for all possible combinations and select best)
212template<class T> double FeatureSelector::bruteForce(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures, short verbose){
213 int maxLevels=v[0][0].size();
214 if(maxFeatures<1)
215 maxFeatures=maxLevels;
216 int k=subset.size();
217 if(k>=maxFeatures)
218 return -1;
219// gslmm::combination c(v1[0].size(),maxFeatures,false);
220 gsl_combination *c;
221 c=gsl_combination_calloc(v[0][0].size(),maxFeatures);
222
223 std::list<int> tmpset;//temporary set of selected features (levels)
224 std::vector< Vector2d<T> > tmpv(v.size());
225 std::list<int> catchset;//restore set in case of catch all the way to last level (no better cost)
226 //initialize maxCost with actual cost for current subset (-1 if empty subset)
227 double maxCost=-1;
228 double cost;
229 if(subset.size()>=maxLevels)
230 return maxCost;
231 gsl_combination_next(c);
232 do{
233 for(int ifeature=0;ifeature<maxFeatures;++ifeature)
234 tmpset.push_back(c->data[ifeature]);
235 for(int iclass=0;iclass<v.size();++iclass)
236 v[iclass].selectCols(tmpset,tmpv[iclass]);
237 try{
238 cost=theCostFactory.getCost(tmpv);
239 }
240 catch(...){ //singular matrix encountered
241 catchset=tmpset;//this tmpset resulted in failure of getCost
242 if(verbose){
243 std::cout << "Could not get cost from set: " << std::endl;
244 gsl_combination_fprintf(stdout,c," %u");
245 printf("\n");
246 }
247 tmpset.clear();
248 continue;
249 }
250 if(maxCost<cost){ //set with better cost is found
251 maxCost=cost;
252 subset=tmpset;
253 }
254 tmpset.clear();
255 }while(gsl_combination_next(c)==GSL_SUCCESS);
256 gsl_combination_free(c);
257// }while(c.next());
258 if(maxCost<0) //no level added to better maxCost than current subset (catchset)
259 subset=catchset;
260 //test: assert list contains no duplicate elements
261 for(std::list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){
262 std::list<int>::iterator lit2=lit;//start searching from next element
263 assert(find(++lit2,subset.end(),*lit)==subset.end());
264 }
265 return maxCost;
266}
267
268template<class T> double FeatureSelector::addFeature(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, short verbose){
269 //select feature with the best value (get maximal cost for 1 feature)
270 std::list<int> tmpset=subset;//temporary set of selected features (levels)
271 std::vector< Vector2d<T> > tmp(v.size());
272 std::list<int> catchset;//restore set in case of catch all the way to last level (no better cost)
273 //initialize maxCost with actual cost for current subset (-1 if empty subset)
274 double maxCost=-1;
275 double cost;
276 int maxLevels=v[0][0].size();
277 if(subset.size()>=maxLevels)
278 return maxCost;
279 for(int ilevel=0;ilevel<maxLevels;++ilevel){
280 if(find(tmpset.begin(),tmpset.end(),ilevel)!=tmpset.end())
281 continue;
282 tmpset.push_back(ilevel);
283 for(int iclass=0;iclass<v.size();++iclass){
284// tmp[iclass].resize(v[iclass].size());
285 v[iclass].selectCols(tmpset,tmp[iclass]);
286 }
287 try{
288 cost=theCostFactory.getCost(tmp);
289 }
290 catch(...){
291 catchset=tmpset;//this tmpset resulted in singular matrix
292 if(verbose)
293 std::cout << "Could not add feature " << tmpset.back() << std::endl;
294 tmpset.pop_back();
295 continue;
296 }
297 if(maxCost<cost){ //level with better cost is found
298 maxCost=cost;
299 subset=tmpset;
300 }
301 tmpset.pop_back();
302 }
303 if(maxCost<0) //no level added to better maxCost than current subset (catchset)
304 subset=catchset;
305 //test: assert list contains no duplicate elements
306 for(std::list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){
307 std::list<int>::iterator lit2=lit;//start searching from next element
308 assert(find(++lit2,subset.end(),*lit)==subset.end());
309 }
310 return maxCost;
311}
312
313template<class T> double FeatureSelector::removeFeature(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int& r, short verbose){
314 //find the feature that has the least effect on the cost when it is removed from subset
315 std::list<int> tmpset=subset;//temporary set of selected features (levels)
316 std::vector< Vector2d<T> > tmp(v.size());
317 int nFeatures=subset.size();
318 std::list<int> catchset;//restore set in case of catch all the way to last level (no better cost)
319 //initialize maxCost with actual cost for current subset (-1 if empty subset)
320 double maxCost=-1;
321 int last;
322 double cost;
323 int maxLevels=v[0][0].size();
324 if(subset.size()>maxLevels||subset.empty()){
325 return maxCost;
326 }
327 std::list<int>::const_iterator it;
328 for(int i=0;i<nFeatures;++i){
329 last=tmpset.back();
330 tmpset.pop_back();
331 for(int iclass=0;iclass<v.size();++iclass){
332// tmp[iclass].resize(v[iclass].size());
333 v[iclass].selectCols(tmpset,tmp[iclass]);
334 }
335 try{
336 cost=theCostFactory.getCost(tmp);
337 }
338 catch(...){
339 catchset=tmpset;//this tmpset resulted in singular matrix
340 if(verbose)
341 std::cout << "Could not remove feature " << last << std::endl;
342 tmpset.push_front(last);
343 continue;
344 }
345 if(maxCost<cost){ //level with better cost is found
346 maxCost=cost;
347 subset=tmpset;
348 r=last;
349 }
350 tmpset.push_front(last);
351 }
352 if(maxCost<0){//all levels removed were caught
353 subset=catchset;
354 }
355 //test: assert list contains no duplicate elements
356 for(std::list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){
357 std::list<int>::iterator lit2=lit;//start searching from next element
358 assert(find(++lit2,subset.end(),*lit)==subset.end());
359 }
360 return maxCost;
361}
362
363#endif /* _FEATURESELECTOR_H_ */