49 #include "EST_Wagon.h"
50 #include "EST_cmd_line.h"
// The two kinds of model this program can build: a decision list or a
// decision tree.
52 enum wn_strategy_type {wn_decision_list, wn_decision_tree};
// File-scope selector for the model kind to build.  Starts as a decision
// tree; wagon_main() assigns wn_decision_list to it and later dispatches on
// its value to pick between wgn_build_tree() and wgn_build_dlist().
54 static wn_strategy_type wagon_type = wn_decision_tree;
// Forward declaration of the file-local driver so that main(), which is
// defined earlier in the file than wagon_main() itself, can call it.
// Takes the same argc/argv pair that main() receives.
56 static int wagon_main(
int argc,
char **argv);
107 int main(
int argc,
char **argv)
110 wagon_main(argc,argv);
116 static int set_Vertex_Feats(
EST_Track &wgn_VertexFeats,
123 wgn_VertexFeats.
a(0,i) = 0.0;
134 const EST_String ws = (
const char *)token.whitespace();
138 wgn_VertexFeats.
a(0,i) = 1.0;
140 }
else if ((ws ==
",") || (ws ==
""))
142 s = atoi(token.string());
143 wgn_VertexFeats.
a(0,s) = 1.0;
144 }
else if (ws ==
"-")
149 e = atoi(token.string());
150 for (i=s; i<=e && i<wgn_VertexFeats.
num_channels(); i++)
151 wgn_VertexFeats.
a(0,i) = 1.0;
154 printf(
"wagon: track_feats invalid: %s at position %d\n",
155 (
const char *)wagon_track_features,
164 static int wagon_main(
int argc,
char **argv)
170 ostream *wgn_coutput = 0;
171 float stepwise_limit = 0;
172 int feats_start=0, feats_end=0;
178 "Summary: CART building program\n"+
179 "-desc <ifile> Field description file\n"+
180 "-data <ifile> Datafile, one vector per line\n"+
181 "-stop <int> {50} Minimum number of examples for leaf nodes\n"+
182 "-test <ifile> Datafile to test tree on\n"+
183 "-frs <float> {10} Float range split, number of partitions to\n"+
184 " split a float feature range into\n"+
185 "-dlist Build a decision list (rather than tree)\n"+
186 "-dtree Build a decision tree (rather than list) default\n"+
187 "-output <ofile> \n"+
188 "-o <ofile> File to save output tree in\n"+
189 "-distmatrix <ifile>\n"+
190 " A distance matrix for clustering\n"+
192 " track for vertex indices\n"+
193 "-track_start <int>\n"+
194 " start channel vertex indices\n"+
195 "-track_end <int>\n"+
196 " end (inclusive) channel for vertex indices\n"+
197 "-track_feats <string>\n"+
198 " Track features to use, comma separated list\n"+
199 " with feature numbers and/or ranges, 0 start\n"+
200 "-unittrack <ifile>\n"+
201 " track for unit start and length in vertex track\n"+
202 "-quiet No questions printed during building\n"+
203 "-verbose Lost of information printing during build\n"+
204 "-predictee <string>\n"+
205 " name of field to predict (default is first field)\n"+
206 "-ignore <string>\n"+
207 " Filename or bracket list of fields to ignore\n"+
208 "-count_field <string>\n"+
209 " Name of field containing count weight for samples\n"+
210 "-stepwise Incrementally find best features\n"+
211 "-swlimit <float> {0.0}\n"+
212 " Percentage necessary improvement for stepwise,\n"+
213 " may be negative.\n"+
214 "-swopt <string> Parameter to optimize for stepwise, for \n"+
215 " classification options are correct or entropy\n"+
216 " for regression options are rmse or correlation\n"+
217 " correct and correlation are the defaults\n"+
218 "-balance <float> For derived stop size, if dataset at node, divided\n"+
219 " by balance is greater than stop it is used as stop\n"+
220 " if balance is 0 (default) always use stop as is.\n"+
221 "-vertex_output <string> Output <mean> or <best> of cluster\n"+
222 "-held_out <int> Percent to hold out for pruning\n"+
223 "-heap <int> {210000}\n"+
224 " Set size of Lisp heap, should not normally need\n"+
225 " to be changed from its default, only with *very*\n"+
226 " large description files (> 1M)\n"+
227 "-noprune No (same class) pruning required\n",
231 wgn_held_out = al.
ival(
"-held_out");
233 wgn_balance = al.
fval(
"-balance");
236 cerr << argv[0] <<
": missing description and/or datafile" << endl;
237 cerr <<
"use -h for description of arguments" << endl;
246 wgn_min_cluster_size = atoi(al.
val(
"-stop"));
250 wgn_predictee_name = al.
val(
"-predictee");
251 if (al.
present(
"-count_field"))
252 wgn_count_field_name = al.
val(
"-count_field");
254 stepwise_limit = al.
fval(
"-swlimit");
256 wgn_float_range_split = atof(al.
val(
"-frs"));
258 wgn_opt_param = al.
val(
"-swopt");
259 if (al.
present(
"-vertex_output"))
260 wgn_vertex_output = al.
val(
"-vertex_output");
264 wgn_oname = al.
val(
"-o");
266 wgn_oname = al.
val(
"-output");
267 wgn_coutput =
new ofstream(wgn_oname);
270 cerr <<
"Wagon: can't open file \"" << wgn_oname <<
271 "\" for output " << endl;
279 if (wgn_DistMatrix.
load(al.
val(
"-distmatrix")) != 0)
281 cerr <<
"Wagon: failed to load Distance Matrix from \"" <<
282 al.
val(
"-distmatrix") <<
"\"\n" << endl;
287 wagon_type = wn_decision_list;
293 siod_init(al.
ival(
"-heap"));
299 ignores = read_from_string(ig);
301 ignores = vload(ig,1);
304 wgn_load_datadescription(al.
val(
"-desc"),ignores);
305 wgn_load_dataset(wgn_dataset,al.
val(
"-data"));
306 if (al.
present(
"-distmatrix") &&
307 (wgn_DistMatrix.
num_rows() < wgn_dataset.length()))
309 cerr <<
"wagon: distance matrix is smaller than number of training elements\n";
314 wgn_VertexTrack.
load(al.
val(
"-track"));
317 wgn_VertexFeats.
a(0,i) = 1.0;
320 if (al.
present(
"-track_start"))
322 feats_start = al.
ival(
"-track_start");
323 if ((feats_start < 0) ||
326 printf(
"wagon: track_start invalid: %d out of %d channels\n",
331 for (i=0; i<feats_start; i++)
332 wgn_VertexFeats.
a(0,i) = 0.0;
338 feats_end = al.
ival(
"-track_end");
339 if ((feats_end < feats_start) ||
342 printf(
"wagon: track_end invalid: %d between start %d out of %d channels\n",
348 for (i=feats_end+1; i<wgn_VertexTrack.
num_channels(); i++)
349 wgn_VertexFeats.
a(0,i) = 0.0;
351 if (al.
present(
"-track_feats"))
354 set_Vertex_Feats(wgn_VertexFeats,wagon_track_features);
366 wgn_UnitTrack.
load(al.
val(
"-unittrack"));
370 wgn_load_dataset(wgn_test_dataset,al.
val(
"-test"));
374 tree = wagon_stepwise(stepwise_limit);
375 else if (wagon_type == wn_decision_tree)
376 tree = wgn_build_tree(score);
377 else if (wagon_type == wn_decision_list)
379 tree = wgn_build_dlist(score,wgn_coutput);
382 cerr <<
"Wagon: unknown operation, not tree or list" << endl;
388 *wgn_coutput << *tree;
389 summary_results(*tree,wgn_coutput);
392 if (wgn_coutput != &cout)