00001
00008 #include <getopt.h>
00009 #include <fcntl.h>
00010
00011 #include <errno.h>
00012 #include <err.h>
00013 #include <cstdio>
00014 #include <cstdlib>
00015 #include <cstring>
00016 #include <cmath>
00017
00018 #include <iostream>
00019 #include <fstream>
00020 #include <string>
00021
00022 #include <boost/timer.hpp>
00023
00024 #include "svm.h"
00025 #include "LibsvmFileLoader.h"
00026 #include "SVClustering.h"
00027 #include "CCLSVClustering.h"
00028 #include "GKWGenerator.h"
00029 #include "KernelMachines.h"
00030
00031
00032 #include "L1Distance.h"
00033 #include "L2Distance.h"
00034
00035
00036 #define DEFAULT_KW -1 // default kernel width (no specified - rely on Kernel Width Generator)
00037 #define DEFAULT_C 1 // default soft constraint value
00038 #define DEFAULT_ALG 0 // default cluster labeling algorithm
00039 #define DEFAULT_RUNS 100000 // default number of runs
00040 #define DEFAULT_L_OFFSET 1 // default label offset
00041 #define DEFAULT_VALID_THRESHOLD 2 // default minimum number of elements per cluster
00042 #define DEFAULT_SOFTENING 1 // default softening constant for kernel width generation
00043 #define DEFAULT_EDIST 2 // default euclidean distance value (no specified - rely on SVC constructor)
00044 #define DEFAULT_KERNEL -1 // default kernel type (no specified - rely on SVC constructor)
00045
00046 void printAllPoints(damina::SVClustering *, unsigned long int, char*);
00047
00048 void countRightAndWrongClassificationsPerClass(damina::SVClustering *, vector<int> &, vector<int> &, vector<int> &, vector<int> &, int, int, vector<unsigned long int> &, unsigned long int);
00049
00050 void computeEvaluatingMeasreusPerClass(vector<int>, vector<int>, vector<int>, vector<int>, vector<double> &, vector<double> &, vector<double> &, vector<double> &, vector<double> &, double &, double &);
00051
00052 struct svc_options *parseCommandLineArgs(int, char **);
00053
00054 void usage(char **);
00055
00056 void printUsedCommandLine(int, char *[]);
00057
00061 enum {CCL = 0, CG = 1};
00062
00063
00067 enum {L1 = 1, L2 = 2};
00068
00072 enum {BSV_CLASSIC = 0, BSV_ENHANCED = 1};
00073
00074
00078 struct svc_options {
00079 unsigned long int runs;
00080 unsigned long int alg_type;
00081 double q;
00082 double C;
00083 char *filename;
00084 unsigned long int clusters;
00085 unsigned long int l_offset;
00086 unsigned long int valid_threshold;
00087 double softening;
00088 char *detfile;
00089 unsigned long int e_dist;
00090 int kernel;
00091 unsigned int bsv;
00092 };
00093
00097 int main(int argc, char *argv[]) {
00098
00099 using namespace std;
00100 using namespace damina;
00101 using namespace boost;
00102
00103 struct svc_options *opt;
00104
00105 opt = parseCommandLineArgs(argc, argv);
00106 if (opt == NULL) {
00107 usage(argv);
00108 exit(1);
00109 }
00110
00114 LibsvmFileLoader *fl;
00115 string datafile(opt->filename);
00116 struct svm_problem *p = NULL;
00117 cout << "==================================" << endl;
00118 cout << "Loading file" << endl;
00119 cout << "==================================" << endl;
00120 fl = new LibsvmFileLoader(datafile);
00121 timer tfl;
00122 p = fl->load(false);
00123 cout << "File loaded in " << tfl.elapsed() << " secs" << endl;
00124 cout << "Number of objects: " << p->l << endl << endl;
00125 delete fl;
00126
00127
00131 SVClustering *svc;
00132 switch (opt->alg_type) {
00133 case CG: svc = new SVClustering(opt->C, p); break;
00134 case CCL: svc = new CCLSVClustering(opt->C, p); break;
00135 default:
00136 svc = new SVClustering(opt->C, p); break;
00137 }
00138
00139 if (opt->bsv) {
00140 svc->useSpecialClusteringPolicyForBSV();
00141 }
00142 else {
00143 svc->useClassicClusteringPolicyForBSV();
00144 }
00145
00149 if (opt->kernel != DEFAULT_KERNEL) {
00150 svc->setKernelType(opt->kernel);
00151 }
00152
00156 GKWGenerator *kwg;
00157 if (opt->q == -1) {
00158 kwg = new GKWGenerator(svc, 0.000000001);
00159 }
00160 else {
00161 kwg = new GKWGenerator(svc, 0.000000001, opt->q);
00162 }
00163
00164 kwg->setGrowthSoftening(opt->softening);
00165
00169 switch (opt->e_dist) {
00170 case L1: { svc->setEuclideanDistance(new L1Distance()); kwg->setEuclideanDistance(new L1Distance()); } break;
00171 case L2: { svc->setEuclideanDistance(new L2Distance()); kwg->setEuclideanDistance(new L2Distance()); } break;
00172 default: ;
00173 }
00174
00175 bool target = false;
00176 unsigned long int invalid = 0, runs = 0;
00177 double q, prev_q = -1, prev_C = -1;
00178
00179 unsigned long int nrClusters,
00180
00181 nrValidClusters;
00182
00183 timer partial,
00184 total;
00185
00186 vector<unsigned long int> ppc;
00187 vector<int> tp, fp, tn, fn;
00188 vector<double> precision, recall, f1, contamination, specificity;
00189 double ma, accuracy, overall_time = 0;
00190
00191 while (true) {
00192 prev_q = q;
00193 prev_C = svc->getSoftConstraintEstimate();
00194
00195 runs++;
00196 if (runs > opt->runs) break;
00197
00198 q = kwg->getNextKernelWidthValue();
00199 if (q == -1) {
00200 cout << "No more valid kernel width values" << endl << endl;
00201 break;
00202 }
00203
00204 if (opt->runs > 1) {
00205 cout << "==================================" << endl;
00206 cout << " Finding Kernel Width # " << runs << endl;
00207 cout << "==================================" << endl;
00208 }
00209
00210 svc->setKernelWidth(q);
00211
00215 total.restart();
00216 partial.restart();
00217
00218 svc->learn();
00219 cout << "radius = " << svc->getSphereRadius() << endl;
00220 cout << endl << "Domain description time taken: ";
00221 cout << partial.elapsed() << " seconds " << endl;
00222
00223 partial.restart();
00224
00225 svc->clusterize();
00226 cout << "Clustering time taken: ";
00227 cout << partial.elapsed() << " seconds " << endl;
00228
00229 cout << "Total time taken: ";
00230 overall_time += total.elapsed();
00231 cout << total.elapsed() << " seconds " << endl << endl;
00232
00233
00234 nrClusters = svc->getNumberOfClusters();
00235
00236 nrValidClusters = svc->getNumberOfValidClusters(opt->valid_threshold);
00237
00238 cout << "Soft Constraint Estimate: " << svc->getSoftConstraintEstimate() << endl;
00239 cout << "Current Soft Constraint: " << svc->getSoftConstraint() << endl;
00240 cout << "Kernel Width: " << q << endl;
00241 cout << "Number of clusters: " << nrClusters << endl;
00242 cout << "Number of valid clusters: " << nrValidClusters << endl;
00243 cout << "Number of non valid clusters (cardinality < " << opt->valid_threshold << "): " << nrClusters - nrValidClusters << endl;
00244
00245 cout << "Points per cluster: " << endl;
00246
00247 ppc.clear();
00248 ppc = svc->getPointPerCluster();
00249
00250 for (unsigned long int i = 0; i < ppc.size(); i++) {
00251 if (ppc[i] >= opt->valid_threshold) {
00252 cout << "\tCluster " << i << ": " << ppc[i] << endl;
00253 }
00254 }
00255 cout << endl << endl;
00256
00257 if (opt->detfile != NULL) {
00258 printAllPoints(svc, opt->l_offset, opt->detfile);
00259 }
00260
00261 countRightAndWrongClassificationsPerClass(svc, tp, fp, tn, fn, opt->clusters, opt->l_offset, ppc, opt->valid_threshold);
00262 computeEvaluatingMeasreusPerClass(tp, fp, tn, fn, contamination, specificity, precision, recall, f1, ma, accuracy);
00263
00264 for (unsigned long int i = 0; i < tp.size(); i++) {
00265 cout << "Class " << i << endl << "\tTP: " << tp[i] << "\tFP: " << fp[i] << endl << "\tFN: " << fn[i] << "\tTN: " << tn[i] << endl << endl;
00266 cout << "Precision: " << precision[i] << " - Recall: " << recall[i] << " - F1: " << f1[i] << endl;
00267 cout << "Completeness: " << recall[i] << " - Contamination: " << contamination[i] << endl;
00268 cout << "Sensitivity: " << recall[i] << " - Specificity: " << specificity[i] << endl << endl;
00269 }
00270
00271 if (opt->clusters > 2) {
00272 cout << "Macroaveraging: " << ma << endl;
00273 }
00274
00275 cout << "Accuracy: " << accuracy << endl << endl;
00276
00277
00278 if (target) {
00279 if (nrValidClusters != opt->clusters) {
00280 break;
00281 }
00282 else {
00283 if (nrClusters - nrValidClusters > invalid) {
00284 break;
00285 }
00286 }
00287 }
00288 else {
00289 if (nrValidClusters == opt->clusters) {
00290 target = true;
00291 invalid = nrClusters - nrValidClusters;
00292 }
00293 }
00294
00295 }
00296
00297 cout << "==================================" << endl;
00298 cout << "Clustering process finished" << endl;
00299 cout << "==================================" << endl;
00300 if (opt->runs > 1) {
00301 cout << "Check out the last run and second last run." << endl;
00302 cout << "Kernel Width Found: " << prev_q << endl;
00303 }
00304 cout << "Soft Constraint Estimate: " << prev_C << endl;
00305 cout << "Overall time elapsed: " << overall_time << " secs" << endl;
00306
00307 cout << "Original command line: ";
00308 printUsedCommandLine(argc, argv);
00309
00310 delete svc;
00311 delete kwg;
00312 free(opt);
00313 }
00314
00315
00322 void printAllPoints(damina::SVClustering *clust, unsigned long int classOffset, char *filename) {
00323
00324 filebuf fb;
00325 fb.open(filename, ios::app);
00326 ostream out(&fb);
00327
00328 std::vector<int> labels = clust->getClustersAssignment();
00329 std::vector<int> orig_c = clust->getOriginalClasses((int)classOffset);
00330
00331 for (unsigned long int i = 0; i < labels.size(); i++) {
00332 out << "Point #" << i << " - " << "Cluster: " << labels[i] << ", Class: " << orig_c[i] << endl;
00333 }
00334 out << endl;
00335 fb.close();
00336 }
00337
00349 void countRightAndWrongClassificationsPerClass(damina::SVClustering *clust, vector<int> &tp, vector<int> &fp, vector<int> &tn, vector<int> &fn, int nrClasses, int classOffset, vector<unsigned long int> &ppc, unsigned long int valid_t) {
00350
00351 std::vector<int> labels = clust->getClustersAssignment();
00352 std::vector<int> orig_c = clust->getOriginalClasses(classOffset);
00353
00354 int datasetDimension = clust->getProblem()->l;
00355
00356 tp.clear();
00357 fp.clear();
00358 tn.clear();
00359 fn.clear();
00360
00361 tp.resize(nrClasses);
00362 fp.resize(nrClasses);
00363 tn.resize(nrClasses);
00364 fn.resize(nrClasses);
00365
00366 for (unsigned long int i = 0; i < labels.size(); i++) {
00367 if (orig_c[i] >= nrClasses) continue;
00368
00369 if ((ppc[labels[i]] >= valid_t) && (labels[i] == orig_c[i])) {
00370 tp[orig_c[i]]++;
00371 }
00372 else {
00373 fp[labels[i]]++;
00374 fn[orig_c[i]]++;
00375 }
00376 }
00377
00378 for (int i = 0; i < nrClasses; i++) {
00379 tn[i] = datasetDimension - tp[i] - fp[i] - fn[i];
00380 }
00381 }
00382
00383
00396 void computeEvaluatingMeasreusPerClass(vector<int> tp, vector<int> fp, vector<int> tn, vector<int> fn,
00397 vector<double> &contamination, vector<double> &specificity,
00398 vector<double> &precision, vector<double> &recall, vector<double> &f1, double &ma, double &accuracy) {
00399
00400 unsigned long int nrClasses = tp.size();
00401 int datasedDim = tp[0] + fp[0] + tn[0] + fn[0];
00402
00403 precision.clear();
00404 recall.clear();
00405 f1.clear();
00406
00407 contamination.resize(nrClasses);
00408 precision.resize(nrClasses);
00409 recall.resize(nrClasses);
00410 f1.resize(nrClasses);
00411 specificity.resize(nrClasses);
00412
00413 ma = 0;
00414 accuracy = 0;
00415
00416 for (unsigned long int i = 0; i < nrClasses ; i++) {
00417 accuracy += tp[i];
00418 precision[i] = ((double)tp[i] / (double)(tp[i] + fp[i])) * 100;
00419 recall[i] = ((double)tp[i] / (double)(tp[i] + fn[i])) * 100;
00420 specificity[i] = ((double)tn[i] / (double)(tn[i] + fp[i])) * 100;
00421 contamination[i] = (double)fp[i] / (double)tp[i];
00422 if (precision[i] + recall[i] == 0) {
00423 f1[i] = -1;
00424 }
00425 else {
00426 f1[i] = (2 * (precision[i] * recall[i])) / (precision[i] + recall[i]);
00427 ma += f1[i];
00428 }
00429
00430 }
00431
00432 ma = ma / (double) nrClasses;
00433 accuracy = accuracy / (double)datasedDim * 100;
00434 }
00435
00436
00446 struct svc_options *parseCommandLineArgs(int argc, char **argv) {
00447
00448 using namespace damina;
00449
00450 struct svc_options *opt;
00451 opt = (struct svc_options *)malloc(sizeof(struct svc_options));
00452
00453 opt->filename = (char *)malloc(2 * sizeof(char));
00454 strcpy(opt->filename, "");
00455
00456 opt->q = DEFAULT_KW;
00457 opt->C = DEFAULT_C;
00458 opt->alg_type = DEFAULT_ALG;
00459 opt->runs = DEFAULT_RUNS;
00460 opt->l_offset = DEFAULT_L_OFFSET;
00461 opt->valid_threshold = DEFAULT_VALID_THRESHOLD;
00462 opt->softening = DEFAULT_SOFTENING;
00463 opt->detfile = NULL;
00464 opt->e_dist = DEFAULT_EDIST;
00465 opt->kernel = DEFAULT_KERNEL;
00466 opt->bsv = BSV_CLASSIC;
00467
00468 bool file_f = true;
00469 bool cluster_f = false;
00470
00471 int ch;
00472
00473
00474 static struct option longopts[] = {
00475 { "kernel-width", required_argument, NULL, 'q' },
00476 { "soft-margin", required_argument, NULL, 'C' },
00477 { "num-clusters", required_argument, NULL, 'c' },
00478 { "labeling-alg", required_argument, NULL, 'l' },
00479 { "dataset-file", required_argument, NULL, 'f' },
00480 { "runs", required_argument, NULL, 'r' },
00481 { "labels-offset", required_argument, NULL, 'o' },
00482 { "valid-threshold",required_argument, NULL, 't' },
00483 { "softening", required_argument, NULL, 's' },
00484 { "dump-details", required_argument, NULL, 'D' },
00485 { "euclidean-distance", required_argument, NULL, 'e' },
00486 { "kernel-type", required_argument, NULL, 'k' },
00487 { "bsv-clustering", required_argument, NULL, 'b' }
00488
00489 };
00490
00491 while ((ch = getopt_long(argc, argv, "q:C:c:l:f:r:o:t:s:D:e:k:b:", longopts, NULL)) != -1) {
00492 switch (ch) {
00493
00494 case 'b': {
00495 opt->bsv = (int)strtol(optarg, (char**)NULL, 10);
00496
00497
00498 break;
00499 }
00500
00501
00502 case 'k': {
00503 opt->kernel = (int)strtol(optarg, (char**)NULL, 10);
00504
00505
00506 break;
00507 }
00508
00509 case 'e': {
00510 opt->e_dist = (int)strtol(optarg, (char**)NULL, 10);
00511 if ((errno == EINVAL) || (opt->e_dist < 1) || (opt->e_dist > 2))
00512 err(1, "%s is an invalid value for Euclidean Distance type. It must be an inter value in [1,2].", optarg);
00513 break;
00514 }
00515
00516 case 'q': {
00517 opt->q = (double)strtod(optarg, (char**)NULL);
00518 if ((errno == EINVAL) || (opt->q <= 0))
00519 err(1, "%s is an invalid value for Kernel Width. It must be a real value greater than 0.", optarg);
00520 break;
00521 }
00522
00523 case 'C': {
00524 opt->C = (double)strtod(optarg, (char**)NULL);
00525 if ((errno == EINVAL) || (opt->C <= 0))
00526 err(1, "%s is an invalid value for Soft Margin Constant. It must be a real value greater than 0.", optarg);
00527 break;
00528 }
00529
00530 case 'c': {
00531 opt->clusters = (int)strtol(optarg, (char**)NULL, 10);
00532 if ((errno == EINVAL) || (opt->clusters <= 0))
00533 err(1, "%s is an invalid value for Number of Clusters. It must be an integer value greater than 0.", optarg);
00534
00535 cluster_f = true;
00536 break;
00537 }
00538
00539 case 'r': {
00540 opt->runs = (int)strtol(optarg, (char**)NULL, 10);
00541 if ((errno == EINVAL) || (opt->runs <= 0))
00542 err(1, "%s is an invalid value for Number of Runs. It must be an integer value greater than 0.", optarg);
00543 break;
00544 }
00545
00546 case 's': {
00547 opt->softening = (double)strtod(optarg, (char**)NULL);
00548 if ((errno == EINVAL) || (opt->softening <= 0))
00549 err(1, "%s is an invalid value for the Softening Constant. It must be an real value greater than 0.", optarg);
00550 break;
00551 }
00552
00553 case 't': {
00554 opt->valid_threshold = (int)strtol(optarg, (char**)NULL, 10);
00555 if ((errno == EINVAL) || (opt->valid_threshold <= 0))
00556 err(1, "%s is an invalid value for Minimum Clusters Cardinality. It must be an integer value greater than 0.", optarg);
00557 break;
00558 }
00559
00560 case 'o': {
00561 opt->l_offset = (int)strtol(optarg, (char**)NULL, 10);
00562 if ((errno == EINVAL) || (opt->l_offset < 0))
00563 err(1, "%s is an invalid value for Labels Offset. It must be an integer value greater than or equal to 0.", optarg);
00564 break;
00565 }
00566
00567 case 'l': {
00568 opt->alg_type = (int)strtol(optarg, (char**)NULL, 10);
00569 if ((errno == EINVAL) || (opt->alg_type > 1) || (opt->alg_type < 0))
00570 err(1, "%s is an invalid value for Cluster Labeling algorithm. It must be 0 or 1.", optarg);
00571 break;
00572 }
00573
00574 case 'f': {
00575 if ((open(optarg, O_RDONLY, 0)) == -1) {
00576 err(1, "Unable to open %s", optarg);
00577 break;
00578 }
00579
00580 free(opt->filename);
00581 opt->filename = (char *)malloc((strlen(optarg) + 1) * sizeof(char));
00582 strcpy(opt->filename, optarg);
00583
00584 file_f = true;
00585 break;
00586 }
00587
00588 case 'D': {
00589 free(opt->detfile);
00590 opt->detfile = (char *)malloc((strlen(optarg) + 1) * sizeof(char));
00591 strcpy(opt->detfile, optarg);
00592 break;
00593 }
00594
00595 default: {
00596 usage(argv);
00597 }
00598 }
00599 }
00600 argc -= optind;
00601 argv += optind;
00602
00603 if ((file_f == false) || (cluster_f == false)) {
00604 return NULL;
00605 }
00606 return opt;
00607 }
00608
00614 void usage(char **argv) {
00615 using namespace damina;
00616
00617 cerr << endl << "Usage: " << argv[0] << " -q (0,+inf) -C (0,+inf) -b [" << BSV_CLASSIC << "," << BSV_ENHANCED << "] -k" << KernelType::gaussian << "," << KernelType::laplacian << " -e [1,2] -c [1, N] -l [0-1] -f <dataset-filename> -D <detail-filename> -r [1, N] -o [0, N] -t [1,N] -s (0,1]" << endl << endl;
00618 cerr << "\t -q: the kernel width value must be a real value greater than zero - DEFAULT: auto" << endl;
00619 cerr << "\t -C: the soft margin constant must be a real value greater than zero - DEFAULT: " << DEFAULT_C << endl;
00620 cerr << "\t -b: the BSV Clustering Policy - DEFAULT: " << BSV_ENHANCED << endl;
00621 cerr << "\t\t " << BSV_CLASSIC << " for classic policy" << endl;
00622 cerr << "\t\t " << BSV_ENHANCED << " for enhanced policy (aim to get better results)" << endl;
00623 cerr << "\t -k: the Kernel Type - DEFAULT: " << KernelType::gaussian << endl;
00624 cerr << "\t\t " << KernelType::gaussian << " for Gaussian Kernel (K(x,y) = exp(-gamma * ||x-y||^2))" << endl;
00625 cerr << "\t\t " << KernelType::laplacian << " for Laplacian Kernel (K(x,y) = exp(-gamma*|x-y|_1)) " << endl;
00626 cerr << "\t -e: the Euclidean Measure to use in vector-distance measurement - DEFAULT: " << L2 << endl;
00627 cerr << "\t\t " << L1 << " for L1 distance (sum |x_i - y_i|) " << endl;
00628 cerr << "\t\t " << L2 << " for L2 distance (sum sqrt(||x_i - y_i||^2)) " << endl;
00629 cerr << "\t -l: the cluster labeling algorithm type - DEFAULT: " << DEFAULT_ALG << endl;
00630 cerr << "\t\t " << CCL << " for Cone Cluster Labeling" << endl;
00631 cerr << "\t\t " << CG << " for Complete Graph Cluster Labeling" << endl;
00632 cerr << "\t -c: the number of cluster to request must be an integer value greater than zero" << endl;
00633 cerr << "\t -f: the filename of the dataset file, in LIBSVM format" << endl;
00634 cerr << "\t -D: the filename of the file to store detailed clusters assignment" << endl;
00635 cerr << "\t -r: the number of runs to perform - DEFAULT: " << DEFAULT_RUNS << endl;
00636 cerr << "\t -o: the labels offset (i.e. the label of the first class) - DEFAULT: " << DEFAULT_L_OFFSET << endl;
00637 cerr << "\t -s: the softening constant for kernel width generation - DEFAULT: " << DEFAULT_SOFTENING << endl;
00638 cerr << "\t -t: the minimum clusters cardinality - DEFAULT: " << DEFAULT_VALID_THRESHOLD << endl << endl;
00639 }
00640
00646 void printUsedCommandLine(int argc, char *argv[]) {
00647 cout << argv[0] << " ";
00648
00649 for (int i = 1; i <= argc; i++) {
00650 cout << argv[i] << " ";
00651 }
00652 cout << endl << endl;
00653 }