SVClustering.cpp

Go to the documentation of this file.
00001 #include "SVClustering.h"
00002 #include "L2Distance.h"
00003 
00004 namespace damina
00005 {
00006 
00012         SVClustering::SVClustering(DataSet *train) {
00013                 this->eucDist = new L2Distance();
00014                 this->trainer = new OneClassSVM();
00015                 this->trainer->setKernel(KernelType::gaussian);
00016                 this->trainer->setCacheSize(100);
00017                 this->trainer->setTolerance(0.000001);
00018                 this->trainer->setTrainingSet(train);
00019                 this->trainer->setKernelWidth(initialKernelWidth());
00020                 this->quadratic = NULL;
00021                 this->clusterizeBSV = true;
00022         }
00023         
00029         SVClustering::SVClustering(struct svm_problem *p) {
00030                 this->eucDist = new L2Distance();
00031                 this->trainer = new OneClassSVM();
00032                 this->trainer->setKernel(KernelType::gaussian);
00033                 this->trainer->setCacheSize(100);
00034                 this->trainer->setTolerance(0.000001);
00035                 this->trainer->setProblem(p);
00036                 this->trainer->setKernelWidth(initialKernelWidth());
00037                 this->quadratic = NULL;
00038                 this->clusterizeBSV = true;
00039         }
00040         
00047         SVClustering::SVClustering(double C, DataSet *train) {
00048                 this->eucDist = new L2Distance();
00049                 this->trainer = new OneClassSVM();
00050                 this->trainer->setKernel(KernelType::gaussian);
00051                 this->trainer->setCacheSize(100);
00052                 this->trainer->setTolerance(0.000001);
00053                 
00054                 this->trainer->setTrainingSet(train);
00055                 
00056                 this->trainer->setKernelWidth(initialKernelWidth());
00057 
00058                 
00059                 this->trainer->setNU((1 / (train->getSize() * C)));
00060                 this->C = C;
00061                 this->quadratic = NULL;
00062                 this->clusterizeBSV = true;
00063         }
00064         
00071         SVClustering::SVClustering(double C, struct svm_problem *prob) {
00072                 this->eucDist = new L2Distance();
00073                 this->trainer = new OneClassSVM();
00074                 this->trainer->setKernel(KernelType::gaussian);
00075                 this->trainer->setCacheSize(100);
00076                 this->trainer->setTolerance(0.000001);
00077                 
00078                 this->trainer->setProblem(prob);
00079                 
00080                 this->trainer->setKernelWidth(initialKernelWidth());
00081                 
00082                 this->trainer->setNU((1 / (prob->l * C)));
00083                 this->C = C;
00084                 this->quadratic = NULL;
00085                 this->clusterizeBSV = true;
00086         }
00087         
00095         SVClustering::SVClustering(double C, double w, DataSet *train) {
00096                 this->eucDist = new L2Distance();
00097                 this->trainer = new OneClassSVM();
00098                 this->trainer->setKernel(KernelType::gaussian);
00099                 this->trainer->setCacheSize(100);
00100                 this->trainer->setTolerance(0.000001);
00101                 
00102                 this->trainer->setTrainingSet(train);
00103                 
00104                 this->trainer->setNU((1 / (train->getSize() * C)));
00105                 this->C = C;
00106                 
00107                 this->trainer->setKernelWidth(w);
00108                 this->quadratic = NULL;
00109                 this->clusterizeBSV = true;
00110         }
00111         
00119         SVClustering::SVClustering(double C, double w, struct svm_problem *prob) {
00120                 this->eucDist = new L2Distance();
00121                 this->trainer = new OneClassSVM();
00122                 this->trainer->setKernel(KernelType::gaussian);
00123                 this->trainer->setCacheSize(100);
00124                 this->trainer->setTolerance(0.000001);
00125                 
00126                 this->trainer->setProblem(prob);
00127                 
00128                 this->trainer->setNU((1 / (prob->l * C)));
00129                 this->C = C;
00130                 
00131                 this->trainer->setKernelWidth(w);
00132                 this->quadratic = NULL;
00133                 this->clusterizeBSV = true;
00134         }
00135                 
00140         SVClustering::~SVClustering() {
00141                 delete trainer;
00142                 delete eucDist;
00143                 // if (quadratic != NULL) free(quadratic);
00144         }
00145 
00149         void SVClustering::learn() {
00150                 free(quadratic);
00151                 quadratic = NULL;
00152                 this->numberOfClusters = -1;
00153                 this->pointPerCluster.clear();
00154                 this->labels.clear();
00155                 this->trainer->learn();
00156         }
00157 
00163         void SVClustering::learn(DataSet *train) {
00164                 free(quadratic);
00165                 quadratic = NULL;
00166                 this->numberOfClusters = -1;
00167                 this->pointPerCluster.clear();
00168                 this->labels.clear();
00169                 this->trainer->learn(train);
00170         }
00171         
00177         void SVClustering::learn(struct svm_problem *p) {
00178                 free(quadratic);
00179                 quadratic = NULL;
00180                 this->numberOfClusters = -1;
00181                 this->pointPerCluster.clear();
00182                 this->labels.clear();
00183                 this->trainer->learn(p);
00184         }
00185         
00186                 
00187         
00193         double SVClustering::getSphereRadius() {
00194                 calculateSphereRadius();
00195                 return sphereRadius;
00196         }
00197         
00203         void SVClustering::setTrainingSet(DataSet *t) {
00204                 this->trainer->setTrainingSet(t);
00205         }
00206         
00212         DataSet *SVClustering::getTrainingSet() {
00213                 return this->trainer->getTrainingSet();
00214         }
00215         
00216         
00222         void SVClustering::setProblem(struct svm_problem *p) {
00223                 this->trainer->setProblem(p);
00224         }
00225         
00231         struct svm_problem *SVClustering::getProblem() {
00232                 return this->trainer->getProblem();
00233         }
00234         
00235         
00241         struct svm_model *SVClustering::getModel() {
00242                 return this->trainer->getModel();
00243         }
00244 
00250         struct svm_parameter *SVClustering::getParameters() {
00251                 return trainer->getParameters();
00252         }
00253         
00265         double SVClustering::initialKernelWidth() {
00266                 struct svm_problem *p = this->trainer->getProblem();
00267                 int N = this->trainer->getProblem()->l;
00268                 
00269                 double max = eucDist->calculateDistance(p->x[0], p->x[1]);
00270                 double dist;
00271                 
00272                 for (int i = 0; i < N; i++) {
00273                         for (int j = 0; j < N; j++) {
00274                                 dist = eucDist->calculateDistance(p->x[i], p->x[j]);
00275                                 if (dist > max) {
00276                                         max = dist;
00277                                 }
00278                         }
00279                 }
00280                 
00281                 this->trainer->setKernelWidth(1/max);
00282                 
00283                 return (1/max);
00284         }
00285         
00286         
00292         void SVClustering::setKernelWidth(double w) {
00293                 this->trainer->setKernelWidth(w);
00294         }
00295         
00296         
00302         double SVClustering::getKernelWidth() {
00303                 return this->trainer->getKernelWidth();
00304         }
00305 
00316         void SVClustering::setKernelType(int k) {
00317                 this->trainer->setKernel(k);
00318         }       
00319         
00326         int SVClustering::getKernelType() {
00327                 return this->trainer->getKernel();
00328         }
00329                 
00335         void SVClustering::setSoftConstraint(double C) {
00336                 this->C = C;
00337                 
00338                 /*      The OneClass SVM needs the NU param to be set in function of C param */
00339                 this->trainer->setNU((1 / (this->trainer->getProblem()->l * C)));       
00340         }
00341         
00347         double SVClustering::getSoftConstraint() {
00348                 return this->C;
00349         }
00350 
00351 
00357         double SVClustering::getSoftConstraintEstimate() {
00358                 return C_estimation;
00359         }
00360         
00361         
00376         void SVClustering::clusterize() {
00377                 this->separateClusters();
00378         }
00379         
00387         std::vector<int>& SVClustering::getClustersAssignment() {
00388                 return labels;
00389         }
00390         
00398         std::vector<unsigned long int>& SVClustering::getPointPerCluster() {
00399                 if (numberOfClusters > -1) {
00400                         return pointPerCluster;
00401                 }
00402                 
00403                 // calculating points per cluster and number of clusters detected
00404                 
00405                 // theoretically impossible
00406                 // here to detect some programming errors
00407                 if (labels.size() == 0) {
00408                         this->numberOfClusters = 0;
00409                         return pointPerCluster;
00410                 }
00411                 
00412                 std::vector<unsigned long int>::size_type idx;
00413                 this->pointPerCluster.clear();
00414                 this->pointPerCluster.push_back(0);
00415                 this->numberOfClusters = 1;
00416         for (idx = 0; idx < labels.size(); idx++) {
00417                         if ((int)(pointPerCluster.size() - 1) < labels[idx]) {
00418                         this->numberOfClusters++;
00419                         this->pointPerCluster.push_back(1);
00420                 }
00421                 else {
00422                         this->pointPerCluster[labels[idx]]++;
00423                 }
00424         }
00425                         
00426                 return pointPerCluster;
00427         }
00428         
00434         unsigned long int SVClustering::getNumberOfClusters() {
00435                 
00436                 if (numberOfClusters > -1)
00437                         return numberOfClusters;
00438                 
00439                 // calculating points per cluster and number of clusters detected
00440                 
00441                 // theoretically impossible
00442                 // here to detect some programming errors
00443                 if (labels.size() == 0) {
00444                         this->numberOfClusters = 0;
00445                         return numberOfClusters;
00446                 }
00447                 
00448                 
00449                 std::vector<int>::size_type idx;
00450                 this->pointPerCluster.push_back(0);
00451                 this->numberOfClusters = 1;
00452         for (idx = 0; idx < labels.size(); idx++) {
00453                         if ((int)(pointPerCluster.size() - 1) < labels[idx]) {
00454                         this->numberOfClusters++;
00455                         this->pointPerCluster.push_back(1);
00456                 }
00457                 else {
00458                         this->pointPerCluster[labels[idx]]++;
00459                 }
00460         }
00461 
00462                 return numberOfClusters;
00463         }
00464         
00470         unsigned long int SVClustering::getNumberOfNonSingletonClusters() {
00471                 return getNumberOfValidClusters(2);
00472         }
00473 
00480         unsigned long int SVClustering::getNumberOfValidClusters(unsigned long int threshold) {
00481                 getPointPerCluster();
00482                 unsigned long int counter = 0;
00483                 for (unsigned long int i = 0; i < pointPerCluster.size(); i++) {
00484                         if (pointPerCluster[i] >= threshold) {
00485                                 counter++;
00486                         }
00487                 }
00488                 return counter;
00489         }
00490                 
00504         std::vector<int>& SVClustering::getOriginalClasses(int classOffset) {
00505                 double *classes = this->trainer->getProblem()->y;
00506                 unsigned long int i, N = this->trainer->getProblem()->l;
00507                 
00508                 originalClasses.resize(N);
00509                 
00510                 for (i = 0; i < N; i++) {
00511                         originalClasses[i] = (int)classes[i] - classOffset;
00512                 }
00513                 
00514                 return originalClasses;
00515         }
00516         
00517 
00523         void SVClustering::setEuclideanDistance(EuclideanDistance *ed) {
00524                 eucDist = ed;
00525         }       
00526 
00527 
00532         void SVClustering::useSpecialClusteringPolicyForBSV() {
00533                 this->clusterizeBSV = true;
00534         }
00535         
00540         void SVClustering::useClassicClusteringPolicyForBSV() {
00541                 this->clusterizeBSV = false;
00542         }
00543 
00549         bool SVClustering::usingSpecialClusteringPolicyForBSV() {
00550                 return this->clusterizeBSV;
00551         }
00552 
00553         
00554         /*      PROTECTED METHODS FOLLOW        ************************************************************************/
00555         
00556         
00569          double SVClustering::beta(int i) {
00570                 return this->trainer->getModel()->sv_coef[0][i] * C; 
00571          }
00572          
00573                  
00590         double SVClustering::beta(int i, bool scaled) {
00591                 if (scaled) {
00592                         return this->trainer->getModel()->sv_coef[0][i];
00593                 }
00594                 else {
00595                         return this->trainer->getModel()->sv_coef[0][i] * getSoftConstraint();
00596                 }
00597         }
00598         
00599          
00609          double SVClustering::rho() {
00610                 return this->trainer->getModel()->rho[0] * C;
00611          }
00612 
00624          double SVClustering::rho(bool scaled) {
00625                 if (scaled) {
00626                         return this->trainer->getModel()->rho[0];
00627                 }
00628                 else {
00629                         return this->trainer->getModel()->rho[0] * getSoftConstraint();
00630                 }
00631          }
00632          
00633         
00652         void SVClustering::calculateQuadraticPartOfDistanceFromCenter() {
00653                 
00654                 if (quadratic != NULL) {
00655                         return;
00656                 }
00657                 
00658                 struct svm_node **SV = this->trainer->getModel()->SV;
00659                 int nSV = this->trainer->getModel()->l;
00660                 double tmp = 0;
00661                 
00662                 
00663                 for (int i = 0; i < nSV; i++) {
00664                         for (int j = 0; j < nSV; j++) {
00665                                 tmp += beta(i) * beta(j) * kernelfunction(SV[i], SV[j], this->trainer->getModel()->param);
00666                         }
00667                 }
00668                 
00669                 this->quadratic = (double *)malloc(sizeof(double));
00670                 *(this->quadratic) = tmp;
00671         } 
00672          
00695         double SVClustering::calculateDistanceFromCenter(struct svm_node *x) {
00696                 double sum = 0;
00697                 
00698                 struct svm_node **SV = this->trainer->getModel()->SV;
00699                 int nSV = this->trainer->getModel()->l;
00700                 
00701                 
00702                 for (int i = 0; i < nSV; i++) {
00703                         sum += beta(i) * kernelfunction(SV[i], x, this->trainer->getModel()->param);
00704                 }
00705                 
00706                 
00707                 calculateQuadraticPartOfDistanceFromCenter();
00708                 
00709                 // 1 is because (in case of RBF kernel used here) K(x,x) is equal to 1 
00710                 // (therotically provable and pratically tested with libsvm) 
00711                 //return sqrt(1 - (2 * sum) + *(this->quadratic));
00712                 return 1 - (2 * sum) + *(this->quadratic);
00713         }
00714         
00741         void SVClustering::calculateSphereRadius() {
00742                 
00743                 // first tecnique
00744                 //
00745                 // assuming SVs have all the same distanace from center (theoretically a correct assumption)
00746                 // to avoid a check on the distance of all SVs
00747                 //
00748                 this->sphereRadius = this->calculateDistanceFromCenter(this->trainer->getModel()->SV[0]);
00749                 
00750                 //if (this->sphereRadius >= 1) {
00751                 //      free(quadratic);
00752                 //      this->sphereRadius = this->calculateDistanceFromCenter(this->trainer->getModel()->SV[0]);
00753                 //}
00754 
00755                 // second tecnique
00756                 //
00757                 // we can use the average among SVs' distances to calculate the radius
00758                 
00759                 //int nSV = this->trainer->getModel()->l;
00760                 //this->sphereRadius = 0;
00761                 //for (int i = 0; i < nSV; i++) {
00762                 //      this->sphereRadius += this->calculateDistanceFromCenter(this->trainer->getModel()->SV[i]);                      
00763                 //}
00764                 //this->sphereRadius = this->sphereRadius / nSV;
00765 
00766                 //      third tecnique
00767                 //      Using One Class SVM, Radius = sqrt(1 - rho/2)
00768                 //sphereRadius = sqrt(1 - (rho()/2));
00769         }
00770 
00771         
00779         struct svm_node *SVClustering::pointOnThePath(struct svm_node *start, struct svm_node *stop, double interval) {
00780                 int start_dim = 0, stop_dim = 0;
00781                 struct svm_node *t, *z;
00782                 
00783                 t = start;
00784                 while (t->index != -1) {
00785                         start_dim++;
00786                         t++;
00787                 }
00788                 
00789                 t = stop;
00790                 while (t->index != -1) {
00791                         stop_dim++;
00792                         t++;
00793                 }
00794                 
00795                 start_dim++;
00796                 stop_dim++;
00797                 
00798                 
00799                 if (start_dim >= stop_dim) {
00800                         z = (struct svm_node *) malloc(sizeof(struct svm_node) * start_dim);
00801                 }
00802                 else {
00803                         z = (struct svm_node *) malloc(sizeof(struct svm_node) * stop_dim);
00804                 }
00805 
00806                 t = z;
00807                 
00808                 struct svm_node *x = start;
00809                 struct svm_node *y = stop;
00810                                 
00811                 while(x->index != -1 && y->index !=-1) {
00812                         if (x->index == y->index) {
00813                                 z->index = x->index;
00814                                 z->value = ((y->value - x->value) * interval) + x->value;
00815                                 x++;
00816                                 y++;
00817                                 z++;
00818                         }
00819                         else {
00820                                 if (x->index > y->index) {      
00821                                         z->index = y->index;
00822                                         z->value = y->value * interval; // (y->value - 0) * interval + 0 
00823                                         ++y;
00824                                         ++z;
00825                                 }
00826                                 else {
00827                                         z->index = x->index;
00828                                         z->value = -(x->value * interval) + x->value;  // (0 - x->value) * interval + x->value 
00829                                         ++x;
00830                                         ++z;
00831                                 }
00832                         }
00833                 }
00834 
00835                 while(x->index != -1) {
00836                         z->index = x->index;
00837                         z->value = -(x->value * interval) + x->value;  // (0 - x->value) * interval + x->value
00838                         ++x;
00839                         ++z;
00840                 }
00841 
00842                 while(y->index != -1) {
00843                         z->index = y->index;
00844                         z->value = y->value * interval; // (y->value - 0) * interval + 0 
00845                         ++y;
00846                         ++z;
00847                 }
00848                 
00849                 z->index = -1;
00850                 z->value = 0;
00851                 
00852                 return t;
00853         }
00854         
00855         
00874         void SVClustering::separateClusters() {
00875                 static struct svm_node *z;
00876                 bool flag;
00877                 int i,j;
00878                 double zdist;
00879                 int N = this->trainer->getProblem()->l;
00880                 
00881                 std::pair<UndirectedGraph::edge_descriptor, bool> temp;
00882                         
00883                 UndirectedGraph aGraph(N);
00884                 
00885                 // calculating adj matrix
00886                 
00887                 for (i = 0; i < N; i++) {
00888                         for (j = 0; j < N; j++) {
00889                                 
00890                                 // if the j is adjacent to i, then all j adjacent's are also adjacent to i.
00891                                 if (j < i) {
00892                                         temp = edge(i, j, aGraph);
00893                                         if (temp.second == true) {
00894                                                 for (int k = 0; k < N; k++) {
00895                                                         if (edge(i, k, aGraph).second || edge(j, k, aGraph).second) {
00896                                                                 add_edge(i, k, aGraph);
00897                                                         }
00898                                                         else {
00899                                                                 remove_edge(i, k, aGraph);
00900                                                         }
00901                                                 }
00902                                         }               
00903                                 }
00904                                 else {
00905                                         // the i-th node is adiacent to itself - trivial
00906                                         if (i == j) {
00907                                                 add_edge(i, j, aGraph);
00908                                                 continue;
00909                                         }
00910 
00911                                         // we avoid to compute the same edge more than once                                     
00912                                         temp = edge(i, j, aGraph);
00913                                         if (temp.second == false) {
00914                                                 flag = true;
00915 
00916                                                 //      sampling the point on the path between x[i] and x[j]
00917                                                 //      usually 10-20 points are sampled in order to limit running time
00918                                                 //      here 10 points are sampled
00919                                                 for (double k = 1; k <= 10; k = k + 1) {
00920                                                         z = pointOnThePath(this->trainer->getProblem()->x[i], this->trainer->getProblem()->x[j], k / 10);
00921                                                         
00922                                                         zdist = calculateDistanceFromCenter(z);
00923 
00924                                                         if (zdist > this->getSphereRadius()) {
00925                                                                 flag = false;
00926                                                                 break;
00927                                                         }
00928                                                 }
00929                                                 
00930                                                 if (flag) {
00931                                                         add_edge(i, j, aGraph);
00932                                                         add_edge(j, i, aGraph);
00933                                                 }
00934                                                 else {
00935                                                         remove_edge(i, j, aGraph);
00936                                                         remove_edge(j, i, aGraph);
00937                                                 }
00938                                         }
00939                                 }
00940                         }
00941                 }
00942                         
00943                 // finding connected components (and so the clusters)
00944                 
00945                 labels.resize(N);
00946                 connected_components(aGraph, &labels[0]);
00947                 
00948                 
00949                 // clusterize the BSVs
00950                 
00951                 int nBSV = this->trainer->getModel()->lbounded;
00952                 int *BSV_index = this->trainer->getModel()->BSV_index;
00953 
00954                 int minimizer;
00955                 double min_BSV_distance, d;
00956                 
00957                 for (int i = 0; i < nBSV; i++) {
00958                         min_BSV_distance = MAXFLOAT; 
00959                         minimizer = 0;
00960                         for (int j = 0; j < (int) labels.size(); j++) {
00961                                 if (j == BSV_index[i]) continue;        //      ignore if the point is the current BSV
00962                                 if (beta(j, true) == 1) continue;       //      ignore all BSVs
00963                                 
00964                                 d = eucDist->calculateDistance(this->trainer->getProblem()->x[BSV_index[i]], this->trainer->getProblem()->x[j]);
00965                                 if (d < min_BSV_distance) {
00966                                         min_BSV_distance = d;
00967                                         minimizer = i;
00968                                 }
00969                         }
00970                         labels[BSV_index[i]] = labels[minimizer];
00971                 }
00972                 
00973         }
00974         
00975 }

Generated on Mon Sep 24 22:26:49 2007 for SVClustering by  doxygen 1.5.2