Muster: src/bic.h Source File

Go to the documentation of this file.
00001 //////////////////////////////////////////////////////////////////////////////////////////////////
00002 // Copyright (c) 2010, Lawrence Livermore National Security, LLC.  
00003 // Produced at the Lawrence Livermore National Laboratory  
00004 // LLNL-CODE-433662
00005 // All rights reserved.  
00006 //
00007 // This file is part of Muster. For details, see http://github.com/tgamblin/muster. 
00008 // Please also read the LICENSE file for further information.
00009 //
00010 // Redistribution and use in source and binary forms, with or without modification, are
00011 // permitted provided that the following conditions are met:
00012 //
00013 //  * Redistributions of source code must retain the above copyright notice, this list of
00014 //    conditions and the disclaimer below.
00015 //  * Redistributions in binary form must reproduce the above copyright notice, this list of
00016 //    conditions and the disclaimer (as noted below) in the documentation and/or other materials
00017 //    provided with the distribution.
00018 //  * Neither the name of the LLNS/LLNL nor the names of its contributors may be used to endorse
00019 //    or promote products derived from this software without specific prior written permission.
00020 //
00021 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
00022 // OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
00023 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
00024 // LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE
00025 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00026 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00027 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00028 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00029 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 //////////////////////////////////////////////////////////////////////////////////////////////////
00031 
00032 ///
00033 /// @file bic.h
00034 /// @author Todd Gamblin tgamblin@llnl.gov
00035 /// @brief Template function implementations of the Bayesian Information Criterion.
00036 ///
00037 /// The Bayesian Information Criterion (BIC) is a criterion for model selection
00038 /// that balances a maximum likelihood estimator and a parameter count.  This
00039 /// implementation is designed for clustering algorithms, in particular K-means and
00040 /// K-medoids clustering algorithms where we expect clusters with  spherical gaussian
00041 /// distributions.  
00042 ///
00043 /// Here, we want to test whether a clustering's centroids or medoids are good predictors
00044 /// of the points in a data set, so these are our parameters, and we try to find the best
00045 /// clustering without too many clusters.  For more on this technique and the approach
00046 /// we've based this implementation on, see this paper:
00047 /// @par
00048 /// Dan Pelleg and Andrew Moore.  <a href="http://www.cs.cmu.edu/~dpelleg/download/xmeans.pdf">
00049 /// <b>X-Means: Extending K-Means with Efficient Estimation of the Number of Clusters</b></a>.
00050 /// <i>Proceedings of the Seventeenth International Conference on Machine Learning</i>, 
00051 /// San Francisco, CA. June 29-July 2, 2000.  pp 727-734.
00052 /// 
00053 #ifndef BAYESIAN_INFORMATION_CRITERION_H
00054 #define BAYESIAN_INFORMATION_CRITERION_H
00055 
00056 #include <stdint.h>
00057 #include <numeric>
00058 #include <iostream>
00059 #include <cmath>
00060 #include <vector>
00061 #include "dissimilarity.h"
00062 #include "partition.h"
00063 
00064 namespace cluster {
00065   
00066   ///
00067   /// Directly computes the BIC from a partition object based on the cluster centroids
00068   /// and the number of clusters.
00069   /// 
00070   /// @param[in] p         A partition object describing the clustering to be evaluated.
00071   /// @param[in] distance  A distance function callable on two \em indices from the partition p.
00072   /// @param[in] M         Dimensionality parameter -- degrees of freedom in the input dataset.
00073   ///
00074   /// @return A valid BIC score based on the input parameters.  Higher numbers indicate better fit.
00075   /// 
00076   template <typename D>
00077   double bic(const partition& p, D distance, size_t M) {
00078     double R = p.size();
00079     size_t k = p.num_clusters();
00080 
00081     // calculate variance.
00082     double s2 = total_squared_dissimilarity(p, distance) / (R - k);
00083     double s  = sqrt(s2);
00084     double sM = pow(s, (double)M);
00085 
00086     // compute sizes of the clusters in the partition.
00087     size_t sizes[k];
00088     for (size_t i=0; i < k; i++) {
00089       sizes[i] = p.size(i);
00090     }
00091 
00092     // compute BIC from point probabilities
00093     double root2pi = sqrt(2 * M_PI);
00094     double lD = 0;
00095     for (size_t i=0; i < p.size(); i++) {
00096       double d  = distance(i, p.medoid_ids[p.cluster_ids[i]]);
00097       double Ri = sizes[p.cluster_ids[i]];
00098       lD += 
00099         + log(1.0 / (root2pi * sM))
00100         - (1 / (2 * s2)) * d * d
00101         + log(Ri / R);
00102     }
00103 
00104     const size_t pj = (k-1) + M*k + 1;   // free parameter count
00105     return lD - pj/2 * log((double)R);
00106   }
00107 
00108 
00109   ///
00110   /// This version of the BIC assumes some precomputed information.  This is useful for
00111   /// parallel implementations, where it is more efficient to compute some global sums as a 
00112   /// reduction rather than aggregating a full partition to one process.  Here,
00113   /// we assume that the sizes of the distributed clusters as well as the total squared intra-cluster
00114   /// dissimilarity (between each object and its medoid) is known.
00115   /// 
00116   ///    @param[in] k               Number of clusters in the clustering.  Same as k from k-means or k-medoids.
00117   ///    @param[in] cluster_sizes   Start of range of k sizes.  
00118   ///                               <code>*cluster_sizes .. *(cluster_sizes + k)</code> should be the
00119   ///                               sizes of clusters 1 to k
00120   ///    @param[in] sum2_dissim     Sum of squared dissimilarities of each object w.r.t. its nearest medoid.
00121   ///    @param[in] dimensionality  Dimensionality of clustered data.  e.g., 2 for 2-dimensional points.
00122   ///
00123   template <typename SizeIterator, typename DissimIterator>
00124   double bic(size_t k, SizeIterator cluster_sizes, DissimIterator sum2_dissim, size_t dimensionality) {
00125     // figure out total size of data set and put it in R.
00126     const double R      = std::accumulate(cluster_sizes, cluster_sizes + k, 0);
00127     const double M      = dimensionality;    // Shorthand for model dimensionality
00128     const double logR   = log(R);
00129     const double log2pi = log(2 * M_PI);
00130     const double pj     = (k-1) + M*k + 1;   // free parameter count
00131     const double s2     = std::accumulate(sum2_dissim, sum2_dissim + k, 0.0) / (R - k);
00132 
00133     // apply criterion formula from xmeans paper.
00134     double criterion = 0;
00135     for (size_t i=0; i < k; i++) {
00136       const double Rn = *(cluster_sizes + i);
00137       criterion += 
00138         - (Rn * log2pi) / 2.0
00139         - (Rn * M * log(s2)) / 2.0 
00140         - (Rn - 1) / 2.0 
00141         + Rn * log(Rn) 
00142         - Rn * logR;
00143     }
00144     criterion -= (pj/2.0 * logR);
00145     
00146     return criterion;
00147   }
00148 
00149 };
00150 
00151 #endif // BAYESIAN_INFORMATION_CRITERION_H