00001 ////////////////////////////////////////////////////////////////////////////////////////////////// 00002 // Copyright (c) 2010, Lawrence Livermore National Security, LLC. 00003 // Produced at the Lawrence Livermore National Laboratory 00004 // LLNL-CODE-433662 00005 // All rights reserved. 00006 // 00007 // This file is part of Muster. For details, see http://github.com/tgamblin/muster. 00008 // Please also read the LICENSE file for further information. 00009 // 00010 // Redistribution and use in source and binary forms, with or without modification, are 00011 // permitted provided that the following conditions are met: 00012 // 00013 // * Redistributions of source code must retain the above copyright notice, this list of 00014 // conditions and the disclaimer below. 00015 // * Redistributions in binary form must reproduce the above copyright notice, this list of 00016 // conditions and the disclaimer (as noted below) in the documentation and/or other materials 00017 // provided with the distribution. 00018 // * Neither the name of the LLNS/LLNL nor the names of its contributors may be used to endorse 00019 // or promote products derived from this software without specific prior written permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS 00022 // OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 00023 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 00024 // LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE 00025 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 00026 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00027 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00028 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00029 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00030 ////////////////////////////////////////////////////////////////////////////////////////////////// 00031 00032 /// 00033 /// @file bic.h 00034 /// @author Todd Gamblin tgamblin@llnl.gov 00035 /// @brief Template function implementations of the Bayesian Information Criterion. 00036 /// 00037 /// The Bayesian Information Criterion (BIC) is a criterion for model selection 00038 /// that balances a maximum likelihood estimator and a parameter count. This 00039 /// implementation is designed for clustering algorithms, in particular K-means and 00040 /// K-medoids clustering algorithms where we expect clusters with spherical gaussian 00041 /// distributions. 00042 /// 00043 /// Here, we want to test whether a clustering's centroids or medoids are good predictors 00044 /// of the points in a data set, so these are our parameters, and we try to find the best 00045 /// clustering without too many clusters. For more on this technique and the approach 00046 /// we've based this implementation on, see this paper: 00047 /// @par 00048 /// Dan Pelleg and Andrew Moore. <a href="http://www.cs.cmu.edu/~dpelleg/download/xmeans.pdf"> 00049 /// <b>X-Means: Extending K-Means with Efficient Estimation of the Number of Clusters</b></a>. 00050 /// <i>Proceedings of the Seventeenth International Conference on Machine Learning</i>, 00051 /// San Francisco, CA. June 29-July 2, 2000. pp 727-734. 00052 /// 00053 #ifndef BAYESIAN_INFORMATION_CRITERION_H 00054 #define BAYESIAN_INFORMATION_CRITERION_H 00055 00056 #include <stdint.h> 00057 #include <numeric> 00058 #include <iostream> 00059 #include <cmath> 00060 #include <vector> 00061 #include "dissimilarity.h" 00062 #include "partition.h" 00063 00064 namespace cluster { 00065 00066 /// 00067 /// Directly computes the BIC from a partition object based on the cluster centroids 00068 /// and the number of clusters. 00069 /// 00070 /// @param[in] p A partition object describing the clustering to be evaluated. 00071 /// @param[in] distance A distance function callable on two \em indices from the partition p. 00072 /// @param[in] M Dimensionality parameter -- degrees of freedom in the input dataset. 00073 /// 00074 /// @return A valid BIC score based on the input parameters. Higher numbers indicate better fit. 00075 /// 00076 template <typename D> 00077 double bic(const partition& p, D distance, size_t M) { 00078 double R = p.size(); 00079 size_t k = p.num_clusters(); 00080 00081 // calculate variance. 00082 double s2 = total_squared_dissimilarity(p, distance) / (R - k); 00083 double s = sqrt(s2); 00084 double sM = pow(s, (double)M); 00085 00086 // compute sizes of the clusters in the partition. 00087 size_t sizes[k]; 00088 for (size_t i=0; i < k; i++) { 00089 sizes[i] = p.size(i); 00090 } 00091 00092 // compute BIC from point probabilities 00093 double root2pi = sqrt(2 * M_PI); 00094 double lD = 0; 00095 for (size_t i=0; i < p.size(); i++) { 00096 double d = distance(i, p.medoid_ids[p.cluster_ids[i]]); 00097 double Ri = sizes[p.cluster_ids[i]]; 00098 lD += 00099 + log(1.0 / (root2pi * sM)) 00100 - (1 / (2 * s2)) * d * d 00101 + log(Ri / R); 00102 } 00103 00104 const size_t pj = (k-1) + M*k + 1; // free parameter count 00105 return lD - pj/2 * log((double)R); 00106 } 00107 00108 00109 /// 00110 /// This version of the BIC assumes some precomputed information. This is useful for 00111 /// parallel implementations, where it is more efficient to compute some global sums as a 00112 /// reduction rather than aggregating a full partition to one process. Here, 00113 /// we assume that the sizes of the distributed clusters as well as the total squared intra-cluster 00114 /// dissimilarity (between each object and its medoid) is known. 00115 /// 00116 /// @param[in] k Number of clusters in the clustering. Same as k from k-means or k-medoids. 00117 /// @param[in] cluster_sizes Start of range of k sizes. 00118 /// <code>*cluster_sizes .. *(cluster_sizes + k)</code> should be the 00119 /// sizes of clusters 1 to k 00120 /// @param[in] sum2_dissim Sum of squared dissimilarities of each object w.r.t. its nearest medoid. 00121 /// @param[in] dimensionality Dimensionality of clustered data. e.g., 2 for 2-dimensional points. 00122 /// 00123 template <typename SizeIterator, typename DissimIterator> 00124 double bic(size_t k, SizeIterator cluster_sizes, DissimIterator sum2_dissim, size_t dimensionality) { 00125 // figure out total size of data set and put it in R. 00126 const double R = std::accumulate(cluster_sizes, cluster_sizes + k, 0); 00127 const double M = dimensionality; // Shorthand for model dimensionality 00128 const double logR = log(R); 00129 const double log2pi = log(2 * M_PI); 00130 const double pj = (k-1) + M*k + 1; // free parameter count 00131 const double s2 = std::accumulate(sum2_dissim, sum2_dissim + k, 0.0) / (R - k); 00132 00133 // apply criterion formula from xmeans paper. 00134 double criterion = 0; 00135 for (size_t i=0; i < k; i++) { 00136 const double Rn = *(cluster_sizes + i); 00137 criterion += 00138 - (Rn * log2pi) / 2.0 00139 - (Rn * M * log(s2)) / 2.0 00140 - (Rn - 1) / 2.0 00141 + Rn * log(Rn) 00142 - Rn * logR; 00143 } 00144 criterion -= (pj/2.0 * logR); 00145 00146 return criterion; 00147 } 00148 00149 }; 00150 00151 #endif // BAYESIAN_INFORMATION_CRITERION_H