00001 ////////////////////////////////////////////////////////////////////////////////////////////////// 00002 // Copyright (c) 2010, Lawrence Livermore National Security, LLC. 00003 // Produced at the Lawrence Livermore National Laboratory 00004 // LLNL-CODE-433662 00005 // All rights reserved. 00006 // 00007 // This file is part of Muster. For details, see http://github.com/tgamblin/muster. 00008 // Please also read the LICENSE file for further information. 00009 // 00010 // Redistribution and use in source and binary forms, with or without modification, are 00011 // permitted provided that the following conditions are met: 00012 // 00013 // * Redistributions of source code must retain the above copyright notice, this list of 00014 // conditions and the disclaimer below. 00015 // * Redistributions in binary form must reproduce the above copyright notice, this list of 00016 // conditions and the disclaimer (as noted below) in the documentation and/or other materials 00017 // provided with the distribution. 00018 // * Neither the name of the LLNS/LLNL nor the names of its contributors may be used to endorse 00019 // or promote products derived from this software without specific prior written permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS 00022 // OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 00023 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 00024 // LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE 00025 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 00026 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00027 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00028 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00029 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00030 ////////////////////////////////////////////////////////////////////////////////////////////////// 00031 00032 /// 00033 /// @file kmedoids.h 00034 /// @author Todd Gamblin tgamblin@llnl.gov 00035 /// @brief Implementations of the classic clustering algorithms PAM and CLARA, from 00036 /// <i>Finding Groups in Data</i>, by Kaufman and Rousseeuw. 00037 /// 00038 #ifndef K_MEDOIDS_H 00039 #define K_MEDOIDS_H 00040 00041 #include <vector> 00042 #include <set> 00043 #include <iostream> 00044 #include <stdexcept> 00045 #include <cfloat> 00046 00047 #include <boost/random.hpp> 00048 00049 #include "random.h" 00050 #include "dissimilarity.h" 00051 #include "partition.h" 00052 #include "bic.h" 00053 00054 namespace cluster { 00055 00056 /// 00057 /// Implementations of the classic clustering algorithms PAM and CLARA, from 00058 /// <i>Finding Groups in Data</i>, by Kaufman and Rousseeuw. 00059 /// 00060 class kmedoids : public partition { 00061 public: 00062 /// 00063 /// Constructor. Can optionally specify number of objects to be clustered. 00064 /// and this will start out with all of them in one cluster. 00065 /// 00066 /// The random number generator associated with this kmedoids object is seeded 00067 /// with the time in microseconds since the epoch. 00068 /// 00069 kmedoids(size_t num_objects = 0); 00070 ~kmedoids(); 00071 00072 /// Get the average dissimilarity of objects w/their medoids for the last run. 00073 double average_dissimilarity() const; 00074 00075 /// Set whether medoids will be sorted by object id after clustering is complete. 00076 /// Defaults to true. 00077 void set_sort_medoids(bool sort_medoids); 00078 00079 /// Set tolerance for convergence. This is relative error, not absolute error. It will be 00080 /// multiplied by the mean distance before it is used to test convergence. 00081 /// Defaults to 1e-15; may need to be higher if there exist clusterings with very similar quality. 00082 void set_epsilon(double epsilon); 00083 00084 /// 00085 /// Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) 00086 /// algorithm as described in Kaufman and Rousseeuw. 00087 /// 00088 /// @param distance dissimilarity matrix for all objects to cluster 00089 /// @param k number of clusters to produce 00090 /// @param initial_medoids Optionally supply k initial object ids to be used as initial medoids. 00091 /// 00092 /// @see \link build_dissimilarity_matrix()\endlink, a function to automatically 00093 /// construct a dissimilarity matrix given a vector of objects and a distance function. 00094 /// 00095 void pam(const dissimilarity_matrix& distance, size_t k, const object_id *initial_medoids = NULL); 00096 00097 /// 00098 /// Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM) 00099 /// algorithm as described in Kaufman and Rousseeuw. Runs PAM from 1 to max_k and selects 00100 /// the best k using the bayesian information criterion. Sets this partition to the best 00101 /// partition found using PAM from 1 to k. 00102 /// 00103 /// Based on X-Means, see Pelleg & Moore, 2000. 00104 /// 00105 /// @param distance dissimilarity matrix for all objects to cluster 00106 /// @param max_k Upper limit on number of clusters to find. 00107 /// @param dimensionality Number of dimensions in clustered data, for BIC. 00108 /// 00109 /// @return the best BIC value found (the bic value of the final partitioning). 00110 /// 00111 /// @see \link build_dissimilarity_matrix()\endlink, a function to automatically 00112 /// construct a dissimilarity matrix given a vector of objects and a distance function. 00113 /// 00114 double xpam(const dissimilarity_matrix& distance, size_t max_k, size_t dimensionality); 00115 00116 /// 00117 /// CLARA clustering algorithm, as per Kaufman and Rousseuw and 00118 /// R. Ng and J. Han, "Efficient and Effective Clustering Methods 00119 /// for Spatial Data Mining." 00120 /// 00121 /// @tparam T Type of objects to be clustered. 00122 /// @tparam D Dissimilarity metric type. D should be callable 00123 /// on (T, T) and should return a double. 00124 /// 00125 /// @param objects Objects to cluster 00126 /// @param dmetric Distance metric to build dissimilarity matrices with 00127 /// @param k Number of clusters to partition 00128 /// 00129 template <class T, class D> 00130 void clara(const std::vector<T>& objects, D dmetric, size_t k) { 00131 size_t sample_size = init_size + 2*k; 00132 00133 // Just run plain KMedoids once if sampling won't gain us anything 00134 if (objects.size() <= sample_size) { 00135 dissimilarity_matrix mat; 00136 build_dissimilarity_matrix(objects, dmetric, mat); 00137 pam(mat, k); 00138 return; 00139 } 00140 00141 // get everything the right size before starting. 00142 medoid_ids.resize(k); 00143 cluster_ids.resize(objects.size()); 00144 00145 // medoids and clusters for best partition so far. 00146 partition best_partition; 00147 00148 //run KMedoids on a sampled subset max_reps times 00149 total_dissimilarity = DBL_MAX; 00150 for (size_t i = 0; i < max_reps; i++) { 00151 // Take a random sample of objects, store sample in a vector 00152 std::vector<size_t> sample_to_full; 00153 random_subset(objects.size(), sample_size, back_inserter(sample_to_full), rng); 00154 00155 // Build a distance matrix for PAM 00156 dissimilarity_matrix distance; 00157 build_dissimilarity_matrix(objects, sample_to_full, dmetric, distance); 00158 00159 // Actually run PAM on the subset 00160 kmedoids subcall; 00161 subcall.set_sort_medoids(false); // skip sort for subcall since it's not needed 00162 subcall.pam(distance, k); 00163 00164 // copy medoids from the subcall to local data, being sure to translate indices 00165 for (size_t i=0; i < medoid_ids.size(); i++) { 00166 medoid_ids[i] = sample_to_full[subcall.medoid_ids[i]]; 00167 } 00168 00169 // sync up the cluster_ids matrix with the new medoids by assigning 00170 // each object to its closest medoid. Remember the quality of the clustering. 00171 double dissimilarity = assign_objects_to_clusters(lazy_distance(objects, dmetric)); 00172 00173 // keep the best clustering found so far around 00174 if (dissimilarity < total_dissimilarity) { 00175 swap(best_partition); 00176 total_dissimilarity = dissimilarity; 00177 } 00178 } 00179 00180 if (sort_medoids) sort(); // just do one final ordering of ids. 00181 } 00182 00183 00184 /// 00185 /// Takes existing clustering and reassigns medoids by taking closest medoid to mean 00186 /// of each cluster. This is O(n) and can give better representatives for CLARA clusterings. 00187 /// This is needed to apply gaussian-model BIC criteria to clusterings. 00188 /// 00189 /// @tparam T To use this function, T needs to support algebraic operations.<br> 00190 /// Specifically, T must support enough to construct a mean: 00191 /// - addition <code>T + T = T</code> 00192 /// - scalar division <code>T / c = T</code> 00193 /// 00194 template <class T, class D> 00195 void center_medoids(const std::vector<T>& objects, D distance) { 00196 // First sum up elements in all clusters to get the mean for each 00197 std::vector<T> means(medoid_ids.size()); 00198 std::vector<size_t> counts(medoid_ids.size()); 00199 for (size_t i=0; i < cluster_ids.size(); i++) { 00200 medoid_id m = cluster_ids[i]; 00201 means[m] = counts[m] ? (means[m] + objects[i]) : objects[i]; 00202 counts[m]++; 00203 } 00204 00205 // Now, turn cumulative sums into means and calculate distance from medoids to means 00206 std::vector<double> shortest(means.size()); 00207 for (size_t m=0; m < means.size(); m++) { 00208 means[m] = means[m] / counts[m]; 00209 shortest[m] = distance(means[m], objects[medoid_ids[m]]); 00210 } 00211 00212 // Find closest medoids to each mean element, preferring existing medoids if there are ties. 00213 for (size_t i=0; i < cluster_ids.size(); i++) { 00214 medoid_id m = cluster_ids[i]; 00215 double d = distance(objects[i], means[m]); 00216 if (d < shortest[m]) { 00217 medoid_ids[m] = i; 00218 shortest[m] = d; 00219 } 00220 } 00221 } 00222 00223 00224 /// 00225 /// K-Agnostic version of CLARA. This uses the BIC criterion as described in bic.h to 00226 /// run clara() a number of times and to select a best run of clara() from the trials. 00227 /// This will be slower than regular clara(). In particular, it's O(n*max_k). 00228 /// 00229 /// @param[in] objects Objects to cluster 00230 /// @param[in] dmetric Distance metric to build dissimilarity matrices with 00231 /// @param[in] max_k Max number of clusters to find. 00232 /// @param[in] dimensionality Dimensionality of objects, used by BIC. 00233 /// 00234 template <class T, class D> 00235 double xclara(const std::vector<T>& objects, D dmetric, size_t max_k, size_t dimensionality) { 00236 double best_bic = -DBL_MAX; // note that DBL_MIN isn't what you think it is. 00237 00238 for (size_t k = 1; k <= max_k; k++) { 00239 kmedoids subcall; 00240 subcall.clara(objects, dmetric, k); 00241 center_medoids(objects, dmetric); 00242 double cur_bic = bic(subcall, lazy_distance(objects, dmetric), dimensionality); 00243 00244 if (xcallback) xcallback(subcall, cur_bic); 00245 if (cur_bic > best_bic) { 00246 best_bic = cur_bic; 00247 swap(subcall); 00248 } 00249 } 00250 return best_bic; 00251 } 00252 00253 00254 void set_init_size(size_t sz) { init_size = sz; } 00255 void set_max_reps(size_t r) { max_reps = r; } 00256 00257 00258 /// Set callback function for XPAM and XCLARA. default is none. 00259 void set_xcallback(void (*)(const partition& part, double bic)); 00260 00261 protected: 00262 typedef boost::mt19937 random_type; /// Type for RNG used in this algorithm 00263 random_type random; /// Randomness source for this algorithm 00264 00265 /// Adaptor for STL algorithms. 00266 typedef boost::random_number_generator<random_type, unsigned long> rng_type; 00267 rng_type rng; 00268 00269 std::vector<medoid_id> sec_nearest; /// Index of second closest medoids. Used by PAM. 00270 double total_dissimilarity; /// Total dissimilarity bt/w objects and their medoid 00271 bool sort_medoids; /// Whether medoids should be canonically sorted by object id. 00272 double epsilon; /// Normalized sensitivity for convergence 00273 size_t init_size; /// initial sample size (before 2*k) 00274 size_t max_reps; /// initial sample size (before 2*k) 00275 00276 00277 /// Callback for each iteration of xpam. is called with the current clustering and its BIC score. 00278 void (*xcallback)(const partition& part, double bic); 00279 00280 /// KR BUILD algorithm for assigning initial medoids to a partition. 00281 void init_medoids(size_t k, const dissimilarity_matrix& distance); 00282 00283 /// Total cost of swapping object h with medoid i. 00284 /// Sums costs of this exchagne for all objects j. 00285 double cost(medoid_id i, object_id h, const dissimilarity_matrix& distance) const; 00286 00287 00288 /// Assign each object to the cluster with the closest medoid. 00289 /// 00290 /// @return Total dissimilarity of objects w/their medoids. 00291 /// 00292 /// @param distance a callable object that computes distances between indices, as a distance 00293 /// matrix would. Algorithms are free to use real distance matrices (as in PAM) 00294 /// or to compute lazily (as in CLARA medoid assignment). 00295 template <class DM> 00296 double assign_objects_to_clusters(DM distance) { 00297 if (sec_nearest.size() != cluster_ids.size()) { 00298 sec_nearest.resize(cluster_ids.size()); 00299 } 00300 00301 // go through and assign each object to nearest medoid, keeping track of total dissimilarity. 00302 double total_dissimilarity = 0; 00303 for (object_id i=0; i < cluster_ids.size(); i++) { 00304 double d1, d2; // smallest, second smallest distance to medoid, respectively 00305 medoid_id m1, m2; // index of medoids with distances d1, d2 from object i, respectively 00306 00307 d1 = d2 = DBL_MAX; 00308 m1 = m2 = medoid_ids.size(); 00309 for (medoid_id m=0; m < medoid_ids.size(); m++) { 00310 double d = distance(i, medoid_ids[m]); 00311 if (d < d1 || medoid_ids[m] == i) { // prefer the medoid in case of ties. 00312 d2 = d1; m2 = m1; 00313 d1 = d; m1 = m; 00314 } else if (d < d2) { 00315 d2 = d; m2 = m; 00316 } 00317 } 00318 00319 cluster_ids[i] = m1; 00320 sec_nearest[i] = m2; 00321 total_dissimilarity += d1; 00322 } 00323 00324 return total_dissimilarity; 00325 } 00326 }; 00327 00328 00329 } // namespace cluster 00330 00331 #endif //K_MEDOIDS_H