Muster: src/kmedoids.h Source File

Go to the documentation of this file.
00001 //////////////////////////////////////////////////////////////////////////////////////////////////
00002 // Copyright (c) 2010, Lawrence Livermore National Security, LLC.  
00003 // Produced at the Lawrence Livermore National Laboratory  
00004 // LLNL-CODE-433662
00005 // All rights reserved.  
00006 //
00007 // This file is part of Muster. For details, see http://github.com/tgamblin/muster. 
00008 // Please also read the LICENSE file for further information.
00009 //
00010 // Redistribution and use in source and binary forms, with or without modification, are
00011 // permitted provided that the following conditions are met:
00012 //
00013 //  * Redistributions of source code must retain the above copyright notice, this list of
00014 //    conditions and the disclaimer below.
00015 //  * Redistributions in binary form must reproduce the above copyright notice, this list of
00016 //    conditions and the disclaimer (as noted below) in the documentation and/or other materials
00017 //    provided with the distribution.
00018 //  * Neither the name of the LLNS/LLNL nor the names of its contributors may be used to endorse
00019 //    or promote products derived from this software without specific prior written permission.
00020 //
00021 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
00022 // OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
00023 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
00024 // LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE
00025 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
00026 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00027 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00028 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00029 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030 //////////////////////////////////////////////////////////////////////////////////////////////////
00031 
00032 ///
00033 /// @file kmedoids.h
00034 /// @author Todd Gamblin tgamblin@llnl.gov
00035 /// @brief Implementations of the classic clustering algorithms PAM and CLARA, from 
00036 /// <i>Finding Groups in Data</i>, by Kaufman and Rousseeuw.
00037 ///
00038 #ifndef K_MEDOIDS_H
00039 #define K_MEDOIDS_H
00040 
00041 #include <vector>
00042 #include <set>
00043 #include <iostream>
00044 #include <stdexcept>
00045 #include <cfloat>
00046 
00047 #include <boost/random.hpp>
00048 
00049 #include "random.h"
00050 #include "dissimilarity.h"
00051 #include "partition.h"
00052 #include "bic.h"
00053 
00054 namespace cluster {
00055 
00056   /// 
00057   /// Implementations of the classic clustering algorithms PAM and CLARA, from 
00058   /// <i>Finding Groups in Data</i>, by Kaufman and Rousseeuw.
00059   /// 
00060   class kmedoids : public partition {
00061   public:
00062     ///
00063     /// Constructor.  Can optionally specify number of objects to be clustered.
00064     /// and this will start out with all of them in one cluster.
00065     /// 
00066     /// The random number generator associated with this kmedoids object is seeded
00067     /// with the time in microseconds since the epoch.
00068     /// 
00069     kmedoids(size_t num_objects = 0);
00070     ~kmedoids();
00071 
00072     /// Get the average dissimilarity of objects w/their medoids for the last run.
00073     double average_dissimilarity() const;
00074     
00075     /// Set whether medoids will be sorted by object id after clustering is complete.
00076     /// Defaults to true.
00077     void set_sort_medoids(bool sort_medoids);
00078 
00079     /// Set tolerance for convergence.  This is relative error, not absolute error.  It will be
00080     /// multiplied by the mean distance before it is used to test convergence.
00081     /// Defaults to 1e-15; may need to be higher if there exist clusterings with very similar quality.
00082     void set_epsilon(double epsilon);
00083 
00084     /// 
00085     /// Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM)
00086     /// algorithm as described in Kaufman and Rousseeuw. 
00087     ///
00088     /// @param distance         dissimilarity matrix for all objects to cluster
00089     /// @param k                number of clusters to produce
00090     /// @param initial_medoids  Optionally supply k initial object ids to be used as initial medoids.
00091     ///
00092     /// @see \link build_dissimilarity_matrix()\endlink, a function to automatically
00093     ///      construct a dissimilarity matrix given a vector of objects and a distance function.
00094     /// 
00095     void pam(const dissimilarity_matrix& distance, size_t k, const object_id *initial_medoids = NULL);
00096 
00097     ///
00098     /// Classic K-Medoids clustering, using the Partitioning-Around-Medoids (PAM)
00099     /// algorithm as described in Kaufman and Rousseeuw. Runs PAM from 1 to max_k and selects
00100     /// the best k using the bayesian information criterion.  Sets this partition to the best
00101     /// partition found using PAM from 1 to k.
00102     /// 
00103     /// Based on X-Means, see Pelleg & Moore, 2000.
00104     /// 
00105     /// @param distance         dissimilarity matrix for all objects to cluster
00106     /// @param max_k            Upper limit on number of clusters to find.
00107     /// @param dimensionality   Number of dimensions in clustered data, for BIC.
00108     ///
00109     /// @return the best BIC value found (the bic value of the final partitioning).
00110     ///
00111     /// @see \link build_dissimilarity_matrix()\endlink, a function to automatically
00112     ///      construct a dissimilarity matrix given a vector of objects and a distance function.
00113     ///
00114     double xpam(const dissimilarity_matrix& distance, size_t max_k, size_t dimensionality);
00115 
00116     ///
00117     /// CLARA clustering algorithm, as per Kaufman and Rousseuw and
00118     /// R. Ng and J. Han, "Efficient and Effective Clustering Methods 
00119     /// for Spatial Data Mining."
00120     /// 
00121     /// @tparam T    Type of objects to be clustered.
00122     /// @tparam D    Dissimilarity metric type.  D should be callable 
00123     ///              on (T, T) and should return a double.
00124     /// 
00125     /// @param objects        Objects to cluster
00126     /// @param dmetric        Distance metric to build dissimilarity matrices with
00127     /// @param k              Number of clusters to partition
00128     /// 
00129     template <class T, class D>
00130     void clara(const std::vector<T>& objects, D dmetric, size_t k) {
00131       size_t sample_size = init_size + 2*k;
00132     
00133       // Just run plain KMedoids once if sampling won't gain us anything
00134       if (objects.size() <= sample_size) {
00135         dissimilarity_matrix mat;
00136         build_dissimilarity_matrix(objects, dmetric, mat);
00137         pam(mat, k);
00138         return;
00139       }
00140 
00141       // get everything the right size before starting.
00142       medoid_ids.resize(k);
00143       cluster_ids.resize(objects.size());
00144 
00145       // medoids and clusters for best partition so far.
00146       partition best_partition;
00147 
00148       //run KMedoids on a sampled subset max_reps times
00149       total_dissimilarity = DBL_MAX;
00150       for (size_t i = 0; i < max_reps; i++) {
00151         // Take a random sample of objects, store sample in a vector
00152         std::vector<size_t> sample_to_full;
00153         random_subset(objects.size(), sample_size, back_inserter(sample_to_full), rng);
00154 
00155         // Build a distance matrix for PAM
00156         dissimilarity_matrix distance;
00157         build_dissimilarity_matrix(objects, sample_to_full, dmetric, distance);
00158 
00159         // Actually run PAM on the subset
00160         kmedoids subcall;
00161         subcall.set_sort_medoids(false); // skip sort for subcall since it's not needed
00162         subcall.pam(distance, k);  
00163 
00164         // copy medoids from the subcall to local data, being sure to translate indices
00165         for (size_t i=0; i < medoid_ids.size(); i++) {
00166           medoid_ids[i] = sample_to_full[subcall.medoid_ids[i]];
00167         }
00168 
00169         // sync up the cluster_ids matrix with the new medoids by assigning
00170         // each object to its closest medoid.  Remember the quality of the clustering.
00171         double dissimilarity = assign_objects_to_clusters(lazy_distance(objects, dmetric));
00172         
00173         // keep the best clustering found so far around
00174         if (dissimilarity < total_dissimilarity) {
00175           swap(best_partition);
00176           total_dissimilarity = dissimilarity;
00177         } 
00178       }
00179       
00180       if (sort_medoids) sort();   // just do one final ordering of ids.
00181     }    
00182 
00183 
00184     ///
00185     /// Takes existing clustering and reassigns medoids by taking closest medoid to mean
00186     /// of each cluster.  This is O(n) and can give better representatives for CLARA clusterings.
00187     /// This is needed to apply gaussian-model BIC criteria to clusterings.
00188     /// 
00189     /// @tparam T  To use this function, T needs to support algebraic operations.<br>
00190     ///            Specifically, T must support enough to construct a mean:
00191     ///            - addition        <code>T + T = T</code>
00192     ///            - scalar division <code>T / c = T</code>
00193     ///
00194     template <class T, class D>
00195     void center_medoids(const std::vector<T>& objects, D distance) {
00196       // First sum up elements in all clusters to get the mean for each
00197       std::vector<T>      means(medoid_ids.size());
00198       std::vector<size_t> counts(medoid_ids.size());
00199       for (size_t i=0; i < cluster_ids.size(); i++) {
00200         medoid_id m = cluster_ids[i];
00201         means[m] = counts[m] ? (means[m] + objects[i]) : objects[i];
00202         counts[m]++;
00203       }
00204       
00205       // Now, turn cumulative sums into means and calculate distance from medoids to means
00206       std::vector<double> shortest(means.size());
00207       for (size_t m=0; m < means.size(); m++) {
00208         means[m] = means[m] / counts[m];
00209         shortest[m] = distance(means[m], objects[medoid_ids[m]]);
00210       }
00211       
00212       // Find closest medoids to each mean element, preferring existing medoids if there are ties.
00213       for (size_t i=0; i < cluster_ids.size(); i++) {
00214         medoid_id m = cluster_ids[i];
00215         double d = distance(objects[i], means[m]);
00216         if (d < shortest[m]) {
00217           medoid_ids[m] = i;
00218           shortest[m]   = d;
00219         }
00220       }
00221     }
00222 
00223 
00224     ///
00225     /// K-Agnostic version of CLARA.  This uses the BIC criterion as described in bic.h to
00226     /// run clara() a number of times and to select a best run of clara() from the trials.
00227     /// This will be slower than regular clara().  In particular, it's O(n*max_k).
00228     /// 
00229     /// @param[in]  objects         Objects to cluster
00230     /// @param[in]  dmetric         Distance metric to build dissimilarity matrices with
00231     /// @param[in]  max_k           Max number of clusters to find.
00232     /// @param[in]  dimensionality  Dimensionality of objects, used by BIC.
00233     ///
00234     template <class T, class D>
00235     double xclara(const std::vector<T>& objects, D dmetric, size_t max_k, size_t dimensionality) {
00236       double best_bic = -DBL_MAX;   // note that DBL_MIN isn't what you think it is.
00237 
00238       for (size_t k = 1; k <= max_k; k++) {
00239         kmedoids subcall;
00240         subcall.clara(objects, dmetric, k);
00241         center_medoids(objects, dmetric);
00242         double cur_bic = bic(subcall, lazy_distance(objects, dmetric), dimensionality);
00243 
00244         if (xcallback) xcallback(subcall, cur_bic);
00245         if (cur_bic > best_bic) {
00246           best_bic = cur_bic;
00247           swap(subcall);
00248         }
00249       }
00250       return best_bic;
00251     }
00252 
00253 
00254     void set_init_size(size_t sz) { init_size = sz; }
00255     void set_max_reps(size_t r) { max_reps = r; }
00256 
00257 
00258     /// Set callback function for XPAM and XCLARA.  default is none.
00259     void set_xcallback(void (*)(const partition& part, double bic));
00260 
00261     protected:
00262     typedef boost::mt19937 random_type;                /// Type for RNG used in this algorithm
00263     random_type random;                                /// Randomness source for this algorithm
00264     
00265     /// Adaptor for STL algorithms.
00266     typedef boost::random_number_generator<random_type, unsigned long> rng_type;
00267     rng_type rng;
00268 
00269     std::vector<medoid_id> sec_nearest;      /// Index of second closest medoids.  Used by PAM.
00270     double total_dissimilarity;              /// Total dissimilarity bt/w objects and their medoid
00271     bool sort_medoids;                       /// Whether medoids should be canonically sorted by object id.
00272     double epsilon;                          /// Normalized sensitivity for convergence
00273     size_t init_size;                        /// initial sample size (before 2*k)
00274     size_t max_reps;                         /// initial sample size (before 2*k)
00275 
00276 
00277     /// Callback for each iteration of xpam.  is called with the current clustering and its BIC score.
00278     void (*xcallback)(const partition& part, double bic);
00279 
00280     /// KR BUILD algorithm for assigning initial medoids to a partition.
00281     void init_medoids(size_t k, const dissimilarity_matrix& distance);
00282 
00283     /// Total cost of swapping object h with medoid i.
00284     /// Sums costs of this exchagne for all objects j.
00285     double cost(medoid_id i, object_id h, const dissimilarity_matrix& distance) const;
00286 
00287 
00288     /// Assign each object to the cluster with the closest medoid.
00289     ///
00290     /// @return Total dissimilarity of objects w/their medoids.
00291     /// 
00292     /// @param distance a callable object that computes distances between indices, as a distance 
00293     ///                 matrix would.  Algorithms are free to use real distance matrices (as in PAM) 
00294     ///                 or to compute lazily (as in CLARA medoid assignment).
00295     template <class DM>
00296     double assign_objects_to_clusters(DM distance) {
00297       if (sec_nearest.size() != cluster_ids.size()) {
00298         sec_nearest.resize(cluster_ids.size());
00299       }
00300       
00301       // go through and assign each object to nearest medoid, keeping track of total dissimilarity.
00302       double total_dissimilarity = 0;
00303       for (object_id i=0; i < cluster_ids.size(); i++) {
00304         double    d1, d2;  // smallest, second smallest distance to medoid, respectively
00305         medoid_id m1, m2;  // index of medoids with distances d1, d2 from object i, respectively
00306 
00307         d1 = d2 = DBL_MAX;
00308         m1 = m2 = medoid_ids.size();
00309         for (medoid_id m=0; m < medoid_ids.size(); m++) {
00310           double d = distance(i, medoid_ids[m]);
00311           if (d < d1 || medoid_ids[m] == i) {  // prefer the medoid in case of ties.
00312             d2 = d1;  m2 = m1;
00313             d1 = d;   m1 = m;
00314           } else if (d < d2) {
00315             d2 = d;   m2 = m;
00316           }
00317         }
00318 
00319         cluster_ids[i] = m1;
00320         sec_nearest[i] = m2;
00321         total_dissimilarity += d1;
00322       }
00323 
00324       return total_dissimilarity;
00325     }
00326   };
00327 
00328 
00329 } // namespace cluster
00330 
00331 #endif //K_MEDOIDS_H