Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036 #include "kmedoids.h"
00037 using namespace cluster;
00038
00039 #include <vector>
00040 #include <sstream>
00041
00042 #include <algorithm>
00043 #include <numeric>
00044 #include <cassert>
00045 #include <cstdlib>
00046 #include <sys/time.h>
00047 using namespace std;
00048
00049 #include "random.h"
00050 #include "bic.h"
00051 #include "matrix_utils.h"
00052
00053 namespace cluster {
00054
00055 kmedoids::kmedoids(size_t num_objects)
00056 : partition(num_objects),
00057 random(get_time_seed()),
00058 rng(random),
00059 total_dissimilarity(std::numeric_limits<double>::infinity()),
00060 sort_medoids(true),
00061 epsilon(1e-15),
00062 init_size(40),
00063 max_reps(5),
00064 xcallback(NULL)
00065 { }
00066
00067
00068 kmedoids::~kmedoids() { }
00069
00070 double kmedoids::average_dissimilarity() const {
00071 return total_dissimilarity / cluster_ids.size();
00072 }
00073
00074 void kmedoids::set_sort_medoids(bool sort) {
00075 sort_medoids = sort;
00076 }
00077
00078 void kmedoids::set_epsilon(double e) {
00079 epsilon = e;
00080 }
00081
00082 void kmedoids::set_xcallback(void (*xpc)(const partition& part, double bic)) {
00083 xcallback = xpc;
00084 }
00085
00086 void kmedoids::init_medoids(size_t k, const dissimilarity_matrix& distance) {
00087 medoid_ids.clear();
00088
00089 object_id first_medoid = 0;
00090 double min_dissim = DBL_MAX;
00091 for (size_t i=0; i < distance.size1(); i++) {
00092 double total = 0.0;
00093 for (size_t j=0; j < distance.size2(); j++) {
00094 total += distance(i,j);
00095 }
00096 if (total < min_dissim) {
00097 min_dissim = total;
00098 first_medoid = i;
00099 }
00100 }
00101
00102
00103 medoid_ids.push_back(first_medoid);
00104 assign_objects_to_clusters(matrix_distance(distance));
00105
00106
00107 for (size_t cur_k = 1; cur_k < k; cur_k++) {
00108 object_id best_obj = 0;
00109 double max_gain = 0.0;
00110 for (size_t i=0; i < distance.size1(); i++) {
00111 if (is_medoid(i)) continue;
00112
00113 double gain = 0.0;
00114 for (size_t j=0; j < distance.size1(); j++) {
00115 double Dj = distance(j, medoid_ids[cluster_ids[j]]);
00116 gain += max(Dj - distance(i,j), 0.0);
00117 }
00118
00119 if (gain >= max_gain) {
00120 max_gain = gain;
00121 best_obj = i;
00122 }
00123 }
00124
00125 medoid_ids.push_back(best_obj);
00126 assign_objects_to_clusters(matrix_distance(distance));
00127 }
00128 }
00129
00130
00131 double kmedoids::cost(medoid_id i, object_id h, const dissimilarity_matrix& distance) const {
00132 double total = 0;
00133 for (object_id j = 0; j < cluster_ids.size(); j++) {
00134 object_id mi = medoid_ids[i];
00135 double dhj = distance(h, j);
00136
00137 object_id mj1 = medoid_ids[cluster_ids[j]];
00138 double dj1 = distance(mj1, j);
00139
00140
00141 if (distance(mi, j) == dj1) {
00142 double dj2 = DBL_MAX;
00143 if (medoid_ids.size() > 1) {
00144 object_id mj2 = medoid_ids[sec_nearest[j]];
00145 dj2 = distance(mj2, j);
00146 }
00147 total += min(dj2, dhj) - dj1;
00148
00149 } else if (dhj < dj1) {
00150 total += dhj - dj1;
00151 }
00152 }
00153 return total;
00154 }
00155
00156
00157 void kmedoids::pam(const dissimilarity_matrix& distance, size_t k, const object_id *initial_medoids) {
00158 if (k > distance.size1()) {
00159 throw std::logic_error("Attempt to run PAM with more clusters than data.");
00160 }
00161
00162 if (distance.size1() != distance.size2()) {
00163 throw std::logic_error("Error: distance matrix is not square!");
00164 }
00165
00166
00167 cluster_ids.resize(distance.size1());
00168
00169
00170 if (initial_medoids) {
00171 medoid_ids.clear();
00172 copy(initial_medoids, initial_medoids + k, back_inserter(medoid_ids));
00173 } else {
00174 init_medoids(k, distance);
00175 }
00176
00177
00178
00179 double tolerance = epsilon * sum(distance) / (distance.size1() * distance.size2());
00180
00181 while (true) {
00182
00183 total_dissimilarity = assign_objects_to_clusters(matrix_distance(distance));
00184
00185
00186 double minTotalCost = DBL_MAX;
00187 medoid_id minMedoid = 0;
00188 object_id minObject = 0;
00189
00190
00191 for (medoid_id i=0; i < k; i++) {
00192
00193 for (object_id h = 0; h < cluster_ids.size(); h++) {
00194 if (is_medoid(h)) continue;
00195
00196
00197 double curCost = cost(i, h, distance);
00198 if (curCost < minTotalCost) {
00199 minTotalCost = curCost;
00200 minMedoid = i;
00201 minObject = h;
00202 }
00203 }
00204 }
00205
00206
00207 if (minTotalCost >= -tolerance) break;
00208
00209
00210 medoid_ids[minMedoid] = minObject;
00211 cluster_ids[minObject] = minMedoid;
00212 }
00213
00214 if (sort_medoids) sort();
00215 }
00216
00217
00218 double kmedoids::xpam(const dissimilarity_matrix& distance, size_t max_k, size_t dimensionality) {
00219 double best_bic = -DBL_MAX;
00220
00221 for (size_t k = 1; k <= max_k; k++) {
00222 kmedoids subcall;
00223 subcall.pam(distance, k);
00224 double cur_bic = bic(subcall, matrix_distance(distance), dimensionality);
00225
00226 if (xcallback) xcallback(subcall, cur_bic);
00227
00228 if (cur_bic > best_bic) {
00229 best_bic = cur_bic;
00230 swap(subcall);
00231 }
00232 }
00233 return best_bic;
00234 }
00235
00236
00237 }