00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #ifndef PAR_KMEDOIDS_H
00038 #define PAR_KMEDOIDS_H
00039
00040 #include <mpi.h>
00041 #include <ostream>
00042 #include <vector>
00043 #include <functional>
00044
00045 #include <boost/iterator/permutation_iterator.hpp>
00046
00047 #include "Timer.h"
00048 #include "kmedoids.h"
00049 #include "multi_gather.h"
00050 #include "trial.h"
00051 #include "id_pair.h"
00052 #include "par_partition.h"
00053 #include "stl_utils.h"
00054 #include "bic.h"
00055 #include "mpi_bindings.h"
00056
00057 namespace cluster {
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112 class par_kmedoids : public par_partition {
00113 public:
00114
00115
00116
00117
00118
00119
00120 par_kmedoids(MPI_Comm comm = MPI_COMM_WORLD);
00121
00122 virtual ~par_kmedoids() { }
00123
00124
00125 double average_dissimilarity();
00126
00127
00128 double bic_score();
00129
00130
00131
00132
00133
00134 void set_max_reps(size_t reps) { max_reps = reps; }
00135
00136
00137
00138
00139 size_t get_max_reps() { return max_reps; }
00140
00141
00142
00143
00144
00145 void set_init_size(size_t size) { init_size = size; }
00146
00147
00148
00149
00150 size_t get_init_size() { return init_size; }
00151
00152
00153
00154
00155 void set_epsilon(double epsilon);
00156
00157
00158
00159
00160
00161
00162 template <class T, class D>
00163 void run_pam_trials(trial_generator& trials, const std::vector<T>& objects, D dmetric,
00164 std::vector<typename id_pair<T>::vector>& all_medoids, MPI_Comm comm)
00165 {
00166 int size, rank;
00167 CMPI_Comm_size(comm, &size);
00168 CMPI_Comm_rank(comm, &rank);
00169
00170
00171 std::vector< std::vector<char> > bcast_buffers(all_medoids.size());
00172
00173 for (size_t i=0; trials.has_next(); i++) {
00174 int my_k = -1;
00175 int my_trial = -1;
00176 std::vector<size_t> my_ids;
00177 std::vector<T> my_objects;
00178 multi_gather<T> gather(comm);
00179
00180
00181 for (int root=0; trials.has_next() && root < size; root++) {
00182 trial cur_trial = trials.next();
00183
00184
00185 std::vector<size_t> sample_ids;
00186 boost::random_number_generator<random_t> rng(random);
00187 random_subset(trials.num_objects, cur_trial.sample_size, std::back_inserter(sample_ids), rng);
00188
00189
00190 std::vector<int> sources;
00191 std::transform(sample_ids.begin(), sample_ids.end(), std::back_inserter(sources),
00192 std::bind2nd(std::divides<size_t>(), objects.size()));
00193 sources.erase(std::unique(sources.begin(), sources.end()), sources.end());
00194
00195
00196 std::vector<size_t> sample_indices;
00197 transform(std::lower_bound(sample_ids.begin(), sample_ids.end(), objects.size() * rank),
00198 std::lower_bound(sample_ids.begin(), sample_ids.end(), objects.size() * (rank + 1)),
00199 std::back_inserter(sample_indices),
00200 std::bind2nd(std::minus<int>(), objects.size() * rank));
00201
00202
00203 gather.start(boost::make_permutation_iterator(objects.begin(), sample_indices.begin()),
00204 boost::make_permutation_iterator(objects.begin(), sample_indices.end()),
00205 sources.begin(), sources.end(), my_objects, root);
00206
00207
00208 if (rank == root) {
00209 my_k = cur_trial.k;
00210 my_trial = trials.count() - 1;
00211 my_ids.swap(sample_ids);
00212 }
00213 }
00214 timer.record("StartGather");
00215
00216
00217 gather.finish();
00218 timer.record("FinishGather");
00219
00220
00221 if (my_trial >= 0) {
00222 kmedoids cluster;
00223 cluster.set_epsilon(epsilon);
00224
00225 dissimilarity_matrix mat;
00226 build_dissimilarity_matrix(my_objects, dmetric, mat);
00227 cluster.pam(mat, my_k);
00228 timer.record("LocalCluster");
00229
00230
00231
00232 for (size_t m=0; m < cluster.medoid_ids.size(); m++) {
00233 all_medoids[my_trial].push_back(
00234 make_id_pair(my_objects[cluster.medoid_ids[m]], my_ids[cluster.medoid_ids[m]]));
00235 }
00236 pack_vector(comm, all_medoids[my_trial], bcast_buffers[my_trial]);
00237 timer.record("PackForBroadcast");
00238 }
00239
00240
00241
00242 for (size_t trial_id = i * size; trial_id < trials.count(); trial_id++) {
00243 int root = trial_id % size;
00244
00245
00246 size_t packed_size = bcast_buffers[trial_id].size();
00247 CMPI_Bcast(&packed_size, 1, MPI_SIZE_T, root, comm);
00248 bcast_buffers[trial_id].resize(packed_size);
00249
00250
00251 CMPI_Bcast(&bcast_buffers[trial_id][0], packed_size, MPI_PACKED, root, comm);
00252 }
00253 timer.record("Broadcast");
00254
00255
00256 for (size_t trial_id = i * size; trial_id < trials.count(); trial_id++) {
00257 if ((size_t)my_trial != trial_id) {
00258 unpack_vector(comm, bcast_buffers[trial_id], all_medoids[trial_id]);
00259 }
00260
00261
00262 bcast_buffers[trial_id].clear();
00263 }
00264 timer.record("UnpackFromBroadcast");
00265 }
00266 }
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295 template <class T, class D>
00296 void capek(const std::vector<T>& objects, D dmetric, size_t k, std::vector<T> *medoids = NULL)
00297 {
00298 int size, rank;
00299 CMPI_Comm_size(comm, &size);
00300 CMPI_Comm_rank(comm, &rank);
00301
00302 seed_random_uniform(comm);
00303
00304
00305
00306 size_t num_objects = size * objects.size();
00307 k = std::min(num_objects, k);
00308 timer.record("Init");
00309
00310
00311
00312 std::vector<typename id_pair<T>::vector> all_medoids(max_reps);
00313 trial_generator trials(k, k, max_reps, init_size, num_objects);
00314 run_pam_trials(trials, objects, dmetric, all_medoids, comm);
00315
00316
00317 std::vector<double> all_dissimilarities(trials.count(), 0.0);
00318 std::vector< std::vector<medoid_id> > all_cluster_ids(trials.count());
00319
00320
00321
00322 for (size_t i=0; i < trials.count(); i++) {
00323 for (size_t o=0; o < objects.size(); o++) {
00324 object_id global_oid = rank + o;
00325 std::pair<double, size_t> closest = closest_medoid(objects[o], global_oid, all_medoids[i], dmetric);
00326
00327 all_dissimilarities[i] += closest.first;
00328 all_cluster_ids[i].push_back(closest.second);
00329 }
00330 }
00331 timer.record("FindMinima");
00332
00333
00334
00335 std::vector<double> sums(trials.count());
00336
00337 CMPI_Reduce(&all_dissimilarities[0], &sums[0], trials.count(), MPI_DOUBLE, MPI_SUM, 0, comm);
00338 CMPI_Bcast(&sums[0], trials.count(), MPI_DOUBLE, 0, comm);
00339 timer.record("GlobalSums");
00340
00341
00342 std::vector<double>::iterator min_sum = std::min_element(sums.begin(), sums.end());
00343 total_dissimilarity = *min_sum;
00344 size_t best = (min_sum - sums.begin());
00345
00346
00347
00348 medoid_ids.resize(all_medoids[best].size());
00349 for (size_t i = 0; i < medoid_ids.size(); i++) {
00350 medoid_ids[i] = all_medoids[best][i].id;
00351 }
00352
00353
00354 std::vector<size_t> mapping(medoid_ids.size());
00355 std::generate(mapping.begin(), mapping.end(), sequence());
00356 std::sort(mapping.begin(), mapping.end(), indexed_lt(medoid_ids));
00357 invert(mapping);
00358
00359
00360 for (size_t i=0; i < medoid_ids.size(); i++) {
00361 medoid_ids[i] = all_medoids[best][mapping[i]].id;
00362 }
00363
00364
00365 cluster_ids.swap(all_cluster_ids[best]);
00366
00367
00368 if (medoids) {
00369 medoids->resize(medoid_ids.size());
00370 for (size_t i=0; i < medoid_ids.size(); i++) {
00371 (*medoids)[i] = all_medoids[best][mapping[i]].element;
00372 }
00373 }
00374
00375 timer.record("BicScore");
00376 }
00377
00378
00379
00380
00381
00382
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408 template <class T, class D>
00409 double xcapek(const std::vector<T>& objects, D dmetric, size_t max_k, size_t dimensionality,
00410 std::vector<T> *medoids = NULL)
00411 {
00412 int size, rank;
00413 CMPI_Comm_size(comm, &size);
00414 CMPI_Comm_rank(comm, &rank);
00415
00416 seed_random_uniform(comm);
00417
00418
00419
00420 size_t num_objects = size * objects.size();
00421 max_k = std::min(num_objects, max_k);
00422 timer.record("Init");
00423
00424 std::vector<typename id_pair<T>::vector> all_medoids(max_k * max_reps);
00425 trial_generator trials(max_k, max_reps, init_size, num_objects);
00426 run_pam_trials(trials, objects, dmetric, all_medoids, comm);
00427
00428
00429 std::vector<double> all_dissimilarities(trials.count(), 0.0);
00430 std::vector< std::vector<medoid_id> > all_cluster_ids(trials.count());
00431
00432 std::vector<double> all_dissim2;
00433 std::vector<size_t> cluster_sizes;
00434
00435
00436
00437 for (size_t i=0; i < trials.count(); i++) {
00438 const size_t num_medoids = all_medoids[i].size();
00439 for (size_t m=0; m < num_medoids; m++) {
00440 all_dissim2.push_back(0.0);
00441 cluster_sizes.push_back(0);
00442 }
00443 double *dissim2 = &all_dissim2[all_dissim2.size() - num_medoids];
00444 size_t *sizes = &cluster_sizes[cluster_sizes.size() - num_medoids];
00445
00446 for (size_t o=0; o < objects.size(); o++) {
00447 object_id global_oid = rank + o;
00448 std::pair<double, size_t> closest = closest_medoid(objects[o], global_oid, all_medoids[i], dmetric);
00449
00450 all_dissimilarities[i] += closest.first;
00451 dissim2[closest.second] += closest.first * closest.first;
00452 sizes[closest.second] += 1;
00453 all_cluster_ids[i].push_back(closest.second);
00454 }
00455 }
00456 timer.record("FindMinima");
00457
00458
00459
00460
00461 std::vector<double> sums(trials.count());
00462 std::vector<double> sums2(all_dissim2.size());
00463 std::vector<size_t> sizes(cluster_sizes.size());
00464
00465 CMPI_Reduce(&all_dissimilarities[0], &sums[0], trials.count(), MPI_DOUBLE, MPI_SUM, 0, comm);
00466 CMPI_Reduce(&all_dissim2[0], &sums2[0], sums2.size(), MPI_DOUBLE, MPI_SUM, 0, comm);
00467 CMPI_Bcast(&sums[0], trials.count(), MPI_DOUBLE, 0, comm);
00468 CMPI_Bcast(&sums2[0], sums2.size(), MPI_DOUBLE, 0, comm);
00469 CMPI_Allreduce(&cluster_sizes[0], &sizes[0], sizes.size(), MPI_SIZE_T, MPI_SUM, comm);
00470 timer.record("GlobalSums");
00471
00472
00473 std::vector<double>::iterator min_sum = std::min_element(sums.begin(), sums.end());
00474 total_dissimilarity = *min_sum;
00475 size_t best = (min_sum - sums.begin());
00476
00477
00478 size_t best_trial = 0;
00479 best_bic_score = -DBL_MAX;
00480
00481 size_t trial_offset = 0;
00482 for (size_t i=0; i < trials.count(); i++) {
00483 size_t k = all_medoids[i].size();
00484 double cur_bic = bic(k, &sizes[trial_offset], &sums2[trial_offset], dimensionality);
00485 if (cur_bic > best_bic_score) {
00486 best_trial = i;
00487 best_bic_score = cur_bic;
00488 }
00489 trial_offset += k;
00490 }
00491
00492 best = best_trial;
00493
00494
00495 medoid_ids.resize(all_medoids[best].size());
00496 for (size_t i = 0; i < medoid_ids.size(); i++) {
00497 medoid_ids[i] = all_medoids[best][i].id;
00498 }
00499
00500
00501 std::vector<size_t> mapping(medoid_ids.size());
00502 std::generate(mapping.begin(), mapping.end(), sequence());
00503 std::sort(mapping.begin(), mapping.end(), indexed_lt(medoid_ids));
00504 invert(mapping);
00505
00506
00507 for (size_t i=0; i < medoid_ids.size(); i++) {
00508 medoid_ids[i] = all_medoids[best][mapping[i]].id;
00509 }
00510
00511
00512 cluster_ids.swap(all_cluster_ids[best]);
00513
00514
00515 if (medoids) {
00516 medoids->resize(medoid_ids.size());
00517 for (size_t i=0; i < medoid_ids.size(); i++) {
00518 (*medoids)[i] = all_medoids[best][mapping[i]].element;
00519 }
00520 }
00521
00522 timer.record("BicScore");
00523 return best_bic_score;
00524 }
00525
00526
00527 const Timer& get_timer() { return timer; }
00528
00529 protected:
00530 typedef boost::mt19937 random_t;
00531 random_t random;
00532
00533 double total_dissimilarity;
00534 double best_bic_score;
00535 size_t init_size;
00536 size_t max_reps;
00537 double epsilon;
00538
00539 Timer timer;
00540
00541
00542
00543
00544
00545 void seed_random_uniform(MPI_Comm comm);
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555 template <class T>
00556 void pack_vector(MPI_Comm comm, const std::vector<T>& packables, std::vector<char>& buffer) {
00557
00558 int packed_size = 0;
00559 packed_size += cmpi_packed_size(1, MPI_SIZE_T, comm);
00560 for (size_t i=0; i < packables.size(); i++) {
00561 packed_size += packables[i].packed_size(comm);
00562 }
00563 buffer.resize(packed_size);
00564
00565
00566 int pos = 0;
00567 size_t num_packables = packables.size();
00568 CMPI_Pack(&num_packables, 1, MPI_SIZE_T, &buffer[0], packed_size, &pos, comm);
00569 for (size_t i=0; i < num_packables; i++) {
00570 packables[i].pack(&buffer[0], packed_size, &pos, comm);
00571 }
00572 }
00573
00574
00575
00576
00577 template <class T>
00578 void unpack_vector(MPI_Comm comm, const std::vector<char>& buffer, std::vector<T>& packables) {
00579 int pos = 0;
00580 size_t num_packables;
00581 CMPI_Unpack((void*)&buffer[0], buffer.size(), &pos, &num_packables, 1, MPI_SIZE_T, comm);
00582
00583 packables.resize(num_packables);
00584 for (size_t i=0; i < packables.size(); i++) {
00585 packables[i] = T::unpack((void*)&buffer[0], buffer.size(), &pos, comm);
00586 }
00587 }
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599 template <typename T, typename D>
00600 std::pair<double, size_t> closest_medoid(
00601 const T& object, object_id oid, const std::vector< id_pair<T> >& medoids, D dmetric
00602 ) {
00603 double min_distance = DBL_MAX;
00604 size_t min_id = medoids.size();
00605 for (size_t m=0; m < medoids.size(); m++) {
00606 double d = dmetric(medoids[m].element, object);
00607 if (d < min_distance || medoids[m].id == oid) {
00608 min_distance = d;
00609 min_id = m;
00610 }
00611 }
00612 return std::make_pair(min_distance, min_id);
00613 }
00614
00615 };
00616
00617 }
00618
00619 #endif // PAR_KMEDOIDS_H