00001 ////////////////////////////////////////////////////////////////////////////////////////////////// 00002 // Copyright (c) 2010, Lawrence Livermore National Security, LLC. 00003 // Produced at the Lawrence Livermore National Laboratory 00004 // LLNL-CODE-433662 00005 // All rights reserved. 00006 // 00007 // This file is part of Muster. For details, see http://github.com/tgamblin/muster. 00008 // Please also read the LICENSE file for further information. 00009 // 00010 // Redistribution and use in source and binary forms, with or without modification, are 00011 // permitted provided that the following conditions are met: 00012 // 00013 // * Redistributions of source code must retain the above copyright notice, this list of 00014 // conditions and the disclaimer below. 00015 // * Redistributions in binary form must reproduce the above copyright notice, this list of 00016 // conditions and the disclaimer (as noted below) in the documentation and/or other materials 00017 // provided with the distribution. 00018 // * Neither the name of the LLNS/LLNL nor the names of its contributors may be used to endorse 00019 // or promote products derived from this software without specific prior written permission. 00020 // 00021 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS 00022 // OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 00023 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 00024 // LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE 00025 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 00026 // (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00027 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00028 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00029 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00030 ////////////////////////////////////////////////////////////////////////////////////////////////// 00031 00032 /// 00033 /// @file partition.h 00034 /// @author Todd Gamblin tgamblin@llnl.gov 00035 /// @brief Class to represent a partitioning of a data set. 00036 /// 00037 #ifndef PARTITION_H 00038 #define PARTITION_H 00039 00040 #include <cstddef> 00041 #include <vector> 00042 #include <set> 00043 #include <ostream> 00044 #include <stdint.h> 00045 00046 /// 00047 /// Namespace for everything in the cluster library. 00048 /// 00049 namespace cluster { 00050 00051 typedef size_t medoid_id; ///< More descriptive type for medoid index 00052 typedef size_t object_id; ///< More descriptive type for object index 00053 00054 /// 00055 /// Explicit representation of a clustering. Instead of a vecto of representative 00056 /// ids, this has <i>k</i> sets of object_ids indicating which objects are in a 00057 /// particular cluster. You can convert a partition to a cluster_list with 00058 /// to_cluster_list(). 00059 /// 00060 typedef std::vector< std::set<object_id> > cluster_list; 00061 00062 /// 00063 /// This represents a partitioning of a data set. The data set consists of 00064 /// <i>objects</i>, each of which has an associated non-negative object_id. 00065 /// 00066 /// A partitioning divides a data set into groups, or <i>clusters</i>, and the 00067 /// partition object stores information about these clusters internally. In 00068 /// particular, it contains a vector of object_ids that indicate the 00069 /// <i>representative</i> object, or <i>medoid</i> of each cluster. It also 00070 /// contains a vector of medoid_ids indicating which cluster each object 00071 /// in the data set belongs to. 00072 /// 00073 /// Partition objects can be converted to a cluster_list, which is a more 00074 /// explicit representation of the partitioning. 00075 /// 00076 struct partition { 00077 /// Gives the index of the object that is the ith medoid. 00078 /// medoids[i] == index in object array for last call to findClusters() 00079 std::vector<object_id> medoid_ids; 00080 00081 /// Gives cluster id (index in medoids) for the ith object. 00082 /// clusterid[i] == id of cluster of which object i is a member. 00083 /// medoids[clusterid[i]] == representative of that cluster. 00084 std::vector<medoid_id> cluster_ids; 00085 00086 /// Constructor. Can optionall supply the number of objects to be partitioned 00087 /// and this will start out with one cluster containing all of them. 00088 partition(size_t num_objects = 0); 00089 00090 /// Virtual destructor; currently does nothing. 00091 virtual ~partition(); 00092 00093 /// True if and only if object i is a medoid. 00094 bool is_medoid(object_id oi) const { 00095 return medoid_ids[cluster_ids[oi]] == oi; 00096 } 00097 00098 /// Creates a list of std::sets from the partition info in 00099 /// medoids and cluster_ids. 00100 void to_cluster_list(cluster_list& list) const; 00101 00102 /// Fast swap with other patrition objects 00103 void swap(partition& other); 00104 00105 /// puts medoids in order by their object id, and adjusts cluster_ids accordingly. 00106 void sort(); 00107 00108 /// Total number of objects in the partition 00109 size_t size() const { return cluster_ids.size(); } 00110 00111 /// Total number of clusters in the partition 00112 size_t num_clusters() const { return medoid_ids.size(); } 00113 00114 /// Number of objects in cluster i 00115 size_t size(size_t i) const; 00116 00117 /// Write the members of cluster m out to the output stream as object_ids 00118 template <class OutputIterator> 00119 void write_members(medoid_id m, OutputIterator out) { 00120 for (object_id o=0; o < cluster_ids.size(); o++) { 00121 if (cluster_ids[o] == m) { 00122 *out++ = o; 00123 } 00124 } 00125 } 00126 00127 /// Write the members of cluster m out to the output stream formatted nicely with 00128 /// hyphenated runs of consecutive numbers 00129 void write_members_with_runs(medoid_id m, std::ostream& out); 00130 00131 /// writable structure returned by members() function. 00132 struct member_writer { 00133 partition* p; 00134 medoid_id m; 00135 member_writer(partition *_p, medoid_id _m) : p(_p), m(_m) { } 00136 }; 00137 member_writer members(medoid_id m) { return member_writer(this, m); } 00138 00139 }; // struct partition 00140 00141 inline std::ostream& operator<<(std::ostream& out, const partition::member_writer& mw) { 00142 mw.p->write_members_with_runs(mw.m, out); 00143 return out; 00144 } 00145 00146 /// Prints out nicely formatted clustering 00147 std::ostream& operator<<(std::ostream& out, const cluster_list& list); 00148 00149 /// For convenience 00150 std::ostream& operator<<(std::ostream& out, const partition& km); 00151 00152 /// Mirkin distance bt/w two clusterings. 00153 double mirkin_distance(const cluster_list& c1, const cluster_list& c2); 00154 00155 /// Convenience overload for comparing partition objects directly 00156 double mirkin_distance(const partition& c1, const partition& c2); 00157 00158 /// 00159 /// Expand a cluster_list by l levels. That is, replace each index i 00160 /// in the cluster_list with indices in [2^l * i ... 2^l * (i+1) - 1] 00161 /// 00162 /// @todo deprecate and delete this. 00163 /// 00164 void expand(cluster_list& list, size_t level = 1); 00165 00166 /// 00167 /// Compute the total dissimilarity between all objects and their medoids. 00168 /// 00169 template <typename D> 00170 double total_dissimilarity(const partition& p, D dist) { 00171 double dissim = 0.0; 00172 for (size_t i=0; i < p.cluster_ids.size(); i++) { 00173 dissim += dist(i, p.medoid_ids[p.cluster_ids[i]]); 00174 } 00175 return dissim; 00176 } 00177 00178 /// 00179 /// Compute the total dissimilarity between all objects in a particular cluster and its medoid. 00180 /// 00181 template <typename D> 00182 double total_dissimilarity(const partition& p, D dist, medoid_id m) { 00183 double dissim = 0.0; 00184 for (size_t i=0; i < p.cluster_ids.size(); i++) { 00185 if (p.cluster_ids[i] == m) { 00186 dissim += dist(i, p.medoid_ids[p.cluster_ids[i]]); 00187 } 00188 } 00189 return dissim; 00190 } 00191 00192 /// 00193 /// Compute the total squared dissimilarity between all objects and their medoids. 00194 /// 00195 template <typename D> 00196 double total_squared_dissimilarity(const partition& p, D dist) { 00197 double dissim2 = 0.0; 00198 for (size_t i=0; i < p.cluster_ids.size(); i++) { 00199 double d = dist(i, p.medoid_ids[p.cluster_ids[i]]); 00200 dissim2 += d * d; 00201 } 00202 return dissim2; 00203 } 00204 00205 /// 00206 /// Compute the total squared dissimilarity between all objects in a particular 00207 /// cluster and its medoid. 00208 /// 00209 template <typename D> 00210 double total_squared_dissimilarity(const partition& p, D dist, medoid_id m) { 00211 double dissim2 = 0.0; 00212 for (size_t i=0; i < p.cluster_ids.size(); i++) { 00213 if (p.cluster_ids[i] == m) { 00214 double d = dist(i, p.medoid_ids[p.cluster_ids[i]]); 00215 dissim2 += d * d; 00216 } 00217 } 00218 return dissim2; 00219 } 00220 00221 00222 } // namespace cluster 00223 00224 #endif // PARTITION_H