Commit 1f329e1a authored by Matthias Carnein's avatar Matthias Carnein

Initial commit

parent 49f9396b
#include <math.h>
#include "MC.hpp"
/**
 * Builds a micro-cluster from an explicit centre, time stamp and weight.
 *
 * @param centroid   coordinates of the cluster centre (stored as a copy)
 * @param lastUpdate time stamp recorded as the moment of the last update
 * @param weight     initial weight of the cluster
 */
MC::MC(std::vector<double> centroid, int lastUpdate, double weight)
    : centroid(centroid), lastUpdate(lastUpdate), weight(weight) {
}
/**
 * Builds a micro-cluster with the default weight of 1.
 *
 * @param centroid   coordinates of the cluster centre (stored as a copy)
 * @param lastUpdate time stamp recorded as the moment of the last update
 */
MC::MC(std::vector<double> centroid, int lastUpdate)
    : centroid(centroid), lastUpdate(lastUpdate), weight(1) {
}
double MC::getWeight(){
return(this->weight);
}
std::vector<double> MC::getCentroid(){
return(this->centroid);
}
/**
 * Absorbs another micro-cluster into this one.
 *
 * Both clusters are faded to time t, the weights are added, and this centre is
 * moved towards the other centre by a Gaussian factor of their distance.
 *
 * @param mc     cluster to absorb; passed by value, so the caller's object is not modified
 * @param t      current time stamp
 * @param lambda fading factor forwarded to fade()
 * @param r      radius used to scale the distance inside the Gaussian factor
 */
void MC::merge(MC mc, int t, double lambda, double r) {
    // bring both clusters to the same point in time (the two calls are independent)
    this->fade(t, lambda);
    mc.fade(t, lambda);

    // update statistics
    weight += mc.weight;

    // competitive learning: the factor depends only on the distance measured
    // before the centre moves, so it is the same for every dimension
    const double dist = distance(mc);
    const double pull = exp(-pow(dist / r * 3.0, 2.0) / 2.0);
    const std::vector<double> other = mc.getCentroid();
    for (std::vector<double>::size_type dim = 0; dim < centroid.size(); ++dim) {
        centroid[dim] += pull * (other[dim] - centroid[dim]);
    }
}
void MC::fade(int t, double lambda){
// apply fading
this->weight *= pow(2,(-lambda * (t-this->lastUpdate)));
// update time
this-> lastUpdate = t;
}
/**
 * Euclidean distance between the centres of this cluster and mc.
 *
 * Iterates over the dimensions of this cluster's centre; mc is expected to
 * have at least as many coordinates.
 *
 * @param mc cluster to measure against
 * @return square root of the summed squared coordinate differences
 */
double MC::distance(MC mc){
    const std::vector<double> &own = this->centroid;
    const std::vector<double> &other = mc.centroid;
    double squared = 0.0;
    for (std::vector<double>::size_type dim = 0; dim < own.size(); ++dim) {
        const double diff = own[dim] - other[dim];
        squared += pow(diff, 2);
    }
    return sqrt(squared);
}
/**
 * Euclidean distance between the centre of this cluster and a raw point.
 *
 * Iterates over the dimensions of this cluster's centre; x is expected to
 * have at least as many coordinates.
 *
 * @param x coordinates of the point (only read)
 * @return square root of the summed squared coordinate differences
 */
double MC::distance(std::vector<double> &x){
    const std::vector<double> &own = this->centroid;
    double squared = 0.0;
    for (std::vector<double>::size_type dim = 0; dim < own.size(); ++dim) {
        const double diff = own[dim] - x[dim];
        squared += pow(diff, 2);
    }
    return sqrt(squared);
}
\ No newline at end of file
#include <vector>
// Micro-cluster: a weighted centre with the time stamp of its last update.
class MC {
private:
// coordinates of the cluster centre
std::vector<double> centroid;
// time stamp at which the weight was last faded (set by the constructors and by fade())
int lastUpdate;
// cluster weight; decays in fade() and grows in merge()
double weight;
public:
// create a micro-cluster with an explicit weight
MC(std::vector<double> centroid, int lastUpdate, double weight);
// create a micro-cluster with weight 1
MC(std::vector<double> centroid, int lastUpdate);
// returns a copy of the centre
std::vector<double> getCentroid();
// returns the stored weight without applying any fading
double getWeight();
// Euclidean distance between the two centres
double distance(MC mc);
// Euclidean distance between the centre and the point x
double distance(std::vector<double> &x);
// fade both clusters to time t, add mc's weight and move the centre towards mc;
// r scales the distance inside the Gaussian adaptation factor
void merge(MC mc, int t, double lambda, double r);
// multiply the weight by 2^(-lambda * (t - lastUpdate)) and set lastUpdate to t
void fade(int t, double lambda);
};
# evoStream - Evolutionary Stream Clustering Utilizing Idle Times
This is the implementation of an evolutionary stream clustering algorithm as proposed in our article in the Journal of Big Data Research.
The online component uses a simplified version of `DBSTREAM` to generate micro-clusters.
The micro-clusters are then incrementally reclustered using an evolutionary algorithm.
Evolutionary algorithms create slight variations by combining and randomly modifying existing solutions.
By iteratively selecting better solutions, an evolutionary pressure is created which improves the clustering over time.
Since the evolutionary algorithm is incremental, it is possible to apply it between observations, e.g. in the idle time of the stream.
Whenever there is idle time, we can call the `recluster` function of the reference class to improve the macro-clusters (see example).
The evolutionary algorithm can also be applied as a traditional reclustering step, or a combination of both.
In addition, this implementation also allows evaluating a fixed number of generations after each observation.
## Installation
This is the Python port of evoStream. It is based on the C++ implementation with wrappers for Python.
In order to install the module, run the following command in the module's main directory:
```
python setup.py install --force
```
For convenience, the command can be issued using the `install.bat` or `install.sh` files.
## Usage
Once installed, the interfaces are the same as in the C++ and R implementations:
```Python
import evoStream
evo = evoStream.EvoStream(0.05, 0.001, 100, 4, .8, .001, 100, 2*4, 250) ## init
evo.cluster([10.0, 20.0, 30.0]) ## read observation
evo.get_microweights()
evo.get_microclusters()
evo.get_macroclusters()
evo.get_macroweights()
evo.recluster(100) ## evaluate 100 more macro solutions
evo.microToMacro()
## Full Example: Read CSV file (here: comma-separated, numeric values)
import csv
evo = evoStream.EvoStream(0.05, 0.001, 100, 4, .8, .001, 100, 2*4, 250);
with open('data.csv', 'r') as csvfile:
reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
for row in reader:
evo.cluster(row)
evo.recluster(1) # evaluate 1 generation after every observation. This can be adapted to the available time
print("Micro Clusters:")
x = evo.get_microclusters()
print(x)
print("\nMicro Weights:")
x = evo.get_microweights()
print(x)
print("\nMacro Clusters (here: performs an additional 250 reclustering steps, see parameter)")
x = evo.get_macroclusters()
print(x)
print("\nMacro Weights")
x = evo.get_macroweights()
print(x)
print("\nAssignment of Micro Clusters to Macro Clusters")
x = evo.microToMacro()
print(x)
```
## Related Implementations
The original implementation is available as an R-package here: [https://wiwi-gitlab.uni-muenster.de/m_carn01/evoStream](https://wiwi-gitlab.uni-muenster.de/m_carn01/evoStream)
There is also a C++ port of evoStream. It is available here: [https://wiwi-gitlab.uni-muenster.de/m_carn01/evoStream_C](https://wiwi-gitlab.uni-muenster.de/m_carn01/evoStream_C)
This diff is collapsed.
#include <iostream>
#include <vector>
#include <random>
#include "MC.hpp"
// Stream clusterer: maintains micro-clusters (MC) online and a population of
// macro-cluster solutions that is improved by an evolutionary algorithm.
// NOTE(review): the implementation (evoStream.cpp) is not visible here; comments
// on individual members are inferred from names and the README and should be confirmed.
class EvoStream {
private:
// --- configuration, in the order of the constructor arguments ---
double r; // presumably the micro-cluster radius (also passed to MC::merge) — confirm
double lambda; // presumably the fading factor used by MC::fade — confirm
int tgap; // presumably the interval between cleanup runs — confirm
unsigned int k; // presumably the number of macro-clusters — confirm
double crossoverRate;
double mutationRate;
unsigned int populationSize;
unsigned int initializeAfter;
int reclusterGenerations;
// --- internal state ---
double omega;
int t;
int init;
int upToDate;
double delay;
int initTime;
std::vector<MC> micro;
std::vector<std::vector< std::vector<double> > > macro; // [solution][cluster][centre]
std::vector<double> macroFitness;
std::mt19937 rng;
// --- micro-cluster maintenance ---
void updateWeights();
void removeMicroCluster(int i);
void insert(std::vector<double> &distances, MC mc);
// --- evolutionary algorithm ---
void evolution();
void calculateFitness();
double fitness(std::vector< std::vector<double> > &centres);
void initialize();
std::vector<std::vector< std::vector<double> > > selection();
std::vector<std::vector< std::vector<double> > > recombination(std::vector<std::vector< std::vector<double> > > &individuals);
std::vector<std::vector< std::vector<double> > > mutation(std::vector<std::vector< std::vector<double> > > &individuals);
std::vector<std::vector< std::vector<double> > > recombinationGAclust(std::vector<std::vector< std::vector<double> > > &individuals);
std::vector<std::vector< std::vector<double> > > recombinationPESAII(std::vector<std::vector< std::vector<double> > > &individuals);
std::vector<std::vector< std::vector<double> > > mutationGAclust(std::vector<std::vector< std::vector<double> > > &individuals);
std::vector<std::vector< std::vector<double> > > mutationPESAII(std::vector<std::vector< std::vector<double> > > &individuals);
// --- helpers ---
int ndimensions();
int sampleProportionally(std::vector<double> &data);
std::vector<double> getDistanceVector(MC mc, std::vector<MC> &cluster);
public:
// Note: populationSize is taken as int here but stored as unsigned int.
EvoStream(double r, double lambda, int tgap, unsigned int k, double crossoverRate, double mutationRate, int populationSize, unsigned int initializeAfter, int reclusterGenerations);
// centres of the micro-clusters, one inner vector per cluster
std::vector< std::vector<double> > get_microclusters();
// weights of the micro-clusters
std::vector<double> get_microweights();
// centres of the macro-clusters; per the README this performs additional
// reclustering steps according to the reclusterGenerations parameter
std::vector< std::vector<double> > get_macroclusters();
// weights of the macro-clusters
std::vector<double> get_macroweights();
// assignment of micro-clusters to macro-clusters
std::vector<int> microToMacro();
// evaluate the given number of additional generations (see README usage)
void recluster(int generations);
std::vector<int> getAssignment(std::vector< std::vector<double> > &centres);
double getMaxFitness();
// read one observation from the stream
void cluster(std::vector<double> &data);
void cleanup();
};
#include <Python.h>
#include <cstdio>
#include <exception>
#include <new>
#include <vector>
#include "evoStream.hpp"
// Instance layout of the Python type evoStream.EvoStream.
typedef struct {
PyObject_HEAD
// wrapped C++ object; created in PyEvoStream_init and deleted in PyEvoStream_dealloc
EvoStream * evoStream;
} PyEvoStream;
// Module definition handed to PyModule_Create in PyInit_evoStream.
static PyModuleDef evoStreammodule = {
PyModuleDef_HEAD_INIT,
"evoStream", // m_name
"Example module that wrapped a C++ object", // m_doc
-1, // m_size
NULL, NULL, NULL, NULL, NULL // m_methods, m_slots, m_traverse, m_clear, m_free
};
/**
 * tp_init of evoStream.EvoStream: parses the nine positional arguments
 * (r, lambda, tgap, k, crossoverRate, mutationRate, populationSize,
 * initializeAfter, reclusterGenerations) and creates the wrapped EvoStream.
 *
 * Calling __init__ again on an existing object replaces the wrapped instance
 * instead of leaking it. C++ exceptions raised during construction are
 * translated into Python exceptions, because they must not propagate into the
 * interpreter's C frames.
 *
 * @return 0 on success, -1 with a Python exception set on failure
 */
static int PyEvoStream_init(PyEvoStream *self, PyObject *args, PyObject *kwds){
    double r;
    double lambda;
    int tgap;
    int k;
    double crossoverRate;
    double mutationRate;
    int populationSize;
    int initializeAfter;
    int reclusterGenerations;
    if (!PyArg_ParseTuple(args, "ddiiddiii", &r, &lambda, &tgap, &k, &crossoverRate, &mutationRate, &populationSize, &initializeAfter, &reclusterGenerations))
        return -1;

    // The EvoStream constructor takes both values as unsigned int; a negative
    // argument would silently wrap around to a huge positive number.
    if (k < 0 || initializeAfter < 0) {
        PyErr_SetString(PyExc_ValueError, "k and initializeAfter must be non-negative");
        return -1;
    }

    EvoStream *fresh = NULL;
    try {
        fresh = new EvoStream(r, lambda, tgap, k, crossoverRate, mutationRate, populationSize, initializeAfter, reclusterGenerations);
    } catch (const std::bad_alloc &) {
        PyErr_NoMemory();
        return -1;
    } catch (const std::exception &e) {
        PyErr_SetString(PyExc_RuntimeError, e.what());
        return -1;
    }

    // The object memory is zero-initialised by PyType_GenericNew, so this is a
    // no-op on first initialisation and frees the old instance on re-init.
    delete self->evoStream;
    self->evoStream = fresh;
    return 0;
}
/**
 * tp_dealloc of evoStream.EvoStream: destroys the wrapped C++ object and then
 * releases the Python object's memory through the type's tp_free slot.
 */
static void PyEvoStream_dealloc(PyEvoStream * self){
    EvoStream *wrapped = self->evoStream;
    delete wrapped;
    Py_TYPE(self)->tp_free(static_cast<void *>(self));
}
/**
 * cluster(observation): feeds one observation into the stream clusterer.
 *
 * Accepts any Python sequence of numbers (list, tuple, a csv row, ...).
 * Every element is converted with PyFloat_AsDouble; a non-sequence argument or
 * a non-numeric element raises the corresponding Python exception instead of
 * inserting a bogus -1.0 and returning with an error still pending.
 *
 * @return Python int 0 on success, NULL with an exception set on failure
 */
static PyObject *PyEvoStream_cluster(PyEvoStream *self, PyObject *args)
{
    PyObject *float_list;
    if (!PyArg_ParseTuple(args, "O", &float_list))
        return NULL;

    // Py_ssize_t avoids truncating the length to int
    const Py_ssize_t size = PySequence_Size(float_list);
    if (size < 0)
        return NULL;

    std::vector<double> data(static_cast<std::vector<double>::size_type>(size));
    for (Py_ssize_t i = 0; i < size; i++) {
        // new reference, works for every sequence type (not just list)
        PyObject *item = PySequence_GetItem(float_list, i);
        if (item == NULL)
            return NULL;
        const double value = PyFloat_AsDouble(item);
        Py_DECREF(item);
        // -1.0 is also a legal value, so the error indicator decides
        if (value == -1.0 && PyErr_Occurred())
            return NULL;
        data[static_cast<std::vector<double>::size_type>(i)] = value;
    }

    self->evoStream->cluster(data);
    return Py_BuildValue("i", 0);
}
/**
 * get_microclusters(): returns the micro-cluster centres as a list of lists of floats.
 *
 * The lists are pre-sized and filled with PyList_SET_ITEM, which takes over the
 * reference of each new object. PyList_Append does not steal references, so
 * appending freshly created objects without a Py_DECREF leaks every one of them.
 * Each row uses its own length rather than the length of the first row.
 *
 * @return new list object, or NULL with an exception set if an allocation fails
 */
static PyObject *PyEvoStream_get_microclusters(PyEvoStream *self, PyObject *args){
    const std::vector<std::vector<double> > clusters = self->evoStream->get_microclusters();
    PyObject *rows = PyList_New(static_cast<Py_ssize_t>(clusters.size()));
    if (rows == NULL)
        return NULL;
    for (std::vector<std::vector<double> >::size_type i = 0; i < clusters.size(); i++) {
        const std::vector<double> &centre = clusters[i];
        PyObject *cols = PyList_New(static_cast<Py_ssize_t>(centre.size()));
        if (cols == NULL) {
            Py_DECREF(rows);
            return NULL;
        }
        // hand the row over right away: a single Py_DECREF(rows) then cleans up everything
        PyList_SET_ITEM(rows, static_cast<Py_ssize_t>(i), cols);
        for (std::vector<double>::size_type j = 0; j < centre.size(); j++) {
            PyObject *value = PyFloat_FromDouble(centre[j]);
            if (value == NULL) {
                Py_DECREF(rows);
                return NULL;
            }
            PyList_SET_ITEM(cols, static_cast<Py_ssize_t>(j), value);
        }
    }
    return rows;
}
/**
 * get_microweights(): returns the micro-cluster weights as a list of floats.
 *
 * The list is pre-sized and filled with PyList_SET_ITEM, which takes over the
 * reference of each float. PyList_Append would add its own reference and leak
 * the one returned by PyFloat_FromDouble.
 *
 * @return new list object, or NULL with an exception set if an allocation fails
 */
static PyObject *PyEvoStream_get_microweights(PyEvoStream *self, PyObject *args){
    const std::vector<double> weights = self->evoStream->get_microweights();
    PyObject *newlist = PyList_New(static_cast<Py_ssize_t>(weights.size()));
    if (newlist == NULL)
        return NULL;
    for (std::vector<double>::size_type i = 0; i < weights.size(); i++) {
        PyObject *value = PyFloat_FromDouble(weights[i]);
        if (value == NULL) {
            Py_DECREF(newlist);
            return NULL;
        }
        PyList_SET_ITEM(newlist, static_cast<Py_ssize_t>(i), value);
    }
    return newlist;
}
/**
 * get_macroclusters(): returns the macro-cluster centres as a list of lists of floats.
 *
 * The lists are pre-sized and filled with PyList_SET_ITEM, which takes over the
 * reference of each new object. PyList_Append does not steal references, so
 * appending freshly created objects without a Py_DECREF leaks every one of them.
 * Each row uses its own length rather than the length of the first row.
 *
 * @return new list object, or NULL with an exception set if an allocation fails
 */
static PyObject *PyEvoStream_get_macroclusters(PyEvoStream *self, PyObject *args){
    const std::vector<std::vector<double> > clusters = self->evoStream->get_macroclusters();
    PyObject *rows = PyList_New(static_cast<Py_ssize_t>(clusters.size()));
    if (rows == NULL)
        return NULL;
    for (std::vector<std::vector<double> >::size_type i = 0; i < clusters.size(); i++) {
        const std::vector<double> &centre = clusters[i];
        PyObject *cols = PyList_New(static_cast<Py_ssize_t>(centre.size()));
        if (cols == NULL) {
            Py_DECREF(rows);
            return NULL;
        }
        // hand the row over right away: a single Py_DECREF(rows) then cleans up everything
        PyList_SET_ITEM(rows, static_cast<Py_ssize_t>(i), cols);
        for (std::vector<double>::size_type j = 0; j < centre.size(); j++) {
            PyObject *value = PyFloat_FromDouble(centre[j]);
            if (value == NULL) {
                Py_DECREF(rows);
                return NULL;
            }
            PyList_SET_ITEM(cols, static_cast<Py_ssize_t>(j), value);
        }
    }
    return rows;
}
/**
 * get_macroweights(): returns the macro-cluster weights as a list of floats.
 *
 * The list is pre-sized and filled with PyList_SET_ITEM, which takes over the
 * reference of each float. PyList_Append would add its own reference and leak
 * the one returned by PyFloat_FromDouble.
 *
 * @return new list object, or NULL with an exception set if an allocation fails
 */
static PyObject *PyEvoStream_get_macroweights(PyEvoStream *self, PyObject *args){
    const std::vector<double> weights = self->evoStream->get_macroweights();
    PyObject *newlist = PyList_New(static_cast<Py_ssize_t>(weights.size()));
    if (newlist == NULL)
        return NULL;
    for (std::vector<double>::size_type i = 0; i < weights.size(); i++) {
        PyObject *value = PyFloat_FromDouble(weights[i]);
        if (value == NULL) {
            Py_DECREF(newlist);
            return NULL;
        }
        PyList_SET_ITEM(newlist, static_cast<Py_ssize_t>(i), value);
    }
    return newlist;
}
/**
 * microToMacro(): returns, as a list of ints, the assignment of micro-clusters
 * to macro-clusters produced by EvoStream::microToMacro().
 *
 * The list is pre-sized and filled with PyList_SET_ITEM, which takes over the
 * reference of each int. PyList_Append would add its own reference and leak
 * the one returned by PyLong_FromLong.
 *
 * @return new list object, or NULL with an exception set if an allocation fails
 */
static PyObject *PyEvoStream_microToMacro(PyEvoStream *self, PyObject *args){
    const std::vector<int> assignment = self->evoStream->microToMacro();
    PyObject *newlist = PyList_New(static_cast<Py_ssize_t>(assignment.size()));
    if (newlist == NULL)
        return NULL;
    for (std::vector<int>::size_type i = 0; i < assignment.size(); i++) {
        PyObject *value = PyLong_FromLong(assignment[i]);
        if (value == NULL) {
            Py_DECREF(newlist);
            return NULL;
        }
        PyList_SET_ITEM(newlist, static_cast<Py_ssize_t>(i), value);
    }
    return newlist;
}
/**
 * recluster(generations): runs the given number of additional generations of
 * the evolutionary reclustering.
 *
 * The argument-parsing result is checked: on a missing or non-integer argument
 * the function returns NULL with the TypeError set by PyArg_ParseTuple instead
 * of calling into C++ with an uninitialised value.
 *
 * @return Python int 0 on success, NULL with an exception set on failure
 */
static PyObject *PyEvoStream_recluster(PyEvoStream *self, PyObject *args){
    int generations;
    if (!PyArg_ParseTuple(args, "i", &generations))
        return NULL;
    self->evoStream->recluster(generations);
    return Py_BuildValue("i", 0);
}
// Method table of evoStream.EvoStream: Python method name, C implementation,
// calling convention and docstring. All entries use METH_VARARGS, including the
// getters, which ignore their argument tuple.
static PyMethodDef PyEvoStream_methods[] = {
{ "cluster", (PyCFunction)PyEvoStream_cluster, METH_VARARGS, "insert observation" },
{ "get_microclusters", (PyCFunction)PyEvoStream_get_microclusters, METH_VARARGS, "get centres of micro clusters" },
{ "get_microweights", (PyCFunction)PyEvoStream_get_microweights, METH_VARARGS, "get weight of micro clusters" },
{ "get_macroweights", (PyCFunction)PyEvoStream_get_macroweights, METH_VARARGS, "get weights of macro clusters" },
{ "get_macroclusters", (PyCFunction)PyEvoStream_get_macroclusters, METH_VARARGS, "get centres of macro clusters" },
{ "microToMacro", (PyCFunction)PyEvoStream_microToMacro, METH_VARARGS, "assignment of micro clusters to macro clusters" },
{ "recluster", (PyCFunction)PyEvoStream_recluster, METH_VARARGS, "start reclustering" },
{NULL} /* Sentinel */
};
// Type object of evoStream.EvoStream. Only tp_name is set statically; the other
// slots (tp_new, tp_basicsize, tp_dealloc, tp_flags, tp_doc, tp_methods, tp_init)
// are assigned in PyInit_evoStream before PyType_Ready is called.
static PyTypeObject PyEvoStreamType = { PyVarObject_HEAD_INIT(NULL, 0)
"evoStream.EvoStream" /* tp_name */
};
/**
 * Module initialisation function, called by the interpreter on `import evoStream`.
 *
 * Fills in the slots of PyEvoStreamType, finalises the type, creates the module
 * and registers the type under the name "EvoStream".
 *
 * @return the new module object, or NULL with an exception set on failure
 */
PyMODINIT_FUNC PyInit_evoStream(void){
    PyEvoStreamType.tp_new = PyType_GenericNew;
    PyEvoStreamType.tp_basicsize = sizeof(PyEvoStream);
    PyEvoStreamType.tp_dealloc = (destructor) PyEvoStream_dealloc;
    PyEvoStreamType.tp_flags = Py_TPFLAGS_DEFAULT;
    PyEvoStreamType.tp_doc = "EvoStream objects";
    PyEvoStreamType.tp_methods = PyEvoStream_methods;
    PyEvoStreamType.tp_init = (initproc) PyEvoStream_init;

    if (PyType_Ready(&PyEvoStreamType) < 0)
        return NULL;

    PyObject *m = PyModule_Create(&evoStreammodule);
    if (m == NULL)
        return NULL;

    // PyModule_AddObject steals the reference only on success; on failure both
    // the extra type reference and the half-initialised module must be released.
    Py_INCREF(&PyEvoStreamType);
    if (PyModule_AddObject(m, "EvoStream", (PyObject *)&PyEvoStreamType) < 0) {
        Py_DECREF(&PyEvoStreamType);
        Py_DECREF(m);
        return NULL;
    }
    return m;
}
# Usage example for the evoStream Python module.
import csv

import evoStream

# --- Minimal example -------------------------------------------------------
# Arguments: r, lambda, tgap, k, crossoverRate, mutationRate, populationSize,
# initializeAfter, reclusterGenerations
evo = evoStream.EvoStream(0.05, 0.001, 100, 4, .8, .001, 100, 2*4, 250)  ## init
evo.cluster([10.0, 20.0, 30.0])  ## read observation
evo.get_microweights()
evo.get_microclusters()
evo.get_macroclusters()
evo.get_macroweights()
evo.recluster(100)  ## evaluate 100 more macro solutions
evo.microToMacro()

## Full Example: Read CSV file (here: comma-separated, numeric values)
evo = evoStream.EvoStream(0.05, 0.001, 100, 4, .8, .001, 100, 2*4, 250)

# newline='' is what the csv module documentation requires for files passed to csv.reader
with open('data.csv', 'r', newline='') as csvfile:
    # QUOTE_NONNUMERIC makes the reader convert every unquoted field to float
    reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
    for row in reader:
        evo.cluster(row)
        evo.recluster(1)  # evaluate 1 generation after every observation. This can be adapted to the available time

print("Micro Clusters:")
x = evo.get_microclusters()
print(x)

print("\nMicro Weights:")
x = evo.get_microweights()
print(x)

print("\nMacro Clusters (here: performs an additional 250 reclustering steps, see parameter)")
x = evo.get_macroclusters()
print(x)

print("\nMacro Weights")
x = evo.get_macroweights()
print(x)

print("\nAssignment of Micro Clusters to Macro Clusters")
x = evo.microToMacro()
print(x)
python setup.py install --force
python setup.py install --force
# Build script for the evoStream extension module (C++ sources plus CPython wrapper).
# distutils was removed from the standard library in Python 3.12 (PEP 632), so
# setuptools is preferred; distutils remains as a fallback for old interpreters
# that do not have setuptools installed.
try:
    from setuptools import setup, Extension
except ImportError:
    from distutils.core import setup, Extension

setup(
    name='evoStreamPkg',
    version='1.0',
    ext_modules=[
        Extension('evoStream', ['evoStreamWrapper.cpp', 'evoStream.cpp', 'MC.cpp']),
    ],
)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment