Commit ddab3533 authored by Matthias Carnein's avatar Matthias Carnein
Browse files

Initial Commit

parents
.Rproj.user
.Rhistory
.RData
.Ruserdata
src/*.o
src/*.so
src/*.dll
.Rbuildignore
Package: evoStream
Type: Package
Title: evoStream - Evolutionary Stream Clustering Utilizing Idle Times
Version: 1.0.0
Author: Matthias Carnein
Maintainer: Matthias Carnein <matthias.carnein@uni-muenster.de>
Description: Stream clustering algorithm based on evolutionary optimization. Micro-clusters are incrementally reclustered using an evolutionary algorithm. Evolutionary algorithms create slight variations by combining and randomly modifying existing solutions. By iteratively selecting better solutions, an evolutionary pressure is created which improves the clustering over time. Since the evolutionary algorithm is incremental, it is possible to apply it between observations, e.g. in the idle time of the stream.
License: GPL-3
Encoding: UTF-8
LazyData: true
Imports: Rcpp (>= 0.12.15), stream
LinkingTo: Rcpp
RoxygenNote: 6.0.1
PKG_CPPFLAGS=-I.
CXX_STD = CXX11
OBJECTS = BICO.o DBSTREAM.o DStream.o init.o kmeansw.o NumericVector.o evoStream.o RcppExports.o BICO/l2metric.o BICO/point.o BICO/pointcentroid.o BICO/randomness.o BICO/realspaceprovider.o BICO/squaredl2metric.o
all: $(SHLIB)
clean:
@rm -f $(OBJECTS)
# Generated by roxygen2: do not edit by hand
export(DSC_EA)
export(DSC_evoStream)
import(Rcpp)
useDynLib(evoStream)
#' Evolutionary Algorithm
#'
#' Reclustering using an evolutionary algorithm.
#' This approach is used by \code{evoStream} but can be used for all micro-clusters.
#' The evolutionary algorithm uses existing clustering solutions and creates small variations of them by combining and randomly modifying them.
#' The modified solutions can yield better partitions and thus can improve the clustering over time.
#' The evolutionary algorithm is incremental, which allows to improve existing macro-clusters instead of recomputing them every time.
#'
#' @param k number of macro-clusters
#' @param crossoverRate cross-over rate for the evolutionary algorithm
#' @param mutationRate mutation rate for the evolutionary algorithm
#' @param populationSize number of solutions that the evolutionary algorithm maintains
#' @param generations number of EA generations performed during reclustering
#'
#' @author Matthias Carnein \email{Matthias.Carnein@@uni-muenster.de}
#'
#' @examples
#' stream <- DSD_Gaussians(k = 3, d = 2)
#' dbstream <- DSC_DBSTREAM(r=0.1)
#' EA <- DSC_EA(k=3)
#'
#' two <- DSC_TwoStage(dbstream, EA)
#' update(two, stream, n=1200)
#' plot(two, stream, type="both")
#'
#' update(dbstream, stream, n = 1200)
#' recluster(EA, dbstream)
#' plot(EA, stream)
#'
#'
#' @export
DSC_EA <- function(k,
                   crossoverRate = .8,
                   mutationRate = .001,
                   populationSize = 100,
                   generations = 2000) {
  # Reference object that holds the EA configuration and performs the
  # reclustering.
  backend <- EA_R$new(k, crossoverRate, mutationRate, populationSize, generations)

  # Wrap the reference object in the list layout used for DSC objects and tag
  # it with the class chain so that it is dispatched as a macro-clusterer.
  dsc <- list(description = "EA", RObj = backend)
  class(dsc) <- c("DSC_EA", "DSC_Macro", "DSC_R", "DSC")
  dsc
}
#' Reference Class EA_R
#'
#' Reference class used for Reclustering using an evolutionary algorithm
#'
#' @field crossoverRate cross-over rate for the evolutionary algorithm
#' @field mutationRate mutation rate for the evolutionary algorithm
#' @field populationSize number of solutions that the evolutionary algorithm maintains
#' @field k number of macro-clusters
#' @field data micro-clusters to recluster
#' @field weights weights of the micro-clusters
#' @field generations number of EA generations performed during reclustering
#' @field C exposed C class
#'
#' @author Matthias Carnein \email{matthias.carnein@@uni-muenster.de}
#'
EA_R <- setRefClass("EA",
  fields = list(
    crossoverRate = "numeric",
    mutationRate = "numeric",
    populationSize = "integer",
    k = "integer",
    data = "data.frame",
    weights = "numeric",
    generations = "integer",
    C = "ANY"
  ),
  methods = list(
    # Store the EA configuration in the object's fields.
    #
    # k, populationSize and generations are coerced to integer because the
    # corresponding fields are declared as "integer"; the two rates are stored
    # as given.
    initialize = function(k, crossoverRate, mutationRate, populationSize, generations) {
      # R calls the generator without any arguments in some situations (for
      # example when an object is duplicated with $copy()). Such a call must
      # not fail, so leave every field at its prototype value in that case.
      # nargs() is used instead of missing(k) so that a caller who forwards a
      # missing `k` still receives the usual "argument is missing" error below.
      if (nargs() == 0L) {
        return(invisible(.self))
      }
      k <<- as.integer(k)
      crossoverRate <<- crossoverRate
      mutationRate <<- mutationRate
      populationSize <<- as.integer(populationSize)
      generations <<- as.integer(generations)
      invisible(.self)
    }
  )
)
EA_R$methods(
  # Recluster the micro-clusters in `x` (one row per micro-cluster) with the
  # evolutionary algorithm. `weight` defaults to a weight of 1 for every row.
  cluster = function(x, weight = rep(1,nrow(x)), ...) {
    # remember the input so that the micro-cluster getters can return it
    data <<- x
    weights <<- weight

    # a fresh C object is created for every reclustering call
    C <<- new(EvoStream)
    .self$C$reclusterInitialize(
      as.matrix(.self$data),
      .self$weights,
      .self$k,
      .self$crossoverRate,
      .self$mutationRate,
      .self$populationSize
    )

    # run the configured number of generations
    .self$C$recluster(.self$generations)
  },

  # Micro-clusters and their weights are exactly what was passed to cluster().
  get_microclusters = function(...) {
    as.data.frame(.self$data)
  },
  get_microweights = function(...) {
    .self$weights
  },

  # Macro-clusters and their weights are read from the C object.
  get_macroclusters = function(...) {
    as.data.frame(.self$C$get_macroclusters())
  },
  get_macroweights = function(...) {
    .self$C$get_macroweights()
  },

  # Assignment of micro-clusters to macro-clusters. The values reported by the
  # C object are shifted by one (presumably from 0-based to 1-based indices).
  # With `micro` given, only the assignments at those positions are returned.
  microToMacro = function(micro=NULL) {
    assignment <- .self$C$microToMacro() + 1
    if (is.null(micro)) {
      return(assignment)
    }
    assignment[micro]
  }
)
#' evoStream - Evolutionary Stream Clustering
#'
#' Stream clustering algorithm based on evolutionary optimization.
#' The online component uses a simplified version of \code{DBSTREAM} to generate micro-clusters.
#' The micro-clusters are then incrementally reclustered using an evolutionary algorithm.
#' Evolutionary algorithms create slight variations by combining and randomly modifying existing solutions.
#' By iteratively selecting better solutions, an evolutionary pressure is created which improves the clustering over time.
#' Since the evolutionary algorithm is incremental, it is possible to apply between observations, e.g. in the idle time of the stream.
#' Alternatively it can be applied as a traditional reclustering step, or a combination of both.
#' This implementation allows using a fixed number of generations after each observation and during reclustering.
#'
#' @param r radius threshold for micro-cluster assignment
#' @param lambda decay rate
#' @param tgap time-interval between outlier detection and clean-up
#' @param k number of macro-clusters
#' @param crossoverRate cross-over rate for the evolutionary algorithm
#' @param mutationRate mutation rate for the evolutionary algorithm
#' @param populationSize number of solutions that the evolutionary algorithm maintains
#' @param initializeAfter number of micro-clusters required for the initialization of the evolutionary algorithm.
#' @param incrementalGenerations number of EA generations performed after each observation
#' @param reclusterGenerations number of EA generations performed during reclustering
#'
#' @author Matthias Carnein \email{Matthias.Carnein@@uni-muenster.de}
#'
#' @examples
#' stream <- DSD_Gaussians(k = 3, d = 2)
#' evoStream <- DSC_evoStream(r=0.05, k=3)
#' update(evoStream, stream, n = 1200)
#' plot(evoStream, stream, type = "both")
#'
#' @export
DSC_evoStream <- function(r,
                          lambda = 0.001,
                          tgap = 100,
                          k = 2,
                          crossoverRate = .8,
                          mutationRate = .001,
                          populationSize = 100,
                          initializeAfter = 2 * k,
                          incrementalGenerations = 5,
                          reclusterGenerations = 2000) {
  # Reference object that forwards all work to the exposed C class.
  backend <- evoStream_R$new(
    r, lambda, tgap, k,
    crossoverRate, mutationRate, populationSize,
    initializeAfter, incrementalGenerations, reclusterGenerations
  )

  # Wrap the reference object in the list layout used for DSC objects and tag
  # it with the class chain so that it is dispatched as a micro-clusterer.
  dsc <- list(description = "evoStream", RObj = backend)
  class(dsc) <- c("DSC_evoStream", "DSC_Micro", "DSC_R", "DSC")
  dsc
}
#' Reference Class evoStream_R
#'
#' Reference class mostly used to expose C class object
#'
#' @field C exposed C class
#'
#' @author Matthias Carnein \email{matthias.carnein@@uni-muenster.de}
#'
evoStream_R <- setRefClass(
  "evoStream_R",
  fields = list(C = "ANY")
)
evoStream_R$methods(
  # Create the exposed C object and hand it the full configuration.
  initialize = function(r, lambda, tgap, k, crossoverRate, mutationRate, populationSize, initializeAfter, incrementalGenerations, reclusterGenerations) {
    # Exposed constructors only take a limited number of parameters, so the
    # object is built empty first and configured through setFields() afterwards.
    C <<- new(EvoStream)
    .self$C$setFields(
      r, lambda, tgap, k,
      crossoverRate, mutationRate, populationSize,
      initializeAfter, incrementalGenerations, reclusterGenerations
    )
    .self
  }
)
evoStream_R$methods(
  # Feed new observations to the C object; the input is converted to a matrix.
  cluster = function(newdata){
    .self$C$cluster(as.matrix(newdata))
  },

  # Micro-cluster centres (as a data frame) and their weights, read from the
  # C object.
  get_microclusters = function() {
    as.data.frame(.self$C$get_microclusters())
  },
  get_microweights = function() {
    .self$C$get_microweights()
  },

  # Macro-cluster centres (as a data frame) and their weights, read from the
  # C object.
  get_macroclusters = function() {
    as.data.frame(.self$C$get_macroclusters())
  },
  get_macroweights = function() {
    .self$C$get_macroweights()
  }
)
evoStream_R$methods(
  # Assignment of micro-clusters to macro-clusters. The values reported by the
  # C object are shifted by one (presumably from 0-based to 1-based indices).
  # With `micro` given, only the assignments at those positions are returned.
  microToMacro = function(micro=NULL) {
    assignment <- .self$C$microToMacro() + 1
    if (is.null(micro)) {
      return(assignment)
    }
    assignment[micro]
  }
)
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#' evoStream
#'
#' @author Matthias Carnein \email{Matthias.Carnein@@uni-muenster.de}
#'
#' @name evoStream
#' @docType package
#' @useDynLib evoStream
#' @import Rcpp
NULL
# Load the Rcpp module "MOD_evoStream" when the package namespace is loaded.
# NOTE(review): presumably this is the module that exposes the EvoStream C++
# class instantiated with new(EvoStream) in the R code; passing TRUE as the
# second argument should make all objects of the module available - confirm
# against the Rcpp modules documentation.
loadModule("MOD_evoStream", TRUE)
# evoStream - Evolutionary Stream Clustering Utilizing Idle Times
This R package implements an evolutionary stream clustering algorithm. The corresponding publication is currently in review.
The algorithm uses a widely popular two-phase clustering approach where the stream is first summarised in real time.
The result are many small preliminary clusters in the stream called 'micro-clusters'.
Our algorithm then incrementally reclusters these micro-clusters using an evolutionary algorithm.
Evolutionary algorithms create slight variations by combining and randomly modifying existing solutions.
By iteratively selecting better solutions, an evolutionary pressure is created which improves the clustering over time.
Since the evolutionary algorithm is incremental, it is possible to apply between observations, e.g. in the idle time of the stream.
Alternatively it can be applied as a traditional reclustering step, or a combination of both.
This implementation allows using a fixed number of generations after each observation and during reclustering.
## Installation
The easiest way to install the package is by using devtools:
```R
devtools::install_git("https://wiwi-gitlab.uni-muenster.de/stream/evoStream")
```
## Usage
The algorithm is implemented as an extension to the R-package [stream](https://github.com/mhahsler/stream). Once the publication has been accepted we plan to incorporate it into the package. Its usage is therefore the same as in the stream package. A simple example is shown below:
```R
## create data stream
stream <- DSD_Gaussians(k = 3, d = 2)
## initialize evoStream
evoStream <- DSC_evoStream(r=0.05, k=3)
## update model
update(evoStream, stream, n = 1200)
## plot the result
plot(evoStream, stream, type = "both")
## get micro-clusters
get_centers(evoStream, type="micro")
## get macro-clusters
get_centers(evoStream, type="macro")
```
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: knitr
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DSC_EA.R
\name{DSC_EA}
\alias{DSC_EA}
\title{Evolutionary Algorithm}
\usage{
DSC_EA(k, crossoverRate = 0.8, mutationRate = 0.001, populationSize = 100,
generations = 2000)
}
\arguments{
\item{k}{number of macro-clusters}
\item{crossoverRate}{cross-over rate for the evolutionary algorithm}
\item{mutationRate}{mutation rate for the evolutionary algorithm}
\item{populationSize}{number of solutions that the evolutionary algorithm maintains}
\item{generations}{number of EA generations performed during reclustering}
}
\description{
Reclustering using an evolutionary algorithm.
This approach is used by \code{evoStream} but can be used for all micro-clusters.
The evolutionary algorithm uses existing clustering solutions and creates small variations of them by combining and randomly modifying them.
The modified solutions can yield better partitions and thus can improve the clustering over time.
The evolutionary algorithm is incremental, which allows to improve existing macro-clusters instead of recomputing them every time.
}
\examples{
stream <- DSD_Gaussians(k = 3, d = 2)
dbstream <- DSC_DBSTREAM(r=0.1)
EA <- DSC_EA(k=3)
two <- DSC_TwoStage(dbstream, EA)
update(two, stream, n=1200)
plot(two, stream, type="both")
update(dbstream, stream, n = 1200)
recluster(EA, dbstream)
plot(EA, stream)
}
\author{
Matthias Carnein \email{Matthias.Carnein@uni-muenster.de}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DSC_evoStream.R
\name{DSC_evoStream}
\alias{DSC_evoStream}
\title{evoStream - Evolutionary Stream Clustering}
\usage{
DSC_evoStream(r, lambda = 0.001, tgap = 100, k = 2, crossoverRate = 0.8,
mutationRate = 0.001, populationSize = 100, initializeAfter = 2 * k,
incrementalGenerations = 5, reclusterGenerations = 2000)
}
\arguments{
\item{r}{radius threshold for micro-cluster assignment}
\item{lambda}{decay rate}
\item{tgap}{time-interval between outlier detection and clean-up}
\item{k}{number of macro-clusters}
\item{crossoverRate}{cross-over rate for the evolutionary algorithm}
\item{mutationRate}{mutation rate for the evolutionary algorithm}
\item{initializeAfter}{number of micro-cluster required for the initialization of the evolutionary algorithm.}
\item{incrementalGenerations}{number of EA generations performed after each observation}
\item{reclusterGenerations}{number of EA generations performed during reclustering}
\item{populationSize}{number of solutions that the evolutionary algorithm maintains}
}
\description{
Stream clustering algorithm based on evolutionary optimization.
The online component uses a simplified version of \code{DBSTREAM} to generate micro-clusters.
The micro-clusters are then incrementally reclustered using an evolutionary algorithm.
Evolutionary algorithms create slight variations by combining and randomly modifying existing solutions.
By iteratively selecting better solutions, an evolutionary pressure is created which improves the clustering over time.
Since the evolutionary algorithm is incremental, it is possible to apply between observations, e.g. in the idle time of the stream.
Alternatively it can be applied as a traditional reclustering step, or a combination of both.
This implementation allows using a fixed number of generations after each observation and during reclustering.
}
\examples{
stream <- DSD_Gaussians(k = 3, d = 2)
evoStream <- DSC_evoStream(r=0.05, k=3)
update(evoStream, stream, n = 1200)
plot(evoStream, stream, type = "both")
}
\author{
Matthias Carnein \email{Matthias.Carnein@uni-muenster.de}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DSC_EA.R
\docType{class}
\name{EA-class}
\alias{EA-class}
\alias{EA_R}
\title{Reference Class EA_R}
\description{
Reference class used for Reclustering using an evolutionary algorithm
}
\section{Fields}{
\describe{
\item{\code{crossoverRate}}{cross-over rate for the evolutionary algorithm}
\item{\code{mutationRate}}{mutation rate for the evolutionary algorithm}
\item{\code{populationSize}}{number of solutions that the evolutionary algorithm maintains}
\item{\code{k}}{number of macro-clusters}
\item{\code{data}}{micro-clusters to recluster}
\item{\code{weights}}{weights of the micro-clusters}
\item{\code{generations}}{number of EA generations performed during reclustering}
\item{\code{C}}{exposed C class}
}}
\author{
Matthias Carnein \email{matthias.carnein@uni-muenster.de}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/evoStream-package.R
\docType{package}
\name{evoStream}
\alias{evoStream}
\alias{evoStream-package}
\title{evoStream}
\description{
evoStream
}
\author{
Matthias Carnein \email{Matthias.Carnein@uni-muenster.de}
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DSC_evoStream.R
\docType{class}
\name{evoStream_R-class}
\alias{evoStream_R-class}
\alias{evoStream_R}
\title{Reference Class evoStream_R}
\description{
Reference class mostly used to expose C class object
}
\section{Fields}{
\describe{
\item{\code{C}}{exposed C class}
}}
\author{
Matthias Carnein \email{matthias.carnein@uni-muenster.de}
}
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#include <Rcpp.h>
using namespace Rcpp;
// Declaration of the boot function for the Rcpp module MOD_evoStream; its
// definition is not part of this file (presumably emitted by the module
// definition in the package sources - confirm).
RcppExport SEXP _rcpp_module_boot_MOD_evoStream();
// Table of routines registered for .Call: one entry for the module boot
// function (taking 0 arguments), followed by the {NULL, NULL, 0} terminator.
static const R_CallMethodDef CallEntries[] = {
{"_rcpp_module_boot_MOD_evoStream", (DL_FUNC) &_rcpp_module_boot_MOD_evoStream, 0},
{NULL, NULL, 0}
};
// Library initialisation hook for the evoStream shared object: registers the
// .Call table above and switches off dynamic symbol lookup, so that only the
// registered routines can be resolved.
RcppExport void R_init_evoStream(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}
#include <Rcpp.h>
#include <limits>
#include <ctime>
#define VERBOSE 0
using namespace Rcpp;
// Scales a draw from R's uniform generator by n, rounds it down and returns
// the result as an int.
inline int randWrapper(const int n) {
  const double scaled = unif_rand() * n;
  return static_cast<int>(floor(scaled));
}
// Micro-cluster: a centroid with a weight that is faded over time and the
// time stamp at which the weight was last faded.
//
// NOTE(review): Rcpp vectors are handles to R objects and copying one does not
// duplicate the underlying data, so the stored centroid presumably shares its
// storage with the vector handed to the constructor - confirm that callers
// pass a vector that is not reused elsewhere.
class MC {
public:
  Rcpp::NumericVector centroid;
  int lastUpdate;
  double weight;

  // Micro-cluster with an explicit initial weight.
  MC(Rcpp::NumericVector centroid, int lastUpdate, double weight)
    : centroid(centroid), lastUpdate(lastUpdate), weight(weight) {}

  // Micro-cluster with the initial weight set to 1.
  MC(Rcpp::NumericVector centroid, int lastUpdate)
    : centroid(centroid), lastUpdate(lastUpdate), weight(1) {}

  Rcpp::NumericVector getCentroid() {
    return centroid;
  }

  // Absorbs `mc` into this micro-cluster at time `t`.
  // Both weights are faded to time t first; afterwards the weight of `mc` is
  // added and the centroid is pulled towards the centroid of `mc`.
  // `mc` is received by value, so fading it here does not change the weight
  // or time stamp of the caller's object.
  void merge(MC mc, int t, double lambda, double r) {
    mc.fade(t, lambda);
    this->fade(t, lambda);

    // update statistics
    this->weight += mc.weight;

    // competitive learning: the step towards mc is scaled by
    // exp(-(3 * d / r)^2 / 2), where d is the distance between the centroids
    const double d = this->distance(mc);
    const double gain = std::exp(-std::pow(d / r * 3.0, 2.0) / 2.0);
    this->centroid += gain * (mc.getCentroid() - this->getCentroid());
  }

  // Multiplies the weight by 2^(-lambda * (t - lastUpdate)) and records t as
  // the new update time.
  void fade(int t, double lambda) {
    const double exponent = -lambda * (t - this->lastUpdate);
    this->weight *= std::pow(2.0, exponent);
    this->lastUpdate = t;
  }

  // Euclidean distance between this centroid and the centroid of `mc`.
  double distance(MC mc) {
    return this->distance(mc.getCentroid());
  }

  // Euclidean distance between this centroid and the point `x`.
  double distance(Rcpp::NumericVector x) {
    return sqrt(Rcpp::sum(Rcpp::pow(this->getCentroid() - x, 2)));
  }
};