Commit 8f4ac06c authored by Matthias Carnein's avatar Matthias Carnein

Improved cluster quality and performance

parent 6609c8e7
Package: textClust
Type: Package
Title: textClust
Version: 1.0.0
Title: textClust - Stream Clustering for Texts
Version: 2.0.0
Author: Matthias Carnein, Dennis Assenmacher
Maintainer: Matthias Carnein <matthias.carnein@uni-muenster.de>, Dennis Assenmacher <dennis.assenmacher@wi.uni-muenster.de>
Description: Clustering text stream
License: GPL3
Maintainer: Matthias Carnein <matthias.carnein@uni-muenster.de>, Dennis Assenmacher <dennis.assenmacher@wi.uni-muenster.de>
Description: Topic Discovery in Texts using Stream Clustering
License: GPL-3
LazyData: TRUE
Imports:
Rcpp (>= 0.12.9),
tokenizers,
stream
Suggests: zoom
stream,
stopwords,
clue,
fpc
LinkingTo: Rcpp
RoxygenNote: 6.0.1
RoxygenNote: 6.1.0
Encoding: UTF-8
# Generated by roxygen2: do not edit by hand
S3method(plot,DSC_textClust)
S3method(get_assignment,DSC_textClust)
S3method(get_points,DSD_textClust)
S3method(nclusters,DSC_textClust)
S3method(print,DSC_textClust)
export(DSC_textClust)
export(DSD_textClust)
export(evaluate)
export(evaluate_cluster)
export(get_assignment.DSC_textClust)
export(get_points.DSD_textClust)
export(nclusters.DSC_textClust)
export(textClust_R)
import(Rcpp)
import(stream)
importFrom(clue,as.cl_hard_partition)
importFrom(fpc,cluster.stats)
importFrom(stopwords,stopwords)
importFrom(tokenizers,tokenize_ngrams)
useDynLib(textClust)
This diff is collapsed.
......@@ -78,7 +78,12 @@
.all_measures <- c(.eval_measures_int, .eval_measures_ext,
.eval_measures_fpc_int, .eval_measures_fpc_ext)
#' Evaluate Clustering
#'
#' Adaptation of the evaluation method from the stream package, adjusted for textual data.
#'
#' @export
#' @importFrom fpc cluster.stats
evaluate <- function (dsc, dsd, measure, n = 100,
type=c("auto", "micro", "macro"),
assign="micro",
......@@ -208,6 +213,11 @@ print.stream_eval <- function(x, ...) {
## evaluate during clustering
## uses single-fold prequential error estimate (eval and then learn the data)
#' Evaluate Clustering Using Prequential Evaluation
#'
#' Adaptation of the evaluate_cluster method from the stream package.
#'
#' @export
evaluate_cluster <- function(dsc, dsd, measure,
n=1000, type=c("auto", "micro", "macro"), assign="micro",
......@@ -223,7 +233,7 @@ evaluate_cluster <- function(dsc, dsd, measure,
for(i in 1:rounds) {
if(verbose) print(paste("Round", i, "of", rounds))
# serialized=dsc$RObj$serialize()
# save(dsc, serialized, file=paste("debug/debug", dsc$RObj$textClust_C$t, ".RData", sep=""))
# save(dsc, serialized, file=paste("debug/debug", dsc$RObj$C$t, ".RData", sep=""))
d <- DSD_Memory(dsd, n=horizon, loop=FALSE)
## evaluate first
......@@ -412,19 +422,17 @@ ssq <- function(points, actual, predict, dsc, centers) {
## do nn assignment of non noise points
if(!is.null(actual)) points <- points[actual != 0L,]
colnames(points) = c("time", "user", "message")
## get mcs
microClusters = get_centers(dsc)
microClusters = dsc$RObj$C$get_microclusters()
## get idf
currentIDF = dsc$RObj$calculateIDF()
currentIDF = dsc$RObj$C$precalculateIDF(microClusters)
## for every row
assign_dist = apply(points, 1, function(x){
## split and tokenize sentence
tokens = tokenize_ngrams(x["message"], n = dsc$RObj$nmax, n_min = dsc$RObj$nmin, lowercase=TRUE, simplify = TRUE)
tokens = tokenize_ngrams(x, n = dsc$RObj$nmax, n_min = dsc$RObj$nmin, lowercase=TRUE, simplify = TRUE)
## count term frequency
tf = as.list(table(tokens))
......@@ -433,11 +441,11 @@ ssq <- function(points, actual, predict, dsc, centers) {
return(NA)
}
dist = dsc$RObj$textClust_C$findClosestDist(tf, currentIDF, centers)
dist = dsc$RObj$C$findClosestDist(tf, centers, currentIDF)
# print(dist)
# ## create temporary mc from text and timestamp
# microCluster = new(MicroCluster, tf, dsc$RObj$textClust_C$t)
# microCluster = new(MicroCluster, tf, dsc$RObj$C$t)
#
# ## calc dists to all other mcs
# dist = sapply(microClusters, function(mc){
......@@ -458,15 +466,14 @@ silhouette <- function(points, actual, predict, dsc) {
predict <- predict[!noise]
# if(any(predict==0)) warning("silhouette: ", sum(predict==0), " non-noise points were predicted noise incorrectly and form their own cluster.")
colnames(points) = c("time", "user", "message")
clusters = apply(points, 1, function(x){
tokens = tokenize_ngrams(x["message"], n = dsc$RObj$nmax, n_min = dsc$RObj$nmin, lowercase=TRUE, simplify = TRUE)
tokens = tokenize_ngrams(x, n = dsc$RObj$nmax, n_min = dsc$RObj$nmin, lowercase=TRUE, simplify = TRUE)
## count term frequency
tf = as.list(table(tokens))
if(length(tf)==0) return(NA)
## create temporary mc from text and timestamp
new(MicroCluster, tf, dsc$RObj$textClust_C$t)
new(MicroCluster, tf, dsc$RObj$C$t)
})
predict = predict[!is.na(clusters)]
......@@ -485,6 +492,7 @@ silhouette <- function(points, actual, predict, dsc) {
mean(cluster::silhouette(predict, d)[,"sil_width"])
}
#' @importFrom clue as.cl_hard_partition
clue_agreement <- function(predict, actual, measure) {
predict <- clue::as.cl_hard_partition(predict)
actual <- clue::as.cl_hard_partition(actual)
......
#' Text Stream Clustering
#' Topic Discovery in Texts using Stream Clustering
#'
#' @author
#' Matthias Carnein \email{Matthias.Carnein@@uni-muenster.de}
......@@ -8,6 +8,7 @@
#' @docType package
#' @useDynLib textClust
#' @import Rcpp
#' @import stream
#'
#' @references
#' Matthias Carnein, Dennis Assenmacher, Heike Trautmann (2017)"Stream Clustering of Chat Messages with Applications to Twitch Streams". In: Advances in Conceptual Modeling. ER 2017. Lecture Notes in Computer Science, vol 10651. Springer, Cham
......
This diff is collapsed.
......@@ -4,9 +4,13 @@
\alias{DSC_textClust}
\title{textClust}
\usage{
DSC_textClust(r = 0.4, lambda = 0.1, tgap = 1000, nmin = 1, nmax = 1,
updateAll = F, k = NA_integer_, h = NA_real_, verbose = F,
termFading = T)
DSC_textClust(r = 0.4, lambda = 0.1, tgap = 1000, nmin = 1,
nmax = 1, k = NA_integer_, h = NA_real_, verbose = F,
termFading = T, stopword = stopwords(language = "en", source =
"stopwords-iso"), linkage = "complete", weightedReclustering = TRUE,
minWeight = 2, textCol = 1, timeCol = NA_integer_,
groupByCol = NA_integer_, parentTextCol = NA_integer_,
parentTimeCol = NA_integer_)
}
\arguments{
\item{r}{distance threshold to merge two micro-clusters}
......@@ -19,8 +23,6 @@ DSC_textClust(r = 0.4, lambda = 0.1, tgap = 1000, nmin = 1, nmax = 1,
\item{nmax}{max number of ngrams to use}
\item{updateAll}{logical whether a new observations is used to update all micro-clusters within \code{r} or just the closest.}
\item{k}{number of clusters for macro-clustering}
\item{h}{height to determine number of clusters for macro-clustering}
......@@ -28,17 +30,36 @@ DSC_textClust(r = 0.4, lambda = 0.1, tgap = 1000, nmin = 1, nmax = 1,
\item{verbose}{logical whether to be more verbose}
\item{termFading}{logical whether individual terms should also be faded}
\item{stopword}{character vector of stopwords to remove}
\item{linkage}{method for hierarchical clustering}
\item{weightedReclustering}{logical whether reclustering should consider cluster weights}
\item{minWeight}{minimum weight of micro clusters to be used for reclustering}
\item{textCol}{index of column that contains the text which should be clustered}
\item{timeCol}{index of column that contains timestamps}
\item{groupByCol}{index of column that groups text into conversations (i.e. multiple texts into the same document)}
\item{parentTextCol}{index of column that contains the text of the parent when using groupByCol}
\item{parentTimeCol}{index of column that contains the time of the parent when using groupByCol}
}
\value{
micro clusters
}
\description{
Stream Clustering algorithm which clusters messages. Contains a description string as well as a reference class object of \code{textClust} which is the work-horse.
Stream Clustering algorithm which clusters text data.
}
\examples{
stream = DSD_ReadCSV("file.txt", sep = "\\t", comment.char="", quote="")
textClust <- DSC_textClust(r=.75, lambda=.1, tgap=1000)
update(textClust, stream, n=100)
data = data.frame(text=sample(c("Main Topic", "Similar Topic", "Something Different"), size=1000, replace=T),stringsAsFactors=F)
stream = DSD_Memory(data) ## Alternatively: stream = DSD_ReadCSV("file.txt", sep = "\\t", comment.char="", quote="")
algorithm = DSC_textClust(r=.4, lambda=0.1, tgap=100, nmin=1, nmax=2, k=3, stopword=c(), minWeight=3, textCol=1)
update(algorithm, stream, n=1000)
}
\references{
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/evaluate.R
\name{evaluate}
\alias{evaluate}
\title{Evaluate Clustering}
\usage{
evaluate(dsc, dsd, measure, n = 100, type = c("auto", "micro",
"macro"), assign = "micro", assignmentMethod = c("auto", "model",
"nn"), noise = c("class", "exclude"), ...)
}
\description{
Adaptation of the evaluation method from the stream package, adjusted for textual data.
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/evaluate.R
\name{evaluate_cluster}
\alias{evaluate_cluster}
\title{Evaluate Clustering Using Prequential Evaluation}
\usage{
evaluate_cluster(dsc, dsd, measure, n = 1000, type = c("auto", "micro",
"macro"), assign = "micro", assignmentMethod = c("auto", "model",
"nn"), horizon = 100, verbose = FALSE, noise = c("class",
"exclude"), limit = Inf, ...)
}
\description{
Adaptation of the evaluate_cluster method from the stream package.
}
......@@ -4,12 +4,14 @@
\alias{get_assignment.DSC_textClust}
\title{Get Assignment for DSC_textClust}
\usage{
get_assignment.DSC_textClust(dsc, points, ...)
\method{get_assignment}{DSC_textClust}(dsc, points)
}
\arguments{
\item{dsc}{object of class DSC_textClust}
\item{points}{matrix with the messages to assign}
\item{...}{optional arguments are ignored}
}
\value{
Indices of the closest micro-clusters
......@@ -18,11 +20,12 @@ Indicies of the closest micro-clusters
Returns the index of the closest micro-cluster for every entry in points using the cosine similarity of the tf-idf vectors.
}
\examples{
stream = DSD_ReadCSV("file.txt", sep = "\\t", comment.char="", quote="")
algorithm = DSC_textClust(lambda=0.001, r=0.2, tgap=100, verbose=T)
update(algorithm, stream, n=100)
data = data.frame(text=sample(c("Main Topic", "Similar Topic", "Something Different"), size=1000, replace=T),stringsAsFactors=F)
stream = DSD_Memory(data) ## Alternatively: stream = DSD_ReadCSV("file.txt", sep = "\\t", comment.char="", quote="")
algorithm = DSC_textClust(r=.4, lambda=0.1, tgap=100, nmin=1, nmax=2, k=3, stopword=c(), minWeight=3, textCol=1)
update(algorithm, stream, n=1000)
data = read.table("file.txt", header=F, sep="\\t", quote="", comment.char = "", stringsAsFactors =F)
data = data.frame(text=sample(c("Main Topic", "Something Different"), size=100, replace=T),stringsAsFactors=F)
get_assignment(algorithm, data)
}
......
......@@ -4,7 +4,7 @@
\alias{get_points.DSD_textClust}
\title{Get Points from textClust source}
\usage{
get_points.DSD_textClust(x, n = 1, cluster = FALSE, ...)
\method{get_points}{DSD_textClust}(x, n = 1, cluster = FALSE, ...)
}
\arguments{
\item{x}{\code{DSD_textClust} object to read the points from}
......
......@@ -4,17 +4,26 @@
\alias{nclusters.DSC_textClust}
\title{Number of clusters for DSC_textClust}
\usage{
nclusters.DSC_textClust(x, type = c("auto", "micro", "macro"), ...)
\method{nclusters}{DSC_textClust}(x, type = c("auto", "micro", "macro"),
...)
}
\arguments{
\item{x}{object of class DSC_textClust}
\item{type}{whether micro or macro clusters should be counted}
\item{...}{optional arguments passed to get_centers}
}
\description{
Returns the number of clusters for DSC_textClust
}
\examples{
algorithm = DSC_textClust(lambda=0.001, r=0.2, tgap=100, verbose=T)
print(algorithm)
data = data.frame(text=sample(c("Main Topic", "Similar Topic", "Something Different"), size=1000, replace=T),stringsAsFactors=F)
stream = DSD_Memory(data) ## Alternatively: stream = DSD_ReadCSV("file.txt", sep = "\\t", comment.char="", quote="")
algorithm = DSC_textClust(r=.4, lambda=0.1, tgap=100, nmin=1, nmax=2, k=3, stopword=c(), minWeight=3, textCol=1)
update(algorithm, stream, n=1000)
nclusters(algorithm)
}
\author{
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/DSC_textClust.R
\name{plot.DSC_textClust}
\alias{plot.DSC_textClust}
\title{Plot DSC_textClust}
\usage{
\method{plot}{DSC_textClust}(dsc, numRepresentatives = 1, type = "micro",
dendrogram = F, merge = F, interactive = T)
}
\arguments{
\item{dsc}{object of class DSC_textClust}
\item{numRepresentatives}{number of words to plot per micro cluster (default 1)}
\item{type}{string indicating whether to plot micro or macro cluster}
\item{dendrogram}{logical whether to plot dendrogram or use multidimensional scaling to plot representatives}
\item{merge}{logical whether to merge all clusters into one}
\item{interactive}{use interactive plots with the zoom package}
}
\description{
Plot function for an object of type DSC_textClust. Uses Multidimensional Scaling (MDS) to plot the words with the highest weight for each micro-cluster
}
\examples{
stream = DSD_ReadCSV("file.txt", sep = "\\t", comment.char="", quote="")
algorithm = DSC_textClust(lambda=0.001, r=0.2, tgap=100, verbose=T)
update(algorithm, stream, n=100)
plot(algorithm, numRepresentatives = 2)
}
\author{
Matthias Carnein \email{matthias.carnein@uni-muenster.de}
}
......@@ -4,9 +4,11 @@
\alias{print.DSC_textClust}
\title{Print DSC_textClust}
\usage{
\method{print}{DSC_textClust}(dsc)
\method{print}{DSC_textClust}(x, ...)
}
\arguments{
\item{...}{optional arguments are ignored}
\item{dsc}{object of class DSC_textClust}
}
\description{
......
......@@ -4,9 +4,9 @@
\name{textClust}
\alias{textClust}
\alias{textClust-package}
\title{Text Stream Clustering}
\title{Topic Discovery in Texts using Stream Clustering}
\description{
Text Stream Clustering
Topic Discovery in Texts using Stream Clustering
}
\references{
Matthias Carnein, Dennis Assenmacher, Heike Trautmann (2017)"Stream Clustering of Chat Messages with Applications to Twitch Streams". In: Advances in Conceptual Modeling. ER 2017. Lecture Notes in Computer Science, vol 10651. Springer, Cham
......
......@@ -11,7 +11,7 @@ Maintains the current clustering and implements the stream package interfaces.
\section{Fields}{
\describe{
\item{\code{textClust_C}}{Exposed C++ class}
\item{\code{C}}{Exposed C++ class}
\item{\code{nmin}}{min number of ngrams to use}
......@@ -21,15 +21,23 @@ Maintains the current clustering and implements the stream package interfaces.
\item{\code{h}}{height for hierarchical clustering}
\item{\code{upToDate}}{logical whether macro-clusters are up to date or need to be recomputed}
\item{\code{microMacroAssignment}}{assignment vector associating micro-clusters to a macro-cluster}
\item{\code{hc}}{hclust object}
\item{\code{throughput}}{stores the throughput of messages}
\item{\code{termFading}}{logical whether individual terms should be faded}
\item{\code{stopword}}{character vector of stopwords to remove}
\item{\code{linkage}}{method for hierarchical clustering}
\item{\code{weightedReclustering}}{logical whether reclustering should consider cluster weights}
\item{\code{minWeight}}{minimum weight of micro clusters to be used for reclustering}
\item{\code{textCol}}{index of column that contains the text}
\item{\code{timeCol}}{index of column that contains the timestamps}
}}
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textClust_R.R
\name{textClust_R_calculateIDF}
\alias{textClust_R_calculateIDF}
\title{Calculate Inverted Document Frequency}
\value{
Inverted Document Frequency as a vector
}
\description{
Calculates the Inverted Document Frequency, i.e. the popularity of words across the entire corpus.
}
\author{
Dennis Assenmacher \email{dennis.assenmacher@wi.uni-muenster.de}
}
......@@ -4,7 +4,7 @@
\alias{textClust_R_cluster}
\title{Clustering procedure of textClust}
\arguments{
\item{newdata}{matrix of data with three columns (time, username, message)}
\item{newdata}{matrix of text in a single column}
}
\description{
Main clustering procedure of textClust
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textClust_R.R
\name{textClust_R_deserialize}
\alias{textClust_R_deserialize}
\title{Deserialize DSC_textClust_R object}
\name{textClust_R_get_assignment}
\alias{textClust_R_get_assignment}
\title{get_assignment}
\description{
Imports a serialized R-List back into an C++ Object
Returns the cluster assignment of text to a respective micro-clusters
}
\author{
Matthias Carnein \email{matthias.carnein@uni-muenster.de}
......
......@@ -7,7 +7,7 @@
list of micro-clusters
}
\description{
Getter for micro-clusters of textClust
Getter for micro-clusters of textClust, returns interfaced C objects
}
\author{
Matthias Carnein \email{matthias.carnein@uni-muenster.de}
......
......@@ -14,8 +14,6 @@
\item{nmax}{max number of ngrams to use}
\item{updateAll}{logical whether a new observations is used to update all micro-clusters within \code{r} or just the closest.}
\item{k}{number of clusters}
\item{h}{height for hierarchical clustering}
......@@ -23,9 +21,21 @@
\item{verbose}{logical whether to be more verbose}
\item{termFading}{logical whether individual terms should be faded}
\item{stopword}{character vector of stopwords to remove}
\item{linkage}{method for hierarchical clustering}
\item{weightedReclustering}{logical whether reclustering should consider cluster weights}
\item{minWeight}{minimum weight of micro clusters to be used for reclustering}
\item{textCol}{index of column that contains the text which should be clustered}
\item{timeCol}{index of column that contains timestamps}
}
\value{
refernce class object
reference class object
}
\description{
Constructor of textClust
......
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textClust_R.R
\name{textClust_R_serialize}
\alias{textClust_R_serialize}
\title{Serialize DSC_textClust_R object}
\description{
Exports the C++ Object of DSC_textClust_R to an R-List
}
\author{
Matthias Carnein \email{matthias.carnein@uni-muenster.de}
}
CXX_STD = CXX11
\ No newline at end of file
//
// Created by Dede on 11.09.2016.
//
#include "Util.hpp"
namespace Utility{

// Positional dot product of two equally sized numeric lists.
// Elements are paired by index; list names are ignored.
double Util::innerProduct(Rcpp::List x,
Rcpp::List y) {
  double sum = 0.0;
  const int n = x.size();
  for (int idx = 0; idx < n; ++idx) {
    sum += static_cast<double>(x[idx]) * static_cast<double>(y[idx]);
  }
  return sum;
}

// Euclidean (L2) length of a numeric list, i.e. sqrt(<x, x>).
double Util::vectorLengthEuclid(Rcpp::List x) {
  return sqrt(innerProduct(x, x));
}

// Manhattan (L1) length of a numeric vector: sum of absolute values.
double Util::vectorLengthManhatten(Rcpp::NumericVector x) {
  Rcpp::NumericVector absolute = abs(x);
  return std::accumulate(absolute.begin(), absolute.end(), 0.0);
}

// Outer product x * x^T, returned as an n x n matrix.
Rcpp::NumericMatrix Util::multiply(Rcpp::NumericVector x){
  const int n = x.size();
  Rcpp::NumericMatrix outer(n, n);
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < n; ++col) {
      outer(row, col) = x[row] * x[col];
    }
  }
  return outer;
}

}
//
// Created by Dede on 11.09.2016.
//
#ifndef CFTREE_UTIL_HPP
#define CFTREE_UTIL_HPP
#include <numeric>
#include <Rcpp.h>
#include <algorithm>
#include <vector>
#include <math.h>
namespace Utility{
// Stateless holder for small vector-math helpers used by the clustering
// code. All members are static; instantiation is disabled.
class Util {
public:
// Positional dot product of two equally sized numeric lists
// (elements matched by index, names ignored).
static double innerProduct(Rcpp::List x,
Rcpp::List y);
// Euclidean (L2) norm: sqrt(innerProduct(x, x)).
static double vectorLengthEuclid(Rcpp::List x);
// Manhattan (L1) norm: sum of absolute values of the entries.
static double vectorLengthManhatten(Rcpp::NumericVector x);
// Outer product x * x^T as a square NumericMatrix.
static Rcpp::NumericMatrix multiply(Rcpp::NumericVector x);
private:
// Private constructor: the class is used only as a namespace-like
// container for static functions.
Util() {}
};
}
#endif //CFTREE_UTIL_HPP
#include "microCluster.hpp"
#include <math.h>
#include "Util.hpp"
#include <unordered_map>
using namespace Rcpp;
// Construct a micro-cluster from an already-tokenized message.
// NOTE(review): the surrounding diff suggests `tf` may have changed from
// Rcpp::List to an unordered_map elsewhere — confirm this constructor
// matches the current field type.
MicroCluster::MicroCluster (Rcpp::List tokens, int time){
//init weight: a fresh cluster represents exactly one observation
this->weight = 1;
// set time of last update (used later for exponential fading)
this->time = time;
//split and tokenize sentence: store the term-frequency table as-is
this->tf = tokens;
}
MicroCluster::MicroCluster (Rcpp::List tokens, int time, double weight){
//init weight
this->weight = weight;
// set time of last update
this->time = time;
//split and tokenize sentence
this->tf = tokens;
// write list of tokens into hash map
Rcpp::StringVector names = tokens.names();
for(int i=0; i<names.size(); i++){
Rcpp::String name = names(i);
this->tf[name] = tokens[name];
}
}
// Export the internal term-frequency hash map (token -> weight) as an
// Rcpp::List so it can be inspected from the R side.
Rcpp::List MicroCluster::getTf(){
Rcpp::List result;
// copy every (token, weight) pair into the R list
for (std::unordered_map<std::string, double>::iterator it = this->tf.begin(); it != this->tf.end(); it++ ){
result[it->first] = it->second;
}
return(result);
}
void MicroCluster::merge(MicroCluster mc, int t, int omega, double lambda){
void MicroCluster::unmerge(MicroCluster* mc, int t, int omega, double lambda, bool termFading){
//update weights
this->fade(t, omega, lambda, this->termFading);
mc.fade(t, omega, lambda, this->termFading);
this->fade(t, omega, lambda, termFading);
mc->fade(t, omega, lambda, termFading);
this->time = t;
//sum cluster weights
this->weight = this->weight + mc.weight;
this->weight = this->weight - mc->weight;
// iterate tokens of mc
for (std::unordered_map<std::string, double>::iterator it = mc->tf.begin(); it != mc->tf.end(); it++ ){
// search key in current map (this)
std::unordered_map<std::string, double>::iterator valIt = this->tf.find(it->first);
if ( valIt != this->tf.end() ) {
// if element found: sum elements
valIt->second -= it->second;
// remove insufficient elements
if(valIt->second <= omega){
this->tf.erase(valIt);
}
}
}
}
void MicroCluster::merge(MicroCluster* mc, int t, int omega, double lambda, bool termFading){
//sum tokens of both MCs
Rcpp::StringVector names = this->tf.names();
Rcpp::StringVector namesMC = mc.tf.names();
//update weights
this->fade(t, omega, lambda, termFading);
mc->fade(t, omega, lambda, termFading);
//Union
Rcpp::StringVector v = Rcpp::union_(names,namesMC);
this->time = t;
Rcpp::List finalList =Rcpp::List::create();
//sum cluster weights
this->weight = this->weight + mc->weight;
for(int i = 0;i<v.size();i++){
Rcpp::String name = v(i);
double tfVal=0;
double mctfVal=0;
if(tf.containsElementNamed(v(i))) tfVal = tf[name];
if(mc.tf.containsElementNamed(v(i))) mctfVal = mc.tf[name];
finalList[name] = tfVal + mctfVal;
// iterate tokens of mc
for (std::unordered_map<std::string, double>::iterator it = mc->tf.begin(); it != mc->tf.end(); it++ ){
// search key in current map (this)
std::unordered_map<std::string, double>::iterator valIt = this->tf.find(it->first);
if ( valIt == this->tf.end() ) {
// if element not found: insert element
this->tf[it->first] = it->second;
} else {
// if element found: sum elements
valIt->second += it->second;
}
}
this->time = t;
this->tf = finalList;
}
void MicroCluster::fade(int tnow, double omega, double lambda, bool termFading){
//fade cluster
this->weight = this->weight * pow(2,(-lambda * (tnow-this->time)));
this->weight = this->weight * pow(2,(-lambda * (tnow-this->time)));
//fade tokens
//Here we fade each single entry in our TF vector according to lambda and the passed time
//From last index to first one in order to prevent problems with indices
if(termFading){
for(int i = tf.size()-1; i>=0;i--){
double tf = (double)this->tf[i];
tf = tf * pow(2,-lambda*(tnow-this->time));
if(tf<=omega){
this->tf.erase(i);
} else {
this->tf[i] = tf;
// iterate all tokens
std::unordered_map<std::string, double>::iterator it = this->tf.begin();
while(it != this->tf.end()){
// fade entry
it->second = it->second * pow(2,-lambda*(tnow-this->time));
if(it->second <= omega){
// if weight below threshold: remove
it = this->tf.erase(it);
} else{
it++;
}
}
}
// update time of last update
this->time = tnow;