The package contains two functions - binary.segmentation and wild.binary.segmentation. The first iteratively uses a bisection algorithm that first tests for the existence of change point(s) and then estimates their location(s). The algorithm starts over the entire interval and continues on sub-intervals until no more change points can be identified. The second (wild binary segmentation) generates thousands of random sub-intervals within the data and computes the test statistic for each of these intervals. The largest of these values is then compared to a threshold, and the original interval is divided in half if a change point is found. This algorithm continues until no more change points can be identified. The binary.segmentation function returns an estimate of M (the dominant temporal dependence), a list of the detected change points, and the corresponding p-values for each change. The wild.binary.segmentation function returns an estimate of M and a list of detected change points.
References: Li, J., Li, L., Xu, M., Zhong, P (2018). Change Point Detection in the Mean of High-Dimensional Time Series Data under Dependence. Manuscript. Fryzlewicz, P. (2014). Wild Binary Segmentation for Multiple Change-point Detection. The Annals of Statistics.
The code below simulates a 150x200 time series with temporal dependence and change points located at 70 and 80. Wild binary segmentation is called on this simulated data matrix to assess its accuracy. The results are tracked over several iterations, and the probability of detecting a change point at each point in the time series is plotted. The number of true positives, false positives, and false negatives is recorded for each iteration, and the mean and standard deviation of each is returned at the end:
library(ggplot2)
library(ggpubr)
#> Loading required package: magrittr
library(tidyverse)
#> -- Attaching packages ---------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
#> v tibble 1.4.2 v purrr 0.2.5
#> v tidyr 0.8.1 v dplyr 0.7.6
#> v readr 1.1.1 v stringr 1.3.1
#> v tibble 1.4.2 v forcats 0.3.0
#> -- Conflicts ------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
#> x tidyr::extract() masks magrittr::extract()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x purrr::set_names() masks magrittr::set_names()
library(HDcpDetect)
#################################################
#Compute the mean/sd of true positives and false positives
#################################################
<- function(true_cpts,data_mat){
means_n_sds <-NULL
tp_vec<-NULL
fp_vecfor (i in 1:nrow(data_mat)){
<-0
tp_countfor (j in 1:ncol(data_mat)){
if (data_mat[i,j] %in% true_cpts){
<-tp_count+1
tp_count
}
}<-c(tp_vec,tp_count)
tp_vec
}for (k in 1:nrow(data_mat)){
<-0
fp_countfor (l in 1:ncol(data_mat)){
if (data_mat[k,l] %in% true_cpts == FALSE & data_mat[k,l] != 0){
<- fp_count + 1
fp_count
}
}<- c(fp_vec,fp_count)
fp_vec
}<-mean(tp_vec)
tp.mean<-sd(tp_vec)
tp.sd<-mean(fp_vec)
fp.mean<-sd(fp_vec)
fp.sd<-length(true_cpts)-tp.mean
fn.mean<-tp.sd
fn.sd<- cbind(tp.mean, tp.sd, fp.mean, fp.sd, fn.mean, fn.sd)
results colnames(results) <- c("tp.mean", "tp.sd","fp.mean","fp.sd","fn.mean","fn.sd")
return(results)
}
#################################################
#Data generation function
#################################################
<-function(n,p,M,k,Gam,mu){
data_Ger<-matrix(0,n,p)
data_Mat<-M+1
L<-matrix(rnorm(p*(n+L-1)),p*(n+L-1),1) #Gaussian innovation
Z# Z<-matrix(rt(p*(n+L-1),8),p*(n+L-1),1)*sqrt(3/4) # t innovation
<-1/rep(c(L:1),each=p)
vec.coef<-t(apply(Gam,1,rep,L))*matrix(vec.coef,ncol=L*p,nrow=p,byrow=T)
Gam.mat<-length(k)+1
iter<-c(0,k,n)
kfor(i in 1:iter){
for (t in (k[i]+1):k[i+1])
{<-matrix((matrix(mu[,i],ncol=1,nrow=p,byrow=F)-Gam.mat%*%Z[((t-1)*p+1):((t+L-1)*p),]),1,p,byrow=F)
data_Mat[t,]
}
}return(data_Mat)
}
#################################################
#Wild Binary Segmentation simulation
#################################################
<-200; n<-150
p<-0.4667
rat1<- 0.5333
rat2 <-c(round(n*rat1),round(n*rat2)) # location of two change points
k<-1
M<-0.05
alpha###################################################
# generate different population mean vectors
###################################################
<-0.3;delta1<-0;delta2<-2;delta3 <- 0
s_beta<-round(p^(1-s_beta)) # number of nonzero signals
mm<-mu2<-mu3<-rep(0,p)
mu1<-sample(c(-1,1),mm,replace=T)
signal.sign1<-sample(c(-1,1),mm,replace=T)
signal.sign2<-sample(c(-1,1),mm,replace=T)
signal.sign3<-sort(sample(1:p,mm,replace=F))
post1<-sort(sample(1:p,mm,replace=F))
post2<-sort(sample(1:p,mm,replace=F))
post3<-signal.sign1*delta1
mu1[post1]<-signal.sign2*delta2
mu2[post2]<-signal.sign3*delta3
mu3[post3]<-cbind(mu1,mu2,mu3)
mu#####################################################
# generate covariance matrices
#####################################################
<-matrix(0,p,p)
Gam<-0.5
wfor(i in 1:p)
for(j in 1:p)
{<-abs(i-j)
dijif(dij<(p*w))
{<-w^(dij)
Gam[i,j]
}
}<-1:p; rho<-0.6
times<-abs(outer(times, times, "-"))
H<-rho^H
Gam
<-10 # the number of iterations
iter
<-matrix(0,iter,30)
Allcpts<- NULL
list for (i in 1:iter){
<-data_Ger(n,p,M,k,Gam,mu)
data_M<-wild.binary.segmentation(data_M, minsize = 15)
cptsprint(cpts)
if (!is.character(cpts))
1:length(cpts)]<-cpts
{Allcpts[i,<- c(list,cpts)
list
}
}#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 48 70 80
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 40 67 74 93 117
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 16 70 80
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 70 80
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 70 78
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 70 80
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 70 80
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 69 79
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 27 80
#> [1] "dominant.temporal.dependence_M = 1"
#> [1] "FoundList"
#> [1] 56 70
if (is.null(list)){list<-0}
<- data.frame(list)
df colnames(df) <- "cpts"
<- df%>%dplyr::count(cpts)%>%dplyr::mutate(probability = n/ iter)
probs <- data.frame(1:150,0); colnames(all) <- c("point","probability")
all for (i in 1:150)
{ifelse(i %in% probs$cpts, all[all$point == i,]$probability <- probs[probs$cpts == i,]$probability, all[all$point == i,]$probability <- 0)
}<- ggplot(all,aes(x=point,y=probability))+geom_point()+geom_line()+labs(title="Wild Binary Segmentation Simulation (cpts at 70,80)", y= "Probability of Detection")+theme_bw()+theme(plot.title = element_text(hjust = 0.5))+xlim(0,150)+ylim(0,1)
wildbinaryseg
<- means_n_sds(c(70,80),Allcpts)
numeric.wildbinaryseg
#wildbinaryseg
numeric.wildbinaryseg#> tp.mean tp.sd fp.mean fp.sd fn.mean fn.sd
#> [1,] 1.3 0.8232726 1.2 1.47573 0.7 0.8232726