Chapter 11 Anomaly Detection
11.1 Anomaly Detection in R
Distance to k-Nearest Neighbor as Outlier Score
First Example Data Set
# Simulate 50 standard-normal points and append one planted outlier at (6, 6).
set.seed(5364)
x1 <- rnorm(50)
y1 <- rnorm(50)
mydata1 <- data.frame(x = c(x1, 6), y = c(y1, 6))
plot(mydata1, pch = 16)

# Distance Matrix
D <- as.matrix(dist(mydata1))

# Distance to kth nearest neighbor (k=5)
# Every sorted row of D starts with the point's own zero distance, so the
# 5th nearest neighbor is found at position 6. The number of points is taken
# from D instead of being hard-coded.
kdist <- numeric(nrow(D))
for (i in seq_len(nrow(D))) {
  kdist[i] <- sort(D[i, ])[6]
}

# Plotting data
library(proto)
library(ggplot2)
library(gridExtra)
# Ordinary plot
# The point size is a fixed setting, so it is passed to geom_point() instead
# of being mapped inside aes(); mapping a constant there adds a meaningless
# "size" legend to the plot.
ggplot(data = mydata1, aes(x = x, y = y)) +
  geom_point(size = 3)

# Plot with Color Determined by kdist
# mydata1 only has columns x and y, so `kdist` is looked up in the workspace.
ggplot(data = mydata1, aes(x = x, y = y, col = kdist)) +
  geom_point(size = 3)

# Gradient Plot (Heatmap)
ggplot(data = mydata1, aes(x = x, y = y, col = kdist)) +
  geom_point(size = 3) +
  scale_colour_gradientn(colours = c("black", "red"))

ggplot(data = mydata1, aes(x = x, y = y, col = kdist)) +
  geom_point(size = 3) +
  scale_colour_gradientn(colours = c("blue", "red"))


Finding Rows with Outliers
## [1] 51
#' Distance from each observation to its k-th nearest neighbor
#'
#' Builds the full pairwise distance matrix with `dist()` and, for every row,
#' picks the (k + 1)-th smallest entry. The smallest entry of each row is the
#' observation's own zero distance, which is why the index is shifted by one.
#'
#' @param data A data frame or matrix with one observation per row.
#' @param k Which nearest neighbor to use (1 = closest other observation).
#' @return A numeric vector with one k-distance per row of `data`. Entries are
#'   `NA` when `k` is not smaller than the number of rows; the result is
#'   `numeric(0)` when `data` has no rows.
my.kdist <- function(data, k) {
  n <- nrow(data)
  # Nothing to measure on empty input; avoids indexing into a 0 x 0 matrix.
  if (n == 0) {
    return(numeric(0))
  }
  D <- as.matrix(dist(data))
  vapply(
    seq_len(n),
    function(i) sort(D[i, ])[k + 1],
    numeric(1)
  )
}
# Sanity check: the function result should match the k-distances computed by
# hand for the first data set (k = 5).
temp <- my.kdist(mydata1, 5)
cor(temp, kdist)
## [1] 1
## [1] 0 0
Second Example Data Set
# Fifty standard-normal points plus a small cluster of five points
# scattered (sd 0.5) around (10, 10).
set.seed(5364)
x1 <- rnorm(50)
y1 <- rnorm(50)
x2 <- 10 + 0.5 * rnorm(5)
y2 <- 10 + 0.5 * rnorm(5)
# NOTE(review): all random draws are already done at this point, so this
# second seeding does not influence mydata2 -- confirm whether it is needed.
set.seed(5364)
mydata2 <- data.frame(
  x = c(x1, x2),
  y = c(y1, y2)
)
plot(mydata2, pch = 16)
# Plot using kdist with k=5
kdist <- my.kdist(mydata2, 5)
# Size is a constant setting, so it belongs in geom_point(); mapping it in
# aes() would add a meaningless "size" legend.
ggplot(data = mydata2, aes(x = x, y = y, col = kdist)) +
  geom_point(size = 3) +
  scale_colour_gradientn(colours = c("black", "red"))

## [1] 51 52 53 54 55
# Plot using kdist with k=4
kdist <- my.kdist(mydata2, 4)
# Size is a constant setting, so it belongs in geom_point(); mapping it in
# aes() would add a meaningless "size" legend.
ggplot(data = mydata2, aes(x = x, y = y, col = kdist)) +
  geom_point(size = 3) +
  scale_colour_gradientn(colours = c("black", "red"))

Third Example Data Set
# Ten standard-normal points, a tight cluster (sd 0.1) of ten points around
# (10, 0), and one isolated point at (6, 5).
set.seed(5366)
x1 <- rnorm(10)
y1 <- rnorm(10)
x2 <- 10 + 0.1 * rnorm(10)
y2 <- 0.1 * rnorm(10)
mydata3 <- data.frame(
  x = c(x1, x2, 6),
  y = c(y1, y2, 5)
)
plot(mydata3, pch = 16)
# Detecting Outliers
kdist <- my.kdist(mydata3, 5)
# Size is a constant setting, so it belongs in geom_point(); mapping it in
# aes() would add a meaningless "size" legend.
ggplot(data = mydata3, aes(x = x, y = y, col = kdist)) +
  geom_point(size = 3) +
  scale_colour_gradientn(colours = c("blue", "red"))

## [1] 21
11.1.1 Density as an Outlier Score
Writing a density function
#' Inverse mean k-nearest-neighbor distance as a density score
#'
#' For every observation, takes the distances to its `k` nearest neighbors
#' (positions 2 to k + 1 of the sorted distance row, skipping the zero
#' self-distance) and returns the reciprocal of their mean.
#'
#' @param data A data frame or matrix with one observation per row.
#' @param k Number of nearest neighbors to average over.
#' @return A numeric vector with one density score per row of `data`.
#'   Scores are `NA` when `k` is not smaller than the number of rows, `NaN`
#'   when `k` is 0 (no neighbors to average), `Inf` when all `k` neighbor
#'   distances are zero, and the result is `numeric(0)` for zero-row input.
my.density <- function(data, k) {
  n <- nrow(data)
  # Nothing to score on empty input; avoids indexing into a 0 x 0 matrix.
  if (n == 0) {
    return(numeric(0))
  }
  D <- as.matrix(dist(data))
  # seq_len(k) + 1L equals 2:(k + 1) for k >= 1 but is empty for k = 0,
  # whereas 2:(k + 1) would count backwards and include the self-distance.
  neighbor_positions <- seq_len(k) + 1L
  vapply(
    seq_len(n),
    function(i) {
      knn.distances <- sort(D[i, ])[neighbor_positions]
      (mean(knn.distances))^(-1)
    },
    numeric(1)
  )
}
# Density score (k = 5) for the third data set, drawn with two opposite
# colour gradients.
mydata3.density <- my.density(mydata3, 5)
# Size is a constant setting, so it belongs in geom_point(); mapping it in
# aes() would add a meaningless "size" legend.
ggplot(data = mydata3, aes(x = x, y = y, col = mydata3.density)) +
  geom_point(size = 3) +
  scale_colour_gradientn(colours = c("blue", "red"))

ggplot(data = mydata3, aes(x = x, y = y, col = mydata3.density)) +
  geom_point(size = 3) +
  scale_colour_gradientn(colours = c("red", "blue"))

11.1.2 Average Relative Density as an Outlier Score
11.1.3 Local Outlier Factor Method (LOF)
library(DMwR)
# Local outlier factor scores (k = 5) for the third data set.
outlier.scores <- lofactor(mydata3, k = 5)
# Size is a constant setting, so it belongs in geom_point(); mapping it in
# aes() would add a meaningless "size" legend.
ggplot(data = mydata3, aes(x = x, y = y, col = outlier.scores)) +
  geom_point(size = 3) +
  scale_colour_gradientn(colours = c("blue", "red"))

11.1.4 Exploring Outliers in Iris Data with LOF
Remove Species Labels
## The following objects are masked from iris (pos = 3):
##
## Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species

## [1] 3
## [1] 0.9149297 0.9175461 0.9211779 0.9247702 0.9280119 0.9298955 0.9383261 0.9438872 0.9456141
## [10] 0.9532838 0.9535062 0.9574717 0.9642619 0.9683073 0.9696800 0.9700819 0.9702753 0.9724783
## [19] 0.9736295 0.9742450 0.9812272 0.9827467 0.9837029 0.9847184 0.9871393 0.9886461 0.9901533
## [28] 0.9906144 0.9913695 0.9925829 0.9936177 0.9953949 0.9957788 0.9960430 0.9975503 0.9979707
## [37] 0.9985588 0.9987528 0.9987918 1.0008192 1.0012099 1.0012549 1.0019179 1.0030391 1.0038486
## [46] 1.0038486 1.0040905 1.0049730 1.0055536 1.0064616 1.0069813 1.0071882 1.0082068 1.0120380
## [55] 1.0145495 1.0152831 1.0153552 1.0160806 1.0165880 1.0180067 1.0199571 1.0213139 1.0215881
## [64] 1.0223231 1.0244915 1.0247701 1.0249838 1.0280724 1.0290194 1.0300098 1.0352453 1.0406309
## [73] 1.0431377 1.0473884 1.0484284 1.0496945 1.0501526 1.0503884 1.0504145 1.0545295 1.0553704
## [82] 1.0563053 1.0609483 1.0643763 1.0666094 1.0688565 1.0725190 1.0756115 1.0835872 1.0853880
## [91] 1.0896507 1.1094105 1.1104066 1.1131665 1.1154607 1.1158796 1.1231084 1.1234314 1.1243352
## [100] 1.1251169 1.1258939 1.1261694 1.1381879 1.1439846 1.1514986 1.1533540 1.1607787 1.1657260
## [109] 1.1680734 1.1680734 1.2026102 1.2214419 1.2288344 1.2324579 1.2390413 1.2411677 1.2436175
## [118] 1.2505891 1.2589051 1.2609826 1.2619762 1.2646144 1.2837610 1.2842448 1.2925090 1.2963231
## [127] 1.3015193 1.3052801 1.3157904 1.3222612 1.3243655 1.3370690 1.3403839 1.3408929 1.3457419
## [136] 1.3523665 1.3538518 1.3667996 1.3724774 1.3764034 1.3933552 1.4551038 1.4638963 1.4801222
## [145] 1.5286564 1.5482808 1.6025838 1.9591426 2.0292631 2.4799601
# Flag observations whose LOF score is at least 1.7 and draw them in colour;
# every other observation stays black.
outliers <- outlier.scores >= 1.7
# One colour entry per score instead of a hard-coded 150.
coloring <- rep("black", length(outlier.scores))
# Cycle through the highlight colours so the assignment is valid for any
# number of flagged rows (exactly red, green, blue when three are flagged).
coloring[outliers] <- rep_len(c("red", "green", "blue"), sum(outliers))
plot(iris.x, col = coloring)