The DBSCAN function uses two parameters, eps and min_samples. The first parameter, eps, corresponds to the bandwidth parameter h in the course notes. The second parameter, min_samples, is proportional to the level λ from the course notes. More precisely, we have
λ = min_samples / Vol(B(0, h))
Make appropriate choices of these parameters as follows (a small numerical sketch of this relation follows the list).
• Set eps to the 20% quantile of interpoint distances when using DBSCAN, the same setting as used for the MeanShift bandwidth.
• For min_samples, among the candidate values [100, 400, 700, 1000], pick the smallest value that gives more than one cluster.
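For intuition, the level λ implied by a particular (eps, min_samples) pair can be computed directly from the relation above; this is a minimal sketch, assuming the data live in d dimensions so that Vol(B(0, h)) is the volume of a d-dimensional Euclidean ball (d = 2 for the datasets below), and the example numbers are hypothetical:
import math
def implied_level(min_samples, eps, dim=2):
    # lambda = min_samples / Vol(B(0, eps)); the volume of a dim-dimensional
    # ball of radius eps is pi^(dim/2) / Gamma(dim/2 + 1) * eps**dim
    vol = (math.pi ** (dim / 2)) / math.gamma(dim / 2 + 1) * eps ** dim
    return min_samples / vol
print(implied_level(400, 0.5))  # hypothetical values: min_samples = 400, eps = 0.5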
import numpy as np
from IPython.display import Image
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
# Problem 8
from sklearn.cluster import MeanShift, KMeans, SpectralClustering, estimate_bandwidth
import random
import math
random.seed(0)
## Data set 1: two concentric rings of radius 1 and 3 with Gaussian radial noise
X1 = []
for i in range(1000):
    theta = random.uniform(0, 2 * math.pi)
    radius = random.gauss(0, 0.2) + random.choice([1, 3])
    X1.append([radius * math.cos(theta), radius * math.sin(theta)])
X1 = np.array(X1)
## Data Set 2: two interleaving half-circles of radius ~2, offset horizontally
X2 = []
for i in range(1000):
    theta = random.uniform(0, 2 * math.pi)
    radius = random.gauss(0, 0.1) + 2
    if theta < math.pi:
        X2.append([radius * math.cos(theta) - 1, radius * math.sin(theta)])
    else:
        X2.append([radius * math.cos(theta) + 1, radius * math.sin(theta)])
X2 = np.array(X2)
## Data Set 3: three rotationally symmetric blobs centered at (0,1), (3,3), and (1,-3)
X3 = []
for i in range(1000):
    radius = random.gauss(0, 1)
    theta = random.uniform(0, 2 * math.pi)
    center = random.choice([[0, 1], [3, 3], [1, -3]])
    X3.append([radius * math.cos(theta) + center[0], radius * math.sin(theta) + center[1]])
X3 = np.array(X3)
fig, axs = plt.subplots(nrows=1, ncols=3, sharex=True)
ax = axs[0]
ax.scatter(X1[:,0],X1[:,1])
ax = axs[1]
ax.scatter(X2[:,0],X2[:,1])
ax = axs[2]
ax.scatter(X3[:,0],X3[:,1])
lamb = [100, 400, 700, 1000]  # candidate values for min_samples (proportional to the level λ)
bandwidth1 = estimate_bandwidth(X1, quantile=0.2)
bandwidth2 = estimate_bandwidth(X2, quantile=0.2)
bandwidth3 = estimate_bandwidth(X3, quantile=0.2)
# db0 = DBSCAN(eps=bandwidth1, min_samples=100).fit(X1)
# db0 does not work well on X1, so the next candidate value, 400, is used instead
db1 = DBSCAN(eps=bandwidth1, min_samples=400).fit(X1)
db2 = DBSCAN(eps=bandwidth2, min_samples=100).fit(X2)
db3 = DBSCAN(eps=bandwidth3, min_samples=100).fit(X3)
label_pred = db1.labels_
# one marker/color per cluster label; noise points (label -1) map to the last entry
mark = ['or', 'ob', 'oy', 'og', 'oc', 'om', 'ok', 'om']
j = 0
for i in label_pred:
    plt.plot([X1[j:j+1, 0]], [X1[j:j+1, 1]], mark[i], markersize=5)
    j += 1
plt.show()
label_pred = db2.labels_
mark = ['or', 'ob', 'oy', 'og', 'oc', 'om', 'ok', 'om']
j = 0
for i in label_pred:
    plt.plot([X2[j:j+1, 0]], [X2[j:j+1, 1]], mark[i], markersize=5)
    j += 1
plt.show()
label_pred = db3.labels_
mark = ['or', 'ob', 'oy', 'og', 'oc', 'om', 'ok', 'om']
j = 0
for i in label_pred:
    plt.plot([X3[j:j+1, 0]], [X3[j:j+1, 1]], mark[i], markersize=5)
    j += 1
plt.show()
bandwidth1 = estimate_bandwidth(X1, quantile=0.2)
bandwidth2 = estimate_bandwidth(X2, quantile=0.2)
bandwidth3 = estimate_bandwidth(X3, quantile=0.2)
clustering1 = MeanShift(bandwidth=bandwidth1).fit(X1)
clustering2 = MeanShift(bandwidth=bandwidth2).fit(X2)
clustering3 = MeanShift(bandwidth=bandwidth3).fit(X3)
label_pred = clustering1.labels_
# one marker/color per cluster label
mark = ['or', 'ob', 'oy', 'og', 'oc', 'om', 'ok', 'ow']
j = 0
for i in label_pred:
    plt.plot([X1[j:j+1, 0]], [X1[j:j+1, 1]], mark[i], markersize=5)
    j += 1
plt.show()
label_pred = clustering2.labels_
mark = ['or', 'ob', 'oy', 'og', 'oc', 'om', 'ok', 'ow']
j = 0
for i in label_pred:
    plt.plot([X2[j:j+1, 0]], [X2[j:j+1, 1]], mark[i], markersize=5)
    j += 1
plt.show()
label_pred = clustering3.labels_
mark = ['or', 'ob', 'oy', 'og', 'oc', 'om', 'ok', 'ow']
j = 0
for i in label_pred:
    plt.plot([X3[j:j+1, 0]], [X3[j:j+1, 1]], mark[i], markersize=5)
    j += 1
plt.show()
DBSCAN outperforms MeanShift on the first two datasets because it identifies outliers as noise, whereas mean-shift assigns every point to the nearest mode based on Euclidean distance.
The main drawback of DBSCAN is that it does not perform as well when the clusters have varying density: a single distance threshold ε and min_samples value cannot identify the right neighborhoods for every cluster when the density varies from cluster to cluster. The same drawback appears with very high-dimensional data, since the distance threshold ε again becomes challenging to estimate. The noise-handling difference is illustrated below.
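To make the noise-handling difference concrete, one can count how many points each fitted model above leaves unlabeled; this is a minimal sketch reusing db1–db3 and clustering1–clustering3 (DBSCAN marks noise with the special label -1, while MeanShift has no noise label):
for name, model in [("X1", db1), ("X2", db2), ("X3", db3)]:
    labels = model.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)   # exclude the noise label
    n_noise = int(np.sum(labels == -1))
    print("DBSCAN on", name, ":", n_clusters, "clusters,", n_noise, "noise points")
for name, model in [("X1", clustering1), ("X2", clustering2), ("X3", clustering3)]:
    # MeanShift assigns every point to its nearest mode, so no point is labeled -1
    print("MeanShift on", name, ":", len(set(model.labels_)), "clusters, 0 noise points")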
Image(filename = "Adagrad.png", width = 1000, height = 800)
# Data sampler code:
## Code for generator/sampler
import numpy as np
import random
import time
from numpy import linalg as LA
import statistics
# initialization
sigma = 1
d = 10
c_square = 100
cov = np.diag([(0.25**i) * c_square for i in range(1, d + 1)])
mean = [0] * d
# coefficient vector given in the problem
w = np.array([1] * d)
# Sampler function
def sampler(n):
    # data X generator
    np.random.seed(int(time.time() * 100000) % 100000)
    X = np.random.multivariate_normal(mean, cov, n)
    # data Y generator
    Y = np.matmul(X, w) + np.random.normal(0, sigma**2, n)
    return (X, Y)
x,y =sampler(1)
print("x = ", x)
print("\ny = ", y)
## Adagrad
lambda_given = 0.1                    # ridge regularization parameter
lambda1 = ((2**-1) * 10)**2           # = 25 = 0.25*c_square, the largest eigenvalue of cov
beta = 2 * (lambda1 + lambda_given)   # constant used in the step size
T = 1000
def Adagrad(data, T):
    X, Y = data
    w1 = np.zeros(d)                   # start from the zero vector
    w_path = [w1]
    cumu_derivative_square = 0
    for t in range(T):
        # stochastic gradient of (x_t . w - y_t)^2 + lambda_given * ||w||^2 at w1
        deriv = 2 * (np.matmul(X[t], w1) * X[t] + lambda_given * w1 - Y[t] * X[t])
        cumu_derivative_square += deriv**2                    # coordinate-wise running sum of squared gradients
        eta = (1 / (beta * cumu_derivative_square))**0.5      # per-coordinate AdaGrad step size
        w1 = w1 - eta * deriv
        w_path.append(w1)
    return w_path
## Run Adagrad on 10 independent samples and average the distances to the true w
random.seed(1)
w_all_ada = []
for i in range(10):
    data = sampler(T)
    w_path = Adagrad(data, T)
    print(w_path[-1])             # final iterate for this run
    w_all_ada.append(w_path)
Adagrad_distance = []
Adagrad_std = []
for t in range(T):
    distances = []
    for i in range(10):
        distances.append(LA.norm(w - w_all_ada[i][t]))
    Adagrad_distance.append(sum(distances) / 10)      # mean distance to the true w at step t
    Adagrad_std.append(statistics.stdev(distances))   # std across the 10 runs
plt.plot(Adagrad_distance,label='Adagrad')
plt.legend()
plt.show()
Image(filename = "adam.png", width = 800, height = 500)
lambda_given = 0.1                    # ridge regularization parameter
lambda1 = ((2**-1) * 10)**2           # = 25 = 0.25*c_square, the largest eigenvalue of cov
beta = 2 * (lambda1 + lambda_given)   # constant used in the step size
T = 1000
exp_factor = 1.1                      # exponential discount base for the squared-gradient accumulator
def Adam(data):
    X, Y = data
    w1 = np.zeros(d)
    w_path = [w1]
    cumu_derivative_square = 0
    for t in range(T):
        # stochastic gradient of (x_t . w - y_t)^2 + lambda_given * ||w||^2 at w1
        deriv = 2 * (np.matmul(X[t], w1) * X[t] + lambda_given * w1 - Y[t] * X[t])
        # exponentially discounted sum of squared gradients (discount factor 1/exp_factor)
        cumu_derivative_square = cumu_derivative_square / exp_factor + deriv**2
        # per-coordinate step size based on the discounted accumulator
        eta = (1 / (2 * (0.25 * c_square + lambda_given) * (1 + t)
                    * (1 - 1 / exp_factor) / (1 - (1 / exp_factor)**(t + 2))
                    * cumu_derivative_square))**0.5
        w1 = w1 - eta * deriv
        w_path.append(w1)
    return w_path
np.random.seed(1)
adam_w_all = []
for i in range(10):
    data = sampler(T)
    adam_w_all.append(Adam(data))
Adam_distance = []
Adam_std = []
for t in range(T):
    distances = []
    for i in range(10):
        distances.append(LA.norm(w - adam_w_all[i][t]))
    Adam_distance.append(sum(distances) / 10)      # mean distance to the true w at step t
    Adam_std.append(statistics.stdev(distances))   # std across the 10 runs
plt.plot(Adagrad_distance,label='Adagrad')
plt.plot(Adam_distance,label='Adam')
plt.title("average")
plt.legend()
plt.show()
plt.plot(Adagrad_std,label='Adagrad')
plt.plot(Adam_std,label='Adam')
plt.title("std")
plt.legend()
plt.show()
plt.errorbar(list(range(0,T)), Adagrad_distance, yerr = Adagrad_std, fmt='g', ecolor='y', label='Adagrad')
plt.errorbar(list(range(0,T)), Adam_distance, yerr = Adam_std, fmt='b', ecolor='r',label='Adam')
plt.title("average & std")
plt.legend()
plt.show()
Adam solves the problem of AdaGrad's ever-growing denominator by using a discount factor. AdaGrad shrinks the learning rate very aggressively: it decays the learning rate of each parameter in proportion to its update history (more updates mean more decay). Therefore, after a while, the frequently updated parameters receive very small updates because of the decayed learning rate. The sketch below makes this contrast concrete.
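The contrast can be seen with the two accumulators alone; the sketch below uses a made-up sequence of squared gradients (not the optimizers above) and reuses exp_factor = 1.1 as the discount base:
rng = np.random.default_rng(0)
sq_grads = rng.normal(size=1000) ** 2        # hypothetical squared gradients of one coordinate
adagrad_acc = np.cumsum(sq_grads)            # AdaGrad: plain running sum, grows without bound
adam_acc = np.zeros_like(sq_grads)
adam_acc[0] = sq_grads[0]
for t in range(1, len(sq_grads)):
    # exponentially discounted sum: past terms are shrunk by a factor 1/exp_factor each step
    adam_acc[t] = adam_acc[t - 1] / exp_factor + sq_grads[t]
print("final AdaGrad accumulator:   ", adagrad_acc[-1])   # keeps growing with t
print("final discounted accumulator:", adam_acc[-1])      # stays bounded, so the step size does not vanish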
import random
import numpy as np
d = 2
mu0 = [[-2, -3.5], [0, 0], [2, -3.5]]   # class-conditional means
Id = [[25, 0], [0, 25]]                 # shared covariance 25 * I
n = 500
def Data_Generate(n):
    outX = []
    # class labels drawn with probabilities (0.5, 0.25, 0.25)
    outY = np.random.choice(list(range(3)), n, replace=True, p=[0.5, 0.25, 0.25])
    for i in range(n):
        mu = mu0[outY[i]]
        X = np.random.multivariate_normal(mu, Id, 1)[0].tolist()
        outX.append(X)
    return outX, outY
When k = n, the model is stable but poor, because every data point in the training set falls in the same neighborhood. In this problem, when k = n the model is constant over all x: the prediction is the majority label of the whole training set rather than of the nearest neighbors only. In general, a larger k suppresses the effect of noise but makes the classification boundaries less distinct. A quick check of the k = n case is sketched below.
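To confirm the constant-prediction claim, here is a minimal sketch (assuming a freshly generated training sample; KNeighborsClassifier is the same class imported for the experiment below) showing that a classifier with n_neighbors = n outputs the majority class everywhere:
from sklearn.neighbors import KNeighborsClassifier
Xtr, Ytr = Data_Generate(n)
knn_all = KNeighborsClassifier(n_neighbors=n).fit(Xtr, Ytr)
# With k = n the neighborhood of any query is the entire training set,
# so the prediction is the overall majority label regardless of x.
queries = [[-10.0, -10.0], [0.0, 0.0], [10.0, 10.0]]   # arbitrary query points
print(knn_all.predict(queries))     # the same label three times
print(np.bincount(Ytr).argmax())    # the majority class in the training sample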
Image(filename = "testerror.png", width = 400, height = 300)
from sklearn.neighbors import KNeighborsClassifier
testX, testY = Data_Generate(n)
# candidate values of k, chosen according to the instructor's answer on Piazza
k = list(range(1, 500, 50))
print(k)
def test_error(x, y):
    # fraction of predictions x[i] that disagree with the true labels y[i]
    count = 0
    for i in range(len(y)):
        if x[i] != y[i]:
            count += 1
    return count / len(y)
errorlist = np.zeros((len(k), 10))
for j in range(len(k)):
    # 10 independent training sets per value of k, all evaluated on the same test set
    for i in range(10):
        trainX, trainY = Data_Generate(n)
        ENN = KNeighborsClassifier(n_neighbors=k[j])
        ENN.fit(trainX, trainY)
        errorlist[j][i] = test_error(ENN.predict(testX), testY)
print(errorlist)
average = errorlist.mean(axis=1)   # mean test error for each k
std = errorlist.std(axis=1)        # std of test error across the 10 runs
plt.errorbar(k, average, yerr = std, fmt='b', ecolor='r',label='average & std')
plt.title("k & test error")
plt.legend()
plt.show()
print(max(std))
# index (and corresponding k) at which the standard deviation is largest
place = np.where(std == max(std))[0]
print(place, k[place[0]], std[place[0]])
We find the highest variance at k = 1; that is, in the k-NN setting, smaller values of k give higher variance. A larger k means lower flexibility, and by the bias-variance tradeoff the variance of the estimate is lower. At k = 1, each neighborhood contains only a single point, so the classifier is maximally flexible: it has high variance and low bias, as the quick check below illustrates.
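Consistent with this, a 1-NN classifier memorizes its training set exactly; this is a minimal sketch reusing the last trainX, trainY generated in the loop above together with the test_error helper:
knn1 = KNeighborsClassifier(n_neighbors=1).fit(trainX, trainY)
# Each training point is its own nearest neighbor, so the training error is (essentially) zero,
# while the test error fluctuates from one training sample to the next.
print("1-NN training error:", test_error(knn1.predict(trainX), trainY))
print("1-NN test error:", test_error(knn1.predict(testX), testY))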