Normalized_Spectral_Clustering.py
# 208539270 yardenbakish
# 208983700 tamircohen1
"""The Normalized Spectral Clustering algorithm - main interface.
Input: obs - an array of n vectors with n_dim dimensions; k - the number of clusters; and the Random parameter,
which indicates whether the program's initial invocation assigned True or False to Random.
Output: a labeling array in which the value at the i'th index is the computed centroid to which the i'th observation
was assigned, together with the k that was used for kmeans."""
"""IMPORTANT NOTE: we do not allow division by numbers that are less than or equal to 0.0001, since such values are considered 0 (as instructed).
In general, we treat 0.0001 as 0."""
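# A minimal usage sketch (hypothetical - the real call is made by the main module, whose argument handling is not shown here),
# assuming obs holds n observations of n_dim coordinates each:
#   labels, k_used = Normalized_Spectral_Clustering(obs, n, k, Random, n_dim)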
import numpy as np
import QR_Iter as QR
import mykmeanssp as km
import kmeans_plus as kmeans_pp
import EuclideanDistance as EC
import math
#constant for kmeans' number of iterations
MAX_ITER = 300
def Normalized_Spectral_Clustering(obs, n, k, Random, n_dim):
    # Forming the Weighted Adjacency Matrix
    W = np.zeros((n, n), dtype=np.float64)
    for i in range(n):
        for j in range(i, n):
            if (i == j):
                W[i][i] = 0
            else:
                W[i][j] = np.exp(-EC.eucDist(obs[i], obs[j]) / 2)  # calling the Euclidean distance function from another module
                W[j][i] = W[i][j]  # W is symmetric
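    # At this point W is the weighted adjacency matrix: W[i][j] = exp(-eucDist(obs[i], obs[j]) / 2) for i != j, and W[i][i] = 0.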
    # D is the Diagonal Degree Matrix, D_half is D^(-1/2)
    D_half = np.zeros((n, n), dtype=np.float64)
    for i in range(n):
        sq_sum_row = np.sqrt(sum(W[i]))
        if (sq_sum_row <= 0.0001):  # division by a value <= 0.0001 is considered division by zero
            print("Division by a value < 0.0001 while generating the D^(-1/2) matrix was encountered.\nThe program will terminate")
            exit(0)
        D_half[i][i] = 1 / sq_sum_row
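    # D_half now holds D^(-1/2), where D is the diagonal degree matrix with D[i][i] equal to the sum of the i'th row of W.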
    DWD = np.matmul(np.matmul(D_half, W, dtype=np.float64), D_half, dtype=np.float64)  # D^(-1/2) * W * D^(-1/2)
    # The Normalized Graph Laplacian
    L_norm = np.identity(n, dtype=np.float64) - DWD
    """VERY IMPORTANT NOTE: as suggested in both the project's forum and the Lecturer Messages forum,
    in order to solve the problem of division by zero in the Modified Gram-Schmidt algorithm,
    we add the identity matrix to L_norm before calling the QR algorithm"""
    L_norm = L_norm + np.identity(n, dtype=np.float64)
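    # Adding the identity shifts every eigenvalue of L_norm by +1 while leaving the eigenvectors,
    # and the gaps between consecutive eigenvalues used by the eigengap heuristic, unchanged.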
    # finding the eigenvalues and eigenvectors of L_norm
    A_bar, Q_bar = QR.QR_Iter(L_norm, n)  # calling the QR function from another module and receiving the eigenvalues and eigenvectors
    # sorting the eigenvalues while keeping each column index, so that every eigenvalue can be matched with its eigenvector
    for i in range(n):  # according to the instructions and the clarifications in the project's forum, each eigenvalue smaller than 0.0001 is considered zero
        if abs(A_bar[i][i]) <= 0.0001:
            A_bar[i][i] = 0
    eigenvals = sorted([(A_bar[i][i], i) for i in range(n)], key=lambda x: x[0])  # sort the eigenvalues and remember each one's initial column index (the tuple's 1st index)
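    # eigenvals is now a list of (eigenvalue, original column index) pairs, sorted by eigenvalue in ascending order.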
    # k is either provided by the user or computed by the eigengap heuristic - depending on Random
    if (Random == False):
        k2 = k
    else:
        # calculate k using the eigengap heuristic
        eigengaps = [np.abs(eigenvals[i][0] - eigenvals[i + 1][0]) for i in range(int(math.ceil(n / 2)))]
        k2 = np.argmax(eigengaps) + 1  # argmax returns the index of the first largest gap; plus 1 because the 1st index is 0
        if k2 == 1:  # if the eigengap heuristic determined that k is 1, simply return this tuple; the rest of the algorithm identifies it (handled in the main module)
            return (0, 1)
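    # The matrix T built below corresponds to U in the NSC algorithm: its columns are the eigenvectors of L_norm
    # that match the k2 smallest eigenvalues, and each of its rows is subsequently renormalized to unit length.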
    # step 4 of the NSC algorithm
    T = Q_bar[:, [eigenvals[i][1] for i in range(k2)]]
    # step 5 of the NSC algorithm
    for i in range(n):
        row_sum = np.sqrt(np.sum(T[i] ** 2))
        if row_sum <= 0.0001:  # numbers below 0.0001 are considered zero - no division allowed
            print("Division by a value < 0.0001 while generating the T matrix was encountered.\nThe program will terminate")
            exit(0)
        T[i] = T[i] / row_sum
    # we flatten T and pass it on as a 1-d python list
    T = T.ravel().tolist()
    # calling the kmeans++ algorithm to choose the initial centroids
    indices = kmeans_pp.kmeans_pp(T, k2, k2, n)
    # calling the kmeans algorithm, implemented as a C extension
    spec_results = km.kmeans(k2, n, k2, MAX_ITER, T, indices)
    return (spec_results, k2)