-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhierarchical_cluster_mine_2.py
More file actions
104 lines (82 loc) · 3.31 KB
/
hierarchical_cluster_mine_2.py
File metadata and controls
104 lines (82 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 22 18:47:06 2019
@author: moose
"""
# Standard library
import os

# Third-party: data handling and plotting
import pandas as pd
import matplotlib.pyplot as plt

# Third-party: clustering algorithms
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering, KMeans

# BUG FIX: the original literal 'C:\\\\Users\\\\moose\\\\Desktop' produced the
# runtime path 'C:\\Users\\moose\\Desktop' (doubled separators) — accidental
# over-escaping. A raw string gives the intended path unambiguously.
# NOTE(review): a hard-coded user desktop path only works on the author's
# machine; consider making this configurable.
os.chdir(r'C:\Users\moose\Desktop')
class clustering(object):
    """Cluster two numeric columns of a CSV dataset.

    Provides dendrogram / elbow-method diagnostics plus agglomerative and
    K-Means clustering visualizations over the same two-column feature matrix.

    NOTE(review): class name kept lowercase for backward compatibility with
    existing callers, although PEP 8 would prefer ``Clustering``.
    """

    def __init__(self, column1, column2, file):
        """Load `file` (CSV path) and extract positional columns `column1`/`column2`."""
        self.column1 = column1
        self.column2 = column2
        self.file = file
        self.datasetNew = pd.read_csv(self.file)
        # 2-D feature matrix (n_samples x 2) used by every method below.
        self.X = self.datasetNew.iloc[:, [self.column1, self.column2]].values

    def dendrogram(self, linkage):
        """Plot a dendrogram (linkage method `linkage`) to pick a cluster count."""
        # Don't bind the result to a local named `dendrogram` — the original
        # shadowed this method's own name; the return value is unused anyway.
        sch.dendrogram(sch.linkage(self.X, linkage))
        plt.title('Dendrogram')
        plt.xlabel('X-Value')
        plt.ylabel('Y-Value')
        plt.show()

    def agglomViz(self, affin, link, clusters):
        """Fit agglomerative clustering and scatter-plot the resulting clusters.

        affin: affinity/metric name (e.g. 'euclidean').
        link: linkage criterion (e.g. 'ward').
        clusters: number of clusters to form.
        """
        # NOTE(review): `affinity` is deprecated in newer scikit-learn in
        # favour of `metric` — confirm the installed version before upgrading.
        hc = AgglomerativeClustering(n_clusters=clusters, affinity=affin, linkage=link)
        y_hc = hc.fit_predict(self.X)
        for i in range(0, clusters):
            # Label each cluster so plt.legend() has entries to display
            # (the original legend had no labeled artists and came up empty).
            plt.scatter(self.X[y_hc == i, 0], self.X[y_hc == i, 1], s=100,
                        label='Cluster %d' % (i + 1))
        # BUG FIX: the original called plt.show() here, before the title/labels/
        # legend, so those decorated a fresh empty figure instead of this one.
        plt.title('Clusters of Data: Agglomerative Method')
        plt.xlabel('X-Value')
        plt.ylabel('Y-Value')
        plt.legend()
        plt.show()

    def kMeansElbow(self):
        """Plot WCSS for k = 1..10 (elbow method) to choose a K-Means cluster count."""
        wcss = []
        for i in range(1, 11):
            kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300,
                            n_init=10, random_state=0)
            kmeans.fit(self.X)
            # inertia_ is the within-cluster sum of squares for this k.
            wcss.append(kmeans.inertia_)
        plt.plot(range(1, 11), wcss)
        plt.title('The Elbow Method')
        plt.xlabel('Number of Data')
        plt.ylabel('WCSS')
        plt.show()

    def kMeansViz(self, clusterNum):
        """Fit K-Means with `clusterNum` clusters and plot clusters + centroids."""
        kmeans = KMeans(n_clusters=clusterNum, init='k-means++', max_iter=300,
                        n_init=10, random_state=0)
        y_kmeans = kmeans.fit_predict(self.X)
        for i in range(0, clusterNum):
            plt.scatter(self.X[y_kmeans == i, 0], self.X[y_kmeans == i, 1], s=100)
        # Centroids drawn once, after the per-cluster scatters.
        plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                    s=50, c='yellow', label='Centroids')
        plt.title('Clusters of Data: K-Means Method')
        plt.xlabel('X-Value')
        plt.ylabel('Y-Value')
        # Consistency fix: the centroid scatter sets label='Centroids' but the
        # original never called plt.legend(), so the label was never shown.
        plt.legend()
        plt.show()
# --- Script driver -----------------------------------------------------------
csv_file = 'mall_customers.csv'
first_column = 3          # positional index of the first feature column
second_column = 4         # positional index of the second feature column
number_clusters = 5
affinity = 'euclidean'
linkage = 'ward'

# Efficiency fix: the original built four separate `clustering` instances,
# re-reading and re-parsing the same CSV four times. One instance suffices,
# because each method only reads the feature matrix built in __init__.
model = clustering(first_column, second_column, csv_file)

# Dendrogram for choosing the hierarchical cluster count.
model.dendrogram(linkage)
# Elbow method for determining the K-Means cluster number.
model.kMeansElbow()
# Agglomerative hierarchical clustering visualization.
model.agglomViz(affinity, linkage, number_clusters)
# K-Means clustering with yellow centroid markers.
model.kMeansViz(number_clusters)