import random
from copy import deepcopy
from itertools import groupby
from statistics import median
# This function should have one parameter, a file name (including, if necessary, its path).
# The function should read this CSV file and return a matrix (list of lists) in which a row
# in the file is a row in the matrix. If the file has N rows, the matrix should have N
# elements (each element is a list). Notice that in CSV files a comma separates columns
# (CSV = comma separated values). You can assume the file will contain solely numeric values
# (and commas, of course) with no quotes.
def load_from_csv(fileName):
    # Matrix (list of lists) to be returned
    res = []
    # Read the file line by line; each line becomes one row of the matrix.
    # Values are converted to float, since the file is assumed to contain only numeric values.
    with open(fileName) as f:
        for line in f:
            res.append([float(v) for v in line.strip().split(',')])
    return res
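# A quick illustration (hypothetical file, not part of the assignment data): if 'tiny.csv' contained
# the two lines "1,2" and "3,4", load_from_csv('tiny.csv') would return [[1.0, 2.0], [3.0, 4.0]].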
# This function should have two parameters, both of them lists. It should return the Manhattan
# distance between the two lists. For details about this distance, read the appendix.
def get_distance(l1, l2):
    # Check that the two lists have the same length
    if len(l1) != len(l2):
        print('Error! list1 and list2 are not the same length!')
        return -1
    else:
        d = 0
        # Sum the absolute differences, index by index
        for i in range(len(l1)):
            d += abs(l1[i] - l2[i])
        return d
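# Worked example: get_distance([1, 2, 3], [4, 6, 8]) = |1-4| + |2-6| + |3-8| = 3 + 4 + 5 = 12.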
# This function should have two parameters, a matrix (list of lists) and a column number. It should
# look into all the elements of the data matrix in this column number, and return the highest value.
def get_max(m, c):
    # Check that the column number is within the number of columns in the matrix
    if c >= len(m[0]):
        print('Error! The column number should be smaller than the number of columns in the matrix')
        return -1
    else:
        maxValue = m[0][c]
        # Loop through each row of the matrix
        for r in m:
            # Keep the largest value seen so far in this column
            if r[c] > maxValue:
                maxValue = r[c]
        return maxValue
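# Worked example (illustrative values only): for m = [[1.0, 5.0], [3.0, 2.0]],
# get_max(m, 0) returns 3.0 and get_max(m, 1) returns 5.0.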
# This function should have two parameters, a matrix (list of lists) and a column number. It should
# look into all the elements of the data matrix in this column number, and return the lowest value.
def get_min(m, c):
    # Check that the column number is within the number of columns in the matrix
    if c >= len(m[0]):
        print('Error! The column number should be smaller than the number of columns in the matrix')
        return -1
    else:
        minValue = m[0][c]
        # Loop through each row of the matrix
        for r in m:
            # Keep the smallest value seen so far in this column
            if r[c] < minValue:
                minValue = r[c]
        return minValue
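# Worked example: for the same m = [[1.0, 5.0], [3.0, 2.0]],
# get_min(m, 0) returns 1.0 and get_min(m, 1) returns 2.0.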
# This function should have two parameters, a matrix (list of lists) and a column number. It should
# look into all the elements of the data matrix in this column number, and return the average of the values in this column.
def get_mean(m, c):
    # Check that the column number is within the number of columns in the matrix
    if c >= len(m[0]):
        print('Error! The column number should be smaller than the number of columns in the matrix')
        return -1
    else:
        # Sum the values in the column, then divide by the number of rows
        total = 0
        for r in m:
            total += float(r[c])
        return total / len(m)
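# Worked example: for m = [[1.0, 5.0], [3.0, 2.0]], get_mean(m, 0) = (1.0 + 3.0) / 2 = 2.0.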
# This function should take one parameter, a matrix (list of lists). It should return a matrix containing
# the standardised version of the matrix passed as a parameter. This function should somehow use the get_max
# and get_min functions. For details on how to standardise a matrix, read the appendix.
def get_standardised_matrix(m):
    # Pre-compute the maximum, minimum and mean of every column
    maxList = [get_max(m, c) for c in range(len(m[0]))]
    minList = [get_min(m, c) for c in range(len(m[0]))]
    meanList = [get_mean(m, c) for c in range(len(m[0]))]
    lists = []
    for r in m:
        l = []
        # Standardise each value: (value - column mean) / (column max - column min)
        for c, v in enumerate(r):
            l.append((float(v) - meanList[c]) / (maxList[c] - minList[c]))
        lists.append(l)
    return lists
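# Worked example: for m = [[1.0, 5.0], [3.0, 2.0]], column 0 has max 3.0, min 1.0, mean 2.0 and
# column 1 has max 5.0, min 2.0, mean 3.5, so get_standardised_matrix(m) returns
# [[-0.5, 0.5], [0.5, -0.5]] (each value is (value - column mean) / (column max - column min)).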
# This function should have two parameters: a matrix (list of lists), and a column number. It should return
# the median of the values in the data matrix at the column number passed as a parameter. Details about
# median can be easily found online, eg. https://en.wikipedia.org/wiki/Median.
def get_median(m, c):
    # Collect the values of the column and let statistics.median do the work
    l = [float(r[c]) for r in m]
    return median(l)
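# Worked example: for m = [[1.0], [7.0], [3.0]], get_median(m, 0) returns 3.0 (the middle value of the
# sorted column); with an even number of rows, statistics.median averages the two middle values.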
# This function should have three parameters: (i) a matrix (list of lists), (ii) the list S, (iii) the value
# of K. This function should implement Step 6 of the algorithm described in the appendix. It should return
# a list containing K elements, c1, c2, ... , cK. Clearly, each of these elements is also a list.
def get_centroids(m, S, K):
    res = []
    for k in range(K):
        c = []
        # Select the rows that have been assigned to cluster k
        m_k = [row for index, row in enumerate(m) if S[index] == k]
        # Each element j of ck should be equal to the median of the column D'j,
        # computed over those rows only
        for j in range(len(m[0])):
            c.append(get_median(m_k, j))
        # Store ck
        res.append(c)
    return res
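# Worked example: for m = [[0.0, 0.0], [1.0, 1.0], [10.0, 10.0], [12.0, 12.0]] and S = [0, 0, 1, 1],
# get_centroids(m, S, 2) returns [[0.5, 0.5], [11.0, 11.0]] (the column-wise medians of each cluster).
# Note this assumes every cluster in 0 .. K-1 has at least one row assigned to it.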
# This function should have two parameters: a data matrix (list of lists) and the number of groups to be
# created (K). This function follows the algorithm described in the appendix. It should return a list
# S (defined in the appendix). This function should use the other functions you wrote as much as possible.
# Do not keep repeating code you already wrote.
def get_groups(D, K):
    S = []
    # Select K different rows from the data matrix at random as the initial c1, c2, ... , cK
    c_list = deepcopy(random.sample(D, K))
    while True:
        S_temp = []
        for i in range(len(D)):
            # Calculate the Manhattan distance between data row D'i and each of the lists c1, c2, ... , cK
            dist = get_distance(D[i], c_list[0])
            s_i = 0
            for index, c_k in enumerate(c_list):
                d = get_distance(D[i], c_k)
                if d < dist:
                    dist = d
                    s_i = index
            # Assign the row D'i to the cluster of the nearest centroid
            S_temp.append(s_i)
        # If the assignment step did not change S, stop; otherwise update the centroids and repeat
        if S_temp == S:
            break
        else:
            S = deepcopy(S_temp)
            c_list = get_centroids(D, S, K)
    return S
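# Usage note: because the initial centroids are chosen at random, the labels returned can vary between
# runs. For example, get_groups(D, 3) returns a list S with len(S) == len(D) whose entries are cluster
# indices in the range 0..2.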
def run_test():
    # Load the data file
    fileName = 'Data(1).csv'
    m = load_from_csv(fileName)
    # Standardise the matrix
    D = get_standardised_matrix(m)
    res = []
    # Run the algorithm with K = 2, 3, 4, 5 and 6
    for K in range(2, 7):
        res.append(get_groups(D, K))
    # Print the results
    for K in range(2, 7):
        print("=" * 100)
        print("K = " + str(K) + ":")
        print(res[K - 2])
        # Show how many entities (wines) have been assigned to each group
        for key, group in groupby(sorted(res[K - 2])):
            print("group " + str(key) + ": " + str(len(list(group))))
if __name__ == "__main__":
    # test::load_from_csv(fileName)
    # fileName = 'Data(1).csv'
    # m = load_from_csv(fileName)
    # # test::get_distance(l1, l2)
    # l1 = [1, 2, 3]
    # l2 = [4, 5, 6]
    # print(get_distance(l1, l2))
    # c = 1
    # # test::get_max(m, c)
    # maxValue = get_max(m, c)
    # print(maxValue)
    # # test::get_min(m, c)
    # minValue = get_min(m, c)
    # print(minValue)
    # # test::get_mean(m, c)
    # meanValue = get_mean(m, c)
    # print(meanValue)
    # # test::get_standardised_matrix(m)
    # D = get_standardised_matrix(m)
    # print(D)
    # # test::get_median(m, c)
    # print(get_median(m, c))
    # K = 2
    # S = [0, 0, 0, 1, 1, 1]
    # # test::get_centroids(m, S, K)
    # c_k = get_centroids(m, S, K)
    # print(c_k)
    # # test::get_groups(D, K)
    # S = get_groups(D, K)
    # print(S)
    # test::run_test()
    run_test()