from numpy import *
import operator
def createDataSet():
    """Return a tiny hand-made data set for trying out the kNN classifier.

    Returns:
        tuple: ``(group, labels)``. ``group`` is a 4x2 numpy array with one
        row per sample and two features per row; ``labels`` is a list of
        four class names where ``labels[i]`` belongs to ``group[i]``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
In [4]:
# Build the toy data set, then print the feature matrix and the labels.
group,labels = createDataSet()
In [5]:
print group
print labels
In [6]:
'''
Here we have four pieces of data. Each piece of data has two attributes or features, things
we know about it.
In the "group" matrix each row is a different piece of data. Think of it
as a different measurement or entry in some sort of log.
As humans, we can visualize things in one, two, or sometimes three dimensions, but that's about the limit of our
brains; to keep things easy to visualize, we'll use only two features for each data point.
'''
Out[6]:
In [7]:
# Inspect the type of the feature container returned by createDataSet().
type(group)
Out[7]:
In [8]:
# Inspect the type of the label container returned by createDataSet().
type(labels)
Out[8]:
In [9]:
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distance is the Euclidean distance between ``inX`` and each row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify (sequence or 1-D array with one
            value per column of ``dataSet``).
        dataSet: 2-D numpy array of known samples, one row per sample.
        labels: Sequence of class labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that get a vote. If ``k`` is larger
            than the number of rows, every row votes.

    Returns:
        The label that received the most votes.

    Raises:
        IndexError: if nobody can vote (``dataSet`` has no rows or
            ``k`` is not positive).
    """
    # Broadcasting subtracts inX from every row, so no explicit tile() copy
    # of inX is needed.
    diffMat = asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()

    # Slicing (instead of indexing 0..k-1) tolerates k > number of rows.
    # A non-positive k yields no voters, exactly like range(k) would.
    if k > 0:
        nearest = sortedDistIndicies[:k]
    else:
        nearest = sortedDistIndicies[:0]

    # Voting with lowest k distances
    classCount = {}
    for neighbourIndex in nearest:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Decompose into a list of tuples and sort by vote count, highest first.
    # .items() exists on Python 2 and 3; .iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
In [10]:
# Smoke test: classify the point [1,0] against the toy data set with k = 3.
group, labels = createDataSet()
classify0([1,0], group, labels, 3)
Out[10]:
In [11]:
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Every non-blank line must contain at least three numeric fields (the
    features, taken from the first three columns) and an integer class
    label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)``. ``returnMat`` is a float
        numpy array with one row per data line and three columns;
        ``classLabelVector`` is a list holding the integer label of each row.

    Raises:
        IOError: if the file cannot be opened (OSError on Python 3).
        ValueError: if a feature is not numeric, the label is not an
            integer, or a line has fewer than three feature fields.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        strippedLines = [line.strip() for line in fr]
    # Blank lines (typically a trailing newline at end of file) carry no
    # data and would otherwise crash the int() conversion below.
    arrayOLines = [line for line in strippedLines if line]

    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        # Convert explicitly rather than relying on numpy's implicit
        # string-to-float cast during slice assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
In [12]:
# Load the dating data: a feature matrix plus one integer class label per row.
datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
In [13]:
# Dump the whole matrix, confirm its type, then peek at the first five
# values of each of the three feature columns.
print datingDataMat
print "_____________________"
print type(datingDataMat) #<type 'numpy.ndarray'>
print "_____________________"
print "First Five Observations from Column-1 [FREQ FLIER MILES] :--" , datingDataMat[0:5,0] # index 0 == Column 1
print "_____________________"
print "First Five Observations from Column-2 :--" , datingDataMat[0:5,1] # index 1 == Column 2
print "_____________________"
print "First Five Observations from Column-3 :--" , datingDataMat[0:5,2] # index 2 == Column 3
In [14]:
print datingLabels[0:15] # First 15 class labels, already converted to integers by file2matrix
print type(datingLabels) # Expected: list
# NOTE(review): presumably the mapping from the raw text labels to the integer codes -- confirm.
# love_dictionary={'largeDoses':3, 'smallDoses':2, 'didntLike':1}
In [15]:
%matplotlib inline
#
import pandas as pd
from pandas.tseries.resample import TimeGrouper
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
# Scatter plot of feature column index 1 (x) against column index 2 (y).
# The 3rd and 4th positional arguments of scatter() are marker size and
# colour; both are 15 * class label, so each class gets its own size/colour.
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial unstyled call, replaced by the one below
ax.scatter(datingDataMat[:,1], datingDataMat[:,2],
15.0*array(datingLabels), 15.0*array(datingLabels))
plt.show()
# NOTE(review): the TBD labels below do not match the columns plotted here --
# x is column index 1, while index 0 is the one labelled FREQ FLIER MILES
# elsewhere in this notebook. They look copied from another plot; confirm.
# TBD -- .set_ylabel('Time Spent Playing Video Games')
# TBD -- .set_xlabel('Freq Flier Miles')
In [16]:
# Scatter plot of feature column index 0 (x) against column index 2 (y),
# with marker size and colour both set to 15 * class label.
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial unstyled call, replaced by the one below
ax.scatter(datingDataMat[:,0], datingDataMat[:,2],
15.0*array(datingLabels), 15.0*array(datingLabels))
plt.show()
# Axis labels still to be added:
# TBD -- .set_ylabel('Ice Cream Consumption')
# TBD -- .set_xlabel('Frequent Flier Miles ')
In [17]:
# Scatter plot of feature column index 0 (x) against column index 1 (y),
# with marker size and colour both set to 15 * class label.
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial unstyled call, replaced by the one below
ax.scatter(datingDataMat[:,0], datingDataMat[:,1],
15.0*array(datingLabels), 15.0*array(datingLabels))
# TBD --- fig.legend((l1, l2,l3), ('Line 1', 'Line 2','Line3'), 'upper left')
plt.show()
# Axis labels still to be added:
# TBD -- .set_ylabel('Time Spent Playing Video Games')
# TBD -- .set_xlabel('Freq Flier Miles')
In [18]:
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly so its values span 0..1.

    Each value is mapped with ``(value - columnMin) / (columnMax - columnMin)``.
    A column whose values are all equal (range 0) is mapped to 0 instead of
    producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D numpy array, one row per sample, one column per feature.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)``. ``normDataSet`` has the
        same shape as ``dataSet``; ``ranges`` and ``minVals`` are the
        per-column ``max - min`` and ``min`` of the input. ``ranges`` is
        returned unmodified, so it may contain zeros.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalise to 0 rather than NaN.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row; no tile()
    # copies are needed.
    normDataSet = (dataSet - minVals) / safeRanges  # element-wise divide
    return normDataSet, ranges, minVals
In [24]:
# Testing --- step through the pieces of autoNorm() by hand.
minVals = datingDataMat.min(0) # Minimum of each column of datingDataMat
#type(minVals) # numpy.ndarray
print minVals
In [25]:
# Testing ---
maxVals = datingDataMat.max(0) # Maximum of each column of datingDataMat
#type(maxVals) # numpy.ndarray
print maxVals # These have been checked against the raw text file
In [30]:
# Testing ---
# Allocate a zero matrix with the same shape as datingDataMat.
normDataSet = zeros(shape(datingDataMat))
print normDataSet
#type(normDataSet) # numpy.ndarray
m = datingDataMat.shape[0] # number of rows
#type(m) <int>
print m # m == 1000 here, i.e. datingDataMat (a numpy array) is 1000 x 3
# minVals and maxVals are each a single row of 3 values (one per column), as printed above.
# numpy tile() will be used to repeat them into a 1000 x 3 matrix - further TBD.
In [19]:
# Normalise the dating features; keep the per-column ranges and minimums too.
normDataSet, ranges, minVals=autoNorm(datingDataMat)
In [22]:
#Seeing the Results of Normalization of Matrix
print "First Five Observations from Column-1 [FREQ FLIER MILES] of datingDataMat:--" , datingDataMat[0:5,0] # 0 == Column1
print "____________###___________________"
print "First Five Observations from Column-1 [FREQ FLIER MILES] of datingDataMat:--" , normDataSet[0:5,0] # 0 == Column1
In [34]:
#As the data is Normalized now - we can split into 90% TRAINING and 10%TEST
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the kNN classifier's error rate on a held-out slice of the data.

    The data file is loaded and normalised; the first ``hoRatio`` fraction of
    the rows is used as the test set and every test row is classified
    against the remaining rows. Each prediction, the overall error rate and
    the raw error count are printed.

    Args:
        hoRatio: Fraction of the rows (taken from the start) held out for
            testing. Defaults to 0.10.
        filename: Tab-separated data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. Results are printed.
    """
    datingDataMat, datingLabels = file2matrix(filename)  # load data set from file
    # Only the normalised matrix is needed here; ranges/minimums are unused.
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Too few rows (or too small a ratio): avoid dividing by zero below.
        print("no test vectors held out (m=%d, hoRatio=%s); nothing to evaluate" % (m, hoRatio))
        return
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test rows; rows [numTestVecs, m) are the
        # reference set the classifier votes over.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    # errorCount is local to this function, so it must be printed here;
    # printing it at module level raises NameError.
    print(errorCount)
#
In [35]:
# Run the hold-out evaluation; it prints every prediction and the error rate.
datingClassTest()
# print errorCount