from numpy import *
import operator
def createDataSet():
    """Return the toy training set used to demonstrate the classifier.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 numpy array with one
        sample per row, and ``labels`` is a list holding the class name
        ('A' or 'B') of the row at the same position.
    """
    samples = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ]
    classNames = ['A', 'A', 'B', 'B']
    return array(samples), classNames
# Build the toy data set at module level and show both parts:
# the sample matrix first, then the class label of each row.
group,labels = createDataSet()
print group
print labels
Here we have four pieces of data. Each piece of data has two attributes, or features:
the things we know about it.
In the "group" matrix each row is a different piece of data; think of it
as a different measurement or entry in some sort of log. The "labels" list holds the class of each row, in the same order.
As humans, we can visualize things in one, two, or sometimes three dimensions, but that’s about the limit of our
brains; to keep things easy to visualize, we’ll use only two features for each data point.
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Args:
        inX: the feature vector to classify (sequence or 1-D array with one
            value per column of ``dataSet``).
        dataSet: 2-D numpy array of training samples, one sample per row.
        labels: sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: number of nearest neighbours that vote; must be at least 1 and
            at most the number of rows in ``dataSet``.

    Returns:
        The label that occurs most often among the ``k`` training rows with
        the smallest Euclidean distance to ``inX``.

    Raises:
        IndexError: if ``k`` is 0 (no votes) or larger than the number of
            rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Euclidean distance from inX to every training row.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()
    # Voting with lowest k distances. The tally must exist before the loop;
    # without this initialisation every call fails with NameError.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # Decompose into a list of (label, votes) tuples and sort by vote count.
    # items() is used instead of iteritems() so this runs on Python 2 and 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
# Classify the point [1,0] against the toy data set using its 3 nearest neighbours.
group, labels = createDataSet()
# NOTE(review): the result is neither assigned nor printed; presumably this was a
# notebook cell that displayed the returned label -- confirm.
classify0([1,0], group, labels, 3)
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and label list.

    Each non-blank line must hold at least four tab-separated fields: the
    first three are read as float features and the last one as an integer
    class label.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an ``N x 3`` numpy array
        with one row of features per data line, and a list of ``N`` integer
        labels in the same order.

    Raises:
        ValueError: if a feature is not a number or the last field is not
            an integer.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline) carry no data; skipping them
        # keeps the matrix free of spurious all-zero rows.
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The label is the last column; without this append the returned
        # label list would always be empty.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Load the dating data file into a feature matrix and a label list, then
# inspect what came back: the whole matrix, its type, the first five values
# of each of the three feature columns, and the first labels.
datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
print datingDataMat
print "_____________________"
print type(datingDataMat) #<type 'numpy.ndarray'>
print "_____________________"
print "First Five Observations from Column-1 [FREQ FLIER MILES] :--" , datingDataMat[0:5,0] # 0 == Column1
print "_____________________"
print "First Five Observations from Column-2 :--" , datingDataMat[0:5,1] # 1 == Column2
print "_____________________"
print "First Five Observations from Column-3 :--" , datingDataMat[0:5,2] # 2 == Column3
print datingLabels[0:15] # Print the First 15 List Elements , of Data Lables Converted to NUM...
print type(datingLabels) # Print TYPE == LIST
# NOTE(review): the mapping below is commented out and unused here; presumably it
# documents how textual labels correspond to the numeric ones -- confirm.
# love_dictionary={'largeDoses':3, 'smallDoses':2, 'didntLike':1}
%matplotlib inline
import pandas as pd
from pandas.tseries.resample import TimeGrouper
import matplotlib
import matplotlib.pyplot as plt
# NOTE(review): the original line was garbled ("... as plt'ggplot')"); it is
# reconstructed here as the pyplot import followed by selecting the 'ggplot'
# style sheet -- confirm against the original notebook.
plt.style.use('ggplot')
# Scatter plot of feature column 1 (x axis) against feature column 2 (y axis).
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial Line changed below ----
# The 3rd and 4th positional arguments of scatter are marker size and colour;
# both are scaled by the numeric class label so each class is drawn differently.
ax.scatter(datingDataMat[:,1], datingDataMat[:,2],
15.0*array(datingLabels), 15.0*array(datingLabels))
# NOTE(review): the x axis here is column 1, but elsewhere in this file column 0 is
# the one described as frequent flier miles -- the TBD labels below look swapped
# or copied from another plot; confirm before enabling them.
# TBD -- .set_ylabel('Time Spent Playing Video Games')
# TBD -- .set_xlabel('Freq Flier Miles')
# Scatter plot of feature column 0 (x axis) against feature column 2 (y axis).
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial Line changed below ----
# Marker size (3rd argument) and colour (4th argument) both follow the class label.
ax.scatter(datingDataMat[:,0], datingDataMat[:,2],
15.0*array(datingLabels), 15.0*array(datingLabels))
# Axis labels still to be added:
# TBD -- .set_ylabel('Ice Cream Consumption')
# TBD -- .set_xlabel('Frequent Flier Miles ')
# Scatter plot of feature column 0 (x axis) against feature column 1 (y axis).
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial Line changed below ----
# Marker size (3rd argument) and colour (4th argument) both follow the class label.
ax.scatter(datingDataMat[:,0], datingDataMat[:,1],
15.0*array(datingLabels), 15.0*array(datingLabels))
# Legend and axis labels still to be added:
# TBD --- fig.legend((l1, l2,l3), ('Line 1', 'Line 2','Line3'), 'upper left')
# TBD -- .set_ylabel('Time Spent Playing Video Games')
# TBD -- .set_xlabel('Freq Flier Miles')
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped to
    all zeros instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D numpy array, one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the normalised float
        array (same shape as ``dataSet``), the per-column ``max - min``
        values, and the per-column minima.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    # Divide by 1.0 where the range is zero. The float divisor also forces
    # true division for integer input on Python 2.
    safeRanges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row, so no
    # tiled copies of minVals/ranges are needed.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# Step-by-step look at the intermediate values autoNorm works with, computed
# here at module level on datingDataMat so each one can be printed.
# Testing ---
minVals = datingDataMat.min(0) # Min Values from each Column of dataSET
#type(minVals) # numpy.ndarray
print minVals
# Testing ---
maxVals = datingDataMat.max(0) # Max Values from each Column of dataSET
#type(maxVals) # numpy.ndarray
print maxVals # These have been checked to be correct from RAW Text File
#Testing ---
# All-zero array with the same shape as the data matrix.
normDataSet = zeros(shape(datingDataMat))
print normDataSet
#type(normDataSet) # numpy.ndarray
# Number of rows (samples) in the data matrix.
m = datingDataMat.shape[0]
#type(m) <int>
print m # This m is == 1000 which shows that our datingDataMat [ Matrix or NumpyArray ] is shaped == 1000X3
# As we have our MinVals and MaxVals shaped == 1X3 - as seen above they are just 1 Row of 3 Data points each
# We will use Numpy Tiles to fill up a 1000X3 Matrix - Further TBD ........
# Normalise the data matrix; this rebinds normDataSet and minVals at module level.
normDataSet, ranges, minVals = autoNorm(datingDataMat)
# Seeing the results of normalization: first column before and after.
# The parenthesised single-argument form prints the same text on Python 2 and 3.
print("First Five Observations from Column-1 [FREQ FLIER MILES] of datingDataMat:-- %s" % (datingDataMat[0:5, 0],))  # 0 == Column1
print("____________###___________________")
# This line shows the normalised values, so its label names normDataSet
# (it previously repeated "datingDataMat").
print("First Five Observations from Column-1 [FREQ FLIER MILES] of normDataSet:-- %s" % (normDataSet[0:5, 0],))  # 0 == Column1
#As the data is Normalized now - we can split into 90% TRAINING and 10%TEST
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the classifier's error rate on a held-out slice of a data file.

    The file is loaded with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are classified one by one with
    ``classify0`` against the remaining rows, and every prediction is printed
    next to the true label. Finally the error rate and the raw error count
    are printed. Nothing is returned.

    Args:
        hoRatio: fraction of the rows (taken from the start of the file)
            used as the hold-out test set.
        filename: path of the tab-separated data file to load.
        k: number of nearest neighbours that vote on each prediction.

    Raises:
        ZeroDivisionError: if the hold-out slice is empty, i.e.
            ``int(m * hoRatio)`` is 0.
    """
    datingDataMat, datingLabels = file2matrix(filename)  # load data set from file
    # Only the normalised matrix is needed; ranges and minima are ignored.
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [numTestVecs, m) are the training set; row i is the test vector.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        # Parenthesised single-argument prints behave the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
# If we RUN this Cell we get --- NameError: name 'errorCount' is not defined
# Ignore this is a Function Definition Cell - Need not be run ...
# print errorCount
