Saturday 26 November 2016

Basics of kNN Classification in Python 2.7

from numpy import *
import operator 

def createDataSet():
    group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
    labels = ['A','A','B','B']
    return group, labels
In [4]:
group,labels = createDataSet()
In [5]:
print group
print labels
[[ 1.   1.1]
 [ 1.   1. ]
 [ 0.   0. ]
 [ 0.   0.1]]
['A', 'A', 'B', 'B']
In [6]:
'''

Here we have four pieces of data. Each piece of data has two attributes or features, things
we know about it. 

In the "group" matrix each row is a different piece of data. Think of it
as a different measurement or entry in some sort of log. 

As humans, we can visualize things in one, two, or sometimes three dimensions, but that’s about the limit of our
brains; to keep things easy to visualize, we’ll use only two features for each data point.

'''
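To make that concrete, here is a quick illustration (not a cell from the original notebook) of how each row of group pairs up with the corresponding entry in labels:

print group[0]      # the first data point: features [ 1.   1.1]
print labels[0]     # its class label: 'A'
print group.shape   # (4, 2) -> four points, two features each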
In [7]:
type(group)
Out[7]:
numpy.ndarray
In [8]:
type(labels)
Out[8]:
list
In [9]:
def classify0(inX, dataSet, labels, k):
    dataSetSize = dataSet.shape[0]

    # Euclidean distance from inX to every row of dataSet
    diffMat = tile(inX, (dataSetSize,1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis = 1)
    distances = sqDistances ** 0.5

    # Indices of the training points, sorted from nearest to farthest
    sortedDistIndicies = distances.argsort()

    # Voting with lowest k distances
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Decompose into a list of tuples and sort by second item (the vote count)
    sortedClassCount = sorted(classCount.iteritems(),
        key=operator.itemgetter(1), reverse=True)

    return sortedClassCount[0][0]
In [10]:
group, labels = createDataSet()
classify0([1,0], group, labels, 3)
Out[10]:
'B'
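To see why the answer is 'B', it helps to redo the distance step by hand. This is a small sketch (not part of the original notebook) tracing what classify0 computes for the query point [1, 0]:

from numpy import array, sqrt

group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
labels = ['A','A','B','B']
inX = array([1.0, 0.0])

# Euclidean distance from inX to every training point
dists = sqrt(((group - inX) ** 2).sum(axis=1))
print dists    # approximately [ 1.1   1.    1.    1.005]
print labels   # ['A', 'A', 'B', 'B']

# The 3 nearest neighbours sit at distances 1.0, 1.0 and ~1.005,
# with labels 'A', 'B', 'B', so the majority vote returns 'B'.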
In [11]:
def file2matrix(filename):
    fr = open(filename)
    arrayOLines = fr.readlines()
    numberOfLines = len(arrayOLines)
    returnMat = zeros((numberOfLines,3))        # one row per line, three feature columns
    classLabelVector = []
    index = 0
    for line in arrayOLines:
        line = line.strip()
        listFromLine = line.split('\t')         # tab-separated fields
        returnMat[index,:] = listFromLine[0:3]  # first three fields are the features
        classLabelVector.append(int(listFromLine[-1]))  # last field is the numeric class label
        index += 1
    return returnMat, classLabelVector
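For reference, file2matrix assumes each line of the input file holds three tab-separated numeric features followed by an integer class label. Judging from the matrix and labels printed below, a line of 'datingTestSet2.txt' presumably looks something like:

40920	8.326976	0.953952	3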
In [12]:
datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')
In [13]:
print datingDataMat
print "_____________________"
print type(datingDataMat) #<type 'numpy.ndarray'>
print "_____________________"

print "First Five Observations from Column-1 [FREQ FLIER MILES] :--" , datingDataMat[0:5,0] # 0 == Column1

print "_____________________"
print "First Five Observations from Column-2 :--" , datingDataMat[0:5,1] # 1 == Column2

print "_____________________"
print "First Five Observations from Column-3 :--" , datingDataMat[0:5,2] # 2 == Column3
[[  4.09200000e+04   8.32697600e+00   9.53952000e-01]
 [  1.44880000e+04   7.15346900e+00   1.67390400e+00]
 [  2.60520000e+04   1.44187100e+00   8.05124000e-01]
 ..., 
 [  2.65750000e+04   1.06501020e+01   8.66627000e-01]
 [  4.81110000e+04   9.13452800e+00   7.28045000e-01]
 [  4.37570000e+04   7.88260100e+00   1.33244600e+00]]
_____________________
<type 'numpy.ndarray'>
_____________________
First Five Observations from Column-1 [FREQ FLIER MILES] :-- [ 40920.  14488.  26052.  75136.  38344.]
_____________________
First Five Observations from Column-2 :-- [  8.326976   7.153469   1.441871  13.147394   1.669788]
_____________________
First Five Observations from Column-3 :-- [ 0.953952  1.673904  0.805124  0.428964  0.134296]
In [14]:
print datingLabels[0:15] # Print the first 15 list elements of datingLabels (text labels converted to numbers)

print type(datingLabels) # Print type == list

#  love_dictionary={'largeDoses':3, 'smallDoses':2, 'didntLike':1}
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1]
<type 'list'>
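Since these numbers come from the mapping noted in the comment above (largeDoses=3, smallDoses=2, didntLike=1), a small sketch like the one below can translate them back into readable strings; nothing here is from the original notebook beyond that mapping:

love_dictionary = {'largeDoses':3, 'smallDoses':2, 'didntLike':1}
num_to_name = {v: k for k, v in love_dictionary.items()}   # invert the mapping

print [num_to_name[lbl] for lbl in datingLabels[0:5]]
# expected: ['largeDoses', 'smallDoses', 'didntLike', 'didntLike', 'didntLike']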
In [15]:
%matplotlib inline
import matplotlib 
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial Line changed below ---- 

ax.scatter(datingDataMat[:,1], datingDataMat[:,2],
           15.0*array(datingLabels), 15.0*array(datingLabels))
ax.set_xlabel('Time Spent Playing Video Games')
ax.set_ylabel('Ice Cream Consumption')
plt.show()
In [16]:
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial Line changed below ---- 

ax.scatter(datingDataMat[:,0], datingDataMat[:,2],
           15.0*array(datingLabels), 15.0*array(datingLabels))
ax.set_xlabel('Frequent Flier Miles')
ax.set_ylabel('Ice Cream Consumption')
plt.show()
In [17]:
fig = plt.figure()
ax = fig.add_subplot(111)
#ax.scatter(datingDataMat[:,1], datingDataMat[:,2]) # Initial Line changed below ---- 

ax.scatter(datingDataMat[:,0], datingDataMat[:,1],
           15.0*array(datingLabels), 15.0*array(datingLabels))
ax.set_xlabel('Frequent Flier Miles')
ax.set_ylabel('Time Spent Playing Video Games')
# TBD --- fig.legend((l1, l2,l3), ('Line 1', 'Line 2','Line3'), 'upper left')
#         (one possible way to build a per-class legend is sketched after this cell)
plt.show()
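The fig.legend line above is still a TBD; one possible way to get a per-class legend (my own sketch, not from the original notebook) is to plot each class with its own scatter call so matplotlib can collect the handles and labels:

fig = plt.figure()
ax = fig.add_subplot(111)
labelsArr = array(datingLabels)
names = {1: 'didntLike', 2: 'smallDoses', 3: 'largeDoses'}   # mapping from the earlier comment
for classValue in (1, 2, 3):
    mask = (labelsArr == classValue)                         # rows belonging to this class
    ax.scatter(datingDataMat[mask, 0], datingDataMat[mask, 1],
               s=15.0*classValue, label=names[classValue])
ax.set_xlabel('Frequent Flier Miles')
ax.set_ylabel('Time Spent Playing Video Games')
ax.legend(loc='upper left')
plt.show()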
In [18]:
def autoNorm(dataSet):
    minVals = dataSet.min(0) # numpy array of min values from each column of dataSet
    maxVals = dataSet.max(0) # max values from each column of dataSet
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    # Min-max scaling: newValue = (oldValue - min) / (max - min)
    normDataSet = dataSet - tile(minVals, (m,1))
    normDataSet = normDataSet/tile(ranges, (m,1))   # element-wise divide
    return normDataSet, ranges, minVals
In [24]:
# Testing ---

minVals = datingDataMat.min(0) # Min Values from each Column of dataSET

#type(minVals) # numpy.ndarray

print minVals
[ 0.        0.        0.001156]
In [25]:
# Testing ---

maxVals = datingDataMat.max(0) # Max Values from each Column of dataSET

#type(maxVals) # numpy.ndarray

print maxVals # These values have been checked against the raw text file
[  9.12730000e+04   2.09193490e+01   1.69551700e+00]
In [30]:
#Testing ---

normDataSet = zeros(shape(datingDataMat))

print normDataSet

#type(normDataSet) # numpy.ndarray

m = datingDataMat.shape[0]

#type(m) <int>

print m  # m == 1000, which shows that datingDataMat (a numpy array) has shape 1000x3
# minVals and maxVals, as seen above, have shape 1x3 - just one row of three values each.
# We will use numpy's tile() to repeat that row 1000 times so it lines up with the 1000x3 matrix
# (a small sketch of tile() follows this cell's output).
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 ..., 
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
1000
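As mentioned in the comment above, tile() is what stretches a 1x3 row (like minVals) into a 1000x3 matrix so it can be subtracted element-wise. A tiny standalone sketch with made-up numbers shows the idea:

from numpy import array, tile

row = array([1.0, 2.0, 3.0])   # stands in for minVals, shape (3,)
print tile(row, (4, 1))        # repeat the row 4 times -> a 4x3 matrix
# [[ 1.  2.  3.]
#  [ 1.  2.  3.]
#  [ 1.  2.  3.]
#  [ 1.  2.  3.]]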
In [19]:
normDataSet, ranges, minVals=autoNorm(datingDataMat)
In [22]:
# Comparing the raw matrix with its normalized version

print "First Five Observations from Column-1 [FREQ FLIER MILES] of datingDataMat:--" , datingDataMat[0:5,0] # 0 == Column1

print "____________###___________________"

print "First Five Observations from Column-1 [FREQ FLIER MILES] of normDataSet:--" , normDataSet[0:5,0] # 0 == Column1
First Five Observations from Column-1 [FREQ FLIER MILES] of datingDataMat:-- [ 40920.  14488.  26052.  75136.  38344.]
____________###___________________
First Five Observations from Column-1 [FREQ FLIER MILES] of normDataSet:-- [ 0.44832535  0.15873259  0.28542943  0.82320073  0.42010233]
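As a quick sanity check, the first normalized value can be reproduced by hand from the minVals and maxVals printed a few cells above (column 1: min = 0, max = 91273):

print (40920.0 - 0.0) / (91273.0 - 0.0)   # ~0.44832535, matching normDataSet[0,0]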
In [34]:
# Now that the data is normalized, we can hold out 10% of it as a test set and train on the remaining 90%.

def datingClassTest():
    hoRatio = 0.10      # hold-out ratio: fraction of rows reserved as the test set
    datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')       # load data set from file
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # classify test row i against the remaining 90% of the rows
        classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],datingLabels[numTestVecs:m],3)
        print "the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i])
        if (classifierResult != datingLabels[i]): errorCount += 1.0
    print "the total error rate is: %f" % (errorCount/float(numTestVecs))
In [35]:
datingClassTest()

the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 1
the total error rate is: 0.050000
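A 5% error rate on the hold-out set is reasonable for this simple classifier. The same pieces can now be wired together to score a brand-new person; the helper below is only a sketch (the function name and the sample feature values are mine, and the numeric-to-text mapping is the one from the earlier comment):

from numpy import array

def classifyNewPerson(ffMiles, gameTime, iceCream, k=3):
    # Hypothetical helper built from the notebook's own functions
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    newPoint = (array([ffMiles, gameTime, iceCream]) - minVals) / ranges   # same min-max scaling
    return classify0(newPoint, normMat, datingLabels, k)

names = {1: 'didntLike', 2: 'smallDoses', 3: 'largeDoses'}
print names[classifyNewPerson(40000, 8.0, 0.9)]   # made-up feature values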