import findspark
findspark.init()
In [5]:
import pyspark
import random
In [3]:
sc = pyspark.SparkContext(appName="Pi")
num_samples = 100000000

def inside(p):
    # A random point in the unit square lands inside the quarter circle with
    # probability pi/4, so 4 * (fraction of points inside) estimates pi.
    x, y = random.random(), random.random()
    return x*x + y*y < 1

count = sc.parallelize(range(0, num_samples)).filter(inside).count()
pi = 4 * count / num_samples
print(pi)
sc.stop()
In [6]:
sc = pyspark.SparkContext(appName="Daily_Show_Test1")
print(sc)  # confirm the SparkContext is up and running
In [5]:
## Dhankar >> PySpark initiated with FindSpark
# SparkContext started
# Got CSV from
# Converted CSV to TSV
# Now importing data from TSV
# raw_d == the SPARK RDD Object
# Print out top 15 Rows
raw_d = sc.textFile("dsT1.tsv")
#
# In the above line of code the actual loading of the TSV into the RDD is NOT yet done -
# it is done LAZILY, "as and when ABSOLUTELY required", as below:
#
In [6]:
raw_d.take(15)
Out[6]:
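## A quick way to see the laziness (a sketch, assuming raw_d is the RDD loaded above):
## the map() call returns almost instantly because nothing is computed yet,
## while count() is an action and triggers the actual read of the file.
%time mapped = raw_d.map(lambda line: line.split('\t'))
%time mapped.count()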
In [7]:
# Use the 'map' function to operate on all elements within an RDD object
#
daily_show = raw_d.map(lambda line: line.split('\t'))
daily_show.take(5)
Out[7]:
In [8]:
daily_show = raw_d.map(lambda line: line.split('99'))  ## NOTE: splits on the literal substring '99', not on tabs
daily_show.take(10)
Out[8]:
In [9]:
print(type(daily_show)) ## <class 'pyspark.rdd.PipelinedRDD'>
In [10]:
# reduceByKey() -- count the occurrences of each key (the first field of every row)
tally = daily_show.map(lambda x: (x[0], 1)).reduceByKey(lambda x,y: x+y)
#
print(tally) #PythonRDD[9] at RDD at PythonRDD.scala:53
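## tally is still only a lineage of transformations at this point; an action such as take()
## actually runs the job (a sketch - x[0] above is the first tab-separated field of each row,
## presumably the year column in this dataset):
tally.take(5)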
In [29]:
## Explanation ##
#%time bigger_chunk = raw_d.take(5000)
# 1st RUN
# CPU times: user 8 ms, sys: 8 ms, total: 16 ms
# Wall time: 274 ms
#%time bigger_chunk = raw_d.take(50000)
# 2nd RUN
# CPU times: user 12 ms, sys: 8 ms, total: 20 ms
# Wall time: 299 ms
#%time tally.take(tally.count())
"""
CPU times: user 56 ms, sys: 0 ns, total: 56 ms
Wall time: 503 ms
"""
Out[29]:
In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
dates = pd.date_range('20190101', periods=30)
print(type(dates))
print(" "*90)
print(dates)
In [8]:
df_dts = pd.DataFrame(np.random.randn(30,10), index=dates, columns=list('ABCDEFGHIJ'))
print(df_dts)
In [9]:
my_dict = {'A': 199.,
           #'B': pd.Timestamp('20190101'),
           'B': pd.date_range('20190101', periods=40),
           'C': pd.Series(1, index=list(range(40)), dtype='float32'),
           'D': np.array([100]*40, dtype='int32'),
           'D_a': np.array([1000]*40, dtype='float64'),
           #'E': pd.Categorical(["Ytest","Ytrain","Xtest","Xtrain"]), # 4 hardcoded strings
           'E': np.array(["my_STR"]*40, dtype='str'),
           'F': 'foo_bar'}
df2 = pd.DataFrame(my_dict)
print(df2.head(5))
print(df2.tail(5))
## my_dict_from_dict_comprehension = {x: x**2 for x in (2, 4, 6)}
In [36]:
my_dict = {'A': 199.,
           #'B': pd.Timestamp('20190101'),
           'B': pd.date_range('20190101', periods=40),
           'C': pd.Series(1, index=list(range(40)), dtype='float32'),
           'D': np.array([100]*40, dtype='int32'),                    ## SPARK Dtype == long (nullable = true)
           'D_a': np.array([1000]*40, dtype='float64'),               ## SPARK Dtype == double (nullable = true)
           'D_b': np.random.uniform(low=0, high=40, size=(40,)),      ## DEFAULT SPARK Dtype == double (nullable = true)
           'D_c': np.random.uniform(low=-100, high=100, size=(40,)),  ## DEFAULT SPARK Dtype == double (nullable = true)
           #'E': pd.Categorical(["Ytest","Ytrain","Xtest","Xtrain"]), # 4 hardcoded strings
           'E': np.array(["my_STR"]*40, dtype='str'),
           'F': 'foo_bar'}
df2 = pd.DataFrame(my_dict)
print(df2.head(5))
print(df2.tail(5))
In [11]:
from collections import OrderedDict
## https://docs.python.org/2/library/collections.html#ordereddict-objects
## https://docs.python.org/2/library/collections.html#ordereddict-examples-and-recipes
my_dict_from_dict_comprehension = OrderedDict({x: x**5 for x in (1, 5, 10)}).__class__
my_ORDERED_dict_from_dict_comprehension = OrderedDict({x: x**5 for x in (1, 5, 10)})
my_dict_from_dict_comprehension1 = {x: x*5 for x in (1, 5, 10 , 20 , 33)}
my_dict_from_dict_comprehension_class = {x: x*5 for x in (1, 5, 10 , 20 , 33)}.__class__
#
print(my_dict_from_dict_comprehension)  ## this is the class itself - <class 'collections.OrderedDict'>
print(" "*90)
print(my_dict_from_dict_comprehension1)  ## a plain dict comprehension builds a plain dict, not an OrderedDict
print(" "*90)
print(my_dict_from_dict_comprehension_class)  ## .__class__ chained above with DOT notation - <class 'dict'>
print(" "*90)
print(my_ORDERED_dict_from_dict_comprehension)  # OrderedDict([(1, 1), (10, 100000), (5, 3125)])
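## If insertion order matters, build the OrderedDict from an explicitly ordered sequence of
## (key, value) pairs instead of from an already-built plain dict (a sketch):
ordered_by_key = OrderedDict(sorted((x, x**5) for x in (1, 5, 10)))
print(ordered_by_key)  ## OrderedDict([(1, 1), (5, 3125), (10, 100000)])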
In [37]:
print("The DATA Types ---\n\n", df2.dtypes)
In [38]:
df2.columns
Out[38]:
In [39]:
print(df2.values[2])  ## NumPy array of "values" - print the record at index 2
#
print(" "*90)
print(df2.values[3])  ## print the record at index 3
#
print(" "*90)
print(df2.values[3:5])  ## print the slice of records at index 3 and 4
In [40]:
print(df2.describe())  ## describe() with parentheses prints the summary statistics, not the bound method
In [41]:
%time print(df2.T) # Transpose
In [42]:
%time print(df2.sort_values(by='B',ascending=False)) ## SORTS - in Descending order of DATES in COL - B
In [43]:
## Create a SPARK DF from Pandas DF
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
%time df2_spark = spark.createDataFrame(df2)
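## Optional speed-up (a sketch, assuming Spark 2.3+ and pyarrow are installed):
## enabling Apache Arrow makes createDataFrame() from a pandas DataFrame much faster.
# spark.conf.set("spark.sql.execution.arrow.enabled", "true")
# %time df2_spark = spark.createDataFrame(df2)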
In [44]:
print(type(df2_spark))  ## <class 'pyspark.sql.dataframe.DataFrame'>
#
print(" "*90)
#
df2_spark.show()  ## .show() prints the DataFrame itself and returns None, so it needs no outer print()
In [45]:
df2_spark.printSchema() ## DATA TYPES stored in SPARK
In [46]:
## SELECT Columns
spark_col_names = ['A','C','D_a','D_b']
df2_spark.select(spark_col_names).show(3)
In [25]:
## FILTER - same as the WHERE clause in SQL
df2_spark.filter(df2_spark.B == '2019-01-06 00:00:00').show()
## Shows only the ONE ROW where the TIMESTAMP in COLUMN B == 2019-01-06 00:00:00
In [26]:
df2_spark.filter(df2_spark.B == '2019-01-06').show()  ## NOT a partial match - '2019-01-06' is cast to the timestamp 2019-01-06 00:00:00, so it still matches exactly
In [28]:
df2_spark.filter(df2_spark.B == '2019-01').show()  ## Not OK - '2019-01' cannot be cast to a full timestamp, so no rows match
In [47]:
df2_spark.filter("B == '2019-01'").show() # Note the DOUBLE QUOTES
In [49]:
## Data Bricks --- SparkML --- Orange telecom CHURN Prediction data
## SOURCE -- https://github.com/databricks/spark-csv#python-api
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
CV_data = sqlContext.read.load('./churn-bigml-80.csv',
                               format='com.databricks.spark.csv',
                               header='true',
                               inferSchema='true')
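## On Spark 2.x the built-in CSV reader gives the same result without the databricks package
## (a sketch, reusing the SparkSession created earlier):
# CV_data = spark.read.csv('./churn-bigml-80.csv', header=True, inferSchema=True)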
In [50]:
print(type(CV_data))
In [52]:
CV_data.show(3)
In [53]:
CV_data.columns
Out[53]:
In [54]:
CV_data.printSchema()
In [56]:
## SELECT Columns --- after seeing them with --- .printSchema()
spark_col_names = ['International plan','State','Customer service calls','Churn']
CV_data.select(spark_col_names).show(10)
In [59]:
spark_col_names = ['International plan','State','Customer service calls','Churn']
CV_data.select(spark_col_names).filter("Churn == 'true'").show(30) #
# DOT Notation Chained --- SELECT and FILTER
#Note the DOUBLE QUOTES
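## The same chained style works with any terminal action, e.g. just counting the churned rows
## instead of showing them (a sketch):
CV_data.select(spark_col_names).filter("Churn == 'true'").count()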