import pandas as pd
import matplotlib.pyplot as plt
import random
%matplotlib inline

def make_fake_data_point():
    """Draw one sample from a mixture of six normal distributions.

    A uniform integer roll selects which component to sample from, so the
    combined data forms several distinct humps that a clustering algorithm
    can try to recover.

    Returns:
        float: a single draw from the selected normal distribution.
    """
    # (exclusive upper bound on the roll, mean, standard deviation)
    components = (
        (2, 170, 40),
        (4, 80, 20),
        (5, 120, 10),
        (8, 320, 15),
        (9, 210, 10),
    )
    # randint is inclusive on both ends, so the roll takes 11 values (0..10).
    roll = random.randint(0, 10)
    for upper_bound, mean, spread in components:
        if roll < upper_bound:
            return random.normalvariate(mean, spread)
    # Rolls of 9 and 10 fall through to the last component.
    return random.normalvariate(30, 5)

def create_fake_data(number_of_points):
    """Build a one-column DataFrame of synthetic weights.

    Args:
        number_of_points: how many samples to generate.

    Returns:
        pandas.DataFrame with a single "weight" column holding each sample
        from make_fake_data_point() converted with int().
    """
    weights = []
    for _ in range(number_of_points):
        weights.append(int(make_fake_data_point()))
    return pd.DataFrame(weights, columns=["weight"])
df = create_fake_data(1000)
df.head()
weight
0 69
1 200
2 79
3 27
4 41
df['weight'].hist(bins=50)
<matplotlib.axes._subplots.AxesSubplot at 0x10905b2e8>

png

K-means

One among many clustering algorithms

from sklearn.cluster import KMeans

km = KMeans(n_clusters=6)
df[['weight']].head()
weight
0 69
1 200
2 79
3 27
4 41
km.fit(df[['weight']])
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=6, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
df['prediction'] = km.predict(df[['weight']])
df.groupby('prediction')['weight'].hist(bins=10)
prediction
0    Axes(0.125,0.125;0.775x0.775)
1    Axes(0.125,0.125;0.775x0.775)
2    Axes(0.125,0.125;0.775x0.775)
3    Axes(0.125,0.125;0.775x0.775)
4    Axes(0.125,0.125;0.775x0.775)
5    Axes(0.125,0.125;0.775x0.775)
Name: weight, dtype: object

png

df = pd.read_csv("../nba_2013_cleaned.csv")
df.head()
Name Age Team POS Number Salary Height Weight Years 1st Year DOB School City State/Province Country Race HS Only
0 Gee, Alonzo 26 Cavaliers F 33 3250000.0 78 219 4 2009 5/29/1987 Alabama Riviera Beach, FL Florida US Black No
1 Wallace, Gerald 31 Celtics F 45 10105855.0 79 220 12 2001 7/23/1982 Alabama Sylacauga, AL Alabama US Black No
2 Williams, Mo 30 Trail Blazers G 25 2652000.0 73 195 10 2003 12/19/1982 Alabama Jackson, MS Mississippi US Black No
3 Gladness, Mickell 27 Magic C 40 762195.0 83 220 2 2011 7/26/1986 Alabama A&M Birmingham, AL Alabama US Black No
4 Jefferson, Richard 33 Jazz F 44 11046000.0 79 230 12 2001 6/21/1980 Arizona Los Angeles, CA California US Black No
# Keep only the rows whose POS is exactly 'C', 'F' or 'G'. The explicit
# .copy() makes the result an independent frame, so adding new columns to it
# afterwards does not write into a slice of the original DataFrame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['POS'].isin(['C', 'F', 'G'])].copy()
df['POS'].value_counts()
G    175
F    142
C     67
Name: POS, dtype: int64
from sklearn import preprocessing
# Encode the position letters as integers so they can be used as numeric
# colour values in a scatter plot. LabelEncoder numbers the classes in sorted
# order, which here gives C=0, F=1, G=2.
le = preprocessing.LabelEncoder()
df['POS_label'] = le.fit_transform(df['POS'])
# Show the letter next to its code to check the mapping.
df[['POS','POS_label']].head()
POS POS_label
0 F 1
1 F 1
2 G 2
3 C 0
4 F 1
plt.scatter(df['Weight'], df['Height'], edgecolor='none', c=df['POS_label'], alpha=0.5)
<matplotlib.collections.PathCollection at 0x10bf15198>

png

plt.scatter(df['Weight'], df['Height'], edgecolor='none', alpha=0.5)
<matplotlib.collections.PathCollection at 0x10bfc5278>

png

The order of the features is important! Keep it the same between fit and predict.

df[['Weight', 'Height']].head()
Weight Height
0 219 78
1 220 79
2 195 73
3 220 83
4 230 79
df[['Height', 'Weight']].head()
Height Weight
0 78 219
1 79 220
2 73 195
3 83 220
4 79 230
km = KMeans(n_clusters=4)
km.fit(df[['Weight', 'Height']])
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=4, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
df['cluster_4'] = km.predict(df[['Weight', 'Height']])
df.head(2)
Name Age Team POS Number Salary Height Weight Years 1st Year DOB School City State/Province Country Race HS Only POS_label cluster_4
0 Gee, Alonzo 26 Cavaliers F 33 3250000.0 78 219 4 2009 5/29/1987 Alabama Riviera Beach, FL Florida US Black No 1 0
1 Wallace, Gerald 31 Celtics F 45 10105855.0 79 220 12 2001 7/23/1982 Alabama Sylacauga, AL Alabama US Black No 1 0

What’s in each cluster? Do the players seem to have been grouped somewhat correctly?

df.groupby('cluster_4')['POS'].value_counts()
cluster_4  POS
0          F      104
           G       30
           C       16
1          G      144
           F        9
           C        1
2          C       50
           F       29
3          G        1
Name: POS, dtype: int64

Not really… That’s a lot of guards, forwards and centers mixed together in the same clusters.

import numpy as np
# Players, coloured by the cluster k-means assigned them to.
plt.scatter(df['Weight'], df['Height'], edgecolor='none', c=df['cluster_4'], alpha=0.5)
# Cluster centres, drawn larger (s=100). Column 0 is Weight and column 1 is
# Height, matching the feature order passed to km.fit. np.unique(km.labels_)
# yields the sorted distinct labels, i.e. one colour value per centre.
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], s=100, c=np.unique(km.labels_))
plt.grid()

png

Oh no, k-means assumes all features are on the same scale!! A difference of 50 lb of weight counts exactly as much as a difference of 50” of height.

Let’s visualize that on a chart

import numpy as np
plt.scatter(df['Weight'], df['Height'], edgecolor='none', c=df['cluster_4'], alpha=0.5)
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], s=100, c=np.unique(km.labels_))
plt.grid()
plt.xlim([0,350])
plt.ylim([0,350])
(0, 350)

png

df['Weight'].hist()
df['Height'].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x10d1b3a20>

png

from sklearn import preprocessing
preprocessing.scale(df['Height'])
/Users/soma/.virtualenvs/data/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the scale function.
  warnings.warn(msg, DataConversionWarning)





array([-0.17151614,  0.09952171, -1.52670541,  1.18367313,  0.09952171,
        0.09952171,  0.09952171,  0.37055957, -0.9846297 , -1.25566756,
       -0.9846297 ,  0.91263527, -0.442554  , -1.52670541, -1.52670541,
       -1.52670541,  0.64159742,  0.09952171,  1.18367313, -0.9846297 ,
       -0.9846297 ,  1.45471098,  0.64159742, -1.25566756, -0.9846297 ,
        0.91263527, -0.17151614,  0.37055957, -0.442554  ,  1.45471098,
        0.64159742,  0.09952171, -0.442554  ,  0.37055957, -1.25566756,
        0.91263527,  0.37055957, -0.17151614, -0.9846297 , -0.9846297 ,
        2.26782454,  0.91263527,  0.09952171,  0.09952171,  1.18367313,
        0.37055957, -1.52670541, -0.442554  , -1.25566756,  0.37055957,
       -0.9846297 ,  0.64159742, -1.52670541,  0.37055957, -0.9846297 ,
       -0.9846297 ,  1.72574884, -0.9846297 ,  0.64159742,  0.64159742,
        0.37055957,  1.18367313,  0.37055957, -0.17151614, -0.71359185,
       -0.71359185, -1.25566756, -1.25566756, -0.442554  ,  1.18367313,
        0.91263527,  0.91263527,  1.18367313,  0.64159742, -0.17151614,
       -0.442554  ,  0.91263527,  0.37055957, -1.25566756,  0.64159742,
        0.64159742,  0.91263527,  1.99678669,  0.64159742,  0.37055957,
       -0.442554  ,  0.37055957,  0.64159742, -0.9846297 , -0.442554  ,
       -1.79774327,  1.18367313,  0.37055957,  0.91263527,  1.45471098,
        1.18367313, -0.9846297 ,  1.18367313,  1.72574884,  0.64159742,
       -0.9846297 ,  0.64159742, -0.71359185, -0.71359185, -1.52670541,
        0.37055957,  0.37055957, -0.71359185, -1.25566756, -0.17151614,
        1.18367313,  1.45471098,  0.09952171,  0.64159742,  0.64159742,
        0.91263527,  0.64159742,  0.91263527, -0.71359185, -1.25566756,
       -0.442554  , -0.9846297 ,  0.91263527, -0.71359185,  1.18367313,
        0.91263527,  1.18367313,  0.91263527,  0.09952171,  0.64159742,
        0.37055957,  0.64159742, -0.9846297 , -1.25566756, -1.52670541,
       -0.71359185, -0.71359185, -1.52670541, -0.442554  , -0.71359185,
       -0.9846297 , -0.71359185,  0.37055957,  1.18367313,  0.91263527,
       -1.25566756, -1.79774327, -0.17151614,  1.45471098,  0.37055957,
       -0.71359185, -0.17151614, -0.17151614,  0.91263527, -0.71359185,
       -0.71359185, -0.442554  , -1.25566756, -0.9846297 ,  1.72574884,
       -0.9846297 , -0.17151614,  0.37055957,  0.64159742, -0.9846297 ,
       -0.442554  , -0.17151614, -0.17151614,  0.91263527,  0.37055957,
       -2.06878112, -0.71359185, -0.442554  , -1.52670541, -0.17151614,
       -0.442554  ,  0.64159742,  0.09952171, -0.71359185,  0.09952171,
        0.64159742,  0.91263527,  0.91263527, -1.25566756,  0.37055957,
       -2.06878112,  0.37055957, -1.79774327, -0.71359185,  0.64159742,
       -0.9846297 , -0.17151614, -1.25566756,  0.64159742,  1.18367313,
        1.18367313,  1.72574884,  1.18367313, -0.17151614, -0.442554  ,
        0.64159742,  1.45471098, -1.52670541,  0.64159742,  0.91263527,
        0.37055957, -0.442554  ,  1.18367313,  1.18367313,  1.45471098,
        1.72574884,  0.64159742, -0.17151614,  1.45471098, -0.9846297 ,
        1.72574884,  0.64159742, -0.9846297 , -0.71359185,  0.91263527,
        1.18367313,  1.45471098,  0.91263527,  0.91263527,  1.18367313,
        1.18367313,  1.72574884, -0.9846297 , -0.9846297 ,  1.45471098,
       -0.9846297 ,  0.09952171,  0.37055957,  0.91263527,  1.45471098,
        1.18367313,  0.64159742,  0.91263527,  0.64159742,  0.37055957,
       -1.52670541, -0.71359185, -2.06878112, -0.71359185,  0.64159742,
        0.64159742,  0.37055957, -0.442554  ,  0.37055957, -1.79774327,
       -0.442554  ,  1.45471098,  0.64159742, -1.52670541,  1.45471098,
        1.45471098,  0.64159742, -1.52670541,  0.37055957,  0.91263527,
       -0.17151614, -2.06878112, -0.442554  , -0.17151614, -1.25566756,
       -1.79774327, -0.71359185,  0.09952171,  1.45471098,  1.45471098,
        0.09952171,  0.64159742, -0.71359185, -0.17151614,  1.45471098,
       -1.79774327, -1.79774327, -1.79774327, -0.442554  , -0.9846297 ,
        1.18367313,  0.37055957, -0.71359185, -0.71359185, -1.52670541,
        0.64159742,  0.09952171,  1.45471098,  1.45471098,  0.64159742,
       -0.71359185, -0.17151614,  0.64159742,  0.64159742,  0.09952171,
        0.64159742,  1.45471098, -0.71359185,  0.64159742,  0.37055957,
        1.45471098,  1.45471098,  0.37055957,  0.64159742,  1.18367313,
        1.45471098,  0.09952171,  0.37055957,  0.37055957, -0.17151614,
       -0.71359185, -0.71359185, -0.442554  ,  0.64159742, -1.25566756,
        0.64159742, -0.9846297 ,  0.64159742, -0.9846297 ,  1.18367313,
        1.18367313, -0.17151614,  0.64159742, -1.25566756, -0.71359185,
       -1.79774327,  1.18367313,  0.37055957, -0.9846297 , -0.17151614,
       -0.71359185, -0.442554  ,  0.37055957,  1.18367313,  1.45471098,
        0.09952171, -1.79774327, -1.25566756, -1.25566756, -0.71359185,
       -0.442554  , -0.9846297 , -1.52670541,  0.64159742,  0.37055957,
        0.64159742,  0.09952171,  1.45471098, -1.25566756, -1.25566756,
        1.18367313, -0.71359185,  0.37055957, -1.52670541, -0.71359185,
       -1.79774327,  0.37055957, -0.442554  , -0.71359185, -0.9846297 ,
        0.64159742,  0.64159742, -1.79774327, -1.25566756, -1.79774327,
        1.72574884, -0.17151614, -0.9846297 , -2.61085683, -2.61085683,
       -0.17151614, -0.9846297 ,  0.37055957,  1.18367313,  0.64159742,
        0.09952171,  0.64159742, -0.9846297 ,  1.18367313,  0.91263527,
        0.09952171, -0.9846297 ,  0.64159742, -0.71359185])
# Standardize both features (preprocessing.scale centres each to zero mean and
# scales it to unit variance) so that neither one dominates the distance
# computation just because of the units it is measured in.
df['scaled_height'] = preprocessing.scale(df['Height'])
df['scaled_weight'] = preprocessing.scale(df['Weight'])
/Users/soma/.virtualenvs/data/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the scale function.
  warnings.warn(msg, DataConversionWarning)
/Users/soma/.virtualenvs/data/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype int64 was converted to float64 by the scale function.
  warnings.warn(msg, DataConversionWarning)
import numpy as np
plt.scatter(df['scaled_weight'], df['scaled_height'], edgecolor='none', c=df['cluster_4'], alpha=0.5)
plt.grid()

png

# Make a new KMeans, 5 clusters
# fit it with the scaled weight and height
# predict it based on scaled weight and height
# store those labels into a new column
# graph it with the new labels
km = KMeans(n_clusters=5)
# Same column order (weight, height) for both fit and predict.
km.fit(df[['scaled_weight', 'scaled_height']])
df['scaled_prediction'] = km.predict(df[['scaled_weight', 'scaled_height']])
# How many players landed in each cluster.
df['scaled_prediction'].value_counts()
2    110
4     99
0     92
1     82
3      1
Name: scaled_prediction, dtype: int64
plt.scatter(df['scaled_weight'], df['scaled_height'], edgecolor='none', c=df['scaled_prediction'], alpha=0.5)
plt.scatter(km.cluster_centers_[:,0], km.cluster_centers_[:,1], s=100, c=np.unique(km.labels_))
plt.grid()

png

plt.scatter(df['scaled_weight'], df['scaled_height'], edgecolor='none', c=df['POS_label'], alpha=0.5)
plt.grid()

png

df.groupby('scaled_prediction')['POS'].value_counts()
scaled_prediction  POS
0                  C       61
                   F       31
1                  G       82
2                  F      105
                   C        5
3                  G        1
4                  G       92
                   F        6
                   C        1
Name: POS, dtype: int64

Trying to use DBSCAN

from sklearn.cluster import DBSCAN

# Cluster on the standardized columns rather than the raw ones: DBSCAN's
# neighbourhood radius (eps) is a plain Euclidean distance, so raw inches and
# pounds are not comparable, and the scatter plot below is drawn in the same
# scaled space. Points DBSCAN treats as noise get the label -1.
db = DBSCAN()
df['db_category'] = db.fit_predict(df[["scaled_height", "scaled_weight"]])
plt.scatter(df['scaled_weight'], df['scaled_height'], edgecolor='none', c=df['db_category'], alpha=0.5)
<matplotlib.collections.PathCollection at 0x10ccaa518>

png

# similarity measure between two clusterings by considering 
# all pairs of samples and counting pairs that are assigned
# in the same or different clusters in the predicted and true clusterings

from sklearn import metrics
metrics.adjusted_rand_score(df['POS_label'], df['scaled_prediction'])  
0.6837098610421475
# Adjusted mutual information measures the agreement between the actual and the predicted assignments, adjusted for chance
from sklearn import metrics
metrics.adjusted_mutual_info_score(df['POS_label'], df['scaled_prediction'])  
0.64644401811332386
# homogeneity: each cluster contains only members of a single class.

from sklearn import metrics
metrics.homogeneity_score(df['POS_label'], df['scaled_prediction'])  
0.67802428868913123
# completeness: all members of a given class are assigned to the same cluster.

from sklearn import metrics
metrics.completeness_score(df['POS_label'], df['scaled_prediction'])  
0.64816747653318207