# KMeans clustering ANY documents

## Read in your files if they're all separate

In [49]:
import pandas as pd
import glob

# sorted() keeps the row order reproducible: glob returns matches in
# arbitrary, filesystem-dependent order
filenames = sorted(glob.glob("fanfiction-harry-potter/*.txt"))

# latin-1 maps every possible byte to a character, so reading never fails
# with a decode error. If you know your files are UTF-8, use encoding='utf-8'.
# The `with` block closes each file as soon as it has been read.
contents = []
for filename in filenames:
    with open(filename, encoding='latin-1') as f:
        contents.append(f.read())

# One row per document: the full text plus the path it came from
df = pd.DataFrame({
    'text': contents,
    'filenames': filenames
})
df.head()

Unnamed: 0,filenames,text
0,fanfiction-harry-potter/10001898.txt,Prologue: The MissionDisclaimer: All character...
1,fanfiction-harry-potter/10004131.txt,BlackDisclaimer: I do not own Harry PotterAuth...
2,fanfiction-harry-potter/10004927.txt,"Chapter 1""I'm pregnant.""""""""Mum please say some..."
3,fanfiction-harry-potter/10007980.txt,"Author's Note: Hey, just so you know, this is ..."
4,fanfiction-harry-potter/10010343.txt,Disclaimer: I do not own Harry Potter and frie...


## Or read in your CSV with the text column if not

In [50]:
# import pandas as pd
# df = pd.read_csv("WHAT IS THE FILE???")

## Vectorize your documents

What are the options when creating a `TfidfVectorizer`?

In [37]:
TfidfVectorizer?

Let's think about:

* **ngram_range**: Do we just want single words? Or more? `(1,2)` is one- and two-word phrases, etc.
* **max_features**: Keep only the top N terms by frequency across the corpus. Can it make things faster? An integer, `1` and up
* **max_df**: Should we ignore words that show up too often? `0.0`-`1.0` for a proportion of documents, OR an integer for absolute document counts
* **min_df**: Should we ignore words that show up too rarely? `0.0`-`1.0` for a proportion of documents, OR an integer for absolute document counts
* **vocabulary**: Only care about certain words

Also... how many documents do we have?

In [52]:
# .shape is (rows, columns); df has one row per document read in above
df.shape

(1874, 2)

In [67]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob


def textblob_tokenizer(str_input):
    """Tokenize text into lowercased, stemmed words using TextBlob.

    The input string is lowercased, split into words through
    ``TextBlob(...).words``, and every word is replaced by the result of
    its ``.stem()`` method.

    Returns a list containing one stemmed token per word.
    """
    lowered = str_input.lower()
    return [word.stem() for word in TextBlob(lowered).words]

# CUSTOM STOPWORDS
# FIRST, create a list of custom stopwords: our own additions plus
# scikit-learn's built-in English list.
# (Imported from sklearn.feature_extraction.text because the separate
# sklearn.feature_extraction.stop_words module was removed in scikit-learn 0.24)
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
custom_stopwords = ['just', 'said', 'love'] + list(ENGLISH_STOP_WORDS)

# CUSTOM STOPWORDS
# THEN, instead of stop_words='english'
# add your custom stopwords into the vectorizer
vec = TfidfVectorizer(stop_words=custom_stopwords,
                      max_df=0.9, # Ignore terms found in more than 90% of documents
                      min_df=0.15, # Ignore terms found in fewer than 15% of documents
                      max_features=250,
                      use_idf=True)

# Fit from the 'text' column of our dataframe
matrix = vec.fit_transform(df['text'])

# get_feature_names() was removed in scikit-learn 1.2; only fall back to it
# on versions that predate get_feature_names_out() (added in 1.0)
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# Then turn it into a new dataframe: one row per document, one column per term
results = pd.DataFrame(matrix.toarray(), columns=feature_names)

CPU times: user 4.2 s, sys: 85.9 ms, total: 4.28 s
Wall time: 4.4 s


In [68]:
results.head()

Unnamed: 0,able,actually,albus,arm,arms,ask,asked,away,bad,bed,...,won,words,work,world,wouldn,yeah,year,years,yes,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.05829,0.0,0.0,0.0,...,0.026641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021998,0.161977
1,0.007969,0.01581,0.021035,0.0,0.015247,0.008246,0.03487,0.05408,0.0,0.014964,...,0.0,0.068611,0.007508,0.0,0.021064,0.0,0.028493,0.036743,0.032899,0.088822
2,0.0,0.030736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029091,...,0.12393,0.0,0.029191,0.0,0.027299,0.0,0.044313,0.0,0.025583,0.094186
3,0.009575,0.0,0.012638,0.019594,0.009161,0.039633,0.034917,0.006498,0.009812,0.0,...,0.0,0.009161,0.009022,0.0,0.0,0.043148,0.102715,0.066227,0.039532,0.058217
4,0.023051,0.0,0.0,0.04717,0.022052,0.0,0.050434,0.078219,0.02362,0.0,...,0.0,0.022052,0.0,0.060497,0.0,0.0,0.049454,0.053143,0.038067,0.256938


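> ...Try it without the TextBlob tokenizer (as above, where `textblob_tokenizer` is defined but not passed to the vectorizer), then with `tokenizer=textblob_tokenizer` to compare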

## Cluster your documents

In [69]:
%%time
from sklearn.cluster import KMeans

# How many clusters?
number_of_clusters=2

# KMeans starts from randomly chosen centroids, so pin random_state to get
# the same clusters on every run. n_init is set explicitly because its
# default changed from 10 to 'auto' in scikit-learn 1.4.
km = KMeans(n_clusters=number_of_clusters, random_state=42, n_init=10)

print("Fitting", number_of_clusters, "clusters usinga ", matrix.shape, "matrix")

# Let's fit it!
km.fit(matrix)

Fitting 2 clusters usinga  (1874, 250) matrix
CPU times: user 18.5 s, sys: 148 ms, total: 18.7 s
Wall time: 19.3 s


## See what they look like

In [70]:
print("Top terms per cluster:")
# argsort() sorts ascending, so reverse each row to put every centroid's
# highest-weighted term indices first
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; only fall back to it
# on versions that predate get_feature_names_out() (added in 1.0)
if hasattr(vec, 'get_feature_names_out'):
    terms = vec.get_feature_names_out()
else:
    terms = vec.get_feature_names()

top_n = 5  # how many terms to show for each cluster
for i in range(number_of_clusters):
    top_words = [terms[ind] for ind in order_centroids[i, :top_n]]
    print("Cluster {}: {}".format(i, ' '.join(top_words)))

Top terms per cluster:
Cluster 0: james lily said just like
Cluster 1: harry hermione ron draco said


## Push the category back to the original dataframe

In [72]:
# km.labels_ holds one cluster index per row of the fitted matrix, which was
# built from df['text'], so the labels line up with df's rows
df['category'] = km.labels_

# Preview the first rows rather than dumping the whole frame
df.head(10)

Unnamed: 0,filenames,text,category
0,fanfiction-harry-potter/10001898.txt,Prologue: The MissionDisclaimer: All character...,0
1,fanfiction-harry-potter/10004131.txt,BlackDisclaimer: I do not own Harry PotterAuth...,0
2,fanfiction-harry-potter/10004927.txt,"Chapter 1""I'm pregnant.""""""""Mum please say some...",0
3,fanfiction-harry-potter/10007980.txt,"Author's Note: Hey, just so you know, this is ...",1
4,fanfiction-harry-potter/10010343.txt,Disclaimer: I do not own Harry Potter and frie...,0
5,fanfiction-harry-potter/10017757.txt,Disclaimer: I don't own any character in the H...,0
6,fanfiction-harry-potter/10018490.txt,DISCLAIMER: I don't own Harry Potter and its c...,0
7,fanfiction-harry-potter/10018889.txt,Katherine Rose-TylerChapter One: the Introduct...,0
8,fanfiction-harry-potter/10019142.txt,I am no longer that shy little boy anymore.I w...,0
9,fanfiction-harry-potter/10019987.txt,Happy New year! *throws confetti*I've really b...,0


## Be pleased

In [73]:
# Boolean mask: True for every document assigned to cluster 1
in_cluster_one = df['category'] == 1
df.loc[in_cluster_one]

Unnamed: 0,filenames,text,category
3,fanfiction-harry-potter/10007980.txt,"Author's Note: Hey, just so you know, this is ...",1
10,fanfiction-harry-potter/10021604.txt,"2014""It's ridiculous."" The red-headed boy shoo...",1
19,fanfiction-harry-potter/10041730.txt,A/N: This story follows all of canon besides f...,1
20,fanfiction-harry-potter/10043489.txt,It's 8th year at Hogwarts. Voldemort is dead. ...,1
22,fanfiction-harry-potter/10045762.txt,Prologue: The Puzzle That Wasn't Meant To Be S...,1
24,fanfiction-harry-potter/10050162.txt,I stood at the top of the Astronomy Tower thin...,1
25,fanfiction-harry-potter/10051779.txt,Title: Twin Dragon Heartstring Cores.Author: L...,1
26,fanfiction-harry-potter/10052973.txt,"Chapter 1Walking down the corridor to potions,...",1
30,fanfiction-harry-potter/10061747.txt,The air whistles and blows through chestnut lo...,1
31,fanfiction-harry-potter/10061794.txt,"Yay! Okay, now to clear this up. Story takes p...",1


In [None]:
# bella
# edward
# just
# said

In [76]:
# ENGLISH_STOP_WORDS is importable from sklearn.feature_extraction.text; the
# separate sklearn.feature_extraction.stop_words module was removed in
# scikit-learn 0.24
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Custom additions first, followed by scikit-learn's built-in English list
['said', 'thee', 'ye'] + list(ENGLISH_STOP_WORDS)

['said',
 'thee',
 'ye',
 'after',
 'mostly',
 'my',
 'whereafter',
 'been',
 'sincere',
 'see',
 'con',
 'and',
 'elsewhere',
 'every',
 'however',
 'others',
 'couldnt',
 'made',
 'over',
 'such',
 'since',
 'became',
 'any',
 'else',
 'below',
 'these',
 'without',
 'against',
 'amoungst',
 'still',
 'whole',
 'we',
 'all',
 'or',
 'full',
 'am',
 'have',
 'together',
 'across',
 'into',
 'hence',
 'sometimes',
 'thereafter',
 'must',
 'first',
 'moreover',
 'only',
 'alone',
 'than',
 'empty',
 'they',
 'yourselves',
 'myself',
 'few',
 'out',
 'cry',
 'ten',
 'done',
 'throughout',
 'twenty',
 'were',
 'whither',
 'also',
 'much',
 'here',
 'except',
 'sometime',
 'everything',
 'move',
 'as',
 'this',
 'un',
 'fire',
 'him',
 'an',
 'nevertheless',
 'latter',
 'take',
 'us',
 'etc',
 'along',
 'some',
 'off',
 'beside',
 'get',
 'whence',
 'anyone',
 'seemed',
 'towards',
 'further',
 'thus',
 'back',
 'interest',
 'never',
 'he',
 'ltd',
 'part',
 'yours',
 'no',
 'seem',
 'perh