# Give me a THING that will count words for me!!!!
# vec = CountVectorizer()
# I have some sentences, please count the words in them
# matrix = vec.fit_transform(phrases)
# And turn it into a dataframe
# docs = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
# We'll use this later!
# def stemming_tokenizer(str_input):
#     words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
#     words = [porter_stemmer.stem(word) for word in words]
#     return words
# Sample documents about fish and bugs, reused by every vectorizer
# example below.
texts = [
    "Penny bought bright blue fishes.",
    "Penny bought a bright blue and orange fish.",
    "The fish fished fish.",
    "I'm fishing fish.",
    "I hate blue bugs",
    "A blue bug ate a fish",
    "fish",
]
# Make a new Count Vectorizer!!!!
# NOTE(review): the notebook export fused this statement into the comment,
# turning the whole cell into dead text; reconstructed here.
vec = CountVectorizer()
# Say hey vectorizer, please read our stuff
# (learns the vocabulary from `texts` and counts word occurrences)
matrix = vec.fit_transform(texts)
# Peek at the raw document-term counts as a dense array.
matrix.toarray()
# Make a new Count Vectorizer!!!!
vec = CountVectorizer()
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
and
blue
bought
bright
bugs
fish
fished
fishes
fishing
hate
orange
penny
the
0
0
1
1
1
0
0
0
1
0
0
0
1
0
1
1
1
1
1
0
1
0
0
0
0
1
1
0
2
0
0
0
0
0
2
1
0
0
0
0
0
1
3
0
0
0
0
0
1
0
0
1
0
0
0
0
4
0
1
0
0
1
0
0
0
0
1
0
0
0
Stop words or stopwords
# Make a new Count Vectorizer!!!!
# Explicit stop words: drop 'a' and 'the' before counting.
vec = CountVectorizer(stop_words=['a', 'the'])
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
and
blue
bought
bright
bugs
fish
fished
fishes
fishing
hate
orange
penny
0
0
1
1
1
0
0
0
1
0
0
0
1
1
1
1
1
1
0
1
0
0
0
0
1
1
2
0
0
0
0
0
2
1
0
0
0
0
0
3
0
0
0
0
0
1
0
0
1
0
0
0
4
0
1
0
0
1
0
0
0
0
1
0
0
# Make a new Count Vectorizer!!!!
# Use scikit-learn's built-in English stop-word list.
vec = CountVectorizer(stop_words='english')
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
from textblob import TextBlob


# Use TextBlob
def textblob_tokenizer(str_input):
    """Lowercase *str_input*, tokenize with TextBlob, and stem each token.

    Returns a list of stemmed word strings.
    """
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words


# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    """Lowercase, replace non-alphanumerics with spaces, split, and stem.

    NOTE(review): relies on a module-level `re` import and a
    `porter_stemmer` instance defined elsewhere in the file — confirm
    both are in scope before running.
    """
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words
# Demo: the Porter stemmer maps "fishing" and "fishes" to the same stem.
stemming_tokenizer("I went fishing to get fishes")
['i', 'went', 'fish', 'to', 'get', 'fish']
# Demo: the TextBlob-based tokenizer produces the same stems here.
textblob_tokenizer("I went fishing to get fishes")
['i', 'went', 'fish', 'to', 'get', 'fish']
Using our tokenizer that also stems
# Make a new Count Vectorizer!!!!
# Plug in our stemming tokenizer so word variants collapse to one column.
vec = CountVectorizer(stop_words='english', tokenizer=textblob_tokenizer)
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
'm
ate
blue
bought
bright
bug
fish
hate
orang
penni
0
0
0
1
1
1
0
1
0
0
1
1
0
0
1
1
1
0
1
0
1
1
2
0
0
0
0
0
0
3
0
0
0
3
1
0
0
0
0
0
2
0
0
0
4
0
0
1
0
0
1
0
1
0
0
5
0
1
1
0
0
1
1
0
0
0
TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Make a new Tfidf Vectorizer!!!!
# With use_idf=False and norm='l1' (L - ONE) each row is plain term
# frequency normalized to sum to 1 per document — not true tf-idf.
vec = TfidfVectorizer(
    stop_words='english',
    tokenizer=textblob_tokenizer,
    use_idf=False,
    norm='l1',
)
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
'm
ate
blue
bought
bright
bug
fish
hate
orang
penni
0
0.000000
0.00
0.200000
0.200000
0.200000
0.000000
0.200000
0.000000
0.000000
0.200000
1
0.000000
0.00
0.166667
0.166667
0.166667
0.000000
0.166667
0.000000
0.166667
0.166667
2
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
3
0.333333
0.00
0.000000
0.000000
0.000000
0.000000
0.666667
0.000000
0.000000
0.000000
4
0.000000
0.00
0.333333
0.000000
0.000000
0.333333
0.000000
0.333333
0.000000
0.000000
5
0.000000
0.25
0.250000
0.000000
0.000000
0.250000
0.250000
0.000000
0.000000
0.000000
6
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
# Rank the documents by how heavily they weight the (stemmed) term 'fish'.
results.sort_values(by='fish',ascending=False)
'm
ate
blue
bought
bright
bug
fish
hate
orang
penni
2
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
6
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
3
0.333333
0.00
0.000000
0.000000
0.000000
0.000000
0.666667
0.000000
0.000000
0.000000
5
0.000000
0.25
0.250000
0.000000
0.000000
0.250000
0.250000
0.000000
0.000000
0.000000
0
0.000000
0.00
0.200000
0.200000
0.200000
0.000000
0.200000
0.000000
0.000000
0.200000
1
0.000000
0.00
0.166667
0.166667
0.166667
0.000000
0.166667
0.000000
0.166667
0.166667
4
0.000000
0.00
0.333333
0.000000
0.000000
0.333333
0.000000
0.333333
0.000000
0.000000
Only counting certain words
from sklearn.feature_extraction.text import TfidfVectorizer

# Make a new Tfidf Vectorizer!!!!
# `vocabulary` restricts counting to just these terms; use_idf=False with
# norm='l1' (L - ONE) gives per-document term-frequency shares.
vec = TfidfVectorizer(
    stop_words='english',
    vocabulary=['fish', 'bug'],
    tokenizer=textblob_tokenizer,
    use_idf=False,
    norm='l1',
)
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
fish
bug
0
1.0
0.0
1
1.0
0.0
2
1.0
0.0
3
1.0
0.0
4
0.0
1.0
5
0.5
0.5
6
1.0
0.0
# Make a new Count Vectorizer!!!!
# Raw counts for only the two terms we care about.
vec = CountVectorizer(
    stop_words='english',
    vocabulary=['fish', 'bug'],
    tokenizer=textblob_tokenizer,
)
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
fish
bug
0
1
0
1
1
0
2
3
0
3
2
0
4
0
1
5
1
1
6
1
0
Want to hear when I release new things? My infrequent and sporadic newsletter can help with that.