A little more about counting and stemming

Believe it or not, there's more to this than just stemming: there are multiple ways to count words!

And which words do you count? The originals? Lowercased? Stemmed? Lemmatized?

There are so many options!
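Here's a quick sketch of the difference, assuming you have nltk (with its WordNet data downloaded) and textblob installed. The counts you get depend entirely on which version of a word you keep:

from nltk.stem.porter import PorterStemmer
from textblob import Word

word = "Fishes"
# Original: 'Fishes'
print(word)
# Lowercased: 'fishes'
print(word.lower())
# Stemmed: 'fish'
print(PorterStemmer().stem(word.lower()))
# Lemmatized (this one needs the WordNet corpus): 'fish'
print(Word(word.lower()).lemmatize())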

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
# # Give me a THING that will count words for me!!!!!
# vec = CountVectorizer()
# # I have some sentences, please count the words in them
# matrix = vec.fit_transform(phrases)
# # And turn it into a dataframe
# docs = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
# We'll use this later!
# def stemming_tokenizer(str_input):
#     words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
#     words = [porter_stemmer.stem(word) for word in words]
#     return words
texts = [
    "Penny bought bright blue fishes.",
    "Penny bought a bright blue and orange fish.",
    "The fish fished fish.",
    "I'm fishing fish.",
    "I hate blue bugs",
    "A blue bug ate a fish",
    "fish"
]
# Make a new Count Vectorizer!!!!
vec = CountVectorizer()
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
matrix.toarray()
array([[0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)
vec.get_feature_names()
['and',
 'ate',
 'blue',
 'bought',
 'bright',
 'bug',
 'bugs',
 'fish',
 'fished',
 'fishes',
 'fishing',
 'hate',
 'orange',
 'penny',
 'the']
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results
and ate blue bought bright bug bugs fish fished fishes fishing hate orange penny the
0 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0
1 1 0 1 1 1 0 0 1 0 0 0 0 1 1 0
2 0 0 0 0 0 0 0 2 1 0 0 0 0 0 1
3 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
4 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
5 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
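By the way, if you're ever unsure which column belongs to which word, the fitted vectorizer keeps a vocabulary_ dictionary mapping each word to its column index:

# Which column is each word? A dict of word -> column index,
# e.g. 'penny' -> 13
vec.vocabulary_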

Stop words or stopwords

Stop words (or stopwords) are words so common they're usually not worth counting, like "a" and "the". You can hand CountVectorizer your own list, or ask for its built-in English list.

# Make a new Count Vectorizer!!!!
vec = CountVectorizer(stop_words=['a', 'the'])

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results
and ate blue bought bright bug bugs fish fished fishes fishing hate orange penny
0 0 0 1 1 1 0 0 0 0 1 0 0 0 1
1 1 0 1 1 1 0 0 1 0 0 0 0 1 1
2 0 0 0 0 0 0 0 2 1 0 0 0 0 0
3 0 0 0 0 0 0 0 1 0 0 1 0 0 0
4 0 0 1 0 0 0 1 0 0 0 0 1 0 0
5 0 1 1 0 0 1 0 1 0 0 0 0 0 0
6 0 0 0 0 0 0 0 1 0 0 0 0 0 0
# Make a new Count Vectorizer!!!!
vec = CountVectorizer(stop_words='english')

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results
ate blue bought bright bug bugs fish fished fishes fishing hate orange penny
0 0 1 1 1 0 0 0 0 1 0 0 0 1
1 0 1 1 1 0 0 1 0 0 0 0 1 1
2 0 0 0 0 0 0 2 1 0 0 0 0 0
3 0 0 0 0 0 0 1 0 0 1 0 0 0
4 0 1 0 0 0 1 0 0 0 0 1 0 0
5 1 1 0 0 1 0 1 0 0 0 0 0 0
6 0 0 0 0 0 0 1 0 0 0 0 0 0
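Where did that 'english' list come from? scikit-learn ships its own list of a few hundred English stop words. In recent versions you should be able to import and peek at it like this:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# How many words is scikit-learn throwing away?
len(ENGLISH_STOP_WORDS)

# Peek at the first few, alphabetically
sorted(ENGLISH_STOP_WORDS)[:10]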

Stemming

Stemming chops words down to a common root, so "fishes", "fished" and "fishing" can all be counted as the same word, "fish".

from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem("fish")
'fish'
porter_stemmer.stem("fishes")
'fish'
porter_stemmer.stem("fishing")
'fish'
porter_stemmer.stem("fishingsaloos")
'fishingsaloo'
from textblob import TextBlob

# Use TextBlob
def textblob_tokenizer(str_input):
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words

# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words
stemming_tokenizer("I went fishing to get fishes")
['i', 'went', 'fish', 'to', 'get', 'fish']
textblob_tokenizer("I went fishing to get fishes")
['i', 'went', 'fish', 'to', 'get', 'fish']

Using our tokenizer that also stems

CountVectorizer will happily use our function instead of its default tokenizer: pass it in with the tokenizer= keyword, and the counting happens on stems instead of raw words.

# Make a new Count Vectorizer!!!!
vec = CountVectorizer(stop_words='english', tokenizer=textblob_tokenizer)

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results
'm ate blue bought bright bug fish hate orang penni
0 0 0 1 1 1 0 1 0 0 1
1 0 0 1 1 1 0 1 0 1 1
2 0 0 0 0 0 0 3 0 0 0
3 1 0 0 0 0 0 2 0 0 0
4 0 0 1 0 0 1 0 1 0 0
5 0 1 1 0 0 1 1 0 0 0
6 0 0 0 0 0 0 1 0 0 0

TF-IDF

TF-IDF stands for term frequency-inverse document frequency. We'll start with just the "TF" half: use_idf=False turns the IDF part off, and norm='l1' divides each row by its total, so instead of raw counts you get the fraction of the document that each word makes up.

from sklearn.feature_extraction.text import TfidfVectorizer

# Make a new Tfidf Vectorizer!!!!
vec = TfidfVectorizer(stop_words='english', 
                      tokenizer=textblob_tokenizer,
                      use_idf=False,
                      norm='l1') # L - ONE

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results
'm ate blue bought bright bug fish hate orang penni
0 0.000000 0.00 0.200000 0.200000 0.200000 0.000000 0.200000 0.000000 0.000000 0.200000
1 0.000000 0.00 0.166667 0.166667 0.166667 0.000000 0.166667 0.000000 0.166667 0.166667
2 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000
3 0.333333 0.00 0.000000 0.000000 0.000000 0.000000 0.666667 0.000000 0.000000 0.000000
4 0.000000 0.00 0.333333 0.000000 0.000000 0.333333 0.000000 0.333333 0.000000 0.000000
5 0.000000 0.25 0.250000 0.000000 0.000000 0.250000 0.250000 0.000000 0.000000 0.000000
6 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000
results.sort_values(by='fish', ascending=False)
'm ate blue bought bright bug fish hate orang penni
2 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000
6 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000
3 0.333333 0.00 0.000000 0.000000 0.000000 0.000000 0.666667 0.000000 0.000000 0.000000
5 0.000000 0.25 0.250000 0.000000 0.000000 0.250000 0.250000 0.000000 0.000000 0.000000
0 0.000000 0.00 0.200000 0.200000 0.200000 0.000000 0.200000 0.000000 0.000000 0.200000
1 0.000000 0.00 0.166667 0.166667 0.166667 0.000000 0.166667 0.000000 0.166667 0.166667
4 0.000000 0.00 0.333333 0.000000 0.000000 0.333333 0.000000 0.333333 0.000000 0.000000
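So far we've only used the "TF" half. Turning use_idf on (it's actually the default) adds the "IDF" half: words that show up in almost every document, like "fish", get scaled down, while words that only show up in a few documents get a relative boost. A minimal sketch using the same texts:

# Same as before, but with inverse document frequency turned on
vec = TfidfVectorizer(stop_words='english',
                      tokenizer=textblob_tokenizer,
                      use_idf=True,
                      norm='l1')
matrix = vec.fit_transform(texts)

# 'fish' is in almost every sentence, so its scores shrink;
# rarer words like 'orang' and 'penni' count for relatively more
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results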

Only counting certain words

If you only care about a handful of words, pass a vocabulary= list and the vectorizer will ignore everything else.

from sklearn.feature_extraction.text import TfidfVectorizer

# Make a new Tfidf Vectorizer!!!!
vec = TfidfVectorizer(stop_words='english', 
                      vocabulary=['fish', 'bug'],
                      tokenizer=textblob_tokenizer,
                      use_idf=False,
                      norm='l1') # L - ONE

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results
fish bug
0 1.0 0.0
1 1.0 0.0
2 1.0 0.0
3 1.0 0.0
4 0.0 1.0
5 0.5 0.5
6 1.0 0.0
# Make a new Count Vectorizer!!!!
vec = CountVectorizer(stop_words='english', 
                      vocabulary=['fish', 'bug'],
                      tokenizer=textblob_tokenizer)

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results
fish bug
0 1 0
1 1 0
2 3 0
3 2 0
4 0 1
5 1 1
6 1 0
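The NLTK-based stemming_tokenizer we wrote earlier should drop in exactly the same way. It handles punctuation a little differently than TextBlob, but for "fish" and "bug" the counts ought to come out identical:

# Same vectorizer, but tokenizing/stemming with NLTK's Porter stemmer
vec = CountVectorizer(stop_words='english',
                      vocabulary=['fish', 'bug'],
                      tokenizer=stemming_tokenizer)
matrix = vec.fit_transform(texts)

results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
results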