# A little more about counting and stemming

Believe it or not, beyond just stemming there are **multiple ways to count words!**

And what words do you count? Originals? Lowercased? Stemmed? Lemmatized?

There are so many options!

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
# # Give me a THING that will count words for me!!!!!
# vec = CountVectorizer()
# # I have some sentences, please count the words in them
# matrix = vec.fit_transform(phrases)
# # And turn it into a dataframe
# docs = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())

In [3]:
# We'll use this later!
# def stemming_tokenizer(str_input):
#     words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
#     words = [porter_stemmer.stem(word) for word in words]
#     return words

In [51]:
# Example sentences to vectorize; each string is passed to the vectorizers
# below as one document (one row of the resulting matrix).
# NOTE(review): several saved outputs further down show only 5 or 6 rows even
# though this list has 7 entries, so they look like they were produced before
# the last sentences were added -- re-run the notebook top to bottom to
# refresh them.
texts = [
    "Penny bought bright blue fishes.",
    "Penny bought a bright blue and orange fish.",
    "The fish fished fish.",
    "I'm fishing fish.",
    "I hate blue bugs",
    "A blue bug ate a fish",
    "fish"
]

In [5]:
# Make a new Count Vectorizer!!!!
vec = CountVectorizer()

In [8]:
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
matrix.toarray()

array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)

In [9]:
# Show the vocabulary the vectorizer learned, in column order.
# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out() (scikit-learn >= 1.0), returns an array, so wrap it
# in list() to keep displaying a plain list of words.
list(vec.get_feature_names_out())

['and',
 'blue',
 'bought',
 'bright',
 'bugs',
 'fish',
 'fished',
 'fishes',
 'fishing',
 'hate',
 'orange',
 'penny',
 'the']

In [11]:
# Turn the count matrix into a dataframe: one row per sentence, one column
# per word. get_feature_names_out() (scikit-learn >= 1.0) replaces
# get_feature_names(), which was removed in scikit-learn 1.2.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results

Unnamed: 0,and,blue,bought,bright,bugs,fish,fished,fishes,fishing,hate,orange,penny,the
0,0,1,1,1,0,0,0,1,0,0,0,1,0
1,1,1,1,1,0,1,0,0,0,0,1,1,0
2,0,0,0,0,0,2,1,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,1,0,0,0,0
4,0,1,0,0,1,0,0,0,0,1,0,0,0


In [12]:
# The whole counting recipe in one cell: build, fit, and tabulate.

# Make a new Count Vectorizer!!!!
vec = CountVectorizer()

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it (one row per sentence, one column per word).
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results

Unnamed: 0,and,blue,bought,bright,bugs,fish,fished,fishes,fishing,hate,orange,penny,the
0,0,1,1,1,0,0,0,1,0,0,0,1,0
1,1,1,1,1,0,1,0,0,0,0,1,1,0
2,0,0,0,0,0,2,1,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,1,0,0,0,0
4,0,1,0,0,1,0,0,0,0,1,0,0,0


## Stop words or stopwords

In [14]:
# Same counting recipe, but with a hand-written stop word list.

# Make a new Count Vectorizer!!!!
# stop_words takes a list of words to leave out of the vocabulary entirely.
vec = CountVectorizer(stop_words=['a', 'the'])

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it (one row per sentence, one column per word).
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results

Unnamed: 0,and,blue,bought,bright,bugs,fish,fished,fishes,fishing,hate,orange,penny
0,0,1,1,1,0,0,0,1,0,0,0,1
1,1,1,1,1,0,1,0,0,0,0,1,1
2,0,0,0,0,0,2,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0
4,0,1,0,0,1,0,0,0,0,1,0,0


In [18]:
# Same counting recipe, but with scikit-learn's built-in English stop words.

# Make a new Count Vectorizer!!!!
# stop_words='english' selects the library's own English stop word list
# instead of one we type out ourselves.
vec = CountVectorizer(stop_words='english')

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it (one row per sentence, one column per word).
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results

Unnamed: 0,ate,blue,bought,bright,bug,bugs,fish,fished,fishes,fishing,hate,orange,penny
0,0,1,1,1,0,0,0,0,1,0,0,0,1
1,0,1,1,1,0,0,1,0,0,0,0,1,1
2,0,0,0,0,0,0,2,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,1,0,0,0
4,0,1,0,0,0,1,0,0,0,0,1,0,0
5,1,1,0,0,1,0,1,0,0,0,0,0,0


## Stemming

In [19]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [20]:
porter_stemmer.stem("fish")

'fish'

In [21]:
porter_stemmer.stem("fishes")

'fish'

In [22]:
porter_stemmer.stem("fishing")

'fish'

In [26]:
porter_stemmer.stem("fishingsaloos")

'fishingsaloo'

In [43]:
from textblob import TextBlob

# Use TextBlob
def textblob_tokenizer(str_input):
    """Lowercase ``str_input``, split it into words with TextBlob, and stem each word.

    Returns a plain list holding one stem per word, in the order the words
    appear in the text.
    """
    lowered = str_input.lower()
    return [word.stem() for word in TextBlob(lowered).words]

# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    """Split ``str_input`` into lowercase tokens and stem each with ``porter_stemmer``.

    Every character other than an ASCII letter, a digit, or a hyphen is
    replaced by a space before the text is lowercased and split on whitespace.
    Returns the list of stems in the order the tokens appear.
    """
    cleaned = re.sub(r"[^A-Za-z0-9\-]", " ", str_input)
    return [porter_stemmer.stem(token) for token in cleaned.lower().split()]

In [44]:
stemming_tokenizer("I went fishing to get fishes")

['i', 'went', 'fish', 'to', 'get', 'fish']

In [45]:
textblob_tokenizer("I went fishing to get fishes")

['i', 'went', 'fish', 'to', 'get', 'fish']

### Using our tokenizer that also stems

In [46]:
# Count stems instead of raw words by handing the vectorizer our own tokenizer.

# Make a new Count Vectorizer!!!!
# tokenizer=textblob_tokenizer makes the vectorizer split (and stem) each
# sentence with our function rather than its default word splitting.
vec = CountVectorizer(stop_words='english', tokenizer=textblob_tokenizer)

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it (one row per sentence, one column per stem).
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results

Unnamed: 0,'m,ate,blue,bought,bright,bug,fish,hate,orang,penni
0,0,0,1,1,1,0,1,0,0,1
1,0,0,1,1,1,0,1,0,1,1
2,0,0,0,0,0,0,3,0,0,0
3,1,0,0,0,0,0,2,0,0,0
4,0,0,1,0,0,1,0,1,0,0
5,0,1,1,0,0,1,1,0,0,0


## TF-IDF

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instead of raw counts, report each stem as a share of its sentence.

# Make a new Tfidf Vectorizer!!!!
# use_idf=False switches off the inverse-document-frequency weighting, and
# norm='l1' divides every row by its total, so each row adds up to 1.
vec = TfidfVectorizer(stop_words='english',
                      tokenizer=textblob_tokenizer,
                      use_idf=False,
                      norm='l1') # L - ONE

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it (one row per sentence, one column per stem).
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results

Unnamed: 0,'m,ate,blue,bought,bright,bug,fish,hate,orang,penni
0,0.0,0.0,0.2,0.2,0.2,0.0,0.2,0.0,0.0,0.2
1,0.0,0.0,0.166667,0.166667,0.166667,0.0,0.166667,0.0,0.166667,0.166667
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.333333,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0
4,0.0,0.0,0.333333,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0
5,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [54]:
results.sort_values(by='fish', ascending=False)

Unnamed: 0,'m,ate,blue,bought,bright,bug,fish,hate,orang,penni
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.333333,0.0,0.0,0.0,0.0,0.0,0.666667,0.0,0.0,0.0
5,0.0,0.25,0.25,0.0,0.0,0.25,0.25,0.0,0.0,0.0
0,0.0,0.0,0.2,0.2,0.2,0.0,0.2,0.0,0.0,0.2
1,0.0,0.0,0.166667,0.166667,0.166667,0.0,0.166667,0.0,0.166667,0.166667
4,0.0,0.0,0.333333,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0


## Only counting certain words

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Restrict the table to a fixed set of words we care about.

# Make a new Tfidf Vectorizer!!!!
# vocabulary= fixes the columns to exactly these entries, in this order.
# They are written as stems ('fish', 'bug') because the tokenizer stems every
# word before it is looked up. With norm='l1' each row is divided by its
# total over just these two columns.
vec = TfidfVectorizer(stop_words='english',
                      vocabulary=['fish', 'bug'],
                      tokenizer=textblob_tokenizer,
                      use_idf=False,
                      norm='l1') # L - ONE

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it (one row per sentence, one column per stem).
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results

Unnamed: 0,fish,bug
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,0.0,1.0
5,0.5,0.5
6,1.0,0.0


In [56]:
# Raw counts again, but only for the fixed set of stems we care about.

# Make a new Count Vectorizer!!!!
# vocabulary= fixes the columns to exactly these entries, in this order; they
# are written as stems because the tokenizer stems every word first.
vec = CountVectorizer(stop_words='english',
                      vocabulary=['fish', 'bug'],
                      tokenizer=textblob_tokenizer)

# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)

# And make a dataframe out of it (one row per sentence, one column per stem).
# get_feature_names_out() (scikit-learn >= 1.0) replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results

Unnamed: 0,fish,bug
0,1,0
1,1,0
2,3,0
3,2,0
4,0,1
5,1,1
6,1,0
