# Give me a THING that will count words for me!!!!
# vec = CountVectorizer()
# I have some sentences, please count the words in them
# matrix = vec.fit_transform(phrases)
# And turn it into a dataframe
# docs = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())
# We'll use this later!
# def stemming_tokenizer(str_input):
#     words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
#     words = [porter_stemmer.stem(word) for word in words]
#     return words
# Sample documents about fish and bugs, reused by every vectorizer
# example below.
texts = [
    "Penny bought bright blue fishes.",
    "Penny bought a bright blue and orange fish.",
    "The fish fished fish.",
    "I'm fishing fish.",
    "I hate blue bugs",
    "A blue bug ate a fish",
    "fish",
]
# Make a new Count Vectorizer!!!!
# NOTE(review): the notebook export fused this statement into the comment,
# turning the whole cell into dead text; reconstructed here.
vec = CountVectorizer()
# Say hey vectorizer, please read our stuff
# (learns the vocabulary from `texts` and counts word occurrences)
matrix = vec.fit_transform(texts)
# Peek at the raw document-term counts as a dense array.
matrix.toarray()
# Make a new Count Vectorizer!!!!
vec = CountVectorizer()
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
and
blue
bought
bright
bugs
fish
fished
fishes
fishing
hate
orange
penny
the
0
0
1
1
1
0
0
0
1
0
0
0
1
0
1
1
1
1
1
0
1
0
0
0
0
1
1
0
2
0
0
0
0
0
2
1
0
0
0
0
0
1
3
0
0
0
0
0
1
0
0
1
0
0
0
0
4
0
1
0
0
1
0
0
0
0
1
0
0
0
Stop words or stopwords
# Make a new Count Vectorizer!!!!
# Explicit stop words: drop 'a' and 'the' before counting.
vec = CountVectorizer(stop_words=['a', 'the'])
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
and
blue
bought
bright
bugs
fish
fished
fishes
fishing
hate
orange
penny
0
0
1
1
1
0
0
0
1
0
0
0
1
1
1
1
1
1
0
1
0
0
0
0
1
1
2
0
0
0
0
0
2
1
0
0
0
0
0
3
0
0
0
0
0
1
0
0
1
0
0
0
4
0
1
0
0
1
0
0
0
0
1
0
0
# Make a new Count Vectorizer!!!!
# Use scikit-learn's built-in English stop-word list.
vec = CountVectorizer(stop_words='english')
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
from textblob import TextBlob


# Use TextBlob
def textblob_tokenizer(str_input):
    """Lowercase *str_input*, tokenize with TextBlob, and stem each token.

    Returns a list of stemmed word strings.
    """
    blob = TextBlob(str_input.lower())
    tokens = blob.words
    words = [token.stem() for token in tokens]
    return words


# Use NLTK's PorterStemmer
def stemming_tokenizer(str_input):
    """Lowercase, replace non-alphanumerics with spaces, split, and stem.

    NOTE(review): relies on a module-level `re` import and a
    `porter_stemmer` instance defined elsewhere in the file — confirm
    both are in scope before running.
    """
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    words = [porter_stemmer.stem(word) for word in words]
    return words
# Demo: the Porter stemmer maps "fishing" and "fishes" to the same stem.
stemming_tokenizer("I went fishing to get fishes")
['i', 'went', 'fish', 'to', 'get', 'fish']
# Demo: the TextBlob-based tokenizer produces the same stems here.
textblob_tokenizer("I went fishing to get fishes")
['i', 'went', 'fish', 'to', 'get', 'fish']
Using our tokenizer that also stems
# Make a new Count Vectorizer!!!!
# Plug in our stemming tokenizer so word variants collapse to one column.
vec = CountVectorizer(stop_words='english', tokenizer=textblob_tokenizer)
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
'm
ate
blue
bought
bright
bug
fish
hate
orang
penni
0
0
0
1
1
1
0
1
0
0
1
1
0
0
1
1
1
0
1
0
1
1
2
0
0
0
0
0
0
3
0
0
0
3
1
0
0
0
0
0
2
0
0
0
4
0
0
1
0
0
1
0
1
0
0
5
0
1
1
0
0
1
1
0
0
0
TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Make a new Tfidf Vectorizer!!!!
# With use_idf=False and norm='l1' (L - ONE) each row is plain term
# frequency normalized to sum to 1 per document — not true tf-idf.
vec = TfidfVectorizer(
    stop_words='english',
    tokenizer=textblob_tokenizer,
    use_idf=False,
    norm='l1',
)
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
'm
ate
blue
bought
bright
bug
fish
hate
orang
penni
0
0.000000
0.00
0.200000
0.200000
0.200000
0.000000
0.200000
0.000000
0.000000
0.200000
1
0.000000
0.00
0.166667
0.166667
0.166667
0.000000
0.166667
0.000000
0.166667
0.166667
2
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
3
0.333333
0.00
0.000000
0.000000
0.000000
0.000000
0.666667
0.000000
0.000000
0.000000
4
0.000000
0.00
0.333333
0.000000
0.000000
0.333333
0.000000
0.333333
0.000000
0.000000
5
0.000000
0.25
0.250000
0.000000
0.000000
0.250000
0.250000
0.000000
0.000000
0.000000
6
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
# Rank the documents by how heavily they weight the (stemmed) term 'fish'.
results.sort_values(by='fish',ascending=False)
'm
ate
blue
bought
bright
bug
fish
hate
orang
penni
2
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
6
0.000000
0.00
0.000000
0.000000
0.000000
0.000000
1.000000
0.000000
0.000000
0.000000
3
0.333333
0.00
0.000000
0.000000
0.000000
0.000000
0.666667
0.000000
0.000000
0.000000
5
0.000000
0.25
0.250000
0.000000
0.000000
0.250000
0.250000
0.000000
0.000000
0.000000
0
0.000000
0.00
0.200000
0.200000
0.200000
0.000000
0.200000
0.000000
0.000000
0.200000
1
0.000000
0.00
0.166667
0.166667
0.166667
0.000000
0.166667
0.000000
0.166667
0.166667
4
0.000000
0.00
0.333333
0.000000
0.000000
0.333333
0.000000
0.333333
0.000000
0.000000
Only counting certain words
from sklearn.feature_extraction.text import TfidfVectorizer

# Make a new Tfidf Vectorizer!!!!
# `vocabulary` restricts counting to just these terms; use_idf=False with
# norm='l1' (L - ONE) gives per-document term-frequency shares.
vec = TfidfVectorizer(
    stop_words='english',
    vocabulary=['fish', 'bug'],
    tokenizer=textblob_tokenizer,
    use_idf=False,
    norm='l1',
)
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
fish
bug
0
1.0
0.0
1
1.0
0.0
2
1.0
0.0
3
1.0
0.0
4
0.0
1.0
5
0.5
0.5
6
1.0
0.0
# Make a new Count Vectorizer!!!!
# Raw counts for only the two terms we care about.
vec = CountVectorizer(
    stop_words='english',
    vocabulary=['fish', 'bug'],
    tokenizer=textblob_tokenizer,
)
# Say hey vectorizer, please read our stuff
matrix = vec.fit_transform(texts)
# And make a dataframe out of it.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the current equivalent.
results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names_out())
results
fish
bug
0
1
0
1
1
0
2
3
0
3
2
0
4
0
1
5
1
1
6
1
0
Want to hear when I release new things? My infrequent and sporadic newsletter can help with that.