{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# A little more about counting and stemming\n",
"\n",
"Believe it or not, beyond just stemming there are **multiple ways to count words!**\n",
"\n",
"And what words do you count? Originals? Lowercased? Stemmed? Lemmatized?\n",
"\n",
"There are so many options!"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# # Give me a THING that will count words for me!!!!!\n",
"# vec = CountVectorizer()\n",
"# # I have some sentences, please count the words in them\n",
"# matrix = vec.fit_transform(phrases)\n",
"# # And turn it into a dataframe\n",
"# docs = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# We'll use this later!\n",
"# def stemming_tokenizer(str_input):\n",
"# words = re.sub(r\"[^A-Za-z0-9\\-]\", \" \", str_input).lower().split()\n",
"# words = [porter_stemmer.stem(word) for word in words]\n",
"# return words"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# NOTE(review): execution counts in this notebook are out of order; several\n",
"# earlier saved outputs were generated before the last two sentences below\n",
"# were added (they show only 5 documents). Restart Kernel & Run All to refresh.\n",
"texts = [\n",
"    \"Penny bought bright blue fishes.\",\n",
"    \"Penny bought a bright blue and orange fish.\",\n",
"    \"The fish fished fish.\",\n",
"    \"I'm fishing fish.\",\n",
"    \"I hate blue bugs\",\n",
"    \"A blue bug ate a fish\",\n",
"    \"fish\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Make a new Count Vectorizer!!!!\n",
"vec = CountVectorizer()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],\n",
" [1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0],\n",
" [0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1],\n",
" [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n",
" [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Say hey vectorizer, please read our stuff\n",
"matrix = vec.fit_transform(texts)\n",
"matrix.toarray()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['and',\n",
" 'blue',\n",
" 'bought',\n",
" 'bright',\n",
" 'bugs',\n",
" 'fish',\n",
" 'fished',\n",
" 'fishes',\n",
" 'fishing',\n",
" 'hate',\n",
" 'orange',\n",
" 'penny',\n",
" 'the']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get_feature_names() was deprecated and removed in scikit-learn 1.2;\n",
"# use get_feature_names_out() when available, falling back for old versions.\n",
"getattr(vec, 'get_feature_names_out', vec.get_feature_names)()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" and | \n",
" blue | \n",
" bought | \n",
" bright | \n",
" bugs | \n",
" fish | \n",
" fished | \n",
" fishes | \n",
" fishing | \n",
" hate | \n",
" orange | \n",
" penny | \n",
" the | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" and blue bought bright bugs fish fished fishes fishing hate \\\n",
"0 0 1 1 1 0 0 0 1 0 0 \n",
"1 1 1 1 1 0 1 0 0 0 0 \n",
"2 0 0 0 0 0 2 1 0 0 0 \n",
"3 0 0 0 0 0 1 0 0 1 0 \n",
"4 0 1 0 0 1 0 0 0 0 1 \n",
"\n",
" orange penny the \n",
"0 0 1 0 \n",
"1 1 1 0 \n",
"2 0 0 1 \n",
"3 0 0 0 \n",
"4 0 0 0 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build column names compatibly: get_feature_names() was removed in sklearn 1.2\n",
"feature_names = getattr(vec, 'get_feature_names_out', vec.get_feature_names)()\n",
"results = pd.DataFrame(matrix.toarray(), columns=feature_names)\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" and | \n",
" blue | \n",
" bought | \n",
" bright | \n",
" bugs | \n",
" fish | \n",
" fished | \n",
" fishes | \n",
" fishing | \n",
" hate | \n",
" orange | \n",
" penny | \n",
" the | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" and blue bought bright bugs fish fished fishes fishing hate \\\n",
"0 0 1 1 1 0 0 0 1 0 0 \n",
"1 1 1 1 1 0 1 0 0 0 0 \n",
"2 0 0 0 0 0 2 1 0 0 0 \n",
"3 0 0 0 0 0 1 0 0 1 0 \n",
"4 0 1 0 0 1 0 0 0 0 1 \n",
"\n",
" orange penny the \n",
"0 0 1 0 \n",
"1 1 1 0 \n",
"2 0 0 1 \n",
"3 0 0 0 \n",
"4 0 0 0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Make a new Count Vectorizer!!!!\n",
"vec = CountVectorizer()\n",
"\n",
"# Say hey vectorizer, please read our stuff\n",
"matrix = vec.fit_transform(texts)\n",
"\n",
"# And make a dataframe out of it\n",
"# (get_feature_names() was removed in scikit-learn 1.2 -- prefer the new name)\n",
"feature_names = getattr(vec, 'get_feature_names_out', vec.get_feature_names)()\n",
"results = pd.DataFrame(matrix.toarray(), columns=feature_names)\n",
"results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stop words or stopwords"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" and | \n",
" blue | \n",
" bought | \n",
" bright | \n",
" bugs | \n",
" fish | \n",
" fished | \n",
" fishes | \n",
" fishing | \n",
" hate | \n",
" orange | \n",
" penny | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" and blue bought bright bugs fish fished fishes fishing hate \\\n",
"0 0 1 1 1 0 0 0 1 0 0 \n",
"1 1 1 1 1 0 1 0 0 0 0 \n",
"2 0 0 0 0 0 2 1 0 0 0 \n",
"3 0 0 0 0 0 1 0 0 1 0 \n",
"4 0 1 0 0 1 0 0 0 0 1 \n",
"\n",
" orange penny \n",
"0 0 1 \n",
"1 1 1 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Make a new Count Vectorizer!!!!\n",
"vec = CountVectorizer(stop_words=['a', 'the'])\n",
"\n",
"# Say hey vectorizer, please read our stuff\n",
"matrix = vec.fit_transform(texts)\n",
"\n",
"# And make a dataframe out of it\n",
"# (get_feature_names() was removed in scikit-learn 1.2 -- prefer the new name)\n",
"feature_names = getattr(vec, 'get_feature_names_out', vec.get_feature_names)()\n",
"results = pd.DataFrame(matrix.toarray(), columns=feature_names)\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ate | \n",
" blue | \n",
" bought | \n",
" bright | \n",
" bug | \n",
" bugs | \n",
" fish | \n",
" fished | \n",
" fishes | \n",
" fishing | \n",
" hate | \n",
" orange | \n",
" penny | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ate blue bought bright bug bugs fish fished fishes fishing hate \\\n",
"0 0 1 1 1 0 0 0 0 1 0 0 \n",
"1 0 1 1 1 0 0 1 0 0 0 0 \n",
"2 0 0 0 0 0 0 2 1 0 0 0 \n",
"3 0 0 0 0 0 0 1 0 0 1 0 \n",
"4 0 1 0 0 0 1 0 0 0 0 1 \n",
"5 1 1 0 0 1 0 1 0 0 0 0 \n",
"\n",
" orange penny \n",
"0 0 1 \n",
"1 1 1 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"5 0 0 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Make a new Count Vectorizer!!!!\n",
"vec = CountVectorizer(stop_words='english')\n",
"\n",
"# Say hey vectorizer, please read our stuff\n",
"matrix = vec.fit_transform(texts)\n",
"\n",
"# And make a dataframe out of it\n",
"# (get_feature_names() was removed in scikit-learn 1.2 -- prefer the new name)\n",
"feature_names = getattr(vec, 'get_feature_names_out', vec.get_feature_names)()\n",
"results = pd.DataFrame(matrix.toarray(), columns=feature_names)\n",
"results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stemming"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from nltk.stem.porter import PorterStemmer\n",
"porter_stemmer = PorterStemmer()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'fish'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"porter_stemmer.stem(\"fish\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'fish'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"porter_stemmer.stem(\"fishes\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'fish'"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"porter_stemmer.stem(\"fishing\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'fishingsaloo'"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"porter_stemmer.stem(\"fishingsaloos\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from textblob import TextBlob\n",
"\n",
"# Use TextBlob's tokenizer, then Porter-stem each token\n",
"def textblob_tokenizer(str_input):\n",
"    \"\"\"Lowercase the input, tokenize it with TextBlob, and stem every token.\"\"\"\n",
"    blob = TextBlob(str_input.lower())\n",
"    tokens = blob.words\n",
"    words = [token.stem() for token in tokens]\n",
"    return words\n",
"\n",
"# Use NLTK's PorterStemmer on a simple regex-based split\n",
"def stemming_tokenizer(str_input):\n",
"    \"\"\"Keep letters/digits/hyphens, lowercase, whitespace-split, Porter-stem.\"\"\"\n",
"    words = re.sub(r\"[^A-Za-z0-9\\-]\", \" \", str_input).lower().split()\n",
"    words = [porter_stemmer.stem(word) for word in words]\n",
"    return words"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['i', 'went', 'fish', 'to', 'get', 'fish']"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stemming_tokenizer(\"I went fishing to get fishes\")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['i', 'went', 'fish', 'to', 'get', 'fish']"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"textblob_tokenizer(\"I went fishing to get fishes\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using our tokenizer that also stems"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 'm | \n",
" ate | \n",
" blue | \n",
" bought | \n",
" bright | \n",
" bug | \n",
" fish | \n",
" hate | \n",
" orang | \n",
" penni | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 'm ate blue bought bright bug fish hate orang penni\n",
"0 0 0 1 1 1 0 1 0 0 1\n",
"1 0 0 1 1 1 0 1 0 1 1\n",
"2 0 0 0 0 0 0 3 0 0 0\n",
"3 1 0 0 0 0 0 2 0 0 0\n",
"4 0 0 1 0 0 1 0 1 0 0\n",
"5 0 1 1 0 0 1 1 0 0 0"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Make a new Count Vectorizer!!!!\n",
"vec = CountVectorizer(stop_words='english', tokenizer=textblob_tokenizer)\n",
"\n",
"# Say hey vectorizer, please read our stuff\n",
"matrix = vec.fit_transform(texts)\n",
"\n",
"# And make a dataframe out of it\n",
"# (get_feature_names() was removed in scikit-learn 1.2 -- prefer the new name)\n",
"feature_names = getattr(vec, 'get_feature_names_out', vec.get_feature_names)()\n",
"results = pd.DataFrame(matrix.toarray(), columns=feature_names)\n",
"results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TF-IDF"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 'm | \n",
" ate | \n",
" blue | \n",
" bought | \n",
" bright | \n",
" bug | \n",
" fish | \n",
" hate | \n",
" orang | \n",
" penni | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.200000 | \n",
" 0.200000 | \n",
" 0.200000 | \n",
" 0.000000 | \n",
" 0.200000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.200000 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.166667 | \n",
" 0.166667 | \n",
" 0.166667 | \n",
" 0.000000 | \n",
" 0.166667 | \n",
" 0.000000 | \n",
" 0.166667 | \n",
" 0.166667 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.333333 | \n",
" 0.00 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.666667 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.333333 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.333333 | \n",
" 0.000000 | \n",
" 0.333333 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.000000 | \n",
" 0.25 | \n",
" 0.250000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.250000 | \n",
" 0.250000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 6 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 'm ate blue bought bright bug fish hate \\\n",
"0 0.000000 0.00 0.200000 0.200000 0.200000 0.000000 0.200000 0.000000 \n",
"1 0.000000 0.00 0.166667 0.166667 0.166667 0.000000 0.166667 0.000000 \n",
"2 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 \n",
"3 0.333333 0.00 0.000000 0.000000 0.000000 0.000000 0.666667 0.000000 \n",
"4 0.000000 0.00 0.333333 0.000000 0.000000 0.333333 0.000000 0.333333 \n",
"5 0.000000 0.25 0.250000 0.000000 0.000000 0.250000 0.250000 0.000000 \n",
"6 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 \n",
"\n",
" orang penni \n",
"0 0.000000 0.200000 \n",
"1 0.166667 0.166667 \n",
"2 0.000000 0.000000 \n",
"3 0.000000 0.000000 \n",
"4 0.000000 0.000000 \n",
"5 0.000000 0.000000 \n",
"6 0.000000 0.000000 "
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# Make a new Tfidf Vectorizer!!!!\n",
"vec = TfidfVectorizer(stop_words='english', \n",
"                      tokenizer=textblob_tokenizer,\n",
"                      use_idf=False,\n",
"                      norm='l1') # L - ONE\n",
"\n",
"# Say hey vectorizer, please read our stuff\n",
"matrix = vec.fit_transform(texts)\n",
"\n",
"# And make a dataframe out of it\n",
"# (get_feature_names() was removed in scikit-learn 1.2 -- prefer the new name)\n",
"feature_names = getattr(vec, 'get_feature_names_out', vec.get_feature_names)()\n",
"results = pd.DataFrame(matrix.toarray(), columns=feature_names)\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 'm | \n",
" ate | \n",
" blue | \n",
" bought | \n",
" bright | \n",
" bug | \n",
" fish | \n",
" hate | \n",
" orang | \n",
" penni | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 6 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.333333 | \n",
" 0.00 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.666667 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.000000 | \n",
" 0.25 | \n",
" 0.250000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.250000 | \n",
" 0.250000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 0 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.200000 | \n",
" 0.200000 | \n",
" 0.200000 | \n",
" 0.000000 | \n",
" 0.200000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.200000 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.166667 | \n",
" 0.166667 | \n",
" 0.166667 | \n",
" 0.000000 | \n",
" 0.166667 | \n",
" 0.000000 | \n",
" 0.166667 | \n",
" 0.166667 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.000000 | \n",
" 0.00 | \n",
" 0.333333 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.333333 | \n",
" 0.000000 | \n",
" 0.333333 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 'm ate blue bought bright bug fish hate \\\n",
"2 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 \n",
"6 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 \n",
"3 0.333333 0.00 0.000000 0.000000 0.000000 0.000000 0.666667 0.000000 \n",
"5 0.000000 0.25 0.250000 0.000000 0.000000 0.250000 0.250000 0.000000 \n",
"0 0.000000 0.00 0.200000 0.200000 0.200000 0.000000 0.200000 0.000000 \n",
"1 0.000000 0.00 0.166667 0.166667 0.166667 0.000000 0.166667 0.000000 \n",
"4 0.000000 0.00 0.333333 0.000000 0.000000 0.333333 0.000000 0.333333 \n",
"\n",
" orang penni \n",
"2 0.000000 0.000000 \n",
"6 0.000000 0.000000 \n",
"3 0.000000 0.000000 \n",
"5 0.000000 0.000000 \n",
"0 0.000000 0.200000 \n",
"1 0.166667 0.166667 \n",
"4 0.000000 0.000000 "
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results.sort_values(by='fish', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Only counting certain words"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" fish | \n",
" bug | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.5 | \n",
" 0.5 | \n",
"
\n",
" \n",
" 6 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fish bug\n",
"0 1.0 0.0\n",
"1 1.0 0.0\n",
"2 1.0 0.0\n",
"3 1.0 0.0\n",
"4 0.0 1.0\n",
"5 0.5 0.5\n",
"6 1.0 0.0"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# Make a new Tfidf Vectorizer!!!!\n",
"vec = TfidfVectorizer(stop_words='english', \n",
"                      vocabulary=['fish', 'bug'],\n",
"                      tokenizer=textblob_tokenizer,\n",
"                      use_idf=False,\n",
"                      norm='l1') # L - ONE\n",
"\n",
"# Say hey vectorizer, please read our stuff\n",
"matrix = vec.fit_transform(texts)\n",
"\n",
"# And make a dataframe out of it\n",
"# (get_feature_names() was removed in scikit-learn 1.2 -- prefer the new name)\n",
"feature_names = getattr(vec, 'get_feature_names_out', vec.get_feature_names)()\n",
"results = pd.DataFrame(matrix.toarray(), columns=feature_names)\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" fish | \n",
" bug | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fish bug\n",
"0 1 0\n",
"1 1 0\n",
"2 3 0\n",
"3 2 0\n",
"4 0 1\n",
"5 1 1\n",
"6 1 0"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Make a new Count Vectorizer!!!!\n",
"vec = CountVectorizer(stop_words='english', \n",
"                      vocabulary=['fish', 'bug'],\n",
"                      tokenizer=textblob_tokenizer)\n",
"\n",
"# Say hey vectorizer, please read our stuff\n",
"matrix = vec.fit_transform(texts)\n",
"\n",
"# And make a dataframe out of it\n",
"# (get_feature_names() was removed in scikit-learn 1.2 -- prefer the new name)\n",
"feature_names = getattr(vec, 'get_feature_names_out', vec.get_feature_names)()\n",
"results = pd.DataFrame(matrix.toarray(), columns=feature_names)\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}