{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# A little more about counting and stemming\n", "\n", "Believe it or not, beyond just stemming there are **multiple ways to count words!**\n", "\n", "And what words do you count? Originals? Lowercased? Stemmed? Lemmatized?\n", "\n", "There are so many options!" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "import re" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# # Give me a THING that will count words for me!!!!!\n", "# vec = CountVectorizer()\n", "# # I have some sentences, please count the words in them\n", "# matrix = vec.fit_transform(phrases)\n", "# # And turn it into a dataframe\n", "# docs = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# We'll use this later!\n", "# def stemming_tokenizer(str_input):\n", "# words = re.sub(r\"[^A-Za-z0-9\\-]\", \" \", str_input).lower().split()\n", "# words = [porter_stemmer.stem(word) for word in words]\n", "# return words" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": true }, "outputs": [], "source": [ "texts = [\n", " \"Penny bought bright blue fishes.\",\n", " \"Penny bought a bright blue and orange fish.\",\n", " \"The fish fished fish.\",\n", " \"I'm fishing fish.\",\n", " \"I hate blue bugs\",\n", " \"A blue bug ate a fish\",\n", " \"fish\"\n", "]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Make a new Count Vectorizer!!!!\n", "vec = CountVectorizer()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],\n", " [1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0],\n", " [0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1],\n", " [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],\n", " [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0]], dtype=int64)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Say hey vectorizer, please read our stuff\n", "matrix = vec.fit_transform(texts)\n", "matrix.toarray()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['and',\n", " 'blue',\n", " 'bought',\n", " 'bright',\n", " 'bugs',\n", " 'fish',\n", " 'fished',\n", " 'fishes',\n", " 'fishing',\n", " 'hate',\n", " 'orange',\n", " 'penny',\n", " 'the']" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vec.get_feature_names()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
andblueboughtbrightbugsfishfishedfishesfishinghateorangepennythe
00111000100010
11111010000110
20000021000001
30000010010000
40100100001000
\n", "
" ], "text/plain": [ " and blue bought bright bugs fish fished fishes fishing hate \\\n", "0 0 1 1 1 0 0 0 1 0 0 \n", "1 1 1 1 1 0 1 0 0 0 0 \n", "2 0 0 0 0 0 2 1 0 0 0 \n", "3 0 0 0 0 0 1 0 0 1 0 \n", "4 0 1 0 0 1 0 0 0 0 1 \n", "\n", " orange penny the \n", "0 0 1 0 \n", "1 1 1 0 \n", "2 0 0 1 \n", "3 0 0 0 \n", "4 0 0 0 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())\n", "results" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
andblueboughtbrightbugsfishfishedfishesfishinghateorangepennythe
00111000100010
11111010000110
20000021000001
30000010010000
40100100001000
\n", "
" ], "text/plain": [ " and blue bought bright bugs fish fished fishes fishing hate \\\n", "0 0 1 1 1 0 0 0 1 0 0 \n", "1 1 1 1 1 0 1 0 0 0 0 \n", "2 0 0 0 0 0 2 1 0 0 0 \n", "3 0 0 0 0 0 1 0 0 1 0 \n", "4 0 1 0 0 1 0 0 0 0 1 \n", "\n", " orange penny the \n", "0 0 1 0 \n", "1 1 1 0 \n", "2 0 0 1 \n", "3 0 0 0 \n", "4 0 0 0 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a new Count Vectorizer!!!!\n", "vec = CountVectorizer()\n", "\n", "# Say hey vectorizer, please read our stuff\n", "matrix = vec.fit_transform(texts)\n", "\n", "# And make a dataframe out of it\n", "results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())\n", "results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stop words or stopwords" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
andblueboughtbrightbugsfishfishedfishesfishinghateorangepenny
0011100010001
1111101000011
2000002100000
3000001001000
4010010000100
\n", "
" ], "text/plain": [ " and blue bought bright bugs fish fished fishes fishing hate \\\n", "0 0 1 1 1 0 0 0 1 0 0 \n", "1 1 1 1 1 0 1 0 0 0 0 \n", "2 0 0 0 0 0 2 1 0 0 0 \n", "3 0 0 0 0 0 1 0 0 1 0 \n", "4 0 1 0 0 1 0 0 0 0 1 \n", "\n", " orange penny \n", "0 0 1 \n", "1 1 1 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a new Count Vectorizer!!!!\n", "vec = CountVectorizer(stop_words=['a', 'the'])\n", "\n", "# Say hey vectorizer, please read our stuff\n", "matrix = vec.fit_transform(texts)\n", "\n", "# And make a dataframe out of it\n", "results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())\n", "results" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ateblueboughtbrightbugbugsfishfishedfishesfishinghateorangepenny
00111000010001
10111001000011
20000002100000
30000001001000
40100010000100
51100101000000
\n", "
" ], "text/plain": [ " ate blue bought bright bug bugs fish fished fishes fishing hate \\\n", "0 0 1 1 1 0 0 0 0 1 0 0 \n", "1 0 1 1 1 0 0 1 0 0 0 0 \n", "2 0 0 0 0 0 0 2 1 0 0 0 \n", "3 0 0 0 0 0 0 1 0 0 1 0 \n", "4 0 1 0 0 0 1 0 0 0 0 1 \n", "5 1 1 0 0 1 0 1 0 0 0 0 \n", "\n", " orange penny \n", "0 0 1 \n", "1 1 1 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "5 0 0 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a new Count Vectorizer!!!!\n", "vec = CountVectorizer(stop_words='english')\n", "\n", "# Say hey vectorizer, please read our stuff\n", "matrix = vec.fit_transform(texts)\n", "\n", "# And make a dataframe out of it\n", "results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())\n", "results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Stemming" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from nltk.stem.porter import PorterStemmer\n", "porter_stemmer = PorterStemmer()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fish'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "porter_stemmer.stem(\"fish\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fish'" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "porter_stemmer.stem(\"fishes\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fish'" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "porter_stemmer.stem(\"fishing\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fishingsaloo'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "porter_stemmer.stem(\"fishingsaloos\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from textblob import TextBlob\n", "\n", "# Use TextBlob\n", "def textblob_tokenizer(str_input):\n", " blob = TextBlob(str_input.lower())\n", " tokens = blob.words\n", " words = [token.stem() for token in tokens]\n", " return words\n", "\n", "# Use NLTK's PorterStemmer\n", "def stemming_tokenizer(str_input):\n", " words = re.sub(r\"[^A-Za-z0-9\\-]\", \" \", str_input).lower().split()\n", " words = [porter_stemmer.stem(word) for word in words]\n", " return words" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['i', 'went', 'fish', 'to', 'get', 'fish']" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stemming_tokenizer(\"I went fishing to get fishes\")" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['i', 'went', 'fish', 'to', 'get', 'fish']" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "textblob_tokenizer(\"I went fishing to get fishes\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using our tokenizer that also stems" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
'mateblueboughtbrightbugfishhateorangpenni
00011101001
10011101011
20000003000
31000002000
40010010100
50110011000
\n", "
" ], "text/plain": [ " 'm ate blue bought bright bug fish hate orang penni\n", "0 0 0 1 1 1 0 1 0 0 1\n", "1 0 0 1 1 1 0 1 0 1 1\n", "2 0 0 0 0 0 0 3 0 0 0\n", "3 1 0 0 0 0 0 2 0 0 0\n", "4 0 0 1 0 0 1 0 1 0 0\n", "5 0 1 1 0 0 1 1 0 0 0" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a new Count Vectorizer!!!!\n", "vec = CountVectorizer(stop_words='english', tokenizer=textblob_tokenizer)\n", "\n", "# Say hey vectorizer, please read our stuff\n", "matrix = vec.fit_transform(texts)\n", "\n", "# And make a dataframe out of it\n", "results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())\n", "results" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TF-IDF" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
'mateblueboughtbrightbugfishhateorangpenni
00.0000000.000.2000000.2000000.2000000.0000000.2000000.0000000.0000000.200000
10.0000000.000.1666670.1666670.1666670.0000000.1666670.0000000.1666670.166667
20.0000000.000.0000000.0000000.0000000.0000001.0000000.0000000.0000000.000000
30.3333330.000.0000000.0000000.0000000.0000000.6666670.0000000.0000000.000000
40.0000000.000.3333330.0000000.0000000.3333330.0000000.3333330.0000000.000000
50.0000000.250.2500000.0000000.0000000.2500000.2500000.0000000.0000000.000000
60.0000000.000.0000000.0000000.0000000.0000001.0000000.0000000.0000000.000000
\n", "
" ], "text/plain": [ " 'm ate blue bought bright bug fish hate \\\n", "0 0.000000 0.00 0.200000 0.200000 0.200000 0.000000 0.200000 0.000000 \n", "1 0.000000 0.00 0.166667 0.166667 0.166667 0.000000 0.166667 0.000000 \n", "2 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 \n", "3 0.333333 0.00 0.000000 0.000000 0.000000 0.000000 0.666667 0.000000 \n", "4 0.000000 0.00 0.333333 0.000000 0.000000 0.333333 0.000000 0.333333 \n", "5 0.000000 0.25 0.250000 0.000000 0.000000 0.250000 0.250000 0.000000 \n", "6 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 \n", "\n", " orang penni \n", "0 0.000000 0.200000 \n", "1 0.166667 0.166667 \n", "2 0.000000 0.000000 \n", "3 0.000000 0.000000 \n", "4 0.000000 0.000000 \n", "5 0.000000 0.000000 \n", "6 0.000000 0.000000 " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# Make a new Tfidf Vectorizer!!!!\n", "vec = TfidfVectorizer(stop_words='english', \n", " tokenizer=textblob_tokenizer,\n", " use_idf=False,\n", " norm='l1') # L - ONE\n", "\n", "# Say hey vectorizer, please read our stuff\n", "matrix = vec.fit_transform(texts)\n", "\n", "# And make a dataframe out of it\n", "results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())\n", "results" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
'mateblueboughtbrightbugfishhateorangpenni
20.0000000.000.0000000.0000000.0000000.0000001.0000000.0000000.0000000.000000
60.0000000.000.0000000.0000000.0000000.0000001.0000000.0000000.0000000.000000
30.3333330.000.0000000.0000000.0000000.0000000.6666670.0000000.0000000.000000
50.0000000.250.2500000.0000000.0000000.2500000.2500000.0000000.0000000.000000
00.0000000.000.2000000.2000000.2000000.0000000.2000000.0000000.0000000.200000
10.0000000.000.1666670.1666670.1666670.0000000.1666670.0000000.1666670.166667
40.0000000.000.3333330.0000000.0000000.3333330.0000000.3333330.0000000.000000
\n", "
" ], "text/plain": [ " 'm ate blue bought bright bug fish hate \\\n", "2 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 \n", "6 0.000000 0.00 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 \n", "3 0.333333 0.00 0.000000 0.000000 0.000000 0.000000 0.666667 0.000000 \n", "5 0.000000 0.25 0.250000 0.000000 0.000000 0.250000 0.250000 0.000000 \n", "0 0.000000 0.00 0.200000 0.200000 0.200000 0.000000 0.200000 0.000000 \n", "1 0.000000 0.00 0.166667 0.166667 0.166667 0.000000 0.166667 0.000000 \n", "4 0.000000 0.00 0.333333 0.000000 0.000000 0.333333 0.000000 0.333333 \n", "\n", " orang penni \n", "2 0.000000 0.000000 \n", "6 0.000000 0.000000 \n", "3 0.000000 0.000000 \n", "5 0.000000 0.000000 \n", "0 0.000000 0.200000 \n", "1 0.166667 0.166667 \n", "4 0.000000 0.000000 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results.sort_values(by='fish', ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Only counting certain words" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fishbug
01.00.0
11.00.0
21.00.0
31.00.0
40.01.0
50.50.5
61.00.0
\n", "
" ], "text/plain": [ " fish bug\n", "0 1.0 0.0\n", "1 1.0 0.0\n", "2 1.0 0.0\n", "3 1.0 0.0\n", "4 0.0 1.0\n", "5 0.5 0.5\n", "6 1.0 0.0" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "# Make a new Tfidf Vectorizer!!!!\n", "vec = TfidfVectorizer(stop_words='english', \n", " vocabulary=['fish', 'bug'],\n", " tokenizer=textblob_tokenizer,\n", " use_idf=False,\n", " norm='l1') # L - ONE\n", "\n", "# Say hey vectorizer, please read our stuff\n", "matrix = vec.fit_transform(texts)\n", "\n", "# And make a dataframe out of it\n", "results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())\n", "results" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fishbug
010
110
230
320
401
511
610
\n", "
" ], "text/plain": [ " fish bug\n", "0 1 0\n", "1 1 0\n", "2 3 0\n", "3 2 0\n", "4 0 1\n", "5 1 1\n", "6 1 0" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Make a new Count Vectorizer!!!!\n", "vec = CountVectorizer(stop_words='english', \n", " vocabulary=['fish', 'bug'],\n", " tokenizer=textblob_tokenizer)\n", "\n", "# Say hey vectorizer, please read our stuff\n", "matrix = vec.fit_transform(texts)\n", "\n", "# And make a dataframe out of it\n", "results = pd.DataFrame(matrix.toarray(), columns=vec.get_feature_names())\n", "results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }