stemming

codebasics · codebasics · commit 9949f1094393 · 2022-05-05T17:57:47.000-04:00
diff --git a/6_stemming_lematization/6_stemming_lematization.ipynb b/6_stemming_lematization/6_stemming_lematization.ipynb
@@ -0,0 +1,222 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h3>Stemming in NLTK</h3>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.stem import PorterStemmer\n",
+    "stemmer = PorterStemmer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "eating | eat\n",
+      "eats | eat\n",
+      "eat | eat\n",
+      "ate | ate\n",
+      "adjustable | adjust\n",
+      "rafting | raft\n",
+      "ability | abil\n",
+      "meeting | meet\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Inflected and derived forms to push through the Porter stemmer.\n",
+    "words = [\n",
+    "    \"eating\", \"eats\", \"eat\", \"ate\",\n",
+    "    \"adjustable\", \"rafting\", \"ability\", \"meeting\",\n",
+    "]\n",
+    "\n",
+    "# Print each word next to the stem produced for it.\n",
+    "for original, stem in zip(words, map(stemmer.stem, words)):\n",
+    "    print(original, \"|\", stem)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h3>Lemmatization in spaCy</h3>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "eating  |  eat\n",
+      "eats  |  eat\n",
+      "eat  |  eat\n",
+      "ate  |  eat\n",
+      "adjustable  |  adjustable\n",
+      "rafting  |  rafting\n",
+      "ability  |  ability\n",
+      "meeting  |  meeting\n",
+      "better  |  well\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the en_core_web_sm pipeline; token.lemma_ below is filled in by it.\n",
+    "nlp = spacy.load(\"en_core_web_sm\")\n",
+    "\n",
+    "# Same eight words as the stemming example, plus \"better\", so the two outputs can be compared.\n",
+    "doc = nlp(\"eating eats eat ate adjustable rafting ability meeting better\")\n",
+    "for token in doc:\n",
+    "    print(token, \" | \", token.lemma_)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h3>Customizing the lemmatizer</h3>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp.pipe_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bro | Brother\n",
+      ", | ,\n",
+      "you | you\n",
+      "wanna | wanna\n",
+      "go | go\n",
+      "? | ?\n",
+      "Brah | Brother\n",
+      ", | ,\n",
+      "do | do\n",
+      "n't | not\n",
+      "say | say\n",
+      "no | no\n",
+      "! | !\n",
+      "I | I\n",
+      "am | be\n",
+      "exhausted | exhaust\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Register rules on the attribute ruler so the slang tokens \"Bro\" and \"Brah\" get the lemma \"Brother\".\n",
+    "ar = nlp.get_pipe('attribute_ruler')\n",
+    "slang_patterns = [[{\"TEXT\": slang}] for slang in (\"Bro\", \"Brah\")]\n",
+    "ar.add(slang_patterns, {\"LEMMA\": \"Brother\"})\n",
+    "\n",
+    "# Re-run the pipeline on a sentence containing both slang tokens and show each lemma.\n",
+    "doc = nlp(\"Bro, you wanna go? Brah, don't say no! I am exhausted\")\n",
+    "for tok in doc:\n",
+    "    print(tok.text, \"|\", tok.lemma_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Brah"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "doc[6]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Brother'"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "doc[6].lemma_"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}