Skip to content

Commit 9949f10

Browse files
author
codebasics
committed
stemming
1 parent 8698c1e commit 9949f10

File tree

1 file changed

+222
-0
lines changed

1 file changed

+222
-0
lines changed
Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"<h3>Stemming in NLTK</h3>"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 4,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"from nltk.stem import PorterStemmer\n",
17+
"stemmer = PorterStemmer()"
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 10,
23+
"metadata": {},
24+
"outputs": [
25+
{
26+
"name": "stdout",
27+
"output_type": "stream",
28+
"text": [
29+
"eating | eat\n",
30+
"eats | eat\n",
31+
"eat | eat\n",
32+
"ate | ate\n",
33+
"adjustable | adjust\n",
34+
"rafting | raft\n",
35+
"ability | abil\n",
36+
"meeting | meet\n"
37+
]
38+
}
39+
],
40+
"source": [
41+
"words = [\"eating\", \"eats\", \"eat\", \"ate\", \"adjustable\", \"rafting\", \"ability\", \"meeting\"]\n",
42+
"\n",
43+
"for word in words:\n",
44+
" print(word, \"|\", stemmer.stem(word))"
45+
]
46+
},
47+
{
48+
"cell_type": "markdown",
49+
"metadata": {},
50+
"source": [
51+
"<h3>Lemmatization in spaCy</h3>"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 24,
57+
"metadata": {},
58+
"outputs": [],
59+
"source": [
60+
"import spacy"
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": 25,
66+
"metadata": {},
67+
"outputs": [
68+
{
69+
"name": "stdout",
70+
"output_type": "stream",
71+
"text": [
72+
"eating | eat\n",
73+
"eats | eat\n",
74+
"eat | eat\n",
75+
"ate | eat\n",
76+
"adjustable | adjustable\n",
77+
"rafting | rafting\n",
78+
"ability | ability\n",
79+
"meeting | meeting\n",
80+
"better | well\n"
81+
]
82+
}
83+
],
84+
"source": [
85+
"nlp = spacy.load(\"en_core_web_sm\")\n",
86+
"\n",
87+
"doc = nlp(\"Mando talked for 3 hours although talking isn't his thing\")\n",
88+
"doc = nlp(\"eating eats eat ate adjustable rafting ability meeting better\")\n",
89+
"for token in doc:\n",
90+
" print(token, \" | \", token.lemma_)"
91+
]
92+
},
93+
{
94+
"cell_type": "markdown",
95+
"metadata": {},
96+
"source": [
97+
"<h3>Customizing lemmatizer</h3>"
98+
]
99+
},
100+
{
101+
"cell_type": "code",
102+
"execution_count": 26,
103+
"metadata": {},
104+
"outputs": [
105+
{
106+
"data": {
107+
"text/plain": [
108+
"['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']"
109+
]
110+
},
111+
"execution_count": 26,
112+
"metadata": {},
113+
"output_type": "execute_result"
114+
}
115+
],
116+
"source": [
117+
"nlp.pipe_names"
118+
]
119+
},
120+
{
121+
"cell_type": "code",
122+
"execution_count": 29,
123+
"metadata": {
124+
"scrolled": true
125+
},
126+
"outputs": [
127+
{
128+
"name": "stdout",
129+
"output_type": "stream",
130+
"text": [
131+
"Bro | Brother\n",
132+
", | ,\n",
133+
"you | you\n",
134+
"wanna | wanna\n",
135+
"go | go\n",
136+
"? | ?\n",
137+
"Brah | Brother\n",
138+
", | ,\n",
139+
"do | do\n",
140+
"n't | not\n",
141+
"say | say\n",
142+
"no | no\n",
143+
"! | !\n",
144+
"I | I\n",
145+
"am | be\n",
146+
"exhausted | exhaust\n"
147+
]
148+
}
149+
],
150+
"source": [
151+
"ar = nlp.get_pipe('attribute_ruler')\n",
152+
"\n",
153+
"ar.add([[{\"TEXT\":\"Bro\"}],[{\"TEXT\":\"Brah\"}]],{\"LEMMA\":\"Brother\"})\n",
154+
"\n",
155+
"doc = nlp(\"Bro, you wanna go? Brah, don't say no! I am exhausted\")\n",
156+
"for token in doc:\n",
157+
" print(token.text, \"|\", token.lemma_)"
158+
]
159+
},
160+
{
161+
"cell_type": "code",
162+
"execution_count": 35,
163+
"metadata": {},
164+
"outputs": [
165+
{
166+
"data": {
167+
"text/plain": [
168+
"Brah"
169+
]
170+
},
171+
"execution_count": 35,
172+
"metadata": {},
173+
"output_type": "execute_result"
174+
}
175+
],
176+
"source": [
177+
"doc[6]"
178+
]
179+
},
180+
{
181+
"cell_type": "code",
182+
"execution_count": 36,
183+
"metadata": {},
184+
"outputs": [
185+
{
186+
"data": {
187+
"text/plain": [
188+
"'Brother'"
189+
]
190+
},
191+
"execution_count": 36,
192+
"metadata": {},
193+
"output_type": "execute_result"
194+
}
195+
],
196+
"source": [
197+
"doc[6].lemma_"
198+
]
199+
}
200+
],
201+
"metadata": {
202+
"kernelspec": {
203+
"display_name": "Python 3",
204+
"language": "python",
205+
"name": "python3"
206+
},
207+
"language_info": {
208+
"codemirror_mode": {
209+
"name": "ipython",
210+
"version": 3
211+
},
212+
"file_extension": ".py",
213+
"mimetype": "text/x-python",
214+
"name": "python",
215+
"nbconvert_exporter": "python",
216+
"pygments_lexer": "ipython3",
217+
"version": "3.8.5"
218+
}
219+
},
220+
"nbformat": 4,
221+
"nbformat_minor": 4
222+
}

0 commit comments

Comments
 (0)