Skip to content

Commit 1dc8101

Browse files
author
codebasics
committed
ner
1 parent 80eb297 commit 1dc8101

File tree

1 file changed

+328
-0
lines changed

1 file changed

+328
-0
lines changed

8_NER/nlp_tutorial_NER.ipynb

Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"<h2 align='center'>NLP Tutorial: Named Entity Recognition (NER)</h2>"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import spacy"
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": 3,
22+
"metadata": {},
23+
"outputs": [
24+
{
25+
"data": {
26+
"text/plain": [
27+
"['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']"
28+
]
29+
},
30+
"execution_count": 3,
31+
"metadata": {},
32+
"output_type": "execute_result"
33+
}
34+
],
35+
"source": [
36+
"nlp = spacy.load(\"en_core_web_sm\")\n",
37+
"nlp.pipe_names"
38+
]
39+
},
40+
{
41+
"cell_type": "code",
42+
"execution_count": 13,
43+
"metadata": {},
44+
"outputs": [
45+
{
46+
"name": "stdout",
47+
"output_type": "stream",
48+
"text": [
49+
"Tesla Inc | ORG | Companies, agencies, institutions, etc.\n",
50+
"$45 billion | MONEY | Monetary values, including unit\n"
51+
]
52+
}
53+
],
54+
"source": [
55+
"doc = nlp(\"Tesla Inc is going to acquire twitter for $45 billion\")\n",
56+
"for ent in doc.ents:\n",
57+
" print(ent.text, \" | \", ent.label_, \" | \", spacy.explain(ent.label_))"
58+
]
59+
},
60+
{
61+
"cell_type": "code",
62+
"execution_count": 11,
63+
"metadata": {
64+
"scrolled": true
65+
},
66+
"outputs": [
67+
{
68+
"data": {
69+
"text/html": [
70+
"<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n",
71+
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
72+
" Tesla Inc\n",
73+
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
74+
"</mark>\n",
75+
" is going to acquire twitter for \n",
76+
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
77+
" $45 billion\n",
78+
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">MONEY</span>\n",
79+
"</mark>\n",
80+
"</div></span>"
81+
],
82+
"text/plain": [
83+
"<IPython.core.display.HTML object>"
84+
]
85+
},
86+
"metadata": {},
87+
"output_type": "display_data"
88+
}
89+
],
90+
"source": [
91+
"from spacy import displacy\n",
92+
"\n",
93+
"displacy.render(doc, style=\"ent\")"
94+
]
95+
},
96+
{
97+
"cell_type": "markdown",
98+
"metadata": {},
99+
"source": [
100+
"<h3>List all supported entity labels</h3>"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": 14,
106+
"metadata": {
107+
"scrolled": true
108+
},
109+
"outputs": [
110+
{
111+
"data": {
112+
"text/plain": [
113+
"['CARDINAL',\n",
114+
" 'DATE',\n",
115+
" 'EVENT',\n",
116+
" 'FAC',\n",
117+
" 'GPE',\n",
118+
" 'LANGUAGE',\n",
119+
" 'LAW',\n",
120+
" 'LOC',\n",
121+
" 'MONEY',\n",
122+
" 'NORP',\n",
123+
" 'ORDINAL',\n",
124+
" 'ORG',\n",
125+
" 'PERCENT',\n",
126+
" 'PERSON',\n",
127+
" 'PRODUCT',\n",
128+
" 'QUANTITY',\n",
129+
" 'TIME',\n",
130+
" 'WORK_OF_ART']"
131+
]
132+
},
133+
"execution_count": 14,
134+
"metadata": {},
135+
"output_type": "execute_result"
136+
}
137+
],
138+
"source": [
139+
"nlp.pipe_labels['ner']"
140+
]
141+
},
142+
{
143+
"cell_type": "markdown",
144+
"metadata": {},
145+
"source": [
146+
"The list of entity labels is also documented on this page: https://spacy.io/models/en"
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": 34,
152+
"metadata": {},
153+
"outputs": [
154+
{
155+
"name": "stdout",
156+
"output_type": "stream",
157+
"text": [
158+
"Michael Bloomberg | PERSON | People, including fictional\n",
159+
"Bloomberg | GPE | Countries, cities, states\n",
160+
"1982 | DATE | Absolute or relative dates or periods\n"
161+
]
162+
}
163+
],
164+
"source": [
165+
"doc = nlp(\"Michael Bloomberg founded Bloomberg in 1982\")\n",
166+
"for ent in doc.ents:\n",
167+
" print(ent.text, \"|\", ent.label_, \"|\", spacy.explain(ent.label_))"
168+
]
169+
},
170+
{
171+
"cell_type": "markdown",
172+
"metadata": {},
173+
"source": [
174+
"Above, spaCy made a mistake: it labeled Bloomberg the company as GPE instead of ORG. Let's try a Hugging Face model on the same sentence.\n",
175+
"\n",
176+
"https://huggingface.co/dslim/bert-base-NER?text=Michael+Bloomberg+founded+Bloomberg+in+1982\n",
177+
"\n",
178+
"Also go through the 3 sample examples for NER on that page."
179+
]
180+
},
181+
{
182+
"cell_type": "code",
183+
"execution_count": 36,
184+
"metadata": {},
185+
"outputs": [
186+
{
187+
"name": "stdout",
188+
"output_type": "stream",
189+
"text": [
190+
"Tesla Inc | ORG | 0 | 9\n",
191+
"Twitter Inc | ORG | 30 | 41\n",
192+
"$45 billion | MONEY | 46 | 57\n"
193+
]
194+
}
195+
],
196+
"source": [
197+
"doc = nlp(\"Tesla Inc is going to acquire Twitter Inc for $45 billion\")\n",
198+
"for ent in doc.ents:\n",
199+
" print(ent.text, \" | \", ent.label_, \" | \", ent.start_char, \"|\", ent.end_char)"
200+
]
201+
},
202+
{
203+
"cell_type": "markdown",
204+
"metadata": {},
205+
"source": [
206+
"<h3>Setting custom entities</h3>"
207+
]
208+
},
209+
{
210+
"cell_type": "code",
211+
"execution_count": 43,
212+
"metadata": {},
213+
"outputs": [
214+
{
215+
"name": "stdout",
216+
"output_type": "stream",
217+
"text": [
218+
"Twitter | PRODUCT\n",
219+
"$45 billion | MONEY\n"
220+
]
221+
}
222+
],
223+
"source": [
224+
"doc = nlp(\"Tesla is going to acquire Twitter for $45 billion\")\n",
225+
"for ent in doc.ents:\n",
226+
" print(ent.text, \" | \", ent.label_)"
227+
]
228+
},
229+
{
230+
"cell_type": "code",
231+
"execution_count": 44,
232+
"metadata": {},
233+
"outputs": [
234+
{
235+
"data": {
236+
"text/plain": [
237+
"going to acquire"
238+
]
239+
},
240+
"execution_count": 44,
241+
"metadata": {},
242+
"output_type": "execute_result"
243+
}
244+
],
245+
"source": [
246+
"s = doc[2:5]\n",
247+
"s"
248+
]
249+
},
250+
{
251+
"cell_type": "code",
252+
"execution_count": 45,
253+
"metadata": {},
254+
"outputs": [
255+
{
256+
"data": {
257+
"text/plain": [
258+
"spacy.tokens.span.Span"
259+
]
260+
},
261+
"execution_count": 45,
262+
"metadata": {},
263+
"output_type": "execute_result"
264+
}
265+
],
266+
"source": [
267+
"type(s)"
268+
]
269+
},
270+
{
271+
"cell_type": "code",
272+
"execution_count": 46,
273+
"metadata": {},
274+
"outputs": [],
275+
"source": [
276+
"from spacy.tokens import Span\n",
277+
"\n",
278+
"s1 = Span(doc, 0, 1, label=\"ORG\")\n",
279+
"s2 = Span(doc, 5, 6, label=\"ORG\")\n",
280+
"\n",
281+
"doc.set_ents([s1, s2], default=\"unmodified\")"
282+
]
283+
},
284+
{
285+
"cell_type": "code",
286+
"execution_count": 47,
287+
"metadata": {
288+
"scrolled": true
289+
},
290+
"outputs": [
291+
{
292+
"name": "stdout",
293+
"output_type": "stream",
294+
"text": [
295+
"Tesla | ORG\n",
296+
"Twitter | ORG\n",
297+
"$45 billion | MONEY\n"
298+
]
299+
}
300+
],
301+
"source": [
302+
"for ent in doc.ents:\n",
303+
" print(ent.text, \" | \", ent.label_)"
304+
]
305+
}
306+
],
307+
"metadata": {
308+
"kernelspec": {
309+
"display_name": "Python 3",
310+
"language": "python",
311+
"name": "python3"
312+
},
313+
"language_info": {
314+
"codemirror_mode": {
315+
"name": "ipython",
316+
"version": 3
317+
},
318+
"file_extension": ".py",
319+
"mimetype": "text/x-python",
320+
"name": "python",
321+
"nbconvert_exporter": "python",
322+
"pygments_lexer": "ipython3",
323+
"version": "3.8.5"
324+
}
325+
},
326+
"nbformat": 4,
327+
"nbformat_minor": 4
328+
}

0 commit comments

Comments
 (0)