ner

codebasics · codebasics · commit 1dc81018318c · 2022-06-02T15:30:40.000-04:00
diff --git a/8_NER/nlp_tutorial_NER.ipynb b/8_NER/nlp_tutorial_NER.ipynb
@@ -0,0 +1,328 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2 align='center'>NLP Tutorial: Named Entity Recognition (NER)</h2>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import spacy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp = spacy.load(\"en_core_web_sm\")\n",
+    "nlp.pipe_names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.\n",
+      "$45 billion  |  MONEY  |  Monetary values, including unit\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc = nlp(\"Tesla Inc is going to acquire twitter for $45 billion\")\n",
+    "for ent in doc.ents:\n",
+    "    print(ent.text, \" | \", ent.label_, \" | \", spacy.explain(ent.label_))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n",
+       "<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
+       "    Tesla Inc\n",
+       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
+       "</mark>\n",
+       " is going to acquire twitter for \n",
+       "<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
+       "    $45 billion\n",
+       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">MONEY</span>\n",
+       "</mark>\n",
+       "</div></span>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from spacy import displacy\n",
+    "\n",
+    "displacy.render(doc, style=\"ent\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h3>List all the entity labels</h3>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['CARDINAL',\n",
+       " 'DATE',\n",
+       " 'EVENT',\n",
+       " 'FAC',\n",
+       " 'GPE',\n",
+       " 'LANGUAGE',\n",
+       " 'LAW',\n",
+       " 'LOC',\n",
+       " 'MONEY',\n",
+       " 'NORP',\n",
+       " 'ORDINAL',\n",
+       " 'ORG',\n",
+       " 'PERCENT',\n",
+       " 'PERSON',\n",
+       " 'PRODUCT',\n",
+       " 'QUANTITY',\n",
+       " 'TIME',\n",
+       " 'WORK_OF_ART']"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nlp.pipe_labels['ner']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The list of entity labels is also documented on this page: https://spacy.io/models/en"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Michael Bloomberg | PERSON | People, including fictional\n",
+      "Bloomberg | GPE | Countries, cities, states\n",
+      "1982 | DATE | Absolute or relative dates or periods\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc = nlp(\"Michael Bloomberg founded Bloomberg in 1982\")\n",
+    "for ent in doc.ents:\n",
+    "    print(ent.text, \"|\", ent.label_, \"|\", spacy.explain(ent.label_))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In the output above, spaCy made a mistake: it labeled the second Bloomberg (the company) as GPE instead of ORG. Let's try a Hugging Face model on the same sentence.\n",
+    "\n",
+    "https://huggingface.co/dslim/bert-base-NER?text=Michael+Bloomberg+founded+Bloomberg+in+1982\n",
+    "\n",
+    "On that page, also go through the 3 sample examples for NER."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tesla Inc  |  ORG  |  0 | 9\n",
+      "Twitter Inc  |  ORG  |  30 | 41\n",
+      "$45 billion  |  MONEY  |  46 | 57\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc = nlp(\"Tesla Inc is going to acquire Twitter Inc for $45 billion\")\n",
+    "for ent in doc.ents:\n",
+    "    print(ent.text, \" | \", ent.label_, \" | \", ent.start_char, \"|\", ent.end_char)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h3>Setting custom entities</h3>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Twitter  |  PRODUCT\n",
+      "$45 billion  |  MONEY\n"
+     ]
+    }
+   ],
+   "source": [
+    "doc = nlp(\"Tesla is going to acquire Twitter for $45 billion\")\n",
+    "for ent in doc.ents:\n",
+    "    print(ent.text, \" | \", ent.label_)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "going to acquire"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "s = doc[2:5]\n",
+    "s"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "spacy.tokens.span.Span"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(s)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from spacy.tokens import Span\n",
+    "\n",
+    "s1 = Span(doc, 0, 1, label=\"ORG\")\n",
+    "s2 = Span(doc, 5, 6, label=\"ORG\")\n",
+    "\n",
+    "doc.set_ents([s1, s2], default=\"unmodified\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tesla  |  ORG\n",
+      "Twitter  |  ORG\n",
+      "$45 billion  |  MONEY\n"
+     ]
+    }
+   ],
+   "source": [
+    "for ent in doc.ents:\n",
+    "    print(ent.text, \" | \", ent.label_)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}