word vectors gensim overview

codebasics · codebasics · commit 07e6028df2b2 · 2022-09-21T15:08:15.000-04:00
diff --git a/15_word_vectors_gensim_overview/nlp_word_vectors_gensim_overview.ipynb b/15_word_vectors_gensim_overview/nlp_word_vectors_gensim_overview.ipynb
@@ -0,0 +1,356 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "33252ddf",
+   "metadata": {},
+   "source": [
+    "<h3>NLP Tutorial: Word Vectors Overview Using the Gensim Library</h3>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "18a0003a",
+   "metadata": {},
+   "source": [
+    "All pretrained models available through the gensim downloader are listed on this page: https://github.com/RaRe-Technologies/gensim-data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "id": "82646d75",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[==================================================] 100.0% 1662.8/1662.8MB downloaded\n"
+     ]
+    }
+   ],
+   "source": [
+    "import gensim.downloader as api\n",
+    "# This is a huge model (~1.6 GB): the first call downloads it, and loading it will take some time\n",
+    "\n",
+    "wv = api.load('word2vec-google-news-300')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "id": "123d8ee5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.729151"
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.similarity(w1=\"great\", w2=\"good\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "id": "74496bc7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('great', 0.7291510105133057),\n",
+       " ('bad', 0.7190051078796387),\n",
+       " ('terrific', 0.6889115571975708),\n",
+       " ('decent', 0.6837348341941833),\n",
+       " ('nice', 0.6836092472076416),\n",
+       " ('excellent', 0.644292950630188),\n",
+       " ('fantastic', 0.6407778263092041),\n",
+       " ('better', 0.6120728850364685),\n",
+       " ('solid', 0.5806034803390503),\n",
+       " ('lousy', 0.576420247554779)]"
+      ]
+     },
+     "execution_count": 78,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(\"good\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "id": "e692b6f0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('dogs', 0.8680489659309387),\n",
+       " ('puppy', 0.8106428384780884),\n",
+       " ('pit_bull', 0.780396044254303),\n",
+       " ('pooch', 0.7627376914024353),\n",
+       " ('cat', 0.7609457969665527),\n",
+       " ('golden_retriever', 0.7500901818275452),\n",
+       " ('German_shepherd', 0.7465174198150635),\n",
+       " ('Rottweiler', 0.7437615394592285),\n",
+       " ('beagle', 0.7418621778488159),\n",
+       " ('pup', 0.740691065788269)]"
+      ]
+     },
+     "execution_count": 79,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(\"dog\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "1ec84cbe",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('queen', 0.7118193507194519),\n",
+       " ('monarch', 0.6189674139022827),\n",
+       " ('princess', 0.5902431011199951),\n",
+       " ('crown_prince', 0.5499460697174072),\n",
+       " ('prince', 0.5377321839332581)]"
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "id": "c6b2589c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('germany', 0.5094343423843384),\n",
+       " ('european', 0.48650455474853516),\n",
+       " ('german', 0.4714890420436859),\n",
+       " ('austria', 0.46964022517204285),\n",
+       " ('swedish', 0.4645182490348816)]"
+      ]
+     },
+     "execution_count": 83,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.most_similar(positive=['france', 'berlin'], negative=['paris'], topn=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "id": "4b60309d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'cat'"
+      ]
+     },
+     "execution_count": 117,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.doesnt_match([\"facebook\", \"cat\", \"google\", \"microsoft\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 118,
+   "id": "a8abe1a2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'google'"
+      ]
+     },
+     "execution_count": 118,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "wv.doesnt_match([\"dog\", \"cat\", \"google\", \"mouse\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51df9555",
+   "metadata": {},
+   "source": [
+    "<h3>Gensim: GloVe</h3>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "46540508",
+   "metadata": {},
+   "source": [
+    "Stanford's page on GloVe: https://nlp.stanford.edu/projects/glove/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "id": "d31b2bca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "glv = api.load(\"glove-twitter-25\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "id": "cbf5e30d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('too', 0.9648017287254333),\n",
+       " ('day', 0.9533665180206299),\n",
+       " ('well', 0.9503170847892761),\n",
+       " ('nice', 0.9438973665237427),\n",
+       " ('better', 0.9425962567329407),\n",
+       " ('fun', 0.9418926239013672),\n",
+       " ('much', 0.9413353800773621),\n",
+       " ('this', 0.9387555122375488),\n",
+       " ('hope', 0.9383506774902344),\n",
+       " ('great', 0.9378516674041748)]"
+      ]
+     },
+     "execution_count": 97,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "glv.most_similar(\"good\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "id": "1b47c704",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'cereal'"
+      ]
+     },
+     "execution_count": 101,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "glv.doesnt_match(\"breakfast cereal dinner lunch\".split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 108,
+   "id": "99e10b38",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'cat'"
+      ]
+     },
+     "execution_count": 108,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "glv.doesnt_match(\"facebook cat google microsoft\".split())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "id": "0ce3f7c1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'human'"
+      ]
+     },
+     "execution_count": 111,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "glv.doesnt_match(\"banana grapes orange human\".split())"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}