Skip to content

Commit 07e6028

Browse files
author
codebasics
committed
word vectors gensim overview
1 parent 562c076 commit 07e6028

File tree

1 file changed

+356
-0
lines changed

1 file changed

+356
-0
lines changed
Lines changed: 356 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,356 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "33252ddf",
6+
"metadata": {},
7+
"source": [
8+
"<h3>NLP Tutorial: Word Vectors Overview Using Gensim Library</h3>"
9+
]
10+
},
11+
{
12+
"cell_type": "markdown",
13+
"id": "18a0003a",
14+
"metadata": {},
15+
"source": [
16+
"All pretrained models and datasets available through gensim's downloader API are listed on this page: https://github.com/RaRe-Technologies/gensim-data"
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": 76,
22+
"id": "82646d75",
23+
"metadata": {
24+
"scrolled": true
25+
},
26+
"outputs": [
27+
{
28+
"name": "stdout",
29+
"output_type": "stream",
30+
"text": [
31+
"[==================================================] 100.0% 1662.8/1662.8MB downloaded\n"
32+
]
33+
}
34+
],
35+
"source": [
36+
"import gensim.downloader as api\n",
37+
"# This is a huge model (~1.6 GB) and it will take some time to download and load\n",
38+
"\n",
39+
"wv = api.load('word2vec-google-news-300')"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": 77,
45+
"id": "123d8ee5",
46+
"metadata": {},
47+
"outputs": [
48+
{
49+
"data": {
50+
"text/plain": [
51+
"0.729151"
52+
]
53+
},
54+
"execution_count": 77,
55+
"metadata": {},
56+
"output_type": "execute_result"
57+
}
58+
],
59+
"source": [
60+
"wv.similarity(w1=\"great\", w2=\"good\")"
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": 78,
66+
"id": "74496bc7",
67+
"metadata": {},
68+
"outputs": [
69+
{
70+
"data": {
71+
"text/plain": [
72+
"[('great', 0.7291510105133057),\n",
73+
" ('bad', 0.7190051078796387),\n",
74+
" ('terrific', 0.6889115571975708),\n",
75+
" ('decent', 0.6837348341941833),\n",
76+
" ('nice', 0.6836092472076416),\n",
77+
" ('excellent', 0.644292950630188),\n",
78+
" ('fantastic', 0.6407778263092041),\n",
79+
" ('better', 0.6120728850364685),\n",
80+
" ('solid', 0.5806034803390503),\n",
81+
" ('lousy', 0.576420247554779)]"
82+
]
83+
},
84+
"execution_count": 78,
85+
"metadata": {},
86+
"output_type": "execute_result"
87+
}
88+
],
89+
"source": [
90+
"wv.most_similar(\"good\")"
91+
]
92+
},
93+
{
94+
"cell_type": "code",
95+
"execution_count": 79,
96+
"id": "e692b6f0",
97+
"metadata": {},
98+
"outputs": [
99+
{
100+
"data": {
101+
"text/plain": [
102+
"[('dogs', 0.8680489659309387),\n",
103+
" ('puppy', 0.8106428384780884),\n",
104+
" ('pit_bull', 0.780396044254303),\n",
105+
" ('pooch', 0.7627376914024353),\n",
106+
" ('cat', 0.7609457969665527),\n",
107+
" ('golden_retriever', 0.7500901818275452),\n",
108+
" ('German_shepherd', 0.7465174198150635),\n",
109+
" ('Rottweiler', 0.7437615394592285),\n",
110+
" ('beagle', 0.7418621778488159),\n",
111+
" ('pup', 0.740691065788269)]"
112+
]
113+
},
114+
"execution_count": 79,
115+
"metadata": {},
116+
"output_type": "execute_result"
117+
}
118+
],
119+
"source": [
120+
"wv.most_similar(\"dog\")"
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": 81,
126+
"id": "1ec84cbe",
127+
"metadata": {},
128+
"outputs": [
129+
{
130+
"data": {
131+
"text/plain": [
132+
"[('queen', 0.7118193507194519),\n",
133+
" ('monarch', 0.6189674139022827),\n",
134+
" ('princess', 0.5902431011199951),\n",
135+
" ('crown_prince', 0.5499460697174072),\n",
136+
" ('prince', 0.5377321839332581)]"
137+
]
138+
},
139+
"execution_count": 81,
140+
"metadata": {},
141+
"output_type": "execute_result"
142+
}
143+
],
144+
"source": [
145+
"wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)"
146+
]
147+
},
148+
{
149+
"cell_type": "code",
150+
"execution_count": 83,
151+
"id": "c6b2589c",
152+
"metadata": {},
153+
"outputs": [
154+
{
155+
"data": {
156+
"text/plain": [
157+
"[('germany', 0.5094343423843384),\n",
158+
" ('european', 0.48650455474853516),\n",
159+
" ('german', 0.4714890420436859),\n",
160+
" ('austria', 0.46964022517204285),\n",
161+
" ('swedish', 0.4645182490348816)]"
162+
]
163+
},
164+
"execution_count": 83,
165+
"metadata": {},
166+
"output_type": "execute_result"
167+
}
168+
],
169+
"source": [
170+
"wv.most_similar(positive=['france', 'berlin'], negative=['paris'], topn=5)"
171+
]
172+
},
173+
{
174+
"cell_type": "code",
175+
"execution_count": 117,
176+
"id": "4b60309d",
177+
"metadata": {},
178+
"outputs": [
179+
{
180+
"data": {
181+
"text/plain": [
182+
"'cat'"
183+
]
184+
},
185+
"execution_count": 117,
186+
"metadata": {},
187+
"output_type": "execute_result"
188+
}
189+
],
190+
"source": [
191+
"wv.doesnt_match([\"facebook\", \"cat\", \"google\", \"microsoft\"])"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": 118,
197+
"id": "a8abe1a2",
198+
"metadata": {},
199+
"outputs": [
200+
{
201+
"data": {
202+
"text/plain": [
203+
"'google'"
204+
]
205+
},
206+
"execution_count": 118,
207+
"metadata": {},
208+
"output_type": "execute_result"
209+
}
210+
],
211+
"source": [
212+
"wv.doesnt_match([\"dog\", \"cat\", \"google\", \"mouse\"])"
213+
]
214+
},
215+
{
216+
"cell_type": "markdown",
217+
"id": "51df9555",
218+
"metadata": {},
219+
"source": [
220+
"<h3>Gensim: GloVe</h3>"
221+
]
222+
},
223+
{
224+
"cell_type": "markdown",
225+
"id": "46540508",
226+
"metadata": {},
227+
"source": [
228+
"Stanford's page on GloVe: https://nlp.stanford.edu/projects/glove/"
229+
]
230+
},
231+
{
232+
"cell_type": "code",
233+
"execution_count": 95,
234+
"id": "d31b2bca",
235+
"metadata": {},
236+
"outputs": [],
237+
"source": [
238+
"glv = api.load(\"glove-twitter-25\")"
239+
]
240+
},
241+
{
242+
"cell_type": "code",
243+
"execution_count": 97,
244+
"id": "cbf5e30d",
245+
"metadata": {},
246+
"outputs": [
247+
{
248+
"data": {
249+
"text/plain": [
250+
"[('too', 0.9648017287254333),\n",
251+
" ('day', 0.9533665180206299),\n",
252+
" ('well', 0.9503170847892761),\n",
253+
" ('nice', 0.9438973665237427),\n",
254+
" ('better', 0.9425962567329407),\n",
255+
" ('fun', 0.9418926239013672),\n",
256+
" ('much', 0.9413353800773621),\n",
257+
" ('this', 0.9387555122375488),\n",
258+
" ('hope', 0.9383506774902344),\n",
259+
" ('great', 0.9378516674041748)]"
260+
]
261+
},
262+
"execution_count": 97,
263+
"metadata": {},
264+
"output_type": "execute_result"
265+
}
266+
],
267+
"source": [
268+
"glv.most_similar(\"good\")"
269+
]
270+
},
271+
{
272+
"cell_type": "code",
273+
"execution_count": 101,
274+
"id": "1b47c704",
275+
"metadata": {},
276+
"outputs": [
277+
{
278+
"data": {
279+
"text/plain": [
280+
"'cereal'"
281+
]
282+
},
283+
"execution_count": 101,
284+
"metadata": {},
285+
"output_type": "execute_result"
286+
}
287+
],
288+
"source": [
289+
"glv.doesnt_match(\"breakfast cereal dinner lunch\".split())"
290+
]
291+
},
292+
{
293+
"cell_type": "code",
294+
"execution_count": 108,
295+
"id": "99e10b38",
296+
"metadata": {},
297+
"outputs": [
298+
{
299+
"data": {
300+
"text/plain": [
301+
"'cat'"
302+
]
303+
},
304+
"execution_count": 108,
305+
"metadata": {},
306+
"output_type": "execute_result"
307+
}
308+
],
309+
"source": [
310+
"glv.doesnt_match(\"facebook cat google microsoft\".split())"
311+
]
312+
},
313+
{
314+
"cell_type": "code",
315+
"execution_count": 111,
316+
"id": "0ce3f7c1",
317+
"metadata": {},
318+
"outputs": [
319+
{
320+
"data": {
321+
"text/plain": [
322+
"'human'"
323+
]
324+
},
325+
"execution_count": 111,
326+
"metadata": {},
327+
"output_type": "execute_result"
328+
}
329+
],
330+
"source": [
331+
"glv.doesnt_match(\"banana grapes orange human\".split())"
332+
]
333+
}
334+
],
335+
"metadata": {
336+
"kernelspec": {
337+
"display_name": "Python 3",
338+
"language": "python",
339+
"name": "python3"
340+
},
341+
"language_info": {
342+
"codemirror_mode": {
343+
"name": "ipython",
344+
"version": 3
345+
},
346+
"file_extension": ".py",
347+
"mimetype": "text/x-python",
348+
"name": "python",
349+
"nbconvert_exporter": "python",
350+
"pygments_lexer": "ipython3",
351+
"version": "3.8.10"
352+
}
353+
},
354+
"nbformat": 4,
355+
"nbformat_minor": 5
356+
}

0 commit comments

Comments
 (0)