Skip to content

Commit d6dc657

Browse files
author
codebasics
committed
word embeddings spacy
1 parent 6edd984 commit d6dc657

File tree

3 files changed

+10653
-0
lines changed

3 files changed

+10653
-0
lines changed
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "e0a80da8",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
import spacy

# Word vectors take up a lot of space, so en_core_web_sm does not ship with
# them. Download the medium or large English model to get vectors, e.g.:
#   python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": 5,
22+
"id": "f7b0ef24",
23+
"metadata": {},
24+
"outputs": [
25+
{
26+
"name": "stdout",
27+
"output_type": "stream",
28+
"text": [
29+
"dog Vector: True OOV: False\n",
30+
"cat Vector: True OOV: False\n",
31+
"banana Vector: True OOV: False\n",
32+
"kem Vector: False OOV: True\n"
33+
]
34+
}
35+
],
36+
"source": [
37+
doc = nlp("dog cat banana kem")

# has_vector / is_oov show which tokens the model carries a vector for;
# "kem" is not in the model's vocabulary, so it is flagged as OOV.
for token in doc:
    print(f"{token.text} Vector: {token.has_vector} OOV: {token.is_oov}")
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": 7,
46+
"id": "c1213a20",
47+
"metadata": {},
48+
"outputs": [
49+
{
50+
"data": {
51+
"text/plain": [
52+
"(300,)"
53+
]
54+
},
55+
"execution_count": 7,
56+
"metadata": {},
57+
"output_type": "execute_result"
58+
}
59+
],
60+
"source": [
61+
# Each word vector in this model is 300-dimensional (output: (300,)).
first_token = doc[0]
first_token.vector.shape
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": 11,
67+
"id": "e62cde6f",
68+
"metadata": {},
69+
"outputs": [
70+
{
71+
"data": {
72+
"text/plain": [
73+
"(300,)"
74+
]
75+
},
76+
"execution_count": 11,
77+
"metadata": {},
78+
"output_type": "execute_result"
79+
}
80+
],
81+
"source": [
82+
# base_token is a one-word Doc; its .vector has the same 300-d shape.
base_token = nlp("bread")
base_token.vector.shape
84+
]
85+
},
86+
{
87+
"cell_type": "code",
88+
"execution_count": 13,
89+
"id": "443e1130",
90+
"metadata": {},
91+
"outputs": [
92+
{
93+
"name": "stdout",
94+
"output_type": "stream",
95+
"text": [
96+
"bread <-> bread: 1.0\n",
97+
"sandwich <-> bread: 0.6341067010130894\n",
98+
"burger <-> bread: 0.47520687769584247\n",
99+
"car <-> bread: 0.06451533308853552\n",
100+
"tiger <-> bread: 0.04764611675903374\n",
101+
"human <-> bread: 0.2151154210812192\n",
102+
"wheat <-> bread: 0.6150360888607199\n"
103+
]
104+
}
105+
],
106+
"source": [
107+
doc = nlp("bread sandwich burger car tiger human wheat")

# Compare every token against the "bread" Doc from the previous cell —
# food words (sandwich, wheat) score much higher than car or tiger.
for token in doc:
    score = token.similarity(base_token)
    print(f"{token.text} <-> {base_token.text}:", score)
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": 14,
116+
"id": "e9c35619",
117+
"metadata": {},
118+
"outputs": [],
119+
"source": [
120+
def print_similarity(base_word, words_to_compare):
    """Print the similarity of each word in `words_to_compare` to `base_word`.

    Both arguments are plain strings; `words_to_compare` is tokenized by the
    pipeline, and each token is scored against the `base_word` Doc.
    """
    reference = nlp(base_word)
    for word in nlp(words_to_compare):
        print(f"{word.text} <-> {reference.text}: ", word.similarity(reference))
125+
]
126+
},
127+
{
128+
"cell_type": "code",
129+
"execution_count": 15,
130+
"id": "4071a3c7",
131+
"metadata": {},
132+
"outputs": [
133+
{
134+
"name": "stdout",
135+
"output_type": "stream",
136+
"text": [
137+
"apple <-> iphone: 0.4387907401919904\n",
138+
"samsung <-> iphone: 0.670859081425417\n",
139+
"iphone <-> iphone: 1.0\n",
140+
"dog <-> iphone: 0.08211864228011527\n",
141+
"kitten <-> iphone: 0.10222317834969896\n"
142+
]
143+
}
144+
],
145+
"source": [
146+
# Reuse the helper for an "iphone" comparison set.
print_similarity(base_word="iphone", words_to_compare="apple samsung iphone dog kitten")
147+
]
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": 16,
152+
"id": "daffd61f",
153+
"metadata": {},
154+
"outputs": [],
155+
"source": [
156+
# Pull raw word vectors straight from the vocab for the classic
# king - man + woman ≈ queen analogy.
analogy_words = ("king", "man", "woman", "queen")
king, man, woman, queen = (nlp.vocab[w].vector for w in analogy_words)

result = king - man + woman
162+
]
163+
},
164+
{
165+
"cell_type": "code",
166+
"execution_count": 17,
167+
"id": "ab939b13",
168+
"metadata": {},
169+
"outputs": [
170+
{
171+
"data": {
172+
"text/plain": [
173+
"array([[0.6178015]], dtype=float32)"
174+
]
175+
},
176+
"execution_count": 17,
177+
"metadata": {},
178+
"output_type": "execute_result"
179+
}
180+
],
181+
"source": [
182+
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity expects 2-D inputs, so reshape each 1-D vector to (1, 300).
cosine_similarity(result.reshape(1, -1), queen.reshape(1, -1))
185+
]
186+
}
187+
],
188+
"metadata": {
189+
"kernelspec": {
190+
"display_name": "Python 3",
191+
"language": "python",
192+
"name": "python3"
193+
},
194+
"language_info": {
195+
"codemirror_mode": {
196+
"name": "ipython",
197+
"version": 3
198+
},
199+
"file_extension": ".py",
200+
"mimetype": "text/x-python",
201+
"name": "python",
202+
"nbconvert_exporter": "python",
203+
"pygments_lexer": "ipython3",
204+
"version": "3.8.10"
205+
}
206+
},
207+
"nbformat": 4,
208+
"nbformat_minor": 5
209+
}

0 commit comments

Comments
 (0)