fix bug on text generator code

x4nth055 · x4nth055 · commit df5242e4e94a · 2021-08-11T12:09:19.000+01:00
diff --git a/machine-learning/nlp/text-generator/text-generator-v2.ipynb b/machine-learning/nlp/text-generator/text-generator-v2.ipynb
@@ -0,0 +1,300 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "import tensorflow as tf\r\n",
+    "import numpy as np\r\n",
+    "import os\r\n",
+    "import pickle\r\n",
+    "\r\n",
+    "SEQUENCE_LENGTH = 50\r\n",
+    "EMBEDDING_DIM = 200\r\n",
+    "BATCH_SIZE = 128\r\n",
+    "FILE_PATH = \"data/python_code.py\"\r\n",
+    "BASENAME = os.path.basename(FILE_PATH) + \"-lower\"\r\n",
+    "\r\n",
+    "text = open(FILE_PATH).read()\r\n",
+    "# comment this if you want to use uppercase letters\r\n",
+    "text = text.lower()\r\n",
+    "n_chars = len(text)\r\n",
+    "vocab = ''.join(sorted(set(text)))\r\n",
+    "print(\"vocab:\", vocab)\r\n",
+    "n_unique_chars = len(vocab)\r\n",
+    "print(\"Number of characters:\", n_chars)\r\n",
+    "print(\"Number of unique characters:\", n_unique_chars)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "# dictionary that converts characters to integers\r\n",
+    "char2int = {c: i for i, c in enumerate(vocab)}\r\n",
+    "# dictionary that converts integers to characters\r\n",
+    "int2char = {i: c for i, c in enumerate(vocab)}\r\n",
+    "\r\n",
+    "# save these dictionaries for later generation\r\n",
+    "pickle.dump(char2int, open(f\"{BASENAME}-char2int.pickle\", \"wb\"))\r\n",
+    "pickle.dump(int2char, open(f\"{BASENAME}-int2char.pickle\", \"wb\"))"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "encoded_text = np.array([char2int[c] for c in text])"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "char_dataset = tf.data.Dataset.from_tensor_slices(encoded_text)\r\n",
+    "for element in char_dataset.take(5):\r\n",
+    "    print(element.numpy())"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "for element in char_dataset.batch(SEQUENCE_LENGTH+1).shuffle(1024).take(2):\r\n",
+    "    print(''.join([int2char[c] for c in element.numpy()]))"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "#help(tf.one_hot)\r\n",
+    "#help(char_dataset.window)\r\n",
+    "windows = char_dataset.window(SEQUENCE_LENGTH+1, shift=1, drop_remainder=True)\r\n",
+    "sequences = windows.flat_map(lambda window: window.batch(SEQUENCE_LENGTH+1))\r\n",
+    "dataset = sequences.map(lambda x: (x[:-1], x[-1]))\r\n",
+    "for input_, target in dataset.take(10):\r\n",
+    "    print(input_.numpy().shape)\r\n",
+    "    print(target.numpy().shape)\r\n",
+    "    print(''.join([int2char[c] for c in input_.numpy()]), int2char[target.numpy()])\r\n",
+    "    print(\"=\"*50)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "sequences2 = char_dataset.batch(2*SEQUENCE_LENGTH+1, drop_remainder=True)\r\n",
+    "\r\n",
+    "def split_sample(sample):\r\n",
+    "    ds = tf.data.Dataset.from_tensors((sample[:SEQUENCE_LENGTH], sample[SEQUENCE_LENGTH]))\r\n",
+    "    for i in range(1, (len(sample)-1) // 2):\r\n",
+    "        input_ = sample[i:i+SEQUENCE_LENGTH]\r\n",
+    "        target = sample[i+SEQUENCE_LENGTH]\r\n",
+    "        other_ds = tf.data.Dataset.from_tensors((input_, target))\r\n",
+    "        ds = ds.concatenate(other_ds)\r\n",
+    "    return ds\r\n",
+    "\r\n",
+    "\r\n",
+    "dataset2 = sequences2.flat_map(split_sample)\r\n",
+    "for element in dataset2.take(10):\r\n",
+    "    print(element[0].shape, element[1].shape)\r\n",
+    "    print(''.join([int2char[c] for c in element[0].numpy()]), int2char[element[1].numpy()])"
+   ],
+   "outputs": [],
+   "metadata": {
+    "tags": [
+     "outputPrepend",
+     "outputPrepend",
+     "outputPrepend",
+     "outputPrepend"
+    ]
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "for element1, element2 in zip(dataset.take(5), dataset2.take(5)):\r\n",
+    "    print(element1[0].numpy() == element2[0].numpy())\r\n",
+    "    "
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "def one_hot_samples(input_, target):\r\n",
+    "    return tf.one_hot(input_, len(vocab)), tf.one_hot(target, len(vocab))\r\n",
+    "#     return input_, tf.one_hot(target, len(vocab))\r\n",
+    "\r\n",
+    "dataset = dataset.map(one_hot_samples)\r\n",
+    "dataset2 = dataset2.map(one_hot_samples)\r\n",
+    "for element in dataset.take(10):\r\n",
+    "    print(element[0].shape, element[1].shape)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "ds = dataset.shuffle(1024).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(1).repeat()\r\n",
+    "ds2 = dataset2.shuffle(1024).batch(BATCH_SIZE, drop_remainder=True).cache().prefetch(1).repeat()"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "def create_model(vocab_size, embedding_dim, rnn_units, batch_size):\r\n",
+    "    model = tf.keras.Sequential()\r\n",
+    "    # model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_shape=(SEQUENCE_LENGTH,)))\r\n",
+    "    model.add(tf.keras.layers.LSTM(rnn_units, input_shape=(SEQUENCE_LENGTH, len(vocab)), return_sequences=True))\r\n",
+    "    model.add(tf.keras.layers.Dropout(0.3))\r\n",
+    "    model.add(tf.keras.layers.LSTM(rnn_units)),\r\n",
+    "    model.add(tf.keras.layers.Dropout(0.3))\r\n",
+    "    model.add(tf.keras.layers.Dense(vocab_size, activation=\"softmax\"))\r\n",
+    "    return model"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "model = create_model(len(vocab), embedding_dim=EMBEDDING_DIM, rnn_units=128, batch_size=BATCH_SIZE)\r\n",
+    "model.summary()\r\n",
+    "model.compile(optimizer=\"adam\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "EPOCHS = 5\r\n",
+    "history = model.fit(ds2, steps_per_epoch=(len(encoded_text) - SEQUENCE_LENGTH ) // BATCH_SIZE, epochs=EPOCHS)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "# save the model\r\n",
+    "model_path = f\"results/{BASENAME}-{SEQUENCE_LENGTH}-NOEMBEDDING-moredata.h5\"\r\n",
+    "model.save(model_path)\r\n",
+    "# model.load_weights(model_path)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "seed = \"\"\"You can be a\"\"\".lower()\r\n",
+    "s = seed\r\n",
+    "# generate 400 characters\r\n",
+    "generated = \"\"\r\n",
+    "for i in range(200):\r\n",
+    "    # make the input sequence\r\n",
+    "    X = np.zeros((1, SEQUENCE_LENGTH, len(vocab)))\r\n",
+    "    # X = np.zeros((1, SEQUENCE_LENGTH))\r\n",
+    "    for t, char in enumerate(seed):\r\n",
+    "        X[0, (SEQUENCE_LENGTH - len(seed)) + t, char2int[char]] = 1\r\n",
+    "    # predict the next character\r\n",
+    "    predicted = model.predict(X, verbose=0)[0]\r\n",
+    "    # print(predicted)\r\n",
+    "    # converting the vector to an integer\r\n",
+    "    next_index = np.argmax(predicted)\r\n",
+    "#     next_index = np.squeeze(np.round(predicted))\r\n",
+    "    # converting the integer to a character\r\n",
+    "#     print(next_index)\r\n",
+    "    next_char = int2char[next_index]\r\n",
+    "    # add the character to results\r\n",
+    "    generated += next_char\r\n",
+    "    # shift seed and the predicted character\r\n",
+    "    seed = seed[1:] + next_char\r\n",
+    "\r\n",
+    "print(\"Generated text:\")\r\n",
+    "print(s + generated)"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "char2int\r\n"
+   ],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [],
+   "outputs": [],
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "file_extension": ".py",
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.8.7 64-bit"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.7"
+  },
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3,
+  "interpreter": {
+   "hash": "777490da48e046e3b512f0b24bf037db286a787493a11bf82a9e0f2cbf21bb67"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/machine-learning/nlp/text-generator/train.py b/machine-learning/nlp/text-generator/train.py
@@ -113,7 +113,10 @@ def one_hot_samples(input_, target):
     Dense(n_unique_chars, activation="softmax"),
 ])
 
-model.load_weights(f"results/{BASENAME}-{sequence_length}.h5")
+# define the model path
+model_weights_path = f"results/{BASENAME}-{sequence_length}.h5"
+# if os.path.isfile(model_weights_path):
+#     model.load_weights(model_weights_path)
 
 model.summary()
 model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
@@ -126,4 +129,4 @@ def one_hot_samples(input_, target):
 # train the model
 model.fit(ds, steps_per_epoch=(len(encoded_text) - sequence_length) // BATCH_SIZE, epochs=EPOCHS)
 # save the model
-model.save(f"results/{BASENAME}-{sequence_length}.h5")
+model.save(model_weights_path)