hoseinit
diff --git a/‎1-Data-Cleaning.ipynb
Lines changed: 16 additions & 38 deletions b/‎1-Data-Cleaning.ipynb
Lines changed: 16 additions & 38 deletions
diff --git a/‎transcripts/DFI.txt
-222 KB b/‎transcripts/DFI.txt
-222 KB
@@ -64,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -73,6 +73,7 @@
     "from bs4 import BeautifulSoup\n",
     "import pickle\n",
     "\n",
+    "\n",
     "# Get all newsUrl from base url of Star newswebsite\n",
     "def theStar_url_to_newsUrls(url):\n",
     "    urls = []\n",
@@ -88,7 +89,7 @@
     "    \n",
     "    print()\n",
     "    print('page numbers in this url: ', maxPageNumber, ' ', url)\n",
-    "    \n",
+    "        \n",
     "    for page in range(1, maxPageNumber + 1):\n",
     "        response = requests.get(url, params={'pgno': page}).text\n",
     "        \n",
@@ -104,40 +105,33 @@
     "\n",
     "\n",
     "def getContent_from_newsUrl(url):\n",
+    "    \n",
+    "    file = open(\"transcripts/DFI.txt\", \"w\", encoding=\"utf-8\")\n",
+    "    \n",
     "    content = []\n",
     "    for u in url:\n",
     "        content.append(u)\n",
+    "        file.write(\"\\n\")\n",
+    "        file.write(u)\n",
     "        response = requests.get(u).text\n",
     "        soup = BeautifulSoup(response, \"lxml\")\n",
     "        for element in soup.select('#story-body'):\n",
-    "            content.append(\"\\n\")\n",
     "            content.append(element)\n",
-    "            content.append(\"\\n\")\n",
+    "            file.write(\"\\n\")\n",
+    "            file.write(str(element))            \n",
     "            print('.', end= '')\n",
     "    print('finish')\n",
+    "    file.close()\n",
     "    return content\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": null,
    "metadata": {
-    "scrolled": false
+    "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "page numbers in this url:  6   https://www.thestar.com.my/search?q=DFI\n",
-      "total number of urls fetched 57\n",
-      "\n",
-      "page numbers in this url:  18   https://www.thestar.com.my/search?q=Development%20Financial%20Institutions\n",
-      "total number of urls fetched 174\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "# # First get all the news urls from the given TAGs\n",
     "url = ['https://www.thestar.com.my/search?q=DFI' , 'https://www.thestar.com.my/search?q=Development%20Financial%20Institutions']\n",
@@ -146,15 +140,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       ".......................................................finish\n",
-      ".............................................................................................................................................................................finish\n"
+      "...................................................................................."
      ]
     }
    ],
@@ -163,22 +157,6 @@
     "allContents= [getContent_from_newsUrl(newsUrl) for newsUrl in newsUrls]\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "# Picke file\n",
-    "\n",
-    "#!mkdir transcripts\n",
-    "\n",
-    "file = open(\"transcripts/DFI.txt\", \"wb\")\n",
-    "data = pickle.dump(str(allContents), file)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 10,