|
64 | 64 | },
|
65 | 65 | {
|
66 | 66 | "cell_type": "code",
|
67 |
| - "execution_count": 38, |
| 67 | + "execution_count": null, |
68 | 68 | "metadata": {},
|
69 | 69 | "outputs": [],
|
70 | 70 | "source": [
|
|
73 | 73 | "from bs4 import BeautifulSoup\n",
|
74 | 74 | "import pickle\n",
|
75 | 75 | "\n",
|
| 76 | + "\n", |
76 | 77 | "# Get all newsUrl from base url of Star newswebsite\n",
|
77 | 78 | "def theStar_url_to_newsUrls(url):\n",
|
78 | 79 | " urls = []\n",
|
|
88 | 89 | " \n",
|
89 | 90 | " print()\n",
|
90 | 91 | " print('page numbers in this url: ', maxPageNumber, ' ', url)\n",
|
91 |
| - " \n", |
| 92 | + " \n", |
92 | 93 | " for page in range(1, maxPageNumber + 1):\n",
|
93 | 94 | " response = requests.get(url, params={'pgno': page}).text\n",
|
94 | 95 | " \n",
|
|
104 | 105 | "\n",
|
105 | 106 | "\n",
|
106 | 107 | "def getContent_from_newsUrl(url):\n",
|
| 108 | + " \n", |
| 109 | + " file = open(\"transcripts/DFI.txt\", \"w\", encoding=\"utf-8\")\n", |
| 110 | + " \n", |
107 | 111 | " content = []\n",
|
108 | 112 | " for u in url:\n",
|
109 | 113 | " content.append(u)\n",
|
| 114 | + " file.write(\"\\n\")\n", |
| 115 | + " file.write(u)\n", |
110 | 116 | " response = requests.get(u).text\n",
|
111 | 117 | " soup = BeautifulSoup(response, \"lxml\")\n",
|
112 | 118 | " for element in soup.select('#story-body'):\n",
|
113 |
| - " content.append(\"\\n\")\n", |
114 | 119 | " content.append(element)\n",
|
115 |
| - " content.append(\"\\n\")\n", |
| 120 | + " file.write(\"\\n\")\n", |
| 121 | + " file.write(str(element)) \n", |
116 | 122 | " print('.', end= '')\n",
|
117 | 123 | " print('finish')\n",
|
| 124 | + " file.close()\n", |
118 | 125 | " return content\n"
|
119 | 126 | ]
|
120 | 127 | },
|
121 | 128 | {
|
122 | 129 | "cell_type": "code",
|
123 |
| - "execution_count": 39, |
| 130 | + "execution_count": null, |
124 | 131 | "metadata": {
|
125 |
| - "scrolled": false |
| 132 | + "scrolled": true |
126 | 133 | },
|
127 |
| - "outputs": [ |
128 |
| - { |
129 |
| - "name": "stdout", |
130 |
| - "output_type": "stream", |
131 |
| - "text": [ |
132 |
| - "\n", |
133 |
| - "page numbers in this url: 6 https://www.thestar.com.my/search?q=DFI\n", |
134 |
| - "total number of urls fetched 57\n", |
135 |
| - "\n", |
136 |
| - "page numbers in this url: 18 https://www.thestar.com.my/search?q=Development%20Financial%20Institutions\n", |
137 |
| - "total number of urls fetched 174\n" |
138 |
| - ] |
139 |
| - } |
140 |
| - ], |
| 134 | + "outputs": [], |
141 | 135 | "source": [
|
142 | 136 | "# # First get all the news urls from the given TAGs\n",
|
143 | 137 | "url = ['https://www.thestar.com.my/search?q=DFI' , 'https://www.thestar.com.my/search?q=Development%20Financial%20Institutions']\n",
|
|
146 | 140 | },
|
147 | 141 | {
|
148 | 142 | "cell_type": "code",
|
149 |
| - "execution_count": 41, |
| 143 | + "execution_count": null, |
150 | 144 | "metadata": {},
|
151 | 145 | "outputs": [
|
152 | 146 | {
|
153 | 147 | "name": "stdout",
|
154 | 148 | "output_type": "stream",
|
155 | 149 | "text": [
|
156 | 150 | ".......................................................finish\n",
|
157 |
| - ".............................................................................................................................................................................finish\n" |
| 151 | + "...................................................................................." |
158 | 152 | ]
|
159 | 153 | }
|
160 | 154 | ],
|
|
163 | 157 | "allContents= [getContent_from_newsUrl(newsUrl) for newsUrl in newsUrls]\n"
|
164 | 158 | ]
|
165 | 159 | },
|
166 |
| - { |
167 |
| - "cell_type": "code", |
168 |
| - "execution_count": 42, |
169 |
| - "metadata": { |
170 |
| - "scrolled": true |
171 |
| - }, |
172 |
| - "outputs": [], |
173 |
| - "source": [ |
174 |
| - "# Picke file\n", |
175 |
| - "\n", |
176 |
| - "#!mkdir transcripts\n", |
177 |
| - "\n", |
178 |
| - "file = open(\"transcripts/DFI.txt\", \"wb\")\n", |
179 |
| - "data = pickle.dump(str(allContents), file)" |
180 |
| - ] |
181 |
| - }, |
182 | 160 | {
|
183 | 161 | "cell_type": "code",
|
184 | 162 | "execution_count": 10,
|
|
0 commit comments