Skip to content

Commit 4c3da15

Browse files
committed
finish writing to file
1 parent 7f118e4 commit 4c3da15

File tree

2 files changed

+16
-38
lines changed

2 files changed

+16
-38
lines changed

1-Data-Cleaning.ipynb

Lines changed: 16 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
},
6565
{
6666
"cell_type": "code",
67-
"execution_count": 38,
67+
"execution_count": null,
6868
"metadata": {},
6969
"outputs": [],
7070
"source": [
@@ -73,6 +73,7 @@
7373
"from bs4 import BeautifulSoup\n",
7474
"import pickle\n",
7575
"\n",
76+
"\n",
7677
"# Get all newsUrl from base url of Star newswebsite\n",
7778
"def theStar_url_to_newsUrls(url):\n",
7879
" urls = []\n",
@@ -88,7 +89,7 @@
8889
" \n",
8990
" print()\n",
9091
" print('page numbers in this url: ', maxPageNumber, ' ', url)\n",
91-
" \n",
92+
" \n",
9293
" for page in range(1, maxPageNumber + 1):\n",
9394
" response = requests.get(url, params={'pgno': page}).text\n",
9495
" \n",
@@ -104,40 +105,33 @@
104105
"\n",
105106
"\n",
106107
"def getContent_from_newsUrl(url):\n",
108+
" \n",
109+
" file = open(\"transcripts/DFI.txt\", \"w\", encoding=\"utf-8\")\n",
110+
" \n",
107111
" content = []\n",
108112
" for u in url:\n",
109113
" content.append(u)\n",
114+
" file.write(\"\\n\")\n",
115+
" file.write(u)\n",
110116
" response = requests.get(u).text\n",
111117
" soup = BeautifulSoup(response, \"lxml\")\n",
112118
" for element in soup.select('#story-body'):\n",
113-
" content.append(\"\\n\")\n",
114119
" content.append(element)\n",
115-
" content.append(\"\\n\")\n",
120+
" file.write(\"\\n\")\n",
121+
" file.write(str(element)) \n",
116122
" print('.', end= '')\n",
117123
" print('finish')\n",
124+
" file.close()\n",
118125
" return content\n"
119126
]
120127
},
121128
{
122129
"cell_type": "code",
123-
"execution_count": 39,
130+
"execution_count": null,
124131
"metadata": {
125-
"scrolled": false
132+
"scrolled": true
126133
},
127-
"outputs": [
128-
{
129-
"name": "stdout",
130-
"output_type": "stream",
131-
"text": [
132-
"\n",
133-
"page numbers in this url: 6 https://www.thestar.com.my/search?q=DFI\n",
134-
"total number of urls fetched 57\n",
135-
"\n",
136-
"page numbers in this url: 18 https://www.thestar.com.my/search?q=Development%20Financial%20Institutions\n",
137-
"total number of urls fetched 174\n"
138-
]
139-
}
140-
],
134+
"outputs": [],
141135
"source": [
142136
"# # First get all the news urls from the given TAGs\n",
143137
"url = ['https://www.thestar.com.my/search?q=DFI' , 'https://www.thestar.com.my/search?q=Development%20Financial%20Institutions']\n",
@@ -146,15 +140,15 @@
146140
},
147141
{
148142
"cell_type": "code",
149-
"execution_count": 41,
143+
"execution_count": null,
150144
"metadata": {},
151145
"outputs": [
152146
{
153147
"name": "stdout",
154148
"output_type": "stream",
155149
"text": [
156150
".......................................................finish\n",
157-
".............................................................................................................................................................................finish\n"
151+
"...................................................................................."
158152
]
159153
}
160154
],
@@ -163,22 +157,6 @@
163157
"allContents= [getContent_from_newsUrl(newsUrl) for newsUrl in newsUrls]\n"
164158
]
165159
},
166-
{
167-
"cell_type": "code",
168-
"execution_count": 42,
169-
"metadata": {
170-
"scrolled": true
171-
},
172-
"outputs": [],
173-
"source": [
174-
"# Picke file\n",
175-
"\n",
176-
"#!mkdir transcripts\n",
177-
"\n",
178-
"file = open(\"transcripts/DFI.txt\", \"wb\")\n",
179-
"data = pickle.dump(str(allContents), file)"
180-
]
181-
},
182160
{
183161
"cell_type": "code",
184162
"execution_count": 10,

transcripts/DFI.txt

-222 KB
Binary file not shown.

0 commit comments

Comments
 (0)