Skip to content

Commit baafef5

Browse files
committed
DFI changes
1 parent 995e489 commit baafef5

19 files changed

+7071
-1022
lines changed
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Data Cleaning"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"## Introduction"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 1,
20+
"metadata": {},
21+
"outputs": [],
22+
"source": [
23+
# Load the raw DFI transcript.
# NOTE: despite the original "Load pickled files" comment, this is NOT a
# pickle — it is a plain text file read as bytes and decoded as UTF-8.
# The original also pre-initialized `data = {}` (a dict) and then immediately
# rebound it to a str, which was dead code and misleading about its type;
# `data` is simply the whole transcript as one string.
with open("transcripts/DFI.txt", "rb") as file:
    data = file.read().decode("utf-8")
29+
]
30+
},
31+
{
32+
"cell_type": "markdown",
33+
"metadata": {},
34+
"source": [
35+
"## Cleaning The Data"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 18,
41+
"metadata": {},
42+
"outputs": [
43+
{
44+
"name": "stdout",
45+
"output_type": "stream",
46+
"text": [
47+
"['dfi', 'should', 'be', 'assess', 'beyond', 'financi', 'perform', 'petal', 'jaya', ':', 'develop', 'financi', 'institut', '(', 'dfi', ')', 'should', 'be', 'assess', 'beyond', 'financi', 'perform', ',', 'by', 'look', 'at', 'the', 'deliver', 'of', 'their', 'mandat', 'role', '.', 'sme', 'develop', 'bank', 'malaysia', 'bhd', '(', 'sme', 'bank', ')', 'chief', 'oper', 'offic', ',', 'khairil', 'anuar', 'mohamad', 'anuar', 'said', 'the', 'dfi', '’', 'role', 'is', 'to', 'act', 'on', ',', 'or', 'to', 'drive', 'govern', 'agenda', ',', 'as', 'well', 'as', 'to', 'play', 'a', 'counter-cycl', 'role', 'to', 'support', 'and', 'push', 'the', 'economi', 'further', '.', '“', 'guidelin', 'by', 'bank', 'negara', 'to', 'dfi', 'in', 'assess', 'their', 'perform', 'are', 'not', 'onli', 'base', 'on', 'financi', 'but']\n"
48+
]
49+
}
50+
],
51+
"source": [
52+
# Third round of cleaning, adapted from:
# https://machinelearningmastery.com/clean-text-machine-learning-python/
# Tokenization and cleaning with NLTK.
#
# BUG FIX: the original computed the cleaned `words` list and then re-ran
# `tokens = word_tokenize(data)` and stemmed those RAW tokens, silently
# discarding every cleaning step (the recorded output still contained
# punctuation and stopwords such as 'should', 'be'). We now stem the
# cleaned tokens instead.

import string

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Split the raw transcript (the `data` string from the load cell) into tokens.
tokens = word_tokenize(data)
# Normalize case.
tokens = [w.lower() for w in tokens]
# Strip punctuation characters from each token.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# Drop tokens that are not purely alphabetic (numbers, leftover symbols, '').
words = [word for word in stripped if word.isalpha()]
# Remove English stop words.
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]

# Stem the CLEANED tokens and preview the first 100.
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in words]
print(stemmed[:100])
80+
]
81+
}
82+
],
83+
"metadata": {
84+
"kernelspec": {
85+
"display_name": "Python 3",
86+
"language": "python",
87+
"name": "python3"
88+
},
89+
"language_info": {
90+
"codemirror_mode": {
91+
"name": "ipython",
92+
"version": 3
93+
},
94+
"file_extension": ".py",
95+
"mimetype": "text/x-python",
96+
"name": "python",
97+
"nbconvert_exporter": "python",
98+
"pygments_lexer": "ipython3",
99+
"version": "3.7.6"
100+
},
101+
"toc": {
102+
"nav_menu": {},
103+
"number_sections": true,
104+
"sideBar": true,
105+
"skip_h1_title": false,
106+
"toc_cell": false,
107+
"toc_position": {},
108+
"toc_section_display": "block",
109+
"toc_window_display": false
110+
},
111+
"varInspector": {
112+
"cols": {
113+
"lenName": 16,
114+
"lenType": 16,
115+
"lenVar": 40
116+
},
117+
"kernels_config": {
118+
"python": {
119+
"delete_cmd_postfix": "",
120+
"delete_cmd_prefix": "del ",
121+
"library": "var_list.py",
122+
"varRefreshCmd": "print(var_dic_list())"
123+
},
124+
"r": {
125+
"delete_cmd_postfix": ") ",
126+
"delete_cmd_prefix": "rm(",
127+
"library": "var_list.r",
128+
"varRefreshCmd": "cat(var_dic_list()) "
129+
}
130+
},
131+
"types_to_exclude": [
132+
"module",
133+
"function",
134+
"builtin_function_or_method",
135+
"instance",
136+
"_Feature"
137+
],
138+
"window_display": false
139+
}
140+
},
141+
"nbformat": 4,
142+
"nbformat_minor": 2
143+
}

.ipynb_checkpoints/1-Data-Cleaning TEXT-checkpoint.ipynb

Lines changed: 1055 additions & 0 deletions
Large diffs are not rendered by default.

.ipynb_checkpoints/1-Data-Cleaning-checkpoint.ipynb

Lines changed: 1135 additions & 584 deletions
Large diffs are not rendered by default.

1-Data-Cleaning TEXT.ipynb

Lines changed: 1055 additions & 0 deletions
Large diffs are not rendered by default.

1-Data-Cleaning WORDS.ipynb

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Data Cleaning"
8+
]
9+
},
10+
{
11+
"cell_type": "markdown",
12+
"metadata": {},
13+
"source": [
14+
"## Introduction"
15+
]
16+
},
17+
{
18+
"cell_type": "code",
19+
"execution_count": 1,
20+
"metadata": {},
21+
"outputs": [],
22+
"source": [
23+
# Load the raw DFI transcript.
# NOTE: despite the original "Load pickled files" comment, this is NOT a
# pickle — it is a plain text file read as bytes and decoded as UTF-8.
# The original also pre-initialized `data = {}` (a dict) and then immediately
# rebound it to a str, which was dead code and misleading about its type;
# `data` is simply the whole transcript as one string.
with open("transcripts/DFI.txt", "rb") as file:
    data = file.read().decode("utf-8")
29+
]
30+
},
31+
{
32+
"cell_type": "markdown",
33+
"metadata": {},
34+
"source": [
35+
"## Cleaning The Data"
36+
]
37+
},
38+
{
39+
"cell_type": "code",
40+
"execution_count": 18,
41+
"metadata": {},
42+
"outputs": [
43+
{
44+
"name": "stdout",
45+
"output_type": "stream",
46+
"text": [
47+
"['dfi', 'should', 'be', 'assess', 'beyond', 'financi', 'perform', 'petal', 'jaya', ':', 'develop', 'financi', 'institut', '(', 'dfi', ')', 'should', 'be', 'assess', 'beyond', 'financi', 'perform', ',', 'by', 'look', 'at', 'the', 'deliver', 'of', 'their', 'mandat', 'role', '.', 'sme', 'develop', 'bank', 'malaysia', 'bhd', '(', 'sme', 'bank', ')', 'chief', 'oper', 'offic', ',', 'khairil', 'anuar', 'mohamad', 'anuar', 'said', 'the', 'dfi', '’', 'role', 'is', 'to', 'act', 'on', ',', 'or', 'to', 'drive', 'govern', 'agenda', ',', 'as', 'well', 'as', 'to', 'play', 'a', 'counter-cycl', 'role', 'to', 'support', 'and', 'push', 'the', 'economi', 'further', '.', '“', 'guidelin', 'by', 'bank', 'negara', 'to', 'dfi', 'in', 'assess', 'their', 'perform', 'are', 'not', 'onli', 'base', 'on', 'financi', 'but']\n"
48+
]
49+
}
50+
],
51+
"source": [
52+
# Third round of cleaning, adapted from:
# https://machinelearningmastery.com/clean-text-machine-learning-python/
# Tokenization and cleaning with NLTK.
#
# BUG FIX: the original computed the cleaned `words` list and then re-ran
# `tokens = word_tokenize(data)` and stemmed those RAW tokens, silently
# discarding every cleaning step (the recorded output still contained
# punctuation and stopwords such as 'should', 'be'). We now stem the
# cleaned tokens instead.

import string

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Split the raw transcript (the `data` string from the load cell) into tokens.
tokens = word_tokenize(data)
# Normalize case.
tokens = [w.lower() for w in tokens]
# Strip punctuation characters from each token.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# Drop tokens that are not purely alphabetic (numbers, leftover symbols, '').
words = [word for word in stripped if word.isalpha()]
# Remove English stop words.
stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]

# Stem the CLEANED tokens and preview the first 100.
porter = PorterStemmer()
stemmed = [porter.stem(word) for word in words]
print(stemmed[:100])
80+
]
81+
}
82+
],
83+
"metadata": {
84+
"kernelspec": {
85+
"display_name": "Python 3",
86+
"language": "python",
87+
"name": "python3"
88+
},
89+
"language_info": {
90+
"codemirror_mode": {
91+
"name": "ipython",
92+
"version": 3
93+
},
94+
"file_extension": ".py",
95+
"mimetype": "text/x-python",
96+
"name": "python",
97+
"nbconvert_exporter": "python",
98+
"pygments_lexer": "ipython3",
99+
"version": "3.7.6"
100+
},
101+
"toc": {
102+
"nav_menu": {},
103+
"number_sections": true,
104+
"sideBar": true,
105+
"skip_h1_title": false,
106+
"toc_cell": false,
107+
"toc_position": {},
108+
"toc_section_display": "block",
109+
"toc_window_display": false
110+
},
111+
"varInspector": {
112+
"cols": {
113+
"lenName": 16,
114+
"lenType": 16,
115+
"lenVar": 40
116+
},
117+
"kernels_config": {
118+
"python": {
119+
"delete_cmd_postfix": "",
120+
"delete_cmd_prefix": "del ",
121+
"library": "var_list.py",
122+
"varRefreshCmd": "print(var_dic_list())"
123+
},
124+
"r": {
125+
"delete_cmd_postfix": ") ",
126+
"delete_cmd_prefix": "rm(",
127+
"library": "var_list.r",
128+
"varRefreshCmd": "cat(var_dic_list()) "
129+
}
130+
},
131+
"types_to_exclude": [
132+
"module",
133+
"function",
134+
"builtin_function_or_method",
135+
"instance",
136+
"_Feature"
137+
],
138+
"window_display": false
139+
}
140+
},
141+
"nbformat": 4,
142+
"nbformat_minor": 2
143+
}

1-Data-Cleaning.ipynb

Lines changed: 1157 additions & 264 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)