Skip to content

Commit 29a3a1c

Browse files
author
minjk-bl
committed
Add userCommand for pre-defined functions
1 parent 3f3dbb2 commit 29a3a1c

File tree

3 files changed

+56
-33
lines changed

3 files changed

+56
-33
lines changed

js/com/com_Config.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,8 @@ define([
160160
'printCommand.py',
161161
'fileNaviCommand.py',
162162
'pandasCommand.py',
163-
'variableCommand.py'
163+
'variableCommand.py',
164+
'userCommand.py'
164165
];
165166
let promiseList = [];
166167
libraryList.forEach(libName => {

js/m_apps/PDF.js

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -30,37 +30,6 @@ import fitz
3030
import nltk
3131
nltk.download('punkt')`;
3232

33-
const PDF_FUNC = `def vp_pdf_get_sentence(fname_lst):
34-
'''
35-
Get sentence from pdf file by PyMuPDF
36-
'''
37-
df = pd.DataFrame()
38-
for fname in fname_lst:
39-
if fname.split('.')[-1] != 'pdf': continue
40-
try:
41-
doc = fitz.open(fname)
42-
sentence_lst = []
43-
for page in doc:
44-
block_lst = page.get_text('blocks')
45-
46-
text_lst = [block[4] for block in block_lst if block[6] == 0]
47-
text = '\\n'.join(text_lst)
48-
49-
sentence_lst.extend([sentence for sentence in nltk.sent_tokenize(text)])
50-
51-
doc.close()
52-
except Exception as e:
53-
print(e)
54-
continue
55-
56-
df_doc = pd.DataFrame({
57-
'fname': fname.split('/')[-1],
58-
'sentence': sentence_lst
59-
})
60-
df = pd.concat([df,df_doc])
61-
62-
return df.reset_index().drop('index', axis=1)`;
63-
6433
const PDF_CMD = 'df = vp_pdf_get_sentence(pdf_lst)\ndf'
6534
/**
6635
* PDF
@@ -98,7 +67,6 @@ nltk.download('punkt')`;
9867
// click import
9968
$(this.wrapSelector('.vp-pdf-import-btn')).on('click', function () {
10069
com_interface.insertCell('code', PDF_IMPORT);
101-
com_interface.insertCell('code', PDF_FUNC);
10270
});
10371

10472
// click file navigation button

python/userCommand.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import pandas as pd
2+
import numpy as np
3+
import fitz
4+
import nltk
5+
nltk.download('punkt')
6+
7+
def vp_pdf_get_sentence(fname_lst):
    '''
    Get sentences from pdf files by PyMuPDF

    Parameters
    ----------
    fname_lst : iterable of str
        File paths. Entries whose text after the last '.' is not 'pdf'
        are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with columns 'fname' (the part of the path
        after the last '/') and 'sentence', indexed 0..n-1. An empty
        DataFrame is returned when no file produced a frame.

    Notes
    -----
    A file that raises while being opened or read is reported with
    print() and skipped; it contributes no rows.
    '''
    frames = []
    for fname in fname_lst:
        if fname.split('.')[-1] != 'pdf': continue

        sentence_lst = []
        try:
            doc = fitz.open(fname)
            try:
                for page in doc:
                    block_lst = page.get_text('blocks')

                    # Per the PyMuPDF docs a block tuple carries its text at
                    # index 4 and its type at index 6 (0 = text, 1 = image).
                    text_lst = [block[4] for block in block_lst if block[6] == 0]
                    # Join with a real newline. The previous '\\n' was a
                    # left-over JavaScript escape and inserted a literal
                    # backslash + 'n' between blocks.
                    text = '\n'.join(text_lst)

                    sentence_lst.extend(nltk.sent_tokenize(text))
            finally:
                # Release the document even when a page fails to parse.
                doc.close()
        except Exception as e:
            print(e)
            continue

        frames.append(pd.DataFrame({
            'fname': fname.split('/')[-1],
            'sentence': sentence_lst
        }))

    if not frames:
        return pd.DataFrame()

    # Concatenate once and renumber rows instead of growing a frame per file.
    return pd.concat(frames, ignore_index=True)
37+
38+
def vp_drop_outlier(df, col, weight=1.5):
    '''
    Drop rows whose value in `col` lies outside the IQR fences

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : label
        Column whose values are tested.
    weight : float, default 1.5
        Multiplier applied to the interquartile range to build the fences
        [q25 - weight * iqr, q75 + weight * iqr].

    Returns
    -------
    pandas.DataFrame
        Copy of `df` without the rows whose `col` value is below the lower
        fence or above the upper fence. Rows where `col` is NaN are kept.
    '''
    sr = df[col]

    # nanpercentile ignores missing values; plain percentile would return
    # NaN as soon as one NaN is present and no row would ever be dropped.
    q25, q75 = np.nanpercentile(sr.values, [25, 75])

    iqr_w = (q75 - q25) * weight

    val_l = q25 - iqr_w
    val_h = q75 + iqr_w

    # Filter by position with a boolean mask rather than df.drop(labels):
    # with duplicate index labels, dropping by label would also remove
    # non-outlier rows that share a label with an outlier.
    keep = ~((sr < val_l) | (sr > val_h)).to_numpy()

    return df[keep].copy()

0 commit comments

Comments
 (0)