File tree Expand file tree Collapse file tree 3 files changed +56
-33
lines changed Expand file tree Collapse file tree 3 files changed +56
-33
lines changed Original file line number Diff line number Diff line change @@ -160,7 +160,8 @@ define([
160
160
'printCommand.py' ,
161
161
'fileNaviCommand.py' ,
162
162
'pandasCommand.py' ,
163
- 'variableCommand.py'
163
+ 'variableCommand.py' ,
164
+ 'userCommand.py'
164
165
] ;
165
166
let promiseList = [ ] ;
166
167
libraryList . forEach ( libName => {
Original file line number Diff line number Diff line change @@ -30,37 +30,6 @@ import fitz
30
30
import nltk
31
31
nltk.download('punkt')` ;
32
32
33
- const PDF_FUNC = `def vp_pdf_get_sentence(fname_lst):
34
- '''
35
- Get sentence from pdf file by PyMuPDF
36
- '''
37
- df = pd.DataFrame()
38
- for fname in fname_lst:
39
- if fname.split('.')[-1] != 'pdf': continue
40
- try:
41
- doc = fitz.open(fname)
42
- sentence_lst = []
43
- for page in doc:
44
- block_lst = page.get_text('blocks')
45
-
46
- text_lst = [block[4] for block in block_lst if block[6] == 0]
47
- text = '\\n'.join(text_lst)
48
-
49
- sentence_lst.extend([sentence for sentence in nltk.sent_tokenize(text)])
50
-
51
- doc.close()
52
- except Exception as e:
53
- print(e)
54
- continue
55
-
56
- df_doc = pd.DataFrame({
57
- 'fname': fname.split('/')[-1],
58
- 'sentence': sentence_lst
59
- })
60
- df = pd.concat([df,df_doc])
61
-
62
- return df.reset_index().drop('index', axis=1)` ;
63
-
64
33
const PDF_CMD = 'df = vp_pdf_get_sentence(pdf_lst)\ndf'
65
34
/**
66
35
* PDF
@@ -98,7 +67,6 @@ nltk.download('punkt')`;
98
67
// click import
99
68
$ ( this . wrapSelector ( '.vp-pdf-import-btn' ) ) . on ( 'click' , function ( ) {
100
69
com_interface . insertCell ( 'code' , PDF_IMPORT ) ;
101
- com_interface . insertCell ( 'code' , PDF_FUNC ) ;
102
70
} ) ;
103
71
104
72
// click file navigation button
Original file line number Diff line number Diff line change
1
+ import pandas as pd
2
+ import numpy as np
3
+ import fitz
4
+ import nltk
5
+ nltk .download ('punkt' )
6
+
7
def vp_pdf_get_sentence(fname_lst):
    '''
    Get sentences from pdf files by PyMuPDF.

    Parameters
    ----------
    fname_lst : iterable of str
        File paths. An entry is processed only when the text after its
        last '.' is exactly 'pdf'; every other entry is skipped.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with the columns 'fname' (the part of the
        path after the last '/') and 'sentence', indexed 0..n-1.
        When no file produced a frame, an empty DataFrame without
        columns is returned.

    Notes
    -----
    A file that cannot be opened or parsed is reported with print() and
    skipped, so a single bad file does not abort the whole batch.
    '''
    # Collect one frame per file and concatenate once at the end instead of
    # re-copying the accumulated frame on every iteration.
    frames = []
    for fname in fname_lst:
        if fname.split('.')[-1] != 'pdf':
            continue

        sentence_lst = []
        try:
            doc = fitz.open(fname)
            try:
                for page in doc:
                    block_lst = page.get_text('blocks')

                    # PyMuPDF block tuples: item 4 is the block text and
                    # item 6 the block type (0 = text block).
                    text_lst = [block[4] for block in block_lst if block[6] == 0]
                    # Join with a real newline; an escaped backslash here
                    # would put a literal backslash and 'n' between blocks.
                    text = '\n'.join(text_lst)

                    sentence_lst.extend(nltk.sent_tokenize(text))
            finally:
                # Release the document even when a page fails to parse.
                doc.close()
        except Exception as e:
            # Deliberate best-effort: report the problem and move on.
            print(e)
            continue

        frames.append(pd.DataFrame({
            'fname': fname.split('/')[-1],
            'sentence': sentence_lst
        }))

    if not frames:
        # pd.concat() rejects an empty list; keep returning an empty frame.
        return pd.DataFrame().reset_index(drop=True)

    return pd.concat(frames, ignore_index=True)
37
+
38
def vp_drop_outlier(df, col, weight=1.5):
    '''
    Drop the rows of df whose value in col is an IQR outlier.

    A value is an outlier when it lies below q25 - weight * IQR or above
    q75 + weight * IQR, where q25 / q75 are the 25th / 75th percentiles
    of the column and IQR = q75 - q25.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : column label
        Column whose values are tested.
    weight : float, default 1.5
        Multiplier applied to the IQR to set the fence distance.

    Returns
    -------
    pandas.DataFrame
        A copy of df without the outlier rows. Rows whose value is NaN
        are kept, because NaN compares False against both fences.
    '''
    sr = df[col]
    if len(sr) == 0:
        # Nothing to measure; percentiles of an empty column are undefined.
        return df.copy()

    # nanpercentile ignores missing values; plain percentile would return
    # NaN fences as soon as one NaN is present and nothing would be dropped.
    q25, q75 = np.nanpercentile(sr.values, [25, 75])

    iqr_w = (q75 - q25) * weight
    val_l = q25 - iqr_w
    val_h = q75 + iqr_w

    # Filter by position with a boolean mask rather than dropping by index
    # label, so rows that merely share a label with an outlier survive.
    is_outlier = ((sr < val_l) | (sr > val_h)).values

    return df[~is_outlier].copy()
You can’t perform that action at this time.
0 commit comments