File tree 1 file changed +6
-4
lines changed
1 file changed +6
-4
lines changed Original file line number Diff line number Diff line change @@ -25,7 +25,8 @@ define ([
25
25
26
26
const PDF_IMPORT = `import pandas as pd
27
27
import fitz
28
- from nltk.tokenize import sent_tokenize` ;
28
+ import nltk
29
+ nltk.download('punkt')` ;
29
30
30
31
const PDF_FUNC = `def vp_pdf_get_sentence(fname_lst):
31
32
'''
@@ -43,14 +44,15 @@ from nltk.tokenize import sent_tokenize`;
43
44
text_lst = [block[4] for block in block_lst if block[6] == 0]
44
45
text = '\\n'.join(text_lst)
45
46
46
- sentence_lst.extend([sentence for sentence in sent_tokenize(text)])
47
+ sentence_lst.extend([sentence for sentence in nltk. sent_tokenize(text)])
47
48
48
49
doc.close()
49
- except:
50
+ except Exception as e:
51
+ print(e)
50
52
continue
51
53
52
54
df_doc = pd.DataFrame({
53
- 'fname': fname,
55
+ 'fname': fname.split('/')[-1] ,
54
56
'sentence': sentence_lst
55
57
})
56
58
df = pd.concat([df,df_doc])
You can’t perform that action at this time.
0 commit comments