1
1
# -*- coding: utf-8 -*-
2
+ from __future__ import print_function
2
3
import re
3
4
import os
4
5
import sys
6
+
7
+ try :
8
+ xrange # Python 2
9
+ except NameError :
10
+ xrange = range # Python 3
11
+
5
12
class SEG (object ):
6
13
def __init__(self):
    """Load the segmentation dictionaries located next to this module.

    Each line of ``main.dic`` is handed to ``self.set`` and each line of
    ``suffix.dic`` is stored, decoded from UTF-8, in the set
    ``self.specialwords``.  Progress messages are written to stderr.

    Raises:
        IOError/OSError: if either dictionary file cannot be opened.
    """
    def _read_lines(path):
        # ``file()`` no longer exists on Python 3; ``open`` works on both
        # major versions.  Binary mode keeps decoding explicit, and the
        # ``with`` block closes the handle, which the old code never did.
        with open(path, "rb") as handle:
            return [line.rstrip() for line in handle]

    # Directory of this module, resolved against the current working dir.
    base_dir = os.path.normpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

    self.d = {}
    print("loading dict...", file=sys.stderr)

    main_words = _read_lines(os.path.join(base_dir, "main.dic"))
    if str is not bytes:
        # Python 3: pass text (the native ``str``) to ``self.set``.
        # On Python 2 the byte strings are passed through unchanged, exactly
        # as the previous ``file()``-based code did.
        # NOTE(review): assumes main.dic is UTF-8 like suffix.dic -- confirm.
        main_words = [word.decode("utf-8") for word in main_words]
    self.set(main_words)

    # Strip on the raw bytes first, then decode, matching the old
    # ``x.rstrip().decode('utf-8')`` order.
    self.specialwords = set(
        word.decode("utf-8")
        for word in _read_lines(os.path.join(base_dir, "suffix.dic")))

    print('dict ok.', file=sys.stderr)
15
22
# Set the dictionary from a list of keywords.
16
23
def set (self ,keywords ):
17
24
p = self .d
@@ -33,8 +40,6 @@ def set(self,keywords):
33
40
q = p
34
41
k = char
35
42
p = p [char ]
36
-
37
- pass
38
43
39
44
def _binary_seg (self ,s ):
40
45
ln = len (s )
@@ -47,7 +52,7 @@ def _binary_seg(self,s):
47
52
return R
48
53
49
54
def _pro_unreg (self ,piece ):
50
- #print piece
55
+ #print( piece)
51
56
R = []
52
57
tmp = re .sub (u"。|,|,|!|…|!|《|》|<|>|\" |'|:|:|?|\?|、|\||“|”|‘|’|;|—|(|)|·|\(|\)| " ," " ,piece ).split ()
53
58
ln1 = len (tmp )
@@ -77,7 +82,7 @@ def cut(self,text):
77
82
mem2 = None
78
83
while i - j > 0 :
79
84
t = text [i - j - 1 ].lower ()
80
- #print i,j,t,mem
85
+ #print( i,j,t,mem)
81
86
if not (t in p ):
82
87
if (mem != None ) or (mem2 != None ):
83
88
if mem != None :
@@ -86,9 +91,9 @@ def cut(self,text):
86
91
elif mem2 != None :
87
92
delta = mem2 [0 ]- i
88
93
if delta >= 1 :
89
- if (delta < 5 ) and (re .search (ur "[\w\u2E80-\u9FFF]" ,t )!= None ):
94
+ if (delta < 5 ) and (re .search (u "[\w\u2E80 -\u9FFF ]" ,t )!= None ):
90
95
pre = text [i - j ]
91
- #print pre
96
+ #print( pre)
92
97
if not (pre in self .specialwords ):
93
98
i ,j ,z ,q = mem2
94
99
del recognised [q :]
@@ -99,7 +104,7 @@ def cut(self,text):
99
104
unreg_tmp = self ._pro_unreg (text [i :z ])
100
105
recognised .extend (unreg_tmp )
101
106
recognised .append (text [i - j :i ])
102
- #print text[i-j:i],mem2
107
+ #print( text[i-j:i],mem2)
103
108
i = i - j
104
109
z = i
105
110
j = 0
@@ -113,18 +118,18 @@ def cut(self,text):
113
118
if chr (11 ) in p :
114
119
if j <= 2 :
115
120
mem = i ,j ,z
116
- #print text[i-1]
121
+ #print( text[i-1])
117
122
if (z - i < 2 ) and (text [i - 1 ] in self .specialwords ) and ((mem2 == None ) or ((mem2 != None and mem2 [0 ]- i > 1 ))):
118
- #print text[i-1]
123
+ #print( text[i-1])
119
124
mem = None
120
125
mem2 = i ,j ,z ,len (recognised )
121
126
p = self .d
122
127
i -= 1
123
128
j = 0
124
129
continue
125
- #print mem
130
+ #print( mem)
126
131
p = self .d
127
- #print i,j,z,text[i:z]
132
+ #print( i,j,z,text[i:z])
128
133
if ((i < ln ) and (i < z )):
129
134
unreg_tmp = self ._pro_unreg (text [i :z ])
130
135
recognised .extend (unreg_tmp )
@@ -134,11 +139,11 @@ def cut(self,text):
134
139
j = 0
135
140
mem = None
136
141
mem2 = None
137
- #print mem
142
+ #print( mem)
138
143
if mem != None :
139
144
i ,j ,z = mem
140
145
recognised .extend (self ._pro_unreg (text [i :z ]))
141
146
recognised .append (text [i - j :i ])
142
147
else :
143
148
recognised .extend (self ._pro_unreg (text [i - j :z ]))
144
- return recognised
149
+ return recognised
0 commit comments