@@ -487,7 +487,7 @@ def tgt(self):
487
487
return self .CHART ,
488
488
489
489
@classmethod
490
- def totree (cls , tokens , root = '' ):
490
+ def totree (cls , tokens , root = '' , special_tokens = { '(' : '-LRB-' , ')' : '-RRB-' } ):
491
491
r"""
492
492
Converts a list of tokens to a :class:`nltk.tree.Tree`.
493
493
Missing fields are filled with underscores.
@@ -497,6 +497,9 @@ def totree(cls, tokens, root=''):
497
497
This can be either a list of words or word/pos pairs.
498
498
root (str):
499
499
The root label of the tree. Default: ''.
500
+ special_tokens (dict):
501
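+ A dict mapping special tokens to the placeholders substituted while the tree string is parsed, so that raw brackets do not crash tree construction; the original tokens are restored in the returned tree.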
502
+ Default: {'(': '-LRB-', ')': '-RRB-'}.
500
503
501
504
Returns:
502
505
A :class:`nltk.tree.Tree` object.
@@ -508,8 +511,15 @@ def totree(cls, tokens, root=''):
508
511
509
512
if isinstance (tokens [0 ], str ):
510
513
tokens = [(token , '_' ) for token in tokens ]
511
- tree = ' ' .join ([f"( ({ pos } { word } ))" for word , pos in tokens ])
512
- return nltk .Tree .fromstring (f"({ root } { tree } )" )
514
+ mapped = []
515
+ for i , (word , pos ) in enumerate (tokens ):
516
+ if word in special_tokens :
517
+ tokens [i ] = (special_tokens [word ], pos )
518
+ mapped .append ((i , word ))
519
+ tree = nltk .Tree .fromstring (f"({ root } { ' ' .join ([f'( ({ pos } { word } ))' for word , pos in tokens ])} )" )
520
+ for i , word in mapped :
521
+ tree [i ][0 ][0 ] = word
522
+ return tree
513
523
514
524
@classmethod
515
525
def binarize (cls , tree ):
0 commit comments