Commit 5916caa

Fine-tuning lexer

+ Token changes to be compatible with Python ast module
  - Rename columnno to col_offset
  - col_offset starts from 0, not from 1
  - Add end_col_offset property
+ EOL is always added by splitter
+ Refactoring and cleanup
1 parent 971a569 commit 5916caa
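
For reference, these are the same attribute names and conventions Python's own ast nodes use: col_offset is 0-based and end_col_offset points just past the node. A small standard-library illustration (requires Python 3.8+ for end_col_offset on ast nodes; the parsed expression is arbitrary):

import ast

# The Call node covering the whole expression 'keyword(arg)'.
node = ast.parse('keyword(arg)').body[0].value
print(node.col_offset, node.end_col_offset)   # 0 12 -- 0-based start, exclusive end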

File tree: 5 files changed, +765 -621 lines changed

src/robot/parsing/lexer/readers.py

Lines changed: 17 additions & 17 deletions
@@ -21,7 +21,7 @@
 from .context import TestCaseFileContext, ResourceFileContext
 from .lexers import FileLexer
 from .splitter import Splitter
-from .tokens import EOL, EOS, Token
+from .tokens import EOS, Token


 def get_tokens(source, data_only=False):
@@ -92,7 +92,7 @@ def get_tokens(self):
         statements = self._handle_old_for(self.statements)
         if not self.data_only:
             statements = chain.from_iterable(
-                self._split_trailing_comment_and_empty_lines(s)
+                self._split_trailing_commented_and_empty_lines(s)
                 for s in statements
             )
         # Setting local variables is performance optimization to avoid
@@ -151,36 +151,36 @@ def _get_first_data_token(self, statement):
                 return token
         return None

-    def _split_trailing_comment_and_empty_lines(self, statement):
+    def _split_trailing_commented_and_empty_lines(self, statement):
         lines = list(self._split_to_lines(statement))
-        split_statements = []
+        commented_or_empty = []
         for line in reversed(lines):
-            is_split = False
-            for token in line:
-                if token.type not in (token.IGNORE, token.SEPARATOR):
-                    is_split = token.type in (token.EOL, token.COMMENT)
-                    break
-            if not is_split:
+            if not self._is_commented_or_empty(line):
                 break
-            split_statements.append(line)
+            commented_or_empty.append(line)
             lines.pop()
         yield list(chain.from_iterable(lines))
-        for split in reversed(split_statements):
-            yield split
+        for line in reversed(commented_or_empty):
+            yield line

     def _split_to_lines(self, statement):
         current = []
-        eol = Token.EOL
         for token in statement:
             current.append(token)
-            if token.type == eol:
+            if token.type == Token.EOL:
                 yield current
                 current = []
         if current:
-            if current[-1].type != eol:
-                current.append(EOL.from_token(current[-1]))
             yield current

+    def _is_commented_or_empty(self, line):
+        separator_or_ignore = (Token.SEPARATOR, Token.IGNORE)
+        comment_or_eol = (Token.COMMENT, Token.EOL)
+        for token in line:
+            if token.type not in separator_or_ignore:
+                return token.type in comment_or_eol
+        return False
+

 class TestCaseFileReader(BaseReader):
     context_class = TestCaseFileContext
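
The net effect of this refactoring is easier to see outside the class. Below is a minimal self-contained sketch of the new splitting logic; the Tok namedtuple, the type constants, and the sample lines are stand-ins invented for illustration, not Robot Framework's real Token:

from itertools import chain
from collections import namedtuple

# Stand-in for robot's Token; the real code works on Token instances inside BaseReader.
Tok = namedtuple('Tok', 'type value')
SEPARATOR, IGNORE, COMMENT, EOL, KEYWORD, ARGUMENT = (
    'SEPARATOR', 'IGNORE', 'COMMENT', 'EOL', 'KEYWORD', 'ARGUMENT')

def is_commented_or_empty(line):
    # The first token that is neither a separator nor ignored decides:
    # a comment or a bare EOL means the line carries no data.
    for token in line:
        if token.type not in (SEPARATOR, IGNORE):
            return token.type in (COMMENT, EOL)
    return False

def split_trailing_commented_and_empty_lines(lines):
    trailing = []
    for line in reversed(lines):
        if not is_commented_or_empty(line):
            break
        trailing.append(line)
        lines.pop()
    yield list(chain.from_iterable(lines))    # the statement proper
    for line in reversed(trailing):           # then each trailing line on its own
        yield line

lines = [
    [Tok(KEYWORD, 'Log'), Tok(SEPARATOR, '    '), Tok(ARGUMENT, 'Hello'), Tok(EOL, '\n')],
    [Tok(COMMENT, '# trailing comment'), Tok(EOL, '\n')],
    [Tok(EOL, '\n')],
]
for part in split_trailing_commented_and_empty_lines(lines):
    print([t.type for t in part])
# ['KEYWORD', 'SEPARATOR', 'ARGUMENT', 'EOL']
# ['COMMENT', 'EOL']
# ['EOL']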

src/robot/parsing/lexer/splitter.py

Lines changed: 21 additions & 19 deletions
@@ -43,16 +43,15 @@ def _split_line(self, line, lineno, data_only=False):
             splitter = self._split_from_spaces
         else:
             splitter = self._split_from_pipes
-        columnno = 1
+        offset = 0
         data, sepa = Token.DATA, Token.SEPARATOR
         for value, is_data in splitter(rstrip(line)):
             if is_data or not data_only:
-                yield Token(data if is_data else sepa, value, lineno, columnno)
-            columnno += len(value)
+                yield Token(data if is_data else sepa, value, lineno, offset)
+            offset += len(value)
         if not data_only:
-            trailing_whitespace = re.search(r'\s+$', line, flags=re.UNICODE)
-            if trailing_whitespace:
-                yield Token(Token.EOL, trailing_whitespace.group(), lineno, columnno)
+            trailing_whitespace = re.search(r'\s*$', line, flags=re.UNICODE)
+            yield Token(Token.EOL, trailing_whitespace.group(), lineno, offset)

     def _split_from_spaces(self, line):
         for index, value in enumerate(self._space_splitter.split(line)):
@@ -82,47 +81,50 @@ def _handle_comments(self, tokens):
         has_data = False
         commented = False
         for token in tokens:
-            if token.type == token.DATA:
+            if token.type == Token.DATA:
                 if token.value.startswith('#') or commented:
-                    token.type = token.COMMENT
+                    token.type = Token.COMMENT
                     commented = True
                 elif token.value:
                     has_data = True
         return has_data

     def _handle_continuation(self, tokens):
         for token in tokens:
-            if token.value == '...' and token.type == token.DATA:
-                token.type = token.CONTINUATION
+            if token.value == '...' and token.type == Token.DATA:
+                token.type = Token.CONTINUATION
                 return True
-            elif token.value and token.type != token.SEPARATOR:
+            elif token.value and token.type != Token.SEPARATOR:
                 return False
         return False

     def _remove_trailing_empty(self, tokens):
         for token in reversed(tokens):
-            if not token.value:
+            if not token.value and token.type != Token.EOL:
                 tokens.remove(token)
-            elif token.type == token.DATA:
+            elif token.type == Token.DATA:
                 break

     def _remove_leading_empty(self, tokens):
+        data_or_continuation = (Token.DATA, Token.CONTINUATION)
         for token in list(tokens):
             if not token.value:
                 tokens.remove(token)
-            elif token.type in (token.DATA, token.CONTINUATION):
+            elif token.type in data_or_continuation:
                 break

     def _ensure_data_after_continuation(self, tokens):
-        if not any(t.type == t.DATA for t in tokens):
+        data = Token.DATA
+        if not any(t.type == data for t in tokens):
             cont = self._find_continuation(tokens)
-            data = Token(Token.DATA, '', cont.lineno, cont.columnno + 3)
-            tokens.insert(tokens.index(cont) + 1, data)
+            token = Token(data, '', cont.lineno, cont.end_col_offset)
+            tokens.insert(tokens.index(cont) + 1, token)

     def _find_continuation(self, tokens):
         for token in tokens:
-            if token.type == token.CONTINUATION:
+            if token.type == Token.CONTINUATION:
                 return token

     def _remove_non_data(self, tokens):
-        return [t for t in tokens if t.type == t.DATA]
+        data = Token.DATA
+        return [t for t in tokens if t.type == data]
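
The regex change in _split_line is what implements "EOL is always added by splitter": \s*$ matches the (possibly empty) tail of every line, whereas the old \s+$ only matched when trailing whitespace actually existed. A quick standalone check (the sample strings are made up):

import re

for line in ['Keyword    argument', 'Keyword    argument   ']:
    old = re.search(r'\s+$', line, flags=re.UNICODE)   # old pattern: may not match
    new = re.search(r'\s*$', line, flags=re.UNICODE)   # new pattern: always matches
    print(repr(old.group() if old else None), repr(new.group()))
# 'None' ''    -> an EOL token with an empty value is now emitted
# '   ' '   '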

src/robot/parsing/lexer/tokens.py

Lines changed: 12 additions & 17 deletions
@@ -56,10 +56,10 @@ class Token(object):
     END = 'END'

     SEPARATOR = 'SEPARATOR'
-    EOL = 'EOL'
     COMMENT = 'COMMENT'
     CONTINUATION = 'CONTINUATION'
     IGNORE = 'IGNORE'
+    EOL = 'EOL'
     EOS = 'EOS'
     ERROR = 'ERROR'
     DATA = 'DATA'
@@ -101,32 +101,27 @@ class Token(object):
         KEYWORD_HEADER
     )

-    __slots__ = ['type', 'value', 'lineno', 'columnno', 'error']
+    __slots__ = ['type', 'value', 'lineno', 'col_offset', 'error']

-    def __init__(self, type, value='', lineno=-1, columnno=-1):
+    def __init__(self, type, value='', lineno=-1, col_offset=-1):
         self.type = type
         self.value = value
         self.lineno = lineno
-        self.columnno = columnno
+        self.col_offset = col_offset
         self.error = None

+    @property
+    def end_col_offset(self):
+        if self.col_offset == -1:
+            return -1
+        return self.col_offset + len(self.value)
+
     def __unicode__(self):
         return self.value

     def __repr__(self):
         return 'Token(%s, %r, %s, %s)' % (self.type, self.value,
-                                          self.lineno, self.columnno)
-
-
-class EOL(Token):
-    __slots__ = []
-
-    def __init__(self, value='', lineno=-1, columnno=-1):
-        Token.__init__(self, Token.EOL, value, lineno, columnno)
-
-    @classmethod
-    def from_token(cls, token):
-        return EOL('', token.lineno, token.columnno + len(token.value))
+                                          self.lineno, self.col_offset)


 class EOS(Token):
@@ -137,4 +132,4 @@ def __init__(self, lineno=-1, columnno=-1):

     @classmethod
     def from_token(cls, token):
-        return EOS(token.lineno, token.columnno + len(token.value))
+        return EOS(token.lineno, token.end_col_offset)
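
After these changes a token's position information reads like the attributes on Python ast nodes. A small usage sketch, assuming a Robot Framework checkout that contains this commit (the import path matches the file shown above; released versions may differ):

from robot.parsing.lexer.tokens import Token, EOS

# 'Log' starts at the beginning of its line, so col_offset is 0 rather than 1.
token = Token(Token.DATA, 'Log', lineno=1, col_offset=0)
print(token.col_offset, token.end_col_offset)    # 0 3

# EOS.from_token() now reuses end_col_offset instead of recomputing it.
eos = EOS.from_token(token)
print(eos.lineno, eos.col_offset)                # 1 3

# Tokens created without a position keep the -1 sentinel for both offsets.
print(Token(Token.EOL).col_offset, Token(Token.EOL).end_col_offset)   # -1 -1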
