
Commit 0ac31a8

whitequark committed
Implement lexing of Python 3.6 (format strings are stubbed out).
1 parent 1f946e8 commit 0ac31a8

File tree

2 files changed: 82 additions, 21 deletions


pythonparser/lexer.py

Lines changed: 56 additions & 21 deletions
@@ -78,6 +78,7 @@ class Lexer:
         (3, 3): _reserved_3_1,
         (3, 4): _reserved_3_1,
         (3, 5): _reserved_3_5,
+        (3, 6): _reserved_3_5,
     }
     """
     A map from a tuple (*major*, *minor*) corresponding to Python version to
@@ -86,6 +87,9 @@ class Lexer:
 
     _string_prefixes_3_1 = frozenset(["", "r", "b", "br"])
     _string_prefixes_3_3 = frozenset(["", "r", "u", "b", "br", "rb"])
+    _string_prefixes_3_6 = _string_prefixes_3_3.union(frozenset([
+        "f", "F", "fr", "Fr", "fR", "FR", "rf", "rF", "Rf", "RF"
+    ]))
 
     # holy mother of god why
     _string_prefixes = {
@@ -97,6 +101,7 @@ class Lexer:
         (3, 3): _string_prefixes_3_3,
         (3, 4): _string_prefixes_3_3,
         (3, 5): _string_prefixes_3_3,
+        (3, 6): _string_prefixes_3_6,
     }
     """
     A map from a tuple (*major*, *minor*) corresponding to Python version to
@@ -123,7 +128,8 @@ def __init__(self, source_buffer, version, diagnostic_engine, interactive=False)
         try:
             reserved = self._reserved[version]
         except KeyError:
-            raise NotImplementedError("pythonparser.lexer.Lexer cannot lex Python %s" % str(version))
+            raise NotImplementedError("pythonparser.lexer.Lexer cannot lex Python %s" %
+                                      str(version))
 
         # Sort for the regexp to obey longest-match rule.
         re_reserved = sorted(reserved, reverse=True, key=len)
@@ -136,6 +142,14 @@ def __init__(self, source_buffer, version, diagnostic_engine, interactive=False)
         else:
             id_xid = "X"
 
+        # Python 3.6+ permits underscores as number delimiters
+        if self.version >= (3, 6):
+            underscore = "_?"
+            digit = "[0-9] (?: _? [0-9] )*"
+        else:
+            underscore = ""
+            digit = "[0-9]+"
+
         # To speed things up on CPython, we use the re module to generate a DFA
         # from our token set and execute it in C. Every result yielded by
         # iterating this regular expression has exactly one non-empty group
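
For reference, a minimal standalone sketch (not pythonparser code) of what the new `digit` helper expands to under (3, 6): a single underscore is permitted only between two digits, so the pattern itself rejects doubled, leading, and trailing underscores.

    import re

    # Hypothetical standalone check of the digit sub-pattern defined in the hunk above.
    digit = re.compile("[0-9] (?: _? [0-9] )*", re.VERBOSE)

    assert digit.fullmatch("1_000") is not None   # single underscores between digits
    assert digit.fullmatch("1__000") is None      # doubled underscore rejected
    assert digit.fullmatch("_1000") is None       # leading underscore rejected
    assert digit.fullmatch("1000_") is None       # trailing underscore rejected
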
@@ -156,19 +170,21 @@ def __init__(self, source_buffer, version, diagnostic_engine, interactive=False)
         ([\n]|[\r][\n]|[\r])      # 3 newline
         | (\#.*)                  # 4 comment
         | (                       # 5 floating point or complex literal
-              (?: [0-9]* \. [0-9]+
-                | [0-9]+ \.?
-              ) [eE] [+-]? [0-9]+
-            | [0-9]* \. [0-9]+
-            | [0-9]+ \.
+              (?: \. {d}
+                | {d} \. {d}
+                | {d} \.?
+              ) [eE] [+-]? {d}
+            | \. {d}
+            | {d} \. {d}
+            | {d} \.
           ) ([jJ])?               # ?6 complex suffix
-        | ([0-9]+) [jJ]           # 7 complex literal
+        | ({d}) [jJ]              # 7 complex literal
         | (?:                     # integer literal
-            ( [1-9] [0-9]* )                 # 8 dec
-          | 0[oO] ( [0-7]+ )                 # 9 oct
-          | 0[xX] ( [0-9A-Fa-f]+ )           # 10 hex
-          | 0[bB] ( [01]+ )                  # 11 bin
-          | ( [0-9] [0-9]* )                 # 12 bare oct
+            ( [1-9] (?: {u} [0-9] )* )       # 8 dec
+          | 0[oO] ( (?: {u} [0-7] )+ )       # 9 oct
+          | 0[xX] ( (?: {u} [0-9A-Fa-f] )+ ) # 10 hex
+          | 0[bB] ( (?: {u} [01] )+ )        # 11 bin
+          | ( [0-9] (?: {u} [0-9] )* )       # 12 bare oct
           )
           [Ll]?
         | ([BbUu]?[Rr]?)          # ?13 string literal options
@@ -185,8 +201,14 @@ def __init__(self, source_buffer, version, diagnostic_engine, interactive=False)
         | (\p{{{id_xid}ID_Start}}\p{{{id_xid}ID_Continue}}*) # 23 Unicode identifier
         | ($)                     # 24 end-of-file
         )
-        """.format(keywords=re_keywords, operators=re_operators,
-                   id_xid=id_xid), re.VERBOSE|re.UNICODE)
+        """.format(
+            u=underscore,
+            d=digit,
+            keywords=re_keywords,
+            operators=re_operators,
+            id_xid=id_xid
+        ),
+        re.VERBOSE|re.UNICODE)
 
         # These are identical for all lexer instances.
         _lex_escape_pattern = r"""
@@ -327,25 +349,34 @@ def _refill(self, eof_token):
                 self.new_line = False
 
             if match.group(5) is not None: # floating point or complex literal
+                literal = match.group(5).replace("_", "")
                 if match.group(6) is None:
-                    self.queue.append(Token(tok_range, "float", float(match.group(5))))
+                    self.queue.append(Token(tok_range, "float",
+                                            float(literal)))
                 else:
-                    self.queue.append(Token(tok_range, "complex", float(match.group(5)) * 1j))
+                    self.queue.append(Token(tok_range, "complex",
+                                            float(literal) * 1j))
 
             elif match.group(7) is not None: # complex literal
-                self.queue.append(Token(tok_range, "complex", int(match.group(7)) * 1j))
+                literal = match.group(7).replace("_", "")
+                self.queue.append(Token(tok_range, "complex",
+                                        int(literal) * 1j))
 
             elif match.group(8) is not None: # integer literal, dec
-                self.queue.append(self._make_int_token(tok_range, match.group(8), 10))
+                literal = match.group(8).replace("_", "")
+                self.queue.append(self._make_int_token(tok_range, literal, 10))
 
             elif match.group(9) is not None: # integer literal, oct
-                self.queue.append(self._make_int_token(tok_range, match.group(9), 8))
+                literal = match.group(9).replace("_", "")
+                self.queue.append(self._make_int_token(tok_range, literal, 8))
 
             elif match.group(10) is not None: # integer literal, hex
-                self.queue.append(self._make_int_token(tok_range, match.group(10), 16))
+                literal = match.group(10).replace("_", "")
+                self.queue.append(self._make_int_token(tok_range, literal, 16))
 
             elif match.group(11) is not None: # integer literal, bin
-                self.queue.append(self._make_int_token(tok_range, match.group(11), 2))
+                literal = match.group(11).replace("_", "")
+                self.queue.append(self._make_int_token(tok_range, literal, 2))
 
             elif match.group(12) is not None: # integer literal, bare oct
                 if len(match.group(12)) > 1 and self.version >= (3, 0):
@@ -410,6 +441,10 @@ def _string_literal(self, options, begin_span, data, data_span, end_span):
                 {"prefix": options, "major": self.version[0], "minor": self.version[1]},
                 begin_range)
             self.diagnostic_engine.process(error)
+        if "f" in options or "F" in options:
+            error = diagnostic.Diagnostic(
+                "error", "pythonparser does not yet support format strings",
+                begin_range)
 
         self.queue.append(Token(begin_range, "strbegin", options))
         self.queue.append(Token(data_range,
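
After the regular expression has validated underscore placement, the `_refill` changes above convert the matched text by simply stripping underscores before handing it to Python's own numeric constructors, so the lexer does not rely on the host interpreter accepting underscored literals itself. A hedged illustration of that conversion step, using values from the new tests:

    # Underscores are purely separators, so the numeric value is unchanged.
    assert float("1_2.3_4e1_0".replace("_", "")) == 12.34e10

    # For radix-prefixed integers the regex captures only the digits after the
    # prefix (e.g. group 10 for hex), which may begin with an underscore as in
    # "0x_1_2_3_a_f"; the base is then passed explicitly.
    assert int("_1_2_3_a_f".replace("_", ""), 16) == 0x123af
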

pythonparser/test/test_lexer.py

Lines changed: 26 additions & 0 deletions
@@ -118,12 +118,28 @@ def test_float(self):
         self.assertLexes("0e-0",
                          "float", 0.0)
 
+    def test_float_underscore(self):
+        self.assertLexesVersions("1_2.", [(3, 6)],
+                                 "float", 12.)
+        self.assertLexesVersions("1_2.3_4", [(3, 6)],
+                                 "float", 12.34)
+        self.assertLexesVersions("1_2.e1_0", [(3, 6)],
+                                 "float", 12e10)
+        self.assertLexesVersions(".1_2e1_0", [(3, 6)],
+                                 "float", .12e10)
+        self.assertLexesVersions("1_2.3_4e1_0", [(3, 6)],
+                                 "float", 12.34e10)
+
     def test_complex(self):
         self.assertLexes("1e+1j",
                          "complex", 10j)
         self.assertLexes("10j",
                          "complex", 10j)
 
+    def test_complex_underscore(self):
+        self.assertLexesVersions("1_0j", [(3, 6)],
+                                 "complex", 10j)
+
     def test_integer(self):
         self.assertLexes("0",
                          "int", 0)
@@ -160,6 +176,16 @@ def test_integer_py3(self):
             [("error", "in Python 3, long integer literals were removed", (3, 4))],
             "int", 123)
 
+    def test_integer_underscore(self):
+        self.assertLexesVersions("1_2_3", [(3, 6)],
+                                 "int", 123)
+        self.assertLexesVersions("0o_1_2_3", [(3, 6)],
+                                 "int", 0o123)
+        self.assertLexesVersions("0x_1_2_3_a_f", [(3, 6)],
+                                 "int", 0x123af)
+        self.assertLexesVersions("0b_0_1_0_1", [(3, 6)],
+                                 "int", 0b0101)
+
     def test_string_literal(self):
         for version in self.VERSIONS:
             if version < (3,):
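
Finally, a hedged end-to-end sketch of exercising the new (3, 6) support outside the test suite. The helper names used here (source.Buffer, diagnostic.Engine, Lexer.next(), Token.kind/Token.value) are assumptions about pythonparser's internal API rather than anything introduced by this commit; only the Lexer constructor signature itself appears in the diff above.

    from pythonparser import diagnostic, lexer, source

    # Assumed helpers: a buffer wrapping the input text and a diagnostic engine.
    buf = source.Buffer("1_000_000\n", "<example>")
    engine = diagnostic.Engine()

    lex = lexer.Lexer(buf, (3, 6), engine)  # the version tuple now accepts (3, 6)
    token = lex.next()                      # assumed accessor for the next token
    print(token.kind, token.value)          # expected: int 1000000
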
