Skip to content

Commit c74cd90

Browse files
committed
Handle mixed spaces and tabs.
This allows the parser to handle indentation that mixes tabs and spaces. Currently it requires that spaces come after any tabs, which is slightly stricter than python3. It also requires that, when two neighboring indentation levels are compared, one of them has neither fewer tabs nor fewer spaces than the other, so that the tab size can't make the perceived indentation differ from what the parser interprets. I didn't opt to implement PartialOrd on IndentationLevel because I wasn't sure that the comparison logic would meet the logical requirements for that trait. One could easily switch to having it implement PartialOrd, though. This would necessitate switching to a manual implementation of PartialEq so that its behavior 'matches'. It's unclear from the docs what 'matches' exactly means, so for now I'm avoiding implementing the traits.
1 parent 62c53d8 commit c74cd90

File tree

2 files changed

+148
-25
lines changed

2 files changed

+148
-25
lines changed

parser/src/lexer.rs

+137-25
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,49 @@ use num_bigint::BigInt;
66
use num_traits::Num;
77
use std::collections::HashMap;
88
use std::str::FromStr;
9+
use std::cmp::Ordering;
10+
11+
/// Leading whitespace of a line, recorded as separate tab and space counts
/// instead of a single column number, so that two levels can be compared
/// without assuming how wide a tab is.
#[derive(Clone, Copy, PartialEq, Debug)]
12+
struct IndentationLevel {
13+
/// Number of tab characters in the indentation.
tabs: usize,
14+
/// Number of space characters in the indentation.
spaces: usize,
15+
}
16+
17+
impl IndentationLevel {
18+
/// Creates the zero indentation level (no tabs, no spaces).
fn new() -> IndentationLevel {
19+
IndentationLevel {
20+
tabs: 0,
21+
spaces: 0,
22+
}
23+
}
24+
/// Compares `self` against `other` without assuming a tab width.
///
/// Returns:
/// * `Some(Ordering::Less)` when `self` has fewer tabs and no more spaces
///   than `other`;
/// * `Some(Ordering::Greater)` when `self` has more tabs and no fewer
///   spaces than `other`;
/// * the ordering of the space counts when the tab counts are equal
///   (which includes `Some(Ordering::Equal)` for identical levels);
/// * `None` when the tab and space counts differ in opposite directions,
///   because the result would then depend on the tab size.
fn compare_strict(&self, other: &IndentationLevel) -> Option<Ordering> {
25+
// We only know for sure that we're smaller or bigger if tabs
26+
// and spaces both differ in the same direction. Otherwise we're
27+
// dependent on the size of tabs.
28+
if self.tabs < other.tabs {
29+
if self.spaces <= other.spaces {
30+
Some(Ordering::Less)
31+
} else {
32+
// Fewer tabs but more spaces: ambiguous.
None
33+
}
34+
} else if self.tabs > other.tabs {
35+
if self.spaces >= other.spaces {
36+
Some(Ordering::Greater)
37+
} else {
38+
// More tabs but fewer spaces: ambiguous.
None
39+
}
40+
41+
} else {
42+
// Same number of tabs: the space counts alone decide.
Some(self.spaces.cmp(&other.spaces))
43+
}
44+
}
45+
}
946

1047
pub struct Lexer<T: Iterator<Item = char>> {
1148
chars: T,
1249
at_begin_of_line: bool,
1350
nesting: usize, // Amount of parenthesis
14-
indentation_stack: Vec<usize>,
51+
indentation_stack: Vec<IndentationLevel>,
1552
pending: Vec<Spanned<Tok>>,
1653
chr0: Option<char>,
1754
chr1: Option<char>,
@@ -218,7 +255,7 @@ where
218255
chars: input,
219256
at_begin_of_line: true,
220257
nesting: 0,
221-
indentation_stack: vec![0],
258+
indentation_stack: vec![IndentationLevel::new()],
222259
pending: Vec::new(),
223260
chr0: None,
224261
location: Location::new(0, 0),
@@ -576,13 +613,24 @@ where
576613
self.at_begin_of_line = false;
577614

578615
// Determine indentation:
579-
let mut col: usize = 0;
616+
let mut spaces: usize = 0;
617+
let mut tabs: usize = 0;
580618
loop {
581619
match self.chr0 {
582620
Some(' ') => {
583621
self.next_char();
584-
col += 1;
585-
}
622+
spaces += 1;
623+
},
624+
Some('\t') => {
625+
if spaces != 0 {
626+
// Don't allow tabs after spaces as part of indentation.
627+
// This is technically stricter than python3 but spaces before
628+
// tabs is even more insane than mixing spaces and tabs.
629+
panic!("Tabs not allowed as part of indentation after spaces");
630+
}
631+
self.next_char();
632+
tabs += 1;
633+
},
586634
Some('#') => {
587635
self.lex_comment();
588636
self.at_begin_of_line = true;
@@ -601,34 +649,54 @@ where
601649
}
602650
}
603651

652+
let indentation_level = IndentationLevel {
653+
spaces,
654+
tabs,
655+
};
656+
604657
if self.nesting == 0 {
605658
// Determine indent or dedent:
606659
let current_indentation = *self.indentation_stack.last().unwrap();
607-
if col == current_indentation {
608-
// Same same
609-
} else if col > current_indentation {
610-
// New indentation level:
611-
self.indentation_stack.push(col);
612-
let tok_start = self.get_pos();
613-
let tok_end = tok_start.clone();
614-
return Some(Ok((tok_start, Tok::Indent, tok_end)));
615-
} else if col < current_indentation {
616-
// One or more dedentations
617-
// Pop off other levels until col is found:
618-
619-
while col < *self.indentation_stack.last().unwrap() {
620-
self.indentation_stack.pop().unwrap();
660+
let ordering = indentation_level.compare_strict(&current_indentation);
661+
match ordering {
662+
Some(Ordering::Equal) => {
663+
// Same same
664+
},
665+
Some(Ordering::Greater) => {
666+
// New indentation level:
667+
self.indentation_stack.push(indentation_level);
621668
let tok_start = self.get_pos();
622669
let tok_end = tok_start.clone();
623-
self.pending.push(Ok((tok_start, Tok::Dedent, tok_end)));
670+
return Some(Ok((tok_start, Tok::Indent, tok_end)));
624671
}
672+
Some(Ordering::Less) => {
673+
// One or more dedentations
674+
// Pop off other levels until col is found:
675+
676+
loop {
677+
let ordering = indentation_level.compare_strict(self.indentation_stack.last().unwrap());
678+
match ordering {
679+
Some(Ordering::Less) => {
680+
self.indentation_stack.pop();
681+
let tok_start = self.get_pos();
682+
let tok_end = tok_start.clone();
683+
self.pending.push(Ok((tok_start, Tok::Dedent, tok_end)));
684+
},
685+
None => panic!("inconsistent use of tabs and spaces in indentation"),
686+
_ => {
687+
break;
688+
},
689+
};
690+
}
625691

626-
if col != *self.indentation_stack.last().unwrap() {
627-
// TODO: handle wrong indentations
628-
panic!("Non matching indentation levels!");
629-
}
692+
if indentation_level != *self.indentation_stack.last().unwrap() {
693+
// TODO: handle wrong indentations
694+
panic!("Non matching indentation levels!");
695+
}
630696

631-
return Some(self.pending.remove(0));
697+
return Some(self.pending.remove(0));
698+
}
699+
None => panic!("inconsistent use of tabs and spaces in indentation"),
632700
}
633701
}
634702
}
@@ -1233,12 +1301,56 @@ mod tests {
12331301
}
12341302
}
12351303

1304+
// Generates one `#[test]` function per `name: eol` pair. Each test lexes a
// function body containing a nested `if`, where the outer level is indented
// with one tab and the inner level with one tab followed by one space, and
// checks that two Indent tokens are followed at the end by two Dedent tokens.
macro_rules! test_double_dedent_with_tabs {
1305+
($($name:ident: $eol:expr,)*) => {
1306+
$(
1307+
#[test]
1308+
fn $name() {
1309+
// Layout of the source: `def foo():`, then `<TAB>if x:`, then an empty
// line, then `<TAB><SPACE>return 99`, followed by two line endings.
let source = String::from(format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol));
1310+
// `lex_source` is defined elsewhere in this test module.
let tokens = lex_source(&source);
1311+
assert_eq!(
1312+
tokens,
1313+
vec![
1314+
Tok::Def,
1315+
Tok::Name {
1316+
name: String::from("foo"),
1317+
},
1318+
Tok::Lpar,
1319+
Tok::Rpar,
1320+
Tok::Colon,
1321+
Tok::Newline,
1322+
// First indentation level: one tab.
Tok::Indent,
1323+
Tok::If,
1324+
Tok::Name {
1325+
name: String::from("x"),
1326+
},
1327+
Tok::Colon,
1328+
Tok::Newline,
1329+
// Second indentation level: one tab plus one space.
Tok::Indent,
1330+
Tok::Return,
1331+
Tok::Int { value: BigInt::from(99) },
1332+
Tok::Newline,
1333+
// Both levels are closed at the end of the input.
Tok::Dedent,
1334+
Tok::Dedent,
1335+
]
1336+
);
1337+
}
1338+
)*
1339+
}
1340+
}
1341+
12361342
test_double_dedent_with_eol! {
12371343
test_double_dedent_windows_eol: WINDOWS_EOL,
12381344
test_double_dedent_mac_eol: MAC_EOL,
12391345
test_double_dedent_unix_eol: UNIX_EOL,
12401346
}
12411347

1348+
test_double_dedent_with_tabs! {
1349+
test_double_dedent_tabs_windows_eol: WINDOWS_EOL,
1350+
test_double_dedent_tabs_mac_eol: MAC_EOL,
1351+
test_double_dedent_tabs_unix_eol: UNIX_EOL,
1352+
}
1353+
12421354
macro_rules! test_newline_in_brackets {
12431355
($($name:ident: $eol:expr,)*) => {
12441356
$(

tests/snippets/indentation.py

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# WARNING! This file contains mixed tabs and spaces
2+
# (because that's what it is testing)
3+
4+
def weird_indentation():
5+
return_value = "hi"
6+
if False:
7+
return return_value
8+
return "hi"
9+
10+
assert weird_indentation() == "hi"
11+

0 commit comments

Comments
 (0)