From c74cd90d8edcfe1e4d1326e0b8c32c7267f2e496 Mon Sep 17 00:00:00 2001 From: Gitea Date: Sun, 16 Dec 2018 17:08:36 -0500 Subject: [PATCH 1/2] Handle mixed spaces and tabs. This allows the parser to handle tabs and spaces. Currently it requires that spaces come after any tabs, which is slightly more strict than python3. It also requires that neighboring indentation levels have either both more spaces and tabs or fewer spaces and tabs so that tab size can't make perception of indentation differ from what the parser interprets. I didn't opt to implement PartialOrd on IndentationLevel because I wasn't sure that the comparison logic would meet the logical requirements for that trait. One could easily switch to having it implement PartialOrd though. This would necessitate switching to manually implementing PartialEq so that its behavior 'matches'. It's unclear from the docs what 'matches' exactly means so for now I'm avoiding implementing the traits. --- parser/src/lexer.rs | 162 ++++++++++++++++++++++++++++------ tests/snippets/indentation.py | 11 +++ 2 files changed, 148 insertions(+), 25 deletions(-) create mode 100644 tests/snippets/indentation.py diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index da063fffc6..cc4824dabc 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -6,12 +6,49 @@ use num_bigint::BigInt; use num_traits::Num; use std::collections::HashMap; use std::str::FromStr; +use std::cmp::Ordering; + +#[derive(Clone, Copy, PartialEq, Debug)] +struct IndentationLevel { + tabs: usize, + spaces: usize, +} + +impl IndentationLevel { + fn new() -> IndentationLevel { + IndentationLevel { + tabs: 0, + spaces: 0, + } + } + fn compare_strict(&self, other: &IndentationLevel) -> Option { // We only know for sure that we're smaller or bigger if tabs + // and spaces both differ in the same direction. Otherwise we're + // dependent on the size of tabs. 
+ if self.tabs < other.tabs { + if self.spaces <= other.spaces { + Some(Ordering::Less) + } else { + None + } + } else if self.tabs > other.tabs { + if self.spaces >= other.spaces { + Some(Ordering::Greater) + } else { + None + } + + } else { + Some(self.spaces.cmp(&other.spaces)) + } + } +} pub struct Lexer> { chars: T, at_begin_of_line: bool, nesting: usize, // Amount of parenthesis - indentation_stack: Vec, + indentation_stack: Vec, pending: Vec>, chr0: Option, chr1: Option, @@ -218,7 +255,7 @@ where chars: input, at_begin_of_line: true, nesting: 0, - indentation_stack: vec![0], + indentation_stack: vec![IndentationLevel::new()], pending: Vec::new(), chr0: None, location: Location::new(0, 0), @@ -576,13 +613,24 @@ where self.at_begin_of_line = false; // Determine indentation: - let mut col: usize = 0; + let mut spaces: usize = 0; + let mut tabs: usize = 0; loop { match self.chr0 { Some(' ') => { self.next_char(); - col += 1; - } + spaces += 1; + }, + Some('\t') => { + if spaces != 0 { + // Don't allow tabs after spaces as part of indentation. + // This is technically stricter than python3 but spaces before + // tabs is even more insane than mixing spaces and tabs. 
+ panic!("Tabs not allowed as part of indentation after spaces"); + } + self.next_char(); + tabs += 1; + }, Some('#') => { self.lex_comment(); self.at_begin_of_line = true; @@ -601,34 +649,54 @@ where } } + let indentation_level = IndentationLevel { + spaces, + tabs, + }; + if self.nesting == 0 { // Determine indent or dedent: let current_indentation = *self.indentation_stack.last().unwrap(); - if col == current_indentation { - // Same same - } else if col > current_indentation { - // New indentation level: - self.indentation_stack.push(col); - let tok_start = self.get_pos(); - let tok_end = tok_start.clone(); - return Some(Ok((tok_start, Tok::Indent, tok_end))); - } else if col < current_indentation { - // One or more dedentations - // Pop off other levels until col is found: - - while col < *self.indentation_stack.last().unwrap() { - self.indentation_stack.pop().unwrap(); + let ordering = indentation_level.compare_strict(&current_indentation); + match ordering { + Some(Ordering::Equal) => { + // Same same + }, + Some(Ordering::Greater) => { + // New indentation level: + self.indentation_stack.push(indentation_level); let tok_start = self.get_pos(); let tok_end = tok_start.clone(); - self.pending.push(Ok((tok_start, Tok::Dedent, tok_end))); + return Some(Ok((tok_start, Tok::Indent, tok_end))); } + Some(Ordering::Less) => { + // One or more dedentations + // Pop off other levels until col is found: + + loop { + let ordering = indentation_level.compare_strict(self.indentation_stack.last().unwrap()); + match ordering { + Some(Ordering::Less) => { + self.indentation_stack.pop(); + let tok_start = self.get_pos(); + let tok_end = tok_start.clone(); + self.pending.push(Ok((tok_start, Tok::Dedent, tok_end))); + }, + None => panic!("inconsistent use of tabs and spaces in indentation"), + _ => { + break; + }, + }; + } - if col != *self.indentation_stack.last().unwrap() { - // TODO: handle wrong indentations - panic!("Non matching indentation levels!"); - } + if indentation_level 
!= *self.indentation_stack.last().unwrap() { + // TODO: handle wrong indentations + panic!("Non matching indentation levels!"); + } - return Some(self.pending.remove(0)); + return Some(self.pending.remove(0)); + } + None => panic!("inconsistent use of tabs and spaces in indentation"), } } } @@ -1233,12 +1301,56 @@ mod tests { } } + macro_rules! test_double_dedent_with_tabs { + ($($name:ident: $eol:expr,)*) => { + $( + #[test] + fn $name() { + let source = String::from(format!("def foo():{}\tif x:{}{}\t return 99{}{}", $eol, $eol, $eol, $eol, $eol)); + let tokens = lex_source(&source); + assert_eq!( + tokens, + vec![ + Tok::Def, + Tok::Name { + name: String::from("foo"), + }, + Tok::Lpar, + Tok::Rpar, + Tok::Colon, + Tok::Newline, + Tok::Indent, + Tok::If, + Tok::Name { + name: String::from("x"), + }, + Tok::Colon, + Tok::Newline, + Tok::Indent, + Tok::Return, + Tok::Int { value: BigInt::from(99) }, + Tok::Newline, + Tok::Dedent, + Tok::Dedent, + ] + ); + } + )* + } + } + test_double_dedent_with_eol! { test_double_dedent_windows_eol: WINDOWS_EOL, test_double_dedent_mac_eol: MAC_EOL, test_double_dedent_unix_eol: UNIX_EOL, } + test_double_dedent_with_tabs! { + test_double_dedent_tabs_windows_eol: WINDOWS_EOL, + test_double_dedent_tabs_mac_eol: MAC_EOL, + test_double_dedent_tabs_unix_eol: UNIX_EOL, + } + macro_rules! test_newline_in_brackets { ($($name:ident: $eol:expr,)*) => { $( diff --git a/tests/snippets/indentation.py b/tests/snippets/indentation.py new file mode 100644 index 0000000000..de0bc0322a --- /dev/null +++ b/tests/snippets/indentation.py @@ -0,0 +1,11 @@ +# WARNING! This file contains mixed tabs and spaces +# (because that's what it is testing) + +def weird_indentation(): + return_value = "hi" + if False: + return return_value + return "hi" + +assert weird_indentation() == "hi" + From 9468b657d2fcc124c5840e247125917f2ab0e872 Mon Sep 17 00:00:00 2001 From: Gitea Date: Sun, 16 Dec 2018 17:40:25 -0500 Subject: [PATCH 2/2] Run `cargo fmt`. 
--- parser/src/lexer.rs | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/parser/src/lexer.rs b/parser/src/lexer.rs index cc4824dabc..cf611e8a1f 100644 --- a/parser/src/lexer.rs +++ b/parser/src/lexer.rs @@ -4,9 +4,9 @@ pub use super::token::Tok; use num_bigint::BigInt; use num_traits::Num; +use std::cmp::Ordering; use std::collections::HashMap; use std::str::FromStr; -use std::cmp::Ordering; #[derive(Clone, Copy, PartialEq, Debug)] struct IndentationLevel { @@ -16,10 +16,7 @@ struct IndentationLevel { impl IndentationLevel { fn new() -> IndentationLevel { - IndentationLevel { - tabs: 0, - spaces: 0, - } + IndentationLevel { tabs: 0, spaces: 0 } } fn compare_strict(&self, other: &IndentationLevel) -> Option { // We only know for sure that we're smaller or bigger if tabs @@ -37,7 +34,6 @@ impl IndentationLevel { } else { None } - } else { Some(self.spaces.cmp(&other.spaces)) } @@ -620,7 +616,7 @@ where Some(' ') => { self.next_char(); spaces += 1; - }, + } Some('\t') => { if spaces != 0 { // Don't allow tabs after spaces as part of indentation. 
@@ -630,7 +626,7 @@ where } self.next_char(); tabs += 1; - }, + } Some('#') => { self.lex_comment(); self.at_begin_of_line = true; @@ -649,10 +645,7 @@ where } } - let indentation_level = IndentationLevel { - spaces, - tabs, - }; + let indentation_level = IndentationLevel { spaces, tabs }; if self.nesting == 0 { // Determine indent or dedent: @@ -661,7 +654,7 @@ where match ordering { Some(Ordering::Equal) => { // Same same - }, + } Some(Ordering::Greater) => { // New indentation level: self.indentation_stack.push(indentation_level); @@ -674,18 +667,21 @@ where // Pop off other levels until col is found: loop { - let ordering = indentation_level.compare_strict(self.indentation_stack.last().unwrap()); + let ordering = indentation_level + .compare_strict(self.indentation_stack.last().unwrap()); match ordering { Some(Ordering::Less) => { self.indentation_stack.pop(); let tok_start = self.get_pos(); let tok_end = tok_start.clone(); self.pending.push(Ok((tok_start, Tok::Dedent, tok_end))); - }, - None => panic!("inconsistent use of tabs and spaces in indentation"), + } + None => { + panic!("inconsistent use of tabs and spaces in indentation") + } _ => { break; - }, + } }; }