#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};

/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal; the flag indicates a trailing `L` (long) suffix
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted string: i.e: '''string'''
    TripleSingleQuotedString(String),
    /// Triple double quoted string: i.e: """string"""
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string'
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix: i.e: B'''string'''
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix: i.e: B"""string"""
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix: i.e: R'string'
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix: i.e: R"string"
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix: i.e: R'''string'''
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix: i.e: R"""string"""
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// PostgreSQL-style "escaped" string literal: i.e: E'string with \n escapes'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>` (MySQL NULL-safe equality)
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period `.` (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=`
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator
    Sharp,
    /// `##`, a PostgreSQL geometric operator (point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT or a case sensitive regex match
    Tilde,
    /// `~*`, a case insensitive regex match operator in PostgreSQL
    TildeAsterisk,
    /// `!~`, a case sensitive regex non-match operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*`, a case insensitive regex non-match operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive LIKE match operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive LIKE match operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive LIKE non-match operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive LIKE non-match operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator
    ShiftLeft,
    /// `>>`, a bitwise shift right operator
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` prepared statement arg placeholder
    Placeholder(String),
    /// `->`, extracts a JSON field in PostgreSQL
    Arrow,
    /// `->>`, extracts a JSON field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts the JSON sub-object at the specified path
    HashArrow,
    /// `@-@`, a PostgreSQL geometric operator (length or circumference)
    AtDashAt,
    /// `?-`, a PostgreSQL geometric operator (is horizontal?)
    QuestionMarkDash,
    /// `&<`, a PostgreSQL geometric operator (overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>`, a PostgreSQL geometric operator (overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|`, a PostgreSQL geometric operator (does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>`, a PostgreSQL geometric operator (does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->`, a PostgreSQL geometric operator (distance between)
    TwoWayArrow,
    /// `<^`, a PostgreSQL geometric operator (is below?)
    LeftAngleBracketCaret,
    /// `>^`, a PostgreSQL geometric operator (is above?)
    RightAngleBracketCaret,
    /// `?#`, a PostgreSQL geometric operator (intersects?)
    QuestionMarkSharp,
    /// `?-|`, a PostgreSQL geometric operator (is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||`, a PostgreSQL geometric operator (are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=`, a PostgreSQL geometric operator (same as?)
    TildeEqual,
    /// `<<|`, a PostgreSQL geometric operator (is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>`, a PostgreSQL geometric operator (is strictly above?)
    VerticalBarShiftRight,
    /// `#>>`, extracts the JSON sub-object at the specified path as text
    HashLongArrow,
    /// `@>`, a JSON "contains" operator in PostgreSQL
    AtArrow,
    /// `<@`, a JSON "is contained by" operator in PostgreSQL
    ArrowAt,
    /// `#-`, deletes a key from a JSON object in PostgreSQL
    HashMinus,
    /// `@?`, does the JSON path return any item for the JSON value?
    AtQuestion,
    /// `@@`, a PostgreSQL operator (JSON path predicate check or text search match)
    AtAt,
    /// `?`, a JSON key-exists operator in PostgreSQL
    Question,
    /// `?&`, a JSON "exists all" operator in PostgreSQL
    QuestionAnd,
    /// `?|`, a JSON "exists any" operator in PostgreSQL
    QuestionPipe,
    /// Custom binary operator, recognized via `Dialect::is_custom_operator_part`
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

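    /// Makes a `Token::Word` from `word` and an optional `quote_style`.
    ///
    /// Illustrative sketch of the keyword resolution below (the `sqlparser`
    /// crate name in the doc test is an assumption):
    ///
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// # use sqlparser::keywords::Keyword;
    /// // Unquoted words are matched (case-insensitively) against the keyword table.
    /// if let Token::Word(w) = Token::make_word("select", None) {
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    /// }
    /// // Quoted words stay plain identifiers and never become keywords.
    /// if let Token::Word(w) = Token::make_word("select", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword);
    /// }
    /// ```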
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes
    pub value: String,
    /// The opening quote character, if the identifier was quoted
    /// (e.g. `"`, `[`, or `` ` ``)
    pub quote_style: Option<char>,
    /// The keyword the word matched, or `Keyword::NoKeyword` if the word was
    /// quoted or did not match any known keyword
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// Location in the input string
///
/// Locations are 1-based: the first line and the first column are 1, not 0. A
/// line/column of 0 represents an "empty" location, i.e. no location
/// information is available.
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1
    pub line: u64,
    /// Column number, starting from 1
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" location (line and column of 0)
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column
    ///
    /// Alias for [`Self::new`]
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

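    /// Combine this start location with `end` into a [`Span`].
    ///
    /// Illustrative sketch (the `sqlparser` crate name in the doc test is an
    /// assumption):
    ///
    /// ```
    /// # use sqlparser::tokenizer::Location;
    /// let span = Location::of(1, 1).span_to(Location::of(1, 7));
    /// assert_eq!(span.start, Location::of(1, 1));
    /// assert_eq!(span.end, Location::of(1, 7));
    /// ```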
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

/// A span of source code locations (start, end)
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span (0, 0) -> (0, 0)
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Return an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

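    /// Return the smallest span that contains both `self` and `other`. An
    /// empty span acts as the identity element of the union.
    ///
    /// Illustrative sketch (the `sqlparser` crate name in the doc test is an
    /// assumption):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let a = Span::new(Location::of(1, 1), Location::of(1, 5));
    /// let b = Span::new(Location::of(2, 3), Location::of(2, 8));
    /// assert_eq!(a.union(&b), Span::new(Location::of(1, 1), Location::of(2, 8)));
    /// assert_eq!(a.union(&Span::empty()), a);
    /// ```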
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other;
        // this prevents propagating (0, 0) through the tree.
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [`Span::union`] for `Option<Span>`: if `other` is `None`,
    /// `self` is returned unchanged.
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

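    /// Return the smallest span that contains all spans in `iter`, or
    /// [`Span::empty()`] if the iterator is empty.
    ///
    /// Illustrative sketch (the `sqlparser` crate name in the doc test is an
    /// assumption):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let spans = [
    ///     Span::new(Location::of(1, 1), Location::of(1, 5)),
    ///     Span::new(Location::of(2, 1), Location::of(2, 9)),
    /// ];
    /// assert_eq!(
    ///     Span::union_iter(spans),
    ///     Span::new(Location::of(1, 1), Location::of(2, 9)),
    /// );
    /// ```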
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [`Token`] with a [`Span`] attached to it
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a [`Token`] with an empty [`Span`]
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a [`Token`] with a [`Span`] from `start` to `end`
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with an empty [`Span`]
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// Return the next character and advance the stream, updating the
    /// line/column bookkeeping as it goes
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// Return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// Represents how many quote characters enclose a string literal
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`
    One,
    /// e.g. `"""abc"""`
    Many(NonZeroU8),
}

struct TokenizeQuotedStringSettings {
    /// The character used to quote the string
    quote_style: char,
    /// How many quote characters enclose the string
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left for the tokenizer to consume before
    /// reading the string body (the caller may already have consumed some)
    num_opening_quotes_to_consume: u8,
    /// Whether backslash escape sequences are recognized inside the string
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer unescapes string literals
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
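    /// Create a new SQL tokenizer for the specified SQL statement.
    ///
    /// Illustrative sketch (the `sqlparser` crate name in the doc test is an
    /// assumption):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// let dialect = GenericDialect {};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// let tokens = Tokenizer::new(&dialect, query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///     Token::make_word("SELECT", None),
    ///     Token::Whitespace(Whitespace::Space),
    ///     Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```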
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

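    /// Set whether the tokenizer should unescape string literals.
    ///
    /// Defaults to `true`. When `false`, the tokenizer keeps the escaped text
    /// verbatim. Illustrative sketch (the `sqlparser` crate name in the doc
    /// test is an assumption):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// # use sqlparser::dialect::MySqlDialect;
    /// let dialect = MySqlDialect {};
    /// let query = r"'a\nb'";
    ///
    /// // Unescaped (default): the `\n` becomes a real newline.
    /// let tokens = Tokenizer::new(&dialect, query).tokenize().unwrap();
    /// assert_eq!(tokens[0], Token::SingleQuotedString("a\nb".to_string()));
    ///
    /// // With unescape disabled, the text is kept verbatim.
    /// let tokens = Tokenizer::new(&dialect, query)
    ///     .with_unescape(false)
    ///     .tokenize()
    ///     .unwrap();
    /// assert_eq!(tokens[0], Token::SingleQuotedString(r"a\nb".to_string()));
    /// ```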
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

    /// Tokenize the statement and produce a vector of tokens
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

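    /// Tokenize the statement and produce a vector of tokens with locations.
    ///
    /// Illustrative sketch (the `sqlparser` crate name in the doc test is an
    /// assumption):
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Location, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// let dialect = GenericDialect {};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1")
    ///     .tokenize_with_location()
    ///     .unwrap();
    /// // The keyword `SELECT` spans columns 1..7 on line 1.
    /// assert_eq!(tokens[0].span.start, Location::of(1, 1));
    /// assert_eq!(tokens[0].span.end, Location::of(1, 7));
    /// ```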
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Variant of [`Self::tokenize_with_location`] that tokenizes into the
    /// provided buffer instead of allocating a new vector
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    // Tokenize the identifier or keyword in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token or return None
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with a "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses R and r as a prefix for raw string literals
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // The spec only allows an uppercase 'N' to introduce a national string literal,
                // but PostgreSQL, at least, allows a lowercase 'n' too.
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "E" or "e"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literals like U&'...' are supported in some dialects, including PostgreSQL
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // We cannot advance the iterator here, as we need to consume the '&'
                        // later if the 'u' turns out to be an identifier.
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // The spec only allows an uppercase 'X' to introduce a hex string literal,
                // but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <hexadecimal character string literal>
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X" or "x"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                // potentially nested delimited (quoted) identifier
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // consume the opening quote
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // consume the closing quote

                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // Some dialects support underscore as a number separator.
                    // There can only be one at a time and it must be followed by another digit.
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // match a hex literal that starts with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }

                    // If the dialect supports identifiers with a numeric prefix and the
                    // previous token was a Word, what we have is not part of a decimal
                    // number, so yield the dot as a dedicated Period token.
                    if s == "." && self.dialect.supports_numeric_prefix() {
                        if let Some(Token::Word(_)) = prev_token {
                            return Ok(Some(Token::Period));
                        }
                    }

                    // consume fractional digits
                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // no fraction -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // parse exponent as number
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent: get the original iterator up to speed and use it
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent: discard the work done
                            _ => (),
                        }
                    }

                    // If the dialect supports identifiers that start with a numeric prefix,
                    // we need to check if the value is in fact an identifier and must thus
                    // be tokenized as a word.
                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            // If it is not a number with an exponent, it may be
                            // an identifier starting with digits.
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word(s.as_str(), None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            // If the previous token was a period, thus not belonging to a
                            // number, the value we have is part of an identifier.
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'

                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
                            }

                            if is_comment {
                                chars.next(); // consume the second '-'
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // consume the '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '&'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '>'
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        // a regular '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume the '='
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume the '!'
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume the '<'
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the second '<'
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                                }
                                _ => self.start_binop_opt(chars, "<-", None),
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume the '>'
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume the '<'
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        // a regular '&' operator
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
                    chars.next(); // consume the '#', starting a single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume the '~'
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('=') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
                        }
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some('#') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
                        }
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') if self.dialect.supports_geometric_types() => {
                            self.consume_and_return(chars, Token::AtAt)
                        }
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
                                _ => self.start_binop_opt(chars, "@-", None),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        // We break on quotes here, because no dialect allows identifiers
                        // starting with '@' and containing quotation marks (e.g. `@'foo'`),
                        // so '@' followed by a quoted string is tokenized as two tokens.
                        Some('\'') => Ok(Some(Token::AtSign)),
                        Some('\"') => Ok(Some(Token::AtSign)),
                        Some('`') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                // Postgres uses ? for jsonb operators, not prepared statements
                '?' if self.dialect.supports_geometric_types() => {
                    chars.next(); // consume the '?'
                    match chars.peek() {
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::QuestionMarkDoubleVerticalBar,
                                ),
                                _ => Ok(Some(Token::QuestionPipe)),
                            }
                        }

                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        Some('-') => {
                            chars.next(); // consume the '-'
                            match chars.peek() {
                                Some('|') => self
                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
                                _ => Ok(Some(Token::QuestionMarkDash)),
                            }
                        }
                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Consume the next character, then parse a custom binary operator.
    /// The next character should be included in the prefix.
    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator.
    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator, with an optional default token on no match.
    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{}'", prefix),
            ),
        }
    }

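    /// Tokenizes a dollar-preceded value: either a dollar-quoted string
    /// (`$$...$$` or `$tag$...$tag$`) or a placeholder such as `$1`.
    ///
    /// Illustrative sketch (not a doc test; this method is private):
    ///
    /// ```ignore
    /// // "$$hello$$"    -> Token::DollarQuotedString { value: "hello", tag: None }
    /// // "$tag$hi$tag$" -> Token::DollarQuotedString { value: "hi", tag: Some("tag") }
    /// // "$1"           -> Token::Placeholder("$1") (dialects with dollar placeholders)
    /// ```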
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next(); // consume the '$'

        // If the next character is also a '$' (and the dialect does not treat '$'
        // as a placeholder prefix), this is an anonymous dollar-quoted string ($$...$$).
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            // Otherwise, read the tag (if any) following the '$'.
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow $ as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // If the next char is a '$', this is a tagged dollar-quoted string;
            // otherwise, it is a placeholder.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${}$", value);

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted, expected $",
                            );
                        }
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    // Consume characters until newline
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false,                                           // Always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
            _ => true,                                               // Keep consuming otherwise
        });

        if let Some(ch) = chars.next() {
            assert!(ch == '\n' || ch == '\r');
            comment.push(ch);
        }

        comment
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Read a quoted identifier like `"name"` and return its unquoted value.
    fn tokenize_quoted_identifier(
        &self,
        quote_start: char,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

        if last_char == Some(quote_end) {
            Ok(s)
        } else {
            self.tokenizer_error(
                error_loc,
                format!("Expected close delimiter '{quote_end}' before EOF."),
            )
        }
    }

    /// Read a PostgreSQL-style escaped string literal (`E'...'`), starting
    /// with the opening quote.
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        if let Some(s) = unescape_single_quoted_string(chars) {
            return Ok(s);
        }

        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    /// Read a string literal enclosed by single or triple quote characters,
    /// deciding which variant applies from the number of opening quotes.
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // consume the quote
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // If we matched exactly two quotes, then this is an empty string.
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    /// Read a string enclosed by a single occurrence of `quote_style`,
    /// starting with the opening quote.
    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    /// Read a quoted string, where `settings` determines the quote style, the
    /// number of enclosing quote characters, and the escape behavior.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes left for us to consume.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume
                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For a triple-quoted string like `'''abc'''`, at this
                        // point the buffer holds `abc''` and we have just seen
                        // the final quote. Strip the trailing `count - 1` quote
                        // chars off the buffer before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        // Two consecutive quote chars escape a quote, e.g. 'It''s'.
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume the backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the given query has to be saved completely,
                            // including backslashes. Similarly, with ignores_wildcard_escapes,
                            // the backslash is not stripped from wildcard characters.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume the escaped char
                        } else {
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume the escaped char
                        }
                    }
                }
                ch => {
                    chars.next(); // consume
                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

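    /// Tokenizes a multi-line comment after the opening `/*` has been
    /// consumed, supporting nesting when the dialect allows it.
    ///
    /// Illustrative sketch (not a doc test; this method is private):
    ///
    /// ```ignore
    /// // "/* outer /* inner */ still outer */"
    /// //   -> Whitespace::MultiLineComment(" outer /* inner */ still outer ")
    /// //      (for dialects with nested-comment support, e.g. PostgreSQL)
    /// ```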
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();

        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*'
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }

    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

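/// Reads through the stream while `predicate` holds, returning the consumed
/// prefix and leaving the first non-matching character in place.
///
/// Illustrative sketch (not a doc test; `State` is private):
///
/// ```ignore
/// let mut state = State { peekable: "123abc".chars().peekable(), line: 1, col: 1 };
/// let digits = peeking_take_while(&mut state, |ch| ch.is_ascii_digit());
/// assert_eq!(digits, "123");
/// assert_eq!(state.peek(), Some(&'a'));
/// ```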
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

/// Same as `peeking_take_while`, but also passes the look-ahead character to the predicate.
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        let next_char = chars.peekable.clone().nth(1);
        if predicate(ch, next_char) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}

struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}

impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }

    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next(); // consume the opening quote

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // an escaped quote?
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        None
    }

    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        // u32 is used here to allow for overflow, which is checked afterwards
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal escape: \xh or \xhh, where h is a hex digit
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        if s.is_empty() {
            // A plain "\x" with no hex digits is kept as the character 'x'.
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal escape: \o, \oo, or \ooo, where o is an octal digit
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    #[inline]
    fn next_octal_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // Unicode escape: \uxxxx, with exactly four hex digits
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // Unicode escape: \Uxxxxxxxx, with exactly eight hex digits
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}

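/// Unescapes a PostgreSQL-style Unicode string body (the part after `U&'`),
/// handling `''`, `\\`, `\XXXX`, and `\+XXXXXX` escapes.
///
/// Illustrative sketch (not a doc test; this function is private):
///
/// ```ignore
/// // U&'d\0061t\+000061' -> "data"
/// // U&'a''b'            -> "a'b"
/// ```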
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut result = 0u32;
    for _ in 0..max_digits {
        let next_char = chars.next().ok_or_else(|| TokenizerError {
            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                .to_string(),
            location: chars.location(),
        })?;
        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
            message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
            location: chars.location(),
        })?;
        result = result * 16 + digit;
    }
    char::from_u32(result).ok_or_else(|| TokenizerError {
        message: format!("Invalid unicode character: {:x}", result),
        location: chars.location(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{
        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
    };
    use crate::test_utils::all_dialects_where;
    use core::fmt::Debug;

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_clickhouse_double_equal() {
        let sql = String::from("SELECT foo=='1'");
        let dialect = ClickHouseDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "foo".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
            Token::DoubleEq,
            Token::SingleQuotedString("1".to_string()),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_numeric_literal_underscore() {
        let dialect = GenericDialect {};
        let sql = String::from("SELECT 10_000");
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("10".to_string(), false),
            Token::make_word("_000", None),
        ];
        compare(expected, tokens);

        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
            "SELECT 10_000, _10_000, 10_00_, 10___0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_000".to_string(), false),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                // a leading underscore tokenizes as a word, not a number
                Token::make_word("_10_000", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10_00".to_string(), false),
                // a trailing underscore is not part of the number
                Token::make_word("_", None),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Number("10".to_string(), false),
                // consecutive underscores terminate the number
                Token::make_word("___0", None),
            ],
        );
    }
2503
    #[test]
    fn tokenize_select_exponent() {
        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e+10"), false),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::make_word("ea", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::make_word("a", None),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1e-10"), false),
            Token::Minus,
            Token::Number(String::from("10"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_multiline_comment() {
        let dialect = GenericDialect {};
        let test_cases = vec![
            (
                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                    )),
                    Token::Whitespace(Whitespace::Space),
                    Token::Div,
                    Token::Word(Word {
                        value: "comment".to_string(),
                        quote_style: None,
                        keyword: Keyword::COMMENT,
                    }),
                    Token::Mul,
                    Token::Div,
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                    )),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                "SELECT 1/* a /* b */ c */0",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Number("1".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                    Token::Number("0".to_string(), false),
                ],
            ),
        ];

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        let sql = "select 1/*/**/*/0";

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("select"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
            Token::Number("0".to_string(), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        let dialect = SQLiteDialect {};
        let sql = "SELECT 1/*/* nested comment */*/0";
        let tokens = Tokenizer::new(&dialect, sql).tokenize();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "/* nested comment ".to_string(),
            )),
            Token::Mul,
            Token::Div,
            Token::Number("0".to_string(), false),
        ];

        compare(expected, tokens.unwrap());
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        assert_eq!(expected, actual);
    }

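    /// Wraps `s` in single quotes and asserts that `unescape_single_quoted_string`
    /// yields `expected` (`None` meaning the escape sequence is rejected).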
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{}'", s);
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

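        // Unicode escapes: \u takes four hex digits, \U takes eight; NUL and
        // out-of-range code points are rejected.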
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

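        // Hex escapes: \x takes one or two hex digits; NUL and non-ASCII
        // results are rejected.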
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

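        // Octal escapes: one to three octal digits; NUL and non-ASCII
        // results are rejected.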
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

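        // Unrecognized escapes keep the character itself; a doubled quote
        // unescapes to a single quote.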
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }

    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

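        // A lone backslash escapes the closing quote, leaving the literal
        // unterminated in dialects with backslash escapes.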
        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

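        // GenericDialect does not treat backslash as an escape, so the same
        // inputs tokenize cleanly with the backslash preserved.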
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

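        // MySQL keeps \% and \_ in the unescaped output so they can serve as
        // escaped LIKE wildcards.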
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char,
            r: char,
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
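                // Six quote characters in a row: the empty string.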
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
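                // Doubled quotes plus a backslash-escaped quote in the body.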
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
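                // Plain content with nothing to escape.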
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
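                // Runs of the other quote character need no escaping.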
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
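                // A mix of doubled quotes and a backslash-escaped quote.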
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
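                // Backslash-escaped single quotes unescape to bare quotes.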
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
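                // Control-character escapes are decoded when unescaping.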
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

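            // Each of these is missing its closing triple quote and must fail.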
            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let dialect = SnowflakeDialect {};
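        // In Snowflake, '''''' is a single-quoted string containing two
        // escaped quotes, not a triple-quoted string.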
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn test_mysql_users_grantees() {
        let dialect = MySqlDialect {};

        let sql = "CREATE USER `root`@`%`";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("CREATE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("USER"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("root", Some('`')),
            Token::AtSign,
            Token::make_word("%", Some('`')),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }
}