// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).

#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};

/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "Escaped" string literal, which is an extension to the SQL standard: i.e: e'first \n second' or E'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
    Sharp,
    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*`, a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~`, a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*`, a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$`, a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as an operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as an operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
    VerticalBarShiftRight,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath -> boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
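    /// Create a keyword token from its textual form.
    ///
    /// A short usage sketch (the keyword lookup itself happens in
    /// [`Token::make_word`]):
    /// ```
    /// # use sqlparser::tokenizer::Token;
    /// let tok = Token::make_keyword("SELECT");
    /// assert_eq!(tok.to_string(), "SELECT");
    /// ```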
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

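    /// Create a word token from `word`, treating it as quoted if a
    /// `quote_style` is given.
    ///
    /// A small sketch of the keyword classification implemented below: an
    /// unquoted word matching a known keyword is tagged with that keyword,
    /// while a quoted word never is.
    /// ```
    /// # use sqlparser::keywords::Keyword;
    /// # use sqlparser::tokenizer::Token;
    /// if let Token::Word(w) = Token::make_word("select", None) {
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    /// }
    /// if let Token::Word(w) = Token::make_word("select", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword);
    /// }
    /// ```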
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise
    /// [`Keyword::NoKeyword`]
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}

/// Location in input string
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" / unknown location
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column
    ///
    /// Alias for [`Self::new`]
    // TODO: remove / deprecate in favor of `new` for consistency?
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

    /// Combine self and `end` into a new `Span`
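    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Location, Span};
    /// let span = Location::new(1, 1).span_to(Location::new(1, 5));
    /// assert_eq!(span, Span::new(Location::new(1, 1), Location::new(1, 5)));
    /// ```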
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span (0, 0) -> (0, 0)
    // We need a const instance for pattern matching
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// // line 1, column 1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        // this prevents propagating (0, 0) through the tree
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
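    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let span = Span::new(Location::new(1, 1), Location::new(1, 5));
    /// assert_eq!(span.union_opt(&None), span);
    /// assert_eq!(span.union_opt(&Some(span)), span);
    /// ```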
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///   Span::union_iter(spans),
    ///   Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

/// Backwards compatibility alias for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // comma @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // comma @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///   Token::Comma,
///   Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a token with an empty span
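    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Token, TokenWithSpan};
    /// let tok = TokenWithSpan::wrap(Token::Comma);
    /// assert_eq!(tok.span, Span::empty());
    /// ```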
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a token with a location from `start` to `end`
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with no location
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
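///
/// The error message renders with its location appended (the message here is
/// only illustrative):
/// ```
/// # use sqlparser::tokenizer::{Location, TokenizerError};
/// let err = TokenizerError {
///     message: "unexpected character".to_string(),
///     location: Location::new(1, 8),
/// };
/// assert_eq!(err.to_string(), "unexpected character at Line: 1, Column: 8");
/// ```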
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// return the next character and advance the stream
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}

/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
    Many(NonZeroU8),
}

/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quote characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume before parsing
    /// the remaining string literal.
    /// For example, given the initial string `"""abc"""`: if the caller has
    /// already consumed the first quote for some reason, this value is set
    /// to 2, flagging that only the 2 remaining opening quotes need to be consumed.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters,
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Parsing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

    /// Set unescape mode
    ///
    /// When true (default) the tokenizer unescapes literal values
    /// (for example, `""` in SQL is unescaped to the literal `"`).
    ///
    /// When false, the tokenizer provides the raw strings as provided
    /// in the query.  This can be helpful for programs that wish to
    /// recover the *exact* original query text without normalizing
    /// the escaping.
    ///
    /// # Example
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#""Foo "" Bar""#;
    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
    /// let original  = Token::make_word(r#"Foo "" Bar"#, Some('"'));
    ///
    /// // Parsing with unescaping (default)
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    /// assert_eq!(tokens, vec![unescaped]);
    ///
    /// // Parsing with unescape = false
    /// let tokens = Tokenizer::new(&dialect, &query)
    ///    .with_unescape(false)
    ///    .tokenize().unwrap();
    /// assert_eq!(tokens, vec![original]);
    /// ```
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

    /// Tokenize the statement and produce a vector of tokens
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

    /// Tokenize the statement and produce a vector of tokens with location information
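    ///
    /// A short sketch of the span information this produces:
    /// ```
    /// # use sqlparser::tokenizer::{Location, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize_with_location().unwrap();
    /// // `SELECT` starts at line 1, column 1 and ends just before column 7
    /// assert_eq!(tokens[0].span.start, Location::new(1, 1));
    /// assert_eq!(tokens[0].span.end, Location::new(1, 7));
    /// ```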
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement and append tokens with location information into the provided buffer.
    /// If an error is returned, the buffer will contain all tokens that were successfully parsed before the error.
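    ///
    /// A small sketch of tokenizing into a caller-provided buffer:
    /// ```
    /// # use sqlparser::tokenizer::{TokenWithSpan, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let mut buf: Vec<TokenWithSpan> = vec![];
    /// Tokenizer::new(&dialect, "SELECT 1").tokenize_with_location_into_buf(&mut buf).unwrap();
    /// assert_eq!(buf.len(), 3); // `SELECT`, whitespace, `1`
    /// ```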
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }

    // Tokenize the identifier or keywords in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // TODO: implement parsing of exponent here
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token or return None
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with a "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literal
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Redshift uses lower case n for national string literal
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "E" or "e"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    // regular identifier starting with a "U" or "u"
1070                    let s = self.tokenize_word(x, chars);
1071                    Ok(Some(Token::make_word(&s, None)))
1072                }
1073                // The spec only allows an uppercase 'X' to introduce a hex
1074                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
1075                x @ 'x' | x @ 'X' => {
1076                    chars.next(); // consume, to check the next char
1077                    match chars.peek() {
1078                        Some('\'') => {
1079                            // X'...' - a <binary string literal>
1080                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
1081                            Ok(Some(Token::HexStringLiteral(s)))
1082                        }
1083                        _ => {
1084                            // regular identifier starting with an "X"
1085                            let s = self.tokenize_word(x, chars);
1086                            Ok(Some(Token::make_word(&s, None)))
1087                        }
1088                    }
1089                }
1090                // single quoted string
1091                '\'' => {
1092                    if self.dialect.supports_triple_quoted_string() {
1093                        return self
1094                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1095                                chars,
1096                                '\'',
1097                                self.dialect.supports_string_literal_backslash_escape(),
1098                                Token::SingleQuotedString,
1099                                Token::TripleSingleQuotedString,
1100                            );
1101                    }
1102                    let s = self.tokenize_single_quoted_string(
1103                        chars,
1104                        '\'',
1105                        self.dialect.supports_string_literal_backslash_escape(),
1106                    )?;
1107
1108                    Ok(Some(Token::SingleQuotedString(s)))
1109                }
1110                // double quoted string
1111                '\"' if !self.dialect.is_delimited_identifier_start(ch)
1112                    && !self.dialect.is_identifier_start(ch) =>
1113                {
1114                    if self.dialect.supports_triple_quoted_string() {
1115                        return self
1116                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
1117                                chars,
1118                                '"',
1119                                self.dialect.supports_string_literal_backslash_escape(),
1120                                Token::DoubleQuotedString,
1121                                Token::TripleDoubleQuotedString,
1122                            );
1123                    }
1124                    let s = self.tokenize_single_quoted_string(
1125                        chars,
1126                        '"',
1127                        self.dialect.supports_string_literal_backslash_escape(),
1128                    )?;
1129
1130                    Ok(Some(Token::DoubleQuotedString(s)))
1131                }
1132                // delimited (quoted) identifier
1133                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
1134                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1135                    Ok(Some(Token::make_word(&word, Some(quote_start))))
1136                }
1137                // Potentially nested delimited (quoted) identifier
1138                quote_start
1139                    if self
1140                        .dialect
1141                        .is_nested_delimited_identifier_start(quote_start)
1142                        && self
1143                            .dialect
1144                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1145                            .is_some() =>
1146                {
1147                    let Some((quote_start, nested_quote_start)) = self
1148                        .dialect
1149                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
1150                    else {
1151                        return self.tokenizer_error(
1152                            chars.location(),
1153                            format!("Expected nested delimiter '{quote_start}' before EOF."),
1154                        );
1155                    };
1156
1157                    let Some(nested_quote_start) = nested_quote_start else {
1158                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
1159                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
1160                    };
1161
1162                    let mut word = vec![];
1163                    let quote_end = Word::matching_end_quote(quote_start);
1164                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
1165                    let error_loc = chars.location();
1166
1167                    chars.next(); // skip the first delimiter
1168                    peeking_take_while(chars, |ch| ch.is_whitespace());
1169                    if chars.peek() != Some(&nested_quote_start) {
1170                        return self.tokenizer_error(
1171                            error_loc,
1172                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
1173                        );
1174                    }
1175                    word.push(nested_quote_start.into());
1176                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
1177                    word.push(nested_quote_end.into());
1178                    peeking_take_while(chars, |ch| ch.is_whitespace());
1179                    if chars.peek() != Some(&quote_end) {
1180                        return self.tokenizer_error(
1181                            error_loc,
1182                            format!("Expected close delimiter '{quote_end}' before EOF."),
1183                        );
1184                    }
1185                    chars.next(); // skip close delimiter
1186
1187                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
1188                }
1189                // numbers and period
1190                '0'..='9' | '.' => {
1191                    // Some dialects support underscore as number separator
1192                    // There can only be one at a time and it must be followed by another digit
1193                    let is_number_separator = |ch: char, next_char: Option<char>| {
1194                        self.dialect.supports_numeric_literal_underscores()
1195                            && ch == '_'
1196                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
1197                    };
1198
1199                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
1200                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1201                    });
1202
1203                    // match binary literal that starts with 0x
1204                    if s == "0" && chars.peek() == Some(&'x') {
1205                        chars.next();
1206                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
1207                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
1208                        });
1209                        return Ok(Some(Token::HexStringLiteral(s2)));
1210                    }
1211
1212                    // match one period
1213                    if let Some('.') = chars.peek() {
1214                        s.push('.');
1215                        chars.next();
1216                    }
1217
1218                    // If the dialect supports identifiers that start with a numeric prefix
1219                    // and we have now consumed a dot, check if the previous token was a Word.
1220                    // If so, what follows is definitely not part of a decimal number and
1221                    // we should yield the dot as a dedicated token so compound identifiers
1222                    // starting with digits can be parsed correctly.
1223                    if s == "." && self.dialect.supports_numeric_prefix() {
1224                        if let Some(Token::Word(_)) = prev_token {
1225                            return Ok(Some(Token::Period));
1226                        }
1227                    }
1228
1229                    // Consume fractional digits.
1230                    s += &peeking_next_take_while(chars, |ch, next_ch| {
1231                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
1232                    });
1233
1234                    // No fraction -> Token::Period
1235                    if s == "." {
1236                        return Ok(Some(Token::Period));
1237                    }
1238
1239                    // Parse exponent as number
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent, get original iterator up to speed and use it
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent, discard the work done
                            _ => (),
                        }
                    }

                    // If the dialect supports identifiers that start with a numeric prefix,
                    // we need to check if the value is in fact an identifier and must thus
                    // be tokenized as a word.
                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            // If it is not a number with an exponent, it may be
                            // an identifier starting with digits.
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word(s.as_str(), None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            // If the previous token was a period, thus not belonging to a number,
                            // the value we have is part of an identifier.
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
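                // Illustrative walk-through of the numeric arm above (GenericDialect,
                // which has no numeric identifier prefixes): `123.45e-6` is consumed
                // as a single `Token::Number("123.45e-6", false)`; `1ea` stops before
                // the `e` and yields `Number("1", false)` followed by the word `ea`;
                // `10L` yields `Number("10", true)`; and `0x1F` short-circuits early
                // to `Token::HexStringLiteral("1F")` (the `0x` prefix is dropped).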
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'

                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
                            }

                            if is_comment {
                                chars.next(); // consume second '-'
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // advance past '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        // Bitwise OR '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                                }
                                _ => self.start_binop_opt(chars, "<-", None),
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        // Bitwise AND '&' operator
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
                    chars.next(); // consume the '#', starting a '#'-prefixed single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
                        Some('=') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "~=", Token::TildeEqual)
                        }
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => {
                                    self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
                                }
                                _ => self.start_binop(chars, "~~", Token::DoubleTilde),
                            }
                        }
                        _ => self.start_binop(chars, "~", Token::Tilde),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
                                }
                                _ => self.start_binop(chars, "#>", Token::HashArrow),
                            }
                        }
                        Some(' ') => Ok(Some(Token::Sharp)),
                        Some('#') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "##", Token::DoubleSharp)
                        }
                        Some(sch) if self.dialect.is_identifier_start('#') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "#", Token::Sharp),
                    }
                }
                '@' => {
                    chars.next();
                    match chars.peek() {
                        Some('@') if self.dialect.supports_geometric_types() => {
                            self.consume_and_return(chars, Token::AtAt)
                        }
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            match chars.peek() {
                                Some('@') => self.consume_and_return(chars, Token::AtDashAt),
                                _ => self.start_binop_opt(chars, "@-", None),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::AtArrow),
                        Some('?') => self.consume_and_return(chars, Token::AtQuestion),
                        Some('@') => {
                            chars.next();
                            match chars.peek() {
                                Some(' ') => Ok(Some(Token::AtAt)),
                                Some(tch) if self.dialect.is_identifier_start('@') => {
                                    self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
                                }
                                _ => Ok(Some(Token::AtAt)),
                            }
                        }
                        Some(' ') => Ok(Some(Token::AtSign)),
                        // We break on quotes here, because no dialect allows identifiers starting
                        // with @ and containing quotation marks (e.g. `@'foo'`) unless they are
                        // quoted, which is tokenized as a quoted string, not here (e.g.
                        // `"@'foo'"`). Further, at least two dialects parse `@` followed by a
                        // quoted string as two separate tokens, which this allows. For example,
                        // Postgres parses `@'1'` as the absolute value of '1' which is implicitly
                        // cast to a numeric type. And when parsing MySQL-style grantees (e.g.
                        // `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
                        // for the user, the `@`, and the host.
                        Some('\'') => Ok(Some(Token::AtSign)),
                        Some('\"') => Ok(Some(Token::AtSign)),
                        Some('`') => Ok(Some(Token::AtSign)),
                        Some(sch) if self.dialect.is_identifier_start('@') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => Ok(Some(Token::AtSign)),
                    }
                }
                // Postgres uses ? for jsonb operators, not prepared statements
                '?' if self.dialect.supports_geometric_types() => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('|') => {
                            chars.next();
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::QuestionMarkDoubleVerticalBar,
                                ),
                                _ => Ok(Some(Token::QuestionPipe)),
                            }
                        }

                        Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
                        Some('-') => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self
                                    .consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
                                _ => Ok(Some(Token::QuestionMarkDash)),
                            }
                        }
                        Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
                        _ => self.consume_and_return(chars, Token::Question),
                    }
                }
                '?' => {
                    chars.next();
                    let s = peeking_take_while(chars, |ch| ch.is_numeric());
                    Ok(Some(Token::Placeholder(String::from("?") + &s)))
                }

                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    self.tokenize_identifier_or_keyword([ch], chars)
                }
                '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

                // whitespace check (including unicode chars) should be last as it covers some of the chars above
                ch if ch.is_whitespace() => {
                    self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Consume the next character, then parse a custom binary operator.
    /// The consumed character must already be included in `prefix`.
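    ///
    /// For example, having peeked the second `>` of `->>`, the `-` arm calls
    /// `consume_for_binop(chars, "->>", Token::LongArrow)`: the `>` is consumed
    /// here, and `prefix` already reflects it.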
    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator, falling back to `default` when no
    /// custom-operator characters follow.
    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator. If no custom-operator characters follow
    /// and no `default` token is given, report a tokenizer error.
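    ///
    /// A hedged sketch of the behavior (the exact outcome depends on the
    /// dialect's `is_custom_operator_part`): if that predicate accepts `@`,
    /// then `~@` tokenized via `start_binop_opt(chars, "~", Some(Token::Tilde))`
    /// consumes the trailing `@` and yields
    /// `Token::CustomBinaryOperator("~@".to_string())`, while a bare `~` falls
    /// back to `Token::Tilde`.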
    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{}'", prefix),
            ),
        }
    }

    /// Tokenize a dollar-preceded value (i.e. a dollar-quoted string or a placeholder)
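    ///
    /// Illustrative mappings, assuming a dialect without dollar placeholders
    /// (e.g. PostgreSQL):
    /// - `$$abc$$` -> `DollarQuotedString { value: "abc", tag: None }`
    /// - `$tag$abc$tag$` -> `DollarQuotedString { value: "abc", tag: Some("tag") }`
    /// - `$1` -> `Token::Placeholder("$1")` (no quoted body follows)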
    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
        let mut s = String::new();
        let mut value = String::new();

        chars.next();

        // If the dialect does not support dollar-quoted strings, then `$$` is a placeholder instead.
        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
            chars.next();

            let mut is_terminated = false;
            let mut prev: Option<char> = None;

            while let Some(&ch) = chars.peek() {
                if prev == Some('$') {
                    if ch == '$' {
                        chars.next();
                        is_terminated = true;
                        break;
                    } else {
                        s.push('$');
                        s.push(ch);
                    }
                } else if ch != '$' {
                    s.push(ch);
                }

                prev = Some(ch);
                chars.next();
            }

            return if chars.peek().is_none() && !is_terminated {
                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
            } else {
                Ok(Token::DollarQuotedString(DollarQuotedString {
                    value: s,
                    tag: None,
                }))
            };
        } else {
            value.push_str(&peeking_take_while(chars, |ch| {
                ch.is_alphanumeric()
                    || ch == '_'
                    // Allow $ as a placeholder character if the dialect supports it
                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
            }));

            // If the dialect does not support dollar-quoted strings, don't look for the end delimiter.
            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
                chars.next();

                let mut temp = String::new();
                let end_delimiter = format!("${}$", value);

                loop {
                    match chars.next() {
                        Some(ch) => {
                            temp.push(ch);

                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }
                        }
                        None => {
                            if temp.ends_with(&end_delimiter) {
                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
                                    s.push_str(temp);
                                }
                                break;
                            }

                            return self.tokenizer_error(
                                chars.location(),
                                "Unterminated dollar-quoted string, expected $",
                            );
                        }
                    }
                }
            } else {
                return Ok(Token::Placeholder(String::from("$") + &value));
            }
        }

        Ok(Token::DollarQuotedString(DollarQuotedString {
            value: s,
            tag: if value.is_empty() { None } else { Some(value) },
        }))
    }

    fn tokenizer_error<R>(
        &self,
        loc: Location,
        message: impl Into<String>,
    ) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            location: loc,
        })
    }

    // Consume characters until the end of the line; the terminating newline
    // (or carriage return, for Postgres) is included in the returned comment.
    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
        let mut comment = peeking_take_while(chars, |ch| match ch {
            '\n' => false,                                           // Always stop at \n
            '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
            _ => true, // Keep consuming for other characters
        });

        if let Some(ch) = chars.next() {
            assert!(ch == '\n' || ch == '\r');
            comment.push(ch);
        }

        comment
    }

    /// Tokenize an identifier or keyword, after the initial char(s) have already been consumed.
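    ///
    /// E.g. with `f` already consumed from the input `foo.bar`, this returns
    /// `"foo"` and leaves the `.` for the caller.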
    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
        let mut s = first_chars.into();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Read a quoted identifier
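    ///
    /// A doubled closing quote is treated as an escaped quote: e.g. (when
    /// unescaping) `"my ""table"""` reads as the identifier `my "table"`.
    /// An unterminated identifier is a tokenizer error.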
    fn tokenize_quoted_identifier(
        &self,
        quote_start: char,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        let error_loc = chars.location();
        chars.next(); // consume the opening quote
        let quote_end = Word::matching_end_quote(quote_start);
        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);

        if last_char == Some(quote_end) {
            Ok(s)
        } else {
            self.tokenizer_error(
                error_loc,
                format!("Expected close delimiter '{quote_end}' before EOF."),
            )
        }
    }

    /// Read a single quoted string, starting with the opening quote.
    fn tokenize_escaped_single_quoted_string(
        &self,
        starting_loc: Location,
        chars: &mut State,
    ) -> Result<String, TokenizerError> {
        if let Some(s) = unescape_single_quoted_string(chars) {
            return Ok(s);
        }

        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    /// Reads a string literal delimited by single or triple quote characters.
    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
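    ///
    /// E.g. for BigQuery-style input `'''abc'''`, all three opening quotes are
    /// consumed and `triple_quote_token("abc")` is returned, while `''` (exactly
    /// two quotes) short-circuits to an empty single-quoted token.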
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // Consume quote.
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // Exactly two opening quotes means an empty string, e.g. `''`.
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    /// Reads a string literal quoted by a single quote character.
    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    /// Read a quoted string.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        // Consume any opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an initial string like `"""abc"""`, at this point we have
                        // `abc""` in the buffer and have now matched the final `"`.
                        // However, the string to return is simply `abc`, so we strip off
                        // the trailing quotes before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely
                            s.push(ch);
                        }
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' if settings.backslash_escape => {
                    // consume backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape
                            || (self.dialect.ignores_wildcard_escapes()
                                && (*next == '%' || *next == '_'))
                        {
                            // In no-escape mode, the given query has to be saved completely
                            // including backslashes. Similarly, with ignore_like_wildcard_escapes,
                            // the backslash is not stripped.
                            s.push(ch);
                            s.push(*next);
                            chars.next(); // consume next
                        } else {
                            let n = match next {
                                '0' => '\0',
                                'a' => '\u{7}',
                                'b' => '\u{8}',
                                'f' => '\u{c}',
                                'n' => '\n',
                                'r' => '\r',
                                't' => '\t',
                                'Z' => '\u{1a}',
                                _ => *next,
                            };
                            s.push(n);
                            chars.next(); // consume next
                        }
                    }
                }
                ch => {
                    chars.next(); // consume ch

                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
        }
        self.tokenizer_error(error_loc, "Unterminated string literal")
    }

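    /// Reads a multi-line comment after the opening `/*` has been consumed,
    /// tracking nesting depth when the dialect supports nested comments: e.g.
    /// the remaining input `a /* b */ c */` yields
    /// `Token::Whitespace(Whitespace::MultiLineComment("a /* b */ c "))`.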
    fn tokenize_multiline_comment(
        &self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
        let supports_nested_comments = self.dialect.supports_nested_comments();

        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                    chars.next(); // consume the '*'
                    s.push('/');
                    s.push('*');
                    nested += 1;
                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
                    s.push('/');
                }
                Some(ch) => {
                    s.push(ch);
                }
                None => {
                    break self.tokenizer_error(
                        chars.location(),
                        "Unexpected EOF while in a multi-line comment",
                    );
                }
            }
        }
    }

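    /// Reads until the closing `quote_end`, treating a doubled quote char as an
    /// escape: e.g. reading `a""b"` with `quote_end = '"'` (and unescaping on)
    /// yields `a"b`. Returns the accumulated string and the last char read,
    /// which is `Some(quote_end)` only if the identifier was terminated.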
    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
        while let Some(ch) = chars.next() {
            if ch == quote_end {
                if chars.peek() == Some(&quote_end) {
                    chars.next();
                    s.push(ch);
                    if !self.unescape {
                        // In no-escape mode, the given query has to be saved completely
                        s.push(ch);
                    }
                } else {
                    last_char = Some(quote_end);
                    break;
                }
            } else {
                s.push(ch);
            }
        }
        (s, last_char)
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut State,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.
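///
/// A minimal sketch (these helpers are crate-private, so the example is
/// illustrative rather than a compiled doctest): given a `State` positioned at
/// the start of `123abc`,
///
/// ```ignore
/// let digits = peeking_take_while(&mut state, |ch| ch.is_ascii_digit());
/// assert_eq!(digits, "123");
/// assert_eq!(state.peek(), Some(&'a')); // first non-matching char is still available
/// ```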
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

/// Same as peeking_take_while, but also passes the next character to the predicate.
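///
/// The one-char lookahead lets predicates accept context-dependent characters,
/// e.g. a numeric separator `_` only when a digit follows. Sketch (illustrative,
/// not a compiled doctest): over the input `1_000+`,
///
/// ```ignore
/// let num = peeking_next_take_while(&mut state, |ch, next| {
///     ch.is_ascii_digit() || (ch == '_' && next.is_some_and(|c| c.is_ascii_digit()))
/// });
/// assert_eq!(num, "1_000"); // stops at '+'
/// ```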
fn peeking_next_take_while(
    chars: &mut State,
    mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        let next_char = chars.peekable.clone().nth(1);
        if predicate(ch, next_char) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
    Unescape::new(chars).unescape()
}

struct Unescape<'a: 'b, 'b> {
    chars: &'b mut State<'a>,
}

impl<'a: 'b, 'b> Unescape<'a, 'b> {
    fn new(chars: &'b mut State<'a>) -> Self {
        Self { chars }
    }
    fn unescape(mut self) -> Option<String> {
        let mut unescaped = String::new();

        self.chars.next();

        while let Some(c) = self.chars.next() {
            if c == '\'' {
                // case: ''''
                if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                    self.chars.next();
                    unescaped.push('\'');
                    continue;
                }
                return Some(unescaped);
            }

            if c != '\\' {
                unescaped.push(c);
                continue;
            }

            let c = match self.chars.next()? {
                'b' => '\u{0008}',
                'f' => '\u{000C}',
                'n' => '\n',
                'r' => '\r',
                't' => '\t',
                'u' => self.unescape_unicode_16()?,
                'U' => self.unescape_unicode_32()?,
                'x' => self.unescape_hex()?,
                c if c.is_digit(8) => self.unescape_octal(c)?,
                c => c,
            };

            unescaped.push(Self::check_null(c)?);
        }

        None
    }

    #[inline]
    fn check_null(c: char) -> Option<char> {
        if c == '\0' {
            None
        } else {
            Some(c)
        }
    }

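    /// Parses `s` as a number in base `RADIX` and maps it to an ASCII char,
    /// mirroring PostgreSQL's wrapping behavior: the value is truncated to one
    /// byte and anything above 127 is rejected. E.g. `byte_to_char::<16>("41")`
    /// is `Some('A')`, while `byte_to_char::<16>("FF")` is `None`.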
    #[inline]
    fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        // u32 is used here because PostgreSQL wraps the value on overflow rather than raising an error.
        match u32::from_str_radix(s, RADIX) {
            Err(_) => None,
            Ok(n) => {
                let n = n & 0xFF;
                if n <= 127 {
                    char::from_u32(n)
                } else {
                    None
                }
            }
        }
    }

    // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
    fn unescape_hex(&mut self) -> Option<char> {
        let mut s = String::new();

        for _ in 0..2 {
            match self.next_hex_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        if s.is_empty() {
            return Some('x');
        }

        Self::byte_to_char::<16>(&s)
    }

    #[inline]
    fn next_hex_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
            _ => None,
        }
    }

    // Octal byte value. \o, \oo, \ooo (o = 0–7)
    fn unescape_octal(&mut self, c: char) -> Option<char> {
        let mut s = String::new();

        s.push(c);
        for _ in 0..2 {
            match self.next_octal_digit() {
                Some(c) => s.push(c),
                None => break,
            }
        }

        Self::byte_to_char::<8>(&s)
    }

    #[inline]
    fn next_octal_digit(&mut self) -> Option<char> {
        match self.chars.peek() {
            Some(c) if c.is_digit(8) => self.chars.next(),
            _ => None,
        }
    }

    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
    fn unescape_unicode_16(&mut self) -> Option<char> {
        self.unescape_unicode::<4>()
    }

    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
    fn unescape_unicode_32(&mut self) -> Option<char> {
        self.unescape_unicode::<8>()
    }

    fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
        let mut s = String::new();
        for _ in 0..NUM {
            s.push(self.chars.next()?);
        }
        match u32::from_str_radix(&s, 16) {
            Err(_) => None,
            Ok(n) => char::from_u32(n),
        }
    }
}

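/// Reads the body of a PostgreSQL-style `U&'...'` string after the opening
/// quote, decoding `\xxxx` (4 hex digits) and `\+xxxxxx` (6 hex digits)
/// escapes: e.g. the body `d\0061t\+000061` unescapes to `data`.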
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
    let mut unescaped = String::new();
    chars.next(); // consume the opening quote
    while let Some(c) = chars.next() {
        match c {
            '\'' => {
                if chars.peek() == Some(&'\'') {
                    chars.next();
                    unescaped.push('\'');
                } else {
                    return Ok(unescaped);
                }
            }
            '\\' => match chars.peek() {
                Some('\\') => {
                    chars.next();
                    unescaped.push('\\');
                }
                Some('+') => {
                    chars.next();
                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
                }
                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
            },
            _ => {
                unescaped.push(c);
            }
        }
    }
    Err(TokenizerError {
        message: "Unterminated unicode encoded string literal".to_string(),
        location: chars.location(),
    })
}

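/// Consumes exactly `max_digits` hex digits and converts the value to a char,
/// e.g. `0041` -> `A`; errors on EOF, a non-hex digit, or an invalid code point.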
fn take_char_from_hex_digits(
    chars: &mut State<'_>,
    max_digits: usize,
) -> Result<char, TokenizerError> {
    let mut result = 0u32;
    for _ in 0..max_digits {
        let next_char = chars.next().ok_or_else(|| TokenizerError {
            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
                .to_string(),
            location: chars.location(),
        })?;
        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
            message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
            location: chars.location(),
        })?;
        result = result * 16 + digit;
    }
    char::from_u32(result).ok_or_else(|| TokenizerError {
        message: format!("Invalid unicode character: {:x}", result),
        location: chars.location(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{
        BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
    };
    use crate::test_utils::all_dialects_where;
    use core::fmt::Debug;

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            location: Location { line: 1, column: 1 },
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }
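
    // A small supplementary check of the dollar-quoted string path
    // (tokenize_dollar_preceded_value above); the expected tokens follow
    // directly from that implementation.
    #[test]
    fn tokenize_dollar_quoted_string_with_tag() {
        let sql = String::from("SELECT $tag$hello$tag$");
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "hello".to_string(),
                tag: Some("tag".to_string()),
            }),
        ];

        compare(expected, tokens);
    }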
2446
2447    #[test]
2448    fn tokenize_clickhouse_double_equal() {
2449        let sql = String::from("SELECT foo=='1'");
2450        let dialect = ClickHouseDialect {};
2451        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2452        let tokens = tokenizer.tokenize().unwrap();
2453
2454        let expected = vec![
2455            Token::make_keyword("SELECT"),
2456            Token::Whitespace(Whitespace::Space),
2457            Token::Word(Word {
2458                value: "foo".to_string(),
2459                quote_style: None,
2460                keyword: Keyword::NoKeyword,
2461            }),
2462            Token::DoubleEq,
2463            Token::SingleQuotedString("1".to_string()),
2464        ];
2465
2466        compare(expected, tokens);
2467    }
2468
2469    #[test]
2470    fn tokenize_numeric_literal_underscore() {
2471        let dialect = GenericDialect {};
2472        let sql = String::from("SELECT 10_000");
2473        let mut tokenizer = Tokenizer::new(&dialect, &sql);
2474        let tokens = tokenizer.tokenize().unwrap();
2475        let expected = vec![
2476            Token::make_keyword("SELECT"),
2477            Token::Whitespace(Whitespace::Space),
2478            Token::Number("10".to_string(), false),
2479            Token::make_word("_000", None),
2480        ];
2481        compare(expected, tokens);
2482
2483        all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
2484            "SELECT 10_000, _10_000, 10_00_, 10___0",
2485            vec![
2486                Token::make_keyword("SELECT"),
2487                Token::Whitespace(Whitespace::Space),
2488                Token::Number("10_000".to_string(), false),
2489                Token::Comma,
2490                Token::Whitespace(Whitespace::Space),
2491                Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
2492                Token::Comma,
2493                Token::Whitespace(Whitespace::Space),
2494                Token::Number("10_00".to_string(), false),
2495                Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects)
2496                Token::Comma,
2497                Token::Whitespace(Whitespace::Space),
2498                Token::Number("10".to_string(), false),
2499                Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects)
2500            ],
2501        );
2502    }
2503
2504    #[test]
2505    fn tokenize_select_exponent() {
2506        let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
2507        let dialect = GenericDialect {};
2508        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
2509
2510        let expected = vec![
2511            Token::make_keyword("SELECT"),
2512            Token::Whitespace(Whitespace::Space),
2513            Token::Number(String::from("1e10"), false),
2514            Token::Comma,
2515            Token::Whitespace(Whitespace::Space),
2516            Token::Number(String::from("1e-10"), false),
2517            Token::Comma,
2518            Token::Whitespace(Whitespace::Space),
2519            Token::Number(String::from("1e+10"), false),
2520            Token::Comma,
2521            Token::Whitespace(Whitespace::Space),
2522            Token::Number(String::from("1"), false),
2523            Token::make_word("ea", None),
2524            Token::Comma,
2525            Token::Whitespace(Whitespace::Space),
2526            Token::Number(String::from("1e-10"), false),
2527            Token::make_word("a", None),
2528            Token::Comma,
2529            Token::Whitespace(Whitespace::Space),
2530            Token::Number(String::from("1e-10"), false),
2531            Token::Minus,
2532            Token::Number(String::from("10"), false),
2533        ];
2534
2535        compare(expected, tokens);
2536    }
2537
    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\n💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location { line: 1, column: 8 },
            })
        );
    }

    #[test]
    fn tokenize_unterminated_string_literal_utf8() {
        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                location: Location {
                    line: 1,
                    column: 35
                }
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('💝'),
            Token::make_word("مصطفىh", None),
        ];
        compare(expected, tokens);
    }

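    // Tagged dollar quoting (`$tag$ ... $tag$`): the body is taken verbatim,
    // including `$`, `$$`, and differently-tagged `$...$` markers, until the
    // exact closing tag appears.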
    #[test]
    fn tokenize_dollar_quoted_string_tagged() {
        let test_cases = vec![
            (
                String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
                        tag: Some("tag".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$x$ab$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "x$ab".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("SELECT $abc$$abc$"),
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    })
                ]
            ),
            (
                String::from("0$abc$$abc$1"),
                vec![
                    Token::Number("0".into(), false),
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "".into(),
                        tag: Some("abc".into()),
                    }),
                    Token::Number("1".into(), false),
                ]
            ),
            (
                String::from("$function$abc$q$data$q$$function$"),
                vec![
                    Token::DollarQuotedString(DollarQuotedString {
                        value: "abc$q$data$q$".into(),
                        tag: Some("function".into()),
                    }),
                ]
            ),
        ];

        let dialect = GenericDialect {};
        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated() {
        let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 91
                }
            })
        );
    }

    #[test]
    fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
        let sql = String::from("SELECT $abc$abc$");
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted, expected $".into(),
                location: Location {
                    line: 1,
                    column: 17
                }
            })
        );
    }

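    // Under `SQLiteDialect`, `$`-prefixed forms are placeholders rather than
    // dollar-quoted strings.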
    #[test]
    fn tokenize_dollar_placeholder() {
        let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
        let dialect = SQLiteDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$$ABC$$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC$".into()),
                Token::Comma,
                Token::Whitespace(Whitespace::Space),
                Token::Placeholder("$ABC".into()),
            ]
        );
    }

    #[test]
    fn tokenize_nested_dollar_quoted_strings() {
        let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "dollar $nested$ string".into(),
                tag: Some("tag".into()),
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_empty() {
        let sql = String::from("SELECT $$$$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged() {
        let sql =
            String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::DollarQuotedString(DollarQuotedString {
                value: "within dollar '$' quoted strings have $tags like this$ ".into(),
                tag: None,
            }),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_dollar_quoted_string_untagged_unterminated() {
        let sql = String::from(
            "SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
        );
        let dialect = GenericDialect {};
        assert_eq!(
            Tokenizer::new(&dialect, &sql).tokenize(),
            Err(TokenizerError {
                message: "Unterminated dollar-quoted string".into(),
                location: Location {
                    line: 1,
                    column: 86
                }
            })
        );
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

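    // `--` comments run to the end of the line, and the terminating `\n` or
    // `\r\n` is kept as part of the comment text; in the generic dialect a
    // lone `\r` does not end the comment.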
    #[test]
    fn tokenize_comment() {
        let test_cases = vec![
            (
                String::from("0--this is a comment\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                String::from("0--this is a comment\r1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r1".to_string(),
                    }),
                ],
            ),
            (
                String::from("0--this is a comment\r\n1"),
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "this is a comment\r\n".to_string(),
                    }),
                    Token::Number("1".to_string(), false),
                ],
            ),
        ];

        let dialect = GenericDialect {};

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_comment_postgres() {
        let sql = String::from("1--\r0");

        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "\r".to_string(),
            }),
            Token::Number("0".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

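    // The generic dialect supports nested `/* ... */` comments: each inner
    // `/*` must be matched by its own `*/` before the outer comment closes.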
    #[test]
    fn tokenize_nested_multiline_comment() {
        let dialect = GenericDialect {};
        let test_cases = vec![
            (
                "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                    )),
                    Token::Whitespace(Whitespace::Space),
                    Token::Div,
                    Token::Word(Word {
                        value: "comment".to_string(),
                        quote_style: None,
                        keyword: Keyword::COMMENT,
                    }),
                    Token::Mul,
                    Token::Div,
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
                vec![
                    Token::Number("0".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(
                        "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                    )),
                    Token::Number("1".to_string(), false),
                ],
            ),
            (
                "SELECT 1/* a /* b */ c */0",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Number("1".to_string(), false),
                    Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                    Token::Number("0".to_string(), false),
                ],
            ),
        ];

        for (sql, expected) in test_cases {
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_nested_multiline_comment_empty() {
        let sql = "select 1/*/**/*/0";

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("select"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
            Token::Number("0".to_string(), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_nested_comments_if_not_supported() {
        let dialect = SQLiteDialect {};
        let sql = "SELECT 1/*/* nested comment */*/0";
        let tokens = Tokenizer::new(&dialect, sql).tokenize();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "/* nested comment ".to_string(),
            )),
            Token::Mul,
            Token::Div,
            Token::Number("0".to_string(), false),
        ];

        compare(expected, tokens.unwrap());
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unicode_whitespace() {
        let sql = String::from(" \u{2003}\n");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                location: Location { line: 1, column: 1 },
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_like_match() {
        let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::DoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkDoubleTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("_a%".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_snowflake_div() {
        let sql = r#"field/1000"#;
        let dialect = SnowflakeDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_word(r#"field"#, None),
            Token::Div,
            Token::Number("1000".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier_with_no_escape() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a "" b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }

    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        //println!("------------------------------");
        //println!("tokens   = {:?}", actual);
        //println!("expected = {:?}", expected);
        //println!("------------------------------");
        assert_eq!(expected, actual);
    }

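    // Wraps `s` in single quotes and feeds it to
    // `unescape_single_quoted_string`; `expected == None` means the input
    // contains an invalid escape sequence and unescaping must fail.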
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{}'", s);
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }

    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // 16- and 32-bit hexadecimal Unicode character values
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hexadecimal byte value
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal byte value
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // others
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

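    // Exercises `Dialect::supports_numeric_prefix` via a hand-rolled dialect
    // as well as the built-in Hive and MySQL dialects.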
    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }

    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }

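    // `with_unescape(false)` preserves the escaped source text verbatim;
    // `with_unescape(true)` resolves the backslash escapes.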
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // A dialect without backslash escapes
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        // MySQL special case for LIKE escapes
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }

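    // BigQuery-style triple-quoted strings. `check` runs the same cases for
    // `'''...'''` and `"""..."""`, with `q` as the quote under test and `r`
    // as the other quote character.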
    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char, // The quote character under test.
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                // Empty string.
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count an escaped quote as the end of the string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string.
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mixed single and double quotes, unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // Backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // Backslash-escaped characters.
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // A dialect without triple-quoted strings.
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn test_mysql_users_grantees() {
        let dialect = MySqlDialect {};

        let sql = "CREATE USER `root`@`%`";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("CREATE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("USER"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("root", Some('`')),
            Token::AtSign,
            Token::make_word("%", Some('`')),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

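    // In `N'...'` national string literals, `\'` is an escaped quote only
    // when `Dialect::supports_string_literal_backslash_escape` is true.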
    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }

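    // For dialects with `requires_single_line_comment_whitespace`, `--` only
    // starts a comment when followed by whitespace; `--'abc'` and a trailing
    // bare `--` tokenize as two minus operators instead.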
    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }

    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }
}