1use diagnostics::make_unclosed_delims_error;
2use rustc_ast::ast::{self, AttrStyle};
3use rustc_ast::token::{self, CommentKind, Delimiter, IdentIsRaw, Token, TokenKind};
4use rustc_ast::tokenstream::TokenStream;
5use rustc_ast::util::unicode::{TEXT_FLOW_CONTROL_CHARS, contains_text_flow_control_chars};
6use rustc_errors::codes::*;
7use rustc_errors::{Applicability, Diag, DiagCtxtHandle, StashKey};
8use rustc_lexer::{
9 Base, Cursor, DocStyle, FrontmatterAllowed, LiteralKind, RawStrError, is_whitespace,
10};
11use rustc_literal_escaper::{EscapeError, Mode, check_for_errors};
12use rustc_session::lint::BuiltinLintDiag;
13use rustc_session::lint::builtin::{
14 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
15 TEXT_DIRECTION_CODEPOINT_IN_COMMENT, TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
16};
17use rustc_session::parse::ParseSess;
18use rustc_span::{BytePos, Pos, Span, Symbol, sym};
19use tracing::debug;
20
21use crate::errors;
22use crate::lexer::diagnostics::TokenTreeDiagInfo;
23use crate::lexer::unicode_chars::UNICODE_ARRAY;
24
25mod diagnostics;
26mod tokentrees;
27mod unescape_error_reporting;
28mod unicode_chars;
29
30use unescape_error_reporting::{emit_unescape_error, escaped_char};
31
// Compile-time guard, active only on 64-bit targets, that pins the size of
// `rustc_lexer::Token` to 12 (presumably bytes -- confirm against the
// `static_assert_size!` definition) so the type cannot grow unnoticed.
#[cfg(target_pointer_width = "64")]
rustc_data_structures::static_assert_size!(rustc_lexer::Token, 12);
38
/// Record describing a delimiter that could not be paired up.
///
/// NOTE(review): none of the fields are read in this part of the file; the
/// per-field notes below are inferred from the field names and types only and
/// should be confirmed against the `diagnostics` / `tokentrees` submodules.
#[derive(Clone, Debug)]
pub(crate) struct UnmatchedDelim {
    /// Presumably the delimiter that was actually found, if any -- TODO confirm.
    pub found_delim: Option<Delimiter>,
    /// Presumably the location of what was found -- TODO confirm.
    pub found_span: Span,
    /// Presumably the location of the opening delimiter left unclosed, if known
    /// -- TODO confirm.
    pub unclosed_span: Option<Span>,
    /// Presumably a location suggested as the likely culprit, if any -- TODO confirm.
    pub candidate_span: Option<Span>,
}
46
47pub(crate) fn lex_token_trees<'psess, 'src>(
48 psess: &'psess ParseSess,
49 mut src: &'src str,
50 mut start_pos: BytePos,
51 override_span: Option<Span>,
52) -> Result<TokenStream, Vec<Diag<'psess>>> {
53 if let Some(shebang_len) = rustc_lexer::strip_shebang(src) {
55 src = &src[shebang_len..];
56 start_pos = start_pos + BytePos::from_usize(shebang_len);
57 }
58
59 let cursor = Cursor::new(src, FrontmatterAllowed::Yes);
60 let mut lexer = Lexer {
61 psess,
62 start_pos,
63 pos: start_pos,
64 src,
65 cursor,
66 override_span,
67 nbsp_is_whitespace: false,
68 last_lifetime: None,
69 token: Token::dummy(),
70 diag_info: TokenTreeDiagInfo::default(),
71 };
72 let res = lexer.lex_token_trees(false);
73
74 let mut unmatched_delims: Vec<_> = lexer
75 .diag_info
76 .unmatched_delims
77 .into_iter()
78 .filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
79 .collect();
80
81 match res {
82 Ok((_open_spacing, stream)) => {
83 if unmatched_delims.is_empty() {
84 Ok(stream)
85 } else {
86 Err(unmatched_delims)
88 }
89 }
90 Err(errs) => {
91 unmatched_delims.extend(errs);
94 Err(unmatched_delims)
95 }
96 }
97}
98
/// State of the lexer that cooks raw `rustc_lexer` tokens into `rustc_ast` tokens.
struct Lexer<'psess, 'src> {
    /// Parse session used for diagnostics, buffered lints and symbol bookkeeping.
    psess: &'psess ParseSess,
    /// Position corresponding to the first byte of `src`; `src_index` subtracts
    /// it from a position to obtain an index into `src`.
    start_pos: BytePos,
    /// Position just past the most recently consumed raw token.
    pos: BytePos,
    /// Source text being tokenized.
    src: &'src str,
    /// Cursor that produces the raw lexer tokens.
    cursor: Cursor<'src>,
    /// When set, `mk_sp` returns this span instead of one built from positions.
    override_span: Option<Span>,
    /// Set once an unknown-token error has been emitted for a non-breaking
    /// space (U+00A0); later occurrences are then treated as whitespace.
    nbsp_is_whitespace: bool,

    /// Span of the first byte of the last lifetime token seen. Consulted by
    /// diagnostics that suggest double quotes may have been intended.
    last_lifetime: Option<Span>,

    /// Initialised to `Token::dummy()`; not read or written elsewhere in this
    /// part of the file -- presumably the current token maintained by the
    /// `tokentrees` submodule (TODO confirm).
    token: Token,

    /// Diagnostic bookkeeping for token-tree lexing, including the list of
    /// unmatched delimiters consumed by `lex_token_trees`.
    diag_info: TokenTreeDiagInfo,
}
124
125impl<'psess, 'src> Lexer<'psess, 'src> {
126 fn dcx(&self) -> DiagCtxtHandle<'psess> {
127 self.psess.dcx()
128 }
129
130 fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
131 self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
132 }
133
134 fn next_token_from_cursor(&mut self) -> (Token, bool) {
137 let mut preceded_by_whitespace = false;
138 let mut swallow_next_invalid = 0;
139 loop {
141 let str_before = self.cursor.as_str();
142 let token = self.cursor.advance_token();
143 let start = self.pos;
144 self.pos = self.pos + BytePos(token.len);
145
146 debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));
147
148 if let rustc_lexer::TokenKind::Semi
149 | rustc_lexer::TokenKind::LineComment { .. }
150 | rustc_lexer::TokenKind::BlockComment { .. }
151 | rustc_lexer::TokenKind::CloseParen
152 | rustc_lexer::TokenKind::CloseBrace
153 | rustc_lexer::TokenKind::CloseBracket = token.kind
154 {
155 self.last_lifetime = None;
158 }
159
160 let kind = match token.kind {
164 rustc_lexer::TokenKind::LineComment { doc_style } => {
165 let Some(doc_style) = doc_style else {
167 self.lint_unicode_text_flow(start);
168 preceded_by_whitespace = true;
169 continue;
170 };
171
172 let content_start = start + BytePos(3);
174 let content = self.str_from(content_start);
175 self.lint_doc_comment_unicode_text_flow(start, content);
176 self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
177 }
178 rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
179 if !terminated {
180 self.report_unterminated_block_comment(start, doc_style);
181 }
182
183 let Some(doc_style) = doc_style else {
185 self.lint_unicode_text_flow(start);
186 preceded_by_whitespace = true;
187 continue;
188 };
189
190 let content_start = start + BytePos(3);
193 let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
194 let content = self.str_from_to(content_start, content_end);
195 self.lint_doc_comment_unicode_text_flow(start, content);
196 self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
197 }
198 rustc_lexer::TokenKind::Frontmatter { has_invalid_preceding_whitespace, invalid_infostring } => {
199 self.validate_frontmatter(start, has_invalid_preceding_whitespace, invalid_infostring);
200 preceded_by_whitespace = true;
201 continue;
202 }
203 rustc_lexer::TokenKind::Whitespace => {
204 preceded_by_whitespace = true;
205 continue;
206 }
207 rustc_lexer::TokenKind::Ident => self.ident(start),
208 rustc_lexer::TokenKind::RawIdent => {
209 let sym = nfc_normalize(self.str_from(start + BytePos(2)));
210 let span = self.mk_sp(start, self.pos);
211 self.psess.symbol_gallery.insert(sym, span);
212 if !sym.can_be_raw() {
213 self.dcx().emit_err(errors::CannotBeRawIdent { span, ident: sym });
214 }
215 self.psess.raw_identifier_spans.push(span);
216 token::Ident(sym, IdentIsRaw::Yes)
217 }
218 rustc_lexer::TokenKind::UnknownPrefix => {
219 self.report_unknown_prefix(start);
220 self.ident(start)
221 }
222 rustc_lexer::TokenKind::UnknownPrefixLifetime => {
223 self.report_unknown_prefix(start);
224 let lifetime_name = self.str_from(start);
228 self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
229 let ident = Symbol::intern(lifetime_name);
230 token::Lifetime(ident, IdentIsRaw::No)
231 }
232 rustc_lexer::TokenKind::InvalidIdent
233 if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
236 let sym = self.str_from(start);
237 sym.chars().count() == 1 && c == sym.chars().next().unwrap()
238 }) =>
239 {
240 let sym = nfc_normalize(self.str_from(start));
241 let span = self.mk_sp(start, self.pos);
242 self.psess
243 .bad_unicode_identifiers
244 .borrow_mut()
245 .entry(sym)
246 .or_default()
247 .push(span);
248 token::Ident(sym, IdentIsRaw::No)
249 }
250 rustc_lexer::TokenKind::Literal {
253 kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
254 suffix_start: _,
255 } if !self.mk_sp(start, self.pos).edition().at_least_rust_2021() => {
256 let prefix_len = match kind {
257 LiteralKind::CStr { .. } => 1,
258 LiteralKind::RawCStr { .. } => 2,
259 _ => unreachable!(),
260 };
261
262 let lit_start = start + BytePos(prefix_len);
265 self.pos = lit_start;
266 self.cursor = Cursor::new(&str_before[prefix_len as usize..], FrontmatterAllowed::No);
267 self.report_unknown_prefix(start);
268 let prefix_span = self.mk_sp(start, lit_start);
269 return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
270 }
271 rustc_lexer::TokenKind::GuardedStrPrefix => {
272 self.maybe_report_guarded_str(start, str_before)
273 }
274 rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
275 let suffix_start = start + BytePos(suffix_start);
276 let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
277 let suffix = if suffix_start < self.pos {
278 let string = self.str_from(suffix_start);
279 if string == "_" {
280 self.dcx().emit_err(errors::UnderscoreLiteralSuffix {
281 span: self.mk_sp(suffix_start, self.pos),
282 });
283 None
284 } else {
285 Some(Symbol::intern(string))
286 }
287 } else {
288 None
289 };
290 self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
291 token::Literal(token::Lit { kind, symbol, suffix })
292 }
293 rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
294 let lifetime_name = self.str_from(start);
298 self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
299 if starts_with_number {
300 let span = self.mk_sp(start, self.pos);
301 self.dcx()
302 .struct_err("lifetimes cannot start with a number")
303 .with_span(span)
304 .stash(span, StashKey::LifetimeIsChar);
305 }
306 let ident = Symbol::intern(lifetime_name);
307 token::Lifetime(ident, IdentIsRaw::No)
308 }
309 rustc_lexer::TokenKind::RawLifetime => {
310 self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
311
312 let ident_start = start + BytePos(3);
313 let prefix_span = self.mk_sp(start, ident_start);
314
315 if prefix_span.at_least_rust_2021() {
316 if self.cursor.as_str().starts_with('\'') {
322 let lit_span = self.mk_sp(start, self.pos + BytePos(1));
323 let contents = self.str_from_to(start + BytePos(1), self.pos);
324 emit_unescape_error(
325 self.dcx(),
326 contents,
327 lit_span,
328 lit_span,
329 Mode::Char,
330 0..contents.len(),
331 EscapeError::MoreThanOneChar,
332 )
333 .expect("expected error");
334 }
335
336 let span = self.mk_sp(start, self.pos);
337
338 let lifetime_name_without_tick =
339 Symbol::intern(&self.str_from(ident_start));
340 if !lifetime_name_without_tick.can_be_raw() {
341 self.dcx().emit_err(
342 errors::CannotBeRawLifetime {
343 span,
344 ident: lifetime_name_without_tick
345 }
346 );
347 }
348
349 let mut lifetime_name =
351 String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
352 lifetime_name.push('\'');
353 lifetime_name += lifetime_name_without_tick.as_str();
354 let sym = Symbol::intern(&lifetime_name);
355
356 self.psess.raw_identifier_spans.push(span);
358
359 token::Lifetime(sym, IdentIsRaw::Yes)
360 } else {
361 self.psess.buffer_lint(
363 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
364 prefix_span,
365 ast::CRATE_NODE_ID,
366 BuiltinLintDiag::RawPrefix(prefix_span),
367 );
368
369 let lt_start = start + BytePos(2);
371 self.pos = lt_start;
372 self.cursor = Cursor::new(&str_before[2 as usize..], FrontmatterAllowed::No);
373
374 let lifetime_name = self.str_from(start);
375 let ident = Symbol::intern(lifetime_name);
376 token::Lifetime(ident, IdentIsRaw::No)
377 }
378 }
379 rustc_lexer::TokenKind::Semi => token::Semi,
380 rustc_lexer::TokenKind::Comma => token::Comma,
381 rustc_lexer::TokenKind::Dot => token::Dot,
382 rustc_lexer::TokenKind::OpenParen => token::OpenParen,
383 rustc_lexer::TokenKind::CloseParen => token::CloseParen,
384 rustc_lexer::TokenKind::OpenBrace => token::OpenBrace,
385 rustc_lexer::TokenKind::CloseBrace => token::CloseBrace,
386 rustc_lexer::TokenKind::OpenBracket => token::OpenBracket,
387 rustc_lexer::TokenKind::CloseBracket => token::CloseBracket,
388 rustc_lexer::TokenKind::At => token::At,
389 rustc_lexer::TokenKind::Pound => token::Pound,
390 rustc_lexer::TokenKind::Tilde => token::Tilde,
391 rustc_lexer::TokenKind::Question => token::Question,
392 rustc_lexer::TokenKind::Colon => token::Colon,
393 rustc_lexer::TokenKind::Dollar => token::Dollar,
394 rustc_lexer::TokenKind::Eq => token::Eq,
395 rustc_lexer::TokenKind::Bang => token::Bang,
396 rustc_lexer::TokenKind::Lt => token::Lt,
397 rustc_lexer::TokenKind::Gt => token::Gt,
398 rustc_lexer::TokenKind::Minus => token::Minus,
399 rustc_lexer::TokenKind::And => token::And,
400 rustc_lexer::TokenKind::Or => token::Or,
401 rustc_lexer::TokenKind::Plus => token::Plus,
402 rustc_lexer::TokenKind::Star => token::Star,
403 rustc_lexer::TokenKind::Slash => token::Slash,
404 rustc_lexer::TokenKind::Caret => token::Caret,
405 rustc_lexer::TokenKind::Percent => token::Percent,
406
407 rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
408 if swallow_next_invalid > 0 {
410 swallow_next_invalid -= 1;
411 continue;
412 }
413 let mut it = self.str_from_to_end(start).chars();
414 let c = it.next().unwrap();
415 if c == '\u{00a0}' {
416 if self.nbsp_is_whitespace {
420 preceded_by_whitespace = true;
421 continue;
422 }
423 self.nbsp_is_whitespace = true;
424 }
425 let repeats = it.take_while(|c1| *c1 == c).count();
426 let (token, sugg) =
433 unicode_chars::check_for_substitution(self, start, c, repeats + 1);
434 self.dcx().emit_err(errors::UnknownTokenStart {
435 span: self.mk_sp(start, self.pos + Pos::from_usize(repeats * c.len_utf8())),
436 escaped: escaped_char(c),
437 sugg,
438 null: if c == '\x00' { Some(errors::UnknownTokenNull) } else { None },
439 repeat: if repeats > 0 {
440 swallow_next_invalid = repeats;
441 Some(errors::UnknownTokenRepeat { repeats })
442 } else {
443 None
444 },
445 });
446
447 if let Some(token) = token {
448 token
449 } else {
450 preceded_by_whitespace = true;
451 continue;
452 }
453 }
454 rustc_lexer::TokenKind::Eof => token::Eof,
455 };
456 let span = self.mk_sp(start, self.pos);
457 return (Token::new(kind, span), preceded_by_whitespace);
458 }
459 }
460
461 fn ident(&self, start: BytePos) -> TokenKind {
462 let sym = nfc_normalize(self.str_from(start));
463 let span = self.mk_sp(start, self.pos);
464 self.psess.symbol_gallery.insert(sym, span);
465 token::Ident(sym, IdentIsRaw::No)
466 }
467
468 fn lint_unicode_text_flow(&self, start: BytePos) {
471 let content_start = start + BytePos(2);
473 let content = self.str_from(content_start);
474 if contains_text_flow_control_chars(content) {
475 let span = self.mk_sp(start, self.pos);
476 self.psess.buffer_lint(
477 TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
478 span,
479 ast::CRATE_NODE_ID,
480 BuiltinLintDiag::UnicodeTextFlow(span, content.to_string()),
481 );
482 }
483 }
484
485 fn lint_doc_comment_unicode_text_flow(&mut self, start: BytePos, content: &str) {
486 if contains_text_flow_control_chars(content) {
487 self.report_text_direction_codepoint(
488 content,
489 self.mk_sp(start, self.pos),
490 0,
491 false,
492 "doc comment",
493 );
494 }
495 }
496
497 fn lint_literal_unicode_text_flow(
498 &mut self,
499 text: Symbol,
500 lit_kind: token::LitKind,
501 span: Span,
502 label: &'static str,
503 ) {
504 if !contains_text_flow_control_chars(text.as_str()) {
505 return;
506 }
507 let (padding, point_at_inner_spans) = match lit_kind {
508 token::LitKind::Str | token::LitKind::Char => (1, true),
510 token::LitKind::CStr => (2, true),
512 token::LitKind::StrRaw(n) => (n as u32 + 2, true),
514 token::LitKind::CStrRaw(n) => (n as u32 + 3, true),
516 token::LitKind::Err(_) => return,
518 _ => (0, false),
520 };
521 self.report_text_direction_codepoint(
522 text.as_str(),
523 span,
524 padding,
525 point_at_inner_spans,
526 label,
527 );
528 }
529
530 fn report_text_direction_codepoint(
531 &self,
532 text: &str,
533 span: Span,
534 padding: u32,
535 point_at_inner_spans: bool,
536 label: &str,
537 ) {
538 let spans: Vec<_> = text
540 .char_indices()
541 .filter_map(|(i, c)| {
542 TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
543 let lo = span.lo() + BytePos(i as u32 + padding);
544 (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
545 })
546 })
547 .collect();
548
549 let count = spans.len();
550 let labels = point_at_inner_spans.then_some(spans.clone());
551
552 self.psess.buffer_lint(
553 TEXT_DIRECTION_CODEPOINT_IN_LITERAL,
554 span,
555 ast::CRATE_NODE_ID,
556 BuiltinLintDiag::HiddenUnicodeCodepoints {
557 label: label.to_string(),
558 count,
559 span_label: span,
560 labels,
561 escape: point_at_inner_spans && !spans.is_empty(),
562 spans,
563 },
564 );
565 }
566
567 fn validate_frontmatter(
568 &self,
569 start: BytePos,
570 has_invalid_preceding_whitespace: bool,
571 invalid_infostring: bool,
572 ) {
573 let s = self.str_from(start);
574 let real_start = s.find("---").unwrap();
575 let frontmatter_opening_pos = BytePos(real_start as u32) + start;
576 let s_new = &s[real_start..];
577 let within = s_new.trim_start_matches('-');
578 let len_opening = s_new.len() - within.len();
579
580 let frontmatter_opening_end_pos = frontmatter_opening_pos + BytePos(len_opening as u32);
581 if has_invalid_preceding_whitespace {
582 let line_start =
583 BytePos(s[..real_start].rfind("\n").map_or(0, |i| i as u32 + 1)) + start;
584 let span = self.mk_sp(line_start, frontmatter_opening_end_pos);
585 let label_span = self.mk_sp(line_start, frontmatter_opening_pos);
586 self.dcx().emit_err(errors::FrontmatterInvalidOpeningPrecedingWhitespace {
587 span,
588 note_span: label_span,
589 });
590 }
591
592 if invalid_infostring {
593 let line_end = s[real_start..].find('\n').unwrap_or(s[real_start..].len());
594 let span = self.mk_sp(
595 frontmatter_opening_end_pos,
596 frontmatter_opening_pos + BytePos(line_end as u32),
597 );
598 self.dcx().emit_err(errors::FrontmatterInvalidInfostring { span });
599 }
600
601 let last_line_start = within.rfind('\n').map_or(0, |i| i + 1);
602 let last_line = &within[last_line_start..];
603 let last_line_trimmed = last_line.trim_start_matches(is_whitespace);
604 let last_line_start_pos = frontmatter_opening_end_pos + BytePos(last_line_start as u32);
605
606 let frontmatter_span = self.mk_sp(frontmatter_opening_pos, self.pos);
607 self.psess.gated_spans.gate(sym::frontmatter, frontmatter_span);
608
609 if !last_line_trimmed.starts_with("---") {
610 let label_span = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
611 self.dcx().emit_err(errors::FrontmatterUnclosed {
612 span: frontmatter_span,
613 note_span: label_span,
614 });
615 return;
616 }
617
618 if last_line_trimmed.len() != last_line.len() {
619 let line_end = last_line_start_pos + BytePos(last_line.len() as u32);
620 let span = self.mk_sp(last_line_start_pos, line_end);
621 let whitespace_end =
622 last_line_start_pos + BytePos((last_line.len() - last_line_trimmed.len()) as u32);
623 let label_span = self.mk_sp(last_line_start_pos, whitespace_end);
624 self.dcx().emit_err(errors::FrontmatterInvalidClosingPrecedingWhitespace {
625 span,
626 note_span: label_span,
627 });
628 }
629
630 let rest = last_line_trimmed.trim_start_matches('-');
631 let len_close = last_line_trimmed.len() - rest.len();
632 if len_close != len_opening {
633 let span = self.mk_sp(frontmatter_opening_pos, self.pos);
634 let opening = self.mk_sp(frontmatter_opening_pos, frontmatter_opening_end_pos);
635 let last_line_close_pos = last_line_start_pos + BytePos(len_close as u32);
636 let close = self.mk_sp(last_line_start_pos, last_line_close_pos);
637 self.dcx().emit_err(errors::FrontmatterLengthMismatch {
638 span,
639 opening,
640 close,
641 len_opening,
642 len_close,
643 });
644 }
645
646 if !rest.trim_matches(is_whitespace).is_empty() {
647 let span = self.mk_sp(last_line_start_pos, self.pos);
648 self.dcx().emit_err(errors::FrontmatterExtraCharactersAfterClose { span });
649 }
650 }
651
652 fn cook_doc_comment(
653 &self,
654 content_start: BytePos,
655 content: &str,
656 comment_kind: CommentKind,
657 doc_style: DocStyle,
658 ) -> TokenKind {
659 if content.contains('\r') {
660 for (idx, _) in content.char_indices().filter(|&(_, c)| c == '\r') {
661 let span = self.mk_sp(
662 content_start + BytePos(idx as u32),
663 content_start + BytePos(idx as u32 + 1),
664 );
665 let block = matches!(comment_kind, CommentKind::Block);
666 self.dcx().emit_err(errors::CrDocComment { span, block });
667 }
668 }
669
670 let attr_style = match doc_style {
671 DocStyle::Outer => AttrStyle::Outer,
672 DocStyle::Inner => AttrStyle::Inner,
673 };
674
675 token::DocComment(comment_kind, attr_style, Symbol::intern(content))
676 }
677
678 fn cook_lexer_literal(
679 &self,
680 start: BytePos,
681 end: BytePos,
682 kind: rustc_lexer::LiteralKind,
683 ) -> (token::LitKind, Symbol) {
684 match kind {
685 rustc_lexer::LiteralKind::Char { terminated } => {
686 if !terminated {
687 let mut err = self
688 .dcx()
689 .struct_span_fatal(self.mk_sp(start, end), "unterminated character literal")
690 .with_code(E0762);
691 if let Some(lt_sp) = self.last_lifetime {
692 err.multipart_suggestion(
693 "if you meant to write a string literal, use double quotes",
694 vec![
695 (lt_sp, "\"".to_string()),
696 (self.mk_sp(start, start + BytePos(1)), "\"".to_string()),
697 ],
698 Applicability::MaybeIncorrect,
699 );
700 }
701 err.emit()
702 }
703 self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) }
705 rustc_lexer::LiteralKind::Byte { terminated } => {
706 if !terminated {
707 self.dcx()
708 .struct_span_fatal(
709 self.mk_sp(start + BytePos(1), end),
710 "unterminated byte constant",
711 )
712 .with_code(E0763)
713 .emit()
714 }
715 self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) }
717 rustc_lexer::LiteralKind::Str { terminated } => {
718 if !terminated {
719 self.dcx()
720 .struct_span_fatal(
721 self.mk_sp(start, end),
722 "unterminated double quote string",
723 )
724 .with_code(E0765)
725 .emit()
726 }
727 self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) }
729 rustc_lexer::LiteralKind::ByteStr { terminated } => {
730 if !terminated {
731 self.dcx()
732 .struct_span_fatal(
733 self.mk_sp(start + BytePos(1), end),
734 "unterminated double quote byte string",
735 )
736 .with_code(E0766)
737 .emit()
738 }
739 self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1)
740 }
742 rustc_lexer::LiteralKind::CStr { terminated } => {
743 if !terminated {
744 self.dcx()
745 .struct_span_fatal(
746 self.mk_sp(start + BytePos(1), end),
747 "unterminated C string",
748 )
749 .with_code(E0767)
750 .emit()
751 }
752 self.cook_quoted(token::CStr, Mode::CStr, start, end, 2, 1) }
754 rustc_lexer::LiteralKind::RawStr { n_hashes } => {
755 if let Some(n_hashes) = n_hashes {
756 let n = u32::from(n_hashes);
757 let kind = token::StrRaw(n_hashes);
758 self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n)
759 } else {
761 self.report_raw_str_error(start, 1);
762 }
763 }
764 rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
765 if let Some(n_hashes) = n_hashes {
766 let n = u32::from(n_hashes);
767 let kind = token::ByteStrRaw(n_hashes);
768 self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n)
769 } else {
771 self.report_raw_str_error(start, 2);
772 }
773 }
774 rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
775 if let Some(n_hashes) = n_hashes {
776 let n = u32::from(n_hashes);
777 let kind = token::CStrRaw(n_hashes);
778 self.cook_quoted(kind, Mode::RawCStr, start, end, 3 + n, 1 + n)
779 } else {
781 self.report_raw_str_error(start, 2);
782 }
783 }
784 rustc_lexer::LiteralKind::Int { base, empty_int } => {
785 let mut kind = token::Integer;
786 if empty_int {
787 let span = self.mk_sp(start, end);
788 let guar = self.dcx().emit_err(errors::NoDigitsLiteral { span });
789 kind = token::Err(guar);
790 } else if matches!(base, Base::Binary | Base::Octal) {
791 let base = base as u32;
792 let s = self.str_from_to(start + BytePos(2), end);
793 for (idx, c) in s.char_indices() {
794 let span = self.mk_sp(
795 start + BytePos::from_usize(2 + idx),
796 start + BytePos::from_usize(2 + idx + c.len_utf8()),
797 );
798 if c != '_' && c.to_digit(base).is_none() {
799 let guar =
800 self.dcx().emit_err(errors::InvalidDigitLiteral { span, base });
801 kind = token::Err(guar);
802 }
803 }
804 }
805 (kind, self.symbol_from_to(start, end))
806 }
807 rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
808 let mut kind = token::Float;
809 if empty_exponent {
810 let span = self.mk_sp(start, self.pos);
811 let guar = self.dcx().emit_err(errors::EmptyExponentFloat { span });
812 kind = token::Err(guar);
813 }
814 let base = match base {
815 Base::Hexadecimal => Some("hexadecimal"),
816 Base::Octal => Some("octal"),
817 Base::Binary => Some("binary"),
818 _ => None,
819 };
820 if let Some(base) = base {
821 let span = self.mk_sp(start, end);
822 let guar =
823 self.dcx().emit_err(errors::FloatLiteralUnsupportedBase { span, base });
824 kind = token::Err(guar)
825 }
826 (kind, self.symbol_from_to(start, end))
827 }
828 }
829 }
830
831 #[inline]
832 fn src_index(&self, pos: BytePos) -> usize {
833 (pos - self.start_pos).to_usize()
834 }
835
836 fn str_from(&self, start: BytePos) -> &'src str {
839 self.str_from_to(start, self.pos)
840 }
841
842 fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
844 debug!("taking an ident from {:?} to {:?}", start, end);
845 Symbol::intern(self.str_from_to(start, end))
846 }
847
848 fn str_from_to(&self, start: BytePos, end: BytePos) -> &'src str {
850 &self.src[self.src_index(start)..self.src_index(end)]
851 }
852
853 fn str_from_to_end(&self, start: BytePos) -> &'src str {
855 &self.src[self.src_index(start)..]
856 }
857
858 fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
859 match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
860 Err(RawStrError::InvalidStarter { bad_char }) => {
861 self.report_non_started_raw_string(start, bad_char)
862 }
863 Err(RawStrError::NoTerminator { expected, found, possible_terminator_offset }) => self
864 .report_unterminated_raw_string(start, expected, possible_terminator_offset, found),
865 Err(RawStrError::TooManyDelimiters { found }) => {
866 self.report_too_many_hashes(start, found)
867 }
868 Ok(()) => panic!("no error found for supposedly invalid raw string literal"),
869 }
870 }
871
872 fn report_non_started_raw_string(&self, start: BytePos, bad_char: char) -> ! {
873 self.dcx()
874 .struct_span_fatal(
875 self.mk_sp(start, self.pos),
876 format!(
877 "found invalid character; only `#` is allowed in raw string delimitation: {}",
878 escaped_char(bad_char)
879 ),
880 )
881 .emit()
882 }
883
884 fn report_unterminated_raw_string(
885 &self,
886 start: BytePos,
887 n_hashes: u32,
888 possible_offset: Option<u32>,
889 found_terminators: u32,
890 ) -> ! {
891 let mut err =
892 self.dcx().struct_span_fatal(self.mk_sp(start, start), "unterminated raw string");
893 err.code(E0748);
894 err.span_label(self.mk_sp(start, start), "unterminated raw string");
895
896 if n_hashes > 0 {
897 err.note(format!(
898 "this raw string should be terminated with `\"{}`",
899 "#".repeat(n_hashes as usize)
900 ));
901 }
902
903 if let Some(possible_offset) = possible_offset {
904 let lo = start + BytePos(possible_offset);
905 let hi = lo + BytePos(found_terminators);
906 let span = self.mk_sp(lo, hi);
907 err.span_suggestion(
908 span,
909 "consider terminating the string here",
910 "#".repeat(n_hashes as usize),
911 Applicability::MaybeIncorrect,
912 );
913 }
914
915 err.emit()
916 }
917
918 fn report_unterminated_block_comment(&self, start: BytePos, doc_style: Option<DocStyle>) {
919 let msg = match doc_style {
920 Some(_) => "unterminated block doc-comment",
921 None => "unterminated block comment",
922 };
923 let last_bpos = self.pos;
924 let mut err = self.dcx().struct_span_fatal(self.mk_sp(start, last_bpos), msg);
925 err.code(E0758);
926 let mut nested_block_comment_open_idxs = vec![];
927 let mut last_nested_block_comment_idxs = None;
928 let mut content_chars = self.str_from(start).char_indices().peekable();
929
930 while let Some((idx, current_char)) = content_chars.next() {
931 match content_chars.peek() {
932 Some((_, '*')) if current_char == '/' => {
933 nested_block_comment_open_idxs.push(idx);
934 }
935 Some((_, '/')) if current_char == '*' => {
936 last_nested_block_comment_idxs =
937 nested_block_comment_open_idxs.pop().map(|open_idx| (open_idx, idx));
938 }
939 _ => {}
940 };
941 }
942
943 if let Some((nested_open_idx, nested_close_idx)) = last_nested_block_comment_idxs {
944 err.span_label(self.mk_sp(start, start + BytePos(2)), msg)
945 .span_label(
946 self.mk_sp(
947 start + BytePos(nested_open_idx as u32),
948 start + BytePos(nested_open_idx as u32 + 2),
949 ),
950 "...as last nested comment starts here, maybe you want to close this instead?",
951 )
952 .span_label(
953 self.mk_sp(
954 start + BytePos(nested_close_idx as u32),
955 start + BytePos(nested_close_idx as u32 + 2),
956 ),
957 "...and last nested comment terminates here.",
958 );
959 }
960
961 err.emit();
962 }
963
964 fn report_unknown_prefix(&self, start: BytePos) {
969 let prefix_span = self.mk_sp(start, self.pos);
970 let prefix = self.str_from_to(start, self.pos);
971 let expn_data = prefix_span.ctxt().outer_expn_data();
972
973 if expn_data.edition.at_least_rust_2021() {
974 let sugg = if prefix == "rb" {
976 Some(errors::UnknownPrefixSugg::UseBr(prefix_span))
977 } else if prefix == "rc" {
978 Some(errors::UnknownPrefixSugg::UseCr(prefix_span))
979 } else if expn_data.is_root() {
980 if self.cursor.first() == '\''
981 && let Some(start) = self.last_lifetime
982 && self.cursor.third() != '\''
983 && let end = self.mk_sp(self.pos, self.pos + BytePos(1))
984 && !self.psess.source_map().is_multiline(start.until(end))
985 {
986 Some(errors::UnknownPrefixSugg::MeantStr { start, end })
990 } else {
991 Some(errors::UnknownPrefixSugg::Whitespace(prefix_span.shrink_to_hi()))
992 }
993 } else {
994 None
995 };
996 self.dcx().emit_err(errors::UnknownPrefix { span: prefix_span, prefix, sugg });
997 } else {
998 self.psess.buffer_lint(
1000 RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX,
1001 prefix_span,
1002 ast::CRATE_NODE_ID,
1003 BuiltinLintDiag::ReservedPrefix(prefix_span, prefix.to_string()),
1004 );
1005 }
1006 }
1007
1008 fn maybe_report_guarded_str(&mut self, start: BytePos, str_before: &'src str) -> TokenKind {
1015 let span = self.mk_sp(start, self.pos);
1016 let edition2024 = span.edition().at_least_rust_2024();
1017
1018 let space_pos = start + BytePos(1);
1019 let space_span = self.mk_sp(space_pos, space_pos);
1020
1021 let mut cursor = Cursor::new(str_before, FrontmatterAllowed::No);
1022
1023 let (is_string, span, unterminated) = match cursor.guarded_double_quoted_string() {
1024 Some(rustc_lexer::GuardedStr { n_hashes, terminated, token_len }) => {
1025 let end = start + BytePos(token_len);
1026 let span = self.mk_sp(start, end);
1027 let str_start = start + BytePos(n_hashes);
1028
1029 if edition2024 {
1030 self.cursor = cursor;
1031 self.pos = end;
1032 }
1033
1034 let unterminated = if terminated { None } else { Some(str_start) };
1035
1036 (true, span, unterminated)
1037 }
1038 None => {
1039 debug_assert_eq!(self.str_from_to(start, start + BytePos(2)), "##");
1041
1042 (false, span, None)
1043 }
1044 };
1045 if edition2024 {
1046 if let Some(str_start) = unterminated {
1047 self.dcx()
1049 .struct_span_fatal(
1050 self.mk_sp(str_start, self.pos),
1051 "unterminated double quote string",
1052 )
1053 .with_code(E0765)
1054 .emit()
1055 }
1056
1057 let sugg = if span.from_expansion() {
1058 None
1059 } else {
1060 Some(errors::GuardedStringSugg(space_span))
1061 };
1062
1063 let err = if is_string {
1065 self.dcx().emit_err(errors::ReservedString { span, sugg })
1066 } else {
1067 self.dcx().emit_err(errors::ReservedMultihash { span, sugg })
1068 };
1069
1070 token::Literal(token::Lit {
1071 kind: token::Err(err),
1072 symbol: self.symbol_from_to(start, self.pos),
1073 suffix: None,
1074 })
1075 } else {
1076 self.psess.buffer_lint(
1078 RUST_2024_GUARDED_STRING_INCOMPATIBLE_SYNTAX,
1079 span,
1080 ast::CRATE_NODE_ID,
1081 BuiltinLintDiag::ReservedString { is_string, suggestion: space_span },
1082 );
1083
1084 self.pos = start + BytePos(1);
1087 self.cursor = Cursor::new(&str_before[1..], FrontmatterAllowed::No);
1088 token::Pound
1089 }
1090 }
1091
1092 fn report_too_many_hashes(&self, start: BytePos, num: u32) -> ! {
1093 self.dcx().emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
1094 }
1095
1096 fn cook_quoted(
1097 &self,
1098 mut kind: token::LitKind,
1099 mode: Mode,
1100 start: BytePos,
1101 end: BytePos,
1102 prefix_len: u32,
1103 postfix_len: u32,
1104 ) -> (token::LitKind, Symbol) {
1105 let content_start = start + BytePos(prefix_len);
1106 let content_end = end - BytePos(postfix_len);
1107 let lit_content = self.str_from_to(content_start, content_end);
1108 check_for_errors(lit_content, mode, |range, err| {
1109 let span_with_quotes = self.mk_sp(start, end);
1110 let (start, end) = (range.start as u32, range.end as u32);
1111 let lo = content_start + BytePos(start);
1112 let hi = lo + BytePos(end - start);
1113 let span = self.mk_sp(lo, hi);
1114 let is_fatal = err.is_fatal();
1115 if let Some(guar) = emit_unescape_error(
1116 self.dcx(),
1117 lit_content,
1118 span_with_quotes,
1119 span,
1120 mode,
1121 range,
1122 err,
1123 ) {
1124 assert!(is_fatal);
1125 kind = token::Err(guar);
1126 }
1127 });
1128
1129 let sym = if !matches!(kind, token::Err(_)) {
1132 Symbol::intern(lit_content)
1133 } else {
1134 self.symbol_from_to(start, end)
1135 };
1136 (kind, sym)
1137 }
1138}
1139
1140pub fn nfc_normalize(string: &str) -> Symbol {
1141 use unicode_normalization::{IsNormalized, UnicodeNormalization, is_nfc_quick};
1142 match is_nfc_quick(string.chars()) {
1143 IsNormalized::Yes => Symbol::intern(string),
1144 _ => {
1145 let normalized_str: String = string.chars().nfc().collect();
1146 Symbol::intern(&normalized_str)
1147 }
1148 }
1149}