clang 22.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
18#include "clang/Basic/LLVM.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MemoryBufferRef.h"
36#include "llvm/Support/NativeFormatting.h"
37#include "llvm/Support/Unicode.h"
38#include "llvm/Support/UnicodeCharRanges.h"
39#include <algorithm>
40#include <cassert>
41#include <cstddef>
42#include <cstdint>
43#include <cstring>
44#include <limits>
45#include <optional>
46#include <string>
47#include <tuple>
48
49#ifdef __SSE4_2__
50#include <nmmintrin.h>
51#endif
52
53using namespace clang;
54
55//===----------------------------------------------------------------------===//
56// Token Class Implementation
57//===----------------------------------------------------------------------===//
58
59/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
61 if (isAnnotation())
62 return false;
63 if (const IdentifierInfo *II = getIdentifierInfo())
64 return II->getObjCKeywordID() == objcKey;
65 return false;
66}
67
68/// getObjCKeywordID - Return the ObjC keyword kind.
70 if (isAnnotation())
71 return tok::objc_not_keyword;
72 const IdentifierInfo *specId = getIdentifierInfo();
73 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
74}
75
/// Determine whether the token kind starts a simple-type-specifier.
bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
  switch (getKind()) {
  // Annotation tokens that already denote a parsed type always qualify.
  case tok::annot_typename:
  case tok::annot_decltype:
  case tok::annot_pack_indexing_type:
    return true;

  case tok::kw_short:
  case tok::kw_long:
  case tok::kw___int64:
  case tok::kw___int128:
  case tok::kw_signed:
  case tok::kw_unsigned:
  case tok::kw_void:
  case tok::kw_char:
  case tok::kw_int:
  case tok::kw_half:
  case tok::kw_float:
  case tok::kw_double:
  case tok::kw___bf16:
  case tok::kw__Float16:
  case tok::kw___float128:
  case tok::kw___ibm128:
  case tok::kw_wchar_t:
  case tok::kw_bool:
  case tok::kw__Bool:
  case tok::kw__Accum:
  case tok::kw__Fract:
  case tok::kw__Sat:
#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
#include "clang/Basic/TransformTypeTraits.def"
  case tok::kw___auto_type:
  case tok::kw_char16_t:
  case tok::kw_char32_t:
  case tok::kw_typeof:
  case tok::kw_decltype:
  case tok::kw_char8_t:
    // These spellings are only type keywords under certain language options,
    // so defer to the identifier table for the active mode.
    return getIdentifierInfo()->isKeyword(LangOpts);

  default:
    return false;
  }
}
120
121//===----------------------------------------------------------------------===//
122// Lexer Class Implementation
123//===----------------------------------------------------------------------===//
124
// Out-of-line definition that anchors the Lexer class to this translation
// unit (the usual LLVM "anchor" convention).
void Lexer::anchor() {}
126
127void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
128 const char *BufEnd) {
129 BufferStart = BufStart;
130 BufferPtr = BufPtr;
131 BufferEnd = BufEnd;
132
133 assert(BufEnd[0] == 0 &&
134 "We assume that the input buffer has a null character at the end"
135 " to simplify lexing!");
136
137 // Check whether we have a BOM in the beginning of the buffer. If yes - act
138 // accordingly. Right now we support only UTF-8 with and without BOM, so, just
139 // skip the UTF-8 BOM if it's present.
140 if (BufferStart == BufferPtr) {
141 // Determine the size of the BOM.
142 StringRef Buf(BufferStart, BufferEnd - BufferStart);
143 size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
144 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
145 .Default(0);
146
147 // Skip the BOM.
148 BufferPtr += BOMLength;
149 }
150
151 Is_PragmaLexer = false;
152 CurrentConflictMarkerState = CMK_None;
153
154 // Start of the file is a start of line.
155 IsAtStartOfLine = true;
156 IsAtPhysicalStartOfLine = true;
157
158 HasLeadingSpace = false;
159 HasLeadingEmptyMacro = false;
160
161 // We are not after parsing a #.
163
164 // We are not after parsing #include.
165 ParsingFilename = false;
166
167 // We are not in raw mode. Raw mode disables diagnostics and interpretation
168 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
169 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
170 // or otherwise skipping over tokens.
171 LexingRawMode = false;
172
173 // Default to not keeping comments.
174 ExtendedTokenMode = 0;
175
176 NewLinePtr = nullptr;
177}
178
179/// Lexer constructor - Create a new lexer object for the specified buffer
180/// with the specified preprocessor managing the lexing process. This lexer
181/// assumes that the associated file buffer and Preprocessor objects will
182/// outlive it, so it doesn't take ownership of either of them.
183Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
184 Preprocessor &PP, bool IsFirstIncludeOfFile)
185 : PreprocessorLexer(&PP, FID),
186 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
187 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
188 IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
189 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
190 InputFile.getBufferEnd());
191
193}
194
/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}
208
/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
/// Delegates to the raw-pointer constructor above using the file's buffer.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}
218
220 assert(PP && "Cannot reset token mode without a preprocessor");
221 if (LangOpts.TraditionalCPP)
223 else
225}
226
227/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
228/// _Pragma expansion. This has a variety of magic semantics that this method
229/// sets up. It returns a new'd Lexer that must be delete'd when done.
230///
231/// On entrance to this routine, TokStartLoc is a macro location which has a
232/// spelling loc that indicates the bytes to be lexed for the token and an
233/// expansion location that indicates where all lexed tokens should be
234/// "expanded from".
235///
236/// TODO: It would really be nice to make _Pragma just be a wrapper around a
237/// normal lexer that remaps tokens as they fly by. This would require making
238/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
239/// interface that could handle this stuff. This would pull GetMappedTokenLoc
240/// out of the critical path of the lexer!
241///
243 SourceLocation ExpansionLocStart,
244 SourceLocation ExpansionLocEnd,
245 unsigned TokLen, Preprocessor &PP) {
247
248 // Create the lexer as if we were going to lex the file normally.
249 FileID SpellingFID = SM.getFileID(SpellingLoc);
250 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
251 Lexer *L = new Lexer(SpellingFID, InputFile, PP);
252
253 // Now that the lexer is created, change the start/end locations so that we
254 // just lex the subsection of the file that we want. This is lexing from a
255 // scratch buffer.
256 const char *StrData = SM.getCharacterData(SpellingLoc);
257
258 L->BufferPtr = StrData;
259 L->BufferEnd = StrData+TokLen;
260 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
261
262 // Set the SourceLocation with the remapping information. This ensures that
263 // GetMappedTokenLoc will remap the tokens as they are lexed.
264 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
265 ExpansionLocStart,
266 ExpansionLocEnd, TokLen);
267
268 // Ensure that the lexer thinks it is inside a directive, so that end \n will
269 // return an EOD token.
271
272 // This lexer really is for _Pragma.
273 L->Is_PragmaLexer = true;
274 return L;
275}
276
277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
279 this->IsAtStartOfLine = IsAtStartOfLine;
280 assert((BufferStart + Offset) <= BufferEnd);
281 BufferPtr = BufferStart + Offset;
282}
283
/// Escape the contents of Str in place so it can appear inside a quoted
/// literal: backslashes and the given Quote character get a preceding '\',
/// and newline sequences are rewritten as the two characters '\' 'n'.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type Idx = 0, Len = Str.size();
  while (Idx < Len) {
    const char C = Str[Idx];
    if (C == '\\' || C == Quote) {
      // Prefix a backslash; the escaped pair is two characters long.
      Str.insert(Str.begin() + Idx, '\\');
      Idx += 2;
      ++Len;
    } else if (C == '\n' || C == '\r') {
      const bool MixedPair = Idx + 1 < Len &&
                             (Str[Idx + 1] == '\n' || Str[Idx + 1] == '\r') &&
                             Str[Idx + 1] != C;
      if (MixedPair) {
        // A "\r\n" or "\n\r" pair collapses in place to '\' 'n'.
        Str[Idx] = '\\';
        Str[Idx + 1] = 'n';
      } else {
        // A lone newline becomes '\' 'n', growing the string by one.
        Str[Idx] = '\\';
        Str.insert(Str.begin() + Idx + 1, 'n');
        ++Len;
      }
      Idx += 2;
    } else {
      ++Idx;
    }
  }
}
308
309std::string Lexer::Stringify(StringRef Str, bool Charify) {
310 std::string Result = std::string(Str);
311 char Quote = Charify ? '\'' : '"';
312 StringifyImpl(Result, Quote);
313 return Result;
314}
315
317
318//===----------------------------------------------------------------------===//
319// Token Spelling
320//===----------------------------------------------------------------------===//
321
/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
/// Writes the cleaned characters into \p Spelling (caller-allocated, at
/// least Tok.getLength() bytes) and returns the number written.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
      Spelling[Length++] = CharAndSize.Char;
      BufPtr += CharAndSize.Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  // Copy the remainder one cleaned character at a time; getCharAndSizeNoWarn
  // folds trigraphs and escaped newlines into single characters.
  while (BufPtr < BufEnd) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[Length++] = CharAndSize.Char;
    BufPtr += CharAndSize.Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}
372
373/// getSpelling() - Return the 'spelling' of this token. The spelling of a
374/// token are the characters used to represent the token in the source file
375/// after trigraph expansion and escaped-newline folding. In particular, this
376/// wants to get the true, uncanonicalized, spelling of things like digraphs
377/// UCNs, etc.
379 SmallVectorImpl<char> &buffer,
380 const SourceManager &SM,
381 const LangOptions &options,
382 bool *invalid) {
383 // Break down the source location.
384 FileIDAndOffset locInfo = SM.getDecomposedLoc(loc);
385
386 // Try to the load the file buffer.
387 bool invalidTemp = false;
388 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
389 if (invalidTemp) {
390 if (invalid) *invalid = true;
391 return {};
392 }
393
394 const char *tokenBegin = file.data() + locInfo.second;
395
396 // Lex from the start of the given location.
397 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
398 file.begin(), tokenBegin, file.end());
399 Token token;
400 lexer.LexFromRawLexer(token);
401
402 unsigned length = token.getLength();
403
404 // Common case: no need for cleaning.
405 if (!token.needsCleaning())
406 return StringRef(tokenBegin, length);
407
408 // Hard case, we need to relex the characters into the string.
409 buffer.resize(length);
410 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
411 return StringRef(buffer.data(), buffer.size());
412}
413
414/// getSpelling() - Return the 'spelling' of this token. The spelling of a
415/// token are the characters used to represent the token in the source file
416/// after trigraph expansion and escaped-newline folding. In particular, this
417/// wants to get the true, uncanonicalized, spelling of things like digraphs
418/// UCNs, etc.
419std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
420 const LangOptions &LangOpts, bool *Invalid) {
421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
422
423 bool CharDataInvalid = false;
424 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
425 &CharDataInvalid);
426 if (Invalid)
427 *Invalid = CharDataInvalid;
428 if (CharDataInvalid)
429 return {};
430
431 // If this token contains nothing interesting, return it directly.
432 if (!Tok.needsCleaning())
433 return std::string(TokStart, TokStart + Tok.getLength());
434
435 std::string Result;
436 Result.resize(Tok.getLength());
437 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
438 return Result;
439}
440
441/// getSpelling - This method is used to get the spelling of a token into a
442/// preallocated buffer, instead of as an std::string. The caller is required
443/// to allocate enough space for the token, which is guaranteed to be at least
444/// Tok.getLength() bytes long. The actual length of the token is returned.
445///
446/// Note that this method may do two possible things: it may either fill in
447/// the buffer specified with characters, or it may *change the input pointer*
448/// to point to a constant buffer with the data already in it (avoiding a
449/// copy). The caller is not allowed to modify the returned buffer pointer
450/// if an internal buffer is returned.
451unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
452 const SourceManager &SourceMgr,
453 const LangOptions &LangOpts, bool *Invalid) {
454 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
455
456 const char *TokStart = nullptr;
457 // NOTE: this has to be checked *before* testing for an IdentifierInfo.
458 if (Tok.is(tok::raw_identifier))
459 TokStart = Tok.getRawIdentifier().data();
460 else if (!Tok.hasUCN()) {
461 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
462 // Just return the string from the identifier table, which is very quick.
463 Buffer = II->getNameStart();
464 return II->getLength();
465 }
466 }
467
468 // NOTE: this can be checked even after testing for an IdentifierInfo.
469 if (Tok.isLiteral())
470 TokStart = Tok.getLiteralData();
471
472 if (!TokStart) {
473 // Compute the start of the token in the input lexer buffer.
474 bool CharDataInvalid = false;
475 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
476 if (Invalid)
477 *Invalid = CharDataInvalid;
478 if (CharDataInvalid) {
479 Buffer = "";
480 return 0;
481 }
482 }
483
484 // If this token contains nothing interesting, return it directly.
485 if (!Tok.needsCleaning()) {
486 Buffer = TokStart;
487 return Tok.getLength();
488 }
489
490 // Otherwise, hard case, relex the characters into the string.
491 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
492}
493
494/// MeasureTokenLength - Relex the token at the specified location and return
495/// its length in bytes in the input file. If the token needs cleaning (e.g.
496/// includes a trigraph or an escaped newline) then this count includes bytes
497/// that are part of that.
499 const SourceManager &SM,
500 const LangOptions &LangOpts) {
501 Token TheTok;
502 if (getRawToken(Loc, TheTok, SM, LangOpts))
503 return 0;
504 return TheTok.getLength();
505}
506
507/// Relex the token at the specified location.
508/// \returns true if there was a failure, false on success.
510 const SourceManager &SM,
511 const LangOptions &LangOpts,
512 bool IgnoreWhiteSpace) {
513 // TODO: this could be special cased for common tokens like identifiers, ')',
514 // etc to make this faster, if it mattered. Just look at StrData[0] to handle
515 // all obviously single-char tokens. This could use
516 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
517 // something.
518
519 // If this comes from a macro expansion, we really do want the macro name, not
520 // the token this macro expanded to.
521 Loc = SM.getExpansionLoc(Loc);
522 FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
523 bool Invalid = false;
524 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
525 if (Invalid)
526 return true;
527
528 const char *StrData = Buffer.data()+LocInfo.second;
529
530 if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
531 return true;
532
533 // Create a lexer starting at the beginning of this token.
534 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
535 Buffer.begin(), StrData, Buffer.end());
536 TheLexer.SetCommentRetentionState(true);
537 TheLexer.LexFromRawLexer(Result);
538 return false;
539}
540
541/// Returns the pointer that points to the beginning of line that contains
542/// the given offset, or null if the offset if invalid.
543static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
544 const char *BufStart = Buffer.data();
545 if (Offset >= Buffer.size())
546 return nullptr;
547
548 const char *LexStart = BufStart + Offset;
549 for (; LexStart != BufStart; --LexStart) {
550 if (isVerticalWhitespace(LexStart[0]) &&
551 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
552 // LexStart should point at first character of logical line.
553 ++LexStart;
554 break;
555 }
556 }
557 return LexStart;
558}
559
561 const SourceManager &SM,
562 const LangOptions &LangOpts) {
563 assert(Loc.isFileID());
564 FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
565 if (LocInfo.first.isInvalid())
566 return Loc;
567
568 bool Invalid = false;
569 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
570 if (Invalid)
571 return Loc;
572
573 // Back up from the current location until we hit the beginning of a line
574 // (or the buffer). We'll relex from that point.
575 const char *StrData = Buffer.data() + LocInfo.second;
576 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
577 if (!LexStart || LexStart == StrData)
578 return Loc;
579
580 // Create a lexer starting at the beginning of this token.
581 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
582 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
583 Buffer.end());
584 TheLexer.SetCommentRetentionState(true);
585
586 // Lex tokens until we find the token that contains the source location.
587 Token TheTok;
588 do {
589 TheLexer.LexFromRawLexer(TheTok);
590
591 if (TheLexer.getBufferLocation() > StrData) {
592 // Lexing this token has taken the lexer past the source location we're
593 // looking for. If the current token encompasses our source location,
594 // return the beginning of that token.
595 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
596 return TheTok.getLocation();
597
598 // We ended up skipping over the source location entirely, which means
599 // that it points into whitespace. We're done here.
600 break;
601 }
602 } while (TheTok.getKind() != tok::eof);
603
604 // We've passed our source location; just return the original source location.
605 return Loc;
606}
607
609 const SourceManager &SM,
610 const LangOptions &LangOpts) {
611 if (Loc.isFileID())
612 return getBeginningOfFileToken(Loc, SM, LangOpts);
613
614 if (!SM.isMacroArgExpansion(Loc))
615 return Loc;
616
617 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
618 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
619 FileIDAndOffset FileLocInfo = SM.getDecomposedLoc(FileLoc);
620 FileIDAndOffset BeginFileLocInfo = SM.getDecomposedLoc(BeginFileLoc);
621 assert(FileLocInfo.first == BeginFileLocInfo.first &&
622 FileLocInfo.second >= BeginFileLocInfo.second);
623 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
624}
625
namespace {

/// Classification of a preprocessor directive seen while scanning a file's
/// preamble (see ComputePreamble below).
enum PreambleDirectiveKind {
  PDK_Skipped,  // Directive may appear in a preamble; keep scanning.
  PDK_Unknown   // Unrecognized directive; the preamble ends at its '#'.
};

} // namespace
634
636 const LangOptions &LangOpts,
637 unsigned MaxLines) {
638 // Create a lexer starting at the beginning of the file. Note that we use a
639 // "fake" file source location at offset 1 so that the lexer will track our
640 // position within the file.
641 const SourceLocation::UIntTy StartOffset = 1;
643 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
644 Buffer.end());
645 TheLexer.SetCommentRetentionState(true);
646
647 bool InPreprocessorDirective = false;
648 Token TheTok;
649 SourceLocation ActiveCommentLoc;
650
651 unsigned MaxLineOffset = 0;
652 if (MaxLines) {
653 const char *CurPtr = Buffer.begin();
654 unsigned CurLine = 0;
655 while (CurPtr != Buffer.end()) {
656 char ch = *CurPtr++;
657 if (ch == '\n') {
658 ++CurLine;
659 if (CurLine == MaxLines)
660 break;
661 }
662 }
663 if (CurPtr != Buffer.end())
664 MaxLineOffset = CurPtr - Buffer.begin();
665 }
666
667 do {
668 TheLexer.LexFromRawLexer(TheTok);
669
670 if (InPreprocessorDirective) {
671 // If we've hit the end of the file, we're done.
672 if (TheTok.getKind() == tok::eof) {
673 break;
674 }
675
676 // If we haven't hit the end of the preprocessor directive, skip this
677 // token.
678 if (!TheTok.isAtStartOfLine())
679 continue;
680
681 // We've passed the end of the preprocessor directive, and will look
682 // at this token again below.
683 InPreprocessorDirective = false;
684 }
685
686 // Keep track of the # of lines in the preamble.
687 if (TheTok.isAtStartOfLine()) {
688 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
689
690 // If we were asked to limit the number of lines in the preamble,
691 // and we're about to exceed that limit, we're done.
692 if (MaxLineOffset && TokOffset >= MaxLineOffset)
693 break;
694 }
695
696 // Comments are okay; skip over them.
697 if (TheTok.getKind() == tok::comment) {
698 if (ActiveCommentLoc.isInvalid())
699 ActiveCommentLoc = TheTok.getLocation();
700 continue;
701 }
702
703 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
704 // This is the start of a preprocessor directive.
705 Token HashTok = TheTok;
706 InPreprocessorDirective = true;
707 ActiveCommentLoc = SourceLocation();
708
709 // Figure out which directive this is. Since we're lexing raw tokens,
710 // we don't have an identifier table available. Instead, just look at
711 // the raw identifier to recognize and categorize preprocessor directives.
712 TheLexer.LexFromRawLexer(TheTok);
713 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
714 StringRef Keyword = TheTok.getRawIdentifier();
715 PreambleDirectiveKind PDK
716 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
717 .Case("include", PDK_Skipped)
718 .Case("__include_macros", PDK_Skipped)
719 .Case("define", PDK_Skipped)
720 .Case("undef", PDK_Skipped)
721 .Case("line", PDK_Skipped)
722 .Case("error", PDK_Skipped)
723 .Case("pragma", PDK_Skipped)
724 .Case("import", PDK_Skipped)
725 .Case("include_next", PDK_Skipped)
726 .Case("warning", PDK_Skipped)
727 .Case("ident", PDK_Skipped)
728 .Case("sccs", PDK_Skipped)
729 .Case("assert", PDK_Skipped)
730 .Case("unassert", PDK_Skipped)
731 .Case("if", PDK_Skipped)
732 .Case("ifdef", PDK_Skipped)
733 .Case("ifndef", PDK_Skipped)
734 .Case("elif", PDK_Skipped)
735 .Case("elifdef", PDK_Skipped)
736 .Case("elifndef", PDK_Skipped)
737 .Case("else", PDK_Skipped)
738 .Case("endif", PDK_Skipped)
739 .Default(PDK_Unknown);
740
741 switch (PDK) {
742 case PDK_Skipped:
743 continue;
744
745 case PDK_Unknown:
746 // We don't know what this directive is; stop at the '#'.
747 break;
748 }
749 }
750
751 // We only end up here if we didn't recognize the preprocessor
752 // directive or it was one that can't occur in the preamble at this
753 // point. Roll back the current token to the location of the '#'.
754 TheTok = HashTok;
755 } else if (TheTok.isAtStartOfLine() &&
756 TheTok.getKind() == tok::raw_identifier &&
757 TheTok.getRawIdentifier() == "module" &&
758 LangOpts.CPlusPlusModules) {
759 // The initial global module fragment introducer "module;" is part of
760 // the preamble, which runs up to the module declaration "module foo;".
761 Token ModuleTok = TheTok;
762 do {
763 TheLexer.LexFromRawLexer(TheTok);
764 } while (TheTok.getKind() == tok::comment);
765 if (TheTok.getKind() != tok::semi) {
766 // Not global module fragment, roll back.
767 TheTok = ModuleTok;
768 break;
769 }
770 continue;
771 }
772
773 // We hit a token that we don't recognize as being in the
774 // "preprocessing only" part of the file, so we're no longer in
775 // the preamble.
776 break;
777 } while (true);
778
779 SourceLocation End;
780 if (ActiveCommentLoc.isValid())
781 End = ActiveCommentLoc; // don't truncate a decl comment.
782 else
783 End = TheTok.getLocation();
784
785 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
786 TheTok.isAtStartOfLine());
787}
788
789unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
790 const SourceManager &SM,
791 const LangOptions &LangOpts) {
792 // Figure out how many physical characters away the specified expansion
793 // character is. This needs to take into consideration newlines and
794 // trigraphs.
795 bool Invalid = false;
796 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
797
798 // If they request the first char of the token, we're trivially done.
799 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
800 return 0;
801
802 unsigned PhysOffset = 0;
803
804 // The usual case is that tokens don't contain anything interesting. Skip
805 // over the uninteresting characters. If a token only consists of simple
806 // chars, this method is extremely fast.
807 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
808 if (CharNo == 0)
809 return PhysOffset;
810 ++TokPtr;
811 --CharNo;
812 ++PhysOffset;
813 }
814
815 // If we have a character that may be a trigraph or escaped newline, use a
816 // lexer to parse it correctly.
817 for (; CharNo; --CharNo) {
818 auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
819 TokPtr += CharAndSize.Size;
820 PhysOffset += CharAndSize.Size;
821 }
822
823 // Final detail: if we end up on an escaped newline, we want to return the
824 // location of the actual byte of the token. For example foo<newline>bar
825 // advanced by 3 should return the location of b, not of \\. One compounding
826 // detail of this is that the escape may be made by a trigraph.
827 if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
828 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
829
830 return PhysOffset;
831}
832
833/// Computes the source location just past the end of the
834/// token at this source location.
835///
836/// This routine can be used to produce a source location that
837/// points just past the end of the token referenced by \p Loc, and
838/// is generally used when a diagnostic needs to point just after a
839/// token where it expected something different that it received. If
840/// the returned source location would not be meaningful (e.g., if
841/// it points into a macro), this routine returns an invalid
842/// source location.
843///
844/// \param Offset an offset from the end of the token, where the source
845/// location should refer to. The default offset (0) produces a source
846/// location pointing just past the end of the token; an offset of 1 produces
847/// a source location pointing to the last character in the token, etc.
849 const SourceManager &SM,
850 const LangOptions &LangOpts) {
851 if (Loc.isInvalid())
852 return {};
853
854 if (Loc.isMacroID()) {
855 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
856 return {}; // Points inside the macro expansion.
857 }
858
859 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
860 if (Len > Offset)
861 Len = Len - Offset;
862 else
863 return Loc;
864
865 return Loc.getLocWithOffset(Len);
866}
867
868/// Returns true if the given MacroID location points at the first
869/// token of the macro expansion.
871 const SourceManager &SM,
872 const LangOptions &LangOpts,
873 SourceLocation *MacroBegin) {
874 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
875
876 SourceLocation expansionLoc;
877 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
878 return false;
879
880 if (expansionLoc.isFileID()) {
881 // No other macro expansions, this is the first.
882 if (MacroBegin)
883 *MacroBegin = expansionLoc;
884 return true;
885 }
886
887 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
888}
889
890/// Returns true if the given MacroID location points at the last
891/// token of the macro expansion.
893 const SourceManager &SM,
894 const LangOptions &LangOpts,
895 SourceLocation *MacroEnd) {
896 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
897
898 SourceLocation spellLoc = SM.getSpellingLoc(loc);
899 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
900 if (tokLen == 0)
901 return false;
902
903 SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
904 SourceLocation expansionLoc;
905 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
906 return false;
907
908 if (expansionLoc.isFileID()) {
909 // No other macro expansions.
910 if (MacroEnd)
911 *MacroEnd = expansionLoc;
912 return true;
913 }
914
915 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
916}
917
919 const SourceManager &SM,
920 const LangOptions &LangOpts) {
923 assert(Begin.isFileID() && End.isFileID());
924 if (Range.isTokenRange()) {
925 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
926 if (End.isInvalid())
927 return {};
928 }
929
930 // Break down the source locations.
931 auto [FID, BeginOffs] = SM.getDecomposedLoc(Begin);
932 if (FID.isInvalid())
933 return {};
934
935 unsigned EndOffs;
936 if (!SM.isInFileID(End, FID, &EndOffs) ||
937 BeginOffs > EndOffs)
938 return {};
939
941}
942
943// Assumes that `Loc` is in an expansion.
945 const SourceManager &SM) {
946 return SM.getSLocEntry(SM.getFileID(Loc))
947 .getExpansion()
948 .isExpansionTokenRange();
949}
950
952 const SourceManager &SM,
953 const LangOptions &LangOpts) {
956 if (Begin.isInvalid() || End.isInvalid())
957 return {};
958
959 if (Begin.isFileID() && End.isFileID())
960 return makeRangeFromFileLocs(Range, SM, LangOpts);
961
962 if (Begin.isMacroID() && End.isFileID()) {
963 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
964 return {};
966 return makeRangeFromFileLocs(Range, SM, LangOpts);
967 }
968
969 if (Begin.isFileID() && End.isMacroID()) {
970 if (Range.isTokenRange()) {
971 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
972 return {};
973 // Use the *original* end, not the expanded one in `End`.
974 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
975 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
976 return {};
977 Range.setEnd(End);
978 return makeRangeFromFileLocs(Range, SM, LangOpts);
979 }
980
981 assert(Begin.isMacroID() && End.isMacroID());
982 SourceLocation MacroBegin, MacroEnd;
983 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
984 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
985 &MacroEnd)) ||
986 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
987 &MacroEnd)))) {
988 Range.setBegin(MacroBegin);
989 Range.setEnd(MacroEnd);
990 // Use the *original* `End`, not the expanded one in `MacroEnd`.
991 if (Range.isTokenRange())
992 Range.setTokenRange(isInExpansionTokenRange(End, SM));
993 return makeRangeFromFileLocs(Range, SM, LangOpts);
994 }
995
996 bool Invalid = false;
997 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
998 &Invalid);
999 if (Invalid)
1000 return {};
1001
1002 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
1003 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
1004 &Invalid);
1005 if (Invalid)
1006 return {};
1007
1008 if (EndEntry.getExpansion().isMacroArgExpansion() &&
1009 BeginEntry.getExpansion().getExpansionLocStart() ==
1010 EndEntry.getExpansion().getExpansionLocStart()) {
1011 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
1012 Range.setEnd(SM.getImmediateSpellingLoc(End));
1013 return makeFileCharRange(Range, SM, LangOpts);
1014 }
1015 }
1016
1017 return {};
1018}
1019
1021 const SourceManager &SM,
1022 const LangOptions &LangOpts,
1023 bool *Invalid) {
1024 Range = makeFileCharRange(Range, SM, LangOpts);
1025 if (Range.isInvalid()) {
1026 if (Invalid) *Invalid = true;
1027 return {};
1028 }
1029
1030 // Break down the source location.
1031 FileIDAndOffset beginInfo = SM.getDecomposedLoc(Range.getBegin());
1032 if (beginInfo.first.isInvalid()) {
1033 if (Invalid) *Invalid = true;
1034 return {};
1035 }
1036
1037 unsigned EndOffs;
1038 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
1039 beginInfo.second > EndOffs) {
1040 if (Invalid) *Invalid = true;
1041 return {};
1042 }
1043
1044 // Try to the load the file buffer.
1045 bool invalidTemp = false;
1046 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1047 if (invalidTemp) {
1048 if (Invalid) *Invalid = true;
1049 return {};
1050 }
1051
1052 if (Invalid) *Invalid = false;
1053 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1054}
1055
1057 const SourceManager &SM,
1058 const LangOptions &LangOpts) {
1059 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1060
1061 // Find the location of the immediate macro expansion.
1062 while (true) {
1063 FileID FID = SM.getFileID(Loc);
1064 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1065 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1066 Loc = Expansion.getExpansionLocStart();
1067 if (!Expansion.isMacroArgExpansion())
1068 break;
1069
1070 // For macro arguments we need to check that the argument did not come
1071 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1072
1073 // Loc points to the argument id of the macro definition, move to the
1074 // macro expansion.
1075 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1076 SourceLocation SpellLoc = Expansion.getSpellingLoc();
1077 if (SpellLoc.isFileID())
1078 break; // No inner macro.
1079
1080 // If spelling location resides in the same FileID as macro expansion
1081 // location, it means there is no inner macro.
1082 FileID MacroFID = SM.getFileID(Loc);
1083 if (SM.isInFileID(SpellLoc, MacroFID))
1084 break;
1085
1086 // Argument came from inner macro.
1087 Loc = SpellLoc;
1088 }
1089
1090 // Find the spelling location of the start of the non-argument expansion
1091 // range. This is where the macro name was spelled in order to begin
1092 // expanding this macro.
1093 Loc = SM.getSpellingLoc(Loc);
1094
1095 // Dig out the buffer where the macro name was spelled and the extents of the
1096 // name so that we can render it into the expansion note.
1097 FileIDAndOffset ExpansionInfo = SM.getDecomposedLoc(Loc);
1098 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1099 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1100 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1101}
1102
1104 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1105 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1106 // Walk past macro argument expansions.
1107 while (SM.isMacroArgExpansion(Loc))
1108 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1109
1110 // If the macro's spelling isn't FileID or from scratch space, then it's
1111 // actually a token paste or stringization (or similar) and not a macro at
1112 // all.
1113 SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1114 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1115 return {};
1116
1117 // Find the spelling location of the start of the non-argument expansion
1118 // range. This is where the macro name was spelled in order to begin
1119 // expanding this macro.
1120 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1121
1122 // Dig out the buffer where the macro name was spelled and the extents of the
1123 // name so that we can render it into the expansion note.
1124 FileIDAndOffset ExpansionInfo = SM.getDecomposedLoc(Loc);
1125 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1126 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1127 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1128}
1129
1131 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1132}
1133
1134bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1135 assert(isVerticalWhitespace(Str[0]));
1136 if (Str - 1 < BufferStart)
1137 return false;
1138
1139 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1140 (Str[0] == '\r' && Str[-1] == '\n')) {
1141 if (Str - 2 < BufferStart)
1142 return false;
1143 --Str;
1144 }
1145 --Str;
1146
1147 // Rewind to first non-space character:
1148 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1149 --Str;
1150
1151 return *Str == '\\';
1152}
1153
1155 const SourceManager &SM) {
1156 if (Loc.isInvalid() || Loc.isMacroID())
1157 return {};
1158 FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
1159 if (LocInfo.first.isInvalid())
1160 return {};
1161 bool Invalid = false;
1162 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1163 if (Invalid)
1164 return {};
1165 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1166 if (!Line)
1167 return {};
1168 StringRef Rest = Buffer.substr(Line - Buffer.data());
1169 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1170 return NumWhitespaceChars == StringRef::npos
1171 ? ""
1172 : Rest.take_front(NumWhitespaceChars);
1173}
1174
1175//===----------------------------------------------------------------------===//
1176// Diagnostics forwarding code.
1177//===----------------------------------------------------------------------===//
1178
1179/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1180/// lexer buffer was all expanded at a single point, perform the mapping.
1181/// This is currently only used for _Pragma implementation, so it is the slow
1182/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1183static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1184 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1186 SourceLocation FileLoc,
1187 unsigned CharNo, unsigned TokLen) {
1188 assert(FileLoc.isMacroID() && "Must be a macro expansion");
1189
1190 // Otherwise, we're lexing "mapped tokens". This is used for things like
1191 // _Pragma handling. Combine the expansion location of FileLoc with the
1192 // spelling location.
1194
1195 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1196 // characters come from spelling(FileLoc)+Offset.
1197 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1198 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1199
1200 // Figure out the expansion loc range, which is the range covered by the
1201 // original _Pragma(...) sequence.
1202 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1203
1204 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1205}
1206
1207/// getSourceLocation - Return a source location identifier for the specified
1208/// offset in the current file.
1210 unsigned TokLen) const {
1211 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1212 "Location out of range for this buffer!");
1213
1214 // In the normal case, we're just lexing from a simple file buffer, return
1215 // the file id from FileLoc with the offset specified.
1216 unsigned CharNo = Loc-BufferStart;
1217 if (FileLoc.isFileID())
1218 return FileLoc.getLocWithOffset(CharNo);
1219
1220 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1221 // tokens are lexed from where the _Pragma was defined.
1222 assert(PP && "This doesn't work on raw lexers");
1223 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1224}
1225
1226/// Diag - Forwarding function for diagnostics. This translate a source
1227/// position in the current buffer into a SourceLocation object for rendering.
1228DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1229 return PP->Diag(getSourceLocation(Loc), DiagID);
1230}
1231
1232//===----------------------------------------------------------------------===//
1233// Trigraph and Escaped Newline Handling Code.
1234//===----------------------------------------------------------------------===//
1235
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default: return 0;
  case '=': return '#';
  case ')': return ']';
  case '(': return '[';
  case '!': return '|';
  case '\'': return '^';
  case '>': return '}';
  case '/': return '\\';
  case '<': return '{';
  case '-': return '~';
  }
}
1252
1253/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1254/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1255/// return the result character. Finally, emit a warning about trigraph use
1256/// whether trigraphs are enabled or not.
1257static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1258 char Res = GetTrigraphCharForLetter(*CP);
1259 if (!Res)
1260 return Res;
1261
1262 if (!Trigraphs) {
1263 if (L && !L->isLexingRawMode())
1264 L->Diag(CP-2, diag::trigraph_ignored);
1265 return 0;
1266 }
1267
1268 if (L && !L->isLexingRawMode())
1269 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1270 return Res;
1271}
1272
1273/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1274/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1275/// trigraph equivalent on entry to this function.
1276unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1277 unsigned Size = 0;
1278 while (isWhitespace(Ptr[Size])) {
1279 ++Size;
1280
1281 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1282 continue;
1283
1284 // If this is a \r\n or \n\r, skip the other half.
1285 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1286 Ptr[Size-1] != Ptr[Size])
1287 ++Size;
1288
1289 return Size;
1290 }
1291
1292 // Not an escaped newline, must be a \t or something else.
1293 return 0;
1294}
1295
1296/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1297/// them), skip over them and return the first non-escaped-newline found,
1298/// otherwise return P.
1299const char *Lexer::SkipEscapedNewLines(const char *P) {
1300 while (true) {
1301 const char *AfterEscape;
1302 if (*P == '\\') {
1303 AfterEscape = P+1;
1304 } else if (*P == '?') {
1305 // If not a trigraph for escape, bail out.
1306 if (P[1] != '?' || P[2] != '/')
1307 return P;
1308 // FIXME: Take LangOpts into account; the language might not
1309 // support trigraphs.
1310 AfterEscape = P+3;
1311 } else {
1312 return P;
1313 }
1314
1315 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1316 if (NewLineSize == 0) return P;
1317 P = AfterEscape+NewLineSize;
1318 }
1319}
1320
1322 const SourceManager &SM,
1323 const LangOptions &LangOpts,
1324 bool IncludeComments) {
1325 if (Loc.isMacroID()) {
1326 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1327 return std::nullopt;
1328 }
1329 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1330
1331 // Break down the source location.
1332 FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
1333
1334 // Try to load the file buffer.
1335 bool InvalidTemp = false;
1336 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1337 if (InvalidTemp)
1338 return std::nullopt;
1339
1340 const char *TokenBegin = File.data() + LocInfo.second;
1341
1342 // Lex from the start of the given location.
1343 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1344 TokenBegin, File.end());
1345 lexer.SetCommentRetentionState(IncludeComments);
1346 // Find the token.
1347 Token Tok;
1348 lexer.LexFromRawLexer(Tok);
1349 return Tok;
1350}
1351
1353 const SourceManager &SM,
1354 const LangOptions &LangOpts,
1355 bool IncludeComments) {
1356 const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Loc));
1357 while (Loc != StartOfFile) {
1358 Loc = Loc.getLocWithOffset(-1);
1359 if (Loc.isInvalid())
1360 return std::nullopt;
1361
1362 Loc = GetBeginningOfToken(Loc, SM, LangOpts);
1363 Token Tok;
1364 if (getRawToken(Loc, Tok, SM, LangOpts))
1365 continue; // Not a token, go to prev location.
1366 if (!Tok.is(tok::comment) || IncludeComments) {
1367 return Tok;
1368 }
1369 }
1370 return std::nullopt;
1371}
1372
1373/// Checks that the given token is the first token that occurs after the
1374/// given location (this excludes comments and whitespace). Returns the location
1375/// immediately after the specified token. If the token is not found or the
1376/// location is inside a macro, the returned source location will be invalid.
1379 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1380 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1381 if (!Tok || Tok->isNot(TKind))
1382 return {};
1383 SourceLocation TokenLoc = Tok->getLocation();
1384
1385 // Calculate how much whitespace needs to be skipped if any.
1386 unsigned NumWhitespaceChars = 0;
1387 if (SkipTrailingWhitespaceAndNewLine) {
1388 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1389 unsigned char C = *TokenEnd;
1390 while (isHorizontalWhitespace(C)) {
1391 C = *(++TokenEnd);
1392 NumWhitespaceChars++;
1393 }
1394
1395 // Skip \r, \n, \r\n, or \n\r
1396 if (C == '\n' || C == '\r') {
1397 char PrevC = C;
1398 C = *(++TokenEnd);
1399 NumWhitespaceChars++;
1400 if ((C == '\n' || C == '\r') && C != PrevC)
1401 NumWhitespaceChars++;
1402 }
1403 }
1404
1405 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1406}
1407
1408/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1409/// get its size, and return it. This is tricky in several cases:
1410/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1411/// then either return the trigraph (skipping 3 chars) or the '?',
1412/// depending on whether trigraphs are enabled or not.
1413/// 2. If this is an escaped newline (potentially with whitespace between
1414/// the backslash and newline), implicitly skip the newline and return
1415/// the char after it.
1416///
1417/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1418/// know that we can accumulate into Size, and that we have already incremented
1419/// Ptr by Size bytes.
1420///
1421/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1422/// be updated to match.
1423Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
1424 unsigned Size = 0;
1425 // If we have a slash, look for an escaped newline.
1426 if (Ptr[0] == '\\') {
1427 ++Size;
1428 ++Ptr;
1429Slash:
1430 // Common case, backslash-char where the char is not whitespace.
1431 if (!isWhitespace(Ptr[0]))
1432 return {'\\', Size};
1433
1434 // See if we have optional whitespace characters between the slash and
1435 // newline.
1436 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1437 // Remember that this token needs to be cleaned.
1438 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1439
1440 // Warn if there was whitespace between the backslash and newline.
1441 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1442 Diag(Ptr, diag::backslash_newline_space);
1443
1444 // Found backslash<whitespace><newline>. Parse the char after it.
1445 Size += EscapedNewLineSize;
1446 Ptr += EscapedNewLineSize;
1447
1448 // Use slow version to accumulate a correct size field.
1449 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1450 CharAndSize.Size += Size;
1451 return CharAndSize;
1452 }
1453
1454 // Otherwise, this is not an escaped newline, just return the slash.
1455 return {'\\', Size};
1456 }
1457
1458 // If this is a trigraph, process it.
1459 if (Ptr[0] == '?' && Ptr[1] == '?') {
1460 // If this is actually a legal trigraph (not something like "??x"), emit
1461 // a trigraph warning. If so, and if trigraphs are enabled, return it.
1462 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1463 LangOpts.Trigraphs)) {
1464 // Remember that this token needs to be cleaned.
1465 if (Tok) Tok->setFlag(Token::NeedsCleaning);
1466
1467 Ptr += 3;
1468 Size += 3;
1469 if (C == '\\') goto Slash;
1470 return {C, Size};
1471 }
1472 }
1473
1474 // If this is neither, return a single character.
1475 return {*Ptr, Size + 1u};
1476}
1477
1478/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1479/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1480/// and that we have already incremented Ptr by Size bytes.
1481///
1482/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1483/// be updated to match.
1484Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1485 const LangOptions &LangOpts) {
1486
1487 unsigned Size = 0;
1488 // If we have a slash, look for an escaped newline.
1489 if (Ptr[0] == '\\') {
1490 ++Size;
1491 ++Ptr;
1492Slash:
1493 // Common case, backslash-char where the char is not whitespace.
1494 if (!isWhitespace(Ptr[0]))
1495 return {'\\', Size};
1496
1497 // See if we have optional whitespace characters followed by a newline.
1498 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1499 // Found backslash<whitespace><newline>. Parse the char after it.
1500 Size += EscapedNewLineSize;
1501 Ptr += EscapedNewLineSize;
1502
1503 // Use slow version to accumulate a correct size field.
1504 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1505 CharAndSize.Size += Size;
1506 return CharAndSize;
1507 }
1508
1509 // Otherwise, this is not an escaped newline, just return the slash.
1510 return {'\\', Size};
1511 }
1512
1513 // If this is a trigraph, process it.
1514 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1515 // If this is actually a legal trigraph (not something like "??x"), return
1516 // it.
1517 if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1518 Ptr += 3;
1519 Size += 3;
1520 if (C == '\\') goto Slash;
1521 return {C, Size};
1522 }
1523 }
1524
1525 // If this is neither, return a single character.
1526 return {*Ptr, Size + 1u};
1527}
1528
1529//===----------------------------------------------------------------------===//
1530// Helper methods for lexing.
1531//===----------------------------------------------------------------------===//
1532
1533/// Routine that indiscriminately sets the offset into the source file.
1534void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1535 BufferPtr = BufferStart + Offset;
1536 if (BufferPtr > BufferEnd)
1537 BufferPtr = BufferEnd;
1538 // FIXME: What exactly does the StartOfLine bit mean? There are two
1539 // possible meanings for the "start" of the line: the first token on the
1540 // unexpanded line, or the first token on the expanded line.
1541 IsAtStartOfLine = StartOfLine;
1542 IsAtPhysicalStartOfLine = StartOfLine;
1543}
1544
1545static bool isUnicodeWhitespace(uint32_t Codepoint) {
1546 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1548 return UnicodeWhitespaceChars.contains(Codepoint);
1549}
1550
1552 llvm::SmallString<5> CharBuf;
1553 llvm::raw_svector_ostream CharOS(CharBuf);
1554 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1555 return CharBuf;
1556}
1557
1558// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1559// we allow "Mathematical Notation Characters" in identifiers.
1560// This is a proposed profile that extends the XID_Start/XID_continue
1561// with mathematical symbols, superscipts and subscripts digits
1562// found in some production software.
1563// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1564static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1565 bool IsStart, bool &IsExtension) {
1566 static const llvm::sys::UnicodeCharSet MathStartChars(
1568 static const llvm::sys::UnicodeCharSet MathContinueChars(
1570 if (MathStartChars.contains(C) ||
1571 (!IsStart && MathContinueChars.contains(C))) {
1572 IsExtension = true;
1573 return true;
1574 }
1575 return false;
1576}
1577
1578static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1579 bool &IsExtension) {
1580 if (LangOpts.AsmPreprocessor) {
1581 return false;
1582 } else if (LangOpts.DollarIdents && '$' == C) {
1583 return true;
1584 } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1585 // A non-leading codepoint must have the XID_Continue property.
1586 // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1587 // so we need to check both tables.
1588 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1589 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1590 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1591 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1592 return true;
1593 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1594 IsExtension);
1595 } else if (LangOpts.C11) {
1596 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1598 return C11AllowedIDChars.contains(C);
1599 } else {
1600 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1602 return C99AllowedIDChars.contains(C);
1603 }
1604}
1605
1606static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1607 bool &IsExtension) {
1608 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1609 IsExtension = false;
1610 if (LangOpts.AsmPreprocessor) {
1611 return false;
1612 }
1613 if (LangOpts.CPlusPlus || LangOpts.C23) {
1614 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1615 if (XIDStartChars.contains(C))
1616 return true;
1617 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1618 IsExtension);
1619 }
1620 if (!isAllowedIDChar(C, LangOpts, IsExtension))
1621 return false;
1622 if (LangOpts.C11) {
1623 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1625 return !C11DisallowedInitialIDChars.contains(C);
1626 }
1627 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1629 return !C99DisallowedInitialIDChars.contains(C);
1630}
1631
1634
1635 static const llvm::sys::UnicodeCharSet MathStartChars(
1637 static const llvm::sys::UnicodeCharSet MathContinueChars(
1639
1640 (void)MathStartChars;
1641 (void)MathContinueChars;
1642 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1643 "Unexpected mathematical notation codepoint");
1644 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1646}
1647
1648static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1649 const char *End) {
1651 L.getSourceLocation(End));
1652}
1653
1654static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1655 CharSourceRange Range, bool IsFirst) {
1656 // Check C99 compatibility.
1657 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1658 enum {
1659 CannotAppearInIdentifier = 0,
1660 CannotStartIdentifier
1661 };
1662
1663 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1665 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1667 if (!C99AllowedIDChars.contains(C)) {
1668 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1669 << Range
1670 << CannotAppearInIdentifier;
1671 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1672 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1673 << Range
1674 << CannotStartIdentifier;
1675 }
1676 }
1677}
1678
1679/// After encountering UTF-8 character C and interpreting it as an identifier
1680/// character, check whether it's a homoglyph for a common non-identifier
1681/// source character that is unlikely to be an intentional identifier
1682/// character and warn if so.
1685 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1686 struct HomoglyphPair {
1687 uint32_t Character;
1688 char LooksLike;
1689 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1690 };
1691 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1692 {U'\u00ad', 0}, // SOFT HYPHEN
1693 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1694 {U'\u037e', ';'}, // GREEK QUESTION MARK
1695 {U'\u200b', 0}, // ZERO WIDTH SPACE
1696 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1697 {U'\u200d', 0}, // ZERO WIDTH JOINER
1698 {U'\u2060', 0}, // WORD JOINER
1699 {U'\u2061', 0}, // FUNCTION APPLICATION
1700 {U'\u2062', 0}, // INVISIBLE TIMES
1701 {U'\u2063', 0}, // INVISIBLE SEPARATOR
1702 {U'\u2064', 0}, // INVISIBLE PLUS
1703 {U'\u2212', '-'}, // MINUS SIGN
1704 {U'\u2215', '/'}, // DIVISION SLASH
1705 {U'\u2216', '\\'}, // SET MINUS
1706 {U'\u2217', '*'}, // ASTERISK OPERATOR
1707 {U'\u2223', '|'}, // DIVIDES
1708 {U'\u2227', '^'}, // LOGICAL AND
1709 {U'\u2236', ':'}, // RATIO
1710 {U'\u223c', '~'}, // TILDE OPERATOR
1711 {U'\ua789', ':'}, // MODIFIER LETTER COLON
1712 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1713 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1714 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1715 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1716 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1717 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1718 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1719 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1720 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1721 {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1722 {U'\uff0c', ','}, // FULLWIDTH COMMA
1723 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1724 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1725 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1726 {U'\uff1a', ':'}, // FULLWIDTH COLON
1727 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1728 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1729 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1730 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1731 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1732 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1733 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1734 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1735 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1736 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1737 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1738 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1739 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1740 {U'\uff5e', '~'}, // FULLWIDTH TILDE
1741 {0, 0}
1742 };
1743 auto Homoglyph =
1744 std::lower_bound(std::begin(SortedHomoglyphs),
1745 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1746 if (Homoglyph->Character == C) {
1747 if (Homoglyph->LooksLike) {
1748 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1749 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1750 << Range << codepointAsHexString(C) << LooksLikeStr;
1751 } else {
1752 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1754 }
1755 }
1756}
1757
1759 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1760 CharSourceRange Range, bool IsFirst) {
1761 if (isASCII(CodePoint))
1762 return;
1763
1764 bool IsExtension;
1765 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1766 bool IsIDContinue =
1767 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1768
1769 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1770 return;
1771
1772 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1773
1774 if (!IsFirst || InvalidOnlyAtStart) {
1775 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1776 << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1778 } else {
1779 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1780 << Range << codepointAsHexString(CodePoint)
1782 }
1783}
1784
1785bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1786 Token &Result) {
1787 const char *UCNPtr = CurPtr + Size;
1788 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1789 if (CodePoint == 0) {
1790 return false;
1791 }
1792 bool IsExtension = false;
1793 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1794 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1795 return false;
1799 PP->getDiagnostics(), LangOpts, CodePoint,
1800 makeCharRange(*this, CurPtr, UCNPtr),
1801 /*IsFirst=*/false);
1802
1803 // We got a unicode codepoint that is neither a space nor a
1804 // a valid identifier part.
1805 // Carry on as if the codepoint was valid for recovery purposes.
1806 } else if (!isLexingRawMode()) {
1807 if (IsExtension)
1809 makeCharRange(*this, CurPtr, UCNPtr));
1810
1812 makeCharRange(*this, CurPtr, UCNPtr),
1813 /*IsFirst=*/false);
1814 }
1815
1816 Result.setFlag(Token::HasUCN);
1817 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1818 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1819 CurPtr = UCNPtr;
1820 else
1821 while (CurPtr != UCNPtr)
1822 (void)getAndAdvanceChar(CurPtr, Result);
1823 return true;
1824}
1825
/// Try to consume a raw UTF-8 encoded codepoint that continues an identifier.
/// On success advances CurPtr past the whole UTF-8 sequence and returns true;
/// returns false (CurPtr unchanged) for invalid UTF-8, ASCII, or whitespace
/// codepoints.
1826bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1827  llvm::UTF32 CodePoint;
1828
1829  // If a UTF-8 codepoint appears immediately after an escaped new line,
1830  // CurPtr may point to the splicing \ on the preceding line,
1831  // so we need to skip it.
1832  unsigned FirstCodeUnitSize;
1833  getCharAndSize(CurPtr, FirstCodeUnitSize);
1834  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1835  const char *UnicodePtr = CharStart;
1836
  // Strict decode of one UTF-8 sequence; UnicodePtr is advanced past it.
1837  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1838      (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1839      &CodePoint, llvm::strictConversion);
1840  if (ConvResult != llvm::conversionOK)
1841    return false;
1842
1843  bool IsExtension = false;
1844  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1845                       IsExtension)) {
1846    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1847      return false;
1848
    // NOTE(review): lines 1849-1851 elided in this rendering; the fragment
    // below is the tail of the invalid-codepoint diagnostic call.
1852        PP->getDiagnostics(), LangOpts, CodePoint,
1853        makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
1854    // We got a unicode codepoint that is neither a space nor a
1855    // a valid identifier part. Carry on as if the codepoint was
1856    // valid for recovery purposes.
1857  } else if (!isLexingRawMode()) {
1858    if (IsExtension)
    // NOTE(review): diagnostic-call lines (1859, 1862, 1865) elided in this
    // rendering; the fragments below are argument tails of those calls.
1860          PP->getDiagnostics(), CodePoint,
1861          makeCharRange(*this, CharStart, UnicodePtr));
1863          makeCharRange(*this, CharStart, UnicodePtr),
1864          /*IsFirst=*/false);
1866          makeCharRange(*this, CharStart, UnicodePtr));
1867  }
1868
1869  // Once we successfully parsed some UTF-8,
1870  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1871  // being lexed, and that warnings about trailing spaces are emitted.
1872  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
1873  CurPtr = UnicodePtr;
1874  return true;
1875}
1876
/// Lex a token that begins with a Unicode codepoint C (already decoded;
/// CurPtr points just past it). If C may start an identifier, continues
/// lexing it; otherwise either silently drops an accidental non-ASCII
/// character (returning false so the caller re-lexes) or forms tok::unknown.
1877bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1878                                      const char *CurPtr) {
1879  bool IsExtension = false;
1880  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    // NOTE(review): lines 1881-1882 elided in this rendering (likely a raw-
    // mode guard opening the diagnostics branch below).
1883    if (IsExtension)
      // NOTE(review): diagnostic-call lines (1884, 1886, 1889) elided; the
      // fragments below are argument tails of those calls.
1885            makeCharRange(*this, BufferPtr, CurPtr));
1887          makeCharRange(*this, BufferPtr, CurPtr),
1888          /*IsFirst=*/true);
1890          makeCharRange(*this, BufferPtr, CurPtr));
1891    }
1892
1893    MIOpt.ReadToken();
1894    return LexIdentifierContinue(Result, CurPtr);
1895  }
1896
  // NOTE(review): line 1897 (start of this condition) and line 1899 elided
  // in this rendering.
1898      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1900    // Non-ASCII characters tend to creep into source code unintentionally.
1901    // Instead of letting the parser complain about the unknown token,
1902    // just drop the character.
1903    // Note that we can /only/ do this when the non-ASCII character is actually
1904    // spelled as Unicode, not written as a UCN. The standard requires that
1905    // we not throw away any possible preprocessor tokens, but there's a
1906    // loophole in the mapping of Unicode characters to basic character set
1907    // characters that allows us to map these particular characters to, say,
1908    // whitespace.
    // NOTE(review): line 1909 (head of the diagnostic call) elided here.
1910        PP->getDiagnostics(), LangOpts, C,
1911        makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
1912    BufferPtr = CurPtr;
1913    return false;
1914  }
1915
1916  // Otherwise, we have an explicit UCN or a character that's unlikely to show
1917  // up by accident.
1918  MIOpt.ReadToken();
1919  FormTokenWithChars(Result, CurPtr, tok::unknown);
1920  return true;
1921}
1922
/// Fast-scan a run of ASCII identifier characters ([_A-Za-z0-9]) starting at
/// CurPtr and return a pointer to the first character that is not one. Uses
/// SSE4.2 range comparison when available, 16 bytes at a time, then falls
/// back to a scalar loop for the tail / non-SSE builds.
1923static const char *
1924fastParseASCIIIdentifier(const char *CurPtr,
1925                         [[maybe_unused]] const char *BufferEnd) {
1926#ifdef __SSE4_2__
  // Ranges for PCMPISTRI "ranges" mode: pairs of (lo, hi) bounds.
1927  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1928      '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1929  };
1930  constexpr ssize_t BytesPerRegister = 16;
1931
1932  __m128i AsciiIdentifierRangeV =
1933      _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1934
1935  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1936    __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1937
    // NOTE(review): lines 1939-1940 (the _SIDD_* mode flags of this
    // intrinsic call) are elided in this rendering. Consumed is the index of
    // the first byte outside the identifier ranges, or 16 if all match.
1938    int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1941    CurPtr += Consumed;
1942    if (Consumed == BytesPerRegister)
1943      continue;
1944    return CurPtr;
1945  }
1946#endif
1947
  // Scalar fallback/tail loop.
1948  unsigned char C = *CurPtr;
  // NOTE(review): line 1949 (the loop header guarding this advance, testing
  // C for being an ASCII identifier-continue character) is elided here.
1950    C = *++CurPtr;
1951  return CurPtr;
1952}
1953
/// Lex the remainder of an identifier whose start has already been consumed.
/// Handles '$' (when enabled), UCNs, and raw UTF-8 continue characters, then
/// forms a raw_identifier token; outside raw mode it also resolves the
/// IdentifierInfo, handles the code-completion point, and lets the
/// preprocessor macro-expand the result.
1954bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1955  // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1956
1957  while (true) {
1958
1959    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1960
1961    unsigned Size;
1962    // Slow path: handle trigraph, unicode codepoints, UCNs.
1963    unsigned char C = getCharAndSize(CurPtr, Size);
    // NOTE(review): line 1964 (the "if" guarding this consume — C being an
    // ASCII identifier-continue character) is elided in this rendering.
1965      CurPtr = ConsumeChar(CurPtr, Size, Result);
1966      continue;
1967    }
1968    if (C == '$') {
1969      // If we hit a $ and they are not supported in identifiers, we are done.
1970      if (!LangOpts.DollarIdents)
1971        break;
1972      // Otherwise, emit a diagnostic and continue.
1973      if (!isLexingRawMode())
1974        Diag(CurPtr, diag::ext_dollar_in_identifier);
1975      CurPtr = ConsumeChar(CurPtr, Size, Result);
1976      continue;
1977    }
1978    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1979      continue;
1980    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1981      continue;
1982    // Neither an expected Unicode codepoint nor a UCN.
1983    break;
1984  }
1985
1986  const char *IdStart = BufferPtr;
1987  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1988  Result.setRawIdentifierData(IdStart);
1989
1990  // If we are in raw mode, return this identifier raw. There is no need to
1991  // look up identifier information or attempt to macro expand it.
1992  if (LexingRawMode)
1993    return true;
1994
1995  // Fill in Result.IdentifierInfo and update the token kind,
1996  // looking up the identifier in the identifier table.
  // NOTE(review): line 1997 (the lookup declaring II, presumably via
  // PP->LookUpIdentifierInfo(Result)) is elided in this rendering; II is
  // used below.
1998  // Note that we have to call PP->LookUpIdentifierInfo() even for code
1999  // completion, it writes IdentifierInfo into Result, and callers rely on it.
2000
2001  // If the completion point is at the end of an identifier, we want to treat
2002  // the identifier as incomplete even if it resolves to a macro or a keyword.
2003  // This allows e.g. 'class^' to complete to 'classifier'.
2004  if (isCodeCompletionPoint(CurPtr)) {
2005    // Return the code-completion token.
2006    Result.setKind(tok::code_completion);
2007    // Skip the code-completion char and all immediate identifier characters.
2008    // This ensures we get consistent behavior when completing at any point in
2009    // an identifier (i.e. at the start, in the middle, at the end). Note that
2010    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
2011    // simpler.
2012    assert(*CurPtr == 0 && "Completion character must be 0");
2013    ++CurPtr;
2014    // Note that code completion token is not added as a separate character
2015    // when the completion point is at the end of the buffer. Therefore, we need
2016    // to check if the buffer has ended.
2017    if (CurPtr < BufferEnd) {
2018      while (isAsciiIdentifierContinue(*CurPtr))
2019        ++CurPtr;
2020    }
2021    BufferPtr = CurPtr;
2022    return true;
2023  }
2024
2025  // Finally, now that we know we have an identifier, pass this off to the
2026  // preprocessor, which may macro expand it or something.
2027  if (II->isHandleIdentifierCase())
2028    return PP->HandleIdentifier(Result);
2029
2030  return true;
2031}
2032
2033/// isHexaLiteral - Return true if Start points to a hex constant.
2034/// in microsoft mode (where this is supposed to be several different tokens).
2035bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2036 auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
2037 char C1 = CharAndSize1.Char;
2038 if (C1 != '0')
2039 return false;
2040
2041 auto CharAndSize2 =
2042 Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
2043 char C2 = CharAndSize2.Char;
2044 return (C2 == 'x' || C2 == 'X');
2045}
2046
2047/// LexNumericConstant - Lex the remainder of a integer or floating point
2048/// constant. From[-1] is the first character lexed. Return the end of the
2049/// constant.
2050bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2051 unsigned Size;
2052 char C = getCharAndSize(CurPtr, Size);
2053 char PrevCh = 0;
2054 while (isPreprocessingNumberBody(C)) {
2055 CurPtr = ConsumeChar(CurPtr, Size, Result);
2056 PrevCh = C;
2057 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2058 CurPtr -= Size;
2059 break;
2060 }
2061 C = getCharAndSize(CurPtr, Size);
2062 }
2063
2064 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2065 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2066 // If we are in Microsoft mode, don't continue if the constant is hex.
2067 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2068 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2069 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2070 }
2071
2072 // If we have a hex FP constant, continue.
2073 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2074 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2075 // not-quite-conforming extension. Only do so if this looks like it's
2076 // actually meant to be a hexfloat, and not if it has a ud-suffix.
2077 bool IsHexFloat = true;
2078 if (!LangOpts.C99) {
2079 if (!isHexaLiteral(BufferPtr, LangOpts))
2080 IsHexFloat = false;
2081 else if (!LangOpts.CPlusPlus17 &&
2082 std::find(BufferPtr, CurPtr, '_') != CurPtr)
2083 IsHexFloat = false;
2084 }
2085 if (IsHexFloat)
2086 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2087 }
2088
2089 // If we have a digit separator, continue.
2090 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2091 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2092 if (isAsciiIdentifierContinue(Next)) {
2093 if (!isLexingRawMode())
2094 Diag(CurPtr, LangOpts.CPlusPlus
2095 ? diag::warn_cxx11_compat_digit_separator
2096 : diag::warn_c23_compat_digit_separator);
2097 CurPtr = ConsumeChar(CurPtr, Size, Result);
2098 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2099 return LexNumericConstant(Result, CurPtr);
2100 }
2101 }
2102
2103 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2104 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2105 return LexNumericConstant(Result, CurPtr);
2106 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2107 return LexNumericConstant(Result, CurPtr);
2108
2109 // Update the location of token as well as BufferPtr.
2110 const char *TokStart = BufferPtr;
2111 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2112 Result.setLiteralData(TokStart);
2113 return true;
2114}
2115
2116/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2117/// in C++11, or warn on a ud-suffix in C++98.
/// Returns the pointer past the suffix (or CurPtr unchanged if there is
/// none); sets Token::HasUDSuffix when a suffix was consumed.
2118const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2119                               bool IsStringLiteral) {
2120  assert(LangOpts.CPlusPlus);
2121
2122  // Maximally munch an identifier.
2123  unsigned Size;
2124  char C = getCharAndSize(CurPtr, Size);
2125  bool Consumed = false;
2126
2127  if (!isAsciiIdentifierStart(C)) {
    // A suffix may also begin with a UCN or a raw UTF-8 identifier char.
2128    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2129      Consumed = true;
2130    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2131      Consumed = true;
2132    else
2133      return CurPtr;
2134  }
2135
  // Pre-C++11: a ud-suffix is only a compatibility warning, not a literal.
2136  if (!LangOpts.CPlusPlus11) {
2137    if (!isLexingRawMode())
2138      Diag(CurPtr,
2139           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2140                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
      // NOTE(review): line 2141 (the diagnostic's trailing stream argument,
      // likely a fix-it hint) is elided in this rendering.
2142    return CurPtr;
2143  }
2144
2145  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2146  // that does not start with an underscore is ill-formed. As a conforming
2147  // extension, we treat all such suffixes as if they had whitespace before
2148  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2149  // likely to be a ud-suffix than a macro, however, and accept that.
2150  if (!Consumed) {
2151    bool IsUDSuffix = false;
2152    if (C == '_')
2153      IsUDSuffix = true;
2154    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2155      // In C++1y, we need to look ahead a few characters to see if this is a
2156      // valid suffix for a string literal or a numeric literal (this could be
2157      // the 'operator""if' defining a numeric literal operator).
2158      const unsigned MaxStandardSuffixLength = 3;
2159      char Buffer[MaxStandardSuffixLength] = { C };
2160      unsigned Consumed = Size;
2161      unsigned Chars = 1;
2162      while (true) {
2163        auto [Next, NextSize] =
2164            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
2165        if (!isAsciiIdentifierContinue(Next)) {
2166          // End of suffix. Check whether this is on the allowed list.
2167          const StringRef CompleteSuffix(Buffer, Chars);
2168          IsUDSuffix =
2169              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2170          break;
2171        }
2172
2173        if (Chars == MaxStandardSuffixLength)
2174          // Too long: can't be a standard suffix.
2175          break;
2176
2177        Buffer[Chars++] = Next;
2178        Consumed += NextSize;
2179      }
2180    }
2181
2182    if (!IsUDSuffix) {
2183      if (!isLexingRawMode())
2184        Diag(CurPtr, LangOpts.MSVCCompat
2185                         ? diag::ext_ms_reserved_user_defined_literal
2186                         : diag::ext_reserved_user_defined_literal)
        // NOTE(review): line 2187 (the diagnostic's trailing stream argument,
        // likely a fix-it hint) is elided in this rendering.
2188      return CurPtr;
2189    }
2190
2191    CurPtr = ConsumeChar(CurPtr, Size, Result);
2192  }
2193
  // Consume the rest of the suffix: identifier chars, UCNs, or UTF-8.
2194  Result.setFlag(Token::HasUDSuffix);
2195  while (true) {
2196    C = getCharAndSize(CurPtr, Size);
    // NOTE(review): line 2197 (the "if" guarding this consume — C being an
    // ASCII identifier-continue character) is elided in this rendering.
2198      CurPtr = ConsumeChar(CurPtr, Size, Result);
2199    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2200    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2201    } else
2202      break;
2203  }
2204
2205  return CurPtr;
2206}
2207
2208/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2209/// either " or L" or u8" or u" or U".
/// Scans to the closing '"', handling escapes, embedded NULs, the code-
/// completion point, and the optional C++11 ud-suffix. Forms tok::unknown
/// for an unterminated literal; always returns true (a token was produced).
2210bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2211                             tok::TokenKind Kind) {
2212  const char *AfterQuote = CurPtr;
2213  // Does this string contain the \0 character?
2214  const char *NulCharacter = nullptr;
2215
  // Unicode string literals are a compatibility warning pre-C++11/C11.
2216  if (!isLexingRawMode() &&
2217      (Kind == tok::utf8_string_literal ||
2218       Kind == tok::utf16_string_literal ||
2219       Kind == tok::utf32_string_literal))
2220    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2221                                       : diag::warn_c99_compat_unicode_literal);
2222
2223  char C = getAndAdvanceChar(CurPtr, Result);
2224  while (C != '"') {
2225    // Skip escaped characters. Escaped newlines will already be processed by
2226    // getAndAdvanceChar.
2227    if (C == '\\')
2228      C = getAndAdvanceChar(CurPtr, Result);
2229
2230    if (C == '\n' || C == '\r' ||             // Newline.
2231        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
2232      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2233        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2234      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2235      return true;
2236    }
2237
2238    if (C == 0) {
      // A NUL is either the code-completion point or a stray NUL we warn
      // about after the literal is fully lexed.
2239      if (isCodeCompletionPoint(CurPtr-1)) {
2240        if (ParsingFilename)
2241          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2242        else
        // NOTE(review): line 2243 (the non-filename code-completion call) is
        // elided in this rendering.
2244        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2245        cutOffLexing();
2246        return true;
2247      }
2248
2249      NulCharacter = CurPtr-1;
2250    }
2251    C = getAndAdvanceChar(CurPtr, Result);
2252  }
2253
2254  // If we are in C++11, lex the optional ud-suffix.
2255  if (LangOpts.CPlusPlus)
2256    CurPtr = LexUDSuffix(Result, CurPtr, true);
2257
2258  // If a nul character existed in the string, warn about it.
2259  if (NulCharacter && !isLexingRawMode())
2260    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2261
2262  // Update the location of the token as well as the BufferPtr instance var.
2263  const char *TokStart = BufferPtr;
2264  FormTokenWithChars(Result, CurPtr, Kind);
2265  Result.setLiteralData(TokStart);
2266  return true;
2267}
2268
2269/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2270/// having lexed R", LR", u8R", uR", or UR".
2271bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2272 tok::TokenKind Kind) {
2273 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2274 // Between the initial and final double quote characters of the raw string,
2275 // any transformations performed in phases 1 and 2 (trigraphs,
2276 // universal-character-names, and line splicing) are reverted.
2277
2278 if (!isLexingRawMode())
2279 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2280
2281 unsigned PrefixLen = 0;
2282
2283 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
2284 if (!isLexingRawMode() &&
2285 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2286 const char *Pos = &CurPtr[PrefixLen];
2287 Diag(Pos, LangOpts.CPlusPlus26
2288 ? diag::warn_cxx26_compat_raw_string_literal_character_set
2289 : diag::ext_cxx26_raw_string_literal_character_set)
2290 << StringRef(Pos, 1);
2291 }
2292 ++PrefixLen;
2293 }
2294
2295 // If the last character was not a '(', then we didn't lex a valid delimiter.
2296 if (CurPtr[PrefixLen] != '(') {
2297 if (!isLexingRawMode()) {
2298 const char *PrefixEnd = &CurPtr[PrefixLen];
2299 if (PrefixLen == 16) {
2300 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2301 } else if (*PrefixEnd == '\n') {
2302 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2303 } else {
2304 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2305 << StringRef(PrefixEnd, 1);
2306 }
2307 }
2308
2309 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2310 // it's possible the '"' was intended to be part of the raw string, but
2311 // there's not much we can do about that.
2312 while (true) {
2313 char C = *CurPtr++;
2314
2315 if (C == '"')
2316 break;
2317 if (C == 0 && CurPtr-1 == BufferEnd) {
2318 --CurPtr;
2319 break;
2320 }
2321 }
2322
2323 FormTokenWithChars(Result, CurPtr, tok::unknown);
2324 return true;
2325 }
2326
2327 // Save prefix and move CurPtr past it
2328 const char *Prefix = CurPtr;
2329 CurPtr += PrefixLen + 1; // skip over prefix and '('
2330
2331 while (true) {
2332 char C = *CurPtr++;
2333
2334 if (C == ')') {
2335 // Check for prefix match and closing quote.
2336 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2337 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2338 break;
2339 }
2340 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2341 if (!isLexingRawMode())
2342 Diag(BufferPtr, diag::err_unterminated_raw_string)
2343 << StringRef(Prefix, PrefixLen);
2344 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2345 return true;
2346 }
2347 }
2348
2349 // If we are in C++11, lex the optional ud-suffix.
2350 if (LangOpts.CPlusPlus)
2351 CurPtr = LexUDSuffix(Result, CurPtr, true);
2352
2353 // Update the location of token as well as BufferPtr.
2354 const char *TokStart = BufferPtr;
2355 FormTokenWithChars(Result, CurPtr, Kind);
2356 Result.setLiteralData(TokStart);
2357 return true;
2358}
2359
2360/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2361/// after having lexed the '<' character. This is used for #include filenames.
2362bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2363 // Does this string contain the \0 character?
2364 const char *NulCharacter = nullptr;
2365 const char *AfterLessPos = CurPtr;
2366 char C = getAndAdvanceChar(CurPtr, Result);
2367 while (C != '>') {
2368 // Skip escaped characters. Escaped newlines will already be processed by
2369 // getAndAdvanceChar.
2370 if (C == '\\')
2371 C = getAndAdvanceChar(CurPtr, Result);
2372
2373 if (isVerticalWhitespace(C) || // Newline.
2374 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2375 // If the filename is unterminated, then it must just be a lone <
2376 // character. Return this as such.
2377 FormTokenWithChars(Result, AfterLessPos, tok::less);
2378 return true;
2379 }
2380
2381 if (C == 0) {
2382 if (isCodeCompletionPoint(CurPtr - 1)) {
2383 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2384 cutOffLexing();
2385 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2386 return true;
2387 }
2388 NulCharacter = CurPtr-1;
2389 }
2390 C = getAndAdvanceChar(CurPtr, Result);
2391 }
2392
2393 // If a nul character existed in the string, warn about it.
2394 if (NulCharacter && !isLexingRawMode())
2395 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2396
2397 // Update the location of token as well as BufferPtr.
2398 const char *TokStart = BufferPtr;
2399 FormTokenWithChars(Result, CurPtr, tok::header_name);
2400 Result.setLiteralData(TokStart);
2401 return true;
2402}
2403
/// Set up code completion for a partially-typed #include filename.
/// PathStart..CompletionPoint is the filename text typed so far; IsAngled
/// distinguishes <...> from "..." includes. Splits the text into directory
/// and filename-so-far, then hands off to the preprocessor.
2404void Lexer::codeCompleteIncludedFile(const char *PathStart,
2405                                     const char *CompletionPoint,
2406                                     bool IsAngled) {
2407  // Completion only applies to the filename, after the last slash.
2408  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // MSVC also accepts '\' as a path separator in includes.
2409  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2410  auto Slash = PartialPath.find_last_of(SlashChars);
2411  StringRef Dir =
2412      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2413  const char *StartOfFilename =
2414      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2415  // Code completion filter range is the filename only, up to completion point.
  // NOTE(review): line 2416 (the call this argument list belongs to, likely
  // setting the preprocessor's completion filter) is elided in this
  // rendering.
2417      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2418  // We should replace the characters up to the closing quote or closest slash,
2419  // if any.
2420  while (CompletionPoint < BufferEnd) {
2421    char Next = *(CompletionPoint + 1);
2422    if (Next == 0 || Next == '\r' || Next == '\n')
2423      break;
2424    ++CompletionPoint;
2425    if (Next == (IsAngled ? '>' : '"'))
2426      break;
2427    if (SlashChars.contains(Next))
2428      break;
2429  }
2430
  // NOTE(review): line 2431 (the call taking the two source locations below,
  // likely setting the completion token range) is elided in this rendering.
2432      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2433      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2434  PP->CodeCompleteIncludedFile(Dir, IsAngled);
2435}
2436
2437/// LexCharConstant - Lex the remainder of a character constant, after having
2438/// lexed either ' or L' or u8' or u' or U'.
/// Handles escapes, empty literals, embedded NULs, the code-completion
/// point, and the optional C++11 ud-suffix. Forms tok::unknown on error;
/// always returns true (a token was produced).
2439bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2440                            tok::TokenKind Kind) {
2441  // Does this character contain the \0 character?
2442  const char *NulCharacter = nullptr;
2443
  // u/U/u8 character literals are compatibility warnings on older standards.
2444  if (!isLexingRawMode()) {
2445    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2446      Diag(BufferPtr, LangOpts.CPlusPlus
2447                          ? diag::warn_cxx98_compat_unicode_literal
2448                          : diag::warn_c99_compat_unicode_literal);
2449    else if (Kind == tok::utf8_char_constant)
2450      Diag(BufferPtr, LangOpts.CPlusPlus
2451                          ? diag::warn_cxx14_compat_u8_character_literal
2452                          : diag::warn_c17_compat_u8_character_literal);
2453  }
2454
2455  char C = getAndAdvanceChar(CurPtr, Result);
  // '' is not a valid character literal.
2456  if (C == '\'') {
2457    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2458      Diag(BufferPtr, diag::ext_empty_character);
2459    FormTokenWithChars(Result, CurPtr, tok::unknown);
2460    return true;
2461  }
2462
2463  while (C != '\'') {
2464    // Skip escaped characters.
2465    if (C == '\\')
2466      C = getAndAdvanceChar(CurPtr, Result);
2467
2468    if (C == '\n' || C == '\r' ||             // Newline.
2469        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
2470      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2471        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2472      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2473      return true;
2474    }
2475
2476    if (C == 0) {
      // A NUL is either the code-completion point or a stray NUL we warn
      // about after the literal is fully lexed.
2477      if (isCodeCompletionPoint(CurPtr-1)) {
        // NOTE(review): line 2478 (the code-completion callback invoked
        // here) is elided in this rendering.
2479        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2480        cutOffLexing();
2481        return true;
2482      }
2483
2484      NulCharacter = CurPtr-1;
2485    }
2486    C = getAndAdvanceChar(CurPtr, Result);
2487  }
2488
2489  // If we are in C++11, lex the optional ud-suffix.
2490  if (LangOpts.CPlusPlus)
2491    CurPtr = LexUDSuffix(Result, CurPtr, false);
2492
2493  // If a nul character existed in the character, warn about it.
2494  if (NulCharacter && !isLexingRawMode())
2495    Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2496
2497  // Update the location of token as well as BufferPtr.
2498  const char *TokStart = BufferPtr;
2499  FormTokenWithChars(Result, CurPtr, Kind);
2500  Result.setLiteralData(TokStart);
2501  return true;
2502}
2503
2504/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2505/// Update BufferPtr to point to the next non-whitespace character and return.
2506///
2507/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2508bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2509                           bool &TokAtPhysicalStartOfLine) {
2510  // Whitespace - Skip it, then return the token after the whitespace.
2511  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2512
2513  unsigned char Char = *CurPtr;
2514
  // Track the last newline skipped, and remember the first one globally in
  // NewLinePtr; the pair is used below to detect runs of empty lines.
2515  const char *lastNewLine = nullptr;
2516  auto setLastNewLine = [&](const char *Ptr) {
2517    lastNewLine = Ptr;
2518    if (!NewLinePtr)
2519      NewLinePtr = Ptr;
2520  };
2521  if (SawNewline)
2522    setLastNewLine(CurPtr - 1);
2523
2524  // Skip consecutive spaces efficiently.
2525  while (true) {
2526    // Skip horizontal whitespace very aggressively.
2527    while (isHorizontalWhitespace(Char))
2528      Char = *++CurPtr;
2529
2530    // Otherwise if we have something other than whitespace, we're done.
2531    if (!isVerticalWhitespace(Char))
2532      break;
2533
    // NOTE(review): line 2534 (the guard opening this branch, presumably
    // testing ParsingPreprocessorDirective) is elided in this rendering.
2535      // End of preprocessor directive line, let LexTokenInternal handle this.
2536      BufferPtr = CurPtr;
2537      return false;
2538    }
2539
2540    // OK, but handle newline.
2541    if (*CurPtr == '\n')
2542      setLastNewLine(CurPtr);
2543    SawNewline = true;
2544    Char = *++CurPtr;
2545  }
2546
2547  // If the client wants us to return whitespace, return it now.
2548  if (isKeepWhitespaceMode()) {
2549    FormTokenWithChars(Result, CurPtr, tok::unknown);
2550    if (SawNewline) {
2551      IsAtStartOfLine = true;
2552      IsAtPhysicalStartOfLine = true;
2553    }
2554    // FIXME: The next token will not have LeadingSpace set.
2555    return true;
2556  }
2557
2558  // If this isn't immediately after a newline, there is leading space.
2559  char PrevChar = CurPtr[-1];
2560  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2561
2562  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2563  if (SawNewline) {
2564    Result.setFlag(Token::StartOfLine);
2565    TokAtPhysicalStartOfLine = true;
2566
    // More than one newline skipped means there was at least one empty line;
    // notify any registered empty-line handler with its range.
2567    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2568      if (auto *Handler = PP->getEmptylineHandler())
2569        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2570                                             getSourceLocation(lastNewLine)));
2571    }
2572  }
2573
2574  BufferPtr = CurPtr;
2575  return false;
2576}
2577
2578/// We have just read the // characters from input. Skip until we find the
2579/// newline character that terminates the comment. Then update BufferPtr and
2580/// return.
2581///
2582/// If we're in KeepCommentMode or any CommentHandler has inserted
2583/// some tokens, this will store the first token and return true.
2584bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2585                            bool &TokAtPhysicalStartOfLine) {
2586  // If Line comments aren't explicitly enabled for this language, emit an
2587  // extension warning.
2588  if (!LineComment) {
2589    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2590      Diag(BufferPtr, diag::ext_line_comment);
2591
2592    // Mark them enabled so we only emit one warning for this translation
2593    // unit.
2594    LineComment = true;
2595  }
2596
2597  // Scan over the body of the comment. The common case, when scanning, is that
2598  // the comment contains normal ascii characters with nothing interesting in
2599  // them. As such, optimize for this case with the inner loop.
2600  //
2601  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2602  // character that ends the line comment.
2603
2604  // C++23 [lex.phases] p1
2605  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2606  // diagnostic only once per entire ill-formed subsequence to avoid
2607  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2608  bool UnicodeDecodingAlreadyDiagnosed = false;
2609
2610  char C;
2611  while (true) {
2612    C = *CurPtr;
2613    // Skip over characters in the fast loop.
2614    while (isASCII(C) && C != 0 &&   // Potentially EOF.
2615           C != '\n' && C != '\r') { // Newline or DOS-style newline.
2616      C = *++CurPtr;
2617      UnicodeDecodingAlreadyDiagnosed = false;
2618    }
2619
    // Non-ASCII bytes: validate as UTF-8, diagnosing each ill-formed
    // subsequence only once.
2620    if (!isASCII(C)) {
2621      unsigned Length = llvm::getUTF8SequenceSize(
2622          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2623      if (Length == 0) {
2624        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2625          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2626        UnicodeDecodingAlreadyDiagnosed = true;
2627        ++CurPtr;
2628      } else {
2629        UnicodeDecodingAlreadyDiagnosed = false;
2630        CurPtr += Length;
2631      }
2632      continue;
2633    }
2634
2635    const char *NextLine = CurPtr;
2636    if (C != 0) {
2637      // We found a newline, see if it's escaped.
2638      const char *EscapePtr = CurPtr-1;
2639      bool HasSpace = false;
2640      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2641        --EscapePtr;
2642        HasSpace = true;
2643      }
2644
2645      if (*EscapePtr == '\\')
2646        // Escaped newline.
2647        CurPtr = EscapePtr;
2648      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2649               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2650        // Trigraph-escaped newline.
2651        CurPtr = EscapePtr-2;
2652      else
2653        break; // This is a newline, we're done.
2654
2655      // If there was space between the backslash and newline, warn about it.
2656      if (HasSpace && !isLexingRawMode())
2657        Diag(EscapePtr, diag::backslash_newline_space);
2658    }
2659
2660    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2661    // properly decode the character. Read it in raw mode to avoid emitting
2662    // diagnostics about things like trigraphs. If we see an escaped newline,
2663    // we'll handle it below.
2664    const char *OldPtr = CurPtr;
2665    bool OldRawMode = isLexingRawMode();
2666    LexingRawMode = true;
2667    C = getAndAdvanceChar(CurPtr, Result);
2668    LexingRawMode = OldRawMode;
2669
2670    // If we only read only one character, then no special handling is needed.
2671    // We're done and can skip forward to the newline.
2672    if (C != 0 && CurPtr == OldPtr+1) {
2673      CurPtr = NextLine;
2674      break;
2675    }
2676
2677    // If we read multiple characters, and one of those characters was a \r or
2678    // \n, then we had an escaped newline within the comment. Emit diagnostic
2679    // unless the next line is also a // comment.
2680    if (CurPtr != OldPtr + 1 && C != '/' &&
2681        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2682      for (; OldPtr != CurPtr; ++OldPtr)
2683        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2684          // Okay, we found a // comment that ends in a newline, if the next
2685          // line is also a // comment, but has spaces, don't emit a diagnostic.
2686          if (isWhitespace(C)) {
2687            const char *ForwardPtr = CurPtr;
2688            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2689              ++ForwardPtr;
2690            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2691              break;
2692          }
2693
2694          if (!isLexingRawMode())
2695            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2696          break;
2697        }
2698    }
2699
2700    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2701      --CurPtr;
2702      break;
2703    }
2704
2705    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      // NOTE(review): line 2706 (the code-completion callback invoked here)
      // is elided in this rendering.
2707      cutOffLexing();
2708      return false;
2709    }
2710  }
2711
2712  // Found but did not consume the newline. Notify comment handlers about the
2713  // comment unless we're in a #if 0 block.
2714  if (PP && !isLexingRawMode() &&
  // NOTE(review): line 2715 (the comment-handler invocation this condition
  // continues with, likely PP->HandleComment) is elided in this rendering.
2716                                            getSourceLocation(CurPtr)))) {
2717    BufferPtr = CurPtr;
2718    return true; // A token has to be returned.
2719  }
2720
2721  // If we are returning comments as tokens, return this comment as a token.
2722  if (inKeepCommentMode())
2723    return SaveLineComment(Result, CurPtr);
2724
2725  // If we are inside a preprocessor directive and we see the end of line,
2726  // return immediately, so that the lexer can return this as an EOD token.
2727  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2728    BufferPtr = CurPtr;
2729    return false;
2730  }
2731
2732  // Otherwise, eat the \n character. We don't care if this is a \n\r or
2733  // \r\n sequence. This is an efficiency hack (because we know the \n can't
2734  // contribute to another token), it isn't needed for correctness. Note that
2735  // this is ok even in KeepWhitespaceMode, because we would have returned the
2736  // comment above in that mode.
2737  NewLinePtr = CurPtr++;
2738
2739  // The next returned token is at the start of the line.
2740  Result.setFlag(Token::StartOfLine);
2741  TokAtPhysicalStartOfLine = true;
2742  // No leading whitespace seen so far.
2743  Result.clearFlag(Token::LeadingSpace);
2744  BufferPtr = CurPtr;
2745  return false;
2746}
2747
2748/// If in save-comment mode, package up this Line comment in an appropriate
2749/// way and return it.
2750bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2751 // If we're not in a preprocessor directive, just return the // comment
2752 // directly.
2753 FormTokenWithChars(Result, CurPtr, tok::comment);
2754
2756 return true;
2757
2758 // If this Line-style comment is in a macro definition, transmogrify it into
2759 // a C-style block comment.
2760 bool Invalid = false;
2761 std::string Spelling = PP->getSpelling(Result, &Invalid);
2762 if (Invalid)
2763 return true;
2764
2765 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2766 Spelling[1] = '*'; // Change prefix to "/*".
2767 Spelling += "*/"; // add suffix.
2768
2769 Result.setKind(tok::comment);
2770 PP->CreateString(Spelling, Result,
2771 Result.getLocation(), Result.getLocation());
2772 return true;
2773}
2774
/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
/// a diagnostic if so. We know that the newline is inside of a block comment.
///
/// On entry CurPtr points at the newline; the function scans backwards
/// through any chain of escaped newlines looking for a '*' that, after line
/// splicing, would form the '*/' comment terminator.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  // Walk backwards one escaped newline per iteration; stop when we find the
  // '*' of the spliced '*/' or determine this is not an escaped newline.
  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it. We allow whitespace
    // between the slash and newline. NUL bytes are deliberately treated like
    // whitespace here as well.
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    // Anything other than another newline means this was not '*/' followed by
    // escaped newlines.
    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}
2849
2850#ifdef __SSE2__
2851#include <emmintrin.h>
2852#elif __ALTIVEC__
2853#include <altivec.h>
2854#undef bool
2855#endif
2856
2857/// We have just read from input the / and * characters that started a comment.
2858/// Read until we find the * and / characters that terminate the comment.
2859/// Note that we don't bother decoding trigraphs or escaped newlines in block
2860/// comments, because they cannot cause the comment to end. The only thing
2861/// that can happen is the comment could end with an escaped newline between
2862/// the terminating * and /.
2863///
2864/// If we're in KeepCommentMode or any CommentHandler has inserted
2865/// some tokens, this will store the first token and return true.
2866bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2867 bool &TokAtPhysicalStartOfLine) {
2868 // Scan one character past where we should, looking for a '/' character. Once
2869 // we find it, check to see if it was preceded by a *. This common
2870 // optimization helps people who like to put a lot of * characters in their
2871 // comments.
2872
2873 // The first character we get with newlines and trigraphs skipped to handle
2874 // the degenerate /*/ case below correctly if the * has an escaped newline
2875 // after it.
2876 unsigned CharSize;
2877 unsigned char C = getCharAndSize(CurPtr, CharSize);
2878 CurPtr += CharSize;
2879 if (C == 0 && CurPtr == BufferEnd+1) {
2880 if (!isLexingRawMode())
2881 Diag(BufferPtr, diag::err_unterminated_block_comment);
2882 --CurPtr;
2883
2884 // KeepWhitespaceMode should return this broken comment as a token. Since
2885 // it isn't a well formed comment, just return it as an 'unknown' token.
2886 if (isKeepWhitespaceMode()) {
2887 FormTokenWithChars(Result, CurPtr, tok::unknown);
2888 return true;
2889 }
2890
2891 BufferPtr = CurPtr;
2892 return false;
2893 }
2894
2895 // Check to see if the first character after the '/*' is another /. If so,
2896 // then this slash does not end the block comment, it is part of it.
2897 if (C == '/')
2898 C = *CurPtr++;
2899
2900 // C++23 [lex.phases] p1
2901 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2902 // diagnostic only once per entire ill-formed subsequence to avoid
2903 // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2904 bool UnicodeDecodingAlreadyDiagnosed = false;
2905
2906 while (true) {
2907 // Skip over all non-interesting characters until we find end of buffer or a
2908 // (probably ending) '/' character.
2909 if (CurPtr + 24 < BufferEnd &&
2910 // If there is a code-completion point avoid the fast scan because it
2911 // doesn't check for '\0'.
2912 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2913 // While not aligned to a 16-byte boundary.
2914 while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2915 if (!isASCII(C))
2916 goto MultiByteUTF8;
2917 C = *CurPtr++;
2918 }
2919 if (C == '/') goto FoundSlash;
2920
2921#ifdef __SSE2__
2922 __m128i Slashes = _mm_set1_epi8('/');
2923 while (CurPtr + 16 < BufferEnd) {
2924 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2925 if (LLVM_UNLIKELY(Mask != 0)) {
2926 goto MultiByteUTF8;
2927 }
2928 // look for slashes
2929 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2930 Slashes));
2931 if (cmp != 0) {
2932 // Adjust the pointer to point directly after the first slash. It's
2933 // not necessary to set C here, it will be overwritten at the end of
2934 // the outer loop.
2935 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2936 goto FoundSlash;
2937 }
2938 CurPtr += 16;
2939 }
2940#elif __ALTIVEC__
2941 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2942 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2943 0x80, 0x80, 0x80, 0x80};
2944 __vector unsigned char Slashes = {
2945 '/', '/', '/', '/', '/', '/', '/', '/',
2946 '/', '/', '/', '/', '/', '/', '/', '/'
2947 };
2948 while (CurPtr + 16 < BufferEnd) {
2949 if (LLVM_UNLIKELY(
2950 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2951 goto MultiByteUTF8;
2952 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2953 break;
2954 }
2955 CurPtr += 16;
2956 }
2957
2958#else
2959 while (CurPtr + 16 < BufferEnd) {
2960 bool HasNonASCII = false;
2961 for (unsigned I = 0; I < 16; ++I)
2962 HasNonASCII |= !isASCII(CurPtr[I]);
2963
2964 if (LLVM_UNLIKELY(HasNonASCII))
2965 goto MultiByteUTF8;
2966
2967 bool HasSlash = false;
2968 for (unsigned I = 0; I < 16; ++I)
2969 HasSlash |= CurPtr[I] == '/';
2970 if (HasSlash)
2971 break;
2972 CurPtr += 16;
2973 }
2974#endif
2975
2976 // It has to be one of the bytes scanned, increment to it and read one.
2977 C = *CurPtr++;
2978 }
2979
2980 // Loop to scan the remainder, warning on invalid UTF-8
2981 // if the corresponding warning is enabled, emitting a diagnostic only once
2982 // per sequence that cannot be decoded.
2983 while (C != '/' && C != '\0') {
2984 if (isASCII(C)) {
2985 UnicodeDecodingAlreadyDiagnosed = false;
2986 C = *CurPtr++;
2987 continue;
2988 }
2989 MultiByteUTF8:
2990 // CurPtr is 1 code unit past C, so to decode
2991 // the codepoint, we need to read from the previous position.
2992 unsigned Length = llvm::getUTF8SequenceSize(
2993 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2994 if (Length == 0) {
2995 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2996 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2997 UnicodeDecodingAlreadyDiagnosed = true;
2998 } else {
2999 UnicodeDecodingAlreadyDiagnosed = false;
3000 CurPtr += Length - 1;
3001 }
3002 C = *CurPtr++;
3003 }
3004
3005 if (C == '/') {
3006 FoundSlash:
3007 if (CurPtr[-2] == '*') // We found the final */. We're done!
3008 break;
3009
3010 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
3011 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
3012 LangOpts.Trigraphs)) {
3013 // We found the final */, though it had an escaped newline between the
3014 // * and /. We're done!
3015 break;
3016 }
3017 }
3018 if (CurPtr[0] == '*' && CurPtr[1] != '/') {
3019 // If this is a /* inside of the comment, emit a warning. Don't do this
3020 // if this is a /*/, which will end the comment. This misses cases with
3021 // embedded escaped newlines, but oh well.
3022 if (!isLexingRawMode())
3023 Diag(CurPtr-1, diag::warn_nested_block_comment);
3024 }
3025 } else if (C == 0 && CurPtr == BufferEnd+1) {
3026 if (!isLexingRawMode())
3027 Diag(BufferPtr, diag::err_unterminated_block_comment);
3028 // Note: the user probably forgot a */. We could continue immediately
3029 // after the /*, but this would involve lexing a lot of what really is the
3030 // comment, which surely would confuse the parser.
3031 --CurPtr;
3032
3033 // KeepWhitespaceMode should return this broken comment as a token. Since
3034 // it isn't a well formed comment, just return it as an 'unknown' token.
3035 if (isKeepWhitespaceMode()) {
3036 FormTokenWithChars(Result, CurPtr, tok::unknown);
3037 return true;
3038 }
3039
3040 BufferPtr = CurPtr;
3041 return false;
3042 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
3044 cutOffLexing();
3045 return false;
3046 }
3047
3048 C = *CurPtr++;
3049 }
3050
3051 // Notify comment handlers about the comment unless we're in a #if 0 block.
3052 if (PP && !isLexingRawMode() &&
3054 getSourceLocation(CurPtr)))) {
3055 BufferPtr = CurPtr;
3056 return true; // A token has to be returned.
3057 }
3058
3059 // If we are returning comments as tokens, return this comment as a token.
3060 if (inKeepCommentMode()) {
3061 FormTokenWithChars(Result, CurPtr, tok::comment);
3062 return true;
3063 }
3064
3065 // It is common for the tokens immediately after a /**/ comment to be
3066 // whitespace. Instead of going through the big switch, handle it
3067 // efficiently now. This is safe even in KeepWhitespaceMode because we would
3068 // have already returned above with the comment as a token.
3069 if (isHorizontalWhitespace(*CurPtr)) {
3070 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
3071 return false;
3072 }
3073
3074 // Otherwise, just return so that the next character will be lexed as a token.
3075 BufferPtr = CurPtr;
3076 Result.setFlag(Token::LeadingSpace);
3077 return false;
3078}
3079
3080//===----------------------------------------------------------------------===//
3081// Primary Lexing Entry Points
3082//===----------------------------------------------------------------------===//
3083
3084/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3085/// uninterpreted string. This switches the lexer out of directive mode.
3087 assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3088 "Must be in a preprocessing directive!");
3089 Token Tmp;
3090 Tmp.startToken();
3091
3092 // CurPtr - Cache BufferPtr in an automatic variable.
3093 const char *CurPtr = BufferPtr;
3094 while (true) {
3095 char Char = getAndAdvanceChar(CurPtr, Tmp);
3096 switch (Char) {
3097 default:
3098 if (Result)
3099 Result->push_back(Char);
3100 break;
3101 case 0: // Null.
3102 // Found end of file?
3103 if (CurPtr-1 != BufferEnd) {
3104 if (isCodeCompletionPoint(CurPtr-1)) {
3106 cutOffLexing();
3107 return;
3108 }
3109
3110 // Nope, normal character, continue.
3111 if (Result)
3112 Result->push_back(Char);
3113 break;
3114 }
3115 // FALL THROUGH.
3116 [[fallthrough]];
3117 case '\r':
3118 case '\n':
3119 // Okay, we found the end of the line. First, back up past the \0, \r, \n.
3120 assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3121 BufferPtr = CurPtr-1;
3122
3123 // Next, lex the character, which should handle the EOD transition.
3124 Lex(Tmp);
3125 if (Tmp.is(tok::code_completion)) {
3126 if (PP)
3128 Lex(Tmp);
3129 }
3130 assert(Tmp.is(tok::eod) && "Unexpected token!");
3131
3132 // Finally, we're done;
3133 return;
3134 }
3135 }
3136}
3137
3138/// LexEndOfFile - CurPtr points to the end of this file. Handle this
3139/// condition, reporting diagnostics and handling other edge cases as required.
3140/// This returns true if Result contains a token, false if PP.Lex should be
3141/// called again.
3142bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3143 // If we hit the end of the file while parsing a preprocessor directive,
3144 // end the preprocessor directive first. The next token returned will
3145 // then be the end of file.
3147 // Done parsing the "line".
3149 // Update the location of token as well as BufferPtr.
3150 FormTokenWithChars(Result, CurPtr, tok::eod);
3151
3152 // Restore comment saving mode, in case it was disabled for directive.
3153 if (PP)
3155 return true; // Have a token.
3156 }
3157
3158 // If we are in raw mode, return this event as an EOF token. Let the caller
3159 // that put us in raw mode handle the event.
3160 if (isLexingRawMode()) {
3161 Result.startToken();
3162 BufferPtr = BufferEnd;
3163 FormTokenWithChars(Result, BufferEnd, tok::eof);
3164 return true;
3165 }
3166
3169 // If the preamble cuts off the end of a header guard, consider it guarded.
3170 // The guard is valid for the preamble content itself, and for tools the
3171 // most useful answer is "yes, this file has a header guard".
3172 if (!ConditionalStack.empty())
3174 ConditionalStack.clear();
3175 }
3176
3177 // Issue diagnostics for unterminated #if and missing newline.
3178
3179 // If we are in a #if directive, emit an error.
3180 while (!ConditionalStack.empty()) {
3181 if (PP->getCodeCompletionFileLoc() != FileLoc)
3182 PP->Diag(ConditionalStack.back().IfLoc,
3183 diag::err_pp_unterminated_conditional);
3184 ConditionalStack.pop_back();
3185 }
3186
3187 // Before C++11 and C2y, a file not ending with a newline was UB. Both
3188 // standards changed this behavior (as a DR or equivalent), but we still have
3189 // an opt-in diagnostic to warn about it.
3190 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
3191 Diag(BufferEnd, diag::warn_no_newline_eof)
3192 << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");
3193
3194 BufferPtr = CurPtr;
3195
3196 // Finally, let the preprocessor handle this.
3198}
3199
3200/// peekNextPPToken - Return std::nullopt if there are no more tokens in the
3201/// buffer controlled by this lexer, otherwise return the next unexpanded
3202/// token.
3203std::optional<Token> Lexer::peekNextPPToken() {
3204 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3205
3206 if (isDependencyDirectivesLexer()) {
3207 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3208 return std::nullopt;
3209 Token Result;
3210 (void)convertDependencyDirectiveToken(
3211 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex], Result);
3212 return Result;
3213 }
3214
3215 // Switch to 'skipping' mode. This will ensure that we can lex a token
3216 // without emitting diagnostics, disables macro expansion, and will cause EOF
3217 // to return an EOF token instead of popping the include stack.
3218 LexingRawMode = true;
3219
3220 // Save state that can be changed while lexing so that we can restore it.
3221 const char *TmpBufferPtr = BufferPtr;
3222 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3223 bool atStartOfLine = IsAtStartOfLine;
3224 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3225 bool leadingSpace = HasLeadingSpace;
3226
3227 Token Tok;
3228 Lex(Tok);
3229
3230 // Restore state that may have changed.
3231 BufferPtr = TmpBufferPtr;
3232 ParsingPreprocessorDirective = inPPDirectiveMode;
3233 HasLeadingSpace = leadingSpace;
3234 IsAtStartOfLine = atStartOfLine;
3235 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3236 // Restore the lexer back to non-skipping mode.
3237 LexingRawMode = false;
3238
3239 if (Tok.is(tok::eof))
3240 return std::nullopt;
3241 return Tok;
3242}
3243
3244/// Find the end of a version control conflict marker.
3245static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3246 ConflictMarkerKind CMK) {
3247 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3248 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3249 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3250 size_t Pos = RestOfBuffer.find(Terminator);
3251 while (Pos != StringRef::npos) {
3252 // Must occur at start of line.
3253 if (Pos == 0 ||
3254 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3255 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3256 Pos = RestOfBuffer.find(Terminator);
3257 continue;
3258 }
3259 return RestOfBuffer.data()+Pos;
3260 }
3261 return nullptr;
3262}
3263
3264/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3265/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3266/// and recover nicely. This returns true if it is a conflict marker and false
3267/// if not.
3268bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3269 // Only a conflict marker if it starts at the beginning of a line.
3270 if (CurPtr != BufferStart &&
3271 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3272 return false;
3273
3274 // Check to see if we have <<<<<<< or >>>>.
3275 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3276 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3277 return false;
3278
3279 // If we have a situation where we don't care about conflict markers, ignore
3280 // it.
3281 if (CurrentConflictMarkerState || isLexingRawMode())
3282 return false;
3283
3284 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3285
3286 // Check to see if there is an ending marker somewhere in the buffer at the
3287 // start of a line to terminate this conflict marker.
3288 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3289 // We found a match. We are really in a conflict marker.
3290 // Diagnose this, and ignore to the end of line.
3291 Diag(CurPtr, diag::err_conflict_marker);
3292 CurrentConflictMarkerState = Kind;
3293
3294 // Skip ahead to the end of line. We know this exists because the
3295 // end-of-conflict marker starts with \r or \n.
3296 while (*CurPtr != '\r' && *CurPtr != '\n') {
3297 assert(CurPtr != BufferEnd && "Didn't find end of line");
3298 ++CurPtr;
3299 }
3300 BufferPtr = CurPtr;
3301 return true;
3302 }
3303
3304 // No end of conflict marker found.
3305 return false;
3306}
3307
3308/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3309/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3310/// is the end of a conflict marker. Handle it by ignoring up until the end of
3311/// the line. This returns true if it is a conflict marker and false if not.
3312bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3313 // Only a conflict marker if it starts at the beginning of a line.
3314 if (CurPtr != BufferStart &&
3315 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3316 return false;
3317
3318 // If we have a situation where we don't care about conflict markers, ignore
3319 // it.
3320 if (!CurrentConflictMarkerState || isLexingRawMode())
3321 return false;
3322
3323 // Check to see if we have the marker (4 characters in a row).
3324 for (unsigned i = 1; i != 4; ++i)
3325 if (CurPtr[i] != CurPtr[0])
3326 return false;
3327
3328 // If we do have it, search for the end of the conflict marker. This could
3329 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3330 // be the end of conflict marker.
3331 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3332 CurrentConflictMarkerState)) {
3333 CurPtr = End;
3334
3335 // Skip ahead to the end of line.
3336 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3337 ++CurPtr;
3338
3339 BufferPtr = CurPtr;
3340
3341 // No longer in the conflict marker.
3342 CurrentConflictMarkerState = CMK_None;
3343 return true;
3344 }
3345
3346 return false;
3347}
3348
/// Scan for the "#>" that terminates an editor placeholder, returning a
/// pointer just past it, or nullptr if no terminator exists before BufferEnd.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop one short of the end so that reading P[1] stays in bounds.
  for (const char *P = CurPtr, *Last = BufferEnd - 1; P != Last; ++P)
    if (P[0] == '#' && P[1] == '>')
      return P + 2;
  return nullptr;
}
3360
3361bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3362 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
3364 return false;
3365 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3366 if (!End)
3367 return false;
3368 const char *Start = CurPtr - 1;
3369 if (!LangOpts.AllowEditorPlaceholders)
3370 Diag(Start, diag::err_placeholder_in_source);
3371 Result.startToken();
3372 FormTokenWithChars(Result, End, tok::raw_identifier);
3373 Result.setRawIdentifierData(Start);
3376 BufferPtr = End;
3377 return true;
3378}
3379
3380bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3381 if (PP && PP->isCodeCompletionEnabled()) {
3382 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3383 return Loc == PP->getCodeCompletionLoc();
3384 }
3385
3386 return false;
3387}
3388
3390 bool Named,
3391 const LangOptions &Opts,
3392 DiagnosticsEngine &Diags) {
3393 unsigned DiagId;
3394 if (Opts.CPlusPlus23)
3395 DiagId = diag::warn_cxx23_delimited_escape_sequence;
3396 else if (Opts.C2y && !Named)
3397 DiagId = diag::warn_c2y_delimited_escape_sequence;
3398 else
3399 DiagId = diag::ext_delimited_escape_sequence;
3400
3401 // The trailing arguments are only used by the extension warning; either this
3402 // is a C2y extension or a C++23 extension, unless it's a named escape
3403 // sequence in C, then it's a Clang extension.
3404 unsigned Ext;
3405 if (!Opts.CPlusPlus)
3406 Ext = Named ? 2 /* Clang extension */ : 1 /* C2y extension */;
3407 else
3408 Ext = 0; // C++23 extension
3409
3410 Diags.Report(Loc, DiagId) << Named << Ext;
3411}
3412
3413std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3414 const char *SlashLoc,
3415 Token *Result) {
3416 unsigned CharSize;
3417 char Kind = getCharAndSize(StartPtr, CharSize);
3418 assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3419
3420 unsigned NumHexDigits;
3421 if (Kind == 'u')
3422 NumHexDigits = 4;
3423 else if (Kind == 'U')
3424 NumHexDigits = 8;
3425
3426 bool Delimited = false;
3427 bool FoundEndDelimiter = false;
3428 unsigned Count = 0;
3429 bool Diagnose = Result && !isLexingRawMode();
3430
3431 if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3432 if (Diagnose)
3433 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3434 return std::nullopt;
3435 }
3436
3437 const char *CurPtr = StartPtr + CharSize;
3438 const char *KindLoc = &CurPtr[-1];
3439
3440 uint32_t CodePoint = 0;
3441 while (Count != NumHexDigits || Delimited) {
3442 char C = getCharAndSize(CurPtr, CharSize);
3443 if (!Delimited && Count == 0 && C == '{') {
3444 Delimited = true;
3445 CurPtr += CharSize;
3446 continue;
3447 }
3448
3449 if (Delimited && C == '}') {
3450 CurPtr += CharSize;
3451 FoundEndDelimiter = true;
3452 break;
3453 }
3454
3455 unsigned Value = llvm::hexDigitValue(C);
3456 if (Value == std::numeric_limits<unsigned>::max()) {
3457 if (!Delimited)
3458 break;
3459 if (Diagnose)
3460 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3461 << StringRef(KindLoc, 1);
3462 return std::nullopt;
3463 }
3464
3465 if (CodePoint & 0xF000'0000) {
3466 if (Diagnose)
3467 Diag(KindLoc, diag::err_escape_too_large) << 0;
3468 return std::nullopt;
3469 }
3470
3471 CodePoint <<= 4;
3472 CodePoint |= Value;
3473 CurPtr += CharSize;
3474 Count++;
3475 }
3476
3477 if (Count == 0) {
3478 if (Diagnose)
3479 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3480 : diag::warn_ucn_escape_no_digits)
3481 << StringRef(KindLoc, 1);
3482 return std::nullopt;
3483 }
3484
3485 if (Delimited && Kind == 'U') {
3486 if (Diagnose)
3487 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3488 return std::nullopt;
3489 }
3490
3491 if (!Delimited && Count != NumHexDigits) {
3492 if (Diagnose) {
3493 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3494 // If the user wrote \U1234, suggest a fixit to \u.
3495 if (Count == 4 && NumHexDigits == 8) {
3496 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3497 Diag(KindLoc, diag::note_ucn_four_not_eight)
3498 << FixItHint::CreateReplacement(URange, "u");
3499 }
3500 }
3501 return std::nullopt;
3502 }
3503
3504 if (Delimited && PP)
3506 PP->getLangOpts(),
3507 PP->getDiagnostics());
3508
3509 if (Result) {
3510 Result->setFlag(Token::HasUCN);
3511 // If the UCN contains either a trigraph or a line splicing,
3512 // we need to call getAndAdvanceChar again to set the appropriate flags
3513 // on Result.
3514 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3515 StartPtr = CurPtr;
3516 else
3517 while (StartPtr != CurPtr)
3518 (void)getAndAdvanceChar(StartPtr, *Result);
3519 } else {
3520 StartPtr = CurPtr;
3521 }
3522 return CodePoint;
3523}
3524
3525std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3526 const char *SlashLoc,
3527 Token *Result) {
3528 unsigned CharSize;
3529 bool Diagnose = Result && !isLexingRawMode();
3530
3531 char C = getCharAndSize(StartPtr, CharSize);
3532 assert(C == 'N' && "expected \\N{...}");
3533
3534 const char *CurPtr = StartPtr + CharSize;
3535 const char *KindLoc = &CurPtr[-1];
3536
3537 C = getCharAndSize(CurPtr, CharSize);
3538 if (C != '{') {
3539 if (Diagnose)
3540 Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3541 return std::nullopt;
3542 }
3543 CurPtr += CharSize;
3544 const char *StartName = CurPtr;
3545 bool FoundEndDelimiter = false;
3547 while (C) {
3548 C = getCharAndSize(CurPtr, CharSize);
3549 CurPtr += CharSize;
3550 if (C == '}') {
3551 FoundEndDelimiter = true;
3552 break;
3553 }
3554
3556 break;
3557 Buffer.push_back(C);
3558 }
3559
3560 if (!FoundEndDelimiter || Buffer.empty()) {
3561 if (Diagnose)
3562 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3563 : diag::warn_delimited_ucn_incomplete)
3564 << StringRef(KindLoc, 1);
3565 return std::nullopt;
3566 }
3567
3568 StringRef Name(Buffer.data(), Buffer.size());
3569 std::optional<char32_t> Match =
3570 llvm::sys::unicode::nameToCodepointStrict(Name);
3571 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3572 if (!Match) {
3573 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3574 if (Diagnose) {
3575 Diag(StartName, diag::err_invalid_ucn_name)
3576 << StringRef(Buffer.data(), Buffer.size())
3577 << makeCharRange(*this, StartName, CurPtr - CharSize);
3578 if (LooseMatch) {
3579 Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
3581 makeCharRange(*this, StartName, CurPtr - CharSize),
3582 LooseMatch->Name);
3583 }
3584 }
3585 // We do not offer misspelled character names suggestions here
3586 // as the set of what would be a valid suggestion depends on context,
3587 // and we should not make invalid suggestions.
3588 }
3589
3590 if (Diagnose && Match)
3592 PP->getLangOpts(),
3593 PP->getDiagnostics());
3594
3595 // If no diagnostic has been emitted yet, likely because we are doing a
3596 // tentative lexing, we do not want to recover here to make sure the token
3597 // will not be incorrectly considered valid. This function will be called
3598 // again and a diagnostic emitted then.
3599 if (LooseMatch && Diagnose)
3600 Match = LooseMatch->CodePoint;
3601
3602 if (Result) {
3603 Result->setFlag(Token::HasUCN);
3604 // If the UCN contains either a trigraph or a line splicing,
3605 // we need to call getAndAdvanceChar again to set the appropriate flags
3606 // on Result.
3607 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3608 StartPtr = CurPtr;
3609 else
3610 while (StartPtr != CurPtr)
3611 (void)getAndAdvanceChar(StartPtr, *Result);
3612 } else {
3613 StartPtr = CurPtr;
3614 }
3615 return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3616}
3617
3618uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3619 Token *Result) {
3620
3621 unsigned CharSize;
3622 std::optional<uint32_t> CodePointOpt;
3623 char Kind = getCharAndSize(StartPtr, CharSize);
3624 if (Kind == 'u' || Kind == 'U')
3625 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3626 else if (Kind == 'N')
3627 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3628
3629 if (!CodePointOpt)
3630 return 0;
3631
3632 uint32_t CodePoint = *CodePointOpt;
3633
3634 // Don't apply C family restrictions to UCNs in assembly mode
3635 if (LangOpts.AsmPreprocessor)
3636 return CodePoint;
3637
3638 // C23 6.4.3p2: A universal character name shall not designate a code point
3639 // where the hexadecimal value is:
3640 // - in the range D800 through DFFF inclusive; or
3641 // - greater than 10FFFF.
3642 // A universal-character-name outside the c-char-sequence of a character
3643 // constant, or the s-char-sequence of a string-literal shall not designate
3644 // a control character or a character in the basic character set.
3645
3646 // C++11 [lex.charset]p2: If the hexadecimal value for a
3647 // universal-character-name corresponds to a surrogate code point (in the
3648 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3649 // if the hexadecimal value for a universal-character-name outside the
3650 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3651 // string literal corresponds to a control character (in either of the
3652 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3653 // basic source character set, the program is ill-formed.
3654 if (CodePoint < 0xA0) {
3655 // We don't use isLexingRawMode() here because we need to warn about bad
3656 // UCNs even when skipping preprocessing tokens in a #if block.
3657 if (Result && PP) {
3658 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3659 Diag(BufferPtr, diag::err_ucn_control_character);
3660 else {
3661 char C = static_cast<char>(CodePoint);
3662 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3663 }
3664 }
3665
3666 return 0;
3667 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3668 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3669 // We don't use isLexingRawMode() here because we need to diagnose bad
3670 // UCNs even when skipping preprocessing tokens in a #if block.
3671 if (Result && PP) {
3672 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3673 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3674 else
3675 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3676 }
3677 return 0;
3678 }
3679
3680 return CodePoint;
3681}
3682
3683bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3684 const char *CurPtr) {
3685 if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3687 Diag(BufferPtr, diag::ext_unicode_whitespace)
3688 << makeCharRange(*this, BufferPtr, CurPtr);
3689
3690 Result.setFlag(Token::LeadingSpace);
3691 return true;
3692 }
3693 return false;
3694}
3695
3696void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3697 IsAtStartOfLine = Result.isAtStartOfLine();
3698 HasLeadingSpace = Result.hasLeadingSpace();
3699 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3700 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3701}
3702
3704 assert(!isDependencyDirectivesLexer());
3705
3706 // Start a new token.
3707 Result.startToken();
3708
3709 // Set up misc whitespace flags for LexTokenInternal.
3710 if (IsAtStartOfLine) {
3711 Result.setFlag(Token::StartOfLine);
3712 IsAtStartOfLine = false;
3713 }
3714
3715 if (HasLeadingSpace) {
3716 Result.setFlag(Token::LeadingSpace);
3717 HasLeadingSpace = false;
3718 }
3719
3720 if (HasLeadingEmptyMacro) {
3722 HasLeadingEmptyMacro = false;
3723 }
3724
3725 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3726 IsAtPhysicalStartOfLine = false;
3727 bool isRawLex = isLexingRawMode();
3728 (void) isRawLex;
3729 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3730 // (After the LexTokenInternal call, the lexer might be destroyed.)
3731 assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3732 return returnedToken;
3733}
3734
3735/// LexTokenInternal - This implements a simple C family lexer. It is an
3736/// extremely performance critical piece of code. This assumes that the buffer
3737/// has a null character at the end of the file. This returns a preprocessing
3738/// token, not a normal token, as such, it is an internal interface. It assumes
3739/// that the Flags of result have been cleared before calling this.
3740bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3741LexStart:
3742 assert(!Result.needsCleaning() && "Result needs cleaning");
3743 assert(!Result.hasPtrData() && "Result has not been reset");
3744
3745 // CurPtr - Cache BufferPtr in an automatic variable.
3746 const char *CurPtr = BufferPtr;
3747
3748 // Small amounts of horizontal whitespace is very common between tokens.
3749 if (isHorizontalWhitespace(*CurPtr)) {
3750 do {
3751 ++CurPtr;
3752 } while (isHorizontalWhitespace(*CurPtr));
3753
3754 // If we are keeping whitespace and other tokens, just return what we just
3755 // skipped. The next lexer invocation will return the token after the
3756 // whitespace.
3757 if (isKeepWhitespaceMode()) {
3758 FormTokenWithChars(Result, CurPtr, tok::unknown);
3759 // FIXME: The next token will not have LeadingSpace set.
3760 return true;
3761 }
3762
3763 BufferPtr = CurPtr;
3764 Result.setFlag(Token::LeadingSpace);
3765 }
3766
3767 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3768
3769 // Read a character, advancing over it.
3770 char Char = getAndAdvanceChar(CurPtr, Result);
3772
3773 if (!isVerticalWhitespace(Char))
3774 NewLinePtr = nullptr;
3775
3776 switch (Char) {
3777 case 0: // Null.
3778 // Found end of file?
3779 if (CurPtr-1 == BufferEnd)
3780 return LexEndOfFile(Result, CurPtr-1);
3781
3782 // Check if we are performing code completion.
3783 if (isCodeCompletionPoint(CurPtr-1)) {
3784 // Return the code-completion token.
3785 Result.startToken();
3786 FormTokenWithChars(Result, CurPtr, tok::code_completion);
3787 return true;
3788 }
3789
3790 if (!isLexingRawMode())
3791 Diag(CurPtr-1, diag::null_in_file);
3792 Result.setFlag(Token::LeadingSpace);
3793 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3794 return true; // KeepWhitespaceMode
3795
3796 // We know the lexer hasn't changed, so just try again with this lexer.
3797 // (We manually eliminate the tail call to avoid recursion.)
3798 goto LexNextToken;
3799
3800 case 26: // DOS & CP/M EOF: "^Z".
3801 // If we're in Microsoft extensions mode, treat this as end of file.
3802 if (LangOpts.MicrosoftExt) {
3803 if (!isLexingRawMode())
3804 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3805 return LexEndOfFile(Result, CurPtr-1);
3806 }
3807
3808 // If Microsoft extensions are disabled, this is just random garbage.
3809 Kind = tok::unknown;
3810 break;
3811
3812 case '\r':
3813 if (CurPtr[0] == '\n')
3814 (void)getAndAdvanceChar(CurPtr, Result);
3815 [[fallthrough]];
3816 case '\n':
3817 // If we are inside a preprocessor directive and we see the end of line,
3818 // we know we are done with the directive, so return an EOD token.
3820 // Done parsing the "line".
3822
3823 // Restore comment saving mode, in case it was disabled for directive.
3824 if (PP)
3826
3827 // Since we consumed a newline, we are back at the start of a line.
3828 IsAtStartOfLine = true;
3829 IsAtPhysicalStartOfLine = true;
3830 NewLinePtr = CurPtr - 1;
3831
3832 Kind = tok::eod;
3833 break;
3834 }
3835
3836 // No leading whitespace seen so far.
3837 Result.clearFlag(Token::LeadingSpace);
3838
3839 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3840 return true; // KeepWhitespaceMode
3841
3842 // We only saw whitespace, so just try again with this lexer.
3843 // (We manually eliminate the tail call to avoid recursion.)
3844 goto LexNextToken;
3845 case ' ':
3846 case '\t':
3847 case '\f':
3848 case '\v':
3849 SkipHorizontalWhitespace:
3850 Result.setFlag(Token::LeadingSpace);
3851 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3852 return true; // KeepWhitespaceMode
3853
3854 SkipIgnoredUnits:
3855 CurPtr = BufferPtr;
3856
3857 // If the next token is obviously a // or /* */ comment, skip it efficiently
3858 // too (without going through the big switch stmt).
3859 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3860 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3861 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3862 return true; // There is a token to return.
3863 goto SkipIgnoredUnits;
3864 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3865 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3866 return true; // There is a token to return.
3867 goto SkipIgnoredUnits;
3868 } else if (isHorizontalWhitespace(*CurPtr)) {
3869 goto SkipHorizontalWhitespace;
3870 }
3871 // We only saw whitespace, so just try again with this lexer.
3872 // (We manually eliminate the tail call to avoid recursion.)
3873 goto LexNextToken;
3874
3875 // C99 6.4.4.1: Integer Constants.
3876 // C99 6.4.4.2: Floating Constants.
3877 case '0': case '1': case '2': case '3': case '4':
3878 case '5': case '6': case '7': case '8': case '9':
3879 // Notify MIOpt that we read a non-whitespace/non-comment token.
3880 MIOpt.ReadToken();
3881 return LexNumericConstant(Result, CurPtr);
3882
3883 // Identifier (e.g., uber), or
3884 // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3885 // UTF-8 or UTF-16 string literal (C11/C++11).
3886 case 'u':
3887 // Notify MIOpt that we read a non-whitespace/non-comment token.
3888 MIOpt.ReadToken();
3889
3890 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3891 Char = getCharAndSize(CurPtr, SizeTmp);
3892
3893 // UTF-16 string literal
3894 if (Char == '"')
3895 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3896 tok::utf16_string_literal);
3897
3898 // UTF-16 character constant
3899 if (Char == '\'')
3900 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3901 tok::utf16_char_constant);
3902
3903 // UTF-16 raw string literal
3904 if (Char == 'R' && LangOpts.RawStringLiterals &&
3905 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3906 return LexRawStringLiteral(Result,
3907 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3908 SizeTmp2, Result),
3909 tok::utf16_string_literal);
3910
3911 if (Char == '8') {
3912 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3913
3914 // UTF-8 string literal
3915 if (Char2 == '"')
3916 return LexStringLiteral(Result,
3917 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3918 SizeTmp2, Result),
3919 tok::utf8_string_literal);
3920 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3921 return LexCharConstant(
3922 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3923 SizeTmp2, Result),
3924 tok::utf8_char_constant);
3925
3926 if (Char2 == 'R' && LangOpts.RawStringLiterals) {
3927 unsigned SizeTmp3;
3928 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3929 // UTF-8 raw string literal
3930 if (Char3 == '"') {
3931 return LexRawStringLiteral(Result,
3932 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3933 SizeTmp2, Result),
3934 SizeTmp3, Result),
3935 tok::utf8_string_literal);
3936 }
3937 }
3938 }
3939 }
3940
3941 // treat u like the start of an identifier.
3942 return LexIdentifierContinue(Result, CurPtr);
3943
3944 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3945 // Notify MIOpt that we read a non-whitespace/non-comment token.
3946 MIOpt.ReadToken();
3947
3948 if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3949 Char = getCharAndSize(CurPtr, SizeTmp);
3950
3951 // UTF-32 string literal
3952 if (Char == '"')
3953 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3954 tok::utf32_string_literal);
3955
3956 // UTF-32 character constant
3957 if (Char == '\'')
3958 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3959 tok::utf32_char_constant);
3960
3961 // UTF-32 raw string literal
3962 if (Char == 'R' && LangOpts.RawStringLiterals &&
3963 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3964 return LexRawStringLiteral(Result,
3965 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3966 SizeTmp2, Result),
3967 tok::utf32_string_literal);
3968 }
3969
3970 // treat U like the start of an identifier.
3971 return LexIdentifierContinue(Result, CurPtr);
3972
3973 case 'R': // Identifier or C++0x raw string literal
3974 // Notify MIOpt that we read a non-whitespace/non-comment token.
3975 MIOpt.ReadToken();
3976
3977 if (LangOpts.RawStringLiterals) {
3978 Char = getCharAndSize(CurPtr, SizeTmp);
3979
3980 if (Char == '"')
3981 return LexRawStringLiteral(Result,
3982 ConsumeChar(CurPtr, SizeTmp, Result),
3983 tok::string_literal);
3984 }
3985
3986 // treat R like the start of an identifier.
3987 return LexIdentifierContinue(Result, CurPtr);
3988
3989 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3990 // Notify MIOpt that we read a non-whitespace/non-comment token.
3991 MIOpt.ReadToken();
3992 Char = getCharAndSize(CurPtr, SizeTmp);
3993
3994 // Wide string literal.
3995 if (Char == '"')
3996 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3997 tok::wide_string_literal);
3998
3999 // Wide raw string literal.
4000 if (LangOpts.RawStringLiterals && Char == 'R' &&
4001 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
4002 return LexRawStringLiteral(Result,
4003 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4004 SizeTmp2, Result),
4005 tok::wide_string_literal);
4006
4007 // Wide character constant.
4008 if (Char == '\'')
4009 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4010 tok::wide_char_constant);
4011 // FALL THROUGH, treating L like the start of an identifier.
4012 [[fallthrough]];
4013
4014 // C99 6.4.2: Identifiers.
4015 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
4016 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
4017 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
4018 case 'V': case 'W': case 'X': case 'Y': case 'Z':
4019 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
4020 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
4021 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
4022 case 'v': case 'w': case 'x': case 'y': case 'z':
4023 case '_':
4024 // Notify MIOpt that we read a non-whitespace/non-comment token.
4025 MIOpt.ReadToken();
4026 return LexIdentifierContinue(Result, CurPtr);
4027
4028 case '$': // $ in identifiers.
4029 if (LangOpts.DollarIdents) {
4030 if (!isLexingRawMode())
4031 Diag(CurPtr-1, diag::ext_dollar_in_identifier);
4032 // Notify MIOpt that we read a non-whitespace/non-comment token.
4033 MIOpt.ReadToken();
4034 return LexIdentifierContinue(Result, CurPtr);
4035 }
4036
4037 Kind = tok::unknown;
4038 break;
4039
4040 // C99 6.4.4: Character Constants.
4041 case '\'':
4042 // Notify MIOpt that we read a non-whitespace/non-comment token.
4043 MIOpt.ReadToken();
4044 return LexCharConstant(Result, CurPtr, tok::char_constant);
4045
4046 // C99 6.4.5: String Literals.
4047 case '"':
4048 // Notify MIOpt that we read a non-whitespace/non-comment token.
4049 MIOpt.ReadToken();
4050 return LexStringLiteral(Result, CurPtr,
4051 ParsingFilename ? tok::header_name
4052 : tok::string_literal);
4053
4054 // C99 6.4.6: Punctuators.
4055 case '?':
4056 Kind = tok::question;
4057 break;
4058 case '[':
4059 Kind = tok::l_square;
4060 break;
4061 case ']':
4062 Kind = tok::r_square;
4063 break;
4064 case '(':
4065 Kind = tok::l_paren;
4066 break;
4067 case ')':
4068 Kind = tok::r_paren;
4069 break;
4070 case '{':
4071 Kind = tok::l_brace;
4072 break;
4073 case '}':
4074 Kind = tok::r_brace;
4075 break;
4076 case '.':
4077 Char = getCharAndSize(CurPtr, SizeTmp);
4078 if (Char >= '0' && Char <= '9') {
4079 // Notify MIOpt that we read a non-whitespace/non-comment token.
4080 MIOpt.ReadToken();
4081
4082 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
4083 } else if (LangOpts.CPlusPlus && Char == '*') {
4084 Kind = tok::periodstar;
4085 CurPtr += SizeTmp;
4086 } else if (Char == '.' &&
4087 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4088 Kind = tok::ellipsis;
4089 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4090 SizeTmp2, Result);
4091 } else {
4092 Kind = tok::period;
4093 }
4094 break;
4095 case '&':
4096 Char = getCharAndSize(CurPtr, SizeTmp);
4097 if (Char == '&') {
4098 Kind = tok::ampamp;
4099 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4100 } else if (Char == '=') {
4101 Kind = tok::ampequal;
4102 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4103 } else {
4104 Kind = tok::amp;
4105 }
4106 break;
4107 case '*':
4108 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4109 Kind = tok::starequal;
4110 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4111 } else {
4112 Kind = tok::star;
4113 }
4114 break;
4115 case '+':
4116 Char = getCharAndSize(CurPtr, SizeTmp);
4117 if (Char == '+') {
4118 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4119 Kind = tok::plusplus;
4120 } else if (Char == '=') {
4121 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4122 Kind = tok::plusequal;
4123 } else {
4124 Kind = tok::plus;
4125 }
4126 break;
4127 case '-':
4128 Char = getCharAndSize(CurPtr, SizeTmp);
4129 if (Char == '-') { // --
4130 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4131 Kind = tok::minusminus;
4132 } else if (Char == '>' && LangOpts.CPlusPlus &&
4133 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
4134 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4135 SizeTmp2, Result);
4136 Kind = tok::arrowstar;
4137 } else if (Char == '>') { // ->
4138 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4139 Kind = tok::arrow;
4140 } else if (Char == '=') { // -=
4141 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4142 Kind = tok::minusequal;
4143 } else {
4144 Kind = tok::minus;
4145 }
4146 break;
4147 case '~':
4148 Kind = tok::tilde;
4149 break;
4150 case '!':
4151 if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4152 Kind = tok::exclaimequal;
4153 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4154 } else {
4155 Kind = tok::exclaim;
4156 }
4157 break;
4158 case '/':
4159 // 6.4.9: Comments
4160 Char = getCharAndSize(CurPtr, SizeTmp);
4161 if (Char == '/') { // Line comment.
4162 // Even if Line comments are disabled (e.g. in C89 mode), we generally
4163 // want to lex this as a comment. There is one problem with this though,
4164 // that in one particular corner case, this can change the behavior of the
4165 // resultant program. For example, In "foo //**/ bar", C89 would lex
4166 // this as "foo / bar" and languages with Line comments would lex it as
4167 // "foo". Check to see if the character after the second slash is a '*'.
4168 // If so, we will lex that as a "/" instead of the start of a comment.
4169 // However, we never do this if we are just preprocessing.
4170 bool TreatAsComment =
4171 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4172 if (!TreatAsComment)
4173 if (!(PP && PP->isPreprocessedOutput()))
4174 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4175
4176 if (TreatAsComment) {
4177 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4178 TokAtPhysicalStartOfLine))
4179 return true; // There is a token to return.
4180
4181 // It is common for the tokens immediately after a // comment to be
4182 // whitespace (indentation for the next line). Instead of going through
4183 // the big switch, handle it efficiently now.
4184 goto SkipIgnoredUnits;
4185 }
4186 }
4187
4188 if (Char == '*') { // /**/ comment.
4189 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4190 TokAtPhysicalStartOfLine))
4191 return true; // There is a token to return.
4192
4193 // We only saw whitespace, so just try again with this lexer.
4194 // (We manually eliminate the tail call to avoid recursion.)
4195 goto LexNextToken;
4196 }
4197
4198 if (Char == '=') {
4199 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4200 Kind = tok::slashequal;
4201 } else {
4202 Kind = tok::slash;
4203 }
4204 break;
4205 case '%':
4206 Char = getCharAndSize(CurPtr, SizeTmp);
4207 if (Char == '=') {
4208 Kind = tok::percentequal;
4209 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4210 } else if (LangOpts.Digraphs && Char == '>') {
4211 Kind = tok::r_brace; // '%>' -> '}'
4212 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4213 } else if (LangOpts.Digraphs && Char == ':') {
4214 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4215 Char = getCharAndSize(CurPtr, SizeTmp);
4216 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4217 Kind = tok::hashhash; // '%:%:' -> '##'
4218 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4219 SizeTmp2, Result);
4220 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4221 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4222 if (!isLexingRawMode())
4223 Diag(BufferPtr, diag::ext_charize_microsoft);
4224 Kind = tok::hashat;
4225 } else { // '%:' -> '#'
4226 // We parsed a # character. If this occurs at the start of the line,
4227 // it's actually the start of a preprocessing directive. Callback to
4228 // the preprocessor to handle it.
4229 // TODO: -fpreprocessed mode??
4230 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4231 goto HandleDirective;
4232
4233 Kind = tok::hash;
4234 }
4235 } else {
4236 Kind = tok::percent;
4237 }
4238 break;
4239 case '<':
4240 Char = getCharAndSize(CurPtr, SizeTmp);
4241 if (ParsingFilename) {
4242 return LexAngledStringLiteral(Result, CurPtr);
4243 } else if (Char == '<') {
4244 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4245 if (After == '=') {
4246 Kind = tok::lesslessequal;
4247 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4248 SizeTmp2, Result);
4249 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4250 // If this is actually a '<<<<<<<' version control conflict marker,
4251 // recognize it as such and recover nicely.
4252 goto LexNextToken;
4253 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4254 // If this is '<<<<' and we're in a Perforce-style conflict marker,
4255 // ignore it.
4256 goto LexNextToken;
4257 } else if (LangOpts.CUDA && After == '<') {
4258 Kind = tok::lesslessless;
4259 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4260 SizeTmp2, Result);
4261 } else {
4262 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4263 Kind = tok::lessless;
4264 }
4265 } else if (Char == '=') {
4266 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4267 if (After == '>') {
4268 if (LangOpts.CPlusPlus20) {
4269 if (!isLexingRawMode())
4270 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4271 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4272 SizeTmp2, Result);
4273 Kind = tok::spaceship;
4274 break;
4275 }
4276 // Suggest adding a space between the '<=' and the '>' to avoid a
4277 // change in semantics if this turns up in C++ <=17 mode.
4278 if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4279 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4281 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4282 }
4283 }
4284 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4285 Kind = tok::lessequal;
4286 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4287 if (LangOpts.CPlusPlus11 &&
4288 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4289 // C++0x [lex.pptoken]p3:
4290 // Otherwise, if the next three characters are <:: and the subsequent
4291 // character is neither : nor >, the < is treated as a preprocessor
4292 // token by itself and not as the first character of the alternative
4293 // token <:.
4294 unsigned SizeTmp3;
4295 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4296 if (After != ':' && After != '>') {
4297 Kind = tok::less;
4298 if (!isLexingRawMode())
4299 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4300 break;
4301 }
4302 }
4303
4304 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4305 Kind = tok::l_square;
4306 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4307 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4308 Kind = tok::l_brace;
4309 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4310 lexEditorPlaceholder(Result, CurPtr)) {
4311 return true;
4312 } else {
4313 Kind = tok::less;
4314 }
4315 break;
4316 case '>':
4317 Char = getCharAndSize(CurPtr, SizeTmp);
4318 if (Char == '=') {
4319 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4320 Kind = tok::greaterequal;
4321 } else if (Char == '>') {
4322 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4323 if (After == '=') {
4324 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4325 SizeTmp2, Result);
4326 Kind = tok::greatergreaterequal;
4327 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4328 // If this is actually a '>>>>' conflict marker, recognize it as such
4329 // and recover nicely.
4330 goto LexNextToken;
4331 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4332 // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4333 goto LexNextToken;
4334 } else if (LangOpts.CUDA && After == '>') {
4335 Kind = tok::greatergreatergreater;
4336 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4337 SizeTmp2, Result);
4338 } else {
4339 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4340 Kind = tok::greatergreater;
4341 }
4342 } else {
4343 Kind = tok::greater;
4344 }
4345 break;
4346 case '^':
4347 Char = getCharAndSize(CurPtr, SizeTmp);
4348 if (Char == '=') {
4349 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4350 Kind = tok::caretequal;
4351 } else {
4352 if (LangOpts.OpenCL && Char == '^')
4353 Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
4354 Kind = tok::caret;
4355 }
4356 break;
4357 case '|':
4358 Char = getCharAndSize(CurPtr, SizeTmp);
4359 if (Char == '=') {
4360 Kind = tok::pipeequal;
4361 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4362 } else if (Char == '|') {
4363 // If this is '|||||||' and we're in a conflict marker, ignore it.
4364 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4365 goto LexNextToken;
4366 Kind = tok::pipepipe;
4367 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4368 } else {
4369 Kind = tok::pipe;
4370 }
4371 break;
4372 case ':':
4373 Char = getCharAndSize(CurPtr, SizeTmp);
4374 if (LangOpts.Digraphs && Char == '>') {
4375 Kind = tok::r_square; // ':>' -> ']'
4376 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4377 } else if (Char == ':') {
4378 Kind = tok::coloncolon;
4379 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4380 } else {
4381 Kind = tok::colon;
4382 }
4383 break;
4384 case ';':
4385 Kind = tok::semi;
4386 break;
4387 case '=':
4388 Char = getCharAndSize(CurPtr, SizeTmp);
4389 if (Char == '=') {
4390 // If this is '====' and we're in a conflict marker, ignore it.
4391 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4392 goto LexNextToken;
4393
4394 Kind = tok::equalequal;
4395 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4396 } else {
4397 Kind = tok::equal;
4398 }
4399 break;
4400 case ',':
4401 Kind = tok::comma;
4402 break;
4403 case '#':
4404 Char = getCharAndSize(CurPtr, SizeTmp);
4405 if (Char == '#') {
4406 Kind = tok::hashhash;
4407 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4408 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4409 Kind = tok::hashat;
4410 if (!isLexingRawMode())
4411 Diag(BufferPtr, diag::ext_charize_microsoft);
4412 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4413 } else {
4414 // We parsed a # character. If this occurs at the start of the line,
4415 // it's actually the start of a preprocessing directive. Callback to
4416 // the preprocessor to handle it.
4417 // TODO: -fpreprocessed mode??
4418 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4419 goto HandleDirective;
4420
4421 Kind = tok::hash;
4422 }
4423 break;
4424
4425 case '@':
4426 // Objective C support.
4427 if (CurPtr[-1] == '@' && LangOpts.ObjC)
4428 Kind = tok::at;
4429 else
4430 Kind = tok::unknown;
4431 break;
4432
4433 // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4434 case '\\':
4435 if (!LangOpts.AsmPreprocessor) {
4436 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4437 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4438 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4439 return true; // KeepWhitespaceMode
4440
4441 // We only saw whitespace, so just try again with this lexer.
4442 // (We manually eliminate the tail call to avoid recursion.)
4443 goto LexNextToken;
4444 }
4445
4446 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4447 }
4448 }
4449
4450 Kind = tok::unknown;
4451 break;
4452
4453 default: {
4454 if (isASCII(Char)) {
4455 Kind = tok::unknown;
4456 break;
4457 }
4458
4459 llvm::UTF32 CodePoint;
4460
4461 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4462 // an escaped newline.
4463 --CurPtr;
4464 llvm::ConversionResult Status =
4465 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4466 (const llvm::UTF8 *)BufferEnd,
4467 &CodePoint,
4468 llvm::strictConversion);
4469 if (Status == llvm::conversionOK) {
4470 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4471 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4472 return true; // KeepWhitespaceMode
4473
4474 // We only saw whitespace, so just try again with this lexer.
4475 // (We manually eliminate the tail call to avoid recursion.)
4476 goto LexNextToken;
4477 }
4478 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4479 }
4480
4483 ++CurPtr;
4484 Kind = tok::unknown;
4485 break;
4486 }
4487
4488 // Non-ASCII characters tend to creep into source code unintentionally.
4489 // Instead of letting the parser complain about the unknown token,
4490 // just diagnose the invalid UTF-8, then drop the character.
4491 Diag(CurPtr, diag::err_invalid_utf8);
4492
4493 BufferPtr = CurPtr+1;
4494 // We're pretending the character didn't exist, so just try again with
4495 // this lexer.
4496 // (We manually eliminate the tail call to avoid recursion.)
4497 goto LexNextToken;
4498 }
4499 }
4500
4501 // Notify MIOpt that we read a non-whitespace/non-comment token.
4502 MIOpt.ReadToken();
4503
4504 // Update the location of token as well as BufferPtr.
4505 FormTokenWithChars(Result, CurPtr, Kind);
4506 return true;
4507
4508HandleDirective:
4509 // We parsed a # character and it's the start of a preprocessing directive.
4510
4511 FormTokenWithChars(Result, CurPtr, tok::hash);
4513
4515 // With a fatal failure in the module loader, we abort parsing.
4516 return true;
4517
4518 // We parsed the directive; lex a token with the new state.
4519 return false;
4520
4521LexNextToken:
4522 Result.clearFlag(Token::NeedsCleaning);
4523 goto LexStart;
4524}
4525
4526const char *Lexer::convertDependencyDirectiveToken(
4528 const char *TokPtr = BufferStart + DDTok.Offset;
4529 Result.startToken();
4530 Result.setLocation(getSourceLocation(TokPtr));
4531 Result.setKind(DDTok.Kind);
4532 Result.setFlag((Token::TokenFlags)DDTok.Flags);
4533 Result.setLength(DDTok.Length);
4534 BufferPtr = TokPtr + DDTok.Length;
4535 return TokPtr;
4536}
4537
4538bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4539 assert(isDependencyDirectivesLexer());
4540
4541 using namespace dependency_directives_scan;
4542
4543 if (BufferPtr == BufferEnd)
4544 return LexEndOfFile(Result, BufferPtr);
4545
4546 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4547 if (DepDirectives.front().Kind == pp_eof)
4548 return LexEndOfFile(Result, BufferEnd);
4549 if (DepDirectives.front().Kind == tokens_present_before_eof)
4550 MIOpt.ReadToken();
4551 NextDepDirectiveTokenIndex = 0;
4552 DepDirectives = DepDirectives.drop_front();
4553 }
4554
4556 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4557 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4558 // Read something other than a preprocessor directive hash.
4559 MIOpt.ReadToken();
4560 }
4561
4562 if (ParsingFilename && DDTok.is(tok::less)) {
4563 BufferPtr = BufferStart + DDTok.Offset;
4564 LexAngledStringLiteral(Result, BufferPtr + 1);
4565 if (Result.isNot(tok::header_name))
4566 return true;
4567 // Advance the index of lexed tokens.
4568 while (true) {
4569 const dependency_directives_scan::Token &NextTok =
4570 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4571 if (BufferStart + NextTok.Offset >= BufferPtr)
4572 break;
4573 ++NextDepDirectiveTokenIndex;
4574 }
4575 return true;
4576 }
4577
4578 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4579
4580 if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4583 // With a fatal failure in the module loader, we abort parsing.
4584 return true;
4585 return false;
4586 }
4587 if (Result.is(tok::raw_identifier)) {
4588 Result.setRawIdentifierData(TokPtr);
4589 if (!isLexingRawMode()) {
4591 if (II->isHandleIdentifierCase())
4592 return PP->HandleIdentifier(Result);
4593 }
4594 return true;
4595 }
4596 if (Result.isLiteral()) {
4597 Result.setLiteralData(TokPtr);
4598 return true;
4599 }
4600 if (Result.is(tok::colon)) {
4601 // Convert consecutive colons to 'tok::coloncolon'.
4602 if (*BufferPtr == ':') {
4603 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4604 tok::colon));
4605 ++NextDepDirectiveTokenIndex;
4606 Result.setKind(tok::coloncolon);
4607 }
4608 return true;
4609 }
4610 if (Result.is(tok::eod))
4612
4613 return true;
4614}
4615
4616bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4617 assert(isDependencyDirectivesLexer());
4618
4619 using namespace dependency_directives_scan;
4620
4621 bool Stop = false;
4622 unsigned NestedIfs = 0;
4623 do {
4624 DepDirectives = DepDirectives.drop_front();
4625 switch (DepDirectives.front().Kind) {
4626 case pp_none:
4627 llvm_unreachable("unexpected 'pp_none'");
4628 case pp_include:
4630 case pp_define:
4631 case pp_undef:
4632 case pp_import:
4633 case pp_pragma_import:
4634 case pp_pragma_once:
4639 case pp_include_next:
4640 case decl_at_import:
4641 case cxx_module_decl:
4642 case cxx_import_decl:
4646 break;
4647 case pp_if:
4648 case pp_ifdef:
4649 case pp_ifndef:
4650 ++NestedIfs;
4651 break;
4652 case pp_elif:
4653 case pp_elifdef:
4654 case pp_elifndef:
4655 case pp_else:
4656 if (!NestedIfs) {
4657 Stop = true;
4658 }
4659 break;
4660 case pp_endif:
4661 if (!NestedIfs) {
4662 Stop = true;
4663 } else {
4664 --NestedIfs;
4665 }
4666 break;
4667 case pp_eof:
4668 NextDepDirectiveTokenIndex = 0;
4669 return LexEndOfFile(Result, BufferEnd);
4670 }
4671 } while (!Stop);
4672
4674 DepDirectives.front().Tokens.front();
4675 assert(DDTok.is(tok::hash));
4676 NextDepDirectiveTokenIndex = 1;
4677
4678 convertDependencyDirectiveToken(DDTok, Result);
4679 return false;
4680}
StringRef P
Defines the Diagnostic-related interfaces.
Expr * E
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:944
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1564
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1758
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1257
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:324
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3245
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1683
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:560
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:284
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1185
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1578
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1648
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1545
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1632
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3349
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1551
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:918
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isBlockCommentEndOfEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2778
static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:1924
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1238
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1606
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1654
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:543
Defines the MultipleIncludeOpt interface.
#define SM(sm)
Definition: OffloadArch.cpp:16
Defines the clang::Preprocessor interface.
SourceRange Range
Definition: SemaObjC.cpp:753
SourceLocation Loc
Definition: SemaObjC.cpp:754
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16260
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16052
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1233
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:231
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1529
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:950
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:139
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:128
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:102
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:434
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:1020
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1377
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
static unsigned getEscapedNewLineSize(const char *P)
getEscapedNewLineSize - Return the size of the specified escaped newline, or 0 if it is not an escape...
Definition: Lexer.cpp:1276
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionMode - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
static std::optional< Token > findPreviousToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments)
Finds the token that comes before the given location.
Definition: Lexer.cpp:1352
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:277
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1056
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:3086
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:870
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1228
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
bool Lex(Token &Result)
Lex - Return the next token in the file.
Definition: Lexer.cpp:3703
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static void DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc, bool Named, const LangOptions &Opts, DiagnosticsEngine &Diags)
Diagnose use of a delimited or named escape sequence.
Definition: Lexer.cpp:3389
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:789
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:183
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:892
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:951
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1134
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1209
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1154
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:451
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1130
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments=false)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1321
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:498
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location any where in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:608
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:219
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1103
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:242
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:635
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:509
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:848
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:309
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:604
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:145
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
void setBegin(SourceLocation b)
bool isInvalid() const
SourceLocation getEnd() const
SourceLocation getBegin() const
void setEnd(SourceLocation e)
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:189
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:308
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:118
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:134
unsigned getLength() const
Definition: Token.h:137
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:69
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:102
tok::TokenKind getKind() const
Definition: Token.h:97
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:278
@ HasUCN
Definition: Token.h:83
@ IsEditorPlaceholder
Definition: Token.h:88
@ LeadingEmptyMacro
Definition: Token.h:81
@ LeadingSpace
Definition: Token.h:77
@ StartOfLine
Definition: Token.h:75
@ HasUDSuffix
Definition: Token.h:82
@ NeedsCleaning
Definition: Token.h:80
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:123
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:60
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
Definition: Lexer.cpp:77
void startToken()
Reset all flags to cleared.
Definition: Token.h:179
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:297
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:215
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:227
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:246
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4290
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3094
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3463
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3448
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3751
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:41
@ Match
This is not an overload because the signature exactly matches an existing declaration.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:99
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:49
std::pair< FileID, unsigned > FileIDAndOffset
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition: CharInfo.h:61
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:91
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:175
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:108
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.
Definition: CharInfo.h:168
@ Keyword
The name has been typo-corrected to a keyword.
const FunctionProtoType * T
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:53
unsigned int uint32_t
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x2 + p.y 2 + ...)
#define _SIDD_UBYTE_OPS
Definition: smmintrin.h:1550
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
Definition: smmintrin.h:1682
#define _SIDD_LEAST_SIGNIFICANT
Definition: smmintrin.h:1568
#define _SIDD_NEGATIVE_POLARITY
Definition: smmintrin.h:1563
#define _SIDD_CMP_RANGES
Definition: smmintrin.h:1557
Represents a char and the number of bytes parsed to produce it.
Definition: Lexer.h:597
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.