23#include "llvm/ADT/ScopeExit.h"
24#include "llvm/ADT/SmallString.h"
25#include "llvm/ADT/StringMap.h"
26#include "llvm/ADT/StringSwitch.h"
35struct DirectiveWithTokens {
40 :
Kind(
Kind), NumTokens(NumTokens) {}
63 Scanner(StringRef Input,
66 : Input(Input), Tokens(Tokens), Diags(Diags),
67 InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
68 TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
75 LangOpts.LineComment =
true;
76 LangOpts.RawStringLiterals =
true;
91 lexToken(
const char *&
First,
const char *
const End);
94 lexIncludeFilename(
const char *&
First,
const char *
const End);
96 void skipLine(
const char *&
First,
const char *
const End);
97 void skipDirective(StringRef Name,
const char *&
First,
const char *
const End);
108 [[nodiscard]] std::optional<StringRef>
109 tryLexIdentifierOrSkipLine(
const char *&
First,
const char *
const End);
112 [[nodiscard]] StringRef lexIdentifier(
const char *&
First,
113 const char *
const End);
120 [[nodiscard]]
bool isNextIdentifierOrSkipLine(StringRef
Id,
122 const char *
const End);
130 const char *
const End);
137 [[nodiscard]] std::optional<StringRef>
138 tryLexStringLiteralOrSkipLine(
const char *&
First,
const char *
const End);
140 [[nodiscard]]
bool scanImpl(
const char *
First,
const char *
const End);
141 [[nodiscard]]
bool lexPPLine(
const char *&
First,
const char *
const End);
142 [[nodiscard]]
bool lexAt(
const char *&
First,
const char *
const End);
143 [[nodiscard]]
bool lexModule(
const char *&
First,
const char *
const End);
144 [[nodiscard]]
bool lexDefine(
const char *HashLoc,
const char *&
First,
145 const char *
const End);
146 [[nodiscard]]
bool lexPragma(
const char *&
First,
const char *
const End);
147 [[nodiscard]]
bool lex_Pragma(
const char *&
First,
const char *
const End);
148 [[nodiscard]]
bool lexEndif(
const char *&
First,
const char *
const End);
150 const char *
const End);
151 [[nodiscard]]
bool lexModuleDirectiveBody(
DirectiveKind Kind,
153 const char *
const End);
154 void lexPPDirectiveBody(
const char *&
First,
const char *
const End);
157 Tokens.append(CurDirToks);
158 DirsWithToks.emplace_back(Kind, CurDirToks.size());
160 return DirsWithToks.back();
162 void popDirective() {
163 Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
166 return DirsWithToks.empty() ?
pp_none : DirsWithToks.back().Kind;
169 unsigned getOffsetAt(
const char *CurPtr)
const {
170 return CurPtr - Input.data();
175 bool reportError(
const char *CurPtr,
unsigned Err);
177 StringMap<char> SplitIds;
183 const char *LastTokenPtr =
nullptr;
198bool Scanner::reportError(
const char *CurPtr,
unsigned Err) {
201 assert(CurPtr >= Input.data() &&
"invalid buffer ptr");
202 Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
213 assert(Current >
First);
217 if (Current >
First && *(Current - 1) ==
'\\') {
221 if (EscapeSize > 0) {
224 Current -= (1 + EscapeSize);
236 const char *Current) {
237 assert(
First <= Current);
240 if (*Current !=
'"' ||
First == Current)
246 if (
First == Current ||
251 if (*Current ==
'u' || *Current ==
'U' || *Current ==
'L')
252 return First == Current ||
256 if (*Current !=
'8' ||
First == Current ||
259 return First == Current ||
264 assert(
First[0] ==
'"');
286 while (
Last != End &&
size_t(
Last -
First) < Terminator.size() &&
295 if (
size_t(
Last -
First) < Terminator.size())
305static unsigned isEOL(
const char *
First,
const char *
const End) {
316 const char Terminator = *
First ==
'<' ?
'>' : *
First;
331 const char *FirstAfterBackslashPastSpace =
First;
333 if (
unsigned NLSize =
isEOL(FirstAfterBackslashPastSpace, End)) {
336 First = FirstAfterBackslashPastSpace + NLSize - 1;
349 assert(Len &&
"expected newline");
363 char LastNonWhitespace =
' ';
366 LastNonWhitespace = *
First;
372 if (LastNonWhitespace !=
'\\')
388 if (End -
First < 4) {
402 const char *
const Cur,
403 const char *
const End) {
404 assert(*Cur ==
'\'' &&
"expected quotation character");
412 char Prev = *(Cur - 1);
413 if (Prev ==
'L' || Prev ==
'U' || Prev ==
'u')
415 if (Prev ==
'8' && (Cur - 1 != Start) && *(Cur - 2) ==
'u')
423void Scanner::skipLine(
const char *&
First,
const char *
const End) {
425 assert(
First <= End);
433 const char *Start =
First;
436 char LastNonWhitespace =
' ';
441 LastTokenPtr =
First;
459 LastTokenPtr =
First;
461 LastNonWhitespace = *
First;
466 if (
First[1] ==
'/') {
472 if (
First[1] !=
'*') {
473 LastTokenPtr =
First;
475 LastNonWhitespace = *
First;
489 if (LastNonWhitespace !=
'\\')
494void Scanner::skipDirective(StringRef Name,
const char *&
First,
495 const char *
const End) {
496 if (llvm::StringSwitch<bool>(Name)
497 .Case(
"warning",
true)
503 skipLine(
First, End);
508 assert(
First <= End);
514 if (*
First ==
'\\') {
515 const char *Ptr =
First + 1;
531 if (
First[1] ==
'/') {
546 const char *
const End) {
547 const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
552 if ((Tok.
is(tok::hash) || Tok.
is(tok::at)) &&
554 CurDirToks.pop_back();
558 if (Tok.
is(tok::eof))
561 diag::err_dep_source_scanner_missing_semi_after_at_import);
562 if (Tok.
is(tok::semi))
566 const auto &Tok = lexToken(
First, End);
568 if (Tok.
is(tok::eof) || Tok.
is(tok::eod))
570 return reportError(DirectiveLoc,
571 diag::err_dep_source_scanner_unexpected_tokens_at_import);
575 const char *
const End) {
577 TheLexer.LexFromRawLexer(Tok);
578 First = Input.data() + TheLexer.getCurrentBufferOffset();
579 assert(
First <= End);
581 unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.
getLength();
584 return CurDirToks.back();
588Scanner::lexIncludeFilename(
const char *&
First,
const char *
const End) {
590 TheLexer.LexIncludeFilename(Tok);
591 First = Input.data() + TheLexer.getCurrentBufferOffset();
592 assert(
First <= End);
594 unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.
getLength();
597 return CurDirToks.back();
600void Scanner::lexPPDirectiveBody(
const char *&
First,
const char *
const End) {
603 if (Tok.
is(tok::eod) || Tok.
is(tok::eof))
611 if (LLVM_LIKELY(!NeedsCleaning))
615 Spelling.resize(Tok.
Length);
620 unsigned SpellingLength = 0;
621 const char *BufPtr = Input.begin() + Tok.
Offset;
622 const char *AfterIdent = Input.begin() + Tok.
getEnd();
623 while (BufPtr < AfterIdent) {
625 Spelling[SpellingLength++] = Char;
629 return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
633std::optional<StringRef>
634Scanner::tryLexIdentifierOrSkipLine(
const char *&
First,
const char *
const End) {
636 if (Tok.
isNot(tok::raw_identifier)) {
637 if (!Tok.
is(tok::eod))
638 skipLine(
First, End);
642 return cleanStringIfNeeded(Tok);
645StringRef Scanner::lexIdentifier(
const char *&
First,
const char *
const End) {
646 std::optional<StringRef>
Id = tryLexIdentifierOrSkipLine(
First, End);
647 assert(
Id &&
"expected identifier token");
651bool Scanner::isNextIdentifierOrSkipLine(StringRef
Id,
const char *&
First,
652 const char *
const End) {
653 if (std::optional<StringRef> FoundId =
654 tryLexIdentifierOrSkipLine(
First, End)) {
657 skipLine(
First, End);
663 const char *
const End) {
667 skipLine(
First, End);
671std::optional<StringRef>
672Scanner::tryLexStringLiteralOrSkipLine(
const char *&
First,
673 const char *
const End) {
676 if (!Tok.
is(tok::eod))
677 skipLine(
First, End);
681 return cleanStringIfNeeded(Tok);
684bool Scanner::lexAt(
const char *&
First,
const char *
const End) {
689 assert(AtTok.
is(tok::at));
692 if (!isNextIdentifierOrSkipLine(
"import",
First, End))
697bool Scanner::lexModule(
const char *&
First,
const char *
const End) {
698 StringRef
Id = lexIdentifier(
First, End);
700 if (
Id ==
"export") {
702 std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(
First, End);
708 if (
Id !=
"module" &&
Id !=
"import") {
709 skipLine(
First, End);
721 if (
Id ==
"module") {
722 skipLine(
First, End);
729 skipLine(
First, End);
733 (void)lexToken(
First, End);
734 if (!tryLexIdentifierOrSkipLine(
First, End))
740 if (
Id ==
"module" && !Export)
742 skipLine(
First, End);
750 skipLine(
First, End);
755 TheLexer.seek(getOffsetAt(
First),
false);
763 return lexModuleDirectiveBody(Kind,
First, End);
766bool Scanner::lex_Pragma(
const char *&
First,
const char *
const End) {
767 if (!isNextTokenOrSkipLine(tok::l_paren,
First, End))
770 std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(
First, End);
772 if (!Str || !isNextTokenOrSkipLine(tok::r_paren,
First, End))
782 const char *
Begin = Buffer.c_str();
783 Scanner PragmaScanner{StringRef(
Begin, Buffer.size()), DiscardTokens, Diags,
786 PragmaScanner.TheLexer.setParsingPreprocessorDirective(
true);
787 if (PragmaScanner.lexPragma(
Begin, Buffer.end()))
792 skipLine(
First, End);
796 assert(
Begin == Buffer.end());
801bool Scanner::lexPragma(
const char *&
First,
const char *
const End) {
802 std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(
First, End);
806 StringRef
Id = *FoundId;
807 auto Kind = llvm::StringSwitch<DirectiveKind>(
Id)
814 lexPPDirectiveBody(
First, End);
820 skipLine(
First, End);
824 FoundId = tryLexIdentifierOrSkipLine(
First, End);
830 if (
Id ==
"system_header") {
831 lexPPDirectiveBody(
First, End);
836 if (
Id !=
"module") {
837 skipLine(
First, End);
842 if (!isNextIdentifierOrSkipLine(
"import",
First, End))
846 lexPPDirectiveBody(
First, End);
851bool Scanner::lexEndif(
const char *&
First,
const char *
const End) {
864 skipLine(
First, End);
872 const char *
const End) {
873 lexPPDirectiveBody(
First, End);
891bool Scanner::lexPPLine(
const char *&
First,
const char *
const End) {
892 assert(
First != End);
895 assert(
First <= End);
900 skipLine(
First, End);
901 assert(
First <= End);
905 LastTokenPtr =
First;
907 TheLexer.seek(getOffsetAt(
First),
true);
909 auto ScEx1 = make_scope_exit([&]() {
916 if (isNextIdentifierOrSkipLine(
"_Pragma",
First, End))
917 return lex_Pragma(
First, End);
923 TheLexer.setParsingPreprocessorDirective(
true);
924 auto ScEx2 = make_scope_exit(
925 [&]() { TheLexer.setParsingPreprocessorDirective(
false); });
929 return lexAt(
First, End);
933 return lexModule(
First, End);
937 if (HashTok.
is(tok::hashhash)) {
941 skipLine(
First, End);
942 assert(
First <= End);
945 assert(HashTok.
is(tok::hash));
948 std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(
First, End);
952 StringRef
Id = *FoundId;
955 return lexPragma(
First, End);
957 auto Kind = llvm::StringSwitch<DirectiveKind>(
Id)
979 return lexEndif(
First, End);
987 if (lexIncludeFilename(
First, End).is(tok::eod)) {
996 return lexDefault(Kind,
First, End);
1005bool Scanner::scanImpl(
const char *
First,
const char *
const End) {
1007 while (
First != End)
1008 if (lexPPLine(
First, End))
1014 bool Error = scanImpl(Input.begin(), Input.end());
1019 (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset))
1025 for (
const DirectiveWithTokens &DirWithToks : DirsWithToks) {
1026 assert(RemainingTokens.size() >= DirWithToks.NumTokens);
1027 Directives.emplace_back(DirWithToks.Kind,
1028 RemainingTokens.take_front(DirWithToks.NumTokens));
1029 RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
1031 assert(RemainingTokens.empty());
1040 return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
1046 llvm::raw_ostream &OS) {
1048 auto needsSpaceSeparator =
1051 if (Prev == Tok.
Kind)
1052 return !Tok.
isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
1054 if (Prev == tok::raw_identifier &&
1055 Tok.
isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
1056 tok::char_constant, tok::header_name))
1058 if (Prev == tok::r_paren &&
1059 Tok.
isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
1060 tok::char_constant, tok::unknown))
1062 if (Prev == tok::comma &&
1063 Tok.
isOneOf(tok::l_paren, tok::string_literal, tok::less))
1070 OS <<
"<TokBeforeEOF>";
1071 std::optional<tok::TokenKind> PrevTokenKind;
1073 if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))
1075 PrevTokenKind = Tok.
Kind;
1082 const char *
const End) {
1083 assert(
First <= End);
1084 while (
First != End) {
1085 if (*
First ==
'#') {
1099 const char *
First = Source.begin();
1100 const char *
const End = Source.end();
1112 S.TheLexer.setParsingPreprocessorDirective(
true);
1113 if (S.lexModule(
First, End))
1115 auto IsCXXNamedModuleDirective = [](
const DirectiveWithTokens &
D) {
1126 return llvm::any_of(S.DirsWithToks, IsCXXNamedModuleDirective);
Defines the Diagnostic-related interfaces.
static void skipBlockComment(const char *&First, const char *const End)
static void skipRawString(const char *&First, const char *const End)
static void skipString(const char *&First, const char *const End)
static bool isStartOfRelevantLine(char First)
static void skipWhitespace(const char *&First, const char *const End)
static bool isRawStringLiteral(const char *First, const char *Current)
static void skipUntilMaybeCXX20ModuleDirective(const char *&First, const char *const End)
static void skipOverSpaces(const char *&First, const char *const End)
static unsigned isEOL(const char *First, const char *const End)
static char previousChar(const char *First, const char *&Current)
static void skipToNewlineRaw(const char *&First, const char *const End)
static unsigned skipNewline(const char *&First, const char *End)
static void skipUTF8ByteOrderMark(const char *&First, const char *const End)
static void skipLineComment(const char *&First, const char *const End)
static bool isQuoteCppDigitSeparator(const char *const Start, const char *const Cur, const char *const End)
This is the interface for scanning header and source files to get the minimum necessary preprocessor ...
Kind
Lists the kind of concrete classes of Decl.
Concrete class used by the front-end to report problems and issues.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
static unsigned getEscapedNewLineSize(const char *P)
getEscapedNewLineSize - Return the size of the specified escaped newline, or 0 if it is not an escaped newline.
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Encodes a location in the source.
Token - This structure provides full information about a lexed token.
unsigned getFlags() const
Return the internal representation of the flags.
unsigned getLength() const
tok::TokenKind getKind() const
DirectiveKind
Represents the kind of preprocessor directive or a module declaration that is tracked by the scanner ...
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ pp_pragma_system_header
@ pp_pragma_include_alias
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
void printDependencyDirectivesAsSource(StringRef Source, ArrayRef< dependency_directives_scan::Directive > Directives, llvm::raw_ostream &OS)
Print the previously scanned dependency directives as minimized source text.
bool scanInputForCXX20ModulesUsage(StringRef Source)
Scan an input source buffer for C++20 named module usage.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
bool scanSourceForDependencyDirectives(StringRef Input, SmallVectorImpl< dependency_directives_scan::Token > &Tokens, SmallVectorImpl< dependency_directives_scan::Directive > &Directives, DiagnosticsEngine *Diags=nullptr, SourceLocation InputSourceLoc=SourceLocation())
Scan the input for the preprocessor directives that might have an effect on the dependencies for a co...
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
void prepare_PragmaString(SmallVectorImpl< char > &StrVal)
Destringize a _Pragma("") string according to C11 6.10.9.1: "The string literal is destringized by de...
Diagnostic wrappers for TextAPI types for error reporting.
Represents a directive that's lexed as part of the dependency directives scanning.
DirectiveKind Kind
The kind of token.
Token lexed as part of dependency directive scanning.
bool isNot(tok::TokenKind K) const
unsigned Offset
Offset into the original source input.
bool is(tok::TokenKind K) const
bool isOneOf(Ts... Ks) const