proxy/index.php

//===- DependencyDirectivesScanner.cpp ------------------------------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

///

/// \file

/// This is the interface for scanning header and source files to get the

/// minimum necessary preprocessor directives for evaluating includes. It

/// reduces the source down to #define, #include, #import, @import, and any

/// conditional preprocessor logic that contains one of those.

///

//===----------------------------------------------------------------------===//


#include "clang/Lex/DependencyDirectivesScanner.h"

#include "clang/Basic/CharInfo.h"

#include "clang/Basic/Diagnostic.h"

#include "clang/Lex/LexDiagnostic.h"

#include "clang/Lex/Lexer.h"

#include "clang/Lex/Pragma.h"

#include "llvm/ADT/ScopeExit.h"

#include "llvm/ADT/SmallString.h"

#include "llvm/ADT/StringMap.h"

#include "llvm/ADT/StringSwitch.h"

#include <optional>


using namespace clang;

using namespace clang::dependency_directives_scan;

using namespace llvm;


namespace {


struct DirectiveWithTokens {

  DirectiveKind Kind;

  unsigned NumTokens;


  DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)

      : Kind(Kind), NumTokens(NumTokens) {}

};


/// Does an efficient "scan" of the sources to detect the presence of

/// preprocessor (or module import) directives and collects the raw lexed tokens

/// for those directives so that the \p Lexer can "replay" them when the file is

/// included.

///

/// Note that the behavior of the raw lexer is affected by the language mode,

/// while at this point we want to do a scan and collect tokens once,

/// irrespective of the language mode that the file will get included in. To

/// compensate for that the \p Lexer, while "replaying", will adjust a token

/// where appropriate, when it could affect the preprocessor's state.

/// For example in a directive like

///

/// \code

///   #if __has_cpp_attribute(clang::fallthrough)

/// \endcode

///

/// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2

/// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'

/// while in C++ mode.

struct Scanner {

  Scanner(StringRef Input,

          SmallVectorImpl<dependency_directives_scan::Token> &Tokens,

          DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)

      : Input(Input), Tokens(Tokens), Diags(Diags),

        InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),

        TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),

                 Input.end()) {}


  static LangOptions getLangOptsForDepScanning() {

    LangOptions LangOpts;

    // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.

    LangOpts.ObjC = true;

    LangOpts.LineComment = true;

    LangOpts.RawStringLiterals = true;

    // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"".

    return LangOpts;

  }


  /// Lex the provided source and emit the directive tokens.

  ///

  /// \returns True on error.

  bool scan(SmallVectorImpl<Directive> &Directives);


  friend bool clang::scanInputForCXX20ModulesUsage(StringRef Source);


private:

  /// Lexes next token and advances \p First and the \p Lexer.

  [[nodiscard]] dependency_directives_scan::Token &

  lexToken(const char *&First, const char *const End);


  [[nodiscard]] dependency_directives_scan::Token &

  lexIncludeFilename(const char *&First, const char *const End);


  void skipLine(const char *&First, const char *const End);

  void skipDirective(StringRef Name, const char *&First, const char *const End);


  /// Returns the spelling of a string literal or identifier after performing

  /// any processing needed to handle \c clang::Token::NeedsCleaning.

  StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);


  /// Lexes next token and if it is identifier returns its string, otherwise

  /// it skips the current line and returns \p std::nullopt.

  ///

  /// In any case (whatever the token kind) \p First and the \p Lexer will

  /// advance beyond the token.

  [[nodiscard]] std::optional<StringRef>

  tryLexIdentifierOrSkipLine(const char *&First, const char *const End);


  /// Used when it is certain that next token is an identifier.

  [[nodiscard]] StringRef lexIdentifier(const char *&First,

                                        const char *const End);


  /// Lexes next token and returns true iff it is an identifier that matches \p

  /// Id, otherwise it skips the current line and returns false.

  ///

  /// In any case (whatever the token kind) \p First and the \p Lexer will

  /// advance beyond the token.

  [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,

                                                const char *&First,

                                                const char *const End);


  /// Lexes next token and returns true iff it matches the kind \p K.

  /// Otherwise it skips the current line and returns false.

  ///

  /// In any case (whatever the token kind) \p First and the \p Lexer will

  /// advance beyond the token.

  [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,

                                           const char *const End);


  /// Lexes next token and if it is string literal, returns its string.

  /// Otherwise, it skips the current line and returns \p std::nullopt.

  ///

  /// In any case (whatever the token kind) \p First and the \p Lexer will

  /// advance beyond the token.

  [[nodiscard]] std::optional<StringRef>

  tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);


  [[nodiscard]] bool scanImpl(const char *First, const char *const End);

  [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);

  [[nodiscard]] bool lexAt(const char *&First, const char *const End);

  [[nodiscard]] bool lexModule(const char *&First, const char *const End);

  [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,

                               const char *const End);

  [[nodiscard]] bool lexPragma(const char *&First, const char *const End);

  [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);

  [[nodiscard]] bool lexEndif(const char *&First, const char *const End);

  [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,

                                const char *const End);

  [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,

                                            const char *&First,

                                            const char *const End);

  void lexPPDirectiveBody(const char *&First, const char *const End);


  DirectiveWithTokens &pushDirective(DirectiveKind Kind) {

    Tokens.append(CurDirToks);

    DirsWithToks.emplace_back(Kind, CurDirToks.size());

    CurDirToks.clear();

    return DirsWithToks.back();

  }

  void popDirective() {

    Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);

  }

  DirectiveKind topDirective() const {

    return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;

  }


  unsigned getOffsetAt(const char *CurPtr) const {

    return CurPtr - Input.data();

  }


  /// Reports a diagnostic if the diagnostic engine is provided. Always returns

  /// true at the end.

  bool reportError(const char *CurPtr, unsigned Err);


  StringMap<char> SplitIds;

  StringRef Input;

  SmallVectorImpl<dependency_directives_scan::Token> &Tokens;

  DiagnosticsEngine *Diags;

  SourceLocation InputSourceLoc;


  const char *LastTokenPtr = nullptr;

  /// Keeps track of the tokens for the currently lexed directive. Once a

  /// directive is fully lexed and "committed" then the tokens get appended to

  /// \p Tokens and \p CurDirToks is cleared for the next directive.

  SmallVector<dependency_directives_scan::Token, 32> CurDirToks;

  /// The directives that were lexed along with the number of tokens that each

  /// directive contains. The tokens of all the directives are kept in \p Tokens

  /// vector, in the same order as the directives order in \p DirsWithToks.

  SmallVector<DirectiveWithTokens, 64> DirsWithToks;

  LangOptions LangOpts;

  Lexer TheLexer;

};


} // end anonymous namespace


bool Scanner::reportError(const char *CurPtr, unsigned Err) {

  if (!Diags)

    return true;

  assert(CurPtr >= Input.data() && "invalid buffer ptr");

  Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);

  return true;

}


static void skipOverSpaces(const char *&First, const char *const End) {

  while (First != End && isHorizontalWhitespace(*First))

    ++First;

}


// Move back by one character, skipping escaped newlines (backslash + \n)

static char previousChar(const char *First, const char *&Current) {

  assert(Current > First);

  --Current;

  while (Current > First && isVerticalWhitespace(*Current)) {

    // Check if the previous character is a backslash

    if (Current > First && *(Current - 1) == '\\') {

      // Use Lexer's getEscapedNewLineSize to get the size of the escaped

      // newline

      unsigned EscapeSize = Lexer::getEscapedNewLineSize(Current);

      if (EscapeSize > 0) {

        // Skip back over the entire escaped newline sequence (backslash +

        // newline)

        Current -= (1 + EscapeSize);

      } else {

        break;

      }

    } else {

      break;

    }

  }

  return *Current;

}


[[nodiscard]] static bool isRawStringLiteral(const char *First,

                                             const char *Current) {

  assert(First <= Current);


  // Check if we can even back up.

  if (*Current != '"' || First == Current)

    return false;


  // Check for an "R".

  if (previousChar(First, Current) != 'R')

    return false;

  if (First == Current ||

      !isAsciiIdentifierContinue(previousChar(First, Current)))

    return true;


  // Check for a prefix of "u", "U", or "L".

  if (*Current == 'u' || *Current == 'U' || *Current == 'L')

    return First == Current ||

           !isAsciiIdentifierContinue(previousChar(First, Current));


  // Check for a prefix of "u8".

  if (*Current != '8' || First == Current ||

      previousChar(First, Current) != 'u')

    return false;

  return First == Current ||

         !isAsciiIdentifierContinue(previousChar(First, Current));

}


static void skipRawString(const char *&First, const char *const End) {

  assert(First[0] == '"');


  const char *Last = ++First;

  while (Last != End && *Last != '(')

    ++Last;

  if (Last == End) {

    First = Last; // Hit the end... just give up.

    return;

  }


  StringRef Terminator(First, Last - First);

  for (;;) {

    // Move First to just past the next ")".

    First = Last;

    while (First != End && *First != ')')

      ++First;

    if (First == End)

      return;

    ++First;


    // Look ahead for the terminator sequence.

    Last = First;

    while (Last != End && size_t(Last - First) < Terminator.size() &&

           Terminator[Last - First] == *Last)

      ++Last;


    // Check if we hit it (or the end of the file).

    if (Last == End) {

      First = Last;

      return;

    }

    if (size_t(Last - First) < Terminator.size())

      continue;

    if (*Last != '"')

      continue;

    First = Last + 1;

    return;

  }

}


// Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)

static unsigned isEOL(const char *First, const char *const End) {

  if (First == End)

    return 0;

  if (End - First > 1 && isVerticalWhitespace(First[0]) &&

      isVerticalWhitespace(First[1]) && First[0] != First[1])

    return 2;

  return !!isVerticalWhitespace(First[0]);

}


static void skipString(const char *&First, const char *const End) {

  assert(*First == '\'' || *First == '"' || *First == '<');

  const char Terminator = *First == '<' ? '>' : *First;

  for (++First; First != End && *First != Terminator; ++First) {

    // String and character literals don't extend past the end of the line.

    if (isVerticalWhitespace(*First))

      return;

    if (*First != '\\')

      continue;

    // Skip past backslash to the next character. This ensures that the

    // character right after it is skipped as well, which matters if it's

    // the terminator.

    if (++First == End)

      return;

    if (!isWhitespace(*First))

      continue;

    // Whitespace after the backslash might indicate a line continuation.

    const char *FirstAfterBackslashPastSpace = First;

    skipOverSpaces(FirstAfterBackslashPastSpace, End);

    if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {

      // Advance the character pointer to the next line for the next

      // iteration.

      First = FirstAfterBackslashPastSpace + NLSize - 1;

    }

  }

  if (First != End)

    ++First; // Finish off the string.

}


// Returns the length of the skipped newline

static unsigned skipNewline(const char *&First, const char *End) {

  if (First == End)

    return 0;

  assert(isVerticalWhitespace(*First));

  unsigned Len = isEOL(First, End);

  assert(Len && "expected newline");

  First += Len;

  return Len;

}


static void skipToNewlineRaw(const char *&First, const char *const End) {

  for (;;) {

    if (First == End)

      return;


    unsigned Len = isEOL(First, End);

    if (Len)

      return;


    char LastNonWhitespace = ' ';

    do {

      if (!isHorizontalWhitespace(*First))

        LastNonWhitespace = *First;

      if (++First == End)

        return;

      Len = isEOL(First, End);

    } while (!Len);


    if (LastNonWhitespace != '\\')

      return;


    First += Len;

    // Keep skipping lines...

  }

}


static void skipLineComment(const char *&First, const char *const End) {

  assert(First[0] == '/' && First[1] == '/');

  First += 2;

  skipToNewlineRaw(First, End);

}


static void skipBlockComment(const char *&First, const char *const End) {

  assert(First[0] == '/' && First[1] == '*');

  if (End - First < 4) {

    First = End;

    return;

  }

  for (First += 3; First != End; ++First)

    if (First[-1] == '*' && First[0] == '/') {

      ++First;

      return;

    }

}


/// \returns True if the current single quotation mark character is a C++14

/// digit separator.

static bool isQuoteCppDigitSeparator(const char *const Start,

                                     const char *const Cur,

                                     const char *const End) {

  assert(*Cur == '\'' && "expected quotation character");

  // skipLine called in places where we don't expect a valid number

  // body before `start` on the same line, so always return false at the start.

  if (Start == Cur)

    return false;

  // The previous character must be a valid PP number character.

  // Make sure that the L, u, U, u8 prefixes don't get marked as a

  // separator though.

  char Prev = *(Cur - 1);

  if (Prev == 'L' || Prev == 'U' || Prev == 'u')

    return false;

  if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')

    return false;

  if (!isPreprocessingNumberBody(Prev))

    return false;

  // The next character should be a valid identifier body character.

  return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));

}


void Scanner::skipLine(const char *&First, const char *const End) {

  for (;;) {

    assert(First <= End);

    if (First == End)

      return;


    if (isVerticalWhitespace(*First)) {

      skipNewline(First, End);

      return;

    }

    const char *Start = First;

    // Use `LastNonWhitespace`to track if a line-continuation has ever been seen

    // before a new-line character:

    char LastNonWhitespace = ' ';

    while (First != End && !isVerticalWhitespace(*First)) {

      // Iterate over strings correctly to avoid comments and newlines.

      if (*First == '"' ||

          (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {

        LastTokenPtr = First;

        if (isRawStringLiteral(Start, First))

          skipRawString(First, End);

        else

          skipString(First, End);

        continue;

      }


      // Continue on the same line if an EOL is preceded with backslash

      if (First + 1 < End && *First == '\\') {

        if (unsigned Len = isEOL(First + 1, End)) {

          First += 1 + Len;

          continue;

        }

      }


      // Iterate over comments correctly.

      if (*First != '/' || End - First < 2) {

        LastTokenPtr = First;

        if (!isWhitespace(*First))

          LastNonWhitespace = *First;

        ++First;

        continue;

      }


      if (First[1] == '/') {

        // "//...".

        skipLineComment(First, End);

        continue;

      }


      if (First[1] != '*') {

        LastTokenPtr = First;

        if (!isWhitespace(*First))

          LastNonWhitespace = *First;

        ++First;

        continue;

      }


      // "/*...*/".

      skipBlockComment(First, End);

    }

    if (First == End)

      return;


    // Skip over the newline.

    skipNewline(First, End);


    if (LastNonWhitespace != '\\')

      break;

  }

}


void Scanner::skipDirective(StringRef Name, const char *&First,

                            const char *const End) {

  if (llvm::StringSwitch<bool>(Name)

          .Case("warning", true)

          .Case("error", true)

          .Default(false))

    // Do not process quotes or comments.

    skipToNewlineRaw(First, End);

  else

    skipLine(First, End);

}


static void skipWhitespace(const char *&First, const char *const End) {

  for (;;) {

    assert(First <= End);

    skipOverSpaces(First, End);


    if (End - First < 2)

      return;


    if (*First == '\\') {

      const char *Ptr = First + 1;

      while (Ptr < End && isHorizontalWhitespace(*Ptr))

        ++Ptr;

      if (Ptr != End && isVerticalWhitespace(*Ptr)) {

        skipNewline(Ptr, End);

        First = Ptr;

        continue;

      }

      return;

    }


    // Check for a non-comment character.

    if (First[0] != '/')

      return;


    // "// ...".

    if (First[1] == '/') {

      skipLineComment(First, End);

      return;

    }


    // Cannot be a comment.

    if (First[1] != '*')

      return;


    // "/*...*/".

    skipBlockComment(First, End);

  }

}


bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,

                                     const char *const End) {

  const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;

  for (;;) {

    // Keep a copy of the First char incase it needs to be reset.

    const char *Previous = First;

    const dependency_directives_scan::Token &Tok = lexToken(First, End);

    if ((Tok.is(tok::hash) || Tok.is(tok::at)) &&

        (Tok.Flags & clang::Token::StartOfLine)) {

      CurDirToks.pop_back();

      First = Previous;

      return false;

    }

    if (Tok.is(tok::eof))

      return reportError(

          DirectiveLoc,

          diag::err_dep_source_scanner_missing_semi_after_at_import);

    if (Tok.is(tok::semi))

      break;

  }


  const auto &Tok = lexToken(First, End);

  pushDirective(Kind);

  if (Tok.is(tok::eof) || Tok.is(tok::eod))

    return false;

  return reportError(DirectiveLoc,

                     diag::err_dep_source_scanner_unexpected_tokens_at_import);

}


dependency_directives_scan::Token &Scanner::lexToken(const char *&First,

                                                     const char *const End) {

  clang::Token Tok;

  TheLexer.LexFromRawLexer(Tok);

  First = Input.data() + TheLexer.getCurrentBufferOffset();

  assert(First <= End);


  unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();

  CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),

                          Tok.getFlags());

  return CurDirToks.back();

}


dependency_directives_scan::Token &

Scanner::lexIncludeFilename(const char *&First, const char *const End) {

  clang::Token Tok;

  TheLexer.LexIncludeFilename(Tok);

  First = Input.data() + TheLexer.getCurrentBufferOffset();

  assert(First <= End);


  unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();

  CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),

                          Tok.getFlags());

  return CurDirToks.back();

}


void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {

  while (true) {

    const dependency_directives_scan::Token &Tok = lexToken(First, End);

    if (Tok.is(tok::eod) || Tok.is(tok::eof))

      break;

  }

}


StringRef

Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {

  bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;

  if (LLVM_LIKELY(!NeedsCleaning))

    return Input.slice(Tok.Offset, Tok.getEnd());


  SmallString<64> Spelling;

  Spelling.resize(Tok.Length);


  // FIXME: C++11 raw string literals need special handling (see getSpellingSlow

  // in the Lexer). Currently we cannot see them due to our LangOpts.


  unsigned SpellingLength = 0;

  const char *BufPtr = Input.begin() + Tok.Offset;

  const char *AfterIdent = Input.begin() + Tok.getEnd();

  while (BufPtr < AfterIdent) {

    auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);

    Spelling[SpellingLength++] = Char;

    BufPtr += Size;

  }


  return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)

      .first->first();

}


std::optional<StringRef>

Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {

  const dependency_directives_scan::Token &Tok = lexToken(First, End);

  if (Tok.isNot(tok::raw_identifier)) {

    if (!Tok.is(tok::eod))

      skipLine(First, End);

    return std::nullopt;

  }


  return cleanStringIfNeeded(Tok);

}


StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {

  std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);

  assert(Id && "expected identifier token");

  return *Id;

}


bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,

                                         const char *const End) {

  if (std::optional<StringRef> FoundId =

          tryLexIdentifierOrSkipLine(First, End)) {

    if (*FoundId == Id)

      return true;

    skipLine(First, End);

  }

  return false;

}


bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,

                                    const char *const End) {

  const dependency_directives_scan::Token &Tok = lexToken(First, End);

  if (Tok.is(K))

    return true;

  skipLine(First, End);

  return false;

}


std::optional<StringRef>

Scanner::tryLexStringLiteralOrSkipLine(const char *&First,

                                       const char *const End) {

  const dependency_directives_scan::Token &Tok = lexToken(First, End);

  if (!tok::isStringLiteral(Tok.Kind)) {

    if (!Tok.is(tok::eod))

      skipLine(First, End);

    return std::nullopt;

  }


  return cleanStringIfNeeded(Tok);

}


bool Scanner::lexAt(const char *&First, const char *const End) {

  // Handle "@import".


  // Lex '@'.

  const dependency_directives_scan::Token &AtTok = lexToken(First, End);

  assert(AtTok.is(tok::at));

  (void)AtTok;


  if (!isNextIdentifierOrSkipLine("import", First, End))

    return false;

  return lexModuleDirectiveBody(decl_at_import, First, End);

}


bool Scanner::lexModule(const char *&First, const char *const End) {

  StringRef Id = lexIdentifier(First, End);

  bool Export = false;

  if (Id == "export") {

    Export = true;

    std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);

    if (!NextId)

      return false;

    Id = *NextId;

  }


  if (Id != "module" && Id != "import") {

    skipLine(First, End);

    return false;

  }


  skipWhitespace(First, End);


  // Ignore this as a module directive if the next character can't be part of

  // an import.


  switch (*First) {

  case ':': {

    // `module :` is never the start of a valid module declaration.

    if (Id == "module") {

      skipLine(First, End);

      return false;

    }

    // A module partition starts with exactly one ':'. If we have '::', this is

    // a scope resolution instead and shouldn't be recognized as a directive

    // per P1857R3.

    if (First + 1 != End && First[1] == ':') {

      skipLine(First, End);

      return false;

    }

    // `import:(type)name` is a valid ObjC method decl, so check one more token.

    (void)lexToken(First, End);

    if (!tryLexIdentifierOrSkipLine(First, End))

      return false;

    break;

  }

  case ';': {

    // Handle the global module fragment `module;`.

    if (Id == "module" && !Export)

      break;

    skipLine(First, End);

    return false;

  }

  case '<':

  case '"':

    break;

  default:

    if (!isAsciiIdentifierContinue(*First)) {

      skipLine(First, End);

      return false;

    }

  }


  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);


  DirectiveKind Kind;

  if (Id == "module")

    Kind = Export ? cxx_export_module_decl : cxx_module_decl;

  else

    Kind = Export ? cxx_export_import_decl : cxx_import_decl;


  return lexModuleDirectiveBody(Kind, First, End);

}


bool Scanner::lex_Pragma(const char *&First, const char *const End) {

  if (!isNextTokenOrSkipLine(tok::l_paren, First, End))

    return false;


  std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);


  if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))

    return false;


  SmallString<64> Buffer(*Str);

  prepare_PragmaString(Buffer);


  // Use a new scanner instance since the tokens will be inside the allocated

  // string. We should already have captured all the relevant tokens in the

  // current scanner.

  SmallVector<dependency_directives_scan::Token> DiscardTokens;

  const char *Begin = Buffer.c_str();

  Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,

                        InputSourceLoc};


  PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);

  if (PragmaScanner.lexPragma(Begin, Buffer.end()))

    return true;


  DirectiveKind K = PragmaScanner.topDirective();

  if (K == pp_none) {

    skipLine(First, End);

    return false;

  }


  assert(Begin == Buffer.end());

  pushDirective(K);

  return false;

}


bool Scanner::lexPragma(const char *&First, const char *const End) {

  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);

  if (!FoundId)

    return false;


  StringRef Id = *FoundId;

  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)

                  .Case("once", pp_pragma_once)

                  .Case("push_macro", pp_pragma_push_macro)

                  .Case("pop_macro", pp_pragma_pop_macro)

                  .Case("include_alias", pp_pragma_include_alias)

                  .Default(pp_none);

  if (Kind != pp_none) {

    lexPPDirectiveBody(First, End);

    pushDirective(Kind);

    return false;

  }


  if (Id != "clang") {

    skipLine(First, End);

    return false;

  }


  FoundId = tryLexIdentifierOrSkipLine(First, End);

  if (!FoundId)

    return false;

  Id = *FoundId;


  // #pragma clang system_header

  if (Id == "system_header") {

    lexPPDirectiveBody(First, End);

    pushDirective(pp_pragma_system_header);

    return false;

  }


  if (Id != "module") {

    skipLine(First, End);

    return false;

  }


  // #pragma clang module.

  if (!isNextIdentifierOrSkipLine("import", First, End))

    return false;


  // #pragma clang module import.

  lexPPDirectiveBody(First, End);

  pushDirective(pp_pragma_import);

  return false;

}


bool Scanner::lexEndif(const char *&First, const char *const End) {

  // Strip out "#else" if it's empty.

  if (topDirective() == pp_else)

    popDirective();


  // If "#ifdef" is empty, strip it and skip the "#endif".

  //

  // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,

  // we can skip empty `#if` and `#elif` blocks as well after scanning for a

  // literal __has_include in the condition.  Even without that rule we could

  // drop the tokens if we scan for identifiers in the condition and find none.

  if (topDirective() == pp_ifdef || topDirective() == pp_ifndef) {

    popDirective();

    skipLine(First, End);

    return false;

  }


  return lexDefault(pp_endif, First, End);

}


bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,

                         const char *const End) {

  lexPPDirectiveBody(First, End);

  pushDirective(Kind);

  return false;

}


static bool isStartOfRelevantLine(char First) {

  switch (First) {

  case '#':

  case '@':

  case 'i':

  case 'e':

  case 'm':

  case '_':

    return true;

  }

  return false;

}


bool Scanner::lexPPLine(const char *&First, const char *const End) {

  assert(First != End);


  skipWhitespace(First, End);

  assert(First <= End);

  if (First == End)

    return false;


  if (!isStartOfRelevantLine(*First)) {

    skipLine(First, End);

    assert(First <= End);

    return false;

  }


  LastTokenPtr = First;


  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);


  auto ScEx1 = make_scope_exit([&]() {

    /// Clear Scanner's CurDirToks before returning, in case we didn't push a

    /// new directive.

    CurDirToks.clear();

  });


  if (*First == '_') {

    if (isNextIdentifierOrSkipLine("_Pragma", First, End))

      return lex_Pragma(First, End);

    return false;

  }


  // Handle preprocessing directives.


  TheLexer.setParsingPreprocessorDirective(true);

  auto ScEx2 = make_scope_exit(

      [&]() { TheLexer.setParsingPreprocessorDirective(false); });


  // Handle "@import".

  if (*First == '@')

    return lexAt(First, End);


  // Handle module directives for C++20 modules.

  if (*First == 'i' || *First == 'e' || *First == 'm')

    return lexModule(First, End);


  // Lex '#'.

  const dependency_directives_scan::Token &HashTok = lexToken(First, End);

  if (HashTok.is(tok::hashhash)) {

    // A \p tok::hashhash at this location is passed by the preprocessor to the

    // parser to interpret, like any other token. So for dependency scanning

    // skip it like a normal token not affecting the preprocessor.

    skipLine(First, End);

    assert(First <= End);

    return false;

  }

  assert(HashTok.is(tok::hash));

  (void)HashTok;


  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);

  if (!FoundId)

    return false;


  StringRef Id = *FoundId;


  if (Id == "pragma")

    return lexPragma(First, End);


  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)

                  .Case("include", pp_include)

                  .Case("__include_macros", pp___include_macros)

                  .Case("define", pp_define)

                  .Case("undef", pp_undef)

                  .Case("import", pp_import)

                  .Case("include_next", pp_include_next)

                  .Case("if", pp_if)

                  .Case("ifdef", pp_ifdef)

                  .Case("ifndef", pp_ifndef)

                  .Case("elif", pp_elif)

                  .Case("elifdef", pp_elifdef)

                  .Case("elifndef", pp_elifndef)

                  .Case("else", pp_else)

                  .Case("endif", pp_endif)

                  .Default(pp_none);

  if (Kind == pp_none) {

    skipDirective(Id, First, End);

    return false;

  }


  if (Kind == pp_endif)

    return lexEndif(First, End);


  switch (Kind) {

  case pp_include:

  case pp___include_macros:

  case pp_include_next:

  case pp_import:

    // Ignore missing filenames in include or import directives.

    if (lexIncludeFilename(First, End).is(tok::eod)) {

      return false;

    }

    break;

  default:

    break;

  }


  // Everything else.

  return lexDefault(Kind, First, End);

}


static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {

  if ((End - First) >= 3 && First[0] == '\xef' && First[1] == '\xbb' &&

      First[2] == '\xbf')

    First += 3;

}


bool Scanner::scanImpl(const char *First, const char *const End) {

  skipUTF8ByteOrderMark(First, End);

  while (First != End)

    if (lexPPLine(First, End))

      return true;

  return false;

}


bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {

  bool Error = scanImpl(Input.begin(), Input.end());


  if (!Error) {

    // Add an EOF on success.

    if (LastTokenPtr &&

        (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset))

      pushDirective(tokens_present_before_eof);

    pushDirective(pp_eof);

  }


  ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;

  for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {

    assert(RemainingTokens.size() >= DirWithToks.NumTokens);

    Directives.emplace_back(DirWithToks.Kind,

                            RemainingTokens.take_front(DirWithToks.NumTokens));

    RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);

  }

  assert(RemainingTokens.empty());


  return Error;

}


bool clang::scanSourceForDependencyDirectives(

    StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,

    SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,

    SourceLocation InputSourceLoc) {

  return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);

}


void clang::printDependencyDirectivesAsSource(

    StringRef Source,

    ArrayRef<dependency_directives_scan::Directive> Directives,

    llvm::raw_ostream &OS) {

  // Add a space separator where it is convenient for testing purposes.

  auto needsSpaceSeparator =

      [](tok::TokenKind Prev,

         const dependency_directives_scan::Token &Tok) -> bool {

    if (Prev == Tok.Kind)

      return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,

                          tok::r_square);

    if (Prev == tok::raw_identifier &&

        Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,

                    tok::char_constant, tok::header_name))

      return true;

    if (Prev == tok::r_paren &&

        Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,

                    tok::char_constant, tok::unknown))

      return true;

    if (Prev == tok::comma &&

        Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less))

      return true;

    return false;

  };


  for (const dependency_directives_scan::Directive &Directive : Directives) {

    if (Directive.Kind == tokens_present_before_eof)

      OS << "<TokBeforeEOF>";

    std::optional<tok::TokenKind> PrevTokenKind;

    for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {

      if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))

        OS << ' ';

      PrevTokenKind = Tok.Kind;

      OS << Source.slice(Tok.Offset, Tok.getEnd());

    }

  }

}


static void skipUntilMaybeCXX20ModuleDirective(const char *&First,

                                               const char *const End) {

  assert(First <= End);

  while (First != End) {

    if (*First == '#') {

      ++First;

      skipToNewlineRaw(First, End);

    }

    skipWhitespace(First, End);

    if (const auto Len = isEOL(First, End)) {

      First += Len;

      continue;

    }

    break;

  }

}


bool clang::scanInputForCXX20ModulesUsage(StringRef Source) {

  const char *First = Source.begin();

  const char *const End = Source.end();

  skipUntilMaybeCXX20ModuleDirective(First, End);

  if (First == End)

    return false;


  // Check if the next token can even be a module directive before creating a

  // full lexer.

  if (!(*First == 'i' || *First == 'e' || *First == 'm'))

    return false;


  llvm::SmallVector<dependency_directives_scan::Token> Tokens;

  Scanner S(StringRef(First, End - First), Tokens, nullptr, SourceLocation());

  S.TheLexer.setParsingPreprocessorDirective(true);

  if (S.lexModule(First, End))

    return false;

  auto IsCXXNamedModuleDirective = [](const DirectiveWithTokens &D) {

    switch (D.Kind) {

    case dependency_directives_scan::cxx_module_decl:

    case dependency_directives_scan::cxx_import_decl:

    case dependency_directives_scan::cxx_export_module_decl:

    case dependency_directives_scan::cxx_export_import_decl:

      return true;

    default:

      return false;

    }

  };

  return llvm::any_of(S.DirsWithToks, IsCXXNamedModuleDirective);

}

Diagnostic.h
Defines the Diagnostic-related interfaces.

CharInfo.h

D
const Decl * D
Definition: CheckExprLifetime.cpp:212

skipBlockComment
static void skipBlockComment(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:386

skipRawString
static void skipRawString(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:263

skipString
static void skipString(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:314

isStartOfRelevantLine
static bool isStartOfRelevantLine(char First)
Definition: DependencyDirectivesScanner.cpp:878

skipWhitespace
static void skipWhitespace(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:506

isRawStringLiteral
static bool isRawStringLiteral(const char *First, const char *Current)
Definition: DependencyDirectivesScanner.cpp:235

skipUntilMaybeCXX20ModuleDirective
static void skipUntilMaybeCXX20ModuleDirective(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:1081

skipOverSpaces
static void skipOverSpaces(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:206

isEOL
static unsigned isEOL(const char *First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:305

previousChar
static char previousChar(const char *First, const char *&Current)
Definition: DependencyDirectivesScanner.cpp:212

skipToNewlineRaw
static void skipToNewlineRaw(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:354

skipNewline
static unsigned skipNewline(const char *&First, const char *End)
Definition: DependencyDirectivesScanner.cpp:344

skipUTF8ByteOrderMark
static void skipUTF8ByteOrderMark(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:999

skipLineComment
static void skipLineComment(const char *&First, const char *const End)
Definition: DependencyDirectivesScanner.cpp:380

isQuoteCppDigitSeparator
static bool isQuoteCppDigitSeparator(const char *const Start, const char *const Cur, const char *const End)
Definition: DependencyDirectivesScanner.cpp:401

DependencyDirectivesScanner.h
This is the interface for scanning header and source files to get the minimum necessary preprocessor ...

LexDiagnostic.h

Lexer.h

Pragma.h

Id
uint32_t Id
Definition: SemaARM.cpp:1179

Begin
SourceLocation Begin
Definition: USRLocFinder.cpp:202

Previous
StateNode * Previous
Definition: UnwrappedLineFormatter.cpp:1256

clang::Decl::Kind
Kind
Lists the kind of concrete classes of Decl.
Definition: DeclBase.h:89

clang::DiagnosticsEngine
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:231

clang::LangOptions
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:434

clang::Lexer
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78

clang::Lexer::getEscapedNewLineSize
static unsigned getEscapedNewLineSize(const char *P)
getEscapedNewLineSize - Return the size of the specified escaped newline, or 0 if it is not an escape...
Definition: Lexer.cpp:1276

clang::Lexer::getCharAndSizeNoWarn
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:604

clang::SourceLocation
Encodes a location in the source.
Definition: SourceLocation.h:90

clang::Token
Token - This structure provides full information about a lexed token.
Definition: Token.h:36

clang::Token::getFlags
unsigned getFlags() const
Return the internal represtation of the flags.
Definition: Token.h:264

clang::Token::getLength
unsigned getLength() const
Definition: Token.h:137

clang::Token::getKind
tok::TokenKind getKind() const
Definition: Token.h:97

clang::Token::StartOfLine
@ StartOfLine
Definition: Token.h:75

clang::Token::NeedsCleaning
@ NeedsCleaning
Definition: Token.h:80

llvm::ArrayRef
Definition: LLVM.h:31

llvm::SmallString
Definition: LLVM.h:34

llvm::SmallVectorImpl
Definition: Randstruct.h:18

llvm::SmallVector
Definition: LLVM.h:35

clang::ObjCPropertyAttribute::Kind
Kind
Definition: DeclObjCCommon.h:22

clang::dependency_directives_scan
Definition: DependencyDirectivesScanner.h:32

clang::dependency_directives_scan::DirectiveKind
DirectiveKind
Represents the kind of preprocessor directive or a module declaration that is tracked by the scanner ...
Definition: DependencyDirectivesScanner.h:59

clang::dependency_directives_scan::pp_else
@ pp_else
Definition: DependencyDirectivesScanner.h:79

clang::dependency_directives_scan::cxx_module_decl
@ cxx_module_decl
Definition: DependencyDirectivesScanner.h:82

clang::dependency_directives_scan::pp_if
@ pp_if
Definition: DependencyDirectivesScanner.h:73

clang::dependency_directives_scan::pp_pragma_once
@ pp_pragma_once
Definition: DependencyDirectivesScanner.h:67

clang::dependency_directives_scan::pp_pragma_import
@ pp_pragma_import
Definition: DependencyDirectivesScanner.h:66

clang::dependency_directives_scan::pp_include
@ pp_include
Definition: DependencyDirectivesScanner.h:61

clang::dependency_directives_scan::tokens_present_before_eof
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
Definition: DependencyDirectivesScanner.h:88

clang::dependency_directives_scan::pp_include_next
@ pp_include_next
Definition: DependencyDirectivesScanner.h:72

clang::dependency_directives_scan::pp_elif
@ pp_elif
Definition: DependencyDirectivesScanner.h:76

clang::dependency_directives_scan::pp_elifdef
@ pp_elifdef
Definition: DependencyDirectivesScanner.h:77

clang::dependency_directives_scan::cxx_export_module_decl
@ cxx_export_module_decl
Definition: DependencyDirectivesScanner.h:84

clang::dependency_directives_scan::pp_eof
@ pp_eof
Definition: DependencyDirectivesScanner.h:89

clang::dependency_directives_scan::decl_at_import
@ decl_at_import
Definition: DependencyDirectivesScanner.h:81

clang::dependency_directives_scan::cxx_import_decl
@ cxx_import_decl
Definition: DependencyDirectivesScanner.h:83

clang::dependency_directives_scan::pp_undef
@ pp_undef
Definition: DependencyDirectivesScanner.h:64

clang::dependency_directives_scan::pp_pragma_system_header
@ pp_pragma_system_header
Definition: DependencyDirectivesScanner.h:71

clang::dependency_directives_scan::pp_endif
@ pp_endif
Definition: DependencyDirectivesScanner.h:80

clang::dependency_directives_scan::pp___include_macros
@ pp___include_macros
Definition: DependencyDirectivesScanner.h:62

clang::dependency_directives_scan::pp_define
@ pp_define
Definition: DependencyDirectivesScanner.h:63

clang::dependency_directives_scan::pp_ifndef
@ pp_ifndef
Definition: DependencyDirectivesScanner.h:75

clang::dependency_directives_scan::pp_elifndef
@ pp_elifndef
Definition: DependencyDirectivesScanner.h:78

clang::dependency_directives_scan::pp_import
@ pp_import
Definition: DependencyDirectivesScanner.h:65

clang::dependency_directives_scan::cxx_export_import_decl
@ cxx_export_import_decl
Definition: DependencyDirectivesScanner.h:85

clang::dependency_directives_scan::pp_pragma_include_alias
@ pp_pragma_include_alias
Definition: DependencyDirectivesScanner.h:70

clang::dependency_directives_scan::pp_pragma_push_macro
@ pp_pragma_push_macro
Definition: DependencyDirectivesScanner.h:68

clang::dependency_directives_scan::pp_none
@ pp_none
Definition: DependencyDirectivesScanner.h:60

clang::dependency_directives_scan::pp_ifdef
@ pp_ifdef
Definition: DependencyDirectivesScanner.h:74

clang::dependency_directives_scan::pp_pragma_pop_macro
@ pp_pragma_pop_macro
Definition: DependencyDirectivesScanner.h:69

clang::serialized_diags::Error
@ Error
Definition: SerializedDiagnostics.h:47

clang::syntax::NodeRole::Size
@ Size

clang::tok::isStringLiteral
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89

clang::tok::TokenKind
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25

clang
The JSON file list parser is used to communicate input to InstallAPI.
Definition: CalledOnceCheck.h:17

clang::isVerticalWhitespace
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:99

clang::StructuralEquivalenceKind::Default
@ Default

clang::isAsciiIdentifierContinue
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition: CharInfo.h:61

clang::printDependencyDirectivesAsSource
void printDependencyDirectivesAsSource(StringRef Source, ArrayRef< dependency_directives_scan::Directive > Directives, llvm::raw_ostream &OS)
Print the previously scanned dependency directives as minimized source text.
Definition: DependencyDirectivesScanner.cpp:1043

clang::scanInputForCXX20ModulesUsage
bool scanInputForCXX20ModulesUsage(StringRef Source)
Scan an input source buffer for C++20 named module usage.
Definition: DependencyDirectivesScanner.cpp:1098

clang::isHorizontalWhitespace
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:91

clang::scanSourceForDependencyDirectives
bool scanSourceForDependencyDirectives(StringRef Input, SmallVectorImpl< dependency_directives_scan::Token > &Tokens, SmallVectorImpl< dependency_directives_scan::Directive > &Directives, DiagnosticsEngine *Diags=nullptr, SourceLocation InputSourceLoc=SourceLocation())
Scan the input for the preprocessor directives that might have an effect on the dependencies for a co...
Definition: DependencyDirectivesScanner.cpp:1036

clang::isWhitespace
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:108

clang::isPreprocessingNumberBody
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.
Definition: CharInfo.h:168

clang::ComparisonCategoryType::First
@ First

clang::ComparisonCategoryType::Last
@ Last

clang::prepare_PragmaString
void prepare_PragmaString(SmallVectorImpl< char > &StrVal)
Destringize a _Pragma("") string according to C11 6.10.9.1: "The string literal is destringized by de...
Definition: Pragma.cpp:302

llvm
Diagnostic wrappers for TextAPI types for error reporting.
Definition: Dominators.h:30

clang::dependency_directives_scan::Directive
Represents a directive that's lexed as part of the dependency directives scanning.
Definition: DependencyDirectivesScanner.h:95

clang::dependency_directives_scan::Directive::Tokens
ArrayRef< Token > Tokens
Definition: DependencyDirectivesScanner.h:96

clang::dependency_directives_scan::Directive::Kind
DirectiveKind Kind
The kind of token.
Definition: DependencyDirectivesScanner.h:99

clang::dependency_directives_scan::Token
Token lexed as part of dependency directive scanning.
Definition: DependencyDirectivesScanner.h:35

clang::dependency_directives_scan::Token::Flags
unsigned short Flags
Definition: DependencyDirectivesScanner.h:40

clang::dependency_directives_scan::Token::isNot
bool isNot(tok::TokenKind K) const
Definition: DependencyDirectivesScanner.h:49

clang::dependency_directives_scan::Token::Offset
unsigned Offset
Offset into the original source input.
Definition: DependencyDirectivesScanner.h:37

clang::dependency_directives_scan::Token::Kind
tok::TokenKind Kind
Definition: DependencyDirectivesScanner.h:39

clang::dependency_directives_scan::Token::Length
unsigned Length
Definition: DependencyDirectivesScanner.h:38

clang::dependency_directives_scan::Token::getEnd
unsigned getEnd() const
Definition: DependencyDirectivesScanner.h:46

clang::dependency_directives_scan::Token::is
bool is(tok::TokenKind K) const
Definition: DependencyDirectivesScanner.h:48

clang::dependency_directives_scan::Token::isOneOf
bool isOneOf(Ts... Ks) const
Definition: DependencyDirectivesScanner.h:50