19#include "llvm/ADT/STLExtras.h"
20#include "llvm/Support/Debug.h"
23#define DEBUG_TYPE "format-token-breaker"
// Set of characters treated as blanks when trimming text and when searching
// for split positions in this file: space, horizontal tab, vertical tab, form
// feed and carriage return. The line feed character is not part of the set.
28static constexpr StringRef
Blanks(
" \t\v\f\r");
32 static constexpr StringRef KnownCStylePrefixes[] = {
"///<",
"//!<",
"///",
34 static constexpr StringRef KnownTextProtoPrefixes[] = {
"####",
"###",
"##",
38 KnownPrefixes = KnownTextProtoPrefixes;
41 llvm::is_sorted(KnownPrefixes, [](StringRef Lhs, StringRef Rhs)
noexcept {
42 return Lhs.size() > Rhs.size();
45 for (StringRef KnownPrefix : KnownPrefixes) {
46 if (Comment.starts_with(KnownPrefix)) {
47 const auto PrefixLength =
48 Comment.find_first_not_of(
' ', KnownPrefix.size());
49 return Comment.substr(0, PrefixLength);
57 unsigned ColumnLimit,
unsigned TabWidth,
59 bool DecorationEndsWithStar =
false) {
60 LLVM_DEBUG(llvm::dbgs() <<
"Comment split: \"" <<
Text
61 <<
"\", Column limit: " << ColumnLimit
62 <<
", Content start: " << ContentStartColumn <<
"\n");
63 if (ColumnLimit <= ContentStartColumn + 1)
66 unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
67 unsigned MaxSplitBytes = 0;
69 for (
unsigned NumChars = 0;
70 NumChars < MaxSplit && MaxSplitBytes <
Text.size();) {
71 unsigned BytesInChar =
74 Text.substr(MaxSplitBytes, BytesInChar), ContentStartColumn + NumChars,
76 MaxSplitBytes += BytesInChar;
83 StringRef::size_type SpaceOffset =
85 if (SpaceOffset != StringRef::npos && SpaceOffset + 1 <
Text.size() &&
86 Text[SpaceOffset + 1] ==
'{') {
87 MaxSplitBytes = SpaceOffset + 1;
91 StringRef::size_type SpaceOffset =
Text.find_last_of(
Blanks, MaxSplitBytes);
93 static const auto kNumberedListRegexp = llvm::Regex(
"^[1-9][0-9]?\\.");
95 while (SpaceOffset != StringRef::npos) {
102 StringRef::size_type LastNonBlank =
104 if (LastNonBlank != StringRef::npos &&
Text[LastNonBlank] ==
'\\') {
105 SpaceOffset =
Text.find_last_of(
Blanks, LastNonBlank);
112 if (kNumberedListRegexp.match(
Text.substr(SpaceOffset).ltrim(
Blanks))) {
113 SpaceOffset =
Text.find_last_of(
Blanks, SpaceOffset);
119 (
Text[SpaceOffset + 1] ==
'{' ||
Text[SpaceOffset + 1] ==
'@')) {
120 SpaceOffset =
Text.find_last_of(
Blanks, SpaceOffset);
127 if (SpaceOffset == StringRef::npos ||
129 Text.find_last_not_of(
Blanks, SpaceOffset) == StringRef::npos) {
132 StringRef::size_type FirstNonWhitespace =
Text.find_first_not_of(
Blanks);
133 if (FirstNonWhitespace == StringRef::npos) {
137 SpaceOffset =
Text.find_first_of(
138 Blanks, std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
140 if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
145 if (SpaceOffset == 1 &&
Text[SpaceOffset - 1] ==
'*')
147 StringRef BeforeCut =
Text.substr(0, SpaceOffset).rtrim(
Blanks);
148 StringRef AfterCut =
Text.substr(SpaceOffset);
149 if (!DecorationEndsWithStar)
150 AfterCut = AfterCut.ltrim(
Blanks);
152 AfterCut.begin() - BeforeCut.end());
163 if (ColumnLimit <= UsedColumns)
165 unsigned MaxSplit = ColumnLimit - UsedColumns;
166 StringRef::size_type SpaceOffset = 0;
167 StringRef::size_type SlashOffset = 0;
168 StringRef::size_type WordStartOffset = 0;
169 StringRef::size_type SplitPoint = 0;
170 for (
unsigned Chars = 0;;) {
172 if (
Text[0] ==
'\\') {
178 Text.substr(0, Advance), UsedColumns + Chars, TabWidth, Encoding);
181 if (Chars > MaxSplit ||
Text.size() <= Advance)
185 SpaceOffset = SplitPoint;
187 SlashOffset = SplitPoint;
189 WordStartOffset = SplitPoint;
191 SplitPoint += Advance;
195 if (SpaceOffset != 0)
197 if (SlashOffset != 0)
199 if (WordStartOffset != 0)
208 "formatting regions are switched by comment tokens");
209 StringRef Content =
Token.TokenText.substr(2).ltrim();
210 return Content.starts_with(
"clang-format on") ||
211 Content.starts_with(
"clang-format off");
228 return RemainingTokenColumns + 1 -
Split.second;
235 StringRef::size_type Length,
236 unsigned StartColumn)
const {
237 llvm_unreachable(
"Getting the length of a part of the string literal "
238 "indicates that the code tries to reflow it.");
243 unsigned StartColumn)
const {
255 const FormatToken &Tok,
unsigned StartColumn, StringRef Prefix,
256 StringRef Postfix,
unsigned UnbreakableTailLength,
bool InPPDirective,
259 StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix),
260 UnbreakableTailLength(UnbreakableTailLength) {
267 unsigned LineIndex,
unsigned TailOffset,
unsigned ColumnLimit,
268 unsigned ContentStartColumn,
const llvm::Regex &CommentPragmasRegex)
const {
275 unsigned ContentIndent,
277 Whitespaces.replaceWhitespaceInToken(
284 unsigned StartColumn,
unsigned UnbreakableTailLength,
bool InPPDirective,
287 Tok, StartColumn, QuoteStyle == SingleQuotes ?
"'"
288 : QuoteStyle == AtDoubleQuotes ?
"@\""
290 QuoteStyle == SingleQuotes ?
"'" :
"\"",
291 UnbreakableTailLength, InPPDirective, Encoding, Style),
292 BracesNeeded(Tok.isNot(TT_StringInConcatenation)),
293 QuoteStyle(QuoteStyle) {
319 Postfix = SignOnNewLine ?
"'" :
"' +";
320 Prefix = SignOnNewLine ?
"+ '" :
"'";
324 Prefix = SignOnNewLine ?
"+ @\"" :
"@\"";
327 Prefix = SignOnNewLine ?
"+ \"" :
"\"";
330 Postfix = SignOnNewLine ?
"\"" :
"\" +";
345 unsigned LineIndex,
unsigned Offset,
unsigned StartColumn)
const {
364 unsigned LineIndex,
unsigned TailOffset,
Split Split,
366 Whitespaces.replaceWhitespaceInToken(
386 Whitespaces.replaceWhitespaceInToken(
390 Whitespaces.replaceWhitespaceInToken(
397 unsigned StartColumn,
bool InPPDirective,
401 StartColumn(StartColumn) {}
407 unsigned ColumnLimit,
unsigned ContentStartColumn,
408 const llvm::Regex &CommentPragmasRegex)
const {
411 return Split(StringRef::npos, 0);
418 unsigned LineIndex,
unsigned TailOffset,
Split Split,
420 StringRef
Text =
Content[LineIndex].substr(TailOffset);
425 unsigned BreakOffsetInToken =
427 unsigned CharsToRemove =
Split.second;
428 Whitespaces.replaceWhitespaceInToken(
429 tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove,
"",
"",
438 Content = Content.trim(
Blanks);
441 bool hasSpecialMeaningPrefix =
false;
442 for (StringRef Prefix :
443 {
"@",
"\\",
"TODO",
"FIXME",
"XXX",
"-# ",
"- ",
"+ ",
"* "}) {
444 if (Content.starts_with(Prefix)) {
445 hasSpecialMeaningPrefix =
true;
453 static const auto kNumberedListRegexp = llvm::Regex(
"^[1-9][0-9]?\\. ");
454 hasSpecialMeaningPrefix =
455 hasSpecialMeaningPrefix || kNumberedListRegexp.match(Content);
460 return Content.size() >= 2 && !hasSpecialMeaningPrefix &&
461 !Content.ends_with(
"\\") &&
469 unsigned OriginalStartColumn,
bool FirstInLine,
bool InPPDirective,
472 DelimitersOnNewline(
false),
473 UnbreakableTailLength(
Token.UnbreakableTailLength) {
474 assert(
Tok.
is(TT_BlockComment) &&
475 "block comment section must start with a block comment");
478 assert(TokenText.starts_with(
"/*") && TokenText.ends_with(
"*/"));
479 TokenText.substr(2, TokenText.size() - 4)
480 .split(
Lines, UseCRLF ?
"\r\n" :
"\n");
482 int IndentDelta =
StartColumn - OriginalStartColumn;
489 for (
size_t i = 1; i <
Lines.size(); ++i)
490 adjustWhitespace(i, IndentDelta);
501 if (
Lines.size() >= 2 &&
Content[1].starts_with(
"**") &&
507 if (
Lines.size() == 1 && !FirstInLine) {
515 for (
size_t i = 1, e =
Content.size(); i < e && !Decoration.empty(); ++i) {
521 }
else if (!
Text.empty() && Decoration.starts_with(
Text)) {
524 while (!
Text.starts_with(Decoration))
525 Decoration = Decoration.drop_back(1);
528 LastLineNeedsDecoration =
true;
530 for (
size_t i = 1, e =
Lines.size(); i < e; ++i) {
536 LastLineNeedsDecoration =
false;
538 if (e >= 2 && !Decoration.empty())
540 }
else if (Decoration.empty()) {
552 unsigned DecorationSize = Decoration.starts_with(
Content[i])
558 if (!Decoration.starts_with(
Content[i])) {
560 std::min<int>(IndentAtLineBreak, std::max(0,
ContentColumn[i]));
563 IndentAtLineBreak = std::max<unsigned>(IndentAtLineBreak, Decoration.size());
567 if ((
Lines[0] ==
"*" ||
Lines[0].starts_with(
"* ")) &&
Lines.size() > 1) {
569 DelimitersOnNewline =
true;
570 }
else if (
Lines[0].starts_with(
"* ") &&
Lines.size() == 1) {
584 llvm::dbgs() <<
"IndentAtLineBreak " << IndentAtLineBreak <<
"\n";
585 llvm::dbgs() <<
"DelimitersOnNewline " << DelimitersOnNewline <<
"\n";
586 for (
size_t i = 0; i <
Lines.size(); ++i) {
587 llvm::dbgs() << i <<
" |" <<
Content[i] <<
"| "
589 <<
"IN=" << (
Content[i].data() -
Lines[i].data()) <<
"\n";
595 unsigned LineIndex,
unsigned TailOffset,
unsigned ColumnLimit,
596 unsigned ContentStartColumn,
const llvm::Regex &CommentPragmasRegex)
const {
599 return Split(StringRef::npos, 0);
605void BreakableBlockComment::adjustWhitespace(
unsigned LineIndex,
612 size_t EndOfPreviousLine =
Lines[LineIndex - 1].size();
618 Lines[LineIndex - 1].find_last_not_of(
Blanks, EndOfPreviousLine);
619 if (EndOfPreviousLine == StringRef::npos)
620 EndOfPreviousLine = 0;
624 size_t StartOfLine =
Lines[LineIndex].find_first_not_of(
Blanks);
625 if (StartOfLine == StringRef::npos)
626 StartOfLine =
Lines[LineIndex].size();
628 StringRef Whitespace =
Lines[LineIndex].substr(0, StartOfLine);
630 size_t PreviousContentOffset =
631 Content[LineIndex - 1].data() -
Lines[LineIndex - 1].data();
633 PreviousContentOffset, EndOfPreviousLine - PreviousContentOffset);
634 Content[LineIndex] =
Lines[LineIndex].substr(StartOfLine);
644 StringRef::size_type Length,
645 unsigned StartColumn)
const {
653 unsigned StartColumn)
const {
654 unsigned LineLength =
655 UnbreakableTailLength +
657 if (LineIndex + 1 ==
Lines.size()) {
660 bool HasRemainingText = Offset <
Content[LineIndex].size();
661 if (!HasRemainingText) {
662 bool HasDecoration =
Lines[LineIndex].ltrim().starts_with(Decoration);
664 LineLength -= Decoration.size();
673 return IndentAtLineBreak;
677const llvm::StringSet<>
679 "@param",
"@return",
"@returns",
"@throws",
"@type",
"@template",
680 "@see",
"@deprecated",
"@define",
"@exports",
"@mods",
"@private",
689 StringRef ContentWithNoDecoration =
Content[LineIndex];
690 if (LineIndex == 0 && ContentWithNoDecoration.starts_with(
"*"))
691 ContentWithNoDecoration = ContentWithNoDecoration.substr(1).ltrim(
Blanks);
692 StringRef FirstWord = ContentWithNoDecoration.substr(
693 0, ContentWithNoDecoration.find_first_of(
Blanks));
702 StringRef
Text =
Content[LineIndex].substr(TailOffset);
703 StringRef Prefix = Decoration;
707 unsigned LocalIndentAtLineBreak = IndentAtLineBreak;
708 if (LineIndex + 1 ==
Lines.size() &&
712 if (LocalIndentAtLineBreak >= 2)
713 LocalIndentAtLineBreak -= 2;
717 unsigned BreakOffsetInToken =
719 unsigned CharsToRemove =
Split.second;
720 assert(LocalIndentAtLineBreak >= Prefix.size());
721 std::string PrefixWithTrailingIndent = std::string(Prefix);
722 PrefixWithTrailingIndent.append(ContentIndent,
' ');
723 Whitespaces.replaceWhitespaceInToken(
724 tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove,
"",
726 LocalIndentAtLineBreak + ContentIndent -
727 PrefixWithTrailingIndent.size());
731 unsigned LineIndex,
const llvm::Regex &CommentPragmasRegex)
const {
732 if (!
mayReflow(LineIndex, CommentPragmasRegex))
733 return Split(StringRef::npos, 0);
737 size_t Trimmed =
Content[LineIndex].find_first_not_of(
Blanks);
740 if (PreviousContentIndent && Trimmed != StringRef::npos &&
741 Trimmed != PreviousContentIndent) {
742 return Split(StringRef::npos, 0);
746 return Split(0, Trimmed != StringRef::npos ? Trimmed : 0);
751 return DelimitersOnNewline &&
752 Lines[0].substr(1).find_first_not_of(
Blanks) != StringRef::npos;
760 "Reflowing whitespace within a token");
763 unsigned WhitespaceOffsetInToken =
Content[LineIndex - 1].data() +
764 Content[LineIndex - 1].size() -
766 unsigned WhitespaceLength = TrimmedContent.data() -
768 WhitespaceOffsetInToken;
769 Whitespaces.replaceWhitespaceInToken(
770 tokenAt(LineIndex), WhitespaceOffsetInToken,
771 WhitespaceLength,
"",
778 if (LineIndex == 0) {
779 if (DelimitersOnNewline) {
784 size_t BreakLength =
Lines[0].substr(1).find_first_not_of(
Blanks);
785 if (BreakLength != StringRef::npos) {
794 StringRef Prefix = Decoration;
795 if (
Content[LineIndex].empty()) {
796 if (LineIndex + 1 ==
Lines.size()) {
797 if (!LastLineNeedsDecoration) {
802 }
else if (!Decoration.empty()) {
805 Prefix = Prefix.substr(0, 1);
809 Prefix = Prefix.substr(0, 1);
813 unsigned WhitespaceOffsetInToken =
Content[LineIndex - 1].data() +
814 Content[LineIndex - 1].size() -
816 unsigned WhitespaceLength =
Content[LineIndex].data() -
818 WhitespaceOffsetInToken;
819 Whitespaces.replaceWhitespaceInToken(
820 tokenAt(LineIndex), WhitespaceOffsetInToken, WhitespaceLength,
"", Prefix,
826 if (DelimitersOnNewline) {
830 StringRef
Line =
Content.back().substr(TailOffset);
832 if (!TrimmedLine.empty())
833 return Split(TrimmedLine.size(),
Line.size() - TrimmedLine.size());
835 return Split(StringRef::npos, 0);
839 unsigned LineIndex,
const llvm::Regex &CommentPragmasRegex)
const {
842 StringRef IndentContent =
Content[LineIndex];
843 if (
Lines[LineIndex].ltrim(
Blanks).starts_with(
"*"))
844 IndentContent =
Lines[LineIndex].ltrim(
Blanks).substr(1);
846 !CommentPragmasRegex.match(IndentContent) &&
855 assert(
Tok.
is(TT_LineComment) &&
856 "line comment section must start with a line comment");
861 int FirstLineSpaceChange = 0;
863 CurrentTok && CurrentTok->
is(TT_LineComment);
864 CurrentTok = CurrentTok->Next) {
865 LastLineTok = LineTok;
866 StringRef TokenText(CurrentTok->TokenText);
867 assert((TokenText.starts_with(
"//") || TokenText.starts_with(
"#")) &&
868 "unsupported line comment prefix, '//' and '#' are supported");
869 size_t FirstLineIndex =
Lines.size();
870 TokenText.split(
Lines,
"\n");
873 PrefixSpaceChange.resize(
Lines.size());
875 Prefix.resize(
Lines.size());
876 OriginalPrefix.resize(
Lines.size());
877 for (
size_t i = FirstLineIndex, e =
Lines.size(); i < e; ++i) {
880 OriginalPrefix[i] = IndentPrefix;
881 const int SpacesInPrefix = llvm::count(IndentPrefix,
' ');
885 const auto NoSpaceBeforeFirstCommentChar = [&]() {
886 assert(
Lines[i].size() > IndentPrefix.size());
887 const char FirstCommentChar =
Lines[i][IndentPrefix.size()];
888 const unsigned FirstCharByteSize =
891 Lines[i].substr(IndentPrefix.size(), FirstCharByteSize),
904 if (FirstCommentChar ==
'#' && !TokenText.starts_with(
"#"))
906 return FirstCommentChar ==
'\\' ||
isPunctuation(FirstCommentChar) ||
915 if (i == 0 || OriginalPrefix[i].rtrim(
Blanks) !=
916 OriginalPrefix[i - 1].rtrim(
Blanks)) {
917 if (SpacesInPrefix < Minimum &&
Lines[i].size() > IndentPrefix.size() &&
918 !NoSpaceBeforeFirstCommentChar()) {
919 FirstLineSpaceChange = Minimum - SpacesInPrefix;
920 }
else if (
static_cast<unsigned>(SpacesInPrefix) >
922 FirstLineSpaceChange =
925 FirstLineSpaceChange = 0;
929 if (
Lines[i].size() != IndentPrefix.size()) {
930 assert(
Lines[i].size() > IndentPrefix.size());
932 PrefixSpaceChange[i] = SpacesInPrefix + FirstLineSpaceChange < Minimum
933 ? Minimum - SpacesInPrefix
934 : FirstLineSpaceChange;
936 const auto FirstNonSpace =
Lines[i][IndentPrefix.size()];
938 const bool LineRequiresLeadingSpace =
939 !NoSpaceBeforeFirstCommentChar() ||
940 (FirstNonSpace ==
'}' && FirstLineSpaceChange != 0);
941 const bool AllowsSpaceChange =
943 (SpacesInPrefix != 0 || LineRequiresLeadingSpace);
945 if (PrefixSpaceChange[i] > 0 && AllowsSpaceChange) {
946 Prefix[i] = IndentPrefix.str();
947 Prefix[i].append(PrefixSpaceChange[i],
' ');
948 }
else if (PrefixSpaceChange[i] < 0 && AllowsSpaceChange) {
949 Prefix[i] = IndentPrefix
950 .drop_back(std::min<std::size_t>(
951 -PrefixSpaceChange[i], SpacesInPrefix))
954 Prefix[i] = IndentPrefix.str();
959 Prefix[i] = IndentPrefix.drop_back(SpacesInPrefix).str();
970 if (EndOfLine == StringRef::npos)
976 LineTok = CurrentTok->
Next;
977 if (CurrentTok->Next && !CurrentTok->Next->ContinuesLineCommentSection) {
999 StringRef::size_type Length,
1000 unsigned StartColumn)
const {
1013 unsigned LineIndex,
unsigned TailOffset,
Split Split,
1015 StringRef
Text =
Content[LineIndex].substr(TailOffset);
1018 unsigned BreakOffsetInToken =
1020 unsigned CharsToRemove =
Split.second;
1021 Whitespaces.replaceWhitespaceInToken(
1022 tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove,
"",
1028 unsigned LineIndex,
const llvm::Regex &CommentPragmasRegex)
const {
1029 if (!
mayReflow(LineIndex, CommentPragmasRegex))
1030 return Split(StringRef::npos, 0);
1032 size_t Trimmed =
Content[LineIndex].find_first_not_of(
Blanks);
1038 return Split(0, Trimmed != StringRef::npos ? Trimmed : 0);
1043 if (LineIndex > 0 &&
Tokens[LineIndex] !=
Tokens[LineIndex - 1]) {
1046 Whitespaces.replaceWhitespace(
1047 *
Tokens[LineIndex], 0, 0,
1050 }
else if (LineIndex > 0) {
1061 unsigned Offset =
Lines[LineIndex - 1].data() +
1062 Lines[LineIndex - 1].size() -
1066 unsigned WhitespaceLength =
1068 Whitespaces.replaceWhitespaceInToken(*
Tokens[LineIndex], Offset,
1079 unsigned WhitespaceLength =
1081 Whitespaces.replaceWhitespaceInToken(*
Tokens[LineIndex], Offset,
1098 if (LineIndex > 0 &&
Tokens[LineIndex] !=
Tokens[LineIndex - 1]) {
1104 unsigned LineColumn =
1106 (
Content[LineIndex].data() -
Lines[LineIndex].data()) +
1107 (OriginalPrefix[LineIndex].size() - Prefix[LineIndex].size());
1113 Whitespaces.replaceWhitespace(*
Tokens[LineIndex],
1120 if (OriginalPrefix[LineIndex] != Prefix[LineIndex]) {
1122 const auto SpacesToRemove = -std::min(PrefixSpaceChange[LineIndex], 0);
1123 const auto SpacesToAdd = std::max(PrefixSpaceChange[LineIndex], 0);
1124 Whitespaces.replaceWhitespaceInToken(
1125 tokenAt(LineIndex), OriginalPrefix[LineIndex].size() - SpacesToRemove,
1126 SpacesToRemove,
"",
"",
false,
1133 State.NextToken = LastLineTok->
Next;
1137 unsigned LineIndex,
const llvm::Regex &CommentPragmasRegex)
const {
1140 StringRef IndentContent =
Content[LineIndex];
1141 if (
Lines[LineIndex].starts_with(
"//"))
1142 IndentContent =
Lines[LineIndex].substr(2);
1150 !CommentPragmasRegex.match(IndentContent) &&
1153 OriginalPrefix[LineIndex] == OriginalPrefix[LineIndex - 1];
Declares BreakableToken, BreakableStringLiteral, BreakableComment, BreakableBlockComment and BreakableLineCommentSection classes.
This file implements an indenter that manages the indentation of continuations.
Token - This structure provides full information about a lexed token.
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {...}".
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READONLY bool isAlphanumeric(unsigned char c)
Return true if this character is an ASCII letter or digit: [a-zA-Z0-9].
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
LLVM_READONLY bool isPunctuation(unsigned char c)
Return true if this character is an ASCII punctuation character.