clang 22.0.0git
ArrayBoundChecker.cpp
Go to the documentation of this file.
1//== ArrayBoundChecker.cpp -------------------------------------------------==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines security.ArrayBound, which is a path-sensitive checker
10// that looks for out of bounds access of memory regions.
11//
12//===----------------------------------------------------------------------===//
13
14#include "clang/AST/CharUnits.h"
25#include "llvm/ADT/APSInt.h"
26#include "llvm/Support/FormatVariadic.h"
27#include "llvm/Support/raw_ostream.h"
28#include <optional>
29
30using namespace clang;
31using namespace ento;
32using namespace taint;
33using llvm::formatv;
34
35namespace {
36/// If `E` is an array subscript expression with a base that is "clean" (= not
37/// modified by pointer arithmetic = the beginning of a memory region), return
38/// it as a pointer to ArraySubscriptExpr; otherwise return nullptr.
39/// This helper function is used by two separate heuristics that are only valid
40/// in these "clean" cases.
41static const ArraySubscriptExpr *
42getAsCleanArraySubscriptExpr(const Expr *E, const CheckerContext &C) {
43 const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
44 if (!ASE)
45 return nullptr;
46
47 const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
48 if (!SubscriptBaseReg)
49 return nullptr;
50
51 // The base of the subscript expression is affected by pointer arithmetics,
52 // so we want to report byte offsets instead of indices and we don't want to
53 // activate the "index is unsigned -> cannot be negative" shortcut.
54 if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
55 return nullptr;
56
57 return ASE;
58}
59
60/// If `E` is a "clean" array subscript expression, return the type of the
61/// accessed element; otherwise return std::nullopt because that's the best (or
62/// least bad) option for the diagnostic generation that relies on this.
63static std::optional<QualType> determineElementType(const Expr *E,
64 const CheckerContext &C) {
65 const auto *ASE = getAsCleanArraySubscriptExpr(E, C);
66 if (!ASE)
67 return std::nullopt;
68
69 return ASE->getType();
70}
71
72static std::optional<int64_t>
73determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
74 if (!T)
75 return std::nullopt;
76 return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
77}
78
79class StateUpdateReporter {
80 const MemSpaceRegion *Space;
81 const SubRegion *Reg;
82 const NonLoc ByteOffsetVal;
83 const std::optional<QualType> ElementType;
84 const std::optional<int64_t> ElementSize;
85 bool AssumedNonNegative = false;
86 std::optional<NonLoc> AssumedUpperBound = std::nullopt;
87
88public:
89 StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
91 : Space(R->getMemorySpace(C.getState())), Reg(R),
92 ByteOffsetVal(ByteOffsVal), ElementType(determineElementType(E, C)),
93 ElementSize(determineElementSize(ElementType, C)) {}
94
95 void recordNonNegativeAssumption() { AssumedNonNegative = true; }
96 void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
97 AssumedUpperBound = UpperBoundVal;
98 }
99
100 bool assumedNonNegative() { return AssumedNonNegative; }
101
102 const NoteTag *createNoteTag(CheckerContext &C) const;
103
104private:
105 std::string getMessage(PathSensitiveBugReport &BR) const;
106
107 /// Return true if information about the value of `Sym` can put constraints
108 /// on some symbol which is interesting within the bug report `BR`.
109 /// In particular, this returns true when `Sym` is interesting within `BR`;
110 /// but it also returns true if `Sym` is an expression that contains integer
111 /// constants and a single symbolic operand which is interesting (in `BR`).
112 /// We need to use this instead of plain `BR.isInteresting()` because if we
113 /// are analyzing code like
114 /// int array[10];
115 /// int f(int arg) {
116 /// return array[arg] && array[arg + 10];
117 /// }
118 /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
119 /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
120 /// to detect this out of bounds access).
121 static bool providesInformationAboutInteresting(SymbolRef Sym,
123 static bool providesInformationAboutInteresting(SVal SV,
125 return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
126 }
127};
128
129struct Messages {
130 std::string Short, Full;
131};
132
133// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
134// instead of `PreStmt` because the current implementation passes the whole
135// expression to `CheckerContext::getSVal()` which only works after the
136// symbolic evaluation of the expression. (To turn them into `PreStmt`
137// callbacks, we'd need to duplicate the logic that evaluates these
138// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
139// defined as `PostStmt` for the sake of consistency with the other callbacks.
140class ArrayBoundChecker : public Checker<check::PostStmt<ArraySubscriptExpr>,
141 check::PostStmt<UnaryOperator>,
142 check::PostStmt<MemberExpr>> {
143 BugType BT{this, "Out-of-bound access"};
144 BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
145
146 void performCheck(const Expr *E, CheckerContext &C) const;
147
148 void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
149 NonLoc Offset, std::optional<NonLoc> Extent,
150 bool IsTaintBug = false) const;
151
152 static void markPartsInteresting(PathSensitiveBugReport &BR,
153 ProgramStateRef ErrorState, NonLoc Val,
154 bool MarkTaint);
155
156 static bool isFromCtypeMacro(const Expr *E, ASTContext &AC);
157
158 static bool isOffsetObviouslyNonnegative(const Expr *E, CheckerContext &C);
159
160 static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
161 NonLoc Offset, NonLoc Limit,
163 static bool isInAddressOf(const Stmt *S, ASTContext &AC);
164
165public:
166 void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
167 performCheck(E, C);
168 }
169 void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
170 if (E->getOpcode() == UO_Deref)
171 performCheck(E, C);
172 }
173 void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
174 if (E->isArrow())
175 performCheck(E->getBase(), C);
176 }
177};
178
179} // anonymous namespace
180
181/// For a given Location that can be represented as a symbolic expression
182/// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
183/// Arr and the distance of Location from the beginning of Arr (expressed in a
184/// NonLoc that specifies the number of CharUnits). Returns nullopt when these
185/// cannot be determined.
186static std::optional<std::pair<const SubRegion *, NonLoc>>
189 auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
190 // We will use this utility to add and multiply values.
191 return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
192 };
193
194 const SubRegion *OwnerRegion = nullptr;
195 std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
196
197 const ElementRegion *CurRegion =
198 dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
199
200 while (CurRegion) {
201 const auto Index = CurRegion->getIndex().getAs<NonLoc>();
202 if (!Index)
203 return std::nullopt;
204
205 QualType ElemType = CurRegion->getElementType();
206
207 // FIXME: The following early return was presumably added to safeguard the
208 // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
209 // it seems that `ElemType` cannot be incomplete at this point.
210 if (ElemType->isIncompleteType())
211 return std::nullopt;
212
213 // Calculate Delta = Index * sizeof(ElemType).
214 NonLoc Size = SVB.makeArrayIndex(
215 SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
216 auto Delta = EvalBinOp(BO_Mul, *Index, Size);
217 if (!Delta)
218 return std::nullopt;
219
220 // Perform Offset += Delta.
221 Offset = EvalBinOp(BO_Add, *Offset, *Delta);
222 if (!Offset)
223 return std::nullopt;
224
225 OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
226 // When this is just another ElementRegion layer, we need to continue the
227 // offset calculations:
228 CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
229 }
230
231 if (OwnerRegion)
232 return std::make_pair(OwnerRegion, *Offset);
233
234 return std::nullopt;
235}
236
237// NOTE: This function is the "heart" of this checker. It simplifies
238// inequalities with transformations that are valid (and very elementary) in
239// pure mathematics, but become invalid if we use them in C++ number model
240// where the calculations may overflow.
241// Due to the overflow issues I think it's impossible (or at least not
242// practical) to integrate this kind of simplification into the resolution of
243// arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
244// produces valid results when the calculations are handling memory offsets
245// and every value is well below SIZE_MAX.
246// TODO: This algorithm should be moved to a central location where it's
247// available for other checkers that need to compare memory offsets.
248// NOTE: the simplification preserves the order of the two operands in a
249// mathematical sense, but it may change the result produced by a C++
250// comparison operator (and the automatic type conversions).
251// For example, consider a comparison "X+1 < 0", where the LHS is stored as a
252// size_t and the RHS is stored in an int. (As size_t is unsigned, this
253// comparison is false for all values of "X".) However, the simplification may
254// turn it into "X < -1", which is still always false in a mathematical sense,
255// but can produce a true result when evaluated by `evalBinOp` (which follows
256// the rules of C++ and casts -1 to SIZE_MAX).
257static std::pair<NonLoc, nonloc::ConcreteInt>
259 SValBuilder &svalBuilder) {
260 const llvm::APSInt &extentVal = extent.getValue();
261 std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
262 if (SymVal && SymVal->isExpression()) {
263 if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
264 llvm::APSInt constant = APSIntType(extentVal).convert(SIE->getRHS());
265 switch (SIE->getOpcode()) {
266 case BO_Mul:
267 // The constant should never be 0 here, becasue multiplication by zero
268 // is simplified by the engine.
269 if ((extentVal % constant) != 0)
270 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271 else
273 nonloc::SymbolVal(SIE->getLHS()),
274 svalBuilder.makeIntVal(extentVal / constant), svalBuilder);
275 case BO_Add:
277 nonloc::SymbolVal(SIE->getLHS()),
278 svalBuilder.makeIntVal(extentVal - constant), svalBuilder);
279 default:
280 break;
281 }
282 }
283 }
284
285 return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
286}
287
289 const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value);
290 return MaxV && MaxV->isNegative();
291}
292
293static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
295 return T->isUnsignedIntegerType();
296}
297
298// Evaluate the comparison Value < Threshold with the help of the custom
299// simplification algorithm defined for this checker. Return a pair of states,
300// where the first one corresponds to "value below threshold" and the second
301// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
302// the case when the evaluation fails.
303// If the optional argument CheckEquality is true, then use BO_EQ instead of
304// the default BO_LT after consistently applying the same simplification steps.
305static std::pair<ProgramStateRef, ProgramStateRef>
307 SValBuilder &SVB, bool CheckEquality = false) {
308 if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
309 std::tie(Value, Threshold) =
310 getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
311 }
312
313 // We want to perform a _mathematical_ comparison between the numbers `Value`
314 // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
315 // perform automatic conversions. For example the number -1 is less than the
316 // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
317 // -1 is converted to ULONGLONG_MAX.
318 // To avoid automatic conversions, we evaluate the "obvious" cases without
319 // calling `evalBinOpNN`:
320 if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) {
321 if (CheckEquality) {
322 // negative_value == unsigned_threshold is always false
323 return {nullptr, State};
324 }
325 // negative_value < unsigned_threshold is always true
326 return {State, nullptr};
327 }
328 if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) {
329 // unsigned_value == negative_threshold and
330 // unsigned_value < negative_threshold are both always false
331 return {nullptr, State};
332 }
333 // FIXME: These special cases are sufficient for handling real-world
334 // comparisons, but in theory there could be contrived situations where
335 // automatic conversion of a symbolic value (which can be negative and can be
336 // positive) leads to incorrect results.
337 // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
338 // we want to ensure that assumptions coming from this precondition and
339 // assumptions coming from regular C/C++ operator calls are represented by
340 // constraints on the same symbolic expression. A solution that would
341 // evaluate these "mathematical" comparisons through a separate pathway would
342 // be a step backwards in this sense.
343
344 const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
345 auto BelowThreshold =
346 SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
347 .getAs<NonLoc>();
348
349 if (BelowThreshold)
350 return State->assume(*BelowThreshold);
351
352 return {nullptr, nullptr};
353}
354
355static std::string getRegionName(const MemSpaceRegion *Space,
356 const SubRegion *Region) {
357 if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
358 return RegName;
359
360 // Field regions only have descriptive names when their parent has a
361 // descriptive name; so we provide a fallback representation for them:
362 if (const auto *FR = Region->getAs<FieldRegion>()) {
363 if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
364 return formatv("the field '{0}'", Name);
365 return "the unnamed field";
366 }
367
368 if (isa<AllocaRegion>(Region))
369 return "the memory returned by 'alloca'";
370
371 if (isa<SymbolicRegion>(Region) && isa<HeapSpaceRegion>(Space))
372 return "the heap area";
373
374 if (isa<StringRegion>(Region))
375 return "the string literal";
376
377 return "the region";
378}
379
380static std::optional<int64_t> getConcreteValue(NonLoc SV) {
381 if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
382 return ConcreteVal->getValue()->tryExtValue();
383 }
384 return std::nullopt;
385}
386
387static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
388 return SV ? getConcreteValue(*SV) : std::nullopt;
389}
390
391static Messages getPrecedesMsgs(const MemSpaceRegion *Space,
392 const SubRegion *Region, NonLoc Offset) {
393 std::string RegName = getRegionName(Space, Region), OffsetStr = "";
394
395 if (auto ConcreteOffset = getConcreteValue(Offset))
396 OffsetStr = formatv(" {0}", ConcreteOffset);
397
398 return {
399 formatv("Out of bound access to memory preceding {0}", RegName),
400 formatv("Access of {0} at negative byte offset{1}", RegName, OffsetStr)};
401}
402
403/// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
404/// it can be performed (`Divisor` is nonzero and there is no remainder). The
405/// values `Val1` and `Val2` may be nullopt and in that case the corresponding
406/// division is considered to be successful.
407static bool tryDividePair(std::optional<int64_t> &Val1,
408 std::optional<int64_t> &Val2, int64_t Divisor) {
409 if (!Divisor)
410 return false;
411 const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
412 const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
413 if (Val1HasRemainder || Val2HasRemainder)
414 return false;
415 if (Val1)
416 *Val1 /= Divisor;
417 if (Val2)
418 *Val2 /= Divisor;
419 return true;
420}
421
422static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space,
423 const SubRegion *Region, NonLoc Offset,
424 NonLoc Extent, SVal Location,
425 bool AlsoMentionUnderflow) {
426 std::string RegName = getRegionName(Space, Region);
427 const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
428 assert(EReg && "this checker only handles element access");
429 QualType ElemType = EReg->getElementType();
430
431 std::optional<int64_t> OffsetN = getConcreteValue(Offset);
432 std::optional<int64_t> ExtentN = getConcreteValue(Extent);
433
434 int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
435
436 bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
437 const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
438
440 llvm::raw_svector_ostream Out(Buf);
441 Out << "Access of ";
442 if (!ExtentN && !UseByteOffsets)
443 Out << "'" << ElemType.getAsString() << "' element in ";
444 Out << RegName << " at ";
445 if (AlsoMentionUnderflow) {
446 Out << "a negative or overflowing " << OffsetOrIndex;
447 } else if (OffsetN) {
448 Out << OffsetOrIndex << " " << *OffsetN;
449 } else {
450 Out << "an overflowing " << OffsetOrIndex;
451 }
452 if (ExtentN) {
453 Out << ", while it holds only ";
454 if (*ExtentN != 1)
455 Out << *ExtentN;
456 else
457 Out << "a single";
458 if (UseByteOffsets)
459 Out << " byte";
460 else
461 Out << " '" << ElemType.getAsString() << "' element";
462
463 if (*ExtentN > 1)
464 Out << "s";
465 }
466
467 return {formatv("Out of bound access to memory {0} {1}",
468 AlsoMentionUnderflow ? "around" : "after the end of",
469 RegName),
470 std::string(Buf)};
471}
472
473static Messages getTaintMsgs(const MemSpaceRegion *Space,
474 const SubRegion *Region, const char *OffsetName,
475 bool AlsoMentionUnderflow) {
476 std::string RegName = getRegionName(Space, Region);
477 return {formatv("Potential out of bound access to {0} with tainted {1}",
478 RegName, OffsetName),
479 formatv("Access of {0} with a tainted {1} that may be {2}too large",
480 RegName, OffsetName,
481 AlsoMentionUnderflow ? "negative or " : "")};
482}
483
484const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
485 // Don't create a note tag if we didn't assume anything:
486 if (!AssumedNonNegative && !AssumedUpperBound)
487 return nullptr;
488
489 return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
490 return getMessage(BR);
491 });
492}
493
494std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
495 bool ShouldReportNonNegative = AssumedNonNegative;
496 if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
497 if (AssumedUpperBound &&
498 providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
499 // Even if the byte offset isn't interesting (e.g. it's a constant value),
500 // the assumption can still be interesting if it provides information
501 // about an interesting symbolic upper bound.
502 ShouldReportNonNegative = false;
503 } else {
504 // We don't have anything interesting, don't report the assumption.
505 return "";
506 }
507 }
508
509 std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
510 std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
511
512 const bool UseIndex =
513 ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
514
516 llvm::raw_svector_ostream Out(Buf);
517 Out << "Assuming ";
518 if (UseIndex) {
519 Out << "index ";
520 if (OffsetN)
521 Out << "'" << OffsetN << "' ";
522 } else if (AssumedUpperBound) {
523 Out << "byte offset ";
524 if (OffsetN)
525 Out << "'" << OffsetN << "' ";
526 } else {
527 Out << "offset ";
528 }
529
530 Out << "is";
531 if (ShouldReportNonNegative) {
532 Out << " non-negative";
533 }
534 if (AssumedUpperBound) {
535 if (ShouldReportNonNegative)
536 Out << " and";
537 Out << " less than ";
538 if (ExtentN)
539 Out << *ExtentN << ", ";
540 if (UseIndex && ElementType)
541 Out << "the number of '" << ElementType->getAsString()
542 << "' elements in ";
543 else
544 Out << "the extent of ";
545 Out << getRegionName(Space, Reg);
546 }
547 return std::string(Out.str());
548}
549
550bool StateUpdateReporter::providesInformationAboutInteresting(
552 if (!Sym)
553 return false;
554 for (SymbolRef PartSym : Sym->symbols()) {
555 // The interestingess mark may appear on any layer as we're stripping off
556 // the SymIntExpr, UnarySymExpr etc. layers...
557 if (BR.isInteresting(PartSym))
558 return true;
559 // ...but if both sides of the expression are symbolic, then there is no
560 // practical algorithm to produce separate constraints for the two
561 // operands (from the single combined result).
562 if (isa<SymSymExpr>(PartSym))
563 return false;
564 }
565 return false;
566}
567
568void ArrayBoundChecker::performCheck(const Expr *E, CheckerContext &C) const {
569 const SVal Location = C.getSVal(E);
570
571 // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
572 // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
573 // and incomplete analysis of these leads to false positives. As even
574 // accurate reports would be confusing for the users, just disable reports
575 // from these macros:
576 if (isFromCtypeMacro(E, C.getASTContext()))
577 return;
578
579 ProgramStateRef State = C.getState();
580 SValBuilder &SVB = C.getSValBuilder();
581
582 const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
583 computeOffset(State, SVB, Location);
584
585 if (!RawOffset)
586 return;
587
588 auto [Reg, ByteOffset] = *RawOffset;
589
590 // The state updates will be reported as a single note tag, which will be
591 // composed by this helper class.
592 StateUpdateReporter SUR(Reg, ByteOffset, E, C);
593
594 // CHECK LOWER BOUND
595 const MemSpaceRegion *Space = Reg->getMemorySpace(State);
596 if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
597 // A symbolic region in unknown space represents an unknown pointer that
598 // may point into the middle of an array, so we don't look for underflows.
599 // Both conditions are significant because we want to check underflows in
600 // symbolic regions on the heap (which may be introduced by checkers like
601 // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
602 // non-symbolic regions (e.g. a field subregion of a symbolic region) in
603 // unknown space.
604 auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
605 State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
606
607 if (PrecedesLowerBound) {
608 // The analyzer thinks that the offset may be invalid (negative)...
609
610 if (isOffsetObviouslyNonnegative(E, C)) {
611 // ...but the offset is obviously non-negative (clear array subscript
612 // with an unsigned index), so we're in a buggy situation.
613
614 // TODO: Currently the analyzer ignores many casts (e.g. signed ->
615 // unsigned casts), so it can easily reach states where it will load a
616 // signed (and negative) value from an unsigned variable. This sanity
617 // check is a duct tape "solution" that silences most of the ugly false
618 // positives that are caused by this buggy behavior. Note that this is
619 // not a complete solution: this cannot silence reports where pointer
620 // arithmetic complicates the picture and cannot ensure modeling of the
621 // "unsigned index is positive with highest bit set" cases which are
622 // "usurped" by the nonsense "unsigned index is negative" case.
623 // For more information about this topic, see the umbrella ticket
624 // https://github.com/llvm/llvm-project/issues/39492
625 // TODO: Remove this hack once 'SymbolCast's are modeled properly.
626
627 if (!WithinLowerBound) {
628 // The state is completely nonsense -- let's just sink it!
629 C.addSink();
630 return;
631 }
632 // Otherwise continue on the 'WithinLowerBound' branch where the
633 // unsigned index _is_ non-negative. Don't mention this assumption as a
634 // note tag, because it would just confuse the users!
635 } else {
636 if (!WithinLowerBound) {
637 // ...and it cannot be valid (>= 0), so report an error.
638 Messages Msgs = getPrecedesMsgs(Space, Reg, ByteOffset);
639 reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
640 return;
641 }
642 // ...but it can be valid as well, so the checker will (optimistically)
643 // assume that it's valid and mention this in the note tag.
644 SUR.recordNonNegativeAssumption();
645 }
646 }
647
648 // Actually update the state. The "if" only fails in the extremely unlikely
649 // case when compareValueToThreshold returns {nullptr, nullptr} because
650 // evalBinOpNN fails to evaluate the less-than operator.
651 if (WithinLowerBound)
652 State = WithinLowerBound;
653 }
654
655 // CHECK UPPER BOUND
656 DefinedOrUnknownSVal Size = getDynamicExtent(State, Reg, SVB);
657 if (auto KnownSize = Size.getAs<NonLoc>()) {
658 // In a situation where both underflow and overflow are possible (but the
659 // index is either tainted or known to be invalid), the logic of this
660 // checker will first assume that the offset is non-negative, and then
661 // (with this additional assumption) it will detect an overflow error.
662 // In this situation the warning message should mention both possibilities.
663 bool AlsoMentionUnderflow = SUR.assumedNonNegative();
664
665 auto [WithinUpperBound, ExceedsUpperBound] =
666 compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
667
668 if (ExceedsUpperBound) {
669 // The offset may be invalid (>= Size)...
670 if (!WithinUpperBound) {
671 // ...and it cannot be within bounds, so report an error, unless we can
672 // definitely determine that this is an idiomatic `&array[size]`
673 // expression that calculates the past-the-end pointer.
674 if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
675 *KnownSize, C)) {
676 C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
677 return;
678 }
679
680 Messages Msgs =
681 getExceedsMsgs(C.getASTContext(), Space, Reg, ByteOffset,
682 *KnownSize, Location, AlsoMentionUnderflow);
683 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
684 return;
685 }
686 // ...and it can be valid as well...
687 if (isTainted(State, ByteOffset)) {
688 // ...but it's tainted, so report an error.
689
690 // Diagnostic detail: saying "tainted offset" is always correct, but
691 // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
692 // nicer to say "tainted index".
693 const char *OffsetName = "offset";
694 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
695 if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
696 OffsetName = "index";
697
698 Messages Msgs =
699 getTaintMsgs(Space, Reg, OffsetName, AlsoMentionUnderflow);
700 reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
701 /*IsTaintBug=*/true);
702 return;
703 }
704 // ...and it isn't tainted, so the checker will (optimistically) assume
705 // that the offset is in bounds and mention this in the note tag.
706 SUR.recordUpperBoundAssumption(*KnownSize);
707 }
708
709 // Actually update the state. The "if" only fails in the extremely unlikely
710 // case when compareValueToThreshold returns {nullptr, nullptr} because
711 // evalBinOpNN fails to evaluate the less-than operator.
712 if (WithinUpperBound)
713 State = WithinUpperBound;
714 }
715
716 // Add a transition, reporting the state updates that we accumulated.
717 C.addTransition(State, SUR.createNoteTag(C));
718}
719
720void ArrayBoundChecker::markPartsInteresting(PathSensitiveBugReport &BR,
721 ProgramStateRef ErrorState,
722 NonLoc Val, bool MarkTaint) {
723 if (SymbolRef Sym = Val.getAsSymbol()) {
724 // If the offset is a symbolic value, iterate over its "parts" with
725 // `SymExpr::symbols()` and mark each of them as interesting.
726 // For example, if the offset is `x*4 + y` then we put interestingness onto
727 // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
728 // `x` and `y`.
729 for (SymbolRef PartSym : Sym->symbols())
730 BR.markInteresting(PartSym);
731 }
732
733 if (MarkTaint) {
734 // If the issue that we're reporting depends on the taintedness of the
735 // offset, then put interestingness onto symbols that could be the origin
736 // of the taint. Note that this may find symbols that did not appear in
737 // `Sym->symbols()` (because they're only loosely connected to `Val`).
738 for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
739 BR.markInteresting(Sym);
740 }
741}
742
743void ArrayBoundChecker::reportOOB(CheckerContext &C, ProgramStateRef ErrorState,
744 Messages Msgs, NonLoc Offset,
745 std::optional<NonLoc> Extent,
746 bool IsTaintBug /*=false*/) const {
747
748 ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
749 if (!ErrorNode)
750 return;
751
752 auto BR = std::make_unique<PathSensitiveBugReport>(
753 IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
754
755 // FIXME: ideally we would just call trackExpressionValue() and that would
756 // "do the right thing": mark the relevant symbols as interesting, track the
757 // control dependencies and statements storing the relevant values and add
758 // helpful diagnostic pieces. However, right now trackExpressionValue() is
759 // a heap of unreliable heuristics, so it would cause several issues:
760 // - Interestingness is not applied consistently, e.g. if `array[x+10]`
761 // causes an overflow, then `x` is not marked as interesting.
762 // - We get irrelevant diagnostic pieces, e.g. in the code
763 // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
764 // it places a "Storing uninitialized value" note on the `malloc` call
765 // (which is technically true, but irrelevant).
766 // If trackExpressionValue() becomes reliable, it should be applied instead
767 // of this custom markPartsInteresting().
768 markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
769 if (Extent)
770 markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
771
772 C.emitReport(std::move(BR));
773}
774
775bool ArrayBoundChecker::isFromCtypeMacro(const Expr *E, ASTContext &ACtx) {
777 if (!Loc.isMacroID())
778 return false;
779
780 StringRef MacroName = Lexer::getImmediateMacroName(
781 Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
782
783 if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
784 return false;
785
786 return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
787 (MacroName == "isblank") || (MacroName == "isdigit") ||
788 (MacroName == "isgraph") || (MacroName == "islower") ||
789 (MacroName == "isnctrl") || (MacroName == "isprint") ||
790 (MacroName == "ispunct") || (MacroName == "isspace") ||
791 (MacroName == "isupper") || (MacroName == "isxdigit"));
792}
793
794bool ArrayBoundChecker::isOffsetObviouslyNonnegative(const Expr *E,
795 CheckerContext &C) {
796 const ArraySubscriptExpr *ASE = getAsCleanArraySubscriptExpr(E, C);
797 if (!ASE)
798 return false;
800}
801
802bool ArrayBoundChecker::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
803 ParentMapContext &ParentCtx = ACtx.getParentMapContext();
804 do {
805 const DynTypedNodeList Parents = ParentCtx.getParents(*S);
806 if (Parents.empty())
807 return false;
808 S = Parents[0].get<Stmt>();
809 } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
810 const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
811 return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
812}
813
814bool ArrayBoundChecker::isIdiomaticPastTheEndPtr(const Expr *E,
815 ProgramStateRef State,
816 NonLoc Offset, NonLoc Limit,
817 CheckerContext &C) {
818 if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
819 auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
820 State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
821 return EqualsToThreshold && !NotEqualToThreshold;
822 }
823 return false;
824}
825
826void ento::registerArrayBoundChecker(CheckerManager &mgr) {
827 mgr.registerChecker<ArrayBoundChecker>();
828}
829
830bool ento::shouldRegisterArrayBoundChecker(const CheckerManager &mgr) {
831 return true;
832}
static std::pair< ProgramStateRef, ProgramStateRef > compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, SValBuilder &SVB, bool CheckEquality=false)
static std::string getRegionName(const MemSpaceRegion *Space, const SubRegion *Region)
static std::optional< std::pair< const SubRegion *, NonLoc > > computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location)
For a given Location that can be represented as a symbolic expression Arr[Idx] (or perhaps Arr[Idx1][...
static Messages getPrecedesMsgs(const MemSpaceRegion *Space, const SubRegion *Region, NonLoc Offset)
static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value)
static std::optional< int64_t > getConcreteValue(NonLoc SV)
static Messages getTaintMsgs(const MemSpaceRegion *Space, const SubRegion *Region, const char *OffsetName, bool AlsoMentionUnderflow)
static bool isUnsigned(SValBuilder &SVB, NonLoc Value)
static std::pair< NonLoc, nonloc::ConcreteInt > getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, SValBuilder &svalBuilder)
static bool tryDividePair(std::optional< int64_t > &Val1, std::optional< int64_t > &Val2, int64_t Divisor)
Try to divide Val1 and Val2 (in place) by Divisor and return true if it can be performed (Divisor is ...
static Messages getExceedsMsgs(ASTContext &ACtx, const MemSpaceRegion *Space, const SubRegion *Region, NonLoc Offset, NonLoc Extent, SVal Location, bool AlsoMentionUnderflow)
Expr * E
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:188
SourceManager & getSourceManager()
Definition: ASTContext.h:801
ParentMapContext & getParentMapContext()
Returns the dynamic AST node parent map context.
Definition: ASTContext.cpp:917
const LangOptions & getLangOpts() const
Definition: ASTContext.h:894
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
ArraySubscriptExpr - [C99 6.5.2.1] Array Subscripting.
Definition: Expr.h:2723
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
Container for either a single DynTypedNode or for an ArrayRef to DynTypedNode.
This represents one expression.
Definition: Expr.h:112
QualType getType() const
Definition: Expr.h:144
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1056
MemberExpr - [C99 6.5.2.3] Structure and Union Members.
Definition: Expr.h:3300
DynTypedNodeList getParents(const NodeT &Node)
Returns the parents of the given node (within the traversal scope).
A (possibly-)qualified type.
Definition: TypeBase.h:937
static std::string getAsString(SplitQualType split, const PrintingPolicy &Policy)
Definition: TypeBase.h:1332
Encodes a location in the source.
Stmt - This represents one statement.
Definition: Stmt.h:85
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Stmt.cpp:346
bool isUnsignedIntegerOrEnumerationType() const
Determines whether this is an integer type that is unsigned or an enumeration types whose underlying ...
Definition: Type.cpp:2277
bool isIncompleteType(NamedDecl **Def=nullptr) const
Types are partitioned into 3 broad categories (C99 6.2.5p1): object types, function types,...
Definition: Type.cpp:2440
bool isUnsignedIntegerType() const
Return true if this is an integer type that is unsigned, according to C99 6.2.5p6 [which returns true...
Definition: Type.cpp:2257
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2246
QualType getType() const
Definition: Value.cpp:237
A record of the "type" of an APSInt, used for conversions.
Definition: APSIntType.h:19
llvm::APSInt convert(const llvm::APSInt &Value) const LLVM_READONLY
Convert and return a new APSInt with the given value, but this type's bit width and signedness.
Definition: APSIntType.h:48
Template implementation for all binary symbolic expressions.
CHECKER * registerChecker(AT &&...Args)
Register a single-part checker (derived from Checker): construct its singleton instance,...
Simple checker classes that implement one frontend (i.e.
Definition: Checker.h:553
ElementRegion is used to represent both array elements and casts.
Definition: MemRegion.h:1227
QualType getElementType() const
Definition: MemRegion.h:1251
NonLoc getIndex() const
Definition: MemRegion.h:1247
MemRegion - The root abstract class for all memory regions.
Definition: MemRegion.h:98
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * StripCasts(bool StripBaseAndDerivedCasts=true) const
Definition: MemRegion.cpp:1457
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemSpaceRegion * getMemorySpace(ProgramStateRef State) const
Returns the most specific memory space for this memory region in the given ProgramStateRef.
Definition: MemRegion.cpp:1397
std::string getDescriptiveName(bool UseQuotes=true) const
Get descriptive name for memory region.
Definition: MemRegion.cpp:727
const RegionTy * getAs() const
Definition: MemRegion.h:1416
MemSpaceRegion - A memory region that represents a "memory space"; for example, the set of global var...
Definition: MemRegion.h:236
The tag upon which the TagVisitor reacts.
Definition: BugReporter.h:786
void markInteresting(SymbolRef sym, bugreporter::TrackingKind TKind=bugreporter::TrackingKind::Thorough)
Marks a symbol as interesting.
bool isInteresting(SymbolRef sym) const
NonLoc makeArrayIndex(uint64_t idx)
Definition: SValBuilder.h:271
ASTContext & getContext()
Definition: SValBuilder.h:149
nonloc::ConcreteInt makeIntVal(const IntegerLiteral *integer)
Definition: SValBuilder.h:277
QualType getArrayIndexType() const
Definition: SValBuilder.h:158
virtual SVal evalBinOpNN(ProgramStateRef state, BinaryOperator::Opcode op, NonLoc lhs, NonLoc rhs, QualType resultTy)=0
Create a new value which represents a binary expression with two non- location operands.
QualType getConditionType() const
Definition: SValBuilder.h:154
virtual const llvm::APSInt * getMaxValue(ProgramStateRef state, SVal val)=0
Tries to get the maximal possible (integer) value of a given SVal.
SVal - This represents a symbolic expression, which can be either an L-value or an R-value.
Definition: SVals.h:56
SymbolRef getAsSymbol(bool IncludeBaseRegions=false) const
If this SVal wraps a symbol return that SymbolRef.
Definition: SVals.cpp:103
std::optional< T > getAs() const
Convert to the specified SVal type, returning std::nullopt if this SVal is not of the desired type.
Definition: SVals.h:87
const MemRegion * getAsRegion() const
Definition: SVals.cpp:119
SubRegion - A region that subsets another larger region.
Definition: MemRegion.h:474
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * getSuperRegion() const
Definition: MemRegion.h:487
Symbolic value.
Definition: SymExpr.h:32
llvm::iterator_range< symbol_iterator > symbols() const
Definition: SymExpr.h:107
Value representing integer constant.
Definition: SVals.h:300
APSIntPtr getValue() const
Definition: SVals.h:304
Represents symbolic expression that isn't a location.
Definition: SVals.h:279
std::vector< SymbolRef > getTaintedSymbols(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Returns the tainted Symbols for a given Statement and state.
Definition: Taint.cpp:170
bool isTainted(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Check if the statement has a tainted value in the given state.
Definition: Taint.cpp:148
DefinedOrUnknownSVal getDynamicExtent(ProgramStateRef State, const MemRegion *MR, SValBuilder &SVB)
@ Full
This outputs the full clang module dependency graph suitable for use for explicitly building modules.
The JSON file list parser is used to communicate input to InstallAPI.
BinaryOperatorKind
const FunctionProtoType * T