clang 20.0.0git
Lexer.cpp
Go to the documentation of this file.
1//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the Lexer and Token interfaces.
10//
11//===----------------------------------------------------------------------===//
12
13#include "clang/Lex/Lexer.h"
14#include "UnicodeCharSets.h"
18#include "clang/Basic/LLVM.h"
28#include "clang/Lex/Token.h"
29#include "llvm/ADT/STLExtras.h"
30#include "llvm/ADT/StringExtras.h"
31#include "llvm/ADT/StringRef.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/ConvertUTF.h"
35#include "llvm/Support/MemoryBufferRef.h"
36#include "llvm/Support/NativeFormatting.h"
37#include "llvm/Support/Unicode.h"
38#include "llvm/Support/UnicodeCharRanges.h"
39#include <algorithm>
40#include <cassert>
41#include <cstddef>
42#include <cstdint>
43#include <cstring>
44#include <optional>
45#include <string>
46#include <tuple>
47#include <utility>
48
49#ifdef __SSE4_2__
50#include <nmmintrin.h>
51#endif
52
53using namespace clang;
54
55//===----------------------------------------------------------------------===//
56// Token Class Implementation
57//===----------------------------------------------------------------------===//
58
/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
// NOTE(review): the signature line ("bool Token::isObjCAtKeyword(
// tok::ObjCKeywordKind objcKey) const {") appears dropped from this
// listing -- restore from upstream before building.
  // Annotation tokens carry no identifier info, so they can never be
  // @-keywords.
  if (isAnnotation())
    return false;
  // Compare the identifier's recorded ObjC keyword kind against the one
  // the caller asked about.
  if (const IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}
67
/// getObjCKeywordID - Return the ObjC keyword kind.
// NOTE(review): the signature line ("tok::ObjCKeywordKind
// Token::getObjCKeywordID() const {") appears dropped from this listing --
// restore from upstream before building.
  if (isAnnotation())
    return tok::objc_not_keyword;
  // Tokens without identifier info (literals, punctuation, ...) cannot be
  // ObjC keywords.
  const IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}
75
76/// Determine whether the token kind starts a simple-type-specifier.
/// Determine whether the token kind starts a simple-type-specifier.
bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
  switch (getKind()) {
  // Type annotation tokens produced by the parser already denote a type,
  // unconditionally.
  case tok::annot_typename:
  case tok::annot_decltype:
  case tok::annot_pack_indexing_type:
    return true;

  case tok::kw_short:
  case tok::kw_long:
  case tok::kw___int64:
  case tok::kw___int128:
  case tok::kw_signed:
  case tok::kw_unsigned:
  case tok::kw_void:
  case tok::kw_char:
  case tok::kw_int:
  case tok::kw_half:
  case tok::kw_float:
  case tok::kw_double:
  case tok::kw___bf16:
  case tok::kw__Float16:
  case tok::kw___float128:
  case tok::kw___ibm128:
  case tok::kw_wchar_t:
  case tok::kw_bool:
  case tok::kw__Bool:
  case tok::kw__Accum:
  case tok::kw__Fract:
  case tok::kw__Sat:
// Expands to one case per type-trait keyword (__underlying_type, etc.).
#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
#include "clang/Basic/TransformTypeTraits.def"
  case tok::kw___auto_type:
  case tok::kw_char16_t:
  case tok::kw_char32_t:
  case tok::kw_typeof:
  case tok::kw_decltype:
  case tok::kw_char8_t:
    // These spellings are keywords only in some language modes; defer to
    // the identifier table to check availability under LangOpts.
    return getIdentifierInfo()->isKeyword(LangOpts);

  default:
    return false;
  }
}
120
121//===----------------------------------------------------------------------===//
122// Lexer Class Implementation
123//===----------------------------------------------------------------------===//
124
125void Lexer::anchor() {}
126
/// InitLexer - Shared constructor helper: point this lexer at the half-open
/// range [BufStart, BufEnd), begin lexing at BufPtr, and reset all per-buffer
/// state.  BufEnd[0] must be a nul terminator.
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
                           .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
                           .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  // NOTE(review): upstream clears ParsingPreprocessorDirective here; the
  // statement appears dropped from this listing -- confirm against upstream.

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;

  NewLinePtr = nullptr;
}
178
/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process.  This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
             Preprocessor &PP, bool IsFirstIncludeOfFile)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
      LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  // Lex from the beginning of the buffer.
  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
            InputFile.getBufferEnd());

  // NOTE(review): upstream calls resetExtendedTokenMode() here to pick the
  // initial comment/whitespace retention mode; the call appears dropped from
  // this listing -- confirm against upstream.
}
194
/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode (InitLexer defaults LexingRawMode to false, so it
  // must be overridden here).
  LexingRawMode = true;
}
208
/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
/// Convenience overload: derives the start location and buffer bounds from
/// FID, then delegates to the pointer-based raw-lexer constructor above.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
             const SourceManager &SM, const LangOptions &langOpts,
             bool IsFirstIncludeOfFile)
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
            FromFile.getBufferStart(), FromFile.getBufferEnd(),
            IsFirstIncludeOfFile) {}
218
// NOTE(review): the signature line ("void Lexer::resetExtendedTokenMode() {")
// appears dropped from this listing, as do the then/else statements of the
// branch below (upstream selects whitespace-keeping vs. comment-retention
// mode) -- restore from upstream before building.
  assert(PP && "Cannot reset token mode without a preprocessor");
  if (LangOpts.TraditionalCPP)
  else
}
226
/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion.  This has a variety of magic semantics that this method
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by.  This would require making
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
// NOTE(review): the first signature line ("Lexer *Lexer::Create_PragmaLexer(
// SourceLocation SpellingLoc,") and the local "SourceManager &SM =
// PP.getSourceManager();" declaration appear dropped from this listing --
// restore from upstream before building.
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want.  This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData+TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information.  This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  // NOTE(review): the statement setting the lexer's
  // ParsingPreprocessorDirective flag appears dropped from this listing --
  // confirm against upstream.

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}
276
277void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
279 this->IsAtStartOfLine = IsAtStartOfLine;
280 assert((BufferStart + Offset) <= BufferEnd);
281 BufferPtr = BufferStart + Offset;
282}
283
/// Escape, in place, every character of Str that would terminate or confuse a
/// quoted literal: backslashes and the given Quote character are prefixed with
/// a backslash, and newline/carriage-return characters become the two-byte
/// sequence '\' 'n' (a CRLF or LFCR pair collapses to a single such escape).
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type Idx = 0, Len = Str.size();
  while (Idx < Len) {
    const char C = Str[Idx];
    if (C == '\\' || C == Quote) {
      // Prefix the character with a backslash; the string grows by one.
      Str.insert(Str.begin() + Idx, '\\');
      Idx += 2;
      ++Len;
    } else if (C == '\n' || C == '\r') {
      if ((Idx < Len - 1) && (Str[Idx + 1] == '\n' || Str[Idx + 1] == '\r') &&
          Str[Idx] != Str[Idx + 1]) {
        // A mixed "\r\n" or "\n\r" pair is rewritten in place as '\' 'n'.
        Str[Idx] = '\\';
        Str[Idx + 1] = 'n';
      } else {
        // A lone newline or carriage return becomes '\' plus an inserted 'n'.
        Str[Idx] = '\\';
        Str.insert(Str.begin() + Idx + 1, 'n');
        ++Len;
      }
      Idx += 2;
    } else
      ++Idx;
  }
}
308
309std::string Lexer::Stringify(StringRef Str, bool Charify) {
310 std::string Result = std::string(Str);
311 char Quote = Charify ? '\'' : '"';
312 StringifyImpl(Result, Quote);
313 return Result;
314}
315
317
318//===----------------------------------------------------------------------===//
319// Token Spelling
320//===----------------------------------------------------------------------===//
321
/// Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
/// \param Tok the token to clean; its NeedsCleaning flag must be set.
/// \param BufPtr the token's raw characters in the source buffer.
/// \param Spelling caller-provided output buffer, at least Tok.getLength()
///        bytes long.
/// \return the number of bytes written to Spelling (always strictly less
///         than Tok.getLength(), since cleaning only removes bytes).
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (tok::isStringLiteral(Tok.getKind())) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
      Spelling[Length++] = CharAndSize.Char;
      BufPtr += CharAndSize.Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching closing
      // quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  // Copy the remaining characters one at a time, folding away trigraphs and
  // escaped newlines via getCharAndSizeNoWarn.
  while (BufPtr < BufEnd) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
    Spelling[Length++] = CharAndSize.Char;
    BufPtr += CharAndSize.Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}
372
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
// NOTE(review): the first signature line ("StringRef Lexer::getSpelling(
// SourceLocation loc,") appears dropped from this listing -- restore from
// upstream before building.
                               SmallVectorImpl<char> &buffer,
                               const SourceManager &SM,
                               const LangOptions &options,
                               bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return {};
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case: no need for cleaning.  The returned StringRef aliases the
  // source buffer directly.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.resize(length);
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  return StringRef(buffer.data(), buffer.size());
}
413
414/// getSpelling() - Return the 'spelling' of this token. The spelling of a
415/// token are the characters used to represent the token in the source file
416/// after trigraph expansion and escaped-newline folding. In particular, this
417/// wants to get the true, uncanonicalized, spelling of things like digraphs
418/// UCNs, etc.
419std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
420 const LangOptions &LangOpts, bool *Invalid) {
421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
422
423 bool CharDataInvalid = false;
424 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
425 &CharDataInvalid);
426 if (Invalid)
427 *Invalid = CharDataInvalid;
428 if (CharDataInvalid)
429 return {};
430
431 // If this token contains nothing interesting, return it directly.
432 if (!Tok.needsCleaning())
433 return std::string(TokStart, TokStart + Tok.getLength());
434
435 std::string Result;
436 Result.resize(Tok.getLength());
437 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
438 return Result;
439}
440
/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string.  The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long.  The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy).  The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      // Report an empty spelling but keep Buffer pointing at valid storage.
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly: point
  // the caller's Buffer at the internal (read-only) data.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.  In this
  // path the caller's preallocated buffer is written through; Buffer must
  // already point at writable storage of at least Tok.getLength() bytes.
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}
493
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
// NOTE(review): the first signature line ("unsigned
// Lexer::MeasureTokenLength(SourceLocation Loc,") appears dropped from this
// listing -- restore from upstream before building.
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  Token TheTok;
  // A failed relex (invalid buffer, or Loc pointing into whitespace)
  // measures as length zero.
  if (getRawToken(Loc, TheTok, SM, LangOpts))
    return 0;
  return TheTok.getLength();
}
506
/// Relex the token at the specified location.
/// \returns true if there was a failure, false on success.
// NOTE(review): the first signature line ("bool Lexer::getRawToken(
// SourceLocation Loc, Token &Result,") appears dropped from this listing --
// restore from upstream before building.
                        const SourceManager &SM,
                        const LangOptions &LangOpts,
                        bool IgnoreWhiteSpace) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
  // all obviously single-char tokens.  This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name, not
  // the token this macro expanded to.
  Loc = SM.getExpansionLoc(Loc);
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return true;

  const char *StrData = Buffer.data()+LocInfo.second;

  // Unless asked to tolerate it, a location that lands in whitespace (after
  // skipping escaped newlines) has no token to relex.
  if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
    return true;

  // Create a lexer starting at the beginning of this token.
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
                 Buffer.begin(), StrData, Buffer.end());
  TheLexer.SetCommentRetentionState(true);
  TheLexer.LexFromRawLexer(Result);
  return false;
}
540
541/// Returns the pointer that points to the beginning of line that contains
542/// the given offset, or null if the offset if invalid.
543static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
544 const char *BufStart = Buffer.data();
545 if (Offset >= Buffer.size())
546 return nullptr;
547
548 const char *LexStart = BufStart + Offset;
549 for (; LexStart != BufStart; --LexStart) {
550 if (isVerticalWhitespace(LexStart[0]) &&
551 !Lexer::isNewLineEscaped(BufStart, LexStart)) {
552 // LexStart should point at first character of logical line.
553 ++LexStart;
554 break;
555 }
556 }
557 return LexStart;
558}
559
// NOTE(review): the first signature line ("static SourceLocation
// getBeginningOfFileToken(SourceLocation Loc,") appears dropped from this
// listing -- restore from upstream before building.
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer).  We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for.  If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace.  We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}
607
// NOTE(review): the first signature line ("SourceLocation
// Lexer::GetBeginningOfToken(SourceLocation Loc,") appears dropped from this
// listing -- restore from upstream before building.
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  // Only macro-argument expansions can be mapped back through the spelling
  // location; anything else is returned unchanged.
  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  // Find the token's beginning in the spelling file, then translate the
  // backwards offset into the macro-expansion location.
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo =
      SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}
626
namespace {

/// Classification of a preprocessor directive seen while scanning a file's
/// preamble: PDK_Skipped directives may appear in a preamble and are skipped
/// over; PDK_Unknown directives end the preamble scan.
enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace
635
// NOTE(review): the first signature line ("PreambleBounds
// Lexer::ComputePreamble(StringRef Buffer,") appears dropped from this
// listing -- restore from upstream before building.
                                       const LangOptions &LangOpts,
                                       unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const SourceLocation::UIntTy StartOffset = 1;
  // NOTE(review): the declaration of 'FileLoc' (constructed from StartOffset)
  // appears dropped from this listing; it is used below -- restore from
  // upstream.
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  // If a line limit was requested, compute the byte offset of the end of the
  // last permitted line so token offsets can be compared against it below.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is.  Since we're lexing raw tokens,
      // we don't have an identifier table available.  Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("elifdef", PDK_Skipped)
              .Case("elifndef", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point.  Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    } else if (TheTok.isAtStartOfLine() &&
               TheTok.getKind() == tok::raw_identifier &&
               TheTok.getRawIdentifier() == "module" &&
               LangOpts.CPlusPlusModules) {
      // The initial global module fragment introducer "module;" is part of
      // the preamble, which runs up to the module declaration "module foo;".
      Token ModuleTok = TheTok;
      do {
        TheLexer.LexFromRawLexer(TheTok);
      } while (TheTok.getKind() == tok::comment);
      if (TheTok.getKind() != tok::semi) {
        // Not global module fragment, roll back.
        TheTok = ModuleTok;
        break;
      }
      continue;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}
789
/// getTokenPrefixLength - Return the physical (byte) offset into the token's
/// source text corresponding to the first CharNo "cleaned" characters of the
/// token starting at TokStart, accounting for trigraphs and escaped newlines.
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is.  This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting.  Skip
  // over the uninteresting characters.  If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
    TokPtr += CharAndSize.Size;
    PhysOffset += CharAndSize.Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token.  For example foo<newline>bar
  // advanced by 3 should return the location of b, not of \\.  One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return PhysOffset;
}
833
/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different that it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
// NOTE(review): the first signature line ("SourceLocation
// Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,") appears
// dropped from this listing -- restore from upstream before building.
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  // Measure the token, then back up by Offset -- but never before the
  // token's own start.
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}
868
/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
// NOTE(review): the first signature line ("bool
// Lexer::isAtStartOfMacroExpansion(SourceLocation loc,") appears dropped
// from this listing -- restore from upstream before building.
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts,
                                       SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  // Still inside a macro: recurse through the enclosing expansion.
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}
890
/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
// NOTE(review): the first signature line ("bool
// Lexer::isAtEndOfMacroExpansion(SourceLocation loc,") appears dropped from
// this listing -- restore from upstream before building.
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts,
                                     SourceLocation *MacroEnd) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  // Zero-length tokens cannot end an expansion.
  SourceLocation spellLoc = SM.getSpellingLoc(loc);
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  if (tokLen == 0)
    return false;

  // Check whether the location just past this token ends the immediate
  // expansion.
  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
  SourceLocation expansionLoc;
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  // Still inside a macro: recurse through the enclosing expansion.
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}
918
// NOTE(review): this definition's first signature line and the local
// declarations of 'Begin'/'End' (taken from Range.getBegin()/getEnd())
// appear dropped from this listing -- restore from upstream before building.
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  assert(Begin.isFileID() && End.isFileID());
  // For a token range, extend End to just past the final token.
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    if (End.isInvalid())
      return {};
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return {};

  // Both endpoints must be in the same file, in order.
  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return {};

  // NOTE(review): the final return statement constructing the resulting
  // char range appears dropped from this listing -- restore from upstream.
}
945
946// Assumes that `Loc` is in an expansion.
948 const SourceManager &SM) {
949 return SM.getSLocEntry(SM.getFileID(Loc))
950 .getExpansion()
951 .isExpansionTokenRange();
952}
953
955 const SourceManager &SM,
956 const LangOptions &LangOpts) {
959 if (Begin.isInvalid() || End.isInvalid())
960 return {};
961
962 if (Begin.isFileID() && End.isFileID())
963 return makeRangeFromFileLocs(Range, SM, LangOpts);
964
965 if (Begin.isMacroID() && End.isFileID()) {
966 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
967 return {};
969 return makeRangeFromFileLocs(Range, SM, LangOpts);
970 }
971
972 if (Begin.isFileID() && End.isMacroID()) {
973 if (Range.isTokenRange()) {
974 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
975 return {};
976 // Use the *original* end, not the expanded one in `End`.
977 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
978 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
979 return {};
980 Range.setEnd(End);
981 return makeRangeFromFileLocs(Range, SM, LangOpts);
982 }
983
984 assert(Begin.isMacroID() && End.isMacroID());
985 SourceLocation MacroBegin, MacroEnd;
986 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
987 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
988 &MacroEnd)) ||
989 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
990 &MacroEnd)))) {
991 Range.setBegin(MacroBegin);
992 Range.setEnd(MacroEnd);
993 // Use the *original* `End`, not the expanded one in `MacroEnd`.
994 if (Range.isTokenRange())
995 Range.setTokenRange(isInExpansionTokenRange(End, SM));
996 return makeRangeFromFileLocs(Range, SM, LangOpts);
997 }
998
999 bool Invalid = false;
1000 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
1001 &Invalid);
1002 if (Invalid)
1003 return {};
1004
1005 if (BeginEntry.getExpansion().isMacroArgExpansion()) {
1006 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
1007 &Invalid);
1008 if (Invalid)
1009 return {};
1010
1011 if (EndEntry.getExpansion().isMacroArgExpansion() &&
1012 BeginEntry.getExpansion().getExpansionLocStart() ==
1013 EndEntry.getExpansion().getExpansionLocStart()) {
1014 Range.setBegin(SM.getImmediateSpellingLoc(Begin));
1015 Range.setEnd(SM.getImmediateSpellingLoc(End));
1016 return makeFileCharRange(Range, SM, LangOpts);
1017 }
1018 }
1019
1020 return {};
1021}
1022
1024 const SourceManager &SM,
1025 const LangOptions &LangOpts,
1026 bool *Invalid) {
1027 Range = makeFileCharRange(Range, SM, LangOpts);
1028 if (Range.isInvalid()) {
1029 if (Invalid) *Invalid = true;
1030 return {};
1031 }
1032
1033 // Break down the source location.
1034 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
1035 if (beginInfo.first.isInvalid()) {
1036 if (Invalid) *Invalid = true;
1037 return {};
1038 }
1039
1040 unsigned EndOffs;
1041 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
1042 beginInfo.second > EndOffs) {
1043 if (Invalid) *Invalid = true;
1044 return {};
1045 }
1046
1047 // Try to the load the file buffer.
1048 bool invalidTemp = false;
1049 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1050 if (invalidTemp) {
1051 if (Invalid) *Invalid = true;
1052 return {};
1053 }
1054
1055 if (Invalid) *Invalid = false;
1056 return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1057}
1058
1060 const SourceManager &SM,
1061 const LangOptions &LangOpts) {
1062 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1063
1064 // Find the location of the immediate macro expansion.
1065 while (true) {
1066 FileID FID = SM.getFileID(Loc);
1067 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1068 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1069 Loc = Expansion.getExpansionLocStart();
1070 if (!Expansion.isMacroArgExpansion())
1071 break;
1072
1073 // For macro arguments we need to check that the argument did not come
1074 // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1075
1076 // Loc points to the argument id of the macro definition, move to the
1077 // macro expansion.
1078 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1079 SourceLocation SpellLoc = Expansion.getSpellingLoc();
1080 if (SpellLoc.isFileID())
1081 break; // No inner macro.
1082
1083 // If spelling location resides in the same FileID as macro expansion
1084 // location, it means there is no inner macro.
1085 FileID MacroFID = SM.getFileID(Loc);
1086 if (SM.isInFileID(SpellLoc, MacroFID))
1087 break;
1088
1089 // Argument came from inner macro.
1090 Loc = SpellLoc;
1091 }
1092
1093 // Find the spelling location of the start of the non-argument expansion
1094 // range. This is where the macro name was spelled in order to begin
1095 // expanding this macro.
1096 Loc = SM.getSpellingLoc(Loc);
1097
1098 // Dig out the buffer where the macro name was spelled and the extents of the
1099 // name so that we can render it into the expansion note.
1100 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1101 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1102 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1103 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1104}
1105
1107 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1108 assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1109 // Walk past macro argument expansions.
1110 while (SM.isMacroArgExpansion(Loc))
1111 Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1112
1113 // If the macro's spelling isn't FileID or from scratch space, then it's
1114 // actually a token paste or stringization (or similar) and not a macro at
1115 // all.
1116 SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1117 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1118 return {};
1119
1120 // Find the spelling location of the start of the non-argument expansion
1121 // range. This is where the macro name was spelled in order to begin
1122 // expanding this macro.
1123 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1124
1125 // Dig out the buffer where the macro name was spelled and the extents of the
1126 // name so that we can render it into the expansion note.
1127 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1128 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1129 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1130 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1131}
1132
1134 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1135}
1136
1137bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1138 assert(isVerticalWhitespace(Str[0]));
1139 if (Str - 1 < BufferStart)
1140 return false;
1141
1142 if ((Str[0] == '\n' && Str[-1] == '\r') ||
1143 (Str[0] == '\r' && Str[-1] == '\n')) {
1144 if (Str - 2 < BufferStart)
1145 return false;
1146 --Str;
1147 }
1148 --Str;
1149
1150 // Rewind to first non-space character:
1151 while (Str > BufferStart && isHorizontalWhitespace(*Str))
1152 --Str;
1153
1154 return *Str == '\\';
1155}
1156
1158 const SourceManager &SM) {
1159 if (Loc.isInvalid() || Loc.isMacroID())
1160 return {};
1161 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1162 if (LocInfo.first.isInvalid())
1163 return {};
1164 bool Invalid = false;
1165 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1166 if (Invalid)
1167 return {};
1168 const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1169 if (!Line)
1170 return {};
1171 StringRef Rest = Buffer.substr(Line - Buffer.data());
1172 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1173 return NumWhitespaceChars == StringRef::npos
1174 ? ""
1175 : Rest.take_front(NumWhitespaceChars);
1176}
1177
1178//===----------------------------------------------------------------------===//
1179// Diagnostics forwarding code.
1180//===----------------------------------------------------------------------===//
1181
1182/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1183/// lexer buffer was all expanded at a single point, perform the mapping.
1184/// This is currently only used for _Pragma implementation, so it is the slow
1185/// path of the hot getSourceLocation method. Do not allow it to be inlined.
1186static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1187 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1189 SourceLocation FileLoc,
1190 unsigned CharNo, unsigned TokLen) {
1191 assert(FileLoc.isMacroID() && "Must be a macro expansion");
1192
1193 // Otherwise, we're lexing "mapped tokens". This is used for things like
1194 // _Pragma handling. Combine the expansion location of FileLoc with the
1195 // spelling location.
1197
1198 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1199 // characters come from spelling(FileLoc)+Offset.
1200 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1201 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1202
1203 // Figure out the expansion loc range, which is the range covered by the
1204 // original _Pragma(...) sequence.
1205 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1206
1207 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1208}
1209
1210/// getSourceLocation - Return a source location identifier for the specified
1211/// offset in the current file.
1213 unsigned TokLen) const {
1214 assert(Loc >= BufferStart && Loc <= BufferEnd &&
1215 "Location out of range for this buffer!");
1216
1217 // In the normal case, we're just lexing from a simple file buffer, return
1218 // the file id from FileLoc with the offset specified.
1219 unsigned CharNo = Loc-BufferStart;
1220 if (FileLoc.isFileID())
1221 return FileLoc.getLocWithOffset(CharNo);
1222
1223 // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1224 // tokens are lexed from where the _Pragma was defined.
1225 assert(PP && "This doesn't work on raw lexers");
1226 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1227}
1228
/// Diag - Forwarding function for diagnostics. This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
/// Requires a full Lexer (with a Preprocessor); not usable from raw lexers.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}
1234
1235//===----------------------------------------------------------------------===//
1236// Trigraph and Escaped Newline Handling Code.
1237//===----------------------------------------------------------------------===//
1238
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: "??" followed by Source[i] decodes to Replacement[i].
  static const char Source[] = "=)(!'>/<-";
  static const char Replacement[] = "#][|^}\\{~";
  if (Letter == '\0')
    return 0;
  const char *Pos = std::strchr(Source, Letter);
  return Pos ? Replacement[Pos - Source] : 0;
}
1255
1256/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1257/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1258/// return the result character. Finally, emit a warning about trigraph use
1259/// whether trigraphs are enabled or not.
1260static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1261 char Res = GetTrigraphCharForLetter(*CP);
1262 if (!Res)
1263 return Res;
1264
1265 if (!Trigraphs) {
1266 if (L && !L->isLexingRawMode())
1267 L->Diag(CP-2, diag::trigraph_ignored);
1268 return 0;
1269 }
1270
1271 if (L && !L->isLexingRawMode())
1272 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1273 return Res;
1274}
1275
1276/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1277/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1278/// trigraph equivalent on entry to this function.
1279unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1280 unsigned Size = 0;
1281 while (isWhitespace(Ptr[Size])) {
1282 ++Size;
1283
1284 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1285 continue;
1286
1287 // If this is a \r\n or \n\r, skip the other half.
1288 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1289 Ptr[Size-1] != Ptr[Size])
1290 ++Size;
1291
1292 return Size;
1293 }
1294
1295 // Not an escaped newline, must be a \t or something else.
1296 return 0;
1297}
1298
1299/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1300/// them), skip over them and return the first non-escaped-newline found,
1301/// otherwise return P.
1302const char *Lexer::SkipEscapedNewLines(const char *P) {
1303 while (true) {
1304 const char *AfterEscape;
1305 if (*P == '\\') {
1306 AfterEscape = P+1;
1307 } else if (*P == '?') {
1308 // If not a trigraph for escape, bail out.
1309 if (P[1] != '?' || P[2] != '/')
1310 return P;
1311 // FIXME: Take LangOpts into account; the language might not
1312 // support trigraphs.
1313 AfterEscape = P+3;
1314 } else {
1315 return P;
1316 }
1317
1318 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1319 if (NewLineSize == 0) return P;
1320 P = AfterEscape+NewLineSize;
1321 }
1322}
1323
1325 const SourceManager &SM,
1326 const LangOptions &LangOpts,
1327 bool IncludeComments) {
1328 if (Loc.isMacroID()) {
1329 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1330 return std::nullopt;
1331 }
1332 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1333
1334 // Break down the source location.
1335 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1336
1337 // Try to load the file buffer.
1338 bool InvalidTemp = false;
1339 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1340 if (InvalidTemp)
1341 return std::nullopt;
1342
1343 const char *TokenBegin = File.data() + LocInfo.second;
1344
1345 // Lex from the start of the given location.
1346 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1347 TokenBegin, File.end());
1348 lexer.SetCommentRetentionState(IncludeComments);
1349 // Find the token.
1350 Token Tok;
1351 lexer.LexFromRawLexer(Tok);
1352 return Tok;
1353}
1354
1356 const SourceManager &SM,
1357 const LangOptions &LangOpts,
1358 bool IncludeComments) {
1359 const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Loc));
1360 while (Loc != StartOfFile) {
1361 Loc = Loc.getLocWithOffset(-1);
1362 if (Loc.isInvalid())
1363 return std::nullopt;
1364
1365 Loc = GetBeginningOfToken(Loc, SM, LangOpts);
1366 Token Tok;
1367 if (getRawToken(Loc, Tok, SM, LangOpts))
1368 continue; // Not a token, go to prev location.
1369 if (!Tok.is(tok::comment) || IncludeComments) {
1370 return Tok;
1371 }
1372 }
1373 return std::nullopt;
1374}
1375
1376/// Checks that the given token is the first token that occurs after the
1377/// given location (this excludes comments and whitespace). Returns the location
1378/// immediately after the specified token. If the token is not found or the
1379/// location is inside a macro, the returned source location will be invalid.
1382 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1383 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1384 if (!Tok || Tok->isNot(TKind))
1385 return {};
1386 SourceLocation TokenLoc = Tok->getLocation();
1387
1388 // Calculate how much whitespace needs to be skipped if any.
1389 unsigned NumWhitespaceChars = 0;
1390 if (SkipTrailingWhitespaceAndNewLine) {
1391 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1392 unsigned char C = *TokenEnd;
1393 while (isHorizontalWhitespace(C)) {
1394 C = *(++TokenEnd);
1395 NumWhitespaceChars++;
1396 }
1397
1398 // Skip \r, \n, \r\n, or \n\r
1399 if (C == '\n' || C == '\r') {
1400 char PrevC = C;
1401 C = *(++TokenEnd);
1402 NumWhitespaceChars++;
1403 if ((C == '\n' || C == '\r') && C != PrevC)
1404 NumWhitespaceChars++;
1405 }
1406 }
1407
1408 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1409}
1410
1411/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1412/// get its size, and return it. This is tricky in several cases:
1413/// 1. If currently at the start of a trigraph, we warn about the trigraph,
1414/// then either return the trigraph (skipping 3 chars) or the '?',
1415/// depending on whether trigraphs are enabled or not.
1416/// 2. If this is an escaped newline (potentially with whitespace between
1417/// the backslash and newline), implicitly skip the newline and return
1418/// the char after it.
1419///
1420/// This handles the slow/uncommon case of the getCharAndSize method. Here we
1421/// know that we can accumulate into Size, and that we have already incremented
1422/// Ptr by Size bytes.
1423///
1424/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1425/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
// Also reached from the trigraph case below when "??/" decodes to '\'.
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      // (Only when lexing a real token: Tok is null for speculative peeks.)
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field; the recursion
      // handles consecutive escaped newlines/trigraphs after this one.
      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning. If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // "??/" is the trigraph spelling of '\' and may itself begin an
      // escaped newline, so re-enter the slash logic above.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}
1480
1481/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1482/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1483/// and that we have already incremented Ptr by Size bytes.
1484///
1485/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1486/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
                                                 const LangOptions &LangOpts) {

  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
// Also reached from the trigraph case below when "??/" decodes to '\'.
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field; the recursion
      // handles consecutive escaped newlines/trigraphs after this one.
      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // "??/" decodes to '\' and may itself begin an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}
1531
1532//===----------------------------------------------------------------------===//
1533// Helper methods for lexing.
1534//===----------------------------------------------------------------------===//
1535
1536/// Routine that indiscriminately sets the offset into the source file.
1537void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1538 BufferPtr = BufferStart + Offset;
1539 if (BufferPtr > BufferEnd)
1540 BufferPtr = BufferEnd;
1541 // FIXME: What exactly does the StartOfLine bit mean? There are two
1542 // possible meanings for the "start" of the line: the first token on the
1543 // unexpanded line, or the first token on the expanded line.
1544 IsAtStartOfLine = StartOfLine;
1545 IsAtPhysicalStartOfLine = StartOfLine;
1546}
1547
1548static bool isUnicodeWhitespace(uint32_t Codepoint) {
1549 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1551 return UnicodeWhitespaceChars.contains(Codepoint);
1552}
1553
1555 llvm::SmallString<5> CharBuf;
1556 llvm::raw_svector_ostream CharOS(CharBuf);
1557 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1558 return CharBuf;
1559}
1560
1561// To mitigate https://github.com/llvm/llvm-project/issues/54732,
1562// we allow "Mathematical Notation Characters" in identifiers.
1563// This is a proposed profile that extends the XID_Start/XID_continue
1564// with mathematical symbols, superscipts and subscripts digits
1565// found in some production software.
1566// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1567static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1568 bool IsStart, bool &IsExtension) {
1569 static const llvm::sys::UnicodeCharSet MathStartChars(
1571 static const llvm::sys::UnicodeCharSet MathContinueChars(
1573 if (MathStartChars.contains(C) ||
1574 (!IsStart && MathContinueChars.contains(C))) {
1575 IsExtension = true;
1576 return true;
1577 }
1578 return false;
1579}
1580
1581static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1582 bool &IsExtension) {
1583 if (LangOpts.AsmPreprocessor) {
1584 return false;
1585 } else if (LangOpts.DollarIdents && '$' == C) {
1586 return true;
1587 } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1588 // A non-leading codepoint must have the XID_Continue property.
1589 // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1590 // so we need to check both tables.
1591 // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1592 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1593 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1594 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1595 return true;
1596 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1597 IsExtension);
1598 } else if (LangOpts.C11) {
1599 static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1601 return C11AllowedIDChars.contains(C);
1602 } else {
1603 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1605 return C99AllowedIDChars.contains(C);
1606 }
1607}
1608
1609static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1610 bool &IsExtension) {
1611 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1612 IsExtension = false;
1613 if (LangOpts.AsmPreprocessor) {
1614 return false;
1615 }
1616 if (LangOpts.CPlusPlus || LangOpts.C23) {
1617 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1618 if (XIDStartChars.contains(C))
1619 return true;
1620 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1621 IsExtension);
1622 }
1623 if (!isAllowedIDChar(C, LangOpts, IsExtension))
1624 return false;
1625 if (LangOpts.C11) {
1626 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1628 return !C11DisallowedInitialIDChars.contains(C);
1629 }
1630 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1632 return !C99DisallowedInitialIDChars.contains(C);
1633}
1634
1637
1638 static const llvm::sys::UnicodeCharSet MathStartChars(
1640 static const llvm::sys::UnicodeCharSet MathContinueChars(
1642
1643 (void)MathStartChars;
1644 (void)MathContinueChars;
1645 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1646 "Unexpected mathematical notation codepoint");
1647 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1649}
1650
1651static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1652 const char *End) {
1654 L.getSourceLocation(End));
1655}
1656
1657static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1658 CharSourceRange Range, bool IsFirst) {
1659 // Check C99 compatibility.
1660 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1661 enum {
1662 CannotAppearInIdentifier = 0,
1663 CannotStartIdentifier
1664 };
1665
1666 static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1668 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1670 if (!C99AllowedIDChars.contains(C)) {
1671 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1672 << Range
1673 << CannotAppearInIdentifier;
1674 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1675 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1676 << Range
1677 << CannotStartIdentifier;
1678 }
1679 }
1680}
1681
1682/// After encountering UTF-8 character C and interpreting it as an identifier
1683/// character, check whether it's a homoglyph for a common non-identifier
1684/// source character that is unlikely to be an intentional identifier
1685/// character and warn if so.
1688 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1689 struct HomoglyphPair {
1690 uint32_t Character;
1691 char LooksLike;
1692 bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1693 };
1694 static constexpr HomoglyphPair SortedHomoglyphs[] = {
1695 {U'\u00ad', 0}, // SOFT HYPHEN
1696 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1697 {U'\u037e', ';'}, // GREEK QUESTION MARK
1698 {U'\u200b', 0}, // ZERO WIDTH SPACE
1699 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1700 {U'\u200d', 0}, // ZERO WIDTH JOINER
1701 {U'\u2060', 0}, // WORD JOINER
1702 {U'\u2061', 0}, // FUNCTION APPLICATION
1703 {U'\u2062', 0}, // INVISIBLE TIMES
1704 {U'\u2063', 0}, // INVISIBLE SEPARATOR
1705 {U'\u2064', 0}, // INVISIBLE PLUS
1706 {U'\u2212', '-'}, // MINUS SIGN
1707 {U'\u2215', '/'}, // DIVISION SLASH
1708 {U'\u2216', '\\'}, // SET MINUS
1709 {U'\u2217', '*'}, // ASTERISK OPERATOR
1710 {U'\u2223', '|'}, // DIVIDES
1711 {U'\u2227', '^'}, // LOGICAL AND
1712 {U'\u2236', ':'}, // RATIO
1713 {U'\u223c', '~'}, // TILDE OPERATOR
1714 {U'\ua789', ':'}, // MODIFIER LETTER COLON
1715 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1716 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1717 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1718 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1719 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1720 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1721 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1722 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1723 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1724 {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1725 {U'\uff0c', ','}, // FULLWIDTH COMMA
1726 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1727 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1728 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1729 {U'\uff1a', ':'}, // FULLWIDTH COLON
1730 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1731 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1732 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1733 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1734 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1735 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1736 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1737 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1738 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1739 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1740 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1741 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1742 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1743 {U'\uff5e', '~'}, // FULLWIDTH TILDE
1744 {0, 0}
1745 };
1746 auto Homoglyph =
1747 std::lower_bound(std::begin(SortedHomoglyphs),
1748 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1749 if (Homoglyph->Character == C) {
1750 if (Homoglyph->LooksLike) {
1751 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1752 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1753 << Range << codepointAsHexString(C) << LooksLikeStr;
1754 } else {
1755 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1757 }
1758 }
1759}
1760
1762 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1763 CharSourceRange Range, bool IsFirst) {
1764 if (isASCII(CodePoint))
1765 return;
1766
1767 bool IsExtension;
1768 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1769 bool IsIDContinue =
1770 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1771
1772 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1773 return;
1774
1775 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1776
1777 if (!IsFirst || InvalidOnlyAtStart) {
1778 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1779 << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1781 } else {
1782 Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1783 << Range << codepointAsHexString(CodePoint)
1785 }
1786}
1787
1788bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1789 Token &Result) {
1790 const char *UCNPtr = CurPtr + Size;
1791 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1792 if (CodePoint == 0) {
1793 return false;
1794 }
1795 bool IsExtension = false;
1796 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1797 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1798 return false;
1802 PP->getDiagnostics(), LangOpts, CodePoint,
1803 makeCharRange(*this, CurPtr, UCNPtr),
1804 /*IsFirst=*/false);
1805
1806 // We got a unicode codepoint that is neither a space nor a
1807 // a valid identifier part.
1808 // Carry on as if the codepoint was valid for recovery purposes.
1809 } else if (!isLexingRawMode()) {
1810 if (IsExtension)
1812 makeCharRange(*this, CurPtr, UCNPtr));
1813
1815 makeCharRange(*this, CurPtr, UCNPtr),
1816 /*IsFirst=*/false);
1817 }
1818
1819 Result.setFlag(Token::HasUCN);
1820 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1821 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1822 CurPtr = UCNPtr;
1823 else
1824 while (CurPtr != UCNPtr)
1825 (void)getAndAdvanceChar(CurPtr, Result);
1826 return true;
1827}
1828
/// Try to consume one multi-byte UTF-8 encoded codepoint as part of an
/// identifier that is currently being lexed.
///
/// \param CurPtr [in,out] points at the first code unit (or at a splicing
///               backslash preceding it); advanced past the codepoint on
///               success, left untouched on failure.
/// \param Result the token being built; marked as needing cleaning via
///               ConsumeChar on success.
/// \returns true if a codepoint usable in an identifier was consumed.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
  llvm::UTF32 CodePoint;

  // If a UTF-8 codepoint appears immediately after an escaped new line,
  // CurPtr may point to the splicing \ on the preceding line,
  // so we need to skip it.
  unsigned FirstCodeUnitSize;
  getCharAndSize(CurPtr, FirstCodeUnitSize);
  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
  const char *UnicodePtr = CharStart;

  // Strictly decode one UTF-8 sequence; any ill-formed sequence bails out.
  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
      (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
      &CodePoint, llvm::strictConversion);
  if (ConvResult != llvm::conversionOK)
    return false;

  bool IsExtension = false;
  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
                       IsExtension)) {
    // ASCII or whitespace codepoints never continue an identifier.
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;

        PP->getDiagnostics(), LangOpts, CodePoint,
        makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
    // We got a unicode codepoint that is neither a space nor a
    // valid identifier part. Carry on as if the codepoint was
    // valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
          PP->getDiagnostics(), CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr));
        makeCharRange(*this, CharStart, UnicodePtr),
        /*IsFirst=*/false);
        makeCharRange(*this, CharStart, UnicodePtr));
  }

  // Once we successfully parsed some UTF-8,
  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
  // being lexed, and that warnings about trailing spaces are emitted.
  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
  CurPtr = UnicodePtr;
  return true;
}
1879
/// Lex a token that begins with a non-ASCII codepoint \p C (already decoded
/// by the caller, with CurPtr pointing just past its spelling).
///
/// \returns true if a token was produced in \p Result, false if the
/// character was dropped and the caller should continue lexing.
bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                      const char *CurPtr) {
  bool IsExtension = false;
  // If the codepoint may begin an identifier, hand off to the common
  // identifier-continuation path after any compatibility diagnostics.
  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    if (IsExtension)
        makeCharRange(*this, BufferPtr, CurPtr));
        makeCharRange(*this, BufferPtr, CurPtr),
        /*IsFirst=*/true);
        makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);
  }

      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
      PP->getDiagnostics(), LangOpts, C,
      makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}
1925
/// Advance CurPtr over a run of ASCII identifier characters
/// ([_A-Za-z0-9]*) and return the first position past that run.
///
/// On SSE4.2 targets, 16 bytes are classified at a time with PCMPISTRI;
/// otherwise a simple byte loop is used. BufferEnd bounds the vector loop
/// and is unused in the scalar fallback.
static const char *
fastParseASCIIIdentifier(const char *CurPtr,
                         [[maybe_unused]] const char *BufferEnd) {
#ifdef __SSE4_2__
  // Range pairs for PCMPISTRI "ranges" mode: [_,_], [A,Z], [a,z], [0,9];
  // the remaining bytes are zero-filled terminators.
  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
      '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
  };
  constexpr ssize_t BytesPerRegister = 16;

  __m128i AsciiIdentifierRangeV =
      _mm_load_si128((const __m128i *)AsciiIdentifierRange);

  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
    __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));

    // Index of the first byte outside the identifier ranges; 16 means every
    // byte in the register was an identifier character.
    int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
    CurPtr += Consumed;
    if (Consumed == BytesPerRegister)
      continue;
    return CurPtr;
  }
#endif

  // Scalar fallback (and tail handling near the end of the buffer).
  unsigned char C = *CurPtr;
    C = *++CurPtr;
  return CurPtr;
}
1956
/// Lex the remainder of an identifier whose start has already been consumed.
///
/// Forms a tok::raw_identifier in \p Result covering [BufferPtr, end of
/// identifier). In raw mode that is all; otherwise the identifier is looked
/// up in the identifier table and possibly handed to the preprocessor.
/// \returns true if a token was produced, false if the caller should re-lex.
bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched an identifier start.

  while (true) {

    // Fast path: bulk-skip plain ASCII identifier characters.
    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);

    unsigned Size;
    // Slow path: handle trigraph, unicode codepoints, UCNs.
    unsigned char C = getCharAndSize(CurPtr, Size);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw. There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion, it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that code completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we need
    // to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}
2035
2036/// isHexaLiteral - Return true if Start points to a hex constant.
2037/// in microsoft mode (where this is supposed to be several different tokens).
2038bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2039 auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
2040 char C1 = CharAndSize1.Char;
2041 if (C1 != '0')
2042 return false;
2043
2044 auto CharAndSize2 =
2045 Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
2046 char C2 = CharAndSize2.Char;
2047 return (C2 == 'x' || C2 == 'X');
2048}
2049
2050/// LexNumericConstant - Lex the remainder of a integer or floating point
2051/// constant. From[-1] is the first character lexed. Return the end of the
2052/// constant.
2053bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2054 unsigned Size;
2055 char C = getCharAndSize(CurPtr, Size);
2056 char PrevCh = 0;
2057 while (isPreprocessingNumberBody(C)) {
2058 CurPtr = ConsumeChar(CurPtr, Size, Result);
2059 PrevCh = C;
2060 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2061 CurPtr -= Size;
2062 break;
2063 }
2064 C = getCharAndSize(CurPtr, Size);
2065 }
2066
2067 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2068 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2069 // If we are in Microsoft mode, don't continue if the constant is hex.
2070 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2071 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2072 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2073 }
2074
2075 // If we have a hex FP constant, continue.
2076 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2077 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2078 // not-quite-conforming extension. Only do so if this looks like it's
2079 // actually meant to be a hexfloat, and not if it has a ud-suffix.
2080 bool IsHexFloat = true;
2081 if (!LangOpts.C99) {
2082 if (!isHexaLiteral(BufferPtr, LangOpts))
2083 IsHexFloat = false;
2084 else if (!LangOpts.CPlusPlus17 &&
2085 std::find(BufferPtr, CurPtr, '_') != CurPtr)
2086 IsHexFloat = false;
2087 }
2088 if (IsHexFloat)
2089 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2090 }
2091
2092 // If we have a digit separator, continue.
2093 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2094 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2095 if (isAsciiIdentifierContinue(Next)) {
2096 if (!isLexingRawMode())
2097 Diag(CurPtr, LangOpts.CPlusPlus
2098 ? diag::warn_cxx11_compat_digit_separator
2099 : diag::warn_c23_compat_digit_separator);
2100 CurPtr = ConsumeChar(CurPtr, Size, Result);
2101 CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2102 return LexNumericConstant(Result, CurPtr);
2103 }
2104 }
2105
2106 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2107 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2108 return LexNumericConstant(Result, CurPtr);
2109 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2110 return LexNumericConstant(Result, CurPtr);
2111
2112 // Update the location of token as well as BufferPtr.
2113 const char *TokStart = BufferPtr;
2114 FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2115 Result.setLiteralData(TokStart);
2116 return true;
2117}
2118
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
///
/// \param Result          the literal token being extended; receives the
///                        HasUDSuffix flag when a suffix is accepted.
/// \param CurPtr          points just past the closing quote of the literal.
/// \param IsStringLiteral true for string literals, false for character
///                        literals (affects which reserved suffixes are
///                        tolerated in C++14).
/// \returns the position just past the consumed suffix (or \p CurPtr
/// unchanged if there is none).
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(LangOpts.CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  bool Consumed = false;

  if (!isAsciiIdentifierStart(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      Consumed = true;
    else
      return CurPtr;
  }

  // Before C++11 a ud-suffix is not valid; warn for compatibility and treat
  // the identifier as a separate token.
  if (!LangOpts.CPlusPlus11) {
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        auto [Next, NextSize] =
            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
        if (!isAsciiIdentifierContinue(Next)) {
          // End of suffix. Check whether this is on the allowed list.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix =
              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
      return CurPtr;
    }

    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  // Consume the remainder of the suffix (identifier-continue characters,
  // UCNs, or UTF-8 codepoints).
  Result.setFlag(Token::HasUDSuffix);
  while (true) {
    C = getCharAndSize(CurPtr, Size);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
    } else
      break;
  }

  return CurPtr;
}
2210
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
///
/// \param Result the token to fill in; its literal data points at the
///               spelling start (BufferPtr).
/// \param CurPtr points just past the opening double quote.
/// \param Kind   which string-literal token kind is being lexed.
/// \returns true (a token is always produced, possibly tok::unknown on an
/// unterminated literal).
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  // Unicode string literals are a C++11/C11 feature; warn for older modes.
  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // An embedded nul may be a code-completion point; otherwise remember
      // it so we can warn below.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2271
2272/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2273/// having lexed R", LR", u8R", uR", or UR".
2274bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2275 tok::TokenKind Kind) {
2276 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2277 // Between the initial and final double quote characters of the raw string,
2278 // any transformations performed in phases 1 and 2 (trigraphs,
2279 // universal-character-names, and line splicing) are reverted.
2280
2281 if (!isLexingRawMode())
2282 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2283
2284 unsigned PrefixLen = 0;
2285
2286 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
2287 if (!isLexingRawMode() &&
2288 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2289 const char *Pos = &CurPtr[PrefixLen];
2290 Diag(Pos, LangOpts.CPlusPlus26
2291 ? diag::warn_cxx26_compat_raw_string_literal_character_set
2292 : diag::ext_cxx26_raw_string_literal_character_set)
2293 << StringRef(Pos, 1);
2294 }
2295 ++PrefixLen;
2296 }
2297
2298 // If the last character was not a '(', then we didn't lex a valid delimiter.
2299 if (CurPtr[PrefixLen] != '(') {
2300 if (!isLexingRawMode()) {
2301 const char *PrefixEnd = &CurPtr[PrefixLen];
2302 if (PrefixLen == 16) {
2303 Diag(PrefixEnd, diag::err_raw_delim_too_long);
2304 } else if (*PrefixEnd == '\n') {
2305 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2306 } else {
2307 Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2308 << StringRef(PrefixEnd, 1);
2309 }
2310 }
2311
2312 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2313 // it's possible the '"' was intended to be part of the raw string, but
2314 // there's not much we can do about that.
2315 while (true) {
2316 char C = *CurPtr++;
2317
2318 if (C == '"')
2319 break;
2320 if (C == 0 && CurPtr-1 == BufferEnd) {
2321 --CurPtr;
2322 break;
2323 }
2324 }
2325
2326 FormTokenWithChars(Result, CurPtr, tok::unknown);
2327 return true;
2328 }
2329
2330 // Save prefix and move CurPtr past it
2331 const char *Prefix = CurPtr;
2332 CurPtr += PrefixLen + 1; // skip over prefix and '('
2333
2334 while (true) {
2335 char C = *CurPtr++;
2336
2337 if (C == ')') {
2338 // Check for prefix match and closing quote.
2339 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2340 CurPtr += PrefixLen + 1; // skip over prefix and '"'
2341 break;
2342 }
2343 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2344 if (!isLexingRawMode())
2345 Diag(BufferPtr, diag::err_unterminated_raw_string)
2346 << StringRef(Prefix, PrefixLen);
2347 FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2348 return true;
2349 }
2350 }
2351
2352 // If we are in C++11, lex the optional ud-suffix.
2353 if (LangOpts.CPlusPlus)
2354 CurPtr = LexUDSuffix(Result, CurPtr, true);
2355
2356 // Update the location of token as well as BufferPtr.
2357 const char *TokStart = BufferPtr;
2358 FormTokenWithChars(Result, CurPtr, Kind);
2359 Result.setLiteralData(TokStart);
2360 return true;
2361}
2362
2363/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2364/// after having lexed the '<' character. This is used for #include filenames.
2365bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2366 // Does this string contain the \0 character?
2367 const char *NulCharacter = nullptr;
2368 const char *AfterLessPos = CurPtr;
2369 char C = getAndAdvanceChar(CurPtr, Result);
2370 while (C != '>') {
2371 // Skip escaped characters. Escaped newlines will already be processed by
2372 // getAndAdvanceChar.
2373 if (C == '\\')
2374 C = getAndAdvanceChar(CurPtr, Result);
2375
2376 if (isVerticalWhitespace(C) || // Newline.
2377 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2378 // If the filename is unterminated, then it must just be a lone <
2379 // character. Return this as such.
2380 FormTokenWithChars(Result, AfterLessPos, tok::less);
2381 return true;
2382 }
2383
2384 if (C == 0) {
2385 if (isCodeCompletionPoint(CurPtr - 1)) {
2386 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2387 cutOffLexing();
2388 FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2389 return true;
2390 }
2391 NulCharacter = CurPtr-1;
2392 }
2393 C = getAndAdvanceChar(CurPtr, Result);
2394 }
2395
2396 // If a nul character existed in the string, warn about it.
2397 if (NulCharacter && !isLexingRawMode())
2398 Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2399
2400 // Update the location of token as well as BufferPtr.
2401 const char *TokStart = BufferPtr;
2402 FormTokenWithChars(Result, CurPtr, tok::header_name);
2403 Result.setLiteralData(TokStart);
2404 return true;
2405}
2406
/// Set up code completion for a partially-typed #include filename.
///
/// \param PathStart       start of the path text inside the quotes/brackets.
/// \param CompletionPoint position of the completion marker within the path.
/// \param IsAngled        true for <...> includes, false for "..." includes.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // MSVC also accepts '\' as a path separator in includes.
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    // A separator ends the filename component being completed.
    if (SlashChars.contains(Next))
      break;
  }

      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}
2439
/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
///
/// \param Result the token to fill in; its literal data points at the
///               spelling start (BufferPtr).
/// \param CurPtr points just past the opening single quote.
/// \param Kind   which character-constant token kind is being lexed.
/// \returns true (a token is always produced, possibly tok::unknown for an
/// empty or unterminated constant).
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  // u'/U'/u8' literals are newer-standard features; warn for older modes.
  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx14_compat_u8_character_literal
                          : diag::warn_c17_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A nul may be a code-completion point; otherwise remember it so we
      // can warn below.
      if (isCodeCompletionPoint(CurPtr-1)) {
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}
2506
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
/// \param Result                  receives a tok::unknown whitespace token in
///                                keep-whitespace mode.
/// \param CurPtr                  points just past the first whitespace char.
/// \param TokAtPhysicalStartOfLine set when a newline was crossed, so the
///                                next token starts a physical line.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Track the last newline seen, and record the first one in NewLinePtr, so
  // empty-line ranges can be reported to the EmptylineHandler below.
  const char *lastNewLine = nullptr;
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // If more than one newline was skipped, report the blank range to any
    // registered empty-line handler.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}
2580
/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
///
/// \param Result                   may receive a comment token (keep-comment
///                                 mode) or a handler-inserted token.
/// \param CurPtr                   points just past the "//" introducer.
/// \param TokAtPhysicalStartOfLine set when the trailing newline is consumed.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment. The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them. As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    if (!isASCII(C)) {
      // Validate (but do not decode) the UTF-8 sequence; a size of 0 means
      // it is ill-formed, and we advance one byte at a time past it.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
    // properly decode the character. Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs. If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment. Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline. Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character. We don't care if this is a \n\r or
  // \r\n sequence. This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness. Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}
2750
2751/// If in save-comment mode, package up this Line comment in an appropriate
2752/// way and return it.
2753bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2754 // If we're not in a preprocessor directive, just return the // comment
2755 // directly.
2756 FormTokenWithChars(Result, CurPtr, tok::comment);
2757
2759 return true;
2760
2761 // If this Line-style comment is in a macro definition, transmogrify it into
2762 // a C-style block comment.
2763 bool Invalid = false;
2764 std::string Spelling = PP->getSpelling(Result, &Invalid);
2765 if (Invalid)
2766 return true;
2767
2768 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2769 Spelling[1] = '*'; // Change prefix to "/*".
2770 Spelling += "*/"; // add suffix.
2771
2772 Result.setKind(tok::comment);
2773 PP->CreateString(Spelling, Result,
2774 Result.getLocation(), Result.getLocation());
2775 return true;
2776}
2777
/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
/// a diagnostic if so. We know that the newline is inside of a block comment.
///
/// The scan walks *backwards* from the newline, peeling off one
/// "\<optional whitespace><newline>" splice per loop iteration, until it
/// either finds the '*' that (after line splicing) forms "*/", or proves
/// the newline was not escaped at all.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it. We allow whitespace
    // between the slash and newline.
    // (NUL bytes are treated like whitespace here too.)
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    // Any character other than yet another newline means this was not a chain
    // of splices ending the comment.
    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}
2852
2853#ifdef __SSE2__
2854#include <emmintrin.h>
2855#elif __ALTIVEC__
2856#include <altivec.h>
2857#undef bool
2858#endif
2859
/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end. The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character. Once
  // we find it, check to see if it was preceded by a *. This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    // Hit end-of-buffer immediately after "/*": unterminated comment.
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token. Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /. If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Vectorized scan: 16 bytes at a time, bailing out to the slow path on
      // any byte with the high bit set (potential UTF-8 lead/continuation).
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                                   Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr + 16 < BufferEnd) {
        if (LLVM_UNLIKELY(
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          break;
        }
        CurPtr += 16;
      }

#else
      // Scalar fallback with the same 16-byte block structure.
      while (CurPtr + 16 < BufferEnd) {
        bool HasNonASCII = false;
        for (unsigned I = 0; I < 16; ++I)
          HasNonASCII |= !isASCII(CurPtr[I]);

        if (LLVM_UNLIKELY(HasNonASCII))
          goto MultiByteUTF8;

        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I)
          HasSlash |= CurPtr[I] == '/';
        if (HasSlash)
          break;
        CurPtr += 16;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, warning on invalid UTF-8
    // if the corresponding warning is enabled, emitting a diagnostic only once
    // per sequence that cannot be decoded.
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      // CurPtr is 1 code unit past C, so to decode
      // the codepoint, we need to read from the previous position.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length - 1;
      }
      C = *CurPtr++;
    }

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*') // We found the final */. We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
                                                  LangOpts.Trigraphs)) {
          // We found the final */, though it had an escaped newline between the
          // * and /. We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning. Don't do this
        // if this is a /*/, which will end the comment. This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */. We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token. Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      // NOTE(review): the code-completion callback statement (presumably
      // PP->CodeCompleteNaturalLanguage();) appears elided in this listing —
      // confirm against the original source.
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      // NOTE(review): the PP->HandleComment(...) call opening this condition's
      // second operand appears elided in this listing — confirm against the
      // original source.
      getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace. Instead of going through the big switch, handle it
  // efficiently now. This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}
3082
3083//===----------------------------------------------------------------------===//
3084// Primary Lexing Entry Points
3085//===----------------------------------------------------------------------===//
3086
/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string. This switches the lexer out of directive mode.
// NOTE(review): the function signature line (ReadToEndOfLine, which
// presumably takes an optional SmallVectorImpl<char> *Result) appears
// elided in this listing — confirm against the original source.
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;
  Tmp.startToken();

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    // getAndAdvanceChar decodes trigraphs/escaped newlines and records any
    // resulting flags on Tmp.
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0: // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          // NOTE(review): a code-completion callback line appears elided
          // here — confirm against the original source.
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      [[fallthrough]];
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          // NOTE(review): a statement (likely a code-completion callback)
          // appears elided here; as shown, Lex(Tmp) binds to this if.
          Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done;
      return;
    }
  }
}
3140
/// LexEndOfFile - CurPtr points to the end of this file. Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first. The next token returned will
  // then be the end of file.
  // NOTE(review): the `if (ParsingPreprocessorDirective) {` guard and the
  // statement clearing that flag appear elided in this listing — confirm
  // against the original source.
    // Done parsing the "line".
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
      // NOTE(review): the call restoring comment-retention state appears
      // elided here; as shown, `return true` binds to this if.
    return true; // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token. Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  // NOTE(review): an opening guard (presumably checking whether the
  // preprocessor is recording a preamble) appears elided before this block.
    // If the preamble cuts off the end of a header guard, consider it guarded.
    // The guard is valid for the preamble content itself, and for tools the
    // most useful answer is "yes, this file has a header guard".
    if (!ConditionalStack.empty())
      // NOTE(review): the statement recording/exiting the top-level
      // conditional appears elided here.
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.
  while (!ConditionalStack.empty()) {
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
    // NOTE(review): the declaration of Diags (presumably
    // PP->getDiagnostics()) appears elided here.
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
    unsigned DiagID;

    if (LangOpts.CPlusPlus11) {
      // C++11 [lex.phases] 2.2 p2
      // Prefer the C++98 pedantic compatibility warning over the generic,
      // non-extension, user-requested "missing newline at EOF" warning.
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
      } else {
        DiagID = diag::warn_no_newline_eof;
      }
    } else {
      DiagID = diag::ext_no_newline_eof;
    }

    Diag(BufferEnd, DiagID)
        << FixItHint::CreateInsertion(EndLoc, "\n");
  }

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  // NOTE(review): the final `return PP->HandleEndOfFile(...)` statement
  // appears elided in this listing.
}
3219
3220/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3221/// the specified lexer will return a tok::l_paren token, 0 if it is something
3222/// else and 2 if there are no more tokens in the buffer controlled by the
3223/// lexer.
3224unsigned Lexer::isNextPPTokenLParen() {
3225 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3226
3227 if (isDependencyDirectivesLexer()) {
3228 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3229 return 2;
3230 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3231 tok::l_paren);
3232 }
3233
3234 // Switch to 'skipping' mode. This will ensure that we can lex a token
3235 // without emitting diagnostics, disables macro expansion, and will cause EOF
3236 // to return an EOF token instead of popping the include stack.
3237 LexingRawMode = true;
3238
3239 // Save state that can be changed while lexing so that we can restore it.
3240 const char *TmpBufferPtr = BufferPtr;
3241 bool inPPDirectiveMode = ParsingPreprocessorDirective;
3242 bool atStartOfLine = IsAtStartOfLine;
3243 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3244 bool leadingSpace = HasLeadingSpace;
3245
3246 Token Tok;
3247 Lex(Tok);
3248
3249 // Restore state that may have changed.
3250 BufferPtr = TmpBufferPtr;
3251 ParsingPreprocessorDirective = inPPDirectiveMode;
3252 HasLeadingSpace = leadingSpace;
3253 IsAtStartOfLine = atStartOfLine;
3254 IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3255
3256 // Restore the lexer back to non-skipping mode.
3257 LexingRawMode = false;
3258
3259 if (Tok.is(tok::eof))
3260 return 2;
3261 return Tok.is(tok::l_paren);
3262}
3263
3264/// Find the end of a version control conflict marker.
3265static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3266 ConflictMarkerKind CMK) {
3267 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3268 size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3269 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3270 size_t Pos = RestOfBuffer.find(Terminator);
3271 while (Pos != StringRef::npos) {
3272 // Must occur at start of line.
3273 if (Pos == 0 ||
3274 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3275 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3276 Pos = RestOfBuffer.find(Terminator);
3277 continue;
3278 }
3279 return RestOfBuffer.data()+Pos;
3280 }
3281 return nullptr;
3282}
3283
3284/// IsStartOfConflictMarker - If the specified pointer is the start of a version
3285/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3286/// and recover nicely. This returns true if it is a conflict marker and false
3287/// if not.
3288bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3289 // Only a conflict marker if it starts at the beginning of a line.
3290 if (CurPtr != BufferStart &&
3291 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3292 return false;
3293
3294 // Check to see if we have <<<<<<< or >>>>.
3295 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3296 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3297 return false;
3298
3299 // If we have a situation where we don't care about conflict markers, ignore
3300 // it.
3301 if (CurrentConflictMarkerState || isLexingRawMode())
3302 return false;
3303
3304 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3305
3306 // Check to see if there is an ending marker somewhere in the buffer at the
3307 // start of a line to terminate this conflict marker.
3308 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3309 // We found a match. We are really in a conflict marker.
3310 // Diagnose this, and ignore to the end of line.
3311 Diag(CurPtr, diag::err_conflict_marker);
3312 CurrentConflictMarkerState = Kind;
3313
3314 // Skip ahead to the end of line. We know this exists because the
3315 // end-of-conflict marker starts with \r or \n.
3316 while (*CurPtr != '\r' && *CurPtr != '\n') {
3317 assert(CurPtr != BufferEnd && "Didn't find end of line");
3318 ++CurPtr;
3319 }
3320 BufferPtr = CurPtr;
3321 return true;
3322 }
3323
3324 // No end of conflict marker found.
3325 return false;
3326}
3327
3328/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3329/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3330/// is the end of a conflict marker. Handle it by ignoring up until the end of
3331/// the line. This returns true if it is a conflict marker and false if not.
3332bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3333 // Only a conflict marker if it starts at the beginning of a line.
3334 if (CurPtr != BufferStart &&
3335 CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3336 return false;
3337
3338 // If we have a situation where we don't care about conflict markers, ignore
3339 // it.
3340 if (!CurrentConflictMarkerState || isLexingRawMode())
3341 return false;
3342
3343 // Check to see if we have the marker (4 characters in a row).
3344 for (unsigned i = 1; i != 4; ++i)
3345 if (CurPtr[i] != CurPtr[0])
3346 return false;
3347
3348 // If we do have it, search for the end of the conflict marker. This could
3349 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3350 // be the end of conflict marker.
3351 if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3352 CurrentConflictMarkerState)) {
3353 CurPtr = End;
3354
3355 // Skip ahead to the end of line.
3356 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3357 ++CurPtr;
3358
3359 BufferPtr = CurPtr;
3360
3361 // No longer in the conflict marker.
3362 CurrentConflictMarkerState = CMK_None;
3363 return true;
3364 }
3365
3366 return false;
3367}
3368
/// Scan [CurPtr, BufferEnd) for the editor-placeholder terminator "#>".
/// Returns a pointer one past the terminator, or nullptr if none is found.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  // An empty range cannot contain the two-character terminator.
  if (CurPtr == BufferEnd)
    return nullptr;
  // Stop at the second-to-last character so reading Scan[1] stays in bounds.
  const char *Last = BufferEnd - 1;
  const char *Scan = CurPtr;
  while (Scan != Last) {
    if (Scan[0] == '#' && Scan[1] == '>')
      return Scan + 2;
    ++Scan;
  }
  return nullptr;
}
3380
// Lex an editor placeholder of the form "<#...#>", producing it as a raw
// identifier token. Emits err_placeholder_in_source unless the language
// options explicitly allow placeholders.
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  // NOTE(review): the guard condition for this early return appears elided
  // in this listing — confirm against the original source.
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  // NOTE(review): statements marking the token as an editor placeholder
  // (and any identifier lookup) appear elided here — confirm against the
  // original source.
  BufferPtr = End;
  return true;
}
3399
3400bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3401 if (PP && PP->isCodeCompletionEnabled()) {
3402 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3403 return Loc == PP->getCodeCompletionLoc();
3404 }
3405
3406 return false;
3407}
3408
// Parse a numeric universal character name (\uXXXX, \UXXXXXXXX, or the
// delimited \u{...} form) starting just after the backslash.
//
// StartPtr points at the 'u'/'U' introducer and is advanced past the UCN on
// success (and, when a Token is supplied, via getAndAdvanceChar so trigraph
// and line-splice flags are recorded). Returns the decoded code point, or
// std::nullopt on any malformed escape (with diagnostics when lexing for
// real).
std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                 const char *SlashLoc,
                                                 Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  // \u takes exactly four hex digits; \U takes exactly eight.
  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  // Delimited form \u{...} accepts a variable number of digits.
  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  // Only emit diagnostics when producing a real token outside raw mode.
  bool Diagnose = Result && !isLexingRawMode();

  // UCNs require C99 or C++.
  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    // A '{' immediately after the introducer switches to delimited form.
    if (!Delimited && Count == 0 && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      // Non-hex-digit: ends a fixed-width UCN early; fatal inside \u{...}.
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    // Accepting another nibble would shift set bits off the top: overflow.
    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  // No digits at all: either "\u{}" or "\u" followed by a non-digit.
  if (Count == 0) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // The delimited form is only valid with the lowercase 'u' introducer.
  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
    return std::nullopt;
  }

  // The delimited form is an extension (standardized in C++23); note it.
  if (Delimited && PP) {
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}
3522
// Parse a named universal character name of the form \N{NAME}, starting at
// the 'N' introducer. On success, StartPtr is advanced past the escape and
// the named character's code point is returned; otherwise std::nullopt.
std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                               const char *SlashLoc,
                                               Token *Result) {
  unsigned CharSize;
  // Only emit diagnostics when producing a real token outside raw mode.
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
    return std::nullopt;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  // NOTE(review): the declaration of Buffer (presumably a SmallVector of
  // char collecting the name) appears elided in this listing — confirm
  // against the original source.
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    // NOTE(review): the character-validity condition guarding this break
    // appears elided in this listing — confirm against the original source.
      break;
    Buffer.push_back(C);
  }

  // Reject both an unterminated name ("\N{abc") and an empty one ("\N{}").
  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // First try an exact Unicode name lookup; fall back to loose matching only
  // to produce a better diagnostic.
  StringRef Name(Buffer.data(), Buffer.size());
  std::optional<char32_t> Match =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Match) {
    LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
    if (Diagnose) {
      Diag(StartName, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size())
          << makeCharRange(*this, StartName, CurPtr - CharSize);
      if (LooseMatch) {
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            // NOTE(review): a FixItHint argument line appears elided here —
            // confirm against the original source.
            makeCharRange(*this, StartName, CurPtr - CharSize),
            LooseMatch->Name);
      }
    }
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  // Named escapes are an extension (standardized in C++23); note the use.
  if (Diagnose && Match)
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

  // If no diagnostic has been emitted yet, likely because we are doing a
  // tentative lexing, we do not want to recover here to make sure the token
  // will not be incorrectly considered valid. This function will be called
  // again and a diagnostic emitted then.
  if (LooseMatch && Diagnose)
    Match = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
}
3616
3617uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3618 Token *Result) {
3619
3620 unsigned CharSize;
3621 std::optional<uint32_t> CodePointOpt;
3622 char Kind = getCharAndSize(StartPtr, CharSize);
3623 if (Kind == 'u' || Kind == 'U')
3624 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3625 else if (Kind == 'N')
3626 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3627
3628 if (!CodePointOpt)
3629 return 0;
3630
3631 uint32_t CodePoint = *CodePointOpt;
3632
3633 // Don't apply C family restrictions to UCNs in assembly mode
3634 if (LangOpts.AsmPreprocessor)
3635 return CodePoint;
3636
3637 // C23 6.4.3p2: A universal character name shall not designate a code point
3638 // where the hexadecimal value is:
3639 // - in the range D800 through DFFF inclusive; or
3640 // - greater than 10FFFF.
3641 // A universal-character-name outside the c-char-sequence of a character
3642 // constant, or the s-char-sequence of a string-literal shall not designate
3643 // a control character or a character in the basic character set.
3644
3645 // C++11 [lex.charset]p2: If the hexadecimal value for a
3646 // universal-character-name corresponds to a surrogate code point (in the
3647 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3648 // if the hexadecimal value for a universal-character-name outside the
3649 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3650 // string literal corresponds to a control character (in either of the
3651 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3652 // basic source character set, the program is ill-formed.
3653 if (CodePoint < 0xA0) {
3654 // We don't use isLexingRawMode() here because we need to warn about bad
3655 // UCNs even when skipping preprocessing tokens in a #if block.
3656 if (Result && PP) {
3657 if (CodePoint < 0x20 || CodePoint >= 0x7F)
3658 Diag(BufferPtr, diag::err_ucn_control_character);
3659 else {
3660 char C = static_cast<char>(CodePoint);
3661 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3662 }
3663 }
3664
3665 return 0;
3666 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3667 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3668 // We don't use isLexingRawMode() here because we need to diagnose bad
3669 // UCNs even when skipping preprocessing tokens in a #if block.
3670 if (Result && PP) {
3671 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3672 Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3673 else
3674 Diag(BufferPtr, diag::err_ucn_escape_invalid);
3675 }
3676 return 0;
3677 }
3678
3679 return CodePoint;
3680}
3681
/// Check whether the code point \p C (whose source range ends at \p CurPtr)
/// is a Unicode whitespace character. If so, diagnose it as an extension
/// (unless we are in raw mode or re-lexing preprocessed output), mark the
/// token as having leading space, and return true; otherwise return false.
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
                                   const char *CurPtr) {
  // NOTE(review): part of this condition appears to be elided here; it
  // presumably also tests that C is a Unicode whitespace code point --
  // confirm against the full condition.
  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
    Diag(BufferPtr, diag::ext_unicode_whitespace)
      << makeCharRange(*this, BufferPtr, CurPtr);

    // Whitespace before the next token, even if exotic, still counts as
    // leading space.
    Result.setFlag(Token::LeadingSpace);
    return true;
  }
  return false;
}
3694
3695void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3696 IsAtStartOfLine = Result.isAtStartOfLine();
3697 HasLeadingSpace = Result.hasLeadingSpace();
3698 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3699 // Note that this doesn't affect IsAtPhysicalStartOfLine.
3700}
3701
  // This is the normal (non-dependency-directives) lexing entry path.
  assert(!isDependencyDirectivesLexer());

  // Start a new token.
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal. Each flag is consumed
  // (cleared) once it has been transferred onto the token being lexed.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    HasLeadingEmptyMacro = false;
  }

  // Remember whether we were at the physical start of a line; the member is
  // reset before lexing since LexTokenInternal receives it by value.
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;  // Only used by the assertion below in asserts builds.
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}
3733
/// LexTokenInternal - This implements a simple C family lexer. It is an
/// extremely performance critical piece of code. This assumes that the buffer
/// has a null character at the end of the file. This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface. It assumes
/// that the Flags of result have been cleared before calling this.
///
/// Returns true when a token has been formed into \p Result; returns false
/// (from the HandleDirective path) when a preprocessing directive was handled
/// and the caller should lex again with the updated state.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexStart:
  assert(!Result.needsCleaning() && "Result needs cleaning");
  assert(!Result.hasPtrData() && "Result has not been reset");

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if (isHorizontalWhitespace(*CurPtr)) {
    do {
      ++CurPtr;
    } while (isHorizontalWhitespace(*CurPtr));

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped. The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);

  // Any non-newline character invalidates the cached "last newline" pointer.
  if (!isVerticalWhitespace(Char))
    NewLinePtr = nullptr;

  // Dispatch on the first character of the token.
  switch (Char) {
  case 0: // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    // An embedded null that is neither EOF nor a completion point is treated
    // as (diagnosed) whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26: // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    // Fold a CRLF pair into a single newline before falling through.
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    [[fallthrough]];
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
      // Done parsing the "line".

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
      NewLinePtr = CurPtr - 1;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  // Identifier (e.g., uber), or
  // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
  // UTF-8 or UTF-16 string literal (C11/C++11).
  case 'u':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.RawStringLiterals) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.RawStringLiterals) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.RawStringLiterals && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    [[fallthrough]];

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);

  case '$': // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifierContinue(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // A '.' followed by a digit begins a floating constant.
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') { // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') { // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') { // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') { // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment. There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program. For example, In "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo". Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment =
          LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line). Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') { // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace; // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash; // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else { // '%:' -> '#'
        // We parsed a # character. If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive. Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // Inside #include, '<' begins a header-name.
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (LangOpts.CPlusPlus20) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (LangOpts.CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
                  getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      if (LangOpts.OpenCL && Char == '^')
        Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character. If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive. Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
    }

      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);

    // With a fatal failure in the module loader, we abort parsing.
    return true;

  // We parsed the directive; lex a token with the new state.
  return false;

LexNextToken:
  // Reset per-token cleaning state before re-entering the lexer loop.
  Result.clearFlag(Token::NeedsCleaning);
  goto LexStart;
}
4524
/// Materialize a pre-scanned dependency-directive token into \p Result:
/// copies location/kind/flags/length from the scanned token and advances
/// BufferPtr past it. Returns a pointer to the token's first character in
/// the buffer, which callers use to attach identifier/literal data.
const char *Lexer::convertDependencyDirectiveToken(
  const char *TokPtr = BufferStart + DDTok.Offset;
  Result.startToken();
  Result.setLocation(getSourceLocation(TokPtr));
  Result.setKind(DDTok.Kind);
  Result.setFlag((Token::TokenFlags)DDTok.Flags);
  Result.setLength(DDTok.Length);
  // Position the lexer immediately after this token.
  BufferPtr = TokPtr + DDTok.Length;
  return TokPtr;
}
4536
/// Lex the next token from the pre-scanned dependency directives. Returns
/// true when a token was produced into \p Result, false when a directive
/// hash was seen and the caller should lex again.
bool Lexer::LexDependencyDirectiveToken(Token &Result) {
  assert(isDependencyDirectivesLexer());

  using namespace dependency_directives_scan;

  // Advance past any fully-consumed directives until we find one with
  // unread tokens (or hit EOF).
  while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
    if (DepDirectives.front().Kind == pp_eof)
      return LexEndOfFile(Result, BufferEnd);
    if (DepDirectives.front().Kind == tokens_present_before_eof)
      MIOpt.ReadToken();
    NextDepDirectiveTokenIndex = 0;
    DepDirectives = DepDirectives.drop_front();
  }

      DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
  if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
    // Read something other than a preprocessor directive hash.
    MIOpt.ReadToken();
  }

  if (ParsingFilename && DDTok.is(tok::less)) {
    // Re-lex the angled header-name from the raw buffer, then skip past the
    // pre-scanned tokens it covered.
    BufferPtr = BufferStart + DDTok.Offset;
    LexAngledStringLiteral(Result, BufferPtr + 1);
    if (Result.isNot(tok::header_name))
      return true;
    // Advance the index of lexed tokens.
    while (true) {
      const dependency_directives_scan::Token &NextTok =
          DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
      if (BufferStart + NextTok.Offset >= BufferPtr)
        break;
      ++NextDepDirectiveTokenIndex;
    }
    return true;
  }

  const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);

  if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
    // Directive hash: hand off to the preprocessor; caller must lex again.
    return false;
  }
  if (Result.is(tok::raw_identifier)) {
    Result.setRawIdentifierData(TokPtr);
    if (!isLexingRawMode()) {
      if (II->isHandleIdentifierCase())
        return PP->HandleIdentifier(Result);
    }
    return true;
  }
  if (Result.isLiteral()) {
    Result.setLiteralData(TokPtr);
    return true;
  }
  if (Result.is(tok::colon)) {
    // Convert consecutive colons to 'tok::coloncolon'.
    if (*BufferPtr == ':') {
      assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
          tok::colon));
      ++NextDepDirectiveTokenIndex;
      Result.setKind(tok::coloncolon);
    }
    return true;
  }
  if (Result.is(tok::eod))

  return true;
}
4608
/// While the preprocessor is skipping a disabled #if region, fast-forward
/// through pre-scanned directives until reaching one that could terminate
/// the region (#elif/#elifdef/#elifndef/#else/#endif at the same nesting
/// depth, or EOF). Produces the directive's introducing '#' into \p Result
/// and returns false so the caller re-lexes, or returns the EOF token.
bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
  assert(isDependencyDirectivesLexer());

  using namespace dependency_directives_scan;

  bool Stop = false;
  // Track nesting of #if blocks opened inside the skipped region so that
  // only a same-depth #else/#elif/#endif stops the scan.
  unsigned NestedIfs = 0;
  do {
    DepDirectives = DepDirectives.drop_front();
    switch (DepDirectives.front().Kind) {
    case pp_none:
      llvm_unreachable("unexpected 'pp_none'");
    case pp_include:
    case pp_define:
    case pp_undef:
    case pp_import:
    case pp_pragma_import:
    case pp_pragma_once:
    case pp_include_next:
    case decl_at_import:
    case cxx_module_decl:
    case cxx_import_decl:
      // Directives with no effect on conditional nesting: keep skipping.
      break;
    case pp_if:
    case pp_ifdef:
    case pp_ifndef:
      ++NestedIfs;
      break;
    case pp_elif:
    case pp_elifdef:
    case pp_elifndef:
    case pp_else:
      if (!NestedIfs) {
        Stop = true;
      }
      break;
    case pp_endif:
      if (!NestedIfs) {
        Stop = true;
      } else {
        --NestedIfs;
      }
      break;
    case pp_eof:
      NextDepDirectiveTokenIndex = 0;
      return LexEndOfFile(Result, BufferEnd);
    }
  } while (!Stop);

      DepDirectives.front().Tokens.front();
  assert(DDTok.is(tok::hash));
  NextDepDirectiveTokenIndex = 1;

  // Emit the '#' of the stopping directive; caller lexes the directive body.
  convertDependencyDirectiveToken(DDTok, Result);
  return false;
}
StringRef P
#define SM(sm)
Definition: Cuda.cpp:85
Defines the Diagnostic-related interfaces.
Expr * E
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:947
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1567
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1761
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1260
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:324
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3265
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1686
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:560
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:284
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1188
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1581
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1651
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1548
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1635
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3369
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1554
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:919
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isBlockCommentEndOfEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2781
static const char * fastParseASCIIIdentifier(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:1927
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1241
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1609
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1657
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:543
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
SourceRange Range
Definition: SemaObjC.cpp:758
SourceLocation Loc
Definition: SemaObjC.cpp:759
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16260
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16052
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1220
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:231
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1493
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:939
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool isInvalid() const
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:138
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:127
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:101
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:499
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:1023
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1380
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionMode - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
static std::optional< Token > findPreviousToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments)
Finds the token that comes before the given location.
Definition: Lexer.cpp:1355
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:277
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1059
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:3089
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:871
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1231
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
bool Lex(Token &Result)
Lex - Return the next token in the file.
Definition: Lexer.cpp:3702
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:790
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:183
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:893
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:954
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1137
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1212
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1157
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:451
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1133
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts, bool IncludeComments=false)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1324
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:498
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location anywhere in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:608
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:219
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1106
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:242
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:636
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:509
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:849
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:309
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:593
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:138
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
SourceManager & getSourceManager() const
EmptylineHandler * getEmptylineHandler() const
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
IdentifierTable & getIdentifierTable()
const LangOptions & getLangOpts() const
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
DiagnosticsEngine & getDiagnostics() const
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
void setBegin(SourceLocation b)
bool isInvalid() const
SourceLocation getEnd() const
SourceLocation getBegin() const
void setEnd(SourceLocation e)
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:187
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:306
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:116
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:132
unsigned getLength() const
Definition: Token.h:135
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:69
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:99
tok::TokenKind getKind() const
Definition: Token.h:94
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:276
@ HasUCN
Definition: Token.h:83
@ IsEditorPlaceholder
Definition: Token.h:88
@ LeadingEmptyMacro
Definition: Token.h:81
@ LeadingSpace
Definition: Token.h:77
@ StartOfLine
Definition: Token.h:75
@ HasUDSuffix
Definition: Token.h:82
@ NeedsCleaning
Definition: Token.h:80
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:121
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:60
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
Definition: Lexer.cpp:77
void startToken()
Reset all flags to cleared.
Definition: Token.h:177
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:295
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:213
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:225
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:244
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4285
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3092
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3458
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3443
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3746
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:41
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:99
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:49
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition: CharInfo.h:61
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:91
@ Result
The result type of a method or function.
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:175
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:108
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.
Definition: CharInfo.h:168
const FunctionProtoType * T
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:53
unsigned int uint32_t
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x2 + p.y 2 + ...)
#define _SIDD_UBYTE_OPS
Definition: smmintrin.h:1532
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
Definition: smmintrin.h:1664
#define _SIDD_LEAST_SIGNIFICANT
Definition: smmintrin.h:1550
#define _SIDD_NEGATIVE_POLARITY
Definition: smmintrin.h:1545
#define _SIDD_CMP_RANGES
Definition: smmintrin.h:1539
Represents a char and the number of bytes parsed to produce it.
Definition: Lexer.h:586
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.