prescan.cpp | Explore in Territory

//===-- lib/Parser/prescan.cpp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "prescan.h"
#include "flang/Common/idioms.h"
#include "flang/Parser/characters.h"
#include "flang/Parser/message.h"
#include "flang/Parser/preprocessor.h"
#include "flang/Parser/source.h"
#include "flang/Parser/token-sequence.h"
#include "llvm/Support/raw_ostream.h"
#include <cstddef>
#include <cstring>
#include <utility>
#include <vector>

namespace Fortran::parser {

using common::LanguageFeature;

static constexpr int maxPrescannerNesting{100};

Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
    Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
    : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
      allSources_{preprocessor_.allSources()}, features_{lfc},
      backslashFreeFormContinuation_{preprocessor.AnyDefinitions()},
      encoding_{allSources_.encoding()} {}

Prescanner::Prescanner(const Prescanner &that, Preprocessor &prepro,
    bool isNestedInIncludeDirective)
    : messages_{that.messages_}, cooked_{that.cooked_}, preprocessor_{prepro},
      allSources_{that.allSources_}, features_{that.features_},
      isNestedInIncludeDirective_{isNestedInIncludeDirective},
      backslashFreeFormContinuation_{that.backslashFreeFormContinuation_},
      inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_},
      prescannerNesting_{that.prescannerNesting_ + 1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}

// Returns number of bytes to skip
static inline int IsSpace(const char *p) {
  if (*p == ' ') {
    return 1;
  } else if (*p == '\xa0') { // LATIN-1 NBSP non-breaking space
    return 1;
  } else if (p[0] == '\xc2' && p[1] == '\xa0') { // UTF-8 NBSP
    return 2;
  } else {
    return 0;
  }
}

static inline int IsSpaceOrTab(const char *p) {
  return *p == '\t' ? 1 : IsSpace(p);
}

static inline constexpr bool IsFixedFormCommentChar(char ch) {
  return ch == '!' || ch == '*' || ch == 'C' || ch == 'c';
}

static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
  char *p{dir.GetMutableCharData()};
  char *limit{p + dir.SizeInChars()};
  for (; p < limit; ++p) {
    if (*p != ' ') {
      CHECK(IsFixedFormCommentChar(*p));
      *p = '!';
      return;
    }
  }
  DIE("compiler directive all blank");
}

void Prescanner::Prescan(ProvenanceRange range) {
  startProvenance_ = range.start();
  start_ = allSources_.GetSource(range);
  CHECK(start_);
  limit_ = start_ + range.size();
  nextLine_ = start_;
  const bool beganInFixedForm{inFixedForm_};
  if (prescannerNesting_ > maxPrescannerNesting) {
    Say(GetProvenance(start_),
        "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
    return;
  }
  while (!IsAtEnd()) {
    Statement();
  }
  if (inFixedForm_ != beganInFixedForm) {
    std::string dir{"!dir$ "};
    if (beganInFixedForm) {
      dir += "fixed";
    } else {
      dir += "free";
    }
    dir += '\n';
    TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
    tokens.Emit(cooked_);
  }
}

void Prescanner::Statement() {
  TokenSequence tokens;
  const char *statementStart{nextLine_};
  LineClassification line{ClassifyLine(statementStart)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset; // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    afterPreprocessingDirective_ = true;
    skipLeadingAmpersand_ |= !inFixedForm_;
    return;
  case LineClassification::Kind::PreprocessorDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    afterPreprocessingDirective_ = true;
    // Don't set skipLeadingAmpersand_
    return;
  case LineClassification::Kind::DefinitionDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
    // Don't set afterPreprocessingDirective_ or skipLeadingAmpersand_
    return;
  case LineClassification::Kind::CompilerDirective: {
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      while (int n{IsSpaceOrTab(at_)}) {
        at_ += n, ++column_;
      }
      CHECK(*at_ == '!');
    }
    std::optional<int> condOffset;
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line.
      condOffset = 2;
    } else if (directiveSentinel_[0] == '@' && directiveSentinel_[1] == 'c' &&
        directiveSentinel_[2] == 'u' && directiveSentinel_[3] == 'f' &&
        directiveSentinel_[4] == '\0') {
      // CUDA conditional compilation line.
      condOffset = 5;
    }
    if (condOffset) {
      at_ += *condOffset, column_ += *condOffset;
      if (auto payload{IsIncludeLine(at_)}) {
        FortranInclude(at_ + *payload);
        return;
      } else if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive.  Emit normalized sentinel, squash following spaces.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (IsSpaceOrTab(at_)) {
        EmitChar(tokens, ' ');
        while (int n{IsSpaceOrTab(at_)}) {
          at_ += n, ++column_;
        }
      }
      tokens.CloseToken();
    }
    break;
  }
  case LineClassification::Kind::Source:
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      if (features_.IsEnabled(LanguageFeature::OldDebugLines) &&
          (*at_ == 'D' || *at_ == 'd')) {
        NextChar();
      }
      LabelField(tokens);
    } else {
      if (skipLeadingAmpersand_) {
        skipLeadingAmpersand_ = false;
        const char *p{SkipWhiteSpace(at_)};
        if (p < limit_ && *p == '&') {
          column_ += ++p - at_;
          at_ = p;
        }
      } else {
        SkipSpaces();
      }
      // Check for a leading identifier that might be a keyword macro
      // that will expand to anything indicating a non-source line, like
      // a comment marker or directive sentinel.  If so, disable line
      // continuation, so that NextToken() won't consume anything from
      // following lines.
      if (IsLegalIdentifierStart(*at_)) {
        // TODO: Only bother with these cases when any keyword macro has
        // been defined with replacement text that could begin a comment
        // or directive sentinel.
        const char *p{at_};
        while (IsLegalInIdentifier(*++p)) {
        }
        CharBlock id{at_, static_cast<std::size_t>(p - at_)};
        if (preprocessor_.IsNameDefined(id) &&
            !preprocessor_.IsFunctionLikeDefinition(id)) {
          TokenSequence toks;
          toks.Put(id, GetProvenance(at_));
          if (auto replaced{preprocessor_.MacroReplacement(toks, *this)}) {
            auto newLineClass{ClassifyLine(*replaced, GetCurrentProvenance())};
            if (newLineClass.kind ==
                LineClassification::Kind::CompilerDirective) {
              directiveSentinel_ = newLineClass.sentinel;
              disableSourceContinuation_ = false;
            } else {
              disableSourceContinuation_ =
                  newLineClass.kind != LineClassification::Kind::Source;
            }
          }
        }
      }
    }
    break;
  }

  while (NextToken(tokens)) {
  }
  if (continuationLines_ > 255) {
    if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
      Say(common::LanguageFeature::MiscSourceExtensions,
          GetProvenance(statementStart),
          "%d continuation lines is more than the Fortran standard allows"_port_en_US,
          continuationLines_);
    }
  }

  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line.
    LineClassification ppl{ClassifyLine(*preprocessed, newlineProvenance)};
    switch (ppl.kind) {
    case LineClassification::Kind::Comment:
      break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(preprocessed->TokenAt(0).begin() + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      if (features_.ShouldWarn(common::UsageWarning::Preprocessing)) {
        Say(common::UsageWarning::Preprocessing,
            preprocessed->GetProvenanceRange(),
            "Preprocessed line resembles a preprocessor directive"_warn_en_US);
      }
      CheckAndEmitLine(preprocessed->ToLowerCase(), newlineProvenance);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      while (CompilerDirectiveContinuation(*preprocessed, ppl.sentinel)) {
        newlineProvenance = GetCurrentProvenance();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      CheckAndEmitLine(preprocessed->ToLowerCase().ClipComment(
                           *this, true /* skip first ! */),
          newlineProvenance);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        while (SourceLineContinuation(*preprocessed)) {
          newlineProvenance = GetCurrentProvenance();
        }
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      CheckAndEmitLine(
          preprocessed->ToLowerCase().ClipComment(*this), newlineProvenance);
      break;
    }
  } else { // no macro replacement
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      while (CompilerDirectiveContinuation(tokens, line.sentinel)) {
        newlineProvenance = GetCurrentProvenance();
      }
      tokens.ToLowerCase();
      SourceFormChange(tokens.ToString());
    } else { // Kind::Source
      tokens.ToLowerCase();
      if (inFixedForm_) {
        EnforceStupidEndStatementRules(tokens);
      }
    }
    CheckAndEmitLine(tokens, newlineProvenance);
  }
  directiveSentinel_ = nullptr;
}

void Prescanner::CheckAndEmitLine(
    TokenSequence &tokens, Provenance newlineProvenance) {
  tokens.CheckBadFortranCharacters(
      messages_, *this, disableSourceContinuation_);
  // Parenthesis nesting check does not apply while any #include is
  // active, nor on the lines before and after a top-level #include,
  // nor before or after conditional source.
  // Applications play shenanigans with line continuation before and
  // after #include'd subprogram argument lists and conditional source.
  if (!isNestedInIncludeDirective_ && !omitNewline_ &&
      !afterPreprocessingDirective_ && tokens.BadlyNestedParentheses() &&
      !preprocessor_.InConditional()) {
    if (nextLine_ < limit_ && IsPreprocessorDirectiveLine(nextLine_)) {
      // don't complain
    } else {
      tokens.CheckBadParentheses(messages_);
    }
  }
  tokens.Emit(cooked_);
  if (omitNewline_) {
    omitNewline_ = false;
  } else {
    cooked_.Put('\n', newlineProvenance);
    afterPreprocessingDirective_ = false;
  }
}

TokenSequence Prescanner::TokenizePreprocessorDirective() {
  CHECK(!IsAtEnd() && !inPreprocessorDirective_);
  inPreprocessorDirective_ = true;
  BeginStatementAndAdvance();
  TokenSequence tokens;
  while (NextToken(tokens)) {
  }
  inPreprocessorDirective_ = false;
  return tokens;
}

void Prescanner::NextLine() {
  void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
  void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
  if (!v) {
    nextLine_ = limit_;
  } else {
    const char *nl{const_cast<const char *>(static_cast<char *>(v))};
    nextLine_ = nl + 1;
  }
}

void Prescanner::LabelField(TokenSequence &token) {
  int outCol{1};
  const char *start{at_};
  std::optional<int> badColumn;
  for (; *at_ != '\n' && column_ <= 6; ++at_) {
    if (*at_ == '\t') {
      ++at_;
      column_ = 7;
      break;
    }
    if (int n{IsSpace(at_)}; n == 0 &&
        !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
      EmitChar(token, *at_);
      ++outCol;
      if (!badColumn && (column_ == 6 || !IsDecimalDigit(*at_))) {
        badColumn = column_;
      }
    }
    ++column_;
  }
  if (badColumn && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
    if ((prescannerNesting_ > 0 && *badColumn == 6 &&
            cooked_.BufferedBytes() == firstCookedCharacterOffset_) ||
        afterPreprocessingDirective_) {
      // This is the first source line in #include'd text or conditional
      // code under #if, or the first source line after such.
      // If it turns out that the preprocessed text begins with a
      // fixed form continuation line, the newline at the end
      // of the latest source line beforehand will be deleted in
      // CookedSource::Marshal().
      cooked_.MarkPossibleFixedFormContinuation();
    } else if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
      Say(common::UsageWarning::Scanning, GetProvenance(start + *badColumn - 1),
          *badColumn == 6
              ? "Statement should not begin with a continuation line"_warn_en_US
              : "Character in fixed-form label field must be a digit"_warn_en_US);
    }
    token.clear();
    if (*badColumn < 6) {
      at_ = start;
      column_ = 1;
      return;
    }
    outCol = 1;
  }
  if (outCol == 1) { // empty label field
    // Emit a space so that, if the line is rescanned after preprocessing,
    // a leading 'C' or 'D' won't be left-justified and then accidentally
    // misinterpreted as a comment card.
    EmitChar(token, ' ');
    ++outCol;
  }
  token.CloseToken();
  SkipToNextSignificantCharacter();
  if (IsDecimalDigit(*at_)) {
    if (features_.ShouldWarn(common::LanguageFeature::MiscSourceExtensions)) {
      Say(common::LanguageFeature::MiscSourceExtensions, GetCurrentProvenance(),
          "Label digit is not in fixed-form label field"_port_en_US);
    }
  }
}

// 6.3.3.5: A program unit END statement, or any other statement whose
// initial line resembles an END statement, shall not be continued in
// fixed form source.
void Prescanner::EnforceStupidEndStatementRules(const TokenSequence &tokens) {
  CharBlock cBlock{tokens.ToCharBlock()};
  const char *str{cBlock.begin()};
  std::size_t n{cBlock.size()};
  if (n < 3) {
    return;
  }
  std::size_t j{0};
  for (; j < n && (str[j] == ' ' || (str[j] >= '0' && str[j] <= '9')); ++j) {
  }
  if (j + 3 > n || std::memcmp(str + j, "end", 3) != 0) {
    return;
  }
  // It starts with END, possibly after a label.
  auto start{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
  auto end{allSources_.GetSourcePosition(tokens.GetCharProvenance(n - 1))};
  if (!start || !end) {
    return;
  }
  if (&*start->sourceFile == &*end->sourceFile && start->line == end->line) {
    return; // no continuation
  }
  j += 3;
  static const char *const prefixes[]{"program", "subroutine", "function",
      "blockdata", "module", "submodule", nullptr};
  bool isPrefix{j == n || !IsLegalInIdentifier(str[j])}; // prefix is END
  std::size_t endOfPrefix{j - 1};
  for (const char *const *p{prefixes}; *p; ++p) {
    std::size_t pLen{std::strlen(*p)};
    if (j + pLen <= n && std::memcmp(str + j, *p, pLen) == 0) {
      isPrefix = true; // END thing as prefix
      j += pLen;
      endOfPrefix = j - 1;
      for (; j < n && IsLegalInIdentifier(str[j]); ++j) {
      }
      break;
    }
  }
  if (isPrefix) {
    auto range{tokens.GetTokenProvenanceRange(1)};
    if (j == n) { // END or END thing [name]
      Say(range,
          "Program unit END statement may not be continued in fixed form source"_err_en_US);
    } else {
      auto endOfPrefixPos{
          allSources_.GetSourcePosition(tokens.GetCharProvenance(endOfPrefix))};
      auto next{allSources_.GetSourcePosition(tokens.GetCharProvenance(j))};
      if (endOfPrefixPos && next &&
          &*endOfPrefixPos->sourceFile == &*start->sourceFile &&
          endOfPrefixPos->line == start->line &&
          (&*next->sourceFile != &*start->sourceFile ||
              next->line != start->line)) {
        Say(range,
            "Initial line of continued statement must not appear to be a program unit END in fixed form source"_err_en_US);
      }
    }
  }
}

void Prescanner::SkipToEndOfLine() {
  while (*at_ != '\n') {
    ++at_, ++column_;
  }
}

bool Prescanner::MustSkipToEndOfLine() const {
  if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
    return true; // skip over ignored columns in right margin (73:80)
  } else if (*at_ == '!' && !inCharLiteral_) {
    return !IsCompilerDirectiveSentinel(at_);
  } else {
    return false;
  }
}

void Prescanner::NextChar() {
  CHECK(*at_ != '\n');
  int n{IsSpace(at_)};
  at_ += n ? n : 1;
  ++column_;
  while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
    // UTF-8 byte order mark - treat this file as UTF-8
    at_ += 3;
    encoding_ = Encoding::UTF_8;
  }
  SkipToNextSignificantCharacter();
}

// Skip everything that should be ignored until the next significant
// character is reached; handles C-style comments in preprocessing
// directives, Fortran ! comments, stuff after the right margin in
// fixed form, and all forms of line continuation.
bool Prescanner::SkipToNextSignificantCharacter() {
  auto anyContinuationLine{false};
  if (inPreprocessorDirective_) {
    SkipCComments();
  } else {
    bool mightNeedSpace{false};
    if (MustSkipToEndOfLine()) {
      SkipToEndOfLine();
    } else {
      mightNeedSpace = *at_ == '\n';
    }
    for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
      anyContinuationLine = true;
      ++continuationLines_;
      if (MustSkipToEndOfLine()) {
        SkipToEndOfLine();
      }
    }
    if (*at_ == '\t') {
      tabInCurrentLine_ = true;
    }
  }
  return anyContinuationLine;
}

void Prescanner::SkipCComments() {
  while (true) {
    if (IsCComment(at_)) {
      if (const char *after{SkipCComment(at_)}) {
        column_ += after - at_;
        // May have skipped over one or more newlines; relocate the start of
        // the next line.
        nextLine_ = at_ = after;
        NextLine();
      } else {
        // Don't emit any messages about unclosed C-style comments, because
        // the sequence /* can appear legally in a FORMAT statement.  There's
        // no ambiguity, since the sequence */ cannot appear legally.
        break;
      }
    } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
        at_[1] == '\n' && !IsAtEnd()) {
      BeginSourceLineAndAdvance();
    } else {
      break;
    }
  }
}

void Prescanner::SkipSpaces() {
  while (IsSpaceOrTab(at_)) {
    NextChar();
  }
  insertASpace_ = false;
}

const char *Prescanner::SkipWhiteSpace(const char *p) {
  while (int n{IsSpaceOrTab(p)}) {
    p += n;
  }
  return p;
}

const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
  while (true) {
    if (int n{IsSpaceOrTab(p)}) {
      p += n;
    } else if (IsCComment(p)) {
      if (const char *after{SkipCComment(p)}) {
        p = after;
      } else {
        break;
      }
    } else {
      break;
    }
  }
  return p;
}

const char *Prescanner::SkipCComment(const char *p) const {
  char star{' '}, slash{' '};
  p += 2;
  while (star != '*' || slash != '/') {
    if (p >= limit_) {
      return nullptr; // signifies an unterminated comment
    }
    star = slash;
    slash = *p++;
  }
  return p;
}

bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(LanguageFeature::ClassicCComments, GetCurrentProvenance(),
            "nonstandard usage: C-style comment"_port_en_US);
      }
      SkipCComments();
    }
    if (IsSpaceOrTab(at_)) {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n' && !omitNewline_) {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  if (insertASpace_) {
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    } else if (IsLetter(*at_) && !preventHollerith_ &&
        parenthesisNesting_ > 0) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    int parts{1};
    const char *afterLast{nullptr};
    do {
      EmitChar(tokens, *at_);
      ++at_, ++column_;
      afterLast = at_;
      if (SkipToNextSignificantCharacter() && IsLegalIdentifierStart(*at_)) {
        tokens.CloseToken();
        ++parts;
      }
    } while (IsLegalInIdentifier(*at_));
    if (parts >= 3) {
      // Subtlety: When an identifier is split across three or more continuation
      // lines (or two continuation lines, immediately preceded or followed
      // by '&' free form continuation line markers, its parts are kept as
      // distinct pp-tokens so that macro replacement operates on them
      // independently.  This trick accommodates the historic practice of
      // using line continuation for token pasting after replacement.
    } else if (parts == 2) {
      if (afterLast && afterLast < limit_) {
        afterLast = SkipWhiteSpace(afterLast);
      }
      if ((start > start_ && start[-1] == '&') ||
          (afterLast && afterLast < limit_ &&
              (*afterLast == '&' || *afterLast == '\n'))) {
        // call &                call foo&        call foo&
        //   &MACRO&      OR       &MACRO&   OR     &MACRO
        //   &foo(...)             &(...)
      } else {
        tokens.ReopenLastToken();
      }
    }
    if (InFixedFormSource()) {
      SkipSpaces();
    }
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
      preventHollerith_ = false;
    } else {
      preventHollerith_ = true; // DO 10 H = ...
    }
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    char ch{*at_};
    if (ch == '(') {
      if (parenthesisNesting_++ == 0) {
        isPossibleMacroCall_ = tokens.SizeInTokens() > 0 &&
            preprocessor_.IsFunctionLikeDefinition(
                tokens.TokenAt(tokens.SizeInTokens() - 1));
      }
    } else if (ch == ')' && parenthesisNesting_ > 0) {
      --parenthesisNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    } else if (ch == ';' && InFixedFormSource()) {
      SkipSpaces();
      if (IsDecimalDigit(*at_)) {
        if (features_.ShouldWarn(
                common::LanguageFeature::MiscSourceExtensions)) {
          Say(common::LanguageFeature::MiscSourceExtensions,
              GetProvenanceRange(at_, at_ + 1),
              "Label should be in the label field"_port_en_US);
        }
      }
    }
  }
  tokens.CloseToken();
  return true;
}

bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
  char ed{ToLowerCaseLetter(*at_)};
  if (ed != 'e' && ed != 'd') {
    return false;
  }
  EmitCharAndAdvance(tokens, ed);
  if (*at_ == '+' || *at_ == '-') {
    EmitCharAndAdvance(tokens, *at_);
  }
  while (IsDecimalDigit(*at_)) {
    EmitCharAndAdvance(tokens, *at_);
  }
  if (*at_ == '_') {
    while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
    }
  }
  return true;
}

void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  continuationInCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    // Here's a weird edge case.  When there's a two or more following
    // continuation lines at this point, and the entire significant part of
    // the next continuation line is the name of a keyword macro, replace
    // it in the character literal with its definition.  Example:
    //   #define FOO foo
    //   subroutine subr() bind(c, name="my_&
    //     &FOO&
    //     &_bar") ...
    // produces a binding name of "my_foo_bar".
    while (at_[1] == '&' && nextLine_ < limit_ && !InFixedFormSource()) {
      const char *idStart{nextLine_};
      if (const char *amper{SkipWhiteSpace(nextLine_)}; *amper == '&') {
        idStart = amper + 1;
      }
      if (IsLegalIdentifierStart(*idStart)) {
        std::size_t idLen{1};
        for (; IsLegalInIdentifier(idStart[idLen]); ++idLen) {
        }
        if (idStart[idLen] == '&') {
          CharBlock id{idStart, idLen};
          if (preprocessor_.IsNameDefined(id)) {
            TokenSequence ppTokens;
            ppTokens.Put(id, GetProvenance(idStart));
            if (auto replaced{
                    preprocessor_.MacroReplacement(ppTokens, *this)}) {
              tokens.Put(*replaced);
              at_ = &idStart[idLen - 1];
              NextLine();
              continue; // try again on the next line
            }
          }
        }
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later).  There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false; // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        break;
      }
      inCharLiteral_ = true;
    }
  }
  continuationInCharLiteral_ = false;
  inCharLiteral_ = false;
}

void Prescanner::Hollerith(
    TokenSequence &tokens, int count, const char *start) {
  inCharLiteral_ = true;
  CHECK(*at_ == 'h' || *at_ == 'H');
  EmitChar(tokens, 'H');
  while (count-- > 0) {
    if (PadOutCharacterLiteral(tokens)) {
    } else if (*at_ == '\n') {
      if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
        Say(common::UsageWarning::Scanning, GetProvenanceRange(start, at_),
            "Possible truncated Hollerith literal"_warn_en_US);
      }
      break;
    } else {
      NextChar();
      // Each multi-byte character encoding counts as a single character.
      // No escape sequences are recognized.
      // Hollerith is always emitted to the cooked character
      // stream in UTF-8.
      DecodedCharacter decoded{DecodeCharacter(
          encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
      if (decoded.bytes > 0) {
        EncodedCharacter utf8{
            EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
        for (int j{0}; j < utf8.bytes; ++j) {
          EmitChar(tokens, utf8.buffer[j]);
        }
        at_ += decoded.bytes - 1;
      } else {
        Say(GetProvenanceRange(start, at_),
            "Bad character in Hollerith literal"_err_en_US);
        break;
      }
    }
  }
  if (*at_ != '\n') {
    NextChar();
  }
  inCharLiteral_ = false;
}

// In fixed form, source card images must be processed as if they were at
// least 72 columns wide, at least in character literal contexts.
bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
  while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
    if (column_ < fixedFormColumnLimit_) {
      tokens.PutNextTokenChar(' ', spaceProvenance_);
      ++column_;
      return true;
    }
    if (!FixedFormContinuation(false /*no need to insert space*/) ||
        tabInCurrentLine_) {
      return false;
    }
    CHECK(column_ == 7);
    --at_; // point to column 6 of continuation line
    column_ = 6;
  }
  return false;
}

static bool IsAtProcess(const char *p) {
  static const char pAtProc[]{"process"};
  for (std::size_t i{0}; i < sizeof pAtProc - 1; ++i) {
    if (ToLowerCaseLetter(*++p) != pAtProc[i])
      return false;
  }
  return true;
}

bool Prescanner::IsFixedFormCommentLine(const char *start) const {
  const char *p{start};

  // The @process directive must start in column 1.
  if (*p == '@' && IsAtProcess(p)) {
    return true;
  }

  if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
      ((*p == 'D' || *p == 'd') &&
          !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
    return true;
  }
  bool anyTabs{false};
  while (true) {
    if (int n{IsSpace(p)}) {
      p += n;
    } else if (*p == '\t') {
      anyTabs = true;
      ++p;
    } else if (*p == '0' && !anyTabs && p == start + 5) {
      ++p; // 0 in column 6 must treated as a space
    } else {
      break;
    }
  }
  if (!anyTabs && p >= start + fixedFormColumnLimit_) {
    return true;
  }
  if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
    return true;
  }
  return *p == '\n';
}

const char *Prescanner::IsFreeFormComment(const char *p) const {
  p = SkipWhiteSpaceAndCComments(p);
  if (*p == '!' || *p == '\n') {
    return p;
  } else if (*p == '@') {
    return IsAtProcess(p) ? p : nullptr;
  } else {
    return nullptr;
  }
}

std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
  if (!expandIncludeLines_) {
    return std::nullopt;
  }
  const char *p{SkipWhiteSpace(start)};
  if (*p == '0' && inFixedForm_ && p == start + 5) {
    // Accept "     0INCLUDE" in fixed form.
    p = SkipWhiteSpace(p + 1);
  }
  for (const char *q{"include"}; *q; ++q) {
    if (ToLowerCaseLetter(*p) != *q) {
      return std::nullopt;
    }
    p = SkipWhiteSpace(p + 1);
  }
  if (IsDecimalDigit(*p)) { // accept & ignore a numeric kind prefix
    for (p = SkipWhiteSpace(p + 1); IsDecimalDigit(*p);
         p = SkipWhiteSpace(p + 1)) {
    }
    if (*p != '_') {
      return std::nullopt;
    }
    p = SkipWhiteSpace(p + 1);
  }
  if (*p == '"' || *p == '\'') {
    return {p - start};
  }
  return std::nullopt;
}

void Prescanner::FortranInclude(const char *firstQuote) {
  const char *p{firstQuote};
  while (*p != '"' && *p != '\'') {
    ++p;
  }
  char quote{*p};
  std::string path;
  for (++p; *p != '\n'; ++p) {
    if (*p == quote) {
      if (p[1] != quote) {
        break;
      }
      ++p;
    }
    path += *p;
  }
  if (*p != quote) {
    Say(GetProvenanceRange(firstQuote, p),
        "malformed path name string"_err_en_US);
    return;
  }
  p = SkipWhiteSpace(p + 1);
  if (*p != '\n' && *p != '!') {
    const char *garbage{p};
    for (; *p != '\n' && *p != '!'; ++p) {
    }
    if (features_.ShouldWarn(common::UsageWarning::Scanning)) {
      Say(common::UsageWarning::Scanning, GetProvenanceRange(garbage, p),
          "excess characters after path name"_warn_en_US);
    }
  }
  std::string buf;
  llvm::raw_string_ostream error{buf};
  Provenance provenance{GetProvenance(nextLine_)};
  std::optional<std::string> prependPath;
  if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
    prependPath = DirectoryName(currentFile->path());
  }
  const SourceFile *included{
      allSources_.Open(path, error, std::move(prependPath))};
  if (!included) {
    Say(provenance, "INCLUDE: %s"_err_en_US, buf);
  } else if (included->bytes() > 0) {
    ProvenanceRange includeLineRange{
        provenance, static_cast<std::size_t>(p - nextLine_)};
    ProvenanceRange fileRange{
        allSources_.AddIncludedFile(*included, includeLineRange)};
    Preprocessor cleanPrepro{allSources_};
    if (preprocessor_.IsNameDefined("__FILE__"s)) {
      cleanPrepro.DefineStandardMacros(); // __FILE__, __LINE__, &c.
    }
    if (preprocessor_.IsNameDefined("_CUDA"s)) {
      cleanPrepro.Define("_CUDA"s, "1");
    }
    Prescanner{*this, cleanPrepro, /*isNestedInIncludeDirective=*/false}
        .set_encoding(included->encoding())
        .Prescan(fileRange);
  }
}

const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
  const char *p{start};
  while (int n{IsSpace(p)}) {
    p += n;
  }
  if (*p == '#') {
    if (inFixedForm_ && p == start + 5) {
      return nullptr;
    }
  } else {
    p = SkipWhiteSpace(p);
    if (*p != '#') {
      return nullptr;
    }
  }
  return SkipWhiteSpace(p + 1);
}

bool Prescanner::IsNextLinePreprocessorDirective() const {
  return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
}

bool Prescanner::SkipCommentLine(bool afterAmpersand) {
  if (IsAtEnd()) {
    if (afterAmpersand && prescannerNesting_ > 0) {
      // A continuation marker at the end of the last line in an
      // include file inhibits the newline for that line.
      SkipToEndOfLine();
      omitNewline_ = true;
    }
  } else if (inPreprocessorDirective_) {
  } else {
    auto lineClass{ClassifyLine(nextLine_)};
    if (lineClass.kind == LineClassification::Kind::Comment) {
      NextLine();
      return true;
    } else if (lineClass.kind ==
            LineClassification::Kind::ConditionalCompilationDirective ||
        lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
      // Allow conditional compilation directives (e.g., #ifdef) to affect
      // continuation lines.
      // Allow other preprocessor directives, too, except #include
      // (when it does not follow '&'), #define, and #undef (because
      // they cannot be allowed to affect preceding text on a
      // continued line).
      preprocessor_.Directive(TokenizePreprocessorDirective(), *this);
      return true;
    } else if (afterAmpersand &&
        (lineClass.kind == LineClassification::Kind::DefinitionDirective ||
            lineClass.kind == LineClassification::Kind::IncludeDirective ||
            lineClass.kind == LineClassification::Kind::IncludeLine)) {
      SkipToEndOfLine();
      omitNewline_ = true;
      skipLeadingAmpersand_ = true;
    }
  }
  return false;
}

const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
  if (IsAtEnd()) {
    return nullptr;
  }
  tabInCurrentLine_ = false;
  char col1{*nextLine_};
  if (IsFixedFormCommentChar(col1)) {
    int j{1};
    if (InCompilerDirective()) {
      // Must be a continued compiler directive.
      for (; j < 5; ++j) {
        char ch{directiveSentinel_[j - 1]};
        if (ch == '\0') {
          break;
        }
        if (ch != ToLowerCaseLetter(nextLine_[j])) {
          return nullptr;
        }
      }
    } else if (features_.IsEnabled(LanguageFeature::OpenMP)) {
      // Fixed Source Form Conditional Compilation Sentinels.
      if (nextLine_[1] != '$') {
        return nullptr;
      }
      j++;
    } else {
      return nullptr;
    }
    for (; j < 5; ++j) {
      if (nextLine_[j] != ' ') {
        return nullptr;
      }
    }
    const char *col6{nextLine_ + 5};
    if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
      if (mightNeedSpace && !IsSpace(nextLine_ + 6)) {
        insertASpace_ = true;
      }
      return nextLine_ + 6;
    }
    return nullptr;
  } else {
    // Normal case: not in a compiler directive.
    if (col1 == '&' &&
        features_.IsEnabled(
            LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
      // Extension: '&' as continuation marker
      if (features_.ShouldWarn(
              LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
        Say(LanguageFeature::FixedFormContinuationWithColumn1Ampersand,
            GetProvenance(nextLine_), "nonstandard usage"_port_en_US);
      }
      return nextLine_ + 1;
    }
    if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
      tabInCurrentLine_ = true;
      return nextLine_ + 2; // VAX extension
    }
    if ((col1 == ' ' ||
            ((col1 == 'D' || col1 == 'd') &&
                features_.IsEnabled(LanguageFeature::OldDebugLines))) &&
        nextLine_[1] == ' ' && nextLine_[2] == ' ' && nextLine_[3] == ' ' &&
        nextLine_[4] == ' ') {
      const char *col6{nextLine_ + 5};
      if (*col6 != '\n' && *col6 != '0' && !IsSpaceOrTab(col6)) {
        if ((*col6 == 'i' || *col6 == 'I') && IsIncludeLine(nextLine_)) {
          // It's An INCLUDE line, not a continuation
        } else {
          return nextLine_ + 6;
        }
      }
    }
    if (IsImplicitContinuation()) {
      return nextLine_;
    }
  }
  return nullptr; // not a continuation line
}

const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
  const char *p{nextLine_};
  if (p >= limit_) {
    return nullptr;
  }
  p = SkipWhiteSpace(p);
  if (InCompilerDirective()) {
    if (*p++ != '!') {
      return nullptr;
    }
    for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
      if (*s != ToLowerCaseLetter(*p)) {
        return nullptr;
      }
    }
    p = SkipWhiteSpace(p);
    if (*p == '&') {
      if (!ampersand) {
        insertASpace_ = true;
      }
      return p + 1;
    } else if (ampersand) {
      return p;
    } else {
      return nullptr;
    }
  } else {
    if (*p == '&') {
      return p + 1;
    } else if (*p == '!' || *p == '\n' || *p == '#') {
      return nullptr;
    } else if (ampersand || IsImplicitContinuation()) {
      if (continuationInCharLiteral_) {
        // 'a'&            -> 'a''b' == "a'b"
        //   'b'
        if (features_.ShouldWarn(
                common::LanguageFeature::MiscSourceExtensions)) {
          Say(common::LanguageFeature::MiscSourceExtensions,
              GetProvenanceRange(p, p + 1),
              "Character literal continuation line should have been preceded by '&'"_port_en_US);
        }
      } else if (p > nextLine_) {
        --p;
      } else {
        insertASpace_ = true;
      }
      return p;
    } else {
      return nullptr;
    }
  }
}

bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
  // N.B. We accept '&' as a continuation indicator in fixed form, too,
  // but not in a character literal.
  if (*at_ == '&' && inCharLiteral_) {
    return false;
  }
  do {
    if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
      BeginSourceLine(cont);
      column_ = 7;
      NextLine();
      return true;
    }
  } while (SkipCommentLine(false /* not after ampersand */));
  return false;
}

bool Prescanner::FreeFormContinuation() {
  const char *p{at_};
  bool ampersand{*p == '&'};
  if (ampersand) {
    p = SkipWhiteSpace(p + 1);
  }
  if (*p != '\n') {
    if (inCharLiteral_) {
      return false;
    } else if (*p == '!') { // & ! comment - ok
    } else if (ampersand && isPossibleMacroCall_ && (*p == ',' || *p == ')')) {
      return false; // allow & at end of a macro argument
    } else if (features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
      Say(LanguageFeature::CruftAfterAmpersand, GetProvenance(p),
          "missing ! before comment after &"_warn_en_US);
    }
  }
  do {
    if (const char *cont{FreeFormContinuationLine(ampersand)}) {
      BeginSourceLine(cont);
      NextLine();
      return true;
    }
  } while (SkipCommentLine(ampersand));
  return false;
}

// Implicit line continuation allows a preprocessor macro call with
// arguments to span multiple lines.
bool Prescanner::IsImplicitContinuation() const {
  return !inPreprocessorDirective_ && !inCharLiteral_ && isPossibleMacroCall_ &&
      parenthesisNesting_ > 0 && !IsAtEnd() &&
      ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
}

bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
  if (disableSourceContinuation_) {
    return false;
  } else if (*at_ == '\n' || *at_ == '&') {
    if (inFixedForm_) {
      return FixedFormContinuation(mightNeedFixedFormSpace);
    } else {
      return FreeFormContinuation();
    }
  } else if (*at_ == '\\' && at_ + 2 == nextLine_ &&
      backslashFreeFormContinuation_ && !inFixedForm_ && nextLine_ < limit_) {
    // cpp-like handling of \ at end of a free form source line
    BeginSourceLine(nextLine_);
    NextLine();
    return true;
  } else {
    return false;
  }
}

std::optional<Prescanner::LineClassification>
Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
  const char *p{start};
  char col1{*p++};
  if (!IsFixedFormCommentChar(col1)) {
    return std::nullopt;
  }
  char sentinel[5], *sp{sentinel};
  int column{2};
  for (; column < 6; ++column, ++p) {
    if (*p == '\n' || IsSpaceOrTab(p)) {
      break;
    }
    if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
      // OpenMP conditional compilation line: leave the label alone
      break;
    }
    *sp++ = ToLowerCaseLetter(*p);
  }
  if (column == 6) {
    if (*p == '0') {
      ++p;
    } else if (int n{IsSpaceOrTab(p)}) {
      p += n;
    } else {
      // This is a Continuation line, not an initial directive line.
      return std::nullopt;
    }
  }
  if (sp == sentinel) {
    return std::nullopt;
  }
  *sp = '\0';
  if (const char *ss{IsCompilerDirectiveSentinel(
          sentinel, static_cast<std::size_t>(sp - sentinel))}) {
    std::size_t payloadOffset = p - start;
    return {LineClassification{
        LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
  }
  return std::nullopt;
}

std::optional<Prescanner::LineClassification>
Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
  if (const char *p{SkipWhiteSpace(start)}; p && *p++ == '!') {
    if (auto maybePair{IsCompilerDirectiveSentinel(p)}) {
      auto offset{static_cast<std::size_t>(maybePair->second - start)};
      return {LineClassification{LineClassification::Kind::CompilerDirective,
          offset, maybePair->first}};
    }
  }
  return std::nullopt;
}

Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
  std::uint64_t packed{0};
  for (char ch : dir) {
    packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
  }
  compilerDirectiveBloomFilter_.set(packed % prime1);
  compilerDirectiveBloomFilter_.set(packed % prime2);
  compilerDirectiveSentinels_.insert(dir);
  return *this;
}

const char *Prescanner::IsCompilerDirectiveSentinel(
    const char *sentinel, std::size_t len) const {
  std::uint64_t packed{0};
  for (std::size_t j{0}; j < len; ++j) {
    packed = (packed << 8) | (sentinel[j] & 0xff);
  }
  if (len == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
      !compilerDirectiveBloomFilter_.test(packed % prime2)) {
    return nullptr;
  }
  const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, len))};
  return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
}

const char *Prescanner::IsCompilerDirectiveSentinel(CharBlock token) const {
  const char *p{token.begin()};
  const char *end{p + token.size()};
  while (p < end && (*p == ' ' || *p == '\n')) {
    ++p;
  }
  if (p < end && *p == '!') {
    ++p;
  }
  while (end > p && (end[-1] == ' ' || end[-1] == '\t')) {
    --end;
  }
  return end > p && IsCompilerDirectiveSentinel(p, end - p) ? p : nullptr;
}

std::optional<std::pair<const char *, const char *>>
Prescanner::IsCompilerDirectiveSentinel(const char *p) const {
  char sentinel[8];
  for (std::size_t j{0}; j + 1 < sizeof sentinel && *p != '\n'; ++p, ++j) {
    if (int n{*p == '&' ? 1 : IsSpaceOrTab(p)}) {
      if (j > 0) {
        sentinel[j] = '\0';
        p = SkipWhiteSpace(p + n);
        if (*p != '!') {
          if (const char *sp{IsCompilerDirectiveSentinel(sentinel, j)}) {
            return std::make_pair(sp, p);
          }
        }
      }
      break;
    } else {
      sentinel[j] = ToLowerCaseLetter(*p);
    }
  }
  return std::nullopt;
}

constexpr bool IsDirective(const char *match, const char *dir) {
  for (; *match; ++match) {
    if (*match != ToLowerCaseLetter(*dir++)) {
      return false;
    }
  }
  return true;
}

Prescanner::LineClassification Prescanner::ClassifyLine(
    const char *start) const {
  if (inFixedForm_) {
    if (std::optional<LineClassification> lc{
            IsFixedFormCompilerDirectiveLine(start)}) {
      return std::move(*lc);
    }
    if (IsFixedFormCommentLine(start)) {
      return {LineClassification::Kind::Comment};
    }
  } else {
    if (std::optional<LineClassification> lc{
            IsFreeFormCompilerDirectiveLine(start)}) {
      return std::move(*lc);
    }
    if (const char *bang{IsFreeFormComment(start)}) {
      return {LineClassification::Kind::Comment,
          static_cast<std::size_t>(bang - start)};
    }
  }
  if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
    return {LineClassification::Kind::IncludeLine, *quoteOffset};
  }
  if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
    if (IsDirective("if", dir) || IsDirective("elif", dir) ||
        IsDirective("else", dir) || IsDirective("endif", dir)) {
      return {LineClassification::Kind::ConditionalCompilationDirective};
    } else if (IsDirective("include", dir)) {
      return {LineClassification::Kind::IncludeDirective};
    } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
      return {LineClassification::Kind::DefinitionDirective};
    } else {
      return {LineClassification::Kind::PreprocessorDirective};
    }
  }
  return {LineClassification::Kind::Source};
}

Prescanner::LineClassification Prescanner::ClassifyLine(
    TokenSequence &tokens, Provenance newlineProvenance) const {
  // Append a newline temporarily.
  tokens.PutNextTokenChar('\n', newlineProvenance);
  tokens.CloseToken();
  const char *ppd{tokens.ToCharBlock().begin()};
  LineClassification classification{ClassifyLine(ppd)};
  tokens.pop_back(); // remove the newline
  return classification;
}

void Prescanner::SourceFormChange(std::string &&dir) {
  if (dir == "!dir$ free") {
    inFixedForm_ = false;
  } else if (dir == "!dir$ fixed") {
    inFixedForm_ = true;
  }
}

// Acquire and append compiler directive continuation lines to
// the tokens that constitute a compiler directive, even when those
// directive continuation lines are the result of macro expansion.
// (Not used when neither the original compiler directive line nor
// the directive continuation line result from preprocessing; regular
// line continuation during tokenization handles that normal case.)
bool Prescanner::CompilerDirectiveContinuation(
    TokenSequence &tokens, const char *origSentinel) {
  if (inFixedForm_ || tokens.empty() ||
      tokens.TokenAt(tokens.SizeInTokens() - 1) != "&") {
    return false;
  }
  LineClassification followingLine{ClassifyLine(nextLine_)};
  if (followingLine.kind == LineClassification::Kind::Comment) {
    nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
    NextLine();
    return true;
  }
  CHECK(origSentinel != nullptr);
  directiveSentinel_ = origSentinel; // so InCompilerDirective() is true
  const char *nextContinuation{
      followingLine.kind == LineClassification::Kind::CompilerDirective
          ? FreeFormContinuationLine(true)
          : nullptr};
  if (!nextContinuation &&
      followingLine.kind != LineClassification::Kind::Source) {
    return false;
  }
  auto origNextLine{nextLine_};
  BeginSourceLine(nextLine_);
  NextLine();
  if (nextContinuation) {
    // What follows is !DIR$ & xxx; skip over the & so that it
    // doesn't cause a spurious continuation.
    at_ = nextContinuation;
  } else {
    // What follows looks like a source line before macro expansion,
    // but might become a directive continuation afterwards.
    SkipSpaces();
  }
  TokenSequence followingTokens;
  while (NextToken(followingTokens)) {
  }
  if (auto followingPrepro{
          preprocessor_.MacroReplacement(followingTokens, *this)}) {
    followingTokens = std::move(*followingPrepro);
  }
  followingTokens.RemoveRedundantBlanks();
  std::size_t startAt{0};
  std::size_t following{followingTokens.SizeInTokens()};
  bool ok{false};
  if (nextContinuation) {
    ok = true;
  } else {
    startAt = 2;
    if (startAt < following && followingTokens.TokenAt(0) == "!") {
      CharBlock sentinel{followingTokens.TokenAt(1)};
      if (!sentinel.empty() &&
          std::memcmp(sentinel.begin(), origSentinel, sentinel.size()) == 0) {
        ok = true;
        while (
            startAt < following && followingTokens.TokenAt(startAt).IsBlank()) {
          ++startAt;
        }
        if (startAt < following && followingTokens.TokenAt(startAt) == "&") {
          ++startAt;
        }
      }
    }
  }
  if (ok) {
    tokens.pop_back(); // delete original '&'
    tokens.Put(followingTokens, startAt, following - startAt);
    tokens.RemoveRedundantBlanks();
  } else {
    nextLine_ = origNextLine;
  }
  return ok;
}

// Similar, but for source line continuation after macro replacement.
bool Prescanner::SourceLineContinuation(TokenSequence &tokens) {
  if (!inFixedForm_ && !tokens.empty() &&
      tokens.TokenAt(tokens.SizeInTokens() - 1) == "&") {
    LineClassification followingLine{ClassifyLine(nextLine_)};
    if (followingLine.kind == LineClassification::Kind::Comment) {
      nextLine_ += followingLine.payloadOffset; // advance to '!' or newline
      NextLine();
      return true;
    } else if (const char *nextContinuation{FreeFormContinuationLine(true)}) {
      BeginSourceLine(nextLine_);
      NextLine();
      TokenSequence followingTokens;
      at_ = nextContinuation;
      while (NextToken(followingTokens)) {
      }
      if (auto followingPrepro{
              preprocessor_.MacroReplacement(followingTokens, *this)}) {
        followingTokens = std::move(*followingPrepro);
      }
      followingTokens.RemoveRedundantBlanks();
      tokens.pop_back(); // delete original '&'
      tokens.Put(followingTokens);
      return true;
    }
  }
  return false;
}
} // namespace Fortran::parser
llvm/flang/lib/Parser/prescan.cpp