// lexer.cc            see license.txt for copyright and terms of use
// code for lexer.h

#include "lexer.h"       // this module
#include "cc_lang.h"     // CCLang

#include <ctype.h>       // isdigit
#include <stdlib.h>      // atoi

using namespace sm;

/*
 * Note about nonseparating tokens and the 'checkForNonsep' function:
 *
 * To diagnose and report erroneous syntax like "0x5g", which would
 * naively be parsed as "0x5" and "g" (two legal tokens), I divide
 * all tokens into two classes: separating and nonseparating.
 *
 * Separating tokens are allowed to be adjacent to each other and
 * to nonseparating tokens.  An example is "(".
 *
 * Nonseparating tokens are not allowed to be adjacent to each other.
 * They must be separated by either whitespace, or at least one
 * separating token.  The nonseparating tokens are identifiers,
 * alphabetic keywords, and literals.  The lexer would of course never
 * yield two adjacent keywords, due to maximal munch, but classifying
 * such an event as an error is harmless.
 *
 * By keeping track of whether the last token yielded is separating or
 * not, we'll see (e.g.) "0x5g" as two consecutive nonseparating tokens,
 * and can report that as an error.
 *
 * The C++ standard is rather vague on this point as far as I can
 * tell.  I haven't checked the C standard.  In the C++ standard,
 * section 2.6 paragraph 1 states:
 *
 *  "There are five kinds of tokens: identifiers, keywords, literals,
 *   operators, and other separators.  Blanks, horizontal and
 *   vertical tabs, newlines, formfeeds, and comments (collectively,
 *   "whitespace"), as described below, are ignored except as they
 *   serve to separate tokens.  [Note: Some white space is required
 *   to separate otherwise adjacent identifiers, keywords, numeric
 *   literals, and alternative tokens containing alphabetic
 *   characters.]"
 *
 * The fact that the restriction is stated only in a parenthetical note
 * is of course nonideal.  I think the qualifier "numeric" on "literals"
 * is a mistake, otherwise "a'b'" would be a legal token sequence.  I
 * do not currently implement the "alternative tokens".
 *
 * Update: Mozilla includes things like "foo""bar", i.e. directly
 * adjacent string literals.  Therefore I'm going to interpret (the
 * note in) the standard literally, and take char and string literals
 * to be separating.
 */


// -------------------- TokenType ---------------------
// these aren't emitted into cc_tokens.cc because doing so would
// make that output dependent on smbase/xassert.h
// Return the entry of 'tokenNameTable' for token kind 'type'.
// Asserts (via xassert) that the table has one entry per token kind
// and that 'type' lies inside the table.
char const *toString(TokenType type)
{
  // the name table and the enumeration must agree in size
  xassert(tokenNameTableSize == NUM_TOKEN_TYPES);

  // comparing as unsigned also rejects negative values
  unsigned const index = (unsigned)type;
  xassert(index < (unsigned)NUM_TOKEN_TYPES);

  return tokenNameTable[index];
}

// Return the entry of 'tokenFlagTable' for token kind 'type', as a
// TokenFlag.  Asserts (via xassert) that 'type' is within range.
TokenFlag tokenFlags(TokenType type)
{
  // comparing as unsigned also rejects negative values
  unsigned const index = (unsigned)type;
  xassert(index < (unsigned)NUM_TOKEN_TYPES);

  return (TokenFlag)tokenFlagTable[index];
}


// ------------------------ Lexer -------------------
// Construct a lexer from a file name.  The name is handed to the
// BaseLexer and also interned in 's' as the initial "previous #line
// file".  After construction the lexer has already read one token.
Lexer::Lexer(StringTable &s, CCLang &L, char const *fname)
  : BaseLexer(s, fname),
    prevIsNonsep(false),               // nothing has been yielded yet
    prevHashLineFile(s.add(fname)),
    currentMacro(NULL),                // not inside any macro marker
    lang(L)
{
  // pick the tokenizer appropriate for 'lang' and run it once so that
  // the first token is available immediately
  NextTokenFunc primeFunc = getTokenFunc();
  primeFunc(this);
}


// Construct a lexer over an in-memory buffer 'buf' of 'len' bytes,
// with 'initLoc' passed to the BaseLexer.  The initial "previous
// #line file" is whatever file sourceLocManager reports for 'initLoc'.
Lexer::Lexer(StringTable &s, CCLang &L, SourceLoc initLoc,
             char const *buf, int len)
  : BaseLexer(s, initLoc, buf, len),
    prevIsNonsep(false),               // nothing has been yielded yet
    prevHashLineFile(s.add(sourceLocManager->getFile(initLoc))),
    currentMacro(NULL),                // not inside any macro marker
    lang(L)
{
  // Unlike the file-name constructor, this one deliberately does
  // *not* read a first token.  The original author suspected the
  // priming done by the other constructor is the mistake, but left
  // both as they are.
}


Lexer::~Lexer()
{
  // no explicit teardown is performed by this destructor
}


// Called for whitespace: defer to the base class, then record that a
// separator has been seen.
void Lexer::whitespace()
{
  BaseLexer::whitespace();

  // whitespace of any kind is enough to separate two nonseparating
  // tokens, so the next token is never "adjacent" to the previous one
  prevIsNonsep = false;
}


// 'tok' and 'svalTok' are intentionally not inline: yylex() should
// stay small and consist mostly of calls into routines like these,
// which are then free to have plenty inlined into themselves.
//
// Yield token 't', which has a single spelling and therefore carries
// no semantic value.
int Lexer::tok(TokenType t)
{
  checkForNonsep(t);
  updLoc();

  // store the null value so that accidental use of 'sval' for a
  // single-spelling token is easy to notice
  sval = NULL_SVAL;

  return t;
}


// Yield token 't' and set 'sval' to the string produced by
// addString() for the current lexeme (yytext, yyleng).
int Lexer::svalTok(TokenType t)
{
  checkForNonsep(t);
  updLoc();

  StringRef const spelling = addString(yytext, yyleng);
  sval = (SemanticValue)spelling;

  return t;
}


// Yield token 't' when lang.isCplusplus is set; otherwise yield the
// current lexeme as an ordinary TOK_NAME.
int Lexer::alternateKeyword_tok(TokenType t)
{
  if (!lang.isCplusplus) {
    // in C mode these spellings are just identifiers
    return svalTok(TOK_NAME);
  }

  return tok(t);
}


// Parse a "#line"-style directive and record it with 'srcFile'.
//
// examples of recognized forms
//   #line 4 "foo.cc"       // canonical form
//   # 4 "foo.cc"           // "line" can be omitted
//   # 4 "foo.cc" 1         // extra stuff is ignored
//   # 4                    // omitted filename means "same as previous"
//
// 'directive' points at the leading '#', and 'len' is the number of
// characters in the directive.  The buffer must be writable: a NUL is
// temporarily stored over the closing quote of the filename.
//
// Malformed directives are reported through pp_err() and otherwise
// ignored.  All pointer scanning is confined to [directive, directive+len),
// so nothing at or beyond 'directive+len' is read or written by the
// scanning logic itself.
// NOTE(review): atoi() and strtable.add() still rely on the text being
// terminated (by a non-digit / the temporary NUL respectively).
void Lexer::parseHashLine(char *directive, int len)
{
  char * const endp = directive+len;

  directive++;        // skip "#"
  if (endp - directive >= 4 && *directive == 'l') {
    directive += 4;   // skip "line"
  }

  // skip whitespace
  while (directive < endp && (*directive==' ' || *directive=='\t')) {
    directive++;
  }

  // parse the line number; the cast keeps isdigit() well-defined for
  // characters with the high bit set
  if (directive >= endp || !isdigit((unsigned char)*directive)) {
    pp_err("malformed #line directive line number");
    return;
  }
  int lineNum = atoi(directive);

  // skip digits and whitespace
  while (directive < endp && isdigit((unsigned char)*directive)) {
    directive++;
  }
  while (directive < endp && (*directive==' ' || *directive=='\t')) {
    directive++;
  }

  if (directive < endp && *directive == '\n') {
    // no filename: use previous
    srcFile->addHashLine(curLine, lineNum, prevHashLineFile);
    return;
  }

  if (directive >= endp || *directive != '\"') {
    pp_err("#line directive missing leading quote on filename");
    return;
  }
  directive++;

  // look for trailing quote
  char *q = directive;
  while (q < endp && *q != '\"') {
    q++;
  }
  if (q >= endp) {
    // ran off the end of the directive without finding the quote; do
    // not inspect (or later overwrite) the character at 'endp'
    pp_err("#line directive missing trailing quote on filename");
    return;
  }

  // temporarily write a NUL so we can make a StringRef
  *q = 0;
  StringRef fname = strtable.add(directive);
  *q = '\"';

  // remember this directive
  srcFile->addHashLine(curLine, lineNum, fname);

  // remember the filename for future #line directives that
  // don't explicitly include one
  prevHashLineFile = fname;
}


// Report a preprocessing-level error on std::cerr and count it in
// 'errors'.  The position printed refers to the preprocessed source
// itself; #line information is deliberately ignored.
void Lexer::pp_err(char const *msg)
{
  errors++;

  // Only the line is printed.  One is subtracted because whitespace
  // (including the final newline) has already been accounted for by
  // the time the directive is processed.
  std::cerr << srcFile->name << ":" << (curLine-1);
  std::cerr << ": error: " << msg << std::endl;
}


// Tokenizer entry point handed out by getTokenFunc(): advance 'lex'
// (which must really be a Lexer) by one token.
STATICDEF void Lexer::tokenFunc(LexerInterface *lex)
{
  Lexer *self = static_cast<Lexer*>(lex);

  // run the flex scanner; besides returning the token kind, this
  // updates 'loc' and sets 'sval' as appropriate
  self->type = self->yylex();
}


// Like tokenFunc(), but any token whose flags include TF_CPLUSPLUS
// is turned into a TOK_NAME whose semantic value is the token's
// table spelling.
STATICDEF void Lexer::c_tokenFunc(LexerInterface *lex)
{
  // scan one token exactly as tokenFunc() does
  Lexer *self = static_cast<Lexer*>(lex);
  self->type = self->yylex();

  TokenType const kind = (TokenType)(self->type);
  if (!(tokenFlags(kind) & TF_CPLUSPLUS)) {
    // not a C++-only token: leave it as scanned
    return;
  }

  // intern the keyword's spelling and present it as an identifier
  // through the LexerInterface fields
  char const *spelling = toString(kind);
  self->sval = (SemanticValue)self->strtable.add(spelling);
  self->type = TOK_NAME;
}


// Select the tokenizer function for this lexer's language settings:
// the normal one when lang.recognizeCppKeywords is set (the expected
// case), otherwise the one that maps C++ keywords into identifiers.
Lexer::NextTokenFunc Lexer::getTokenFunc() const
{
  return lang.recognizeCppKeywords ? &Lexer::tokenFunc
                                   : &Lexer::c_tokenFunc;
}

// Describe the current token.  Tokens flagged TF_MULTISPELL yield
// "<kind name>: <spelling>"; all others yield just the kind name.
string Lexer::tokenDesc() const
{
  TokenType const kind = (TokenType)type;

  if (!(tokenFlags(kind) & TF_MULTISPELL)) {
    // single spelling: the static name table says it all
    return string(toString(kind));
  }

  // multiple spellings: 'sval' is decoded as a StringRef holding the
  // actual text
  return stringc << toString(kind) << ": " << (StringRef)sval;
}

// Describe token kind 'kind' using only the static name table.
string Lexer::tokenKindDesc(int kind) const
{
  return string(toString((TokenType)kind));
}

// Verbose form of tokenKindDesc(): the table name followed by the
// numeric kind in parentheses, e.g. "<name> (<kind>)".
string Lexer::tokenKindDescV(int kind) const
{
  return stringc << toString((TokenType)kind)
                 << " (" << kind << ")";
}

// Parse a "line:col" expression at 'str' and encode it as a
// SourceLoc in 'file'.
//
// Returns SL_UNKNOWN if the line number is missing or zero; in that
// case '*endptr' is left untouched.  Otherwise, the single character
// following the line number (normally ':') is skipped, the column is
// parsed (0 if no digits are present), and, if 'endptr' is non-NULL,
// '*endptr' is set to the first character after the column.
static SourceLoc str2loc(char *str, char **endptr, char const * file) {
  int line = strtol(str, &str, 10);
  if (!line) return SL_UNKNOWN;

  // skip the separator, but never step over the terminating NUL:
  // doing so would make the next strtol read past the end of the string
  if (*str != '\0') str++;

  int col = strtol(str, &str, 10);
  if (endptr) *endptr = str;

  return sourceLocManager->encodeLineCol(file, line, col);
}

// comment of form /*<NAME lineStart:colStart endEnd:colEnd*/
void Lexer::macroUndoStart(char *comment, int len) {
  updLoc();
  prevIsNonsep = false;
  if (!sourceLocManager->useHashLines) return;
  StringRef name(NULL);
  SourceLoc preStartLoc(SL_UNKNOWN);
  SourceLoc preEndLoc(SL_UNKNOWN);
  bool isParam = false;

  if (char *spc = strchr(comment, ' ')) {
    char *in = spc + 1;
    name = addString(comment, (int)(spc - comment));
    preStartLoc = str2loc(in, &in, prevHashLineFile);
    preEndLoc = str2loc(in + 1, NULL, prevHashLineFile);
  } else {
    name = addString(comment, len);
    isParam = strchr(name, ':') != NULL;
    if (isParam) {
      //this is a macro parameter
      MacroUndoEntry *parent = currentMacro;
      // get the parent that this macro param is defined in
      for (; strncmp(parent->name, name, strlen(parent->name));
           parent = parent->parent) {
      }
      for (TailListIterNC<MacroDefinition> it(parent->params);
           !it.isDone();
           it.adv()) {
        MacroDefinition *def = it.data();
        if (def->name != name) continue;
        preStartLoc = def->fromLoc;
        preEndLoc = def->toLoc;
      }
    }
  }
  MacroUndoEntry *current = new MacroUndoEntry(nextLoc, preStartLoc, preEndLoc, name, 
					       currentMacro);
  // add top-level and nested params to the list
  //if (!currentMacro || isParam) {
  macroUndoLog.append(current);
    //}
  currentMacro = current;
}

// Decode a macro definition marker of the form
//   NAME lineStart:colStart lineEnd:colEnd
//
// If 'macro' contains no space (i.e. no position information), this
// does nothing and '*m' is left untouched.  Otherwise both positions
// are decoded relative to 'prevHashLineFile' and, when 'm' is
// non-NULL, '*m' receives a newly allocated MacroDefinition for NAME.
// 'len' is not needed for any of this.
void Lexer::addMacroDefinition(char *macro, int len, MacroDefinition **m) {
  (void)len;

  char *nameEnd = strchr(macro, ' ');
  if (!nameEnd) {
    // name only, no positions: nothing to record
    return;
  }

  // the positions are always decoded, even if the caller does not
  // want the resulting definition
  char *cursor = nameEnd + 1;
  SourceLoc fromLoc = str2loc(cursor, &cursor, prevHashLineFile);
  SourceLoc toLoc = str2loc(cursor + 1, NULL, prevHashLineFile);

  if (m) {
    StringRef name = addString(macro, int(nameEnd - macro));
    *m = new MacroDefinition(name, fromLoc, toLoc);
  }
}

// Handle a macro definition marker, i.e. a comment of form
//   /*<mNAME lineStart:colStart lineEnd:colEnd*/
// The marker acts as a separator between tokens.  Its contents are
// only examined when sourceLocManager->useHashLines is set.
// NOTE(review): no MacroDefinition out-parameter is passed to
// addMacroDefinition() here, so the decoded definition itself is not
// kept by this function.
void Lexer::macroDefinition(char *macro, int len) {
  updLoc();
  prevIsNonsep = false;

  if (sourceLocManager->useHashLines) {
    addMacroDefinition(macro, len);
  }
}
  
// comment of form /*<!NAME lineStart:colStart endEnd:colEnd*/
void Lexer::macroParamDefinition(char *macro, int len) {
  updLoc();
  prevIsNonsep = false;

  if (!sourceLocManager->useHashLines) return;
  MacroDefinition *param = NULL;
  addMacroDefinition(macro, len, &param);
  if (!currentMacro) {
    pp_err("Invalid macro parameter definition");
    return;
  }
  // only record parameters with positions
  if (param) currentMacro->params.append(param);
  // macroParamDef always follows macroUndoStart
  // thus code doesn't start until the last
  // macroParamDef is done
  currentMacro->postStartLoc = nextLoc;
}

void Lexer::macroUndoStop() {
  SourceLoc postEndLoc = nextLoc;

  updLoc();
  prevIsNonsep = false;
  if (!sourceLocManager->useHashLines) return;

  if (!currentMacro) {
    std::cerr << toString(postEndLoc) << ": Macro end tag without a start tag" << std::endl;
    exit(1);
    return;
  }

  currentMacro->postEndLoc = postEndLoc;
  currentMacro = currentMacro->parent;
}

// EOF
