/* read-files.l
 *
 * Hacked to map all words into lower case
 */

/* Everything between the lines "%{" and "%}" is copied verbatim into read-files.c
 */

%{
#include "read-files.h"

#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

  /* Scanner position, shared between the rule actions and the functions that
   * follow yylex().  lineno is reset to 1 by read_files_new_file() and bumped
   * by every rule that consumes a newline; filename is the path that
   * read_files_new_file() opens.  Both appear in every diagnostic. */
  int    lineno = 0;        /* these globals are useful for debugging */
  string filename;

  /* Token codes returned by yylex().
   *   eof               - 0, the value yylex() returns at end of input
   *   word, tag         - ordinary word / tag; the matched text is in yytext
   *   sentence_boundary - a line made up of '=' characters
   *   s_and_p, at_and_t, chiat, opensqb, closesqb
   *                     - hard-wired tokens; read_files_next_sentence()
   *                       supplies a fixed word and tag for each of them */
  enum   lexreturntype { eof = 0, word, tag, sentence_boundary, s_and_p, at_and_t, chiat, opensqb, closesqb };

  /* Write a scanner diagnostic to cerr in the form
   *   <filename>:<lineno> <s1><s2>
   * s1 is the fixed description and s2 the offending text (the rule actions
   * pass yytext).  Both parameters are pointers to const because the callers
   * pass string literals, which cannot be converted to plain `char *` in
   * standard C++ (C++11 onwards); yytext still converts implicitly. */
  inline void message(const char *s1, const char *s2) {
    cerr << filename << ':' << lineno << ' ' << s1 << s2 << endl;
  }

%}

/* Declare 3 recognition states in addition to the default INITIAL state.
 * They are exclusive (%x) start conditions: while one of them is active,
 * only the rules explicitly prefixed with that state can match.
 *   TAGSEPARATOR - a word has just been returned; a slash is expected
 *   TAG          - the slash has been consumed; a tag is expected
 *   POSTTAGJUNK  - a tag has just been returned; skip to the next blank
 */

%x TAGSEPARATOR
%x TAG
%x POSTTAGJUNK

/* Declare some regular expression abbreviations */

/* WORD: one or more items, each either a character other than space, tab,
 * newline, slash or a square bracket, or a backslash followed by any
 * character other than newline (so an escaped slash can sit inside a word).
 */
WORD	([^ \t\n/\[\]]|"\\".)+
/* TAG: one or more characters other than space, tab, newline, slash, square
 * brackets or the vertical bar.
 */
TAG     [^ \t\n/\[\]\|]+
/* HEADLINE: a complete line that is either the literal *x* followed by one
 * or more repetitions of the literal x*, or the literal *x* at both ends
 * with a blank (space or tab) just inside each end.
 */
HEADLINE ^(("*x*""x*"+)|("*x*"[ \t].*[ \t]"*x*"))$

/* The pattern/action pairs follow the line "%%".  They define the function yylex().
 * Rules without a <STATE> prefix are active only in the INITIAL state, because
 * all the other states are exclusive.  Comments inside the rules section are
 * indented so that flex does not try to read them as patterns.
 */

%%

  /* Lines matching HEADLINE are discarded without returning a token. */
{HEADLINE}			/* skip headlines */

  /* Hard-wired whole-token matches.  Each returns its own token code, and
   * read_files_next_sentence() pushes a fixed word and tag for it, so the
   * scanner stays in the INITIAL state. */
"S*/NNP&P/NN"			return s_and_p;  /* Hardwire corrections */
"AT*/NNP&T/NN"			return at_and_t; /*  for Penn WSJ errors */
"Chiat\\/NNP"			return chiat;
"[/("				return opensqb;
"]/)"				return closesqb;

  /* A line holding nothing but one or more '=' characters, optionally
   * surrounded by blanks, separates sentences. */
^[ \t]*"="+[ \t]*$		return sentence_boundary;

  /* A square bracket on its own (not part of one of the hard-wired tokens
   * above, which are longer matches) is dropped. */
"["				/* eat square brackets */
"]"				/* eat square brackets */

  /* A curly bracket is dropped only when it is followed by a blank or a
   * newline; the newline variants also keep lineno up to date. */
"{"[ \t]			/* eat curly brackets */
"}"[ \t]			/* eat curly brackets */
"{\n"				lineno++; /* eat curly brackets */
"}\n"				lineno++; /* eat curly brackets */

  /* word / tag state machine:
   *   INITIAL      --word-->   TAGSEPARATOR  (returns word, text in yytext)
   *   TAGSEPARATOR --slash-->  TAG
   *   TAG          --tag-->    POSTTAGJUNK   (returns tag, text in yytext)
   *   POSTTAGJUNK  --blank or newline--> INITIAL
   * The second word rule is a quoted literal, so it matches the exact five
   * characters A[fj]; WORD alone cannot, since it excludes square brackets. */
{WORD}				BEGIN(TAGSEPARATOR); return word;
"A[fj]"				BEGIN(TAGSEPARATOR); return word;
  /* In TAGSEPARATOR and TAG an unexpected character is reported and
   * consumed, and the scanner moves on to the next state regardless.
   * NOTE(review): `.` does not match a newline, so a newline reaching either
   * of these two exclusive states is not covered by any rule here and falls
   * to flex's default rule (echo to yyout) -- confirm that is acceptable. */
<TAGSEPARATOR>"/"		BEGIN(TAG); /* eat separator */
<TAGSEPARATOR>.			message("Expected a `/': ", yytext); BEGIN(TAG);
<TAG>{TAG}		   	BEGIN(POSTTAGJUNK); return tag;
<TAG>.			   	message("Expected a tag: ", yytext); BEGIN(POSTTAGJUNK);
  /* After a tag, whatever runs up to the next blank or newline is thrown
   * away (TAG stops at a vertical bar, slash or bracket, so the remainder of
   * such a token ends up here).  Each of the three rules returns the scanner
   * to INITIAL. */
<POSTTAGJUNK>([^ \t\n])*	BEGIN(INITIAL); /* eat any funny stuff following tag */
<POSTTAGJUNK>[ \t]              BEGIN(INITIAL);
<POSTTAGJUNK>"\n"               lineno++; BEGIN(INITIAL);

  /* Blanks between tokens are skipped; newlines are counted. */
[ \t]+			        /* eat white space */
\n				lineno++;

  /* Catch-all for INITIAL: report the single offending character and carry
   * on scanning. */
.				message("Unrecognized input: ", yytext);


%%

/* These functions are copied verbatim into read-files.c _after_ yylex() 
 */

/* Point the scanner at the file named by the global `filename`.
 *
 * On success the line counter restarts at 1 and the scanner is put back into
 * its INITIAL start condition.  If the file cannot be opened, a diagnostic is
 * written to cerr and the program is terminated with abort().
 */
void read_files_new_file()
{
  /* yylex() takes its input from yyin */
  yyin = fopen(filename.c_str(), "r");

  if (yyin != NULL) {
    lineno = 1;
    BEGIN(INITIAL);   /* start every file in the default recognizer state */
    return;
  }

  cerr << "Could not open data file: " << filename << endl;
  abort();
}

/* Read the next sentence from the current input file.
 *
 * Both vectors are emptied first and then filled in parallel: words[i] is
 * tagged with tags[i].  Tokens are pulled from yylex() until a sentence
 * boundary is seen with at least one word collected, or until end of file.
 *
 * Returns true when a non-empty sentence has been stored in words/tags.
 * Returns false when end of file is reached with nothing collected; in that
 * case yyin is closed.
 */
bool read_files_next_sentence(vector<symbol>& words, vector<symbol>& tags)
{
  words.clear();
  tags.clear();

  for (;;) {
    const lexreturntype kind = static_cast<lexreturntype>(yylex());

    if (kind == eof)
      break;                    /* finish up below the loop */

    switch (kind) {
    case sentence_boundary:
      assert(words.size() == tags.size());
      if (!words.empty())
        return true;
      break;                    /* boundary before any word: keep scanning */

    case word:
      words.push_back(symbol(yytext));
      break;

    case tag:
      tags.push_back(symbol(yytext));
      break;

    /* Hard-wired tokens: the scanner recognised the whole word/tag pair,
     * so both halves are supplied here. */
    case s_and_p:
      words.push_back(symbol("S&P"));
      tags.push_back(symbol("NNP"));
      break;

    case at_and_t:
      words.push_back(symbol("AT&T"));
      tags.push_back(symbol("NNP"));
      break;

    case chiat:
      words.push_back(symbol("Chiat"));
      tags.push_back(symbol("NNP"));
      break;

    case opensqb:
      words.push_back(symbol("["));
      tags.push_back(symbol("("));
      break;

    case closesqb:
      words.push_back(symbol("]"));
      tags.push_back(symbol(")"));
      break;

    default:
      cerr << filename << ':' << lineno << ": Unknown lextype " << kind << endl;
      break;
    }
  }

  /* End of file.  With nothing pending the input is finished, so close it. */
  if (words.empty()) {
    fclose(yyin);
    return false;
  }

  /* A final sentence without a trailing boundary line is still returned;
   * the file is closed by the following call, which finds nothing more. */
  assert(words.size() == tags.size());
  return true;
}

