// java.g for CCCC
// 
// this is basically TJP's java grammar with some stuff taken out
// and a very little put in

/*
 * Java 1.0.2 Grammar for ANTLR parser generator.
 *
 * Developed by MageLang Institute (www.MageLang.com)
 * Authors:
 *  Terence Parr (parrt@magelang.com)
 *  John Mitchell of Non, Inc. (john@non.net)
 *  Jim Coker (jcoker@magelang.com)
 *
 * The grammar looks best at tabs = 4.
 *
 * SOFTWARE RIGHTS
 *
 * This file is a Java language grammar and is free software.  We do not
 * restrict its use or distribution, but you may NOT claim ownership or
 * authorship of this grammar or support code.  An individual or company
 * may otherwise do whatever they wish with the grammar distributed
 * herewith including the incorporation of the grammar or the output
 * generated by ANTLR into commerical software.  You may redistribute in
 * source or binary form without payment of royalties to us as long as
 * this header remains in all source distributions.
 *
 * We encourage users to develop parsers/tools using this grammar.
 * In return, we ask that credit is given to us for developing this
 * grammar.  By "credit", we mean that if you incorporate our grammar or
 * the generated code into one of your programs (commercial product,
 * research project, or otherwise) that you acknowledge this fact in the
 * documentation, research report, etc....  In addition, you should say nice
 * things about us at every opportunity.
 *
 * As long as these guidelines are kept, we expect to continue enhancing
 * this grammar.  Feel free to send us enhancements, fixes, bug reports,
 * suggestions, or general words of encouragement at parrt@magelang.com.
 *
 * DISCLAIMER: We make no guarantees that this grammar works, makes sense,
 *             or can be used to do anything useful.
 *
 * HISTORY:
 *
 * 1.00
 *  Initial release
 *
 * 1.10
 *  Modified grammar to use rule names that are closer to Arthur
 *  Van Hoff's rule names in his JDK compiler.
 *  Fixed the grammar so that it accepts all files in hotjava.src
 *  except for a few that look like errors in java source.
 *  
 *  1.20
 *  Various modifications to match grammar in _The Java Language Specification_
 *  Note that the language spec using "declaration" instead of the term
 *  "definition".  In the old days, a declaration ala C++ was "class A;"
 *  and the definition was "class A {...};".  We use definition in this
 *  grammar.
 *
 * KNOWN PROBLEMS:
 *
 *  It doesn't handle some escape sequences.
 *
 *  It doesn't handle inner classes in the 1.1 language spec.
 */

#header <<

#include "cccc.h"
#include "cccc_ast.h"
#include "cccc_utl.h"

// The objects which PCCTS creates for ASTs (the #0 variable etc.)
// have type "pointer to ASTBase", which means they need to be cast
// to a pointer to the CCCC variant of AST before AST methods can be
// called on them.
// The argument is parenthesised so that the cast always applies to the
// whole expression passed in, whatever operators it contains.
#define MY_AST(X) ( (AST*) (X) )
>>

// Lexical class in force between "/*" and "*/".  The START class
// switches here with mode(COMMENTS) when it matches "/*".
#lexclass COMMENTS

// end of the comment: count its last line and go back to START
#token "\*/"            << 
  IncrementCount(tcCOMLINES);
  mode(START);
  skip();
>>

// a line break inside the comment: count a comment line and advance
// the lexer's line number
#token EOL "\n" <<
  IncrementCount(tcCOMLINES);
  newline();
  skip();
>>

// a '*' that is not part of "*/", and any run of other characters,
// are simply discarded
#token "\*"             << skip(); >>
#token "~[\*\n]+"       << skip(); >>

// Lexical class in force inside a string literal.  The START class
// switches here with mode(STRINGS) when it matches the opening quote;
// the pieces of the literal are accumulated with more() until the
// closing quote produces a single STRINGVAL token.
#lexclass STRINGS

// closing quote: emit the accumulated text and return to START
#token STRINGVAL "\"" << mode (START); >>
// escapes which are translated into the character they stand for
#token "\\n"            << replchar('\n'); more(); >>
#token "\\r"            << replchar('\r'); more(); >>
#token "\\t"            << replchar('\t'); more(); >>
// backslash followed by a real newline is removed from the text
#token "\\\n"           << replstr(""); more(); >>
#token "\\\\"           << replchar('\\'); more(); >>
#token "\\\""           << replchar('"'); more(); >>
// Any other escape (for example \b, \f, \', \0 or the start of \uXXXX)
// is kept in the token text unchanged.  Without this rule a backslash
// followed by one of these characters matched nothing in this class.
// The characters handled by the rules above are excluded from the set
// so that this rule does not overlap with them.
#token "\\~[nrt\n\\\"]" << more(); >>
// ordinary characters of the literal
#token "~[\"\\]+"       << more(); >>

// Default lexical class: everything outside block comments and strings.
#lexclass START

// "@" is the DLG end-of-file pattern; the token text is replaced by a
// readable marker
#token Eof "@"		<< replstr("<EOF>"); >>

// single line comments
// The terminating newline is deliberately NOT part of this pattern.
// It is left for the normal end-of-line rule below, which counts the
// line as a code line when code preceded the comment, clears the
// code-line flag and advances the line number.  When the newline was
// consumed here the flag was neither tested nor cleared, so a line
// holding both code and a "//" comment was not counted as code and the
// flag leaked onto the following line.  Not requiring the newline also
// lets a comment on an unterminated last line of the file be matched.
#token "// ~[\n]*"   << 
  IncrementCount(tcCOMLINES);
  skip(); 
>>

// normal end of line
// ANTLRToken::bCodeLine is tested here: when it is non-zero the line
// just finished is counted as a code line and the flag is cleared ready
// for the next line.  The flag is set elsewhere (not in this file).
#token "\n" << 
  if(ANTLRToken::bCodeLine != 0) {
    IncrementCount(tcCODELINES);
    ANTLRToken::bCodeLine=0;
  }
  newline(); skip(); 
>>

// start of a block comment: switch to the COMMENTS class
#token "/\*"            << mode(COMMENTS); skip(); >>
// blanks, tabs and carriage returns are discarded
#token "[\t\r\ ]+"        << skip(); >>

// opening quote of a string literal: switch to the STRINGS class and
// keep accumulating text
#token "\""             << mode(STRINGS); more(); >>
// Character literal: a single character other than backslash, a
// backslash followed by any one character, an octal escape of two or
// three digits (e.g. '\12', '\377' - previously rejected), or a
// \uXXXX escape with exactly four hex digits.
#token CHARVAL "'(~[\\]|\\~[]|\\[0-7][0-7]{[0-7]}|\\u[0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f][0-9A-Fa-f])'"


// CCCC needs to be able to recognize some keywords, so we define them here
// rather than by regexp in the grammar
// Every token in the first group increments the tcMCCABES_VG counter
// each time it is lexed (presumably the McCabe cyclomatic complexity
// count - confirm against the CCCC counter definitions).

#token IF "if" << IncrementCount(tcMCCABES_VG); >>
#token FOR "for" << IncrementCount(tcMCCABES_VG); >>
#token WHILE "while" << IncrementCount(tcMCCABES_VG); >>
#token SWITCH "switch" << IncrementCount(tcMCCABES_VG); >>
#token CASE "case" << IncrementCount(tcMCCABES_VG); >>
#token BREAK "break" << IncrementCount(tcMCCABES_VG);>>
#token RETURN "return" << IncrementCount(tcMCCABES_VG);>>
#token QUERY "?" << IncrementCount(tcMCCABES_VG); >>
#token LOGICAL_OR "\|\|" <<IncrementCount(tcMCCABES_VG); >>
#token LOGICAL_AND "&&" <<IncrementCount(tcMCCABES_VG); >>

// braces adjust the nesting level kept by the lexer; the semicolon has
// an empty action
#token LBRACE "\{" << IncrementNesting(); >>
#token RBRACE "\}" << DecrementNesting(); >>
#token SEMICOLON ";" << ; >>
// token class used by the error recovery code of typeDefinition
// (as RESYNCHRONISATION_set) to find a place to resume parsing
#tokclass RESYNCHRONISATION { "\}" ";" } 

class JParser {
<<
  // CCCC parse-state object: rule actions store strings and flags in it
  // and ask it to record module, function and use-relationship extents
  ParseUtility ps;

  // rule entry/exit tracing is forwarded to ps together with the
  // current guessing state and the next token
  void tracein(char *rulename)  { ps.tracein(rulename,guessing,LT(1)); }
  void traceout(char *rulename)  { ps.traceout(rulename,guessing,LT(1)); }

  // syntax error hook: the offending token is passed to ps only when
  // the PARSER bit of DebugMask is set; the other arguments are unused
  void syn(
    _ANTLRTokenPtr tok, ANTLRChar *egroup, SetWordType *eset,
    ANTLRTokenType etok, int k) 
  { 
    if(DebugMask&PARSER) { ps.syn(tok); }
  }

/* Parser Members */

public:

    // Prepare the parser for a new source file: reset ps, record the
    // file name as both the initial module name and the file name, and
    // initialise the PCCTS base parser.
    void init(char *filename)
        {
	    ps.reset(this);
	    ps.set_string(pssMODULE, filename);
	    ps.set_string(pssMODTYPE, "Java file");
	    ps.set_string(pssFILE, filename);

	    // Give the members declared below a defined starting value.
	    // compoundStatement increments numBlocks and copies
	    // currentBlockNumber, so they must not be read uninitialised.
	    currentClassOrInterface=NULL;
	    currentMethod=NULL;
	    currentBlockNumber=0;
	    numBlocks=0;
	    out=NULL;

            ANTLRParser::init();
        }

    // these are by TJP
    // they correspond roughly to things which CCCC records in the 
    // ps object
    char *currentClassOrInterface;
    ANTLRTokenPtr currentMethod;

    // more things TJP uses, less useful here    
    int currentBlockNumber;
    int numBlocks;
    FILE *out;

>>

                /* F I L E S  /  P A C K A G E S */

/* Start rule: an optional package statement, any number of import
 * statements, any number of type definitions, then end of file.
 */
compilationUnit  /* pass in the output stream */
    :   
        { package } (import)* (typeDefinition)* Eof
    ;

/* "package" followed by a (possibly dotted) name and a semicolon. */
package
    :   "package" qualifiedName SEMICOLON
    ;

/* "import" followed by a name which may end in ".*", and a semicolon. */
import
    :   "import" qualifiedNameStar SEMICOLON
    ;

/* A top-level class or interface (with its modifiers), or a stray
 * semicolon.
 *
 * The init action takes a copy of the parse state so that it can be
 * restored once the definition has been handled, and remembers the
 * first token and a few tokens of lookahead text for error reporting.
 *
 * The action after the closing ';' is the rule's fail action.  It skips
 * forward to a resynchronisation token, reports the rejected region on
 * cerr and records it through ps.
 */
typeDefinition
    :   
<< 
	ParseUtility saved_ps=ps; 

	// get ready in case we need to resynchronize...
        ANTLRTokenPtr initial_token=LT(1);
	CCCC_String initial_text=ps.lookahead_text(3);
>>
	(modifier)*
        (   classDefinition
        |   interfaceDefinition
        )
	<< ps=saved_ps; >>
    |   SEMICOLON
    ;
<< 
  // look for a '}' or ';' (the RESYNCHRONISATION class), passing the
  // nesting level of the token at which this definition started
  ANTLRTokenPtr resync_token;
  int resync_nesting=mytoken(initial_token)->getNestingLevel();
  ps.resynchronize(resync_nesting,RESYNCHRONISATION_set,resync_token);

  cerr << "Syntax error: parser failed to handle "
       << initial_text << "..." << resync_token->getText()
	<< " on lines " << initial_token->getLine() 
	<< " to " << resync_token->getLine() << endl;

  // now we build an AST representing the rejected area...
  // (the first token's text is replaced by the saved lookahead text)
  initial_token->setText(initial_text);
  AST *rejected_ast=new AST(initial_token);
  AST *rejected_ast_end=new AST(resync_token);
  rejected_ast->setRight(rejected_ast_end);
  ps.record_rejected_extent(rejected_ast);
  
  // we only delete the root tree - it deletes the other one
  delete rejected_ast;

  // restore the parse state saved on entry, as on the success path
  ps=saved_ps; 
>>
                /* T Y P E S / D E C L S */

/* A named type followed by any number of "[]" pairs; used after
 * "instanceof" in unaryExpression.
 */
referenceType
    :   qualifiedName ("\[" "\]")*
    ;

/* A named or built-in type followed by any number of "[]" pairs.
 * The canonical name of the AST of the first element (the type) is
 * stored in ps as pssITYPE.
 */
typeSpec
    :   type ("\[" "\]")*
    << ps.set_string(pssITYPE,MY_AST(#1)->canonical_name()); >>
    ;

/* Either a (possibly dotted) user type name or one of the built-in types. */
type:   qualifiedName
    |   builtInType
    ;

/* The primitive type keywords, plus "void". */
builtInType
    :   "void"
    |   "boolean"
    |   "byte"
    |   "char"
    |   "short"
    |   "int"
    |   "float"
    |   "long"
    |   "double"
    ;

/* A dotted name.  The syntactic predicate tries "IDENT . qualifiedName"
 * first; when that does not match, a single IDENT is taken and its text
 * is stored in ps as pssUTYPE.  Because the recursion always ends in
 * the single-IDENT alternative, it is the last component of a dotted
 * name that ends up in pssUTYPE.
 */
qualifiedName
    :    
	  ( IDENT "." qualifiedName )?
	| id:IDENT
        <<
            ps.set_string(pssUTYPE,$id->getText());
	>>
    ;

/* A dotted name optionally followed by ".*" (used by import). */
qualifiedNameStar
    :   qualifiedName optDotStar
    ;

/* Optional ".*" suffix. */
optDotStar
    :
	{ "." "\*" }
    ; 

/* A single declaration modifier.  Only the three access modifiers have
 * an effect on the recorded data: each sets the corresponding
 * visibility flag in ps.  The others are accepted and ignored.
 * "volatile" is included so that volatile field declarations parse.
 */
modifier
    :   "private" << ps.set_flag(vPRIVATE); >>
    |   "public"  << ps.set_flag(vPUBLIC); >>
    |   "protected" << ps.set_flag(vPROTECTED); >>
    |   "static" 
    |   "transient"
    |   "volatile"
    |   "final"
    |   "abstract"
    |   "native"
    |   "threadsafe"
    |   "synchronized"
    |   "const"
    ;


                /* C L A S S E S */

/* "class" NAME [extends] [implements] { fields }.
 * The class name becomes the current module name (pssMODULE) before
 * the rest of the definition is parsed, so that what follows is
 * recorded against it; once the body has been parsed the whole
 * definition is recorded as a module extent.
 */
classDefinition
    :   "class" id:IDENT 
        <<
	        ps.set_string(pssMODULE,$id->getText());
		ps.set_string(pssMODTYPE,"Java class");
	>>
	extends implements classBlock
	<<
		ps.set_string(pssDESCRIPTION,"definition");
		ps.record_module_extent(MY_AST(#0),utDEFINITION);
        >>
    ;

/* "interface" NAME [extends list] [implements] { fields }.
 * Handled in the same way as classDefinition, except that the module
 * type is "Java interface" and several names may follow "extends".
 */
interfaceDefinition
    :   "interface" id:IDENT 
        <<
	        ps.set_string(pssMODULE,$id->getText());
		ps.set_string(pssMODTYPE,"Java interface");
	>>
	interfaceExtends implements classBlock
	<<
		ps.set_string(pssDESCRIPTION,"definition");
		ps.record_module_extent(MY_AST(#0),utDEFINITION);
        >>
    ;

/* The brace-enclosed body of a class or interface: any number of fields. */
classBlock
    :   LBRACE
            ( field )*
        RBRACE
    ;

/* Optional "extends" clause of a class (the second alternative is
 * empty).  The relationship is recorded as utINHERITS and pssUTYPE,
 * which qualifiedName set, is cleared afterwards.
 */
extends
    :   "extends" qualifiedName
        <<
	    ps.set_string(pssDESCRIPTION,"extends");
            ps.record_userel_extent(MY_AST(#0),utINHERITS);
            ps.set_string(pssUTYPE,"");
	>>
    |
    ;

/* Optional "extends" clause of an interface: one or more names
 * separated by commas.  The action runs once, after the whole list,
 * so a single utINHERITS relationship is recorded per clause.
 */
interfaceExtends
    :   "extends" qualifiedName ("," qualifiedName)*
        <<
	    ps.set_string(pssDESCRIPTION,"extends");
            ps.record_userel_extent(MY_AST(#0),utINHERITS);
            ps.set_string(pssUTYPE,"");
	>>
    |
    ;

/* Optional "implements" clause: one or more names separated by commas.
 * Recorded, like "extends", as a utINHERITS relationship; the action
 * runs once after the whole list.
 */
implements
    :   "implements" qualifiedName ( "," qualifiedName )*
       <<
	    ps.set_string(pssDESCRIPTION,"implements");
            ps.record_userel_extent(MY_AST(#0),utINHERITS);
            ps.set_string(pssUTYPE,"");
	>>
    |
    ;

/** in the following rule, syntactic predicates (the expressions in 
 * parens followed by '?') are used to resolve lookahead issues between
 * constructors and method defs and between method defs and variable defs. 
 * Without the predicates, the rule would be:
 * 
 * field
 *  :   constructorDefinition
 *  |   methodDefinition
 *  |   variableDefinitions
 *  |   "static" compoundStatement
 *  |   SEMICOLON
 *  ;
 * 
 *  We could left-factor out the modifiers and typespec, but it does
 *  not cost us much to backtrack over these few tokens and the grammar
 *  is more readable with the predicate.
 *
 *  The alternatives are tried in the order written: a constructor
 *  (a method head with no type in front, followed by '{'), then a
 *  method (a type and a method head followed by '{' or ';'), then a
 *  static initializer, and only then a variable definition.
 */
field
    :   ( (modifier)* methodHead LBRACE )?
        constructorDefinition
    |   ( (modifier)* typeSpec methodHead ( LBRACE | SEMICOLON ) )?
        methodDefinition
    |   // "static { ... }" initializer
        ( "static" LBRACE )? "static" compoundStatement  
    |   variableDefinitions
    |   SEMICOLON
    ;

/* Variable definitions inside a method body or "for" header: a type
 * and one or more declarators, with no modifiers and no recording
 * action of its own.
 */
localVariableDefinitions
    :   typeSpec variableDeclarator ( "," variableDeclarator )*
    ;

/* Member variable definitions: modifiers, a type and one or more
 * declarators.  The action runs once for the whole definition and
 * records a utHASBYREF use relationship.
 */
variableDefinitions
    :   (modifier)* typeSpec variableDeclarator ( "," variableDeclarator )*
    <<
        ps.set_string(pssDESCRIPTION,"Java variable instantiation");
	ps.record_userel_extent(MY_AST(#0),utHASBYREF);
    >>
    ;

/* One declared name, optional "[]" pairs after the name, and an
 * optional initializer.
 */
variableDeclarator
    :   id:IDENT ("\[" "\]")* { "=" initializer }
    ;

/* The value after "=": an expression or a brace-enclosed array
 * initializer.
 */
initializer
    :   assignmentExpression
    |   arrayInitializer
    ;

/* "{ ... }" holding an optional initializer list, which may itself be
 * followed by one optional trailing comma.
 */
arrayInitializer
    :   LBRACE { initializerList {","} } RBRACE
    ;

/* One or more initializers separated by commas (initializers may nest
 * through arrayInitializer).
 */
initializerList
    :
          initializer ( "," initializer )*
    ;


                /* M E T H O D S */

/* The name and parameter list of a method or constructor.
 * The name is stored as pssMEMBER; after the parameter list has been
 * parsed, the canonical name of its AST is stored as pssPARAMS.
 */
methodHead
    :   id:IDENT 
	<<  
	        // we need to record the use relationship to the return
		// value here, as we will be clobbering the value in
		// the pssUTYPE string as we process the parameters
		ps.set_string(pssMEMBER,$id->getText()); 
	        ps.set_string(pssDESCRIPTION,"return by reference");
                ps.record_userel_extent(MY_AST(#0),utPARBYREF);
	>>
par:paramList
	<<
		ps.set_string(
		  pssPARAMS,MY_AST(#par)->canonical_name());
	>>
    ;

/* Parenthesised, possibly empty, parameter list, followed by optional
 * "[]" pairs and an optional throws clause.
 */
paramList 
	:
        "\(" {parameterDefinitionList} "\)" ("\[" "\]")* {throwsClause}
	;

/* "throws" followed by one or more names separated by commas. */
throwsClause
    :   "throws" qualifiedName ("," qualifiedName)*
    ;
    
/* A method: modifiers, return type, method head, and either a body or
 * a terminating semicolon (a method with no body).  Both forms are
 * recorded as a function extent with utDEFINITION.
 */
methodDefinition
    :  
        (modifier)* typeSpec methodHead ( compoundStatement | SEMICOLON )
        <<
            ps.record_function_extent(MY_AST(#0),utDEFINITION);
        >>
    ;

/* A constructor: modifiers, a method head with no return type, and a
 * body.  Recorded as a function extent; currentMethod is cleared
 * afterwards.
 */
constructorDefinition
    :
        (modifier)* methodHead compoundStatement
        <<
		ps.record_function_extent(MY_AST(#0),utDEFINITION);
	        currentMethod=NULL;
        >>
    ;

/* One or more parameter definitions separated by commas. */
parameterDefinitionList
    :   parameterDefinition ( "," parameterDefinition )*
    ;

/* One formal parameter (also used for the parameter of a catch
 * clause): a type, a name and optional "[]" pairs.  The '!' after
 * IDENT keeps the parameter name out of the AST.  The canonical name
 * of the type's AST is stored as pssITYPE and a utPARBYREF use
 * relationship is recorded.
 */
parameterDefinition
    :   typeSpec id:IDENT! ("\[" "\]")*
       <<
	    ps.set_string(pssITYPE,MY_AST(#1)->canonical_name());
            ps.set_string(pssDESCRIPTION,"Java parameter"); 
	    ps.record_userel_extent(MY_AST(#0),utPARBYREF);
	>>
    ;


                /* S T A T E M E N T S */

/* A brace-enclosed block of statements.
 * numBlocks is incremented for every block entered and gives the block
 * its number; currentBlockNumber holds the number of the block being
 * parsed and is put back to the enclosing block's number just before
 * the closing brace is matched.
 */
compoundStatement
    :   <<
        numBlocks++;
        int saveBlock = currentBlockNumber;
        currentBlockNumber = numBlocks;
        >>
        LBRACE
            (statement)*
            <<
            currentBlockNumber = saveBlock;
            >>
        RBRACE
    ;

/* A single statement.  The first alternative is a labelled statement,
 * selected by a syntactic predicate which looks for IDENT ":".
 */
statement
    :   (IDENT ":")? IDENT ":" statement
    |   compoundStatement

        /* distinguishing between a local variable definition and
         * an expression requires k>2 lookahead.  Rather than increase
         * the lookahead of the overall parser, we use backtracking to
         * ensure we match local variables.  If a local variable declaration
         * is not found, an expression (the next alternative) is attempted.
         * Consider that after having seen "t[" you don't know if it's
         * an assignment to an array "t[3]=4;" or a variable def "t[] b;"
         */
    |   (localVariableDefinitions SEMICOLON)?

    |   expression SEMICOLON

    |   IF "\(" expression "\)" statement
        /* the {"else" statement} optional clause is a language ambiguity
         * that results in a parser nondeterminism.  The parser's default
         * response of simply matching the "else" if it sees it, resolves
         * the problem.  We use a #pragma to tell the parser that its
         * approximate lookahead is sufficient to handle the problem--
         * the desired side effect is that ANTLR doesn't warn us about
         * this ambiguity with the #pragma in place.
         */
        #pragma approx
        { "else" statement }

        /* As with locals versus expressions at the statement level,
         * loop variables must be distinguished from expressions.
         */
    |   FOR "\("
        ( 
	      (localVariableDefinitions SEMICOLON)? 
	    | expressionList SEMICOLON 
	    | SEMICOLON 
	)
        {expression} SEMICOLON
        {expressionList}
        "\)"
        statement
    |   WHILE "\(" expression "\)" statement
    |   "do" statement WHILE "\(" expression "\)" SEMICOLON
    |   BREAK {IDENT} SEMICOLON
    |   "continue" {IDENT} SEMICOLON
    |   RETURN {expression} SEMICOLON
        /* each switch group is a "case" or "default" label followed
         * by any number of statements
         */
    |   SWITCH "\(" expression "\)" LBRACE
            (   CASE expression ":" (statement)*
            |   "default" ":" (statement)*
            )*
        RBRACE
    |   tryBlock
    |   "throw" expression SEMICOLON
    |   "goto" IDENT SEMICOLON
    |   "synchronized" "\(" expression "\)" compoundStatement
    |   SEMICOLON
    ;

/* "catch" and "finally" clauses cause ambiguity that is resolved
 * correctly by ANTLR; this is similar to the dangling-else ambiguity.
 * Again, the #pragma is used to turn off a warning message from ANTLR
 * during grammar analysis.  See the statement rule.
 *
 * As written, both the handlers and the "finally" clause are optional,
 * so a bare "try { ... }" is accepted.
 */
tryBlock
    :   "try" compoundStatement
        #pragma approx ( handler )*
        #pragma approx { "finally" compoundStatement }
    ;

/* One catch clause: a single parenthesised parameter and a block. */
handler
    :   "catch" "\(" parameterDefinition "\)" compoundStatement
    ;


                /* E X P R E S S I O N S */

/* One or more expressions separated by commas (call arguments and the
 * init/update parts of a "for" header).
 */
expressionList
    :   assignmentExpression ("," assignmentExpression)*
    ;

/* Entry point for a single expression; simply an assignmentExpression. */
expression
    :   assignmentExpression
    ;

/* right-to-left for assignment op -> use tail recursion:
 * the optional part recurses into assignmentExpression itself, so in
 * "a = b = c" the right-hand side "b = c" is parsed as a unit.
 */
assignmentExpression
    :   conditionalExpression
        {   assignmentOp
            assignmentExpression
        }
    ;

/* Plain assignment and the compound assignment operators. */
assignmentOp
    :   "="
    |   "\+="
    |   "\-="
    |   "\*="
    |   "/="
    |   "\%="
    |   "\>\>="
    |   "\>\>\>="
    |   "\<\<="
    |   "&="
    |   "^="
    |   "\|="
    ;

/* Optional "? :" conditional.  QUERY is the "?" token, which the lexer
 * counts in tcMCCABES_VG.
 */
conditionalExpression
    :   logicalOrExpression
        { QUERY conditionalExpression ":" conditionalExpression }
    ;

/* Operands joined by "||" (LOGICAL_OR, counted by the lexer in
 * tcMCCABES_VG).
 */
logicalOrExpression
    :   logicalAndExpression ( LOGICAL_OR logicalAndExpression)*
    ;

/* Operands joined by "&&" (LOGICAL_AND, counted by the lexer in
 * tcMCCABES_VG).
 */
logicalAndExpression
    :   inclusiveOrExpression ( LOGICAL_AND inclusiveOrExpression)*
    ;

/* Operands joined by the single "|" operator. */
inclusiveOrExpression
    :   exclusiveOrExpression ("\|" exclusiveOrExpression)*
    ;

/* Operands joined by the "^" operator. */
exclusiveOrExpression
    :   andExpression ("^" andExpression)*
    ;

/* Operands joined by the single "&" operator. */
andExpression
    :   equalityExpression ("&" equalityExpression)*
    ;

/* Operands joined by "!=" or "==". */
equalityExpression
    :   relationalExpression (("!=" | "==") relationalExpression)*
    ;

/* Operands joined by the ordering comparisons.  ("instanceof" is not
 * handled here but in unaryExpression.)
 */
relationalExpression
    :   shiftExpression
        (   (   "<"
            |   ">"
            |   "<="
            |   ">="
            )
            shiftExpression
        )*
    ;

/* Operands joined by the shift operators "<<", ">>" and ">>>". */
shiftExpression
    :   additiveExpression (("\<\<" | "\>\>" | "\>\>\>") additiveExpression)*
    ;

/* Operands joined by binary "+" or "-". */
additiveExpression
    :   multiplicativeExpression (("\+" | "\-") multiplicativeExpression)*
    ;

/* Operands joined by "*", "/" or "%". */
multiplicativeExpression
    :   castExpression (("\*" | "/" | "\%" ) castExpression)*
    ;

/*
 *  This is the way castExpression should look if I had a symbol table:
 * 
 * castExpression
 *  :   unaryExpression
 *  |   <<isType(LT(2)->getText())>>? "\(" typeSpec "\)" castExpression
 *  ;
 * 
 *  I use a syntactic pred (...)? here to just check the lookahead arbitrarily
 *  ahead; slower, but it works.
 *
 *  Note that the cast form is tried first, so any parenthesised name
 *  followed by something that parses as a castExpression is taken to
 *  be a cast: "(a) - b" is matched as a cast of "-b" rather than as a
 *  subtraction.
 */
castExpression
    :   ( "\(" typeSpec "\)" castExpression )?
    |   unaryExpression
    ;

/* Prefix operators applied to a castExpression, or a postfix
 * expression optionally followed by an "instanceof" test.
 * Unary plus is accepted alongside unary minus; without it a valid
 * expression such as "+x" was reported as a syntax error.  The "\+"
 * token is already used by additiveExpression, so no new token is
 * introduced.
 */
unaryExpression
    :   "\+\+" castExpression
    |   "\-\-" castExpression
    |   "\+" castExpression
    |   "\-" castExpression
    |   "\~" castExpression
    |   "!" castExpression
    |   postfixExpression { "instanceof" referenceType }
    ;

/* ambiguity warning turned off with the pragma.  Ambiguity is
 * new T[n] with "new T" returning from newExpression or
 * with "new T[n]" returning from newExpression.  The [..] stuff
 * could also be matched by the postfixExpression.
 *
 * The rule itself is one or more sized dimensions "[expr]" followed by
 * any number of empty "[]" pairs.
 */
newArray
    :   #pragma approx
        ( "\[" expression "\]" )+ squareBracketList
    ;

/* A single empty "[]" pair. */
squareBracketPair
    :
        "\[" "\]"
    ;

/* Zero or more "[]" pairs, written recursively.  The syntactic
 * predicate checks that a complete pair follows before committing to
 * it; otherwise the empty alternative is taken.
 */
squareBracketList
    :
          ( squareBracketPair )? squareBracketPair squareBracketList
        | /* empty */
    ;
 
/* A primary expression followed by any number of suffixes: an array
 * index, a call argument list, a "." member selection, or a postfix
 * "++" / "--".
 */
postfixExpression
    :   primaryExpression
        (   "\[" expression "\]"
        |   "\(" { expressionList } "\)"
        |   "." primaryExpression
        |   "\+\+"
        |   "\-\-"
        )*
    ;

/*
 * Valid new expressions:
 *      new Class(...)
 *      new type[n][m][]...
 *      new Package.Class(...)
 *
 * NOTE: This binding differs from C++.
 *
 * After "new" and the type, either a parenthesised (possibly empty)
 * argument list or array dimensions must follow.
 */
newExpression
    :   "new" type
        (   "\(" { expressionList } "\)"
        |   newArray
        )
    ;

/* The atoms of an expression: names, object creation, literals, the
 * keyword values, and a parenthesised expression.
 */
primaryExpression
    :   IDENT
    |   newExpression
    |   constant
    |   "super"
    |   "this"
    |   "true"
    |   "false"
    |   "null"
    |   STRINGVAL
    |   "\(" expression "\)"
    ;

/* Numeric and character literal tokens.  String literals are not
 * included here; STRINGVAL is an alternative of primaryExpression.
 */
constant
    :   OCTALINT
    |   DECIMALINT
    |   HEXADECIMALINT
    |   CHARVAL
    |   FLOATONE
    |   FLOATTWO
    ;

}

// Numeric literals.  The integer forms take one optional suffix letter.
#token OCTALINT "0[0-7]*{[uUlL]}"
#token DECIMALINT "[1-9][0-9]*{[uUlL]}"
#token HEXADECIMALINT "(0x|0X)[0-9a-fA-F]+{[uUlL]}"
// FLOATONE: literals containing a decimal point, with an optional
// exponent and an optional suffix.
#token FLOATONE "([0-9]+.[0-9]* | [0-9]*.[0-9]+) {[eE]{[\-\+]}[0-9]+} {[fFlLdD]}"
// FLOATTWO: literals without a decimal point.  These need either an
// exponent (with an optional suffix) or a float/double suffix on its
// own; the second form ("1f", "2d") was previously split into an
// integer literal followed by an identifier.
#token FLOATTWO "[0-9]+ ( [eE]{[\-\+]}[0-9]+ {[fFlLdD]} | [fFdD] )"

// IDENT is defined after the parser class, and therefore after all the
// keyword literals used in the rules.
#token IDENT "[a-zA-Z_][a-zA-Z0-9_]*" <<;>>

