A Full Compiler Example Using PCCTS:

This is a simple example of a complete compiler done using PCCTS. This is a slightly modified version of the first project in EE573 at Purdue a couple of years back. There may be some problems with the functionality of this compiler, but it still serves as a good example of a non-AST based compiler.

Subheading


/* nempl.g *******************************************************************
 Coding History:
	??-??-?? H.Dietz	C coded sol'n to project
	02-17-96 R.Fisher	Base grammar and tokens
	02-18-96 R.Fisher	Added actions from Hank's solution
	02-19-96 R.Fisher	Edit of actions for PCCTS
	02-20-96 R.Fisher	Some debugging.
	02-22-96 R.Fisher	Tried to write "array" with passed args, etc.
	02-24-96 R.Fisher	Break "array" into "lval_" and "rval_" "array"
 Notes:
	This is an attempt to meet the original nempl spec as presented in
	  the EE595E Project Handout (13 September 1994) by Hank Dietz.
	The only non-spec part that I know if is that it allows mixed-case
	  keywords.
	This is essentially a port of Hank's solution to PCCTS with some minor
	  adjustments.
 ToDo:
	Put in proper handling for WORDs.
	  Note that the spec allows functionname = whatever, but Hank's soln
	  does not.
	Split "array" into two parts, one for handling lvals, one for rvals.
*****************************************************************************/


/* Start up code ************************************************************
	This makes the program start ANTLR, which will begin parsing the
	standard input starting with the rule "prog".
*****************************************************************************/
#header
<<
#include "charptr.h"

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

#define DEBUG	1

#define SINGLE		1
#define PLURAL		2
#define NONARRAY	3
#define CALL		4

#define sym     struct _sym
sym {
        char    *text;
        int     type;
        int     class;
        int     base;
        int     size;
};

extern sym symtab[];

#define ZZCOL
>>

<<
sym	symtab[513];
sym     *symptr = &(symtab[0]);

int	indent = 0;	/* current output indent */
int	offset = 0;	/* stack offset for var */
int	reg = -1;	/* next register to use */

char buf[513];		/* Used by rval_array and fact to pass a string */

#include "charptr.c"

main(argc, argv)
int argc;
char **argv;
{
	if (argc != 1) {
		error("no command-line args; use redirection for file I/O");
	}

	ANTLR (prog(), stdin);

	return(0);
}

warn(fmt, a, b, c)
char *fmt;
int a, b, c;
{
	fprintf(stderr, "line %d: ", zzline);
	fprintf(stderr, fmt, a, b, c);
	fprintf(stderr, "\n");
}

error(fmt, a, b, c)
char *fmt;
int a, b, c;
{
	warn(fmt, a, b, c);
	exit(1);
}

sym *
enter(s)
register char *s;
{
	/* extern char *malloc(); */
	register char *p;
	if ( (p=malloc(strlen(s) + 1)) == NULL )
		fprintf(stderr,"Couldn't alloc space for symbol table entry\n");
	if ( DEBUG )
		fprintf ( stderr, "Entering %s into symtab\n", s );
	strcpy(p, s);
	symptr->text = p;
	if ( DEBUG )
		fprintf ( stderr, "%s entered into symtab\n", symptr->text );
	return(symptr++);
}

sym *
lookup(s)
register char *s;
{
	register sym *p = (symptr - 1);

	if ( DEBUG )
		fprintf ( stderr, "Looking up %s in symtab\n", s );
	while (p > &(symtab[0])) {
		if (!strcmp(p->text, s)) {
			if ( DEBUG )
				fprintf(stderr,"Found %s in symtab\n", p->text);
			return(p);
		}
		--p;
	}
	if ( DEBUG )
		fprintf( stderr,"%s not found in symtab\n", s );
	return(NULL);
}

tab()
{
	register int x;

	for (x=0; x<indent; ++x) {
		putchar(' ');
	}
}

>>





/* Tokens *******************************************************************
	These are 'regular expressions' which describe the basic building
	blocks that the lexer will recognize as it scans the input.

	They may be unnamed, but if they have a name, it must begin with a
	capital letter.

	The <<gt;> action after the last defined token ensures that any
	following grammar actions are not assumed to be for the last token in
	the list.

	#lexaction places an action in the lexer, not the parser.

	May want to use a separate #lexclass (DFA) to handle comments, etc.
	(( zzmode(COMMENT) >> would switch to the DFA defined by
	#lexclass COMMENT.  The lexclass START is the default lexclass.

	Tokens that are special cases of a more general regex should be
	placed before the more general regex.  The first matching token is the
	one used for a given input string.

	%%, << and >> have control meanings for DLG (the lexer generator), so
	they must be escaped if needed as literals in a regex.
	Also,
		Character	Matches
		 @		 End of input
		 \t		 tab
		 \n		 newline
		 \r		 carriage return
		 \b		 backspace
		 \0nnn		 Char with octal value nnn
		 \0xnn		 Char with hex value nn
		 \mnn		 Char with dec value mnn. 1<m<9
		 \c		 The character 'c'.

	See section 2.6 of PCCTS manual for explanation of the regex's.

	Things that can be used in an action:
		zzlextext is a variable which contains the text of the token.
		zzreplchar(c) - replaces a single character token with 'c'.
		zzreplstr(s) - replaces a string token with "s".
		zzmode(lexclass) - start using the lexical description for
		  'lexclass'.
		zzmore() - Continue lexing.  Next thing is part of this token.
		zzline is a variable representing the number of input lines
		  counted.  It must be user-updated.  This is enabled by
		  adding a "#define ZZCOL" to an action in the lexer.
		zzbegexpr - A ptr to the beginning of the current token text.
		zzendexpr - A ptr to the end of the current token text,
		  excluding the automatically added '\0'.
		zztokens (num) - Returns the label or regex defined in the
		  grammar for token number 'num'.
		zzbegcol is a variable representing the beginning column of
		  the input line?  Must be enabled as zzline is.
		zzendcol is a variable representing the ending column of the
		  input line?  It must be user-corrected for tabs and returns.
		  Must be enabled as zzline is.
****************************************************************************/
#lexclass START
/* Punctuation */
#token	"[\ \t]+"		<< zzskip(); >>		/* Leading White */
#token	BANG	"!"
#token	DOT	"\."
#token	EQUAL	"="
#token	GT	"\>
#token	LBRACK	"\["
#token	LBRACE	"\{"
#token	LPAREN	"\("
#token	LT	"\<
#token	MINUS	"\-"
#token	MOD	"%"
#token	NL	"\n"		<< zzline++; zzmore(); >>
#token	PLUS	"\+"
#token	RBRACK	"\]"
#token	RBRACE	"\}"
#token	RPAREN	"\)"
#token	SEMI	";"
#token	SLASH	"\/"
#token	STAR	"\*"

/* Keywords and Intrinsic Function Names - Accepts any mix of cases. */
#token	K_ALL		"[aA][lL][lL]"
#token	K_ELSE		"[eE][lL][sS][eE]"
#token	K_IF		"[iI][fF]"
#token	K_INT		"[iI][nN][tT]"
#token	K_IPROC		"[iI][pP][rR][oO][cC]"
#token	K_NPROC		"[nN][pP][rR][oO][cC]"
#token	K_PRINT		"[pP][rR][iI][nN][tT]"
#token	K_PLURAL	"[pP][lL][uU][rR][aA][lL]"
#token	K_RETURN	"[rR][eE][tT][uU][rR][nN]"
#token	K_ROUTER	"[rR][oO][uU][tT][eE][rR]"
#token	K_WHILE		"[wW][hH][iI][lL][eE]"

/* Non-Key Patterns */
#token	WORD	"[a-zA-Z][a-zA-Z0-9]*"
#token	NUM	"[0-9]+"
/* End lexclass ***************************************/





/* Rules ********************************************************************
	These are the rules that the parser follows to try to determine the
	meaning of the input.  These specify other rules to try, and actions
	to take when parts of the rule are satisfied.
	Their names must begin with a lower-case letter.

	Notes:
		Subrules are allowed at most one parameter which must be of
		  type "Attrib".
		$0 represents the return value of a rule.
		$n (n<0) represents the value of the nth item in the
		  current alternative of the rule being matched.
****************************************************************************/
prog	:	<< printf("#include \"nempl.h\"\n\n"); >>
		decls[0]
		funcs
	;

decls[int scope] :
		<<
			register int sc;
			register int newoff = 0;
		>>
		(
		  << register sym *s; >>
		  stc > [sc]
		  WORD
			<< s = enter($2);
			   s->type = $scope;
			   s->class = sc;
			   s->base = offset + newoff;
			   s->size = 1;
			>>
		  {
			LBRACK
			NUM
			<< s->size = atoi($2); >>
			RBRACK
		  }
		  <<	/* add size of this var to total allocated... */
			newoff += s->size;

			tab();
			if (s->size > 1) {
				printf("int %s[%d];\n", s->text, s->size);
			} else {
				printf("int %s;\n", s->text);
			}
		  >>
		  SEMI
		)*
		<<
			if (newoff > 0) {
				offset += newoff;
			}
		>>
	;

stc > [int class] :
		K_INT		<<$class = SINGLE;>>
	|	K_PLURAL	<<$class = PLURAL;>>
	;

funcs	:	(
			WORD
			<<
				/* generate code for a function def */
				printf("\n%s(", $1);
			>>
			LPAREN WORD
			<<
				{
					register sym *s = enter($3);
					s->type = 1;
					s->class = PLURAL;
					s->base = 0;
					s->size = 1;
					printf("%s", $3);
					/* leave space for arg */
					offset = 1;
				}
			>>
			RPAREN
			<<
				printf(")\n{\n");
				++indent;
				tab();
				printf( "register int r0, r1, r2, r3, r4, r5, "
					"r6, r7;\n");
			>>
			stat
			<<
				tab();
				printf("return(0);\n}\n");
				--indent;
			>>
		)*
	;

slist	:	( stat )*
	;

stat	:	<<
			/* These are intended to be for any alt */
			register int myclass;
		>>
	(
		LBRACE
		<<
			tab();
			printf("{\n");
			++indent;
		>>
		<<
			{
				register sym *p = symptr;
		>>
		decls[1] slist
		<<
				while (symptr > p) {
					free((char *) symptr->text);
					--symptr;
				}
			}
		>>
		RBRACE
		<<
			--indent;
			tab();
			printf("}\n");
		>>
	|	<<
			{
				register sym *p;
				register int s;
		>>
		lval_array > [p,s]
		EQUAL
		cond > [myclass]
		<<
			if ((p->class == SINGLE) && (myclass != SINGLE))
				warn("int = plural assignment");
			tab();
			if (p->class != SINGLE) {
				++indent;
				printf("if (ENABLED) {\n");
				tab();
				if (s == NONARRAY)
				{
					printf("%s = r%d;\n", p->text, reg);
				}
				else
				{
					printf("%s[r%d] = r%d;\n",
						p->text, reg-1, reg);
				}
				--indent;
				tab();
				printf("}\n");
			} else if ((s == NONARRAY) && (myclass == SINGLE)) {
				printf("%s = r%d;\n", p->text, reg);
			} else {
				if (s == NONARRAY)
					printf("%s = simd_reduceOr(r%d);\n",
						p->text, reg);
				else
					printf("%s[r%d] = simd_reduceOr(r%d);"
						"\n", p->text, reg-1, reg);
			}
			--reg;
		>>
		SEMI
		<<
			}
		>>
	|	K_IF
		<<
			tab();
			++indent;
			printf("{\n");
		>>
		<<
			{
				register int ctype;
		>>
		newcond > [ctype]
		<<
				tab();
				if (ctype == SINGLE) {
					printf("if (r%d) {\n", reg);
				} else {
					printf("ENPUSH;\n");
					tab();
					printf("if (simd_any(r%d)) {\n", reg);
				}
				++indent;
		>>
		stat
		<<
				--indent;
		>>
		{
			K_ELSE
			<<
				tab();
				if (ctype != SINGLE) {
					printf("}\n");
					tab();
					printf("ENELSE;\n");
					tab();
					printf("if (simd_any(1)) {\n");
				} else {
					printf("} else {\n");
				}
				++indent;
			>>
			stat
			<<
				--indent;
			>>
		}
		<<
				tab();
				printf("}\n");
				if (ctype != SINGLE) {
					tab();
					printf("ENPOP;\n");
				}
			}
			--indent;
			tab();
			printf("}\n");
		>>
	|	K_WHILE
		<<
			tab();
			++indent;
			printf("{\n");
			tab();
			printf("ENPUSH;\n");
			{
				static whilelab = 0;
				register lab = ++whilelab;
				register int ctype;

				printf("while_%d:\n", lab);
		>>
		newcond > [ctype]
		<<
				if ( ctype == SINGLE) {
					tab();
					++indent;
					printf("if (r%d) {\n", reg);
				} else {
					tab();
					++indent;
					printf("if (simd_any(r%d)) {\n", reg);
				}
		>>
		stat
		<<
				tab();
				printf("goto while_%d;\n", lab);
				--indent;
				tab();
				printf("}\n");
			}
			tab();
			--indent;
			printf("ENPOP;\n");
			tab();
			printf("}\n");
		>>
	|	K_RETURN
		newcond
		<<
			tab();
			printf("return(r%d);\n", reg);
		>>
		SEMI
	|	K_PRINT
		newcond
		<<
			tab();
			printf("simd_print(r%d);\n", reg);
		>>
		SEMI
	|	K_ALL
		<<
			tab();
			printf("{\n");
			++indent;
			tab();
			printf("ENALL;\n");
		>>
		stat
		<<
			tab();
			printf("ENRESTORE;\n");
			--indent;
			tab();
			printf("}\n");
		>>
	|	SEMI
	)
	;

newcond > [int unnamed] :
		<<
			reg = -1;
		>>
		cond > [$unnamed]
	;

cond > [int myclass] :
		<<
			register char op;
			register int mytype;
		>>
		expr > [$myclass]
		(
			(  GT	<< op = '> >>
			 | LT	<< op = '< >>
			)
			expr > [mytype]
			<<
				if (mytype == PLURAL) $myclass = PLURAL;
				tab();
				printf("r%d = (r%d %c r%d);\n",
					 reg-1, reg-1, op, reg);
				--reg;
			>>
		)*
	;

expr > [int myclass] :
		<<
			register char op;
			register int mytype;
		>>
		term > [$myclass]
		(	(  PLUS		<< op = '+'; >>
			 | MINUS	<< op = '-'; >>
			)
			term > [mytype]
			<<
				if (mytype == PLURAL) $myclass = PLURAL;
				tab();
				printf("r%d %c= r%d;\n", reg-1, reg);
				--reg;
			>>
		)*
	;

term > [int myclass] :
		<<
			register char op;
			register int mytype;
		>>
		fact > [$myclass]
		(	(  STAR		<< op = '*'; >>
			 | SLASH	<< op = '/'; >>
			 | MOD		<< op = '%'; >>
			)
			fact > [mytype]
			<<
				if (mytype == PLURAL) $myclass = PLURAL;
				tab();
				printf("r%d %c= r%d;\n", reg-1, op, reg);
				--reg;
			>>
		)*
	;

fact > [int myclass] :
		<<
			sym *p;
			register int s;
		>>
		NUM
		<<
			++reg;
			tab();
			printf("r%d = %d;\n", reg, atoi($1));
			$myclass = SINGLE;
		>>
	|	rval_array > [p,s]
		{
			LPAREN
			<<
				if (s != CALL)
					error("undefined function");
			>>
			cond
			RPAREN
			<<
				tab();
				printf("r%d = %s(r%d);\n", reg,&(buf[0]),reg);
				$myclass = s = PLURAL;
			>>
		}
		<<
			if (s == CALL)
				error("undefined variable");
			else {
				if (s == PLURAL)
					$myclass = PLURAL;
				else if (p->class != SINGLE)
					$myclass = PLURAL;
				else
					$myclass = SINGLE;
					
			}
		>>
	|	MINUS fact > [$myclass]
		<<
			tab();
			printf("r%d = -r%d;\n", reg, reg);
		>>
	|	BANG fact > [$myclass]
		<<
			tab();
			printf("r%d = !r%d;\n", reg, reg);
		>>
	|	LPAREN cond > [$myclass] RPAREN
	|	K_ROUTER
		<<
			/* Get the PE number we want to get from... */
		>>
		LBRACK cond RBRACK
		<<
			/* Evaluate the expression on the other guy... */
		>>
		DOT
		<<
			tab();
			printf("{\n");
			++indent;
			tab();
			printf("ENREMOTE(r%d);\n", reg);
		>>
		fact
		<<
			tab();
			printf("ENRESTORE;\n");
			--indent;
			tab();
			printf("}\n");

			/* Send the values.... */
			tab();
			printf("r%d = simd_putget(r%d, r%d);\n",
				reg-1, reg, reg-1);
			--reg;
			$myclass = PLURAL;
		>>
	;

lval_array > [sym *p, int subclass]:
		<<
			$subclass = NONARRAY;
		>>
		WORD
		<<
			$p = lookup ($1);
		>>
		{
			LBRACK
		 	newcond > [$subclass]
			<<
				if (($p->class==SINGLE) && ($subclass!=SINGLE))
					warn("plural subscript on int array");
			>>
		 	RBRACK
		}
	;

rval_array > [sym *p, int subclass]:
		<<
			$subclass = CALL;
		>>
		WORD
		<<
			$p = lookup ($1);
			if ($p == NULL) {
				/* a function call */
				strcpy(&(buf[0]), $1);
			} else {
				/* a variable */
				$subclass = NONARRAY;
			}
		>>
		{
			LBRACK
			<<
				if ($subclass == CALL)
					error("undefined variable");
			>>
			cond > [$subclass]
			<<
					tab();
					printf("r%d = %s[r%d];\n",
						reg, $p->text, reg);
			>>
			RBRACK
		}
		<<
			if ($subclass == NONARRAY) {
				++reg;
				tab();
				printf("r%d = %s;\n", reg, $p->text);
			}
		>>
	;

/* array as defined in lang spec:
array:
		WORD { LBRACK cond RBRACK }
	;
*/


This page was last modified .