A Full Compiler Example Using PCCTS:
This is a simple example of a complete compiler done using PCCTS. This is a
slightly modified version of the first project in EE573 at Purdue a couple of
years back. There may be some problems with the functionality of this
compiler, but it still serves as a good example of a non-AST based compiler.
Subheading
/* nempl.g *******************************************************************
Coding History:
??-??-?? H.Dietz C coded sol'n to project
02-17-96 R.Fisher Base grammar and tokens
02-18-96 R.Fisher Added actions from Hank's solution
02-19-96 R.Fisher Edit of actions for PCCTS
02-20-96 R.Fisher Some debugging.
02-22-96 R.Fisher Tried to write "array" with passed args, etc.
02-24-96 R.Fisher Break "array" into "lval_" and "rval_" "array"
Notes:
This is an attempt to meet the original nempl spec as presented in
the EE595E Project Handout (13 September 1994) by Hank Dietz.
The only non-spec part that I know if is that it allows mixed-case
keywords.
This is essentially a port of Hank's solution to PCCTS with some minor
adjustments.
ToDo:
Put in proper handling for WORDs.
Note that the spec allows functionname = whatever, but Hank's soln
does not.
Split "array" into two parts, one for handling lvals, one for rvals.
*****************************************************************************/
/* Start up code ************************************************************
This makes the program start ANTLR, which will begin parsing the
standard input starting with the rule "prog".
*****************************************************************************/
#header
<<
#include "charptr.h"
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#define DEBUG 1
#define SINGLE 1
#define PLURAL 2
#define NONARRAY 3
#define CALL 4
#define sym struct _sym
sym {
char *text;
int type;
int class;
int base;
int size;
};
extern sym symtab[];
#define ZZCOL
>>
<<
sym symtab[513];
sym *symptr = &(symtab[0]);
int indent = 0; /* current output indent */
int offset = 0; /* stack offset for var */
int reg = -1; /* next register to use */
char buf[513]; /* Used by rval_array and fact to pass a string */
#include "charptr.c"
main(argc, argv)
int argc;
char **argv;
{
if (argc != 1) {
error("no command-line args; use redirection for file I/O");
}
ANTLR (prog(), stdin);
return(0);
}
warn(fmt, a, b, c)
char *fmt;
int a, b, c;
{
fprintf(stderr, "line %d: ", zzline);
fprintf(stderr, fmt, a, b, c);
fprintf(stderr, "\n");
}
error(fmt, a, b, c)
char *fmt;
int a, b, c;
{
warn(fmt, a, b, c);
exit(1);
}
sym *
enter(s)
register char *s;
{
/* extern char *malloc(); */
register char *p;
if ( (p=malloc(strlen(s) + 1)) == NULL )
fprintf(stderr,"Couldn't alloc space for symbol table entry\n");
if ( DEBUG )
fprintf ( stderr, "Entering %s into symtab\n", s );
strcpy(p, s);
symptr->text = p;
if ( DEBUG )
fprintf ( stderr, "%s entered into symtab\n", symptr->text );
return(symptr++);
}
sym *
lookup(s)
register char *s;
{
register sym *p = (symptr - 1);
if ( DEBUG )
fprintf ( stderr, "Looking up %s in symtab\n", s );
while (p > &(symtab[0])) {
if (!strcmp(p->text, s)) {
if ( DEBUG )
fprintf(stderr,"Found %s in symtab\n", p->text);
return(p);
}
--p;
}
if ( DEBUG )
fprintf( stderr,"%s not found in symtab\n", s );
return(NULL);
}
tab()
{
register int x;
for (x=0; x<indent; ++x) {
putchar(' ');
}
}
>>
/* Tokens *******************************************************************
These are 'regular expressions' which describe the basic building
blocks that the lexer will recognize as it scans the input.
They may be unnamed, but if they have a name, it must begin with a
capital letter.
The <<gt;> action after the last defined token ensures that any
following grammar actions are not assumed to be for the last token in
the list.
#lexaction places an action in the lexer, not the parser.
May want to use a separate #lexclass (DFA) to handle comments, etc.
(( zzmode(COMMENT) >> would switch to the DFA defined by
#lexclass COMMENT. The lexclass START is the default lexclass.
Tokens that are special cases of a more general regex should be
placed before the more general regex. The first matching token is the
one used for a given input string.
%%, << and >> have control meanings for DLG (the lexer generator), so
they must be escaped if needed as literals in a regex.
Also,
Character Matches
@ End of input
\t tab
\n newline
\r carriage return
\b backspace
\0nnn Char with octal value nnn
\0xnn Char with hex value nn
\mnn Char with dec value mnn. 1<m<9
\c The character 'c'.
See section 2.6 of PCCTS manual for explanation of the regex's.
Things that can be used in an action:
zzlextext is a variable which contains the text of the token.
zzreplchar(c) - replaces a single character token with 'c'.
zzreplstr(s) - replaces a string token with "s".
zzmode(lexclass) - start using the lexical description for
'lexclass'.
zzmore() - Continue lexing. Next thing is part of this token.
zzline is a variable representing the number of input lines
counted. It must be user-updated. This is enabled by
adding a "#define ZZCOL" to an action in the lexer.
zzbegexpr - A ptr to the beginning of the current token text.
zzendexpr - A ptr to the end of the current token text,
excluding the automatically added '\0'.
zztokens (num) - Returns the label or regex defined in the
grammar for token number 'num'.
zzbegcol is a variable representing the beginning column of
the input line? Must be enabled as zzline is.
zzendcol is a variable representing the ending column of the
input line? It must be user-corrected for tabs and returns.
Must be enabled as zzline is.
****************************************************************************/
#lexclass START
/* Punctuation */
#token "[\ \t]+" << zzskip(); >> /* Leading White */
#token BANG "!"
#token DOT "\."
#token EQUAL "="
#token GT "\>
#token LBRACK "\["
#token LBRACE "\{"
#token LPAREN "\("
#token LT "\<
#token MINUS "\-"
#token MOD "%"
#token NL "\n" << zzline++; zzmore(); >>
#token PLUS "\+"
#token RBRACK "\]"
#token RBRACE "\}"
#token RPAREN "\)"
#token SEMI ";"
#token SLASH "\/"
#token STAR "\*"
/* Keywords and Intrinsic Function Names - Accepts any mix of cases. */
#token K_ALL "[aA][lL][lL]"
#token K_ELSE "[eE][lL][sS][eE]"
#token K_IF "[iI][fF]"
#token K_INT "[iI][nN][tT]"
#token K_IPROC "[iI][pP][rR][oO][cC]"
#token K_NPROC "[nN][pP][rR][oO][cC]"
#token K_PRINT "[pP][rR][iI][nN][tT]"
#token K_PLURAL "[pP][lL][uU][rR][aA][lL]"
#token K_RETURN "[rR][eE][tT][uU][rR][nN]"
#token K_ROUTER "[rR][oO][uU][tT][eE][rR]"
#token K_WHILE "[wW][hH][iI][lL][eE]"
/* Non-Key Patterns */
#token WORD "[a-zA-Z][a-zA-Z0-9]*"
#token NUM "[0-9]+"
/* End lexclass ***************************************/
/* Rules ********************************************************************
These are the rules that the parser follows to try to determine the
meaning of the input. These specify other rules to try, and actions
to take when parts of the rule are satisfied.
Their names must begin with a lower-case letter.
Notes:
Subrules are allowed at most one parameter which must be of
type "Attrib".
$0 represents the return value of a rule.
$n (n<0) represents the value of the nth item in the
current alternative of the rule being matched.
****************************************************************************/
prog : << printf("#include \"nempl.h\"\n\n"); >>
decls[0]
funcs
;
decls[int scope] :
<<
register int sc;
register int newoff = 0;
>>
(
<< register sym *s; >>
stc > [sc]
WORD
<< s = enter($2);
s->type = $scope;
s->class = sc;
s->base = offset + newoff;
s->size = 1;
>>
{
LBRACK
NUM
<< s->size = atoi($2); >>
RBRACK
}
<< /* add size of this var to total allocated... */
newoff += s->size;
tab();
if (s->size > 1) {
printf("int %s[%d];\n", s->text, s->size);
} else {
printf("int %s;\n", s->text);
}
>>
SEMI
)*
<<
if (newoff > 0) {
offset += newoff;
}
>>
;
stc > [int class] :
K_INT <<$class = SINGLE;>>
| K_PLURAL <<$class = PLURAL;>>
;
funcs : (
WORD
<<
/* generate code for a function def */
printf("\n%s(", $1);
>>
LPAREN WORD
<<
{
register sym *s = enter($3);
s->type = 1;
s->class = PLURAL;
s->base = 0;
s->size = 1;
printf("%s", $3);
/* leave space for arg */
offset = 1;
}
>>
RPAREN
<<
printf(")\n{\n");
++indent;
tab();
printf( "register int r0, r1, r2, r3, r4, r5, "
"r6, r7;\n");
>>
stat
<<
tab();
printf("return(0);\n}\n");
--indent;
>>
)*
;
slist : ( stat )*
;
stat : <<
/* These are intended to be for any alt */
register int myclass;
>>
(
LBRACE
<<
tab();
printf("{\n");
++indent;
>>
<<
{
register sym *p = symptr;
>>
decls[1] slist
<<
while (symptr > p) {
free((char *) symptr->text);
--symptr;
}
}
>>
RBRACE
<<
--indent;
tab();
printf("}\n");
>>
| <<
{
register sym *p;
register int s;
>>
lval_array > [p,s]
EQUAL
cond > [myclass]
<<
if ((p->class == SINGLE) && (myclass != SINGLE))
warn("int = plural assignment");
tab();
if (p->class != SINGLE) {
++indent;
printf("if (ENABLED) {\n");
tab();
if (s == NONARRAY)
{
printf("%s = r%d;\n", p->text, reg);
}
else
{
printf("%s[r%d] = r%d;\n",
p->text, reg-1, reg);
}
--indent;
tab();
printf("}\n");
} else if ((s == NONARRAY) && (myclass == SINGLE)) {
printf("%s = r%d;\n", p->text, reg);
} else {
if (s == NONARRAY)
printf("%s = simd_reduceOr(r%d);\n",
p->text, reg);
else
printf("%s[r%d] = simd_reduceOr(r%d);"
"\n", p->text, reg-1, reg);
}
--reg;
>>
SEMI
<<
}
>>
| K_IF
<<
tab();
++indent;
printf("{\n");
>>
<<
{
register int ctype;
>>
newcond > [ctype]
<<
tab();
if (ctype == SINGLE) {
printf("if (r%d) {\n", reg);
} else {
printf("ENPUSH;\n");
tab();
printf("if (simd_any(r%d)) {\n", reg);
}
++indent;
>>
stat
<<
--indent;
>>
{
K_ELSE
<<
tab();
if (ctype != SINGLE) {
printf("}\n");
tab();
printf("ENELSE;\n");
tab();
printf("if (simd_any(1)) {\n");
} else {
printf("} else {\n");
}
++indent;
>>
stat
<<
--indent;
>>
}
<<
tab();
printf("}\n");
if (ctype != SINGLE) {
tab();
printf("ENPOP;\n");
}
}
--indent;
tab();
printf("}\n");
>>
| K_WHILE
<<
tab();
++indent;
printf("{\n");
tab();
printf("ENPUSH;\n");
{
static whilelab = 0;
register lab = ++whilelab;
register int ctype;
printf("while_%d:\n", lab);
>>
newcond > [ctype]
<<
if ( ctype == SINGLE) {
tab();
++indent;
printf("if (r%d) {\n", reg);
} else {
tab();
++indent;
printf("if (simd_any(r%d)) {\n", reg);
}
>>
stat
<<
tab();
printf("goto while_%d;\n", lab);
--indent;
tab();
printf("}\n");
}
tab();
--indent;
printf("ENPOP;\n");
tab();
printf("}\n");
>>
| K_RETURN
newcond
<<
tab();
printf("return(r%d);\n", reg);
>>
SEMI
| K_PRINT
newcond
<<
tab();
printf("simd_print(r%d);\n", reg);
>>
SEMI
| K_ALL
<<
tab();
printf("{\n");
++indent;
tab();
printf("ENALL;\n");
>>
stat
<<
tab();
printf("ENRESTORE;\n");
--indent;
tab();
printf("}\n");
>>
| SEMI
)
;
newcond > [int unnamed] :
<<
reg = -1;
>>
cond > [$unnamed]
;
cond > [int myclass] :
<<
register char op;
register int mytype;
>>
expr > [$myclass]
(
( GT << op = '> >>
| LT << op = '< >>
)
expr > [mytype]
<<
if (mytype == PLURAL) $myclass = PLURAL;
tab();
printf("r%d = (r%d %c r%d);\n",
reg-1, reg-1, op, reg);
--reg;
>>
)*
;
expr > [int myclass] :
<<
register char op;
register int mytype;
>>
term > [$myclass]
( ( PLUS << op = '+'; >>
| MINUS << op = '-'; >>
)
term > [mytype]
<<
if (mytype == PLURAL) $myclass = PLURAL;
tab();
printf("r%d %c= r%d;\n", reg-1, reg);
--reg;
>>
)*
;
term > [int myclass] :
<<
register char op;
register int mytype;
>>
fact > [$myclass]
( ( STAR << op = '*'; >>
| SLASH << op = '/'; >>
| MOD << op = '%'; >>
)
fact > [mytype]
<<
if (mytype == PLURAL) $myclass = PLURAL;
tab();
printf("r%d %c= r%d;\n", reg-1, op, reg);
--reg;
>>
)*
;
fact > [int myclass] :
<<
sym *p;
register int s;
>>
NUM
<<
++reg;
tab();
printf("r%d = %d;\n", reg, atoi($1));
$myclass = SINGLE;
>>
| rval_array > [p,s]
{
LPAREN
<<
if (s != CALL)
error("undefined function");
>>
cond
RPAREN
<<
tab();
printf("r%d = %s(r%d);\n", reg,&(buf[0]),reg);
$myclass = s = PLURAL;
>>
}
<<
if (s == CALL)
error("undefined variable");
else {
if (s == PLURAL)
$myclass = PLURAL;
else if (p->class != SINGLE)
$myclass = PLURAL;
else
$myclass = SINGLE;
}
>>
| MINUS fact > [$myclass]
<<
tab();
printf("r%d = -r%d;\n", reg, reg);
>>
| BANG fact > [$myclass]
<<
tab();
printf("r%d = !r%d;\n", reg, reg);
>>
| LPAREN cond > [$myclass] RPAREN
| K_ROUTER
<<
/* Get the PE number we want to get from... */
>>
LBRACK cond RBRACK
<<
/* Evaluate the expression on the other guy... */
>>
DOT
<<
tab();
printf("{\n");
++indent;
tab();
printf("ENREMOTE(r%d);\n", reg);
>>
fact
<<
tab();
printf("ENRESTORE;\n");
--indent;
tab();
printf("}\n");
/* Send the values.... */
tab();
printf("r%d = simd_putget(r%d, r%d);\n",
reg-1, reg, reg-1);
--reg;
$myclass = PLURAL;
>>
;
lval_array > [sym *p, int subclass]:
<<
$subclass = NONARRAY;
>>
WORD
<<
$p = lookup ($1);
>>
{
LBRACK
newcond > [$subclass]
<<
if (($p->class==SINGLE) && ($subclass!=SINGLE))
warn("plural subscript on int array");
>>
RBRACK
}
;
rval_array > [sym *p, int subclass]:
<<
$subclass = CALL;
>>
WORD
<<
$p = lookup ($1);
if ($p == NULL) {
/* a function call */
strcpy(&(buf[0]), $1);
} else {
/* a variable */
$subclass = NONARRAY;
}
>>
{
LBRACK
<<
if ($subclass == CALL)
error("undefined variable");
>>
cond > [$subclass]
<<
tab();
printf("r%d = %s[r%d];\n",
reg, $p->text, reg);
>>
RBRACK
}
<<
if ($subclass == NONARRAY) {
++reg;
tab();
printf("r%d = %s;\n", reg, $p->text);
}
>>
;
/* array as defined in lang spec:
array:
WORD { LBRACK cond RBRACK }
;
*/
This page was last modified
.