// BNF grammar for sweet-expressions, an easy-to-read S-expression notation.
// (c) 2012-2013 David A. Wheeler, released under "MIT" license.

// This BNF is an LL(1) grammar, written using ANTLR version 3.
// More info on ANTLR is at "http://www.antlr.org/".
// It is written to be easy to translate to a recursive-descent parser,
// even one that doesn't do full tokenizing.

// The actions in "sweet.g" are written inside {...} using Java syntax,
// but invoke normal Scheme procedures (e.g., "car" and "cons") as follows:
// +------------------+------------+------------------------------------------+
// | Action Construct | Scheme     | Discussion                               |
// +------------------+------------+------------------------------------------+
// | cons(x, y)       | (cons x y) | Java syntax used for procedure calls     |
// | nullp(x)         | (null? x)  | "?" in Scheme procedure name becomes "p" |
// | null             | '()        | Java null represents the empty list      |
// | "somename"       | 'somename  | Java strings represent atoms             |
// +------------------+------------+------------------------------------------+
// The program "schemify" translates these actions into Scheme, assuming
// they meet certain formatting rules.

// This BNF specifically shows how non-indent whitespace is handled
// (SRFI-49's BNF is simpler but isn't specific about this, making it
// harder to interpret).  Note that this BNF presumes there's a preprocessor
// that does indent processing.  While indent processing is enabled, indents
// are marked with INDENT and dedents with DEDENT (one for each dedent).
// Lines with ONLY indent characters are considered blank lines.
// There is no "SAME" token to show "same indent level", but some rules
// use an empty "same" nonterminal to emphasize their lack of INDENT/DEDENT.
// Bad indentation emits BADDENT, which is never accepted.
// If there's an indent when there is NO preceding un-indented line, the
// preprocessor emits INITIAL_INDENT; this does *NOT* change the indent level
// (this is so initial indents will disable indentation processing,
// but only on that line).

// TODO:
// - (Maybe) Handle EOF in weird places.

grammar sweet;

options { k = 1; } // Force grammar to be LL(1).

// LEXER SECTION

tokens {
  BADDENT;  // Generated by indent processor, always illegal.
}

// ANTLR v3 does not allow the parser to communicate to the lexer
// (the lexer runs completely before the parser begins).
// The lexer *can* store and use state, which we have to do anyway
// if we're going to do indentation processing inside the lexer.
// Thus, in this implementation, the lexer
// tracks the parentheses enclosure level, so that \n is just a delimiter
// inside (...) but is handled specially in indentation processing,
// while symbols like "$" are just ordinary atoms inside (...).
// The lexer must also specially note "<*" and "*>" and specially modify
// the indentation levels when those are received.

// This ANTLR lexer does indentation processing, so it generates
// INDENT, DEDENT, and so on as appropriate.

@lexer::header {
import java.util.Deque;
}

@lexer::members {

  // Track enclosure level. "(" etc. adds 1, ")" etc. subtracts:
  long enclosure = 0;

  Boolean initial_indent = false;

  // Are we doing indent processing?:
  private Boolean indent_processing() {
    return (enclosure == 0) && !initial_indent;
  }

  // Permit sending multiple tokens per rule, per ANTLR FAQ.
  // This is necessary to support DEDENT, as a single token may cause
  // the generation of multiple DEDENTs.
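  // Illustrative sketch (not part of the original grammar): with an indent
  // stack holding "", "  ", and "    ", a following line that is flush left
  // makes the EOL rule below emit three tokens from a single lexer rule --
  // EOL, DEDENT, DEDENT.  emit() queues them all, and nextToken() then
  // hands them to the parser one at a time.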
  Deque<Token> tokens = new java.util.ArrayDeque<Token>();

  @Override
  public void emit(Token token) {
    state.token = token;
    tokens.addLast(token);
  }

  @Override
  public Token nextToken() {
    super.nextToken();
    if (tokens.isEmpty()) return Token.EOF_TOKEN;
    return tokens.removeFirst();
  }

  // This stack records the string indents; use push, pop, peek.
  Deque<String> indents = new java.util.ArrayDeque<String>();

  private Boolean outside_t_expr() {
    return indent_processing()
           && (indents.isEmpty() || indents.peek().equals(""));
  }

  private Boolean indentation_greater_than(String indent1, String indent2) {
    int len1 = indent1.length();
    int len2 = indent2.length();
    return (len1 > len2) && indent2.equals(indent1.substring(0, len2));
  }

  private void process_indent(String indent_text, Token t) {
    if (indents.isEmpty()) indents.push("");
    if (indents.peek().equals(indent_text)) {
      // no-op, aka "same"
    } else if (indentation_greater_than(indent_text, indents.peek())) {
      // System.out.print("Generate INDENT\n");
      t.setType(INDENT);
      emit(t);
      indents.push(indent_text);
    } else if (indentation_greater_than(indents.peek(), indent_text)) {
      while (!indents.isEmpty()
             && indentation_greater_than(indents.peek(), indent_text)) {
        indents.removeFirst(); // drop
        // System.out.print("Generate DEDENT(s)\n");
        t.setType(DEDENT);
        emit(t);
      }
      if (!indents.peek().equals(indent_text)) {
        // System.out.print("Generate BADDENT(s)\n");
        t.setType(BADDENT);
        emit(t);
      }
    } else {
      // System.out.print("Generate BADDENT\n");
      t.setType(BADDENT);
      emit(t);
    }
  }

  private void process_initial_indent(String indent_text, Token t) {
    t.setType(INITIAL_INDENT);
    emit(t);
    initial_indent = true;
  }

  private void restart_indent_level() {
    if (indents.isEmpty()) indents.push(""); // Initialize if needed
    indents.push(""); // Let's start over!
  }

  private void emit_type(int token_type) {
    CommonToken extra = new CommonToken(input, Token.INVALID_TOKEN_TYPE,
        Token.DEFAULT_CHANNEL, getCharIndex(), getCharIndex()-1);
    extra.setLine(getLine());
    extra.setCharPositionInLine(getCharPositionInLine());
    extra.setType(token_type);
    emit(extra);
  }

  private void process_collecting_end(Token t) {
    // Must send independent EOL token for ANTLR.
    emit_type(EOL);
    /* Dedent (if needed) */
    CommonToken dedent_it = new CommonToken(input, Token.INVALID_TOKEN_TYPE,
        Token.DEFAULT_CHANNEL, getCharIndex(), getCharIndex()-1);
    dedent_it.setLine(getLine());
    dedent_it.setCharPositionInLine(getCharPositionInLine());
    process_indent("", dedent_it); // Emit any dedents needed.
    t.setType(COLLECTING_END);
    emit(t);
    // Restore original indent stack.
    indents.removeFirst();
  }
}

@parser::header {
import scheme.*;
import static scheme.Pair.*;
}

@parser::members {

  // This is a bogus variable to work around an ANTLR bug.  ANTLR by default
  // is greedy, but warns on ambiguities unless "greedy=true", e.g.:
  //   part1 (options {greedy = true;} : hspace)* part2
  // But there's no greedy variable defined for the parser, so you can't use
  // the feature properly.  Here we create a settable variable, so you can
  // selectively disable unnecessary warnings.  If a later ANTLR also
  // defines this variable, then remove it:
  public Boolean greedy = true;

  private void generate_eof() {
    System.exit(0);
  }

  // These simulate basic Lisp/Scheme procedures:

  public static Object cadr(Pair x) {
    return car( (Pair) cdr(x));
  }
  public static Object cddr(Pair x) {
    return cdr( (Pair) cdr(x));
  }
  public static Boolean nullp(Object x) {   // Scheme "null?"
    return x == null;
  }
  public static Boolean pairp(Object x) {   // Scheme "pair?"
    return x instanceof Pair;
  }
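  // Illustrative sketch (not part of the original grammar) of how these
  // helpers mirror Scheme.  For example, the Scheme value '(a b) could be
  // built and inspected as:
  //   Object x = cons("a", cons("b", null));     // (a b)
  //   nullp(cdr((Pair) cdr((Pair) x)));          // true: list ends in '()
  //   string_datum(x);                           // "(a b)"
  // string_datum() is defined further below in this members block.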
  public static Boolean equalp(Object x, Object y) {   // Scheme "equal?"
    if ((x == null) && (y == null)) {
      return true;
    } else if ((x == null) || (y == null)) {
      return false;
    } else if (pairp(x) && pairp(y)) {
      Pair px = (Pair) x;
      Pair py = (Pair) y;
      return equalp(car(px), car(py)) && equalp(cdr(px), cdr(py));
    } else if (pairp(x) || pairp(y)) {
      return false;
    } else {
      return x.equals(y);
    }
  }

  // This does not check for cycles
  private static Boolean ends_in_null(Pair x) {
    if (nullp(cdr(x))) {
      return true;
    } else if (! pairp(cdr(x))) {
      return false;
    } else {
      return ends_in_null( (Pair) cdr(x));
    }
  }

  public static Boolean listp(Object x) {   // Scheme "list?"
    if (nullp(x)) return false;
    else if (! pairp(x)) return false;
    else return ends_in_null( (Pair) x);
  }

  public static Pair list(Object x) {
    return cons(x, null);
  }
  public static Pair list(Object x, Object y) {
    return cons(x, list(y));
  }

  // "y" really needs to be a Pair, but that makes it harder
  // to work with ANTLR
  public static Object append(Object x, Object y) {
    if (x == null) {
      return y;
    } else if (x instanceof Pair) {
      Pair p = (Pair) x;
      return cons(car(p), append(cdr(p), y));
    } else {
      throw new Error(); // for now.
    }
  }

  private static String string_datum_tail(Object x) {
    if (x == null) {
      return ")";
    } else if (x instanceof Pair) {
      Pair p = (Pair) x;
      return " " + string_datum(car(p)) + string_datum_tail(cdr(p));
    } else
      return " . " + string_datum(x) + ")";
  }

  public static String string_datum(Object x) {
    if (x == null) {
      return "()";
    } else if (x instanceof Pair) {
      Pair p = (Pair) x;
      return "(" + string_datum(car(p)) + string_datum_tail(cdr(p));
    } else {
      return x.toString();
    }
  }

  // ; Utility function:
  // ; If x is a 1-element list, return (car x), else return x
  // (define (monify x)
  //   (cond
  //     ((not (pair? x)) x)
  //     ((null? (cdr x)) (car x))
  //     (#t x)))
  public static Object monify(Object x) {
    if (! pairp(x)) {
      return x;
    } else if (nullp(cdr( (Pair) x))) {
      return car( (Pair) x);
    } else {
      return x;
    }
  }

  // This isn't an accurate implementation of "if" because it always
  // evaluates the t_branch AND f_branch.  But if t_branch and f_branch
  // have no side-effects, it's good enough.
  public static Object ifp(Boolean b, Object t_branch, Object f_branch) {
    if (b) return t_branch;
    else return f_branch;
  }

  // Is the parameter v a representation of the symbol "."?
  public static Boolean isperiodp(Object v) {
    if (!(v instanceof String)) return false;
    else return ((String) v).equals(".");
  }

  // ; --------------------------------------------------------
  // ; Curly-infix support procedures - converted from SRFI-105
  // ; --------------------------------------------------------

  // ; Return true if lyst has an even # of parameters, and the (alternating)
  // ; first parameters are "op".  Used to determine if a longer lyst is infix.
  // ; If passed empty list, returns true (so recursion works correctly).
  // (define (even-and-op-prefix? op lyst)
  //   (cond
  //     ((null? lyst) #t)
  //     ((not (pair? lyst)) #f)
  //     ((not (equal? op (car lyst))) #f) ; fail - operators not the same
  //     ((not (pair? (cdr lyst))) #f) ; Wrong # of parameters or improper
  //     (#t (even-and-op-prefix? op (cddr lyst))))) ; recurse.
  public static Boolean even_and_op_prefixp(Object op, Object lyst) {
    if (nullp(lyst)) {
      return true;
    } else if (! pairp(lyst)) {
      return false;
    } else if (! equalp(op, car( (Pair) lyst))) {
      return false;
    } else if (! pairp( cdr( (Pair) lyst))) {
      return false;
    } else {
      return even_and_op_prefixp(op, cddr( (Pair) lyst));
    }
  }
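  // Illustrative examples (not part of the original grammar) of how the
  // curly-infix helpers in this section classify lists, following SRFI-105:
  //   (even-and-op-prefix? '+ '(+ b + c))  => #t  ; alternating "+" operators
  //   (even-and-op-prefix? '+ '(* b))      => #f  ; operators differ
  //   {a + b + c}  is a simple infix list  => (+ a b c)
  //   {a + b * c}  is mixed infix          => ($nfx$ a + b * c)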
  // ; Return true if the lyst is in simple infix format
  // ; (and thus should be reordered at read time).
  // (define (simple-infix-list? lyst)
  //   (and
  //     (pair? lyst)        ; Must have list; '() doesn't count.
  //     (pair? (cdr lyst))  ; Must have a second argument.
  //     (pair? (cddr lyst)) ; Must have a third argument (we check it
  //                         ; this way for performance)
  //     (even-and-op-prefix? (cadr lyst) (cdr lyst)))) ; true if rest simple
  public static Boolean simple_infix_listp(Object lyst) {
    return pairp(lyst) &&
           pairp(cdr((Pair) lyst)) &&
           pairp(cddr((Pair) lyst)) &&
           even_and_op_prefixp(cadr((Pair) lyst), cdr((Pair) lyst));
  }

  // ; Return alternating parameters in a lyst (1st, 3rd, 5th, etc.)
  // (define (alternating-parameters lyst)
  //   (if (or (null? lyst) (null? (cdr lyst)))
  //     lyst
  //     (cons (car lyst) (alternating-parameters (cddr lyst)))))
  public static Pair alternating_parameters(Pair lyst) {
    if (nullp(lyst) || nullp(cdr(lyst))) {
      return lyst;
    } else {
      return cons(car(lyst), alternating_parameters((Pair) cddr(lyst)));
    }
  }

  // ; Not a simple infix list - transform it.  Written as a separate procedure
  // ; so that future experiments or SRFIs can easily replace just this piece.
  // Handle dollar sign specially for ANTLR.
  public static Pair transform_mixed_infix(Pair lyst) {
    return cons("$" + "nfx" + "$", lyst);
  }

  // ; Given curly-infix lyst, map it to its final internal format.
  // (define (process-curly lyst)
  //   (cond
  //     ((not (pair? lyst)) lyst) ; E.G., map {} to ().
  //     ((null? (cdr lyst)) ; Map {a} to a.
  //       (car lyst))
  //     ((and (pair? (cdr lyst)) (null? (cddr lyst))) ; Map {a b} to (a b).
  //       lyst)
  //     ((simple-infix-list? lyst) ; Map {a OP b [OP c...]} to (OP a b [c...])
  //       (cons (cadr lyst) (alternating-parameters lyst)))
  //     (#t (transform-mixed-infix lyst))))
  public static Object process_curly(Object lyst) {
    if (! pairp(lyst)) {
      return lyst;
    } else {
      Pair plyst = (Pair) lyst;
      if (nullp(cdr(plyst))) {
        return car(plyst);   // Map {a} to a.
      } else if (pairp(cdr(plyst)) && nullp(cddr(plyst))) {
        return plyst;        // Map {a b} to (a b).
      } else if (simple_infix_listp(plyst)) {
        return cons(cadr(plyst), alternating_parameters(plyst));
      } else {
        return transform_mixed_infix(plyst);
      }
    }
  }
}

// Define start symbol for parser (rest of parser is later).
start : t_expr {System.out.print(string_datum($t_expr.v) + "\n"); } ;

// LEXER SECTION.
// Lexical token (terminal) names are in all upper case

// INCLUDE IN SRFI

SPACE  : ' ';
TAB    : '\t';
PERIOD : '.';

// Special markers, which only have meaning outside (), [], {}.
GROUP_SPLIT    : {indent_processing()}? => '\\' '\\'; // GROUP/split symbol.
SUBLIST        : {indent_processing()}? => '$';
COLLECTING     : {indent_processing()}? => '<*' { restart_indent_level(); } ;
// This generates EOL + (any DEDENTs) + COLLECTING_END, and restores indents:
COLLECTING_END : {indent_processing()}? => t='*>' {process_collecting_end($t);};
RESERVED_TRIPLE_DOLLAR : {indent_processing()}? => '$$$'; // Reserved.

// Abbreviations followed by certain whitespace are special:
APOSW           : {indent_processing()}? => '\'' (SPACE | TAB) ;
QUASIQUOTEW     : {indent_processing()}? => '\`' (SPACE | TAB) ;
UNQUOTE_SPLICEW : {indent_processing()}? => ',@' (SPACE | TAB) ;
UNQUOTEW        : {indent_processing()}? => ',' (SPACE | TAB) ;

// Abbreviations followed by EOL also generate abbrevW:
APOS_EOL : {indent_processing()}? => '\'' EOL_SEQUENCE SPECIAL_IGNORED_LINE*
    i=INDENT_CHARS_PLUS
    {emit_type(APOSW); emit_type(EOL); process_indent($i.text, $i);};
QUASIQUOTE_EOL : {indent_processing()}? => '\`' EOL_SEQUENCE SPECIAL_IGNORED_LINE*
    i=INDENT_CHARS_PLUS
    {emit_type(QUASIQUOTEW); emit_type(EOL); process_indent($i.text, $i);};
UNQUOTE_SPLICE_EOL : {indent_processing()}? => ',@' EOL_SEQUENCE SPECIAL_IGNORED_LINE*
    i=INDENT_CHARS_PLUS
    {emit_type(UNQUOTE_SPLICEW); emit_type(EOL); process_indent($i.text, $i);};
UNQUOTE_EOL : {indent_processing()}? => ',' EOL_SEQUENCE SPECIAL_IGNORED_LINE*
    i=INDENT_CHARS_PLUS
    {emit_type(UNQUOTEW); emit_type(EOL); process_indent($i.text, $i);};

// Abbreviations not followed by horizontal space are ordinary:
APOS           : '\'';
QUASIQUOTE     : '\`';
UNQUOTE_SPLICE : ',@';
UNQUOTE        : ',';

// Special end-of-line character definitions.
fragment EOL_CHAR      : '\n' | '\r' ;
fragment NOT_EOL_CHAR  : (~ (EOL_CHAR));
fragment NOT_EOL_CHARS : NOT_EOL_CHAR*;
fragment EOL_SEQUENCE  : ('\r' '\n'? | '\n');

// Comments.  LCOMMENT=line comment, scomment=special comment.
LCOMMENT : ';' NOT_EOL_CHARS ;  // Line comment - doesn't include EOL
BLOCK_COMMENT : '#|'            // This is #| ... |#
    (options {greedy=false;} : (BLOCK_COMMENT | .))*
    '|#' ;
DATUM_COMMENT_START : '#;' ;

// SRFI-105 notes that "implementations could trivially support
// (simultaneously) markers beginning with #! followed by a letter
// (such as the one to identify support for curly-infix-expressions),
// the SRFI-22 #!+space marker as an ignored line, and the
// format #!/ ... !# and #!. ... !# as a multi-line comment."
// We'll implement that approach for maximum flexibility.
SRFI_22_COMMENT : '#! ' NOT_EOL_CHARS ;
SHARP_BANG_FILE : '#!' ('/' | '.')
    (options {greedy=false;} : .)*
    '!#' (SPACE|TAB)* ;
// These match #!fold-case, #!no-fold-case, #!sweet, and #!curly-infix;
// it also matches a lone "#!".  The "#!"+space case is handled above,
// in SRFI_22_COMMENT, overriding this one:
SHARP_BANG_DIRECTIVE : '#!'
    (('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9'|'-')*)?
    (SPACE|TAB)* ;

// STOP INCLUDING IN SRFI

// Specially handle formfeed (\f) and vertical tab (\v).
// We support lone formfeeds on a line to support the GNU Coding Standards
// (http://www.gnu.org/prep/standards/standards.html), which says:
// "Please use formfeed characters (control-L) to divide the program
// into pages at logical places (but not within a function)...
// The formfeeds should appear alone on lines by themselves."
// Some argue against vertical tabs (http://prog21.dadgum.com/76.html).
// Any FF and VT must be at the beginning of a line, must be followed by
// EOL_SEQUENCE, and they terminate a t-expression.
FF : '\f'     {if (enclosure==0) initial_indent = true;} ; // Formfeed, \u000c
VT : '\u000b' {if (enclosure==0) initial_indent = true;} ; // Vertical tab, \v

// Various forms of comments - line comments and special comments.

// We'll specifically ignore lines that begin with ";"
LCOMMENT_LINE : {(getCharPositionInLine() == 0)}? =>
    ';' contents=NOT_EOL_CHARS EOL_SEQUENCE
    {
      // We can't directly use the ANTLR Lexer to generate ";" comments,
      // because the lexer doesn't run synchronously with the parser.
      // That's no problem, because the spec is designed to allow that
      // to be handled separately and not here.
      // if (outside_t_expr()) System.out.println(";" + $contents.text);
      skip();
    };

// The following implements INDENT/DEDENT tokens, semi-similar to Python in
// http://docs.python.org/2/reference/lexical_analysis.html#indentation
// Page 95 of "The Definitive ANTLR Reference" has a code outline for
// generating INDENT and DEDENT, but it is fundamentally wrong for us.
// That only acts when there are 1+ indent characters on a line, so it
// cannot implement all the DEDENTs necessary when it encounters a blank line.

// End-of-line (EOL) is extremely special in sweet-expressions.
// After reading it, we'll need to read in any following indent characters
// (if indent processing is active) to determine if we have an INDENT or
// DEDENTs.  As part of tokenizing, we'll consume any following lines that
// are ;-only lines, and treat indent-only lines as equivalent to blank lines.

fragment SPECIAL_IGNORED_LINE : (' ' | '\t' | '!' )* ';' NOT_EOL_CHARS
    EOL_SEQUENCE ;

fragment INDENT_CHAR       : (' ' | '\t' | '!');
fragment INDENT_CHARS      : INDENT_CHAR*;
fragment INDENT_CHARS_PLUS : INDENT_CHAR+;

// These are reported as fragments, not tokens, to silence
// spurious warnings:
fragment INITIAL_INDENT: ' ';
fragment INDENT: ' ';
fragment DEDENT: ' ';

// EOL after contents - may have a following indented/dedented line.
EOL : {enclosure==0 && ((getCharPositionInLine() != 0) && !initial_indent)}? =>
    e=EOL_SEQUENCE
    SPECIAL_IGNORED_LINE*
    i=INDENT_CHARS        // This is the indent for the next line
    extra=EOL_SEQUENCE*   // If this exists, the indents are useless.
    {
      $e.setType(EOL); emit($e);  // Emit the EOL token
      if ($extra != null || ($i.text).equals("")) {
        process_indent("", $i);   // Indented EOL = EOL
        emit($e);                 // Emit the extra EOL token
      } else {
        // Normal case: EOL, possibly followed by indent; process it.
        process_indent($i.text, $i);
      }
    } ;

BLANK_EOL : {enclosure==0 && (getCharPositionInLine() == 0)}? =>
    e=EOL_SEQUENCE
    { $e.setType(EOL); emit($e);
      process_indent("", $e); } ;

INITIAL_INDENT_EOL : {enclosure==0 && initial_indent}? =>
    e=EOL_SEQUENCE
    { $e.setType(EOL); emit($e);
      initial_indent = false; } ;

// Generate special initial indents for initial indents
// not preceded by lines with contents
INITIAL_INDENT_INIT : {(enclosure == 0) && (getCharPositionInLine() == 0) }? =>
    i=INDENT_CHARS_PLUS
    { process_initial_indent($i.text, $i); } ;

ENCLOSED_EOL_CHAR : {enclosure > 0}? => EOL_CHAR;

// Do not reference '\n' or '\r' inside a non-lexing rule in ANTLR.
// If you do, ANTLR will quietly create new lexical tokens for them, and
// those new tokens will interfere with EOL processing.  E.G., do NOT do this:
//   eolchar : '\n' | '\r';

// Other simple character or two-character sequences:
LPAREN           : '(' {enclosure++;} ;
VECTOR_START     : '#(' {enclosure++;};
BYTEVECTOR_START : '#u8(' {enclosure++;};
RPAREN           : ')' {if (enclosure>0) {enclosure--;};};
LBRACKET         : '[' {enclosure++;};
RBRACKET         : ']' {if (enclosure>0) {enclosure--;};};
LBRACE           : '{' {enclosure++;};
RBRACE           : '}' {if (enclosure>0) {enclosure--;};};

fragment BANG : '!';

// Here we give lexical definitions for the so-called simple datums,
// such as symbols and numbers... what many people would call "atoms".
// Anybody who says "Scheme doesn't have syntax" has never tried
// to parse Scheme identifiers or numbers.  Scheme's syntax for
// identifiers and numbers is extremely complicated; the number of
// productions needed is vast, even if (in our case) we're only trying
// to match a string instead of something more complex.
// They're not even all that easy to implement when you're trying to
// directly follow the spec.
// To prove the point, the following shows a typical lexical definition
// for identifiers and numbers in many languages:
//   fragment ID_INITIAL: ('A'..'Z' | 'a'..'z' | '_')
//   IDENTIFIER: ID_INITIAL (ID_INITIAL | '0'..'9')* ;
//   fragment EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
//   FLOAT : ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
//         | '.' ('0'..'9')+ EXPONENT?
//         | ('0'..'9')+ EXPONENT ;
//   INT : ('0'..'9')+ ;
//   number : INT | FLOAT ;
// Languages might have 1-3 more productions for various bases (2,8,16).
//
// In contrast, the productions below show how to parse Scheme identifiers
// and numbers, which are FULL of complications, and thus are chock-full
// of complicated productions.  Numbers, in particular, are frighteningly
// hard to get right.  In fact, the Scheme number spec (R7RS draft 8) has
// a number of problems that I had to correct (I have sent my comments
// to the spec authors).
// Note: The following maps most stuff into strings, because we don't
// need to do more than that for a translation.  The action rules can
// be changed to generate real values, not just string representations,
// but for our purposes that would add needless complications.
// Note: The spec requires that, in numbers, case is not relevant,
// so many of the productions show upper/lowercase.

IDENTIFIER :
    INITIAL SUBSEQUENT*
  | '|' SYMBOL_ELEMENT* '|'
  | PECULIAR_IDENTIFIER ;

fragment INITIAL : LETTER | SPECIAL_INITIAL | INLINE_HEX_ESCAPE ;
fragment LETTER  : 'a'..'z' | 'A'..'Z' ;
fragment SPECIAL_INITIAL : '!' | '$' | '%' | '&' | '*' | '/' | ':' | '<'
                         | '=' | '>' | '?' | '^' | '_' | '~';
fragment SUBSEQUENT : INITIAL | DIGIT | SPECIAL_SUBSEQUENT ;
fragment DIGIT      : '0' .. '9' ;
fragment HEX_DIGIT  : DIGIT | 'a'..'f' ;
fragment EXPLICIT_SIGN : '+' | '-' ;
fragment SPECIAL_SUBSEQUENT : EXPLICIT_SIGN | '.' | '@' ;
fragment INLINE_HEX_ESCAPE : '\\' 'x' HEX_SCALAR_VALUE ';' ;
fragment HEX_SCALAR_VALUE : HEX_DIGIT+ ;
fragment PECULIAR_IDENTIFIER :
    EXPLICIT_SIGN
      ( /* empty */
      | SIGN_SUBSEQUENT SUBSEQUENT*
      | '.' DOT_SUBSEQUENT SUBSEQUENT* )
  | '.' DOT_SUBSEQUENT SUBSEQUENT*
  // Note: This is a bogus extension for testing purposes, to make sure that
  // a \\ is not interpreted in an initial indent:
  | '\\' '\\' ;
fragment DOT_SUBSEQUENT  : SIGN_SUBSEQUENT | '.' ;
fragment SIGN_SUBSEQUENT : INITIAL | EXPLICIT_SIGN | '@' ;
// Annoyingly, SYMBOL_ELEMENT overlaps with STRING_ELEMENT in
// the R7RS Scheme draft 8 fix; this (below) fixes that:
fragment SYMBOL_ELEMENT :
    (~('|' | '\\'))
  | SPECIAL_STRING_ELEMENT
    // NOTE: Double-quote should NOT be here, already matched above.
  | '\\|' ;

// boolean, character, character_name
BOOLEAN : '#t' | '#f' | '#true' | '#false' ;
CHARACTER : '#\\'
    ((~ ('a'..'z'))
    | ('a'..'w' | 'y' | 'z') ('a'..'z')*
    | 'x' HEX_SCALAR_VALUE ) ;
STRING : '\"' STRING_ELEMENT* '\"' ;
fragment STRING_ELEMENT : ~( '"' | '\\' ) | SPECIAL_STRING_ELEMENT ;
fragment INTRALINE_WHITESPACE : (' ' | '\t')* EOL_SEQUENCE (' ' | '\t')* ;
fragment SPECIAL_STRING_ELEMENT :
    '\\' ('a' | 'b' | 't' | 'n' | 'r' | '"' | '\\' | INTRALINE_WHITESPACE )
  | INLINE_HEX_ESCAPE ;

// The following is structured so that exactness (if stated)
// can precede *OR* follow the radix (base) declaration, while
// (1) forbidding two exactness statements (only 1/number),
// (2) allowing exactness and radix declarations to both be optional without
//     creating a useless ambiguity.
// Instead of calling a "NUM_2", we explicitly make statements about
// exactness.
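// Illustrative examples (not part of the original grammar), assuming the
// usual R7RS reading of these prefixes: the NUMBER rule below accepts both
// "#e#b101" and "#b#e101" (exactness before or after the radix), plain
// "#b101", and unprefixed decimals such as "3.14" or "-4e6", while a
// doubled exactness prefix such as "#e#i5" does not match.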
NUMBER :
    (EXACTNESS
      (RADIX_2 COMPLEX_2 | RADIX_8 COMPLEX_8 | RADIX_10 COMPLEX_10
      | RADIX_16 COMPLEX_16 | COMPLEX_10))
  | RADIX_2  EXACTNESS? COMPLEX_2
  | RADIX_8  EXACTNESS? COMPLEX_8
  | RADIX_10 EXACTNESS? COMPLEX_10
  | RADIX_16 EXACTNESS? COMPLEX_16
  | COMPLEX_10 ;

fragment I : 'i' | 'I' ;

// These aren't used directly, they are templates.
// Note that UREAL_R is special for base 10 (supports decimal)
// The PREFIX_R has to be handled specially because EXACTNESS can be empty,
// as can RADIX_10.
//
// fragment NUM_R : PREFIX_R COMPLEX_R ;
// fragment COMPLEX_R : REAL_R
//                      ('@' REAL_R
//                      | ('+' | '-') UREAL_R? I
//                      | INFNAN I )?
//                    | ('+' | '-') UREAL_R I
//                    | INFNAN I
//                    | ('+' | '-') I ;
// fragment REAL_R : SIGN UREAL_R | INFNAN ;
// fragment UREAL_R : UINTEGER_R ('/' UINTEGER_R)?
//                  | DECIMAL_R;
// fragment UINTEGER_R : DIGIT_R+ ;
// fragment PREFIX_R : RADIX_R EXACTNESS | EXACTNESS RADIX_R ;

// fragment NUM_2 : PREFIX_2 COMPLEX_2 ;
fragment COMPLEX_2 : REAL_2
                     ('@' REAL_2
                     | ('+' | '-') UREAL_2? I
                     | INFNAN I )?
                   | ('+' | '-') UREAL_2 I
                   | INFNAN I
                   | ('+' | '-') I ;
fragment REAL_2 : SIGN UREAL_2 | INFNAN ;
fragment UREAL_2 : UINTEGER_2 ('/' UINTEGER_2)? ;
fragment UINTEGER_2 : DIGIT_2+ ;
// fragment PREFIX_2 : RADIX_2 EXACTNESS? ;

// fragment NUM_8 : PREFIX_8 COMPLEX_8 ;
fragment COMPLEX_8 : REAL_8
                     ('@' REAL_8
                     | ('+' | '-') UREAL_8? I
                     | INFNAN I )?
                   | ('+' | '-') UREAL_8 I
                   | INFNAN I
                   | ('+' | '-') I ;
fragment REAL_8 : SIGN UREAL_8 | INFNAN ;
fragment UREAL_8 : UINTEGER_8 ('/' UINTEGER_8)? ;
fragment UINTEGER_8 : DIGIT_8+ ;
// fragment PREFIX_8 : RADIX_8 EXACTNESS? ;

// fragment NUM_10 : PREFIX_10 COMPLEX_10 ;
fragment COMPLEX_10 : REAL_10
                      ('@' REAL_10
                      | ('+' | '-') UREAL_10? I
                      | INFNAN I )?
                    | ('+' | '-') UREAL_10 I
                    | INFNAN I
                    | ('+' | '-') I ;
fragment REAL_10 : SIGN UREAL_10 | INFNAN ;
// UREAL_10 has to be handled carefully.  The spec has this:
//   fragment UREAL_10 : UINTEGER_10 ('/' UINTEGER_10)? | DECIMAL_10;
//   fragment DECIMAL_10 : UINTEGER_10 SUFFIX
//                       | PERIOD DIGIT_10+ SUFFIX
//                       | DIGIT_10+ PERIOD DIGIT_10* SUFFIX ;
// which is ambiguous; for UREAL_10, "1" can match branch 1 or 2.
fragment UREAL_10 :
    /* SUFFIX can be empty, so making it optional creates unnecessary
       ambiguity */
    UINTEGER_10 ('/' UINTEGER_10 | SUFFIX | PERIOD DIGIT_10* SUFFIX)
  | PERIOD DIGIT_10+ SUFFIX ;
fragment UINTEGER_10 : DIGIT_10+ ;
// fragment PREFIX_10 : RADIX_10 EXACTNESS? ;

// fragment NUM_16 : PREFIX_16 COMPLEX_16 ;
fragment COMPLEX_16 : REAL_16
                      ('@' REAL_16
                      | ('+' | '-') UREAL_16? I
                      | INFNAN I )?
                    | ('+' | '-') UREAL_16 I
                    | INFNAN I
                    | ('+' | '-') I ;
fragment REAL_16 : SIGN UREAL_16 | INFNAN ;
fragment UREAL_16 : UINTEGER_16 ('/' UINTEGER_16)? ;
fragment UINTEGER_16 : DIGIT_16+ ;
// fragment PREFIX_16 : RADIX_16 EXACTNESS? ;

fragment INFNAN : ('+' | '-') ('inf.0' | 'nan.0') ;
fragment SUFFIX : /*empty*/ | EXPONENT_MARKER SIGN DIGIT_10+ ;
fragment EXPONENT_MARKER : 'e' | 's' | 'f' | 'd' | 'l'
                         | 'E' | 'S' | 'F' | 'D' | 'L' ;
fragment SIGN : /*empty*/ | '+' | '-' ;
// NOTE: This *requires* non-empty.
fragment EXACTNESS : '#i' | '#I' | '#e' | '#E' ;
fragment RADIX_2 : '#b' | '#B' ;
fragment RADIX_8 : '#o' | '#O' ;
// NOTE: This *requires* non-empty.
fragment RADIX_10 : '#d' | '#D';
fragment RADIX_16 : '#x' | '#X';
fragment DIGIT_2  : '0' | '1' ;
fragment DIGIT_8  : '0'..'7' ;
fragment DIGIT_10 : DIGIT ;
fragment DIGIT_16 : DIGIT_10 | 'a'..'f' | 'A'..'F';

// Define ERROR this way so we don't get a spurious ANTLR warning.
// This is used by the later "error" non-terminal.
fragment ERROR: ' ' ;

// PARSER SECTION

// Special non-terminals that act essentially as comments.
// They are used to clarify the grammar meaning, as follows:
// empty : ;  // This was used to identify an empty branch,
//            // but doing that interfered with ANTLR debugging info
same  : ;       // Emphasizes where neither indent nor dedent has occurred
error : ERROR;  // Specifically identifies an error branch.
// Note that errors can occur elsewhere, and an implementation
// may include an extension where an error is noted in this grammar.
// However, the error non-terminal makes it clear where an action is
// not defined, indicates where a parser might specifically check for
// errors, and also acts as a check on the grammar itself (to ensure that
// there isn't some valid interpretation for that sequence at that point).

// Here we create some non-terminals with the same names as terminals,
// just lowercase instead of uppercase.
// These renames are completely unnecessary, but they are helpful
// when using the ANTLR debugger.
// collecting_end : COLLECTING_END;
// indent : INDENT;
// dedent : DEDENT;

// This BNF uses the following slightly complicated pattern in some places:
//   from_n_expr ((hspace+ (stuff {action1} | /*empty*/ {action2} ))
//               | /*empty*/ {action2} )
// This is an expanded form of this BNF pattern (sans actions):
//   from_n_expr (hspace+ stuff?)?
// Note that this pattern quietly removes horizontal spaces at the
// end of the line correctly; that's important because you can't see them,
// so quietly handling them eliminates a source of hard-to-find and
// unnecessary errors.
// If from_n_expr (etc.) is as greedy as possible (it needs to be),
// we *could* instead accept this simpler BNF pattern:
//   from_n_expr hspace* stuff?
// but while that simpler BNF pattern would correctly accept *good* input,
// it would also accept *incorrect* input like "(x)q" or other n-expressions
// followed immediately by other n-expressions without intervening whitespace.
// We want to detect such situations as errors, so we'll use the
// more complex (and more persnickety) BNF pattern instead.

wspace : hspace | ENCLOSED_EOL_CHAR | FF | VT
       | LCOMMENT ; // Separators inside (...) etc.

list_contents_real returns [Object v]
  : n1=n_expr
      (wspace+
        (lc=list_contents_real {$v = cons($n1.v, $lc.v);}
        | /*empty*/            {$v = list($n1.v);})
      | /*empty*/              {$v = list($n1.v);})
  | PERIOD
      (wspace+
        (n2=n_expr wspace* {$v = $n2.v;}
        | /*empty*/        {$v = list(".");})
      | /*empty*/          {$v = list(".");}) ;

list_contents returns [Object v]
  : wspace*
    (list_contents_real {$v=$list_contents_real.v;}
    | /*empty*/         {$v = null;} ) ;

// The "greedy=true" option here forces n_expr_tail to look at the
// next character and forcibly consume in *this* production if it's
// an opening paren, bracket, or brace.  Without this,
// ANTLR notices ambiguities and complains with reports like:
//   Decision can match input such as "LPAREN" using multiple alternatives: 1, 4
// This is because there are constructs such as:
//   #;a#|hi|#(x)
// which technically are ambiguous; do we mean "#;a" or "#;a(x)"?
// The option breaks the ambiguity.
n_expr_tail[Object prefix] returns [Object v]
  : (options {greedy=true;}
    : LPAREN c1=list_contents RPAREN
      r1=n_expr_tail[cons(prefix, $c1.v)] {$v = $r1.v;}
    | LBRACKET c2=list_contents RBRACKET
      r2=n_expr_tail[cons("$" + "bracket-apply" + "$",
                          cons(prefix, $c2.v))] {$v = $r2.v;}
    | LBRACE c3=list_contents RBRACE
      // Map f{} to (f) and not (f ()).  Note that f{x} maps to (f x).
      r3=n_expr_tail[nullp($c3.v) ? list(prefix)
                                  : list(prefix, process_curly($c3.v))]
      {$v = $r3.v;}
    | /*empty*/ {$v = prefix;} ) ;
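// Illustrative examples (not part of the original grammar) of what
// n_expr_tail builds, following SRFI-105 neoteric expressions:
//   f(x y)   => (f x y)
//   f(x)(y)  => ((f x) y)
//   f[x]     => ($bracket-apply$ f x)
//   f{x + 1} => (f (+ x 1))
//   f{}      => (f)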
vector returns [Object v]
  : VECTOR_START list_contents RPAREN
    {$v = cons("vector", $list_contents.v); } ;

// Currently return value ignored because simple_datum ignores it.
bytevector returns [Object v]
  : BYTEVECTOR_START list_contents RPAREN
    { $v = cons("bytevector", $list_contents.v); } ;

simple_datum : BOOLEAN | NUMBER | CHARACTER | STRING | symbol | bytevector;

symbol : IDENTIFIER ;

compound_datum returns [Object v]
  : LPAREN norm=list_contents RPAREN          {$v = $norm.v;}
  | LBRACKET bracketed=list_contents RBRACKET {$v = $bracketed.v;}
  | LBRACE braced=list_contents RBRACE        {$v = process_curly($braced.v);}
  | vector                                    {$v = $vector.v;} ;

// Production "n_expr_prefix" is an n-expression prefix per SRFI-105:
n_expr_prefix returns [Object v]
  : simple_datum   {$v = $simple_datum.text;}
  | compound_datum {$v = $compound_datum.v;} ;

// Production "n_expr_noabbrev" is an n-expression as defined in SRFI-105,
// but has no leading abbreviations (see below):
n_expr_noabbrev returns [Object v]
  : n_expr_prefix n_expr_tail[$n_expr_prefix.v] {$v = $n_expr_tail.v;} ;

// INCLUDE IN SRFI

// IMPORTANT SUPPORTING PARSER DEFINITIONS for the BNF

hspace : SPACE | TAB ; // horizontal space

// This is for handling non-first n-expressions in initial indent.
// An implementation MAY implement this as "(hspace | '!')+"
// (e.g., to avoid retaining state when returning a value).
hspaces_maybe_bang : hspace+ ;

// Production "abbrevw" is an abbreviation with a following whitespace:
abbrevw returns [Object v]
  : APOSW           {$v = "quote";}
  | QUASIQUOTEW     {$v = "quasiquote";}
  | UNQUOTE_SPLICEW {$v = "unquote-splicing";}
  | UNQUOTEW        {$v = "unquote";} ;

// Production "abbrev_no_w" is an abbreviation without a following whitespace:
abbrev_no_w returns [Object v]
  : APOS           {$v = "quote";}
  | QUASIQUOTE     {$v = "quasiquote";}
  | UNQUOTE_SPLICE {$v = "unquote-splicing";}
  | UNQUOTE        {$v = "unquote";};

abbrev_all returns [Object v]
  : abbrevw     {$v = $abbrevw.v;}
  | abbrev_no_w {$v = $abbrev_no_w.v;} ;

// Production "n_expr" is a full neoteric-expression as defined in SRFI-105.
// n_expr does *not* consume any following horizontal space.
// Uses "n_expr_noabbrev", an n-expression with no leading abbreviations:
n_expr returns [Object v]
  : abbrev_all n1=n_expr {$v = list($abbrev_all.v, $n1.v);}
  | n_expr_noabbrev      {$v = $n_expr_noabbrev.v;} ;

// Production "n_expr_first" is a neoteric-expression, but leading
// abbreviations cannot have any whitespace afterwards (used by "head"):
n_expr_first returns [Object v]
  : abbrev_no_w n1=n_expr_first {$v = list($abbrev_no_w.v, $n1.v);}
  | n_expr_noabbrev             {$v = $n_expr_noabbrev.v;} ;

// Production "scomment" (special comment) defines comments other than ";":
sharp_bang_comments : SRFI_22_COMMENT | SHARP_BANG_FILE | SHARP_BANG_DIRECTIVE ;

scomment : BLOCK_COMMENT
         | DATUM_COMMENT_START (options {greedy=true;} : hspace)* n_expr
         | sharp_bang_comments ;

// Production "comment_eol" reads an optional ;-comment (if it exists),
// and then reads the end-of-line (EOL) sequence.  EOL processing consumes
// additional comment-only lines (if any) which may be indented.
comment_eol : LCOMMENT? EOL;
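// Illustrative examples (not part of the original grammar) of why the
// "abbrevw" / "abbrev_no_w" distinction matters, assuming the usual
// sweet-expression reading:
//   'foo bar    ; head is ((quote foo) bar) -- APOS binds to foo only
//   ' foo bar   ; reads as (quote (foo bar)) -- APOSW covers the whole line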
// KEY BNF PRODUCTIONS for sweet-expressions:

// Production "collecting_tail" returns a collecting list's contents.
// Precondition: At beginning of line.
// Postcondition: Consumed the matching COLLECTING_END.
// FF = formfeed (\f aka \u000c), VT = vertical tab (\v aka \u000b)
collecting_tail returns [Object v]
  : it_expr more=collecting_tail {$v = cons($it_expr.v, $more.v);}
  | comment_eol retry1=collecting_tail {$v = $retry1.v;}
  | INITIAL_INDENT /* Initial indent only allowed for lcomment */
    LCOMMENT EOL retry2=collecting_tail {$v = $retry2.v;}
  | (FF | VT)+ EOL retry3=collecting_tail {$v = $retry3.v;}
  | COLLECTING_END {$v = null;} ;

// Process line after ". hspace+" sequence.  Does not go past current line.
post_period returns [Object v]
  : scomment hspace* rpt=post_period {$v = $rpt.v;} // (scomment hspace*)*
  | pn=n_expr hspace* (scomment hspace*)* (n_expr error)? {$v = $pn.v;}
  | COLLECTING hspace* pc=collecting_tail hspace*
    (scomment hspace*)* (n_expr error)? {$v = $pc.v;}
  | /*empty*/ {$v = ".";} ;

// Production "head" reads 1+ n-expressions on one line; it will
// return the list of n-expressions on the line.  If there is one n-expression
// on the line, it returns a list of exactly one item; this makes it
// easy to append to later (if appropriate).  In some cases, we want
// single items to be themselves, not in a list; function monify does this.
// The "head" production never reads beyond the current line
// (except within a block comment), so it doesn't need to keep track
// of indentation, and indentation will NOT change within head.
// The "head" production only directly handles the first n-expression on the
// line, and then calls on "rest" to process the rest (if any); we do this
// because in a few cases it matters if an expression is the first one.
// Callers can depend on "head" and "rest" *not* changing indentation.
// On entry, all indentation/hspace must have already been read.
// On return, it will have consumed all hspace (spaces and tabs).
// Precondition: At beginning of line+indent
// Postcondition: At unconsumed EOL
head returns [Object v]
  : PERIOD /* Leading ".": escape following datum like an n-expression. */
      (hspace+ pp=post_period {$v = list($pp.v);}
      | /*empty*/ {$v = list(".");} )
  | COLLECTING hspace* collecting_tail hspace*
      (rr=rest {$v = cons($collecting_tail.v, $rr.v); }
      | /*empty*/ {$v = list($collecting_tail.v); } )
  | basic=n_expr_first /* Only match n_expr_first */
      ((hspace+
        (br=rest {$v = cons($basic.v, $br.v);}
        | /*empty*/ {$v = list($basic.v);} ))
      | /*empty*/ {$v = list($basic.v);} ) ;

// The "rest" production reads the rest of the expressions on a line
// (the "rest of the head"), after the first expression of the line.
// Like head, it consumes any hspace before it returns.
// The "rest" production is written this way so a non-tokenizing
// implementation can read an expression specially.  E.G., if it sees a period,
// read the expression directly and then see if it's just a period.
// Precondition: At beginning of non-first expression on line (past hspace)
// Postcondition: At unconsumed EOL
rest returns [Object v]
  : PERIOD /* Improper list */
      (hspace+ pp=post_period {$v = $pp.v;}
      | /*empty*/ {$v = list(".");})
  | scomment hspace*
      (sr=rest {$v = $sr.v;}
      | /*empty*/ {$v = null;} )
  | COLLECTING hspace* collecting_tail hspace*
      (rr=rest {$v = cons($collecting_tail.v, $rr.v);}
      | /*empty*/ {$v = list($collecting_tail.v);} )
  | basic=n_expr
      ((hspace+
        (br=rest {$v = cons($basic.v, $br.v);}
        | /*empty*/ {$v = list($basic.v);} ))
      | /*empty*/ {$v = list($basic.v);} ) ;
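// Illustrative example (not part of the original grammar), assuming the
// usual sweet-expression reading: for the single line
//   foo bar(1) "hi"
// "head" matches foo via n_expr_first and "rest" matches the remaining
// n-expressions, producing the list (foo (bar 1) "hi"); a line holding a
// lone n-expression such as
//   foo
// yields the one-element list (foo), which monify later reduces to foo.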
// Production "body" handles the sequence of 1+ child lines in an it_expr
// (e.g., after a "head"), each of which is itself an it_expr.
// It returns the list of expressions in the body.
// Note that an it-expr will consume any line comments or hspaces
// before it returns back to the "body" production.
// Since (list x) is simply (cons x '()), this production always does a
// cons of the first it_expr and another body [if it exists] or '() [if not].
body returns [Object v]
  : i=it_expr
    (same
      ( {isperiodp($i.v)}? => f=it_expr DEDENT {$v = $f.v;} // Improper list final value
      | {! isperiodp($i.v)}? => nxt=body {$v = cons($i.v, $nxt.v);} )
    | DEDENT {$v = list($i.v);} ) ;

// Production "it_expr" (indented sweet-expressions)
// is the main production for sweet-expressions in the usual case.
// Precondition: At beginning of line+indent
// Postcondition: it-expr ended by consuming EOL + examining indent
// Note: This BNF presumes that "*>" generates multiple tokens,
// "EOL DEDENT* COLLECTING_END", and resets the indentation list.
// You can change the BNF below to allow "head /*empty*/", and handle dedents
// by directly comparing values; then "*>" only needs to generate
// COLLECTING_END.  But this creates a bunch of ambiguities
// like a 'dangling else', which must all be disambiguated by accepting
// the first or the longer sequence first.  Either approach is needed to
// support "*>" as the non-first element so that the "head" can end
// without a literal EOL, e.g., as in "let <* y 5 *>".
it_expr returns [Object v]
  : head
    (options {greedy=true;} :
      ( GROUP_SPLIT hspace* /* Not initial; interpret as split */
        (options {greedy=true;} :
          // To allow \\ EOL as line-continuation, instead do:
          //   comment_eol same more=it_expr {$v = append($head.v, $more.v);}
          comment_eol error
        | /*empty*/ {$v = monify($head.v);} )
      | SUBLIST hspace* /* head SUBLIST ... case */
        (sub_i=it_expr {$v=append($head.v, list($sub_i.v));}
        | comment_eol error )
      | comment_eol // Normal case, handle child lines if any:
        (INDENT children=body {$v = append($head.v, $children.v);}
        | /*empty*/ {$v = monify($head.v);} /* No child lines */ )
        // If COLLECTING_END doesn't generate multiple tokens, can do:
        // | /*empty*/ {$v = monify($head.v);}
      ))
  | (GROUP_SPLIT | scomment) hspace* /* Initial; Interpret as group */
      (group_i=it_expr {$v = $group_i.v;} /* Ignore initial GROUP/scomment */
      | comment_eol
        (INDENT g_body=body {$v = $g_body.v;} /* Normal GROUP use */
        | same
          ( g_i=it_expr {$v = $g_i.v;} /* Plausible separator */
            /* Handle #!sweet EOL EOL t_expr */
          | comment_eol restart=t_expr {$v = $restart.v;} )
        | DEDENT error ))
  | SUBLIST hspace* /* "$" first on line */
      (is_i=it_expr {$v=list($is_i.v);}
      | comment_eol error )
  | abbrevw hspace*
      (comment_eol INDENT ab=body {$v = append(list($abbrevw.v), $ab.v);}
      | ai=it_expr {$v=list($abbrevw.v, $ai.v);} ) ;

// Production "t_expr" is the top-level production for sweet-expressions.
// This production handles special cases, then in the normal case
// drops to the it_expr production.
// Precondition: At beginning of line
// Postcondition: At beginning of line
// The rule for "indent processing disabled on initial top-level hspace"
// is a very simple (and clever) BNF construction by Alan Manuel K. Gloria.
// If there is an indent it simply reads a single n-expression and returns.
// If there is more than one on an initially-indented line, the later
// horizontal space will not have been read, so this production will
// fire again on the next invocation, doing the right thing.
t_expr returns [Object v]
  : comment_eol retry1=t_expr {$v=$retry1.v;}
  | (FF | VT)+ EOL retry2=t_expr {$v=$retry2.v;}
  | (INITIAL_INDENT | hspaces_maybe_bang) /* initial indent */
      (n_expr {$v = $n_expr.v;}
      | (scomment (options {greedy=true;} : hspace)* sretry=t_expr
         {$v=$sretry.v;})
      | comment_eol retry3=t_expr {$v=$retry3.v;} )
  | EOF {generate_eof();} /* End of file */
  | it_expr {$v = $it_expr.v;} /* Normal case */ ;
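// Illustrative end-to-end example (not part of the original grammar),
// assuming the usual sweet-expression reading.  The input
//   define factorial(n)
//     if {n <= 1}
//       1
//       {n * factorial{n - 1}}
// is read by t_expr/it_expr/head/body above as the s-expression
//   (define (factorial n)
//     (if (<= n 1)
//       1
//       (* n (factorial (- n 1)))))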