Below are portable implementations of a read and write that use the
srfi-38 notation.
Some nitty gritty:
* The composition of characters into tokens and of tokens into data
should be explicitly described in BNF and in English. Points
to cover:
-- The #nn# and #nn= tokens are self-terminating
-- Leading zeros allowed? #007#
-- Multiple labels allowed? #1=#2=(#1# #2#)
-- Labeled references allowed? #1=(#2=#1# #2#)
-- Require that #1=#1# signal an error rather than cause an infinite loop.
* Mention explicitly that numbers do not survive across reads and
writes.
* May the numbers start at any number? Must they be used
sequentially? May they be reused? May they be used before they are
declared, (#1# #1=foo)? I think all these answers should be No.
The implementations below are not heavily tested.
Read-with-numbered-parts should be able to handle any correct r5rs
datum. It signals some errors but allows several non-standard things
as well, like (. x) and #(x . (y)).
-al
(define (read-with-numbered-parts . optional-port)
(define port
(if (null? optional-port) (current-input-port) (car optional-port)))
(define (read-char*) (read-char port))
(define (peek-char*) (peek-char port))
(define (looking-at? c)
(eqv? c (peek-char*)))
(define (delimiter? c)
(case c
((#\( #\) #\" #\;) #t)
(else (or (eof-object? c)
(char-whitespace? c)))))
(define (not-delimiter? c) (not (delimiter? c)))
(define (eat-intertoken-space)
(define c (peek-char*))
(cond ((eof-object? c))
((char-whitespace? c) (read-char*) (eat-intertoken-space))
((char=? c #\;)
(do ((c (read-char*) (read-char*)))
((or (eof-object? c) (char=? c #\newline))))
(eat-intertoken-space))))
(define (read-string)
(read-char*)
(let read-it ((chars '()))
(let ((c (read-char*)))
(case c
((#\") (list->string (reverse chars)))
((#\\) (read-it (cons (read-char*) chars)))
(else (read-it (cons c chars)))))))
;; reads chars that match PRED and returns them as a string.
(define (read-some-chars pred)
(let iter ((chars '()))
(let ((c (peek-char*)))
(if (or (eof-object? c) (not (pred c)))
(list->string (reverse chars))
(iter (cons (read-char*) chars))))))
;; reads a character after the #\ has been read.
(define (read-character)
(let ((c (peek-char*)))
(if (char-alphabetic? c)
(let ((name (read-some-chars char-alphabetic?)))
(cond ((= 1 (string-length name)) (string-ref name 0))
((string-ci=? name "space") #\space)
((string-ci=? name "newline") #\newline)
(else (error "Unknown named character: " name))))
(read-char*))))
(define (read-number first-char)
(let ((str (string-append (string first-char)
(read-some-chars not-delimiter?))))
(or (string->number str)
(error "Malformed number: " str))))
(define char-standard-case
(if (char=? #\a (string-ref (symbol->string 'a) 0))
char-downcase
char-upcase))
(define (string-standard-case str)
(let* ((len (string-length str))
(new (make-string len)))
(do ((i 0 (+ i 1)))
((= i len) new)
(string-set! new i (char-standard-case (string-ref str i))))))
(define (read-identifier)
(string->symbol (string-standard-case (read-some-chars not-delimiter?))))
(define (read-part-spec)
(let ((n (string->number (read-some-chars char-numeric?))))
(let ((c (read-char*)))
(case c
((#\=) (cons 'label n))
((#\#) (cons 'use n))
(else (error "Malformed shared part specifier"))))))
;; Tokens: strings, characters, numbers, booleans, and
;; identifiers/symbols are represented as themselves.
;; Single-character tokens are represented as (CHAR), the
;; two-character tokens #( and ,@ become (#\#) and (#\@).
;; #NN= and #NN# become (label NN) and (use NN).
(define (read-token)
(eat-intertoken-space)
(let ((c (peek-char*)))
(case c
((#\( #\) #\' #\`) (read-char*) (list c))
((#\,)
(read-char*)
(if (looking-at? #\@)
(begin (read-char*) '(#\@))
'(#\,)))
((#\") (read-string))
((#\.)
(read-char*)
(cond ((delimiter? (peek-char*)) '(#\.))
((not (looking-at? #\.)) (read-number #\.))
((begin (read-char*) (looking-at? #\.)) (read-char*) '...)
(else (error "Malformed token starting with \"..\""))))
((#\+) (read-char*) (if (delimiter? (peek-char*)) '+ (read-number c)))
((#\-) (read-char*) (if (delimiter? (peek-char*)) '- (read-number c)))
((#\#)
(read-char*)
(let ((c (peek-char*)))
(case c
((#\() (read-char*) '(#\#))
((#\\) (read-char*) (read-character))
((#\t #\T) (read-char*) #t)
((#\f #\F) (read-char*) #f)
(else (cond ((char-numeric? c) (read-part-spec))
(else (read-number #\#)))))))
(else (cond ((eof-object? c) c)
((char-numeric? c) (read-char*) (read-number c))
(else (read-identifier)))))))
;; Maps the number of each part to a thunk that returns the part.
(define parts-alist '())
(define (read-object)
(finish-reading-object (read-token)))
(define (finish-reading-object first-token)
(if (not (pair? first-token))
first-token
(case (car first-token)
((#\() (read-tail))
((#\#) (list->vector (read-tail)))
((#\. #\)) (error "Unexpected \"" token "\""))
((use) (let ((n (cdr first-token)))
(cond ((assv n parts-alist) => cdr)
(else (error "Use of undeclared part " n)))))
((label)
;; This is complicated in order to allow #1=#2=() and
;; #1=(#2=#1#) and not to loop forever when given #1=#1#.
(let ((n (cdr first-token)))
(if (assv n parts-alist)
(error "Double declaration of part " n)
(let read-labels ((labels (list n)))
(define (add-labels-to-alist! thunk)
(set! parts-alist
(append (map (lambda (n) (cons n thunk)) labels)
parts-alist)))
(let ((token (read-token)))
(cond ((and (pair? token) (eq? 'label (car token)))
;; An additional label in a chain of them.
(let ((n (cdr token)))
(if (or (assv n parts-alist) (memv n labels))
(error "Double declaration of part " n))
(read-labels (cons n labels))))
((and (pair? token) (eq? 'use (car token)))
;; The labeled object is a use of a previous label:
;; reuse its thunk for our thunk.
(let* ((n (cdr token)) (p (assv n parts-alist)))
(if (not p)
(error "Use of undeclared part " n)
(let ((thunk (cdr p)))
(add-labels-to-alist! thunk)
thunk))))
(else
;; Normal case. Make a thunk (which doesn't
;; need to be usable until the read is complete.)
(letrec
((obj (begin
(add-labels-to-alist! (lambda () obj))
(finish-reading-object token))))
obj))))))))
(else (list (caadr (assv (car first-token)
'((#\' 'x) (#\, ,x) (#\` `x) (#\@ ,@x))))
(read-object))))))
(define (read-tail)
(let ((token (read-token)))
(cond ((eof-object? token) (error "EOF inside a list or vector"))
((not (pair? token)) (cons token (read-tail)))
(else (case (car token)
((#\)) '())
((#\.) (let* ((obj (read-object))
(tok (read-token)))
(if (and (pair? tok) (char=? #\) (car tok)))
obj
(error "Extra junk after a dot"))))
(else (let ((obj (finish-reading-object token)))
(cons obj (read-tail)))))))))
(let ((obj (read-object)))
(let fill-in-parts ((obj obj))
(cond ((pair? obj)
(if (procedure? (car obj))
(set-car! obj ((car obj)))
(fill-in-parts (car obj)))
(if (procedure? (cdr obj))
(set-cdr! obj ((cdr obj)))
(fill-in-parts (cdr obj))))
((vector? obj)
(let ((len (vector-length obj)))
(do ((i 0 (+ i 1)))
((= i len))
(let ((elt (vector-ref obj i)))
(if (procedure? elt)
(vector-set! obj i (elt))
(fill-in-parts elt))))))))
obj))
(define (write-with-numbered-parts obj . optional-port)
(define port
(if (null? optional-port) (current-output-port) (car optional-port)))
(define (write* obj) (write obj port))
(define (display* obj) (display obj port))
(define (acons key val alist)
(cons (cons key val) alist))
;; We only track duplicates of pairs, vectors, and strings. We
;; ignore zero-length vectors and strings because r5rs doesn't
;; guarantee that eq? treats them sanely (and they aren't very
;; interesting anyway).
(define (interesting? obj)
(or (pair? obj)
(and (vector? obj) (not (zero? (vector-length obj))))
(and (string? obj) (not (zero? (string-length obj))))))
;; (write-obj OBJ ALIST):
;; ALIST has an entry for each interesting part of OBJ. The
;; associated value will be:
;; -- a number if the part has been given one,
;; -- #t if the part will need to be assigned a number but has not been yet,
;; -- #f if the part will not need a number.
;; The cdr of ALIST's first element should be the most recently
;; assigned number.
;; Returns an alist with new shadowing entries for any parts that
;; had numbers assigned.
(define (write-obj obj alist)
(define (write-interesting alist)
(cond ((pair? obj)
(let ((caro (car obj)) (cdro (cdr obj)))
(cond ((and (pair? cdro)
(null? (cdr cdro))
(let ((abbrev (assq caro '('"'" `"`" ,"," ,@",@"))))
(and abbrev
;; we can't abbreviate (quote . #1#)
(not (cdr (assq cdro alist)))
abbrev)))
=> (lambda (abbrev)
(display* (cadr abbrev))
(write-obj (car cdro) alist)))
(else (display* "(")
(let write-cdr ((obj cdro)
(alist (write-obj caro alist)))
(cond ((and (pair? obj)
(not (cdr (assq obj alist))))
(display* " ")
(write-cdr (cdr obj)
(write-obj (car obj) alist)))
((null? obj) (display* ")") alist)
(else (display* " . ")
(let ((alist (write-obj obj alist)))
(display* ")")
alist))))))))
((vector? obj)
(display* "#(")
(let ((len (vector-length obj)))
(do ((i 1 (+ i 1))
(alist (write-obj (vector-ref obj 0) alist)
(write-obj (vector-ref obj i) alist)))
((= i len) (display* ")") alist)
(display* " "))))
;; else it's a string
(else (write* obj) alist)))
(cond ((interesting? obj)
(let ((val (cdr (assq obj alist))))
(cond ((not val) (write-interesting alist))
((number? val)
(display* "#") (write* val) (display* "#") alist)
(else
(let ((n (+ 1 (cdar alist))))
(display* "#") (write* n) (display* "=")
(write-interesting (acons obj n alist)))))))
(else (write* obj) alist)))
;; Scan computes the initial value of the alist, which maps each
;; interesting part of the object to #t if it occurs multiple times,
;; #f if only once.
(define (scan obj alist)
(cond ((not (interesting? obj)) alist)
((assq obj alist) =>
(lambda (p) (if (cdr p) alist (acons obj #t alist))))
(else
(let ((alist (acons obj #f alist)))
(cond ((pair? obj) (scan (car obj) (scan (cdr obj) alist)))
((vector? obj)
(let ((len (vector-length obj)))
(do ((i 0 (+ 1 i))
(alist alist (scan (vector-ref obj i) alist)))
((= i len) alist))))
(else alist))))))
(write-obj obj (acons 'dummy 0 (scan obj '())))
;; Don't want to return the big alist that write-obj did, lest it hinder gc.
(if #f #f))