A portable read implementation Albus Petrofsky 26 Dec 2002 01:00 UTC

Below are portable implementations of a read and write that use the
srfi-38 notation.

Some nitty gritty:

* The composition of characters into tokens and of tokens into data
  should be explicitly described in BNF and in English.  Points
  to cover:
    -- The #nn# and #nn= tokens are self-terminating
    -- Leading zeros allowed?  #007#
    -- Multiple labels allowed?  #1=#2=(#1# #2#)
    -- Labeled references allowed?  #1=(#2=#1# #2#)
    -- Require that #1=#1# signal an error rather than cause an infinite loop.

* Mention explicitly that numbers do not survive across reads and
  writes.

* May the numbers start at any number?  Must they be used
  sequentially?  May they be reused?  May they be used before they are
  declared, (#1# #1=foo)?  I think all these answers should be No.

The implementations below are not heavily tested.
Read-with-numbered-parts should be able to handle any correct r5rs
datum.  It signals some errors but allows several non-standard things
as well, like (. x) and #(x . (y)).

-al

(define (read-with-numbered-parts . optional-port)
  (define port
    (if (null? optional-port) (current-input-port) (car optional-port)))

  (define (read-char*) (read-char port))
  (define (peek-char*) (peek-char port))

  (define (looking-at? c)
    (eqv? c (peek-char*)))

  (define (delimiter? c)
    (case c
      ((#\( #\) #\" #\;) #t)
      (else (or (eof-object? c)
		(char-whitespace? c)))))

  (define (not-delimiter? c) (not (delimiter? c)))

  (define (eat-intertoken-space)
    (define c (peek-char*))
    (cond ((eof-object? c))
	  ((char-whitespace? c) (read-char*) (eat-intertoken-space))
	  ((char=? c #\;)
	   (do ((c (read-char*) (read-char*)))
	       ((or (eof-object? c) (char=? c #\newline))))
	   (eat-intertoken-space))))

  (define (read-string)
    (read-char*)
    (let read-it ((chars '()))
      (let ((c (read-char*)))
	(case c
	  ((#\") (list->string (reverse chars)))
	  ((#\\) (read-it (cons (read-char*) chars)))
	  (else (read-it (cons c chars)))))))

  ;; reads chars that match PRED and returns them as a string.
  (define (read-some-chars pred)
    (let iter ((chars '()))
      (let ((c (peek-char*)))
	(if (or (eof-object? c) (not (pred c)))
	    (list->string (reverse chars))
	    (iter (cons (read-char*) chars))))))

  ;; reads a character after the #\ has been read.
  (define (read-character)
    (let ((c (peek-char*)))
      (if (char-alphabetic? c)
	  (let ((name (read-some-chars char-alphabetic?)))
	    (cond ((= 1 (string-length name)) (string-ref name 0))
		  ((string-ci=? name "space") #\space)
		  ((string-ci=? name "newline") #\newline)
		  (else (error "Unknown named character: " name))))
	  (read-char*))))

  (define (read-number first-char)
    (let ((str (string-append (string first-char)
			      (read-some-chars not-delimiter?))))
      (or (string->number str)
	  (error "Malformed number: " str))))

  (define char-standard-case
    (if (char=? #\a (string-ref (symbol->string 'a) 0))
	char-downcase
	char-upcase))

  (define (string-standard-case str)
    (let* ((len (string-length str))
	   (new (make-string len)))
      (do ((i 0 (+ i 1)))
	  ((= i len) new)
	(string-set! new i (char-standard-case (string-ref str i))))))

  (define (read-identifier)
    (string->symbol (string-standard-case (read-some-chars not-delimiter?))))

  (define (read-part-spec)
    (let ((n (string->number (read-some-chars char-numeric?))))
      (let ((c (read-char*)))
	(case c
	  ((#\=) (cons 'label n))
	  ((#\#) (cons 'use n))
	  (else (error "Malformed shared part specifier"))))))

  ;; Tokens: strings, characters, numbers, booleans, and
  ;; identifiers/symbols are represented as themselves.
  ;; Single-character tokens are represented as (CHAR), the
  ;; two-character tokens #( and ,@ become (#\#) and (#\@).
  ;; #NN= and #NN# become (label NN) and (use NN).
  (define (read-token)
    (eat-intertoken-space)
    (let ((c (peek-char*)))
      (case c
	((#\( #\) #\' #\`) (read-char*) (list c))
	((#\,)
	 (read-char*)
	 (if (looking-at? #\@)
	     (begin (read-char*) '(#\@))
	     '(#\,)))
	((#\") (read-string))
	((#\.)
	 (read-char*)
	 (cond ((delimiter? (peek-char*)) '(#\.))
	       ((not (looking-at? #\.)) (read-number #\.))
	       ((begin (read-char*) (looking-at? #\.)) (read-char*) '...)
	       (else (error "Malformed token starting with \"..\""))))
	((#\+) (read-char*) (if (delimiter? (peek-char*)) '+ (read-number c)))
	((#\-) (read-char*) (if (delimiter? (peek-char*)) '- (read-number c)))
	((#\#)
	 (read-char*)
	 (let ((c (peek-char*)))
	   (case c
	     ((#\() (read-char*) '(#\#))
	     ((#\\) (read-char*) (read-character))
	     ((#\t #\T) (read-char*) #t)
	     ((#\f #\F) (read-char*) #f)
	     (else (cond ((char-numeric? c) (read-part-spec))
			 (else (read-number #\#)))))))
	(else (cond ((eof-object? c) c)
		    ((char-numeric? c) (read-char*) (read-number c))
		    (else (read-identifier)))))))

  ;; Maps the number of each part to a thunk that returns the part.
  (define parts-alist '())

  (define (read-object)
    (finish-reading-object (read-token)))

  (define (finish-reading-object first-token)
    (if (not (pair? first-token))
	first-token
	(case (car first-token)
	  ((#\() (read-tail))
	  ((#\#) (list->vector (read-tail)))
	  ((#\. #\)) (error "Unexpected \"" token "\""))
	  ((use) (let ((n (cdr first-token)))
		   (cond ((assv n parts-alist) => cdr)
			 (else (error "Use of undeclared part " n)))))
	  ((label)
	   ;; This is complicated in order to allow #1=#2=() and
	   ;; #1=(#2=#1#) and not to loop forever when given #1=#1#.
	   (let ((n (cdr first-token)))
	     (if (assv n parts-alist)
		 (error "Double declaration of part " n)
		 (let read-labels ((labels (list n)))
		   (define (add-labels-to-alist! thunk)
		     (set! parts-alist
			   (append (map (lambda (n) (cons n thunk)) labels)
				   parts-alist)))
		   (let ((token (read-token)))
		     (cond ((and (pair? token) (eq? 'label (car token)))
			    ;; An additional label in a chain of them.
			    (let ((n (cdr token)))
			      (if (or (assv n parts-alist) (memv n labels))
				  (error "Double declaration of part " n))
			      (read-labels (cons n labels))))
			   ((and (pair? token) (eq? 'use (car token)))
			    ;; The labeled object is a use of a previous label:
			    ;; reuse its thunk for our thunk.
			    (let* ((n (cdr token)) (p (assv n parts-alist)))
			      (if (not p)
				  (error "Use of undeclared part " n)
				  (let ((thunk (cdr p)))
				    (add-labels-to-alist! thunk)
				    thunk))))
			   (else
			    ;; Normal case.  Make a thunk (which doesn't
			    ;; need to be usable until the read is complete.)
			    (letrec
				((obj (begin
					(add-labels-to-alist! (lambda () obj))
					(finish-reading-object token))))
			      obj))))))))
	  (else (list (caadr (assv (car first-token)
				   '((#\' 'x) (#\, ,x) (#\` `x) (#\@ ,@x))))
		      (read-object))))))

  (define (read-tail)
    (let ((token (read-token)))
      (cond ((eof-object? token) (error "EOF inside a list or vector"))
	    ((not (pair? token)) (cons token (read-tail)))
	    (else (case (car token)
		    ((#\)) '())
		    ((#\.) (let* ((obj (read-object))
				  (tok (read-token)))
			     (if (and (pair? tok) (char=? #\) (car tok)))
				 obj
				 (error "Extra junk after a dot"))))
		    (else (let ((obj (finish-reading-object token)))
			    (cons obj (read-tail)))))))))

  (let ((obj (read-object)))
    (let fill-in-parts ((obj obj))
      (cond ((pair? obj)
	     (if (procedure? (car obj))
		 (set-car! obj ((car obj)))
		 (fill-in-parts (car obj)))
	     (if (procedure? (cdr obj))
		 (set-cdr! obj ((cdr obj)))
		 (fill-in-parts (cdr obj))))
	    ((vector? obj)
	     (let ((len (vector-length obj)))
	       (do ((i 0 (+ i 1)))
		   ((= i len))
		 (let ((elt (vector-ref obj i)))
		   (if (procedure? elt)
		       (vector-set! obj i (elt))
		       (fill-in-parts elt))))))))
    obj))

(define (write-with-numbered-parts obj . optional-port)
  (define port
    (if (null? optional-port) (current-output-port) (car optional-port)))

  (define (write* obj) (write obj port))
  (define (display* obj) (display obj port))

  (define (acons key val alist)
    (cons (cons key val) alist))
  ;; We only track duplicates of pairs, vectors, and strings.  We
  ;; ignore zero-length vectors and strings because r5rs doesn't
  ;; guarantee that eq? treats them sanely (and they aren't very
  ;; interesting anyway).
  (define (interesting? obj)
    (or (pair? obj)
	(and (vector? obj) (not (zero? (vector-length obj))))
	(and (string? obj) (not (zero? (string-length obj))))))

  ;; (write-obj OBJ ALIST):
  ;; ALIST has an entry for each interesting part of OBJ.  The
  ;; associated value will be:
  ;;  -- a number if the part has been given one,
  ;;  -- #t if the part will need to be assigned a number but has not been yet,
  ;;  -- #f if the part will not need a number.
  ;; The cdr of ALIST's first element should be the most recently
  ;; assigned number.
  ;; Returns an alist with new shadowing entries for any parts that
  ;; had numbers assigned.
  (define (write-obj obj alist)
    (define (write-interesting alist)
      (cond ((pair? obj)
	     (let ((caro (car obj)) (cdro (cdr obj)))
	       (cond ((and (pair? cdro)
			   (null? (cdr cdro))
			   (let ((abbrev (assq caro '('"'" `"`" ,"," ,@",@"))))
			     (and abbrev
				  ;; we can't abbreviate (quote . #1#)
				  (not (cdr (assq cdro alist)))
				  abbrev)))
		      => (lambda (abbrev)
			   (display* (cadr abbrev))
			   (write-obj (car cdro) alist)))
		     (else (display* "(")
			   (let write-cdr ((obj cdro)
					   (alist (write-obj caro alist)))
			     (cond ((and (pair? obj)
					 (not (cdr (assq obj alist))))
				    (display* " ")
				    (write-cdr (cdr obj)
					       (write-obj (car obj) alist)))
				   ((null? obj) (display* ")") alist)
				   (else (display* " . ")
					 (let ((alist (write-obj obj alist)))
					   (display* ")")
					   alist))))))))
	    ((vector? obj)
	     (display* "#(")
	     (let ((len (vector-length obj)))
	       (do ((i 1 (+ i 1))
		    (alist (write-obj (vector-ref obj 0) alist)
			   (write-obj (vector-ref obj i) alist)))
		   ((= i len) (display* ")") alist)
		 (display* " "))))
	    ;; else it's a string
	    (else (write* obj) alist)))

    (cond ((interesting? obj)
	   (let ((val (cdr (assq obj alist))))
	     (cond ((not val) (write-interesting alist))
		   ((number? val)
		    (display* "#") (write* val) (display* "#") alist)
		   (else
		    (let ((n (+ 1 (cdar alist))))
		      (display* "#") (write* n) (display* "=")
		      (write-interesting (acons obj n alist)))))))
	  (else (write* obj) alist)))

  ;; Scan computes the initial value of the alist, which maps each
  ;; interesting part of the object to #t if it occurs multiple times,
  ;; #f if only once.
  (define (scan obj alist)
    (cond ((not (interesting? obj)) alist)
	  ((assq obj alist) =>
	   (lambda (p) (if (cdr p) alist (acons obj #t alist))))
	  (else
	   (let ((alist (acons obj #f alist)))
	     (cond ((pair? obj) (scan (car obj) (scan (cdr obj) alist)))
		   ((vector? obj)
		    (let ((len (vector-length obj)))
		      (do ((i 0 (+ 1 i))
			   (alist alist (scan (vector-ref obj i) alist)))
			  ((= i len) alist))))
		   (else alist))))))

  (write-obj obj (acons 'dummy 0 (scan obj '())))
  ;; Don't want to return the big alist that write-obj did, lest it hinder gc.
  (if #f #f))