On Thu, Oct 17, 2013 at 4:14 AM, Per Bothner <xxxxxx@bothner.com> wrote:
On 10/16/2013 11:44 AM, Michael Montague wrote:
Maybe I am being a heretic, but what are the benefits of the SRE syntax?

I think structured regular expressions make sense when integrated
with a general pattern-matching framework (by which I mean something
like http://docs.racket-lang.org/reference/match.html). Also,
sub-matches should produce variable bindings.

Without that, I also see little value in yet another ad hoc
string-only pattern-matching syntax, which is likely to
have a different syntax from any general pattern-matching framework.

I think it would be strange to provide regexp
matching as part of a general matching framework
without providing access to the underlying regexp
library.

It can also be handy to build macro utilities on top
of SREs.  There are a number of regexp-case utilities
out there, but the following allows referring to named
submatches directly (preserving hygiene):

;; regexp-case: user-facing entry point.
;;
;;   (regexp-case <string-expr> (<sre-pattern> <body> ...) ...)
;;
;; The subject expression is evaluated exactly once and bound to a
;; hygienic temporary; all clauses are then handed to rx-case together
;; with an empty list of already-processed clauses.
(define-syntax regexp-case
  (syntax-rules ()
    ((_ subject (pattern . actions) ...)
     (let ((target subject))
       (rx-case target () (pattern . actions) ...)))))

;; rx-case: worker macro behind regexp-case.
;;
;;   (rx-case <subject-var> (<processed-clause> ...) <raw-clause> ...)
;;
;; A raw clause has the shape (pat . body).  A processed clause has the
;; shape (pat (vars ...) . body), where (vars ...) is the list of
;; submatch names that rx-extract found inside pat.
(define-syntax rx-case
  (syntax-rules ()
    ;; Final step: no raw clauses remain, so emit the dispatching cond.
    ;; Each pattern is quasiquoted, which lets a pattern splice in
    ;; run-time values with unquote.  On a successful match every
    ;; collected name is let-bound to the result of rx-match-submatch
    ;; for that name, and the clause body is evaluated in that scope.
    ((rx-case s ((pat (vars ...) . body) ...))
     (cond
      ((regexp-match `pat s)
       => (lambda (m)
            (let ((vars (rx-match-submatch m s 'vars)) ...)
              . body)))
      ...
      ;; No clause matched: signal an error.
      (else
       (error "no patterns matched"))))
    ;; Accumulation step: take the first raw clause, ask rx-extract for
    ;; the names in its pattern, and continue in rx-case-step, which
    ;; receives the name list as its first argument.
    ((rx-case s pats (pat . body) . rest)
     (rx-extract pat (rx-case-step s pats (pat . body) . rest)))))

;; rx-case-step: continuation invoked by rx-extract on behalf of rx-case.
;; It receives the extracted name list as its first argument, appends the
;; now-annotated clause (pattern names . actions) to the list of
;; processed clauses, and resumes rx-case on the clauses still pending.
(define-syntax rx-case-step
  (syntax-rules ()
    ((_ names subject (done ...) (pattern . actions) . pending)
     (rx-case subject (done ... (pattern names . actions)) . pending))))

;; rx-extract: walk an SRE form at macro-expansion time and collect the
;; names introduced by (=> name sre ...) and (submatch-named name sre ...).
;;
;;   (rx-extract <sre> (<k> <arg> ...))
;;
;; Written in continuation-passing style: the result is delivered by
;; expanding into (<k> (<name> ...) <arg> ...).
(define-syntax rx-extract
  (syntax-rules (=> submatch-named unquote unquote-splicing)
    ;; Named submatch: collect names from the nested SREs, then have
    ;; rx-append add this form's own name to that list.
    ((rx-extract (=> var sre ...) (k args ...))
     (rx-extract (sre ...) (rx-append (var) (k args ...))))
    ((rx-extract (submatch-named var sre ...) (k args ...))
     (rx-extract (sre ...) (rx-append (var) (k args ...))))
    ;; Unquoted sub-expressions are not inspected: they contribute no
    ;; names.
    ((rx-extract (unquote x) (k args ...))
     (k () args ...))
    ((rx-extract (unquote-splicing x) (k args ...))
     (k () args ...))
    ;; Any other pair: extract from the head, then let rx-extract-step
    ;; extract from the tail and merge the two name lists.
    ((rx-extract (x . y) (k args ...))
     (rx-extract x (rx-extract-step y (k args ...))))
    ;; Anything that is not a pair (symbols, strings, the empty list, ...)
    ;; contributes no names.
    ((rx-extract x (k args ...))
     (k () args ...))))

;; rx-extract-step: continuation used by rx-extract when walking a pair.
;; `found` is the name list already extracted from the head of the pair;
;; extraction now proceeds on `tail`, with rx-append arranged to merge
;; the tail's names with `found` before resuming the outer continuation.
(define-syntax rx-extract-step
  (syntax-rules ()
    ((_ found tail (cont ...))
     (rx-extract tail (rx-append found (cont ...))))))

;; rx-append: macro-level list append in continuation-passing style.
;; Concatenates two literal lists and passes the combined list as the
;; first argument of the continuation (cont cont-arg ...).
(define-syntax rx-append
  (syntax-rules ()
    ((_ (left ...) (right ...) (cont cont-arg ...))
     (cont (left ... right ...) cont-arg ...))))

A sample usage is:

;; Utility to liberally parse dates in any of a variety of common
;; formats, returning a SRFI-19 date object.
;;
;; Three families of layouts are tried in order:
;;   1. day month-name year [time] [zone]   (RFC-822 style)
;;   2. month-name day [time] [zone] year   (Unix `date` style)
;;   3. year month-number day [time [zone]] (YYYY-MM-DD style)
;; Optional fields that are absent default to 0.  An error is signalled
;; by regexp-case when no family matches.
(define parse-date
  (let ()
    ;; Submatch -> integer: a string is read with string->number;
    ;; anything else (or an unparsable string) becomes 0.
    (define (i x) (or (and (string? x) (string->number x)) 0))
    ;; SRE matching either the first N characters of the symbol's name
    ;; or the complete name, e.g. "Jan" or "January".  An explicit end
    ;; index is passed to substring because the two-argument form is not
    ;; required by R7RS.
    (define (abbrev-rx x n)
      (let ((str (symbol->string x)))
        `(: ,(substring str 0 n)
            (? ,(substring str n (string-length str))))))
    ;; SRE alternation over the abbreviable names in LS.
    (define (abbrev-ls ls n)
      `(or ,@(map (lambda (x) (abbrev-rx x n)) ls)))
    (define wday-en
      '(Sunday Monday Tuesday Wednesday Thursday Friday Saturday))
    (define rx-wday-en (abbrev-ls wday-en 3))
    (define mon-en
      '(January February March April May June July
        August September October November December))
    (define rx-mon-en (abbrev-ls mon-en 3))
    ;; Field separator: optional whitespace around at most one
    ;; punctuation character.
    (define sep '(: (* space) (? ("-_,;:/")) (* space)))
    (define d2 '(= 2 digit))
    (define d4 '(= 4 digit))
    ;; Time zone: numeric offset, Area/Location name, or a three-letter
    ;; abbreviation optionally wrapped in parentheses.
    (define rx-tz
      '(or (: ("+-") (= 2 digit) ("013") ("05"))
           (: word "/" word)
           (: (? "(") (= 3 alpha) (? ")"))))
    (lambda (str)
      (regexp-case
       str
       ;; RFC-822 day-month-year-time family
       ((w/nocase
         (? ,rx-wday-en ,sep) (=> day ,d2) ,sep
         (=> mon ,rx-mon-en) ,sep (=> year ,d4)
         (? ,sep (=> hour ,d2) ,sep (=> min ,d2) ,sep (=> sec ,d2))
         (? ,sep (=> tz ,rx-tz)))
        (make-date 0 (i sec) (i min) (i hour) (i day)
                   (month->number mon) (i year) (tz->number tz)))
       ;; Unix month-day-time-year family
       ((w/nocase
         (? ,rx-wday-en ,sep) (=> mon ,rx-mon-en) ,sep (=> day ,d2)
         (? ,sep (=> hour ,d2) ,sep (=> min ,d2) ,sep (=> sec ,d2))
         (? ,sep (=> tz ,rx-tz)) ,sep (=> year ,d4))
        (make-date 0 (i sec) (i min) (i hour) (i day)
                   (month->number mon) (i year) (tz->number tz)))
       ;; YYYY-MM-DD family
       ;; Here `mon` is a two-digit number rather than a month name, so
       ;; it is converted with `i`; month->number maps names only.
       ((w/nocase
         (=> year ,d4) ,sep (=> mon ,d2) ,sep (=> day ,d2)
         (? (? "T") ,sep (=> hour ,d2) ,sep (=> min ,d2) ,sep (=> sec ,d2)
            (? ,sep (=> tz ,rx-tz))))
        (make-date 0 (i sec) (i min) (i hour) (i day)
                   (i mon) (i year) (tz->number tz)))))))

;; Convert a month designator to its number in the range 1-12.
;;
;; X may be a symbol or a string, compared case-insensitively.  Accepted
;; forms are the three-letter English abbreviation ("Jan"), the full
;; English name ("January"), or a decimal month number ("10", "07").
;; Returns #f for anything else, including values that are neither
;; strings nor symbols.
(define (month->number x)
  (define month-names
    '("january" "february" "march" "april" "may" "june" "july"
      "august" "september" "october" "november" "december"))
  (let ((str (cond ((symbol? x) (symbol->string x))
                   ((string? x) x)
                   (else #f))))
    (cond
     ((not str) #f)
     ;; Numeric month, e.g. from a YYYY-MM-DD date.
     ((string->number str)
      => (lambda (n) (and (exact-integer? n) (<= 1 n 12) n)))
     (else
      ;; Month name: accept exactly the full name or its first three
      ;; letters, nothing in between and nothing longer.
      (let ((key (string-downcase str)))
        (let loop ((names month-names) (n 1))
          (cond
           ((null? names) #f)
           ((or (string=? key (car names))
                (string=? key (substring (car names) 0 3)))
            n)
           (else (loop (cdr names) (+ n 1))))))))))

;; Association list from time-zone abbreviation to its offset from UTC,
;; expressed in seconds (hours * 60 * 60, plus minutes * 60 where the
;; zone is not on a whole hour).
(define time-zone-abbrevs
  `(("CDT" . ,(* -5 60 60))
    ("CST" . ,(* -6 60 60))
    ("EDT" . ,(* -4 60 60))
    ("EST" . ,(* -5 60 60))
    ("PDT" . ,(* -7 60 60))
    ("PST" . ,(* -8 60 60))
    ("NDT" . ,(+ (* -2 60 60) (* -30 60)))
    ("NST" . ,(+ (* -3 60 60) (* -30 60)))
    ;; UTC+9: a product like every other entry, not the sum 9+60+60.
    ("JST" . ,(* 9 60 60))
    ("GMT" . 0)
    ("UTC" . 0)))

;; Convert a time-zone designator to an offset from UTC in seconds.
;;
;; X is normally a string (or #f when no zone was matched):
;;   - a numeric offset such as "+0900" or "-0530" is decoded as
;;     sign, two hour digits, and optional minute digits;
;;   - otherwise X is looked up in time-zone-abbrevs;
;;   - anything unrecognized yields 0.
(define (tz->number x)
  (cond
   ((and (string? x)
         (regexp-match '(: ($ ("+-") (= 2 digit)) ($ (* digit))) x))
    => (lambda (m)
         ;; NOTE(review): rx-match-submatch is called as (m string key)
         ;; to agree with its use in rx-case -- confirm against the
         ;; regexp library's actual signature.
         (let* ((hh (rx-match-submatch m x 1))
                (mm (rx-match-submatch m x 2))
                ;; Submatches are strings, so they must be converted
                ;; before any arithmetic.  The sign is read from the
                ;; text so that it also applies to the minutes and is
                ;; not lost for "-00".
                (sign (if (char=? #\- (string-ref hh 0)) -1 1))
                (hours (string->number
                        (substring hh 1 (string-length hh))))
                ;; The minutes group may be empty or absent.
                (minutes (or (and (string? mm) (string->number mm)) 0)))
           (* sign 60 (+ (* 60 hours) minutes)))))
   ((assoc x time-zone-abbrevs) => cdr)
   (else 0)))

I have no problem with it as a SRFI, but I think it would be a
mistake for RnRS.

The intention is to include this in the "large" language,
which is intended to be very large indeed.  If you look
through the "Work Item Dockets" for WG2 at

  http://trac.sacrideo.us/wg/wiki

you'll get an idea of the scope of the large language,
including threads, networking, gettext, formatting combinators,
loop syntax, pattern matching, and "posix" (which itself includes
regexps), to name a few.  It would be very strange not to include
a regexp library in this language.

That said, I think regexps are often overused.  They
are just one of many tools, but they're a tool programmers
expect to have these days.

-- 
Alex