|
Array broadcasting
Bradley Lucier
(13 Sep 2025 03:12 UTC)
|
|
Re: Array broadcasting
Alex Shinn
(15 Sep 2025 01:24 UTC)
|
|
Re: Array broadcasting
Bradley J Lucier
(15 Sep 2025 17:26 UTC)
|
|
Re: Array broadcasting Bradley J Lucier (15 Sep 2025 17:43 UTC)
|
|
Re: Array broadcasting
Bradley J Lucier
(16 Sep 2025 02:45 UTC)
|
By the way, here is a k-means implementation using SRFI 231 for which scikit-learn
https://scikit-learn.org/stable/index.html
uses array broadcasting, I believe; the GitHub repository is here:
https://github.com/scikit-learn/scikit-learn
Perhaps there are places in my code where simple vectors would be more appropriate than using arrays (perhaps I fell into the “everything is an array when all I have is an array toolkit” hole), but I think the code is OK and somewhat idiomatic.
The output, compiled on my M1 Max MacBook Pro:
> (import (srfi 231))
> (array->list* (time (k-means data first-centroids)))
1
(116790 90586 92624)
2
(104430 97591 97979)
3
(101064 99470 99466)
4
(100219 99945 99836)
5
(100016 100049 99935)
6
(99964 100065 99971)
7
(99950 100070 99980)
8
(99949 100070 99981)
9
(99949 100070 99981)
(time (k-means data first-centroids))
4.291165 secs real time
4.281770 secs cpu time (4.208877 user, 0.072893 system)
28 collections accounting for 0.284431 secs real time (0.277717 user, 0.006233 system)
28416693608 bytes allocated
36861 minor faults
no major faults
((-.0072542400755287026 -.004211587528113073)
(.4993840412376627 .872389343634882)
(1.007577990650729 -.0033251928060448292))
Brad
(declare
(standard-bindings)
(extended-bindings)
(block)
(not safe)
)
(import (srfi 231))
(define (k-means-iteration data centroids)
;; data: an M x N array of M data points with N features
;; centroids: An K x N array of K centroids with N features.
(define (classify data centroids)
(define (Frobenius A B)
(sqrt (array-reduce + (array-map (lambda (a b) (square (- a b))) A B))))
(define (argmin f interval)
(cdr (interval-fold-left (lambda args
(cons (apply f args) args))
(lambda (existing new)
(if (< (car new)
(car existing))
new existing))
'(+inf.0) ;;; don't need any args
interval)))
(let ((curried-centroids
(array-copy (array-curry centroids 1))))
(array-copy
(array-map (lambda (observation)
(car ;; the result is a list of length 1, we just need the index
(argmin (lambda (i)
(Frobenius observation (array-ref curried-centroids i)))
(array-domain curried-centroids))))
(array-curry data 1)))))
(define (new-centroids data classification)
;; The one-dimensional classification array gives the classification
;; index of each row of the data array
(define (array-increment! array . multi-index)
(apply array-set!
array
(+ 1 (apply array-ref array multi-index))
multi-index))
(define (add-to-curried-array array value . multi-index)
(let ((subarray (apply array-ref array multi-index)))
(array-assign! subarray (array-map + value subarray))))
(let* ((new-centroids
(make-specialized-array (array-domain centroids) generic-storage-class 0))
(curried-new-centroids
(array-curry new-centroids 1))
(points-in-each-centroid
(make-specialized-array
(make-interval (vector (interval-upper-bound (array-domain centroids) 0)))
generic-storage-class
0)))
(array-for-each (lambda (data-row centroid-index)
(add-to-curried-array curried-new-centroids data-row centroid-index )
(array-increment! points-in-each-centroid centroid-index))
(array-curry data 1)
classification)
(if (array-any zero? points-in-each-centroid)
(error "k-means: Each cluster must have at least one point."))
(array-decurry
(array-map (lambda (row number-of-points)
(array-map (lambda (x)
(/ x number-of-points))
row))
curried-new-centroids
points-in-each-centroid))))
(new-centroids data (classify data centroids)))
;;; Sample, routines from SRFI 194
(define current-random-source (make-parameter default-random-source))
(define (with-random-source random-source thunk)
(unless (random-source? random-source)
(error "expected random source"))
(parameterize ((current-random-source random-source))
(thunk)))
;; Normal distribution (continuous - generates real numbers)
;; Box-Muller algorithm
;; NB: We tested Ziggurat method, too,
;; only to find out Box-Muller is faster about 12% - presumably
;; the overhead of each ops is larger in Gauche than C/C++, and
;; so the difference of cost of log or sin from the primitive
;; addition/multiplication are negligible.
;; NOTE: this implementation is not thread safe
(define make-normal-generator
(case-lambda
(()
(make-normal-generator 0.0 1.0))
((mean)
(make-normal-generator mean 1.0))
((mean deviation)
(let ((rand-real-proc (random-source-make-reals (current-random-source)))
(state #f))
(unless (and (real? mean)
(finite? mean))
(error "expected mean to be finite real number"))
(unless (and (real? deviation)
(finite? deviation)
(> deviation 0))
(error "expected deviation to be positive finite real number"))
(lambda ()
(define PI (* 4 (atan 1)))
(if state
(let ((result state))
(set! state #f)
result)
(let* ((r (sqrt (* -2 (log (rand-real-proc)))))
(theta (* 2 PI (rand-real-proc))))
(set! state (+ mean (* deviation r (cos theta))))
(+ mean (* deviation r (sin theta))))))))))
(define examples/3 100000)
(define point-interval (make-interval '#(2)))
(define (make-random-point x-center y-center)
(let ((random-x (make-normal-generator x-center 1/4))
(random-y (make-normal-generator y-center 1/4)))
(lambda ()
(make-array point-interval
(lambda (j)
(case j
((0) (random-x))
((1) (random-y))))))))
(define data
;; three clusters of Gaussian-distributed points near
;; (0, 0), (1/2, sqrt(3/4)), (1, 0) with variance 1/8
(array-append 0
(list
(let ((random-point
(make-random-point 0 0)))
(array-decurry
(make-array (make-interval (vector examples/3))
(lambda (i) (random-point)))))
(let ((random-point
(make-random-point 1/2 (sqrt 3/4))))
(array-decurry
(make-array (make-interval (vector examples/3))
(lambda (i) (random-point)))))
(let ((random-point
(make-random-point 1 0)))
(array-decurry
(make-array (make-interval (vector examples/3))
(lambda (i) (random-point))))))))
(define first-centroids
(list*->array
2
'((1/4 1/4)
(1/3 .9)
(1.1 -.1))))
(define (k-means data first-centroids)
(let loop ((centroids first-centroids)
(i 1))
(pp i)
(let ((new-centroids (k-means-iteration data centroids)))
(if (array-every = centroids new-centroids)
centroids
(loop new-centroids (+ i 1))))))