[Openmcl-devel] What is the status of OpenMCL Altivec support

Wed Feb 25 16:54:17 PST 2004

On Sat, 21 Feb 2004, Randall Beer wrote:

> If enough people want to make use of AltiVec in OpenMCL, then I might
> suggest thinking about writing a mini-compiler for it.  I did this with
> PPC-FPC, a compiler that translates a subset of Lisp involving
> arithmetic operations  into equivalent LAP
> (http://vorlon.cwru.edu/~beer). An intermediate option is to make use
> of DEFPPCLAPMACRO to provide higher-level virtual instructions for
> working with AltiVec in LAP. Although these approaches may take a bit
> more work initially, they pay off handsomely in the long run.
>
> Randy Beer

I actually did something similar a year or so ago: defined (with LAP)
functions wrapped around -most- vector instructions (basically, those
that take register arguments only.)  The functions take arguments that're
presumed to be 16-byte aligned MACPTRs, load the contents of those pointers
into vector registers, execute the instruction, then copy the result back
to a distinguished argument (the first.)

The intent of the code was to make it easier to experiment with Altivec-based
algorithms and data organization.  All of the extra function-call overhead
and extra loading and storing probably make the instruction mix a lot
different from what it'd be if a compiler was open-coding these operations,
so it's probably hard to draw too many useful conclusions from this; if you
can convince yourself that two algorithms/data representations would each
suffer about equally, it might actually be useful.

NOTE: this was written about a year ago (so a couple of references to the
"ARCH" package should probably be changed to "TARGET".)  This never got
used much if at all, so there may be a few bugs in the way that functions
are defined.  The macros that generate the little LAP functions are just
concerned with what operands the corresponding instruction takes; the
assembler's supposed to know how to actually generate the right machine
code.
-------------- next part --------------
;;;-*- Mode: Lisp; Package: CCL -*-

;;; 031003 gb First cut.

(in-package "CCL")

(eval-when (:load-toplevel :execute)
  (unless (altivec-available-p)
    (error "No vector unit here.")))

;;; Some glue around AltiVec, to make AltiVec instructions callable
;;; from Lisp code.
;;; Operands and results are represented as macptrs to 16-byte-aligned
;;; addresses.  A four-address instruction D = vecop A B C should be
;;; called as (vecop d a b c), which will destructively modify and return
;;; D.

(defppclapfunction %get-vrsave ()
  (mfspr imm0 256)
  (ba .SPmakeu32))

(defppclapfunction %set-vrsave ((val arg_z))
  (save-lisp-context)
  (bla .SPgetxlong)
  (mtspr 256 imm0)
  (restore-full-lisp-context)
  (blr))

;;; The lap routines below use (at most) vr0-vr2.  You need to do this
;;; in each thread for which you don't want those registers to be
;;; dropped on the floor on context switch.

(%set-vrsave #xE0000000)

(defun vector-as-u128-integer (vecptr)
  (dpb (%get-unsigned-long vecptr 0)
       (byte 32 96)
       (dpb (%get-unsigned-long vecptr 4)
            (byte 32 64)
            (dpb (%get-unsigned-long vecptr 8)
                 (byte 32 32)
                 (%get-unsigned-long vecptr 12)))))

(defun vector-from-u128-integer (vecptr integer)
  (setf (%get-unsigned-long vecptr 0) (ldb (byte 32 96) integer)
        (%get-unsigned-long vecptr 4) (ldb (byte 32 64) integer)
        (%get-unsigned-long vecptr 8) (ldb (byte 32 32) integer)
        (%get-unsigned-long vecptr 12) (ldb (byte 32 23) integer))
  vecptr)

(macrolet  ((define-2-address-vector-instruction (name)
                `(defppclapfunction ,name ((d arg_y) (a arg_z))
                  (macptr-ptr imm1 a)
                  (macptr-ptr imm0 d)
                  (lvx vr0 imm1 rzero)
                  (,name vr1 vr0)
                  (stvx vr1 imm0 rzero)
                  (mr arg_z d)
                  (blr)))
            (define-3-address-vector-instruction (name)
                `(defppclapfunction ,name ((d arg_x) (a arg_y) (b arg_z))
                  (macptr-ptr imm2 b)
                  (macptr-ptr imm1 a)
                  (macptr-ptr imm0 d)
                  (lvx vr0 imm1 rzero)
                  (lvx vr1 imm2 rzero)
                  (,name vr1 vr0 vr1)
                  (stvx vr1 imm0 rzero)
                  (mr arg_z d)
                  (blr)))
            (define-4-address-vector-instruction (name)
                `(defppclapfunction ,name ((d 0) (a arg_x) (b arg_y) (c arg_z))
                  (macptr-ptr imm2 c)
                  (macptr-ptr imm1 b)
                  (macptr-ptr imm0 a)
                  (vpop arg_z)
                  (macptr-ptr imm3 arg_z)
                  (lvx vr0 imm2 rzero)
                  (lvx vr1 imm1 rzero)
                  (lvx vr2 imm0 rzero)
                  (,name vr1 vr0 vr1 vr2)
                  (stvx vr1 imm3 rzero)
                  (blr)))
            (define-vector-load-instruction (name)
                `(defppclapfunction ,name ((d arg_x) (a arg_y) (b arg_z))
                  (macptr-ptr d imm0)
                  (macptr-ptr a imm1)
                  (unbox-fixnum imm2 b)
                  (,name vr1 imm1 imm2)
                  (stvx vr1 imm0 rzero)
                  (mr arg_z d)
                  (blr)))
            (define-vector-store-instruction (name)
                `(defppclapfunction ,name ((s arg_x) (a arg_y) (b arg_z))
                  (macptr-ptr s imm0)
                  (macptr-ptr a imm1)
                  (unbox-fixnum imm2 b)
                  (lvx vr1 imm0 rzero)
                  (,name vr1 imm1 imm2)
                  (mr arg_z s)
                  (blr)))                
            (define-2-address-vector-instructions (&rest names)
                `(progn
                  ,@(mapcar #'(lambda (name)
                                `(define-2-address-vector-instruction ,name))
                            names)))
            (define-3-address-vector-instructions (&rest names)
                `(progn
                  ,@(mapcar #'(lambda (name)
                                `(define-3-address-vector-instruction ,name))
                            names)))
            (define-4-address-vector-instructions (&rest names)
                `(progn
                  ,@(mapcar #'(lambda (name)
                                `(define-4-address-vector-instruction ,name))
                            names)))
            (define-vector-load-instructions (&rest names)
                `(progn
                  ,@(mapcar #'(lambda (name)
                                `(define-vector-load-instruction ,name))
                            names)))
            (define-vector-store-instructions (&rest names)
                `(progn
                  ,@(mapcar #'(lambda (name)
                                `(define-vector-store-instruction ,name))
                            names))))
  (define-2-address-vector-instructions
    vrefp                               ; 04 vD 0_0000 vB 266
    vrsqrtefp                           ; 04 vD 0_0000 vB 330
    vexptefp                            ; 04 vD 0_0000 vB 394
    vlogefp                             ; 04 vD 0_0000 vB 458
    vrfin                               ; 04 vD 0_0000 vB 522
    vrfiz                               ; 04 vD 0_0000 vB 586
    vrfip                               ; 04 vD 0_0000 vB 650
    vrfim                               ; 04 vD 0_0000 vB 714
    vupkhsb                             ; 04 vD 0_0000 vB 526
    vupkhsh                             ; 04 vD 0_0000 vB 590
    vupklsb                             ; 04 vD 0_0000 vB 654
    vupklsh                             ; 04 vD 0_0000 vB 718
    vupkhpx                             ; 04 vD 0_0000 vB 846
    vupklpx                             ; 04 vD 0_0000 vB 974
    )
  (define-3-address-vector-instructions
    vaddubm                             ; 04 vD vA vB 0
    vadduhm                             ; 04 vD vA vB 64
    vadduwm                             ; 04 vD vA vB 128
    vaddcuw                             ; 04 vD vA vB 384
    vaddubs                             ; 04 vD vA vB 512
    vadduhs                             ; 04 vD vA vB 576
    vadduws                             ; 04 vD vA vB 640
    vaddsbs                             ; 04 vD vA vB 768
    vaddshs                             ; 04 vD vA vB 832
    vaddsws                             ; 04 vD vA vB 896
    vsububm                             ; 04 vD vA vB 1024
    vsubuhm                             ; 04 vD vA vB 1088
    vsubuwm                             ; 04 vD vA vB 1152
    vsubcuw                             ; 04 vD vA vB 1408
    vsububs                             ; 04 vD vA vB 1536
    vsubuhs                             ; 04 vD vA vB 1600
    vsubuws                             ; 04 vD vA vB 1664
    vsubsbs                             ; 04 vD vA vB 1792
    vsubshs                             ; 04 vD vA vB 1856
    vsubsws                             ; 04 vD vA vB 1920
    vmaxub                              ; 04 vD vA vB 2
    vmaxuh                              ; 04 vD vA vB 66
    vmaxuw                              ; 04 vD vA vB 130
    vmaxsb                              ; 04 vD vA vB 258
    vmaxsh                              ; 04 vD vA vB 322
    vminub                              ; 04 vD vA vB 514
    vminuh                              ; 04 vD vA vB 578
    vminuw                              ; 04 vD vA vB 642
    vminsb                              ; 04 vD vA vB 770
    vminsh                              ; 04 vD vA vB 834
    vminsw                              ; 04 vD vA vB 898
    vavgub                              ; 04 vD vA vB 1026
    vavguh                              ; 04 vD vA vB 1090
    vavguw                              ; 04 vD vA vB 1154
    vavgsb                              ; 04 vD vA vB 1282
    vavgsh                              ; 04 vD vA vB 1346
    vavgsw                              ; 04 vD vA vB 1410
    vrlb                                ; 04 vD vA vB 4
    vrlh                                ; 04 vD vA vB 68
    vrlw                                ; 04 vD vA vB 132
    vslb                                ; 04 vD vA vB 260
    vslh                                ; 04 vD vA vB 324
    vslw                                ; 04 vD vA vB 388
    vsl                                 ; 04 vD vA vB 452
    vsrb                                ; 04 vD vA vB 516
    vsrh                                ; 04 vD vA vB 580
    vsrw                                ; 04 vD vA vB 644
    vsr                                 ; 04 vD vA vB 708
    vsrab                               ; 04 vD vA vB 772
    vsrah                               ; 04 vD vA vB 836
    vsraw                               ; 04 vD vA vB 900
    vand                                ; 04 vD vA vB 1028
    vandc                               ; 04 vD vA vB 1092
    vor                                 ; 04 vD vA vB 1156
    vnor                                ; 04 vD vA vB 1284
    vmuloub                             ; 04 vD vA vB 8
    vmulouh                             ; 04 vD vA vB 72
    vmulosb                             ; 04 vD vA vB 264
    vmulosh                             ; 04 vD vA vB 328
    vmuleub                             ; 04 vD vA vB 520
    vmuleuh                             ; 04 vD vA vB 584
    vmulesb                             ; 04 vD vA vB 776
    vmulesh                             ; 04 vD vA vB 840
    vsum4ubs                            ; 04 vD vA vB 1544
    vsum4sbs                            ; 04 vD vA vB 1800
    vsum4shs                            ; 04 vD vA vB 1608
    vsum2sws                            ; 04 vD vA vB 1672
    vsumsws                             ; 04 vD vA vB 1928
    vaddfp                              ; 04 vD vA vB 10
    vsubfp                              ; 04 vD vA vB 74
    vmaxfp                              ; 04 vD vA vB 1034
    vminfp                              ; 04 vD vA vB 1098
    vmrghb                              ; 04 vD vA vB 12
    vmrghh                              ; 04 vD vA vB 76
    vmrghw                              ; 04 vD vA vB 140
    vmrglb                              ; 04 vD vA vB 268
    vmrglh                              ; 04 vD vA vB 332
    vmrglw                              ; 04 vD vA vB 396
    vslo                                ; 04 vD vA vB 1036
    vsro                                ; 04 vD vA vB 1100
    vpkuhum                             ; 04 vD vA vB 14
    vpkuwum                             ; 04 vD vA vB 78
    vpkuhus                             ; 04 vD vA vB 142
    vpkuwus                             ; 04 vD vA vB 206
    vpkshus                             ; 04 vD vA vB 270
    vpkswus                             ; 04 vD vA vB 334
    vpkshss                             ; 04 vD vA vB 398
    vpkswss                             ; 04 vD vA vB 462
    vpkpx                               ; 04 vD vA vB 782
    vxor                                ; 04 vD vA vB 1220
    ;; These instructions also come in "." form, where
    ;; a condition register field is set as a result
    ;; of a comparison.  I suppose that the CR field could
    ;; be returned as a second value.  Or something.
    vcmpbfp                             ; 04 vD vA vB Rc 966
    vcmpeqfp                            ; 04 vD vA vB Rc 198
    vcmpequb                            ; 04 vD vA vB Rc 6
    vcmpequh                            ; 04 vD vA vB Rc 70
    vcmpequw                            ; 04 vD vA vB Rc 134
    vcmpgefp                            ; 04 vD vA vB Rc 454
    vcmpgtfp                            ; 04 vD vA vB Rc 710
    vcmpgtsb                            ; 04 vD vA vB Rc 774
    vcmpgtsh                            ; 04 vD vA vB Rc 838
    vcmpgtsw                            ; 04 vD vA vB Rc 902
    vcmpgtub                            ; 04 vD vA vB Rc 518
    vcmpgtuh                            ; 04 vD vA vB Rc 582
    vcmpgtuw                            ; 04 vD vA vB Rc 646
    )
  (define-4-address-vector-instructions
    vmhraddshs                          ; 04 vD vA vB vC 33
    vmladduhm                           ; 04 vD vA vB vC 34
    vmsumubm                            ; 04 vD vA vB vC 36
    vmsummbm                            ; 04 vD vA vB vC 37
    vmsumuhm                            ; 04 vD vA vB vC 38
    vmsumuhs                            ; 04 vD vA vB vC 39
    vmsumshm                            ; 04 vD vA vB vC 40
    vmsumshs                            ; 04 vD vA vB vC 41
    vmaddfp                             ; 04 vD vA vB vC 46
    vnmsubfp                            ; 04 vD vA vB vC 47
    vsel                                ; 04 vD vA vB vC 42
    vperm                               ; 04 vD vA vB vC 43
    )
  ;; Loads and stores.
  ;; The first source operand is assumed to be a macptr; the second
  ;; is assumed to be a fixnum (byte offset).
  (define-vector-load-instructions
    lvebx                               ; 31 vD A B 7 0
    lvehx                               ; 31 vD A B 39 0
    lvewx                               ; 31 vD A B 71 0
    lvsl                                ; 31 vD A B 6 0
    lvsr                                ; 31 vD A B 38 0
    lvx                                 ; 31 vD A B 103 0
    lvxl                                ; 31 vD A B 359 0
    )
  (define-vector-store-instructions
    stvebx                              ; 31 vS A B 135 0
    stvehx                              ; 31 vS A B 167 0
    stvewx                              ; 31 vS A B 199 0
    stvx                                ; 31 vS A B 231 0
    stvxl                               ; 31 vS A B 487 0
    )
  )

;;; Hopelsss, for various reasons
#|
vsldoi                              ; 04 vD vA vB 0 SH 44
dst                                 ; 31 T 0_0 STRM A B 342 0
dstt                                ; 31 1 0_0 STRM A B 342 0
dstst                               ; 31 T 0_0 STRM A B 374 0
dststt                              ; 31 1 0_0 STRM A B 374 0
dss                                 ; 31 A 0_0 STRM 0_0000 0000_0 822 0
dssall                              ; 31 1 0_0 STRM 0_0000 0000_0 822 0
vcfux                               ; 04 vD UIMM vB 778
vcfsx                               ; 04 vD UIMM vB 842
vctuxs                              ; 04 vD UIMM vB 906
vctsxs                              ; 04 vD UIMM vB 970    
vspltb                              ; 04 vD UIMM vB 524
vsplth                              ; 04 vD UIMM vB 588
vspltw                              ; 04 vD UIMM vB 652
vspltisb                            ; 04 vD SIMM 0000_0 780
vspltish                            ; 04 vD SIMM 0000_0 844
vspltisw                            ; 04 vD SIMM 0000_0 908
|#

(defppclapfunction mfvscr ((dest arg_z)) ; 04 vD 0_0000 0000_0 1540
  (mfvscr vr1)
  (macptr-ptr imm0 dest)
  (stvx vr1 imm0 rzero)
  (blr))

(defppclapfunction mtvscr ((src arg_z)) ; 04 00_000 0_0000 vB 1604
  (macptr-ptr imm0 src)
  (lvx vr1 imm0 rzero)
  (mtvscr vr1)
  (blr))

(defun new-quadwords (n)
  (#+openmcl malloc #-openmcl #_NewPtr (ash n 16)))

(defun free-quadwords (ptr)
  (#+openmcl free #-openmcl #_DisposePtr ptr))

(defppclapfunction %quadword-align-macptr ((p arg_z))
  (macptr-ptr imm0 p)
  (la imm0 15 imm0)
  (clrrwi imm0 imm0 4)
  (stw imm0 #+openmcl arch::macptr.address #-openmcl ppc::macptr.address p)
  (blr))

(defmacro with-quadwords ((&rest vars) &body body)
  (let* ((n (length vars))
         (raw (gensym))
         (bindings (do* ((vars vars (cdr vars))
                         (offset 0 (+ offset 16))
                         (res ()))
                        ((null vars) (nreverse res))
                     (let* ((v (car vars)))
                       (push `(,v (%inc-ptr ,raw ,offset)) res)))))
    `(%stack-block ((,raw (+ 15 (ash ,n 4))))
      (%quadword-align-macptr ,raw)
      (with-macptrs (, at bindings)
        , at body))))

(defmacro with-quadword-vectors ((&rest bindings) &body body &environment env)
  (multiple-value-bind (body decls) (parse-body body env)
    (let* ((initforms
            (mapcar
             #'(lambda (spec)
                        `(,(car spec) (+ 15 (ash ,(or (cadr spec) 1) 4))))
             bindings))
           (fixups
            (mapcar #'(lambda (spec) `(%quadword-align-macptr ,(car spec)))
                    bindings)))
    `(%stack-block ,initforms
      , at decls
      (progn
        , at fixups
        , at body)))))

(defun load-quadword (dest source &optional (index 0))
  (declare (type macptr dest source) (fixnum index))
  (lvx dest source (the fixnum (ash index 4))))

(defun store-quadword (source dest &optional (index 0))
  (declare (type macptr dest source) (fixnum index))
  (stvx dest source (the fixnum (ash index 4))))

(defun zero-quadword (q)
  (vxor q q q))

(defun fill-quadword-bytes (q b)
  (dotimes (i 16) (setf (%get-unsigned-byte q i) b)))

(defun get-quadword-bytes (q)
  (let* ((r ()))
    (dotimes (i 16 (nreverse r))
      (push (%get-unsigned-byte q i) r))))

(defun fill-quadword-halfwords (q h)
  (do* ((i 0 (+ i 2)))
       ((= i 16))
    (setf (%get-unsigned-word q i) h)))

(defun get-quadword-halfwords (q)
  (do* ((r ())
        (i 0 (+ i 2)))
       ((= i 16) (nreverse r))
    (push (%get-unsigned-word q i) r)))

(defun fill-quadword-words (q w)
  (do* ((i 0 (+ i 4)))
       ((= i 16))
    (setf (%get-unsigned-long q i) w)))

(defun get-quadword-words (q)
  (do* ((r ())
        (i 0 (+ i 4)))
       ((= i 16) (nreverse r))
    (push (%get-unsigned-long q i) r)))

(defun nquadwords-for-nwords (w)
  (ceiling w 4))

;;; Return the number of quadwords needed to contain the bignum b.
(defun nquadwords-for-bignum (b)
  (nquadwords-for-nwords (uvsize b)))

;;; Copy an MCL bignum (little-endian 32-bit bigits) to a sequence of
;;; quadwords; the least-significant bits of the bignum will be
;;; right-justified and the bignum will be sign-extended.  Return the
;;; quadword vector and its size as multiple values
(defun quadwords-from-bignum (b)
  (check-type b bignum)
  (bignum-to-quadwords b (new-quadwords (nquadwords-for-bignum b))))

;;; Quadword vector V is at least big enough to hold the bignum b.
(defun bignum-to-quadwords (b v)
  (let* ((nwords (uvsize b))
         (nq (ceiling nwords 4)))
    (do* ((i 0 (1+ i))
          (word (uvref b i) (if (< i nwords) (uvref v i) word))
          (p (- (ash nq 4) 4) (- p 4)))
         ((= i nwords) (do* ((sign (if (logbitp 31 word) -1 0))
                        (p p (- p 4)))
                       ((< p 0) (values v nq))
                    (setf (%get-signed-long v p) sign)))
      (setf (%get-unsigned-long v p) word))))

(defun quadwords-to-integer (q n)
  (let* ((b (%alloc-misc (ash n 2) #+openmcl arch::subtag-bignum #-openmcl ppc::subtag-bignum)))
    (do* ((p (- (ash n 4) 4) (- p 4))
          (i 0 (1+ i)))
         ((< p 0) (%normalize-bignum-2 t b))
      (setf (uvref b i) (%get-unsigned-long q p)))))

#|
(defun vector-multiply-bignums (x y)
  (let* ((nx (uvsize z))
         (ny (uvsize y))
         (qx (nquadwords-for-nwords nx))
         (qy (nquadwords-for-nwords ny))
         (qres (nquadwords-for-nwords (+ nx ny))))
    (with-quadword-vectors ((vx qx)
                            (vy qy)
                            (vres qres))
      (bignum-to-quadwords x vx)
      (bignum-to-quadwords y vy)
      (%vector-multiply-quadwords vres qres vx qx vy qy)
      (quadwords-to-integer vres qres))))
|#