[Openmcl-devel] What is the status of OpenMCL Altivec support
Gary Byers
gb at clozure.com
Wed Feb 25 16:54:17 PST 2004
On Sat, 21 Feb 2004, Randall Beer wrote:
> If enough people want to make use of AltiVec in OpenMCL, then I might
> suggest thinking about writing a mini-compiler for it. I did this with
> PPC-FPC, a compiler that translates a subset of Lisp involving
> arithmetic operations into equivalent LAP
> (http://vorlon.cwru.edu/~beer). An intermediate option is to make use
> of DEFPPCLAPMACRO to provide higher-level virtual instructions for
> working with AltiVec in LAP. Although these approaches may take a bit
> more work initially, they pay off handsomely in the long run.
>
> Randy Beer
I actually did something similar a year or so ago: defined (with LAP)
functions wrapped around -most- vector instructions (basically, those
that take register arguments only.) The functions take arguments that're
presumed to be 16-byte aligned MACPTRs, load the contents of those pointers
into vector registers, execute the instruction, then copy the result back
to a distinguished argument (the first.)
The intent of the code was to make it easier to experiment with Altivec-based
algorithms and data organization. All of the extra function-call overhead
and extra loading and storing probably make the instruction mix a lot
different from what it'd be if a compiler was open-coding these operations,
so it's probably hard to draw too many useful conclusions from this; if you
can convince yourself that two algorithms/data representations would each
suffer about equally, it might actually be useful.
NOTE: this was written about a year ago (so a couple of references to the
"ARCH" package should probably be changed to "TARGET".) This never got
used much if at all, so there may be a few bugs in the way that functions
are defined. The macros that generate the little LAP functions are just
concerned with what operands the corresponding instruction takes; the
assembler's supposed to know how to actually generate the right machine
code.
-------------- next part --------------
;;;-*- Mode: Lisp; Package: CCL -*-
;;; 031003 gb First cut.
(in-package "CCL")
(eval-when (:load-toplevel :execute)
(unless (altivec-available-p)
(error "No vector unit here.")))
;;; Some glue around AltiVec, to make AltiVec instructions callable
;;; from Lisp code.
;;; Operands and results are represented as macptrs to 16-byte-aligned
;;; addresses. A four-address instruction D = vecop A B C should be
;;; called as (vecop d a b c), which will destructively modify and return
;;; D.
(defppclapfunction %get-vrsave ()
(mfspr imm0 256)
(ba .SPmakeu32))
(defppclapfunction %set-vrsave ((val arg_z))
(save-lisp-context)
(bla .SPgetxlong)
(mtspr 256 imm0)
(restore-full-lisp-context)
(blr))
;;; The lap routines below use (at most) vr0-vr2. You need to do this
;;; in each thread for which you don't want those registers to be
;;; dropped on the floor on context switch.
(%set-vrsave #xE0000000)
(defun vector-as-u128-integer (vecptr)
(dpb (%get-unsigned-long vecptr 0)
(byte 32 96)
(dpb (%get-unsigned-long vecptr 4)
(byte 32 64)
(dpb (%get-unsigned-long vecptr 8)
(byte 32 32)
(%get-unsigned-long vecptr 12)))))
(defun vector-from-u128-integer (vecptr integer)
(setf (%get-unsigned-long vecptr 0) (ldb (byte 32 96) integer)
(%get-unsigned-long vecptr 4) (ldb (byte 32 64) integer)
(%get-unsigned-long vecptr 8) (ldb (byte 32 32) integer)
(%get-unsigned-long vecptr 12) (ldb (byte 32 23) integer))
vecptr)
(macrolet ((define-2-address-vector-instruction (name)
`(defppclapfunction ,name ((d arg_y) (a arg_z))
(macptr-ptr imm1 a)
(macptr-ptr imm0 d)
(lvx vr0 imm1 rzero)
(,name vr1 vr0)
(stvx vr1 imm0 rzero)
(mr arg_z d)
(blr)))
(define-3-address-vector-instruction (name)
`(defppclapfunction ,name ((d arg_x) (a arg_y) (b arg_z))
(macptr-ptr imm2 b)
(macptr-ptr imm1 a)
(macptr-ptr imm0 d)
(lvx vr0 imm1 rzero)
(lvx vr1 imm2 rzero)
(,name vr1 vr0 vr1)
(stvx vr1 imm0 rzero)
(mr arg_z d)
(blr)))
(define-4-address-vector-instruction (name)
`(defppclapfunction ,name ((d 0) (a arg_x) (b arg_y) (c arg_z))
(macptr-ptr imm2 c)
(macptr-ptr imm1 b)
(macptr-ptr imm0 a)
(vpop arg_z)
(macptr-ptr imm3 arg_z)
(lvx vr0 imm2 rzero)
(lvx vr1 imm1 rzero)
(lvx vr2 imm0 rzero)
(,name vr1 vr0 vr1 vr2)
(stvx vr1 imm3 rzero)
(blr)))
(define-vector-load-instruction (name)
`(defppclapfunction ,name ((d arg_x) (a arg_y) (b arg_z))
(macptr-ptr d imm0)
(macptr-ptr a imm1)
(unbox-fixnum imm2 b)
(,name vr1 imm1 imm2)
(stvx vr1 imm0 rzero)
(mr arg_z d)
(blr)))
(define-vector-store-instruction (name)
`(defppclapfunction ,name ((s arg_x) (a arg_y) (b arg_z))
(macptr-ptr s imm0)
(macptr-ptr a imm1)
(unbox-fixnum imm2 b)
(lvx vr1 imm0 rzero)
(,name vr1 imm1 imm2)
(mr arg_z s)
(blr)))
(define-2-address-vector-instructions (&rest names)
`(progn
,@(mapcar #'(lambda (name)
`(define-2-address-vector-instruction ,name))
names)))
(define-3-address-vector-instructions (&rest names)
`(progn
,@(mapcar #'(lambda (name)
`(define-3-address-vector-instruction ,name))
names)))
(define-4-address-vector-instructions (&rest names)
`(progn
,@(mapcar #'(lambda (name)
`(define-4-address-vector-instruction ,name))
names)))
(define-vector-load-instructions (&rest names)
`(progn
,@(mapcar #'(lambda (name)
`(define-vector-load-instruction ,name))
names)))
(define-vector-store-instructions (&rest names)
`(progn
,@(mapcar #'(lambda (name)
`(define-vector-store-instruction ,name))
names))))
(define-2-address-vector-instructions
vrefp ; 04 vD 0_0000 vB 266
vrsqrtefp ; 04 vD 0_0000 vB 330
vexptefp ; 04 vD 0_0000 vB 394
vlogefp ; 04 vD 0_0000 vB 458
vrfin ; 04 vD 0_0000 vB 522
vrfiz ; 04 vD 0_0000 vB 586
vrfip ; 04 vD 0_0000 vB 650
vrfim ; 04 vD 0_0000 vB 714
vupkhsb ; 04 vD 0_0000 vB 526
vupkhsh ; 04 vD 0_0000 vB 590
vupklsb ; 04 vD 0_0000 vB 654
vupklsh ; 04 vD 0_0000 vB 718
vupkhpx ; 04 vD 0_0000 vB 846
vupklpx ; 04 vD 0_0000 vB 974
)
(define-3-address-vector-instructions
vaddubm ; 04 vD vA vB 0
vadduhm ; 04 vD vA vB 64
vadduwm ; 04 vD vA vB 128
vaddcuw ; 04 vD vA vB 384
vaddubs ; 04 vD vA vB 512
vadduhs ; 04 vD vA vB 576
vadduws ; 04 vD vA vB 640
vaddsbs ; 04 vD vA vB 768
vaddshs ; 04 vD vA vB 832
vaddsws ; 04 vD vA vB 896
vsububm ; 04 vD vA vB 1024
vsubuhm ; 04 vD vA vB 1088
vsubuwm ; 04 vD vA vB 1152
vsubcuw ; 04 vD vA vB 1408
vsububs ; 04 vD vA vB 1536
vsubuhs ; 04 vD vA vB 1600
vsubuws ; 04 vD vA vB 1664
vsubsbs ; 04 vD vA vB 1792
vsubshs ; 04 vD vA vB 1856
vsubsws ; 04 vD vA vB 1920
vmaxub ; 04 vD vA vB 2
vmaxuh ; 04 vD vA vB 66
vmaxuw ; 04 vD vA vB 130
vmaxsb ; 04 vD vA vB 258
vmaxsh ; 04 vD vA vB 322
vminub ; 04 vD vA vB 514
vminuh ; 04 vD vA vB 578
vminuw ; 04 vD vA vB 642
vminsb ; 04 vD vA vB 770
vminsh ; 04 vD vA vB 834
vminsw ; 04 vD vA vB 898
vavgub ; 04 vD vA vB 1026
vavguh ; 04 vD vA vB 1090
vavguw ; 04 vD vA vB 1154
vavgsb ; 04 vD vA vB 1282
vavgsh ; 04 vD vA vB 1346
vavgsw ; 04 vD vA vB 1410
vrlb ; 04 vD vA vB 4
vrlh ; 04 vD vA vB 68
vrlw ; 04 vD vA vB 132
vslb ; 04 vD vA vB 260
vslh ; 04 vD vA vB 324
vslw ; 04 vD vA vB 388
vsl ; 04 vD vA vB 452
vsrb ; 04 vD vA vB 516
vsrh ; 04 vD vA vB 580
vsrw ; 04 vD vA vB 644
vsr ; 04 vD vA vB 708
vsrab ; 04 vD vA vB 772
vsrah ; 04 vD vA vB 836
vsraw ; 04 vD vA vB 900
vand ; 04 vD vA vB 1028
vandc ; 04 vD vA vB 1092
vor ; 04 vD vA vB 1156
vnor ; 04 vD vA vB 1284
vmuloub ; 04 vD vA vB 8
vmulouh ; 04 vD vA vB 72
vmulosb ; 04 vD vA vB 264
vmulosh ; 04 vD vA vB 328
vmuleub ; 04 vD vA vB 520
vmuleuh ; 04 vD vA vB 584
vmulesb ; 04 vD vA vB 776
vmulesh ; 04 vD vA vB 840
vsum4ubs ; 04 vD vA vB 1544
vsum4sbs ; 04 vD vA vB 1800
vsum4shs ; 04 vD vA vB 1608
vsum2sws ; 04 vD vA vB 1672
vsumsws ; 04 vD vA vB 1928
vaddfp ; 04 vD vA vB 10
vsubfp ; 04 vD vA vB 74
vmaxfp ; 04 vD vA vB 1034
vminfp ; 04 vD vA vB 1098
vmrghb ; 04 vD vA vB 12
vmrghh ; 04 vD vA vB 76
vmrghw ; 04 vD vA vB 140
vmrglb ; 04 vD vA vB 268
vmrglh ; 04 vD vA vB 332
vmrglw ; 04 vD vA vB 396
vslo ; 04 vD vA vB 1036
vsro ; 04 vD vA vB 1100
vpkuhum ; 04 vD vA vB 14
vpkuwum ; 04 vD vA vB 78
vpkuhus ; 04 vD vA vB 142
vpkuwus ; 04 vD vA vB 206
vpkshus ; 04 vD vA vB 270
vpkswus ; 04 vD vA vB 334
vpkshss ; 04 vD vA vB 398
vpkswss ; 04 vD vA vB 462
vpkpx ; 04 vD vA vB 782
vxor ; 04 vD vA vB 1220
;; These instructions also come in "." form, where
;; a condition register field is set as a result
;; of a comparison. I suppose that the CR field could
;; be returned as a second value. Or something.
vcmpbfp ; 04 vD vA vB Rc 966
vcmpeqfp ; 04 vD vA vB Rc 198
vcmpequb ; 04 vD vA vB Rc 6
vcmpequh ; 04 vD vA vB Rc 70
vcmpequw ; 04 vD vA vB Rc 134
vcmpgefp ; 04 vD vA vB Rc 454
vcmpgtfp ; 04 vD vA vB Rc 710
vcmpgtsb ; 04 vD vA vB Rc 774
vcmpgtsh ; 04 vD vA vB Rc 838
vcmpgtsw ; 04 vD vA vB Rc 902
vcmpgtub ; 04 vD vA vB Rc 518
vcmpgtuh ; 04 vD vA vB Rc 582
vcmpgtuw ; 04 vD vA vB Rc 646
)
(define-4-address-vector-instructions
vmhraddshs ; 04 vD vA vB vC 33
vmladduhm ; 04 vD vA vB vC 34
vmsumubm ; 04 vD vA vB vC 36
vmsummbm ; 04 vD vA vB vC 37
vmsumuhm ; 04 vD vA vB vC 38
vmsumuhs ; 04 vD vA vB vC 39
vmsumshm ; 04 vD vA vB vC 40
vmsumshs ; 04 vD vA vB vC 41
vmaddfp ; 04 vD vA vB vC 46
vnmsubfp ; 04 vD vA vB vC 47
vsel ; 04 vD vA vB vC 42
vperm ; 04 vD vA vB vC 43
)
;; Loads and stores.
;; The first source operand is assumed to be a macptr; the second
;; is assumed to be a fixnum (byte offset).
(define-vector-load-instructions
lvebx ; 31 vD A B 7 0
lvehx ; 31 vD A B 39 0
lvewx ; 31 vD A B 71 0
lvsl ; 31 vD A B 6 0
lvsr ; 31 vD A B 38 0
lvx ; 31 vD A B 103 0
lvxl ; 31 vD A B 359 0
)
(define-vector-store-instructions
stvebx ; 31 vS A B 135 0
stvehx ; 31 vS A B 167 0
stvewx ; 31 vS A B 199 0
stvx ; 31 vS A B 231 0
stvxl ; 31 vS A B 487 0
)
)
;;; Hopelsss, for various reasons
#|
vsldoi ; 04 vD vA vB 0 SH 44
dst ; 31 T 0_0 STRM A B 342 0
dstt ; 31 1 0_0 STRM A B 342 0
dstst ; 31 T 0_0 STRM A B 374 0
dststt ; 31 1 0_0 STRM A B 374 0
dss ; 31 A 0_0 STRM 0_0000 0000_0 822 0
dssall ; 31 1 0_0 STRM 0_0000 0000_0 822 0
vcfux ; 04 vD UIMM vB 778
vcfsx ; 04 vD UIMM vB 842
vctuxs ; 04 vD UIMM vB 906
vctsxs ; 04 vD UIMM vB 970
vspltb ; 04 vD UIMM vB 524
vsplth ; 04 vD UIMM vB 588
vspltw ; 04 vD UIMM vB 652
vspltisb ; 04 vD SIMM 0000_0 780
vspltish ; 04 vD SIMM 0000_0 844
vspltisw ; 04 vD SIMM 0000_0 908
|#
(defppclapfunction mfvscr ((dest arg_z)) ; 04 vD 0_0000 0000_0 1540
(mfvscr vr1)
(macptr-ptr imm0 dest)
(stvx vr1 imm0 rzero)
(blr))
(defppclapfunction mtvscr ((src arg_z)) ; 04 00_000 0_0000 vB 1604
(macptr-ptr imm0 src)
(lvx vr1 imm0 rzero)
(mtvscr vr1)
(blr))
(defun new-quadwords (n)
(#+openmcl malloc #-openmcl #_NewPtr (ash n 16)))
(defun free-quadwords (ptr)
(#+openmcl free #-openmcl #_DisposePtr ptr))
(defppclapfunction %quadword-align-macptr ((p arg_z))
(macptr-ptr imm0 p)
(la imm0 15 imm0)
(clrrwi imm0 imm0 4)
(stw imm0 #+openmcl arch::macptr.address #-openmcl ppc::macptr.address p)
(blr))
(defmacro with-quadwords ((&rest vars) &body body)
(let* ((n (length vars))
(raw (gensym))
(bindings (do* ((vars vars (cdr vars))
(offset 0 (+ offset 16))
(res ()))
((null vars) (nreverse res))
(let* ((v (car vars)))
(push `(,v (%inc-ptr ,raw ,offset)) res)))))
`(%stack-block ((,raw (+ 15 (ash ,n 4))))
(%quadword-align-macptr ,raw)
(with-macptrs (, at bindings)
, at body))))
(defmacro with-quadword-vectors ((&rest bindings) &body body &environment env)
(multiple-value-bind (body decls) (parse-body body env)
(let* ((initforms
(mapcar
#'(lambda (spec)
`(,(car spec) (+ 15 (ash ,(or (cadr spec) 1) 4))))
bindings))
(fixups
(mapcar #'(lambda (spec) `(%quadword-align-macptr ,(car spec)))
bindings)))
`(%stack-block ,initforms
, at decls
(progn
, at fixups
, at body)))))
(defun load-quadword (dest source &optional (index 0))
(declare (type macptr dest source) (fixnum index))
(lvx dest source (the fixnum (ash index 4))))
(defun store-quadword (source dest &optional (index 0))
(declare (type macptr dest source) (fixnum index))
(stvx dest source (the fixnum (ash index 4))))
(defun zero-quadword (q)
(vxor q q q))
(defun fill-quadword-bytes (q b)
(dotimes (i 16) (setf (%get-unsigned-byte q i) b)))
(defun get-quadword-bytes (q)
(let* ((r ()))
(dotimes (i 16 (nreverse r))
(push (%get-unsigned-byte q i) r))))
(defun fill-quadword-halfwords (q h)
(do* ((i 0 (+ i 2)))
((= i 16))
(setf (%get-unsigned-word q i) h)))
(defun get-quadword-halfwords (q)
(do* ((r ())
(i 0 (+ i 2)))
((= i 16) (nreverse r))
(push (%get-unsigned-word q i) r)))
(defun fill-quadword-words (q w)
(do* ((i 0 (+ i 4)))
((= i 16))
(setf (%get-unsigned-long q i) w)))
(defun get-quadword-words (q)
(do* ((r ())
(i 0 (+ i 4)))
((= i 16) (nreverse r))
(push (%get-unsigned-long q i) r)))
(defun nquadwords-for-nwords (w)
(ceiling w 4))
;;; Return the number of quadwords needed to contain the bignum b.
(defun nquadwords-for-bignum (b)
(nquadwords-for-nwords (uvsize b)))
;;; Copy an MCL bignum (little-endian 32-bit bigits) to a sequence of
;;; quadwords; the least-significant bits of the bignum will be
;;; right-justified and the bignum will be sign-extended. Return the
;;; quadword vector and its size as multiple values
(defun quadwords-from-bignum (b)
(check-type b bignum)
(bignum-to-quadwords b (new-quadwords (nquadwords-for-bignum b))))
;;; Quadword vector V is at least big enough to hold the bignum b.
(defun bignum-to-quadwords (b v)
(let* ((nwords (uvsize b))
(nq (ceiling nwords 4)))
(do* ((i 0 (1+ i))
(word (uvref b i) (if (< i nwords) (uvref v i) word))
(p (- (ash nq 4) 4) (- p 4)))
((= i nwords) (do* ((sign (if (logbitp 31 word) -1 0))
(p p (- p 4)))
((< p 0) (values v nq))
(setf (%get-signed-long v p) sign)))
(setf (%get-unsigned-long v p) word))))
(defun quadwords-to-integer (q n)
(let* ((b (%alloc-misc (ash n 2) #+openmcl arch::subtag-bignum #-openmcl ppc::subtag-bignum)))
(do* ((p (- (ash n 4) 4) (- p 4))
(i 0 (1+ i)))
((< p 0) (%normalize-bignum-2 t b))
(setf (uvref b i) (%get-unsigned-long q p)))))
#|
(defun vector-multiply-bignums (x y)
(let* ((nx (uvsize z))
(ny (uvsize y))
(qx (nquadwords-for-nwords nx))
(qy (nquadwords-for-nwords ny))
(qres (nquadwords-for-nwords (+ nx ny))))
(with-quadword-vectors ((vx qx)
(vy qy)
(vres qres))
(bignum-to-quadwords x vx)
(bignum-to-quadwords y vy)
(%vector-multiply-quadwords vres qres vx qx vy qy)
(quadwords-to-integer vres qres))))
|#
More information about the Openmcl-devel
mailing list