[Openmcl-devel] ICU sketch

Takehiko Abe keke at gol.com
Thu Apr 19 21:14:45 PDT 2007

;;; I played with ICU. <http://icu-project.org/>
;;; The codes below define:
;;;   unorm-normalize
;;;   unorm-compare
;;; for normalization and
;;;    ucol-strcoll
;;; for collation.
;;; Written and (only very casually) tested on Mac OSX 3.9 + OpenMCL
;;; Version 1.1-pre-070214 (DarwinPPC32)

;;; About libicucore.dylib:
;;; ICU is preinstalled on Mac OSX Panther as
;;; /usr/lib/libicucore.dylib (I don't know about Tiger or Leopard)
;;; It is probably compiled as 32-bit library. So it won't work for
;;; OpenMCL 64bit version.
;;; The source of this apple version is available at
;;; <http://www.opensource.apple.com/darwinsource/>

(open-shared-library "/usr/lib/libicucore.dylib")

;;; The standard ICU is available at icu-project.org.  The straight
;;; build produces several shared libs and they should be placed in a
;;; standard location (e.g. /usr/local/lib). There are some
;;; dependencies among them.

;; ccl::%copy-ivector-to-ptr would be much simpler and more
;; efficient.
(defun %init-utf-16-ptr (ptr string)
  (loop for char across string
	with ptr = (%inc-ptr ptr 0)
	(let ((code (char-code char)))
	  (if (< #xFFFF code)
	      (setf (%get-word ptr)
		    (logior #xD800
			    (ash (- code #x10000) -10))
		    (%get-word ptr 2)
		    (logior #xDC00
			    (logand #x3FF code)))
	      (%incf-ptr ptr 2))
	    (setf (%get-word ptr) code)))
	(%incf-ptr ptr 2)))

(defmacro with-utf-16-ptrs ((&rest specs) &body body)
  (let ((ptr-vars ())
	(strings ())
	(size-vars ())
	(inits ())
	(i 0))
    (dolist (spec specs)
      (destructuring-bind (ptr-var size-var string)
	(let ((string-var (make-symbol (format nil "STRING-~D" i))))
	  (push `(,ptr-var ,size-var) ptr-vars)
	  (push `(,string-var ,string) strings)
	  (push `(,size-var (1- (ccl::cstring-encoded-length-in-bytes
				   ,(get-character-encoding :utf-16)
				   ,string-var nil nil)))
	  (push `(%init-utf-16-ptr ,ptr-var ,string-var) inits))
	(incf i)))
    `(let ,strings
      (let ,size-vars
	(%stack-block ,ptr-vars
	  , at inits
	  , at body)))))

;; %copy-ptr-to-ivector would be much simpler and more
;; effcient.
(defun %string-from-utf-16-ptr (ptr byte-size)
  (let ((len 0))
    ;; check length
    (do* ((i 0 (+ i 1))
	  (c 0 (+ c 2))
	  (code (%get-unsigned-word ptr c)
		(%get-unsigned-word ptr c))
	  (word-size (ash byte-size -1)))
	 ((= i word-size))
       ;; malformed inputs:
       ;; What to do when we get a malformed sequence?
       ;; Should provide restarts:
       ;; -- put the standard replacement char #\U+FFFD
       ;; -- let user to choose replacement char.
       ;; -- ??
       ((<= #xD800 code #xDBFF)
        (incf c 2)
        (decf len)
        (unless (<= #xDC00 (%get-unsigned-word ptr c)  #xDFFF)
          (error "malformed utf-16")))
       ((<= #xDC00 code #xDFFF)
        (error "malformed utf-16")))
      (incf len))
    ;; fill string
    (let ((string (make-string len)))
      (do ((i 0 (1+ i))
	   (c 0 (+ c 2)))
	  ((= i len) string)
	(setf (schar string i)
	      (let ((code (%get-unsigned-word ptr c)))
		(if (<= #xD800 code #xDBFF)
		  (code-char (logior
			      (+ (ash (logand #x3FF code) 10) #x10000)
			      (logand #x3FF
                                       (incf c 2)))))
		  (code-char code))))))))

;;; normalization -- unorm.h
;;; <http://www.icu-project.org/apiref/icu4c/unorm_8h.html>

(defun unorm-normalize (source form)
  (with-utf-16-ptrs ((in-str in-length source))
    (let ((dst-length 0)
	  (nf (ecase form
		((:nfd) 2)
		((:nfkd) 3)
		((:nfc) 4)
		((:nfkc) 5))))
      (rlet ((status :signed-long 0))
	(setq in-length (ash in-length -1))
	(setq dst-length
	      (external-call "_unorm_normalize"
			     :address in-str
			     :long in-length
			     :long nf
			     :long 0 ; #x20
			     :address (%null-ptr)
			     :long dst-length
			     :address status
	(setf (%get-long status) 0)
	(%stack-block ((out-str (* dst-length 2)))
	  (external-call "_unorm_normalize"
			 :address in-str
			 :long in-length
			 :long nf
			 :long 0 ; #x20
			 :address out-str
			 :long dst-length
			 :address status
	  (let ((error (%get-signed-long status)))
	    (if (or (zerop error)
		    (= error -124))
	      (%string-from-utf-16-ptr out-str (+ dst-length dst-length))
	      (error "icu error : ~D" error))))))))

(defun unorm-compare (string1 string2 &optional ignore-case)
  (with-utf-16-ptrs ((str1 len1 string1)
		     (str2 len2 string2))
    (rlet ((error :signed-long 0))
      (external-call "_unorm_compare"
		     :Address str1
		     :long (ash len1 -1)
		     :address str2
		     :long (ash len2 -1) 
		     :long (if ignore-case #x10000 0)
		     :address error

;;; collation - ucol.h
;;; <http://www.icu-project.org/apiref/icu4c/ucol_8h.html>

;; locale - uloc.h
;; <http://www.icu-project.org/apiref/icu4c/uloc_8h.html>

(defun uloc-set-default (locale-string)
  (with-cstrs ((locale locale-string))
    (rlet ((status :signed-long 0))
      (external-call "_uloc_setDefault" :address locale
		     :address status
      (zerop (%get-long status)))))

(defun uloc-get-default ()
  (%get-cstring (external-call "_uloc_getDefault" :address)))

;; ucol.h

(defun ucol-open (locale)
  (with-cstrs ((loc locale))
    (rlet ((error :signed-long 0))
      (let ((collator (external-call "_ucol_open"
				     :address loc
				     :address error
	(let ((code (%get-long error)))
	  (if (or (zerop code)
		  (= -128 code)
		  (= -127 code))
	    (values collator
	      (ucol-close collator)
	      (error "icu-error: ~D" code))))))))

(defun ucol-close (collator)
  (external-call "_ucol_close"
		 :address collator

(defmacro with-ucollator ((var &optional (locale (uloc-get-default)))
                          &body body)
  `(let ((,var (ucol-open ,locale)))
      (progn , at body)
      (ucol-close ,var))))

(defun ucol-strcoll (string1 string2 collator)
  (with-utf-16-ptrs ((str1 size1 string1)
		     (str2 size2 string2))
    (external-call "_ucol_strcoll"
		   :address collator
		   :address str1
		   :long (ash size1 -1)
		   :address str2
		   :long (ash size2 -1)

;;; uchar.h
;;; <http://www.icu-project.org/apiref/icu4c/uchar_8h.html>

(defun u-char-name (codepoint)
  (%stack-block ((buffer 128))
    (rlet ((error :signed-long 0))
      (external-call "_u_charName"
		     :long codepoint
		     :long 2
		     :address buffer
		     :long 256
		     :address error)
      (if (zerop (%get-long error))
	(%get-cstring buffer)))))

;;; tests

;; unorm-normalize

(unorm-normalize (concatenate 'string "abc" (string #\u+FA)) :nfd)

(unorm-normalize (string #\U+D338) :nfd)

(char (unorm-normalize (unorm-normalize (string #\U+D338) :nfd)

(length (unorm-normalize (string #\Latin_Capital_Letter_A_With_Grave)

;; unorm-compare

(unorm-compare "ABC" "abc" t)

(let ((string (string (code-char #xac7a))))
  (unorm-compare (unorm-normalize string :nfd)

 (unorm-normalize (concatenate 'string 
                               (string #\U+2000b)
                               (string #\U+ac7a))
 (concatenate 'string
              (string #\U+2000b)
              (string #\U+ac7a))

;; collation with ucol-strcoll

;; not very interesting
(let ((names (list "DIABATE" "balke" "Bartok" "N'Doye")))
  (with-ucollator (coll "en_US")
    (sort names #'(lambda (str1 str2)
		    (minusp (ucol-strcoll str1 str2 coll))))))

;;; character name

(u-char-name #xFBF9)
(u-char-name #xD800)


"Always first steps."

More information about the Openmcl-devel mailing list