[Openmcl-devel] ICU sketch

Takehiko Abe keke at gol.com
Thu Apr 19 21:14:45 PDT 2007


;;;
;;; I played with ICU. <http://icu-project.org/>
;;;
;;; The codes below define:
;;;
;;;   unorm-normalize
;;;   unorm-compare
;;;
;;; for normalization and
;;;
;;;    ucol-strcoll
;;;
;;; for collation.
;;;
;;;
;;; Written and (only very casually) tested on Mac OSX 3.9 + OpenMCL
;;; Version 1.1-pre-070214 (DarwinPPC32)


;;; About libicucore.dylib:
;;;
;;; ICU is preinstalled on Mac OSX Panther as
;;; /usr/lib/libicucore.dylib (I don't know about Tiger or Leopard)
;;;
;;; It is probably compiled as 32-bit library. So it won't work for
;;; OpenMCL 64bit version.
;;;
;;; The source of this apple version is available at
;;; <http://www.opensource.apple.com/darwinsource/>

(open-shared-library "/usr/lib/libicucore.dylib")

;;; The standard ICU is available at icu-project.org.  The straight
;;; build produces several shared libs and they should be placed in a
;;; standard location (e.g. /usr/local/lib). There are some
;;; dependencies among them.
;;;

;; ccl::%copy-ivector-to-ptr would be much simpler and more
;; efficient.
(defun %init-utf-16-ptr (ptr string)
  (loop for char across string
	with ptr = (%inc-ptr ptr 0)
	do
	(let ((code (char-code char)))
	  (if (< #xFFFF code)
	    (progn
	      (setf (%get-word ptr)
		    (logior #xD800
			    (ash (- code #x10000) -10))
		    (%get-word ptr 2)
		    (logior #xDC00
			    (logand #x3FF code)))
	      (%incf-ptr ptr 2))
	    (setf (%get-word ptr) code)))
	(%incf-ptr ptr 2)))

(defmacro with-utf-16-ptrs ((&rest specs) &body body)
  (let ((ptr-vars ())
	(strings ())
	(size-vars ())
	(inits ())
	(i 0))
    (dolist (spec specs)
      (destructuring-bind (ptr-var size-var string)
			  spec
	(let ((string-var (make-symbol (format nil "STRING-~D" i))))
	  (push `(,ptr-var ,size-var) ptr-vars)
	  (push `(,string-var ,string) strings)
	  (push `(,size-var (1- (ccl::cstring-encoded-length-in-bytes
				   ,(get-character-encoding :utf-16)
				   ,string-var nil nil)))
		size-vars)
	  (push `(%init-utf-16-ptr ,ptr-var ,string-var) inits))
	(incf i)))
    `(let ,strings
      (let ,size-vars
	(%stack-block ,ptr-vars
	  , at inits
	  , at body)))))

;; %copy-ptr-to-ivector would be much simpler and more
;; effcient.
(defun %string-from-utf-16-ptr (ptr byte-size)
  (let ((len 0))
    ;; check length
    (do* ((i 0 (+ i 1))
	  (c 0 (+ c 2))
	  (code (%get-unsigned-word ptr c)
		(%get-unsigned-word ptr c))
	  (word-size (ash byte-size -1)))
	 ((= i word-size))
      (cond
       ;; malformed inputs:
       ;; What to do when we get a malformed sequence?
       ;; Should provide restarts:
       ;; -- put the standard replacement char #\U+FFFD
       ;; -- let user to choose replacement char.
       ;; -- ??
       ((<= #xD800 code #xDBFF)
        (incf c 2)
        (decf len)
        (unless (<= #xDC00 (%get-unsigned-word ptr c)  #xDFFF)
          (error "malformed utf-16")))
       ((<= #xDC00 code #xDFFF)
        (error "malformed utf-16")))
      (incf len))
    ;; fill string
    (let ((string (make-string len)))
      (do ((i 0 (1+ i))
	   (c 0 (+ c 2)))
	  ((= i len) string)
	(setf (schar string i)
	      (let ((code (%get-unsigned-word ptr c)))
		(if (<= #xD800 code #xDBFF)
		  (code-char (logior
			      (+ (ash (logand #x3FF code) 10) #x10000)
			      (logand #x3FF
				      (%get-unsigned-word
                                       ptr
                                       (incf c 2)))))
		  (code-char code))))))))

;;;
;;; normalization -- unorm.h
;;; <http://www.icu-project.org/apiref/icu4c/unorm_8h.html>
;;;

(defun unorm-normalize (source form)
  (with-utf-16-ptrs ((in-str in-length source))
    (let ((dst-length 0)
	  (nf (ecase form
		((:nfd) 2)
		((:nfkd) 3)
		((:nfc) 4)
		((:nfkc) 5))))
      (rlet ((status :signed-long 0))
	(setq in-length (ash in-length -1))
	(setq dst-length
	      (external-call "_unorm_normalize"
			     :address in-str
			     :long in-length
			     :long nf
			     :long 0 ; #x20
			     :address (%null-ptr)
			     :long dst-length
			     :address status
			     :long))
	(setf (%get-long status) 0)
	(%stack-block ((out-str (* dst-length 2)))
	  (external-call "_unorm_normalize"
			 :address in-str
			 :long in-length
			 :long nf
			 :long 0 ; #x20
			 :address out-str
			 :long dst-length
			 :address status
			 :long)
	  (let ((error (%get-signed-long status)))
	    (if (or (zerop error)
		    (= error -124))
	      (%string-from-utf-16-ptr out-str (+ dst-length dst-length))
	      (error "icu error : ~D" error))))))))

(defun unorm-compare (string1 string2 &optional ignore-case)
  (with-utf-16-ptrs ((str1 len1 string1)
		     (str2 len2 string2))
    (rlet ((error :signed-long 0))
      (external-call "_unorm_compare"
		     :Address str1
		     :long (ash len1 -1)
		     :address str2
		     :long (ash len2 -1) 
		     :long (if ignore-case #x10000 0)
		     :address error
		     :long))))

;;;
;;; collation - ucol.h
;;; <http://www.icu-project.org/apiref/icu4c/ucol_8h.html>
;;;

;;
;; locale - uloc.h
;; <http://www.icu-project.org/apiref/icu4c/uloc_8h.html>

(defun uloc-set-default (locale-string)
  (with-cstrs ((locale locale-string))
    (rlet ((status :signed-long 0))
      (external-call "_uloc_setDefault" :address locale
		     :address status
		     nil)
      (zerop (%get-long status)))))

(defun uloc-get-default ()
  (%get-cstring (external-call "_uloc_getDefault" :address)))

;;
;; ucol.h
;;

(defun ucol-open (locale)
  (with-cstrs ((loc locale))
    (rlet ((error :signed-long 0))
      (let ((collator (external-call "_ucol_open"
				     :address loc
				     :address error
				     :address)))
	(let ((code (%get-long error)))
	  (if (or (zerop code)
		  (= -128 code)
		  (= -127 code))
	    (values collator
		    code)
	    (progn
	      (ucol-close collator)
	      (error "icu-error: ~D" code))))))))

(defun ucol-close (collator)
  (external-call "_ucol_close"
		 :address collator
		 nil))

(defmacro with-ucollator ((var &optional (locale (uloc-get-default)))
                          &body body)
  `(let ((,var (ucol-open ,locale)))
    (unwind-protect
      (progn , at body)
      (ucol-close ,var))))

(defun ucol-strcoll (string1 string2 collator)
  (with-utf-16-ptrs ((str1 size1 string1)
		     (str2 size2 string2))
    (external-call "_ucol_strcoll"
		   :address collator
		   :address str1
		   :long (ash size1 -1)
		   :address str2
		   :long (ash size2 -1)
		   :long)))

;;;
;;; uchar.h
;;; <http://www.icu-project.org/apiref/icu4c/uchar_8h.html>
;;;

(defun u-char-name (codepoint)
  (%stack-block ((buffer 128))
    (rlet ((error :signed-long 0))
      (external-call "_u_charName"
		     :long codepoint
		     :long 2
		     :address buffer
		     :long 256
		     :address error)
      (if (zerop (%get-long error))
	(%get-cstring buffer)))))



#|
;;; tests

;; unorm-normalize

(unorm-normalize (concatenate 'string "abc" (string #\u+FA)) :nfd)

(unorm-normalize (string #\U+D338) :nfd)

(char (unorm-normalize (unorm-normalize (string #\U+D338) :nfd)
		       :nfc)
      0)

(length (unorm-normalize (string #\Latin_Capital_Letter_A_With_Grave)
			 :nfd))


;; unorm-compare

(unorm-compare "ABC" "abc" t)

(let ((string (string (code-char #xac7a))))
  (unorm-compare (unorm-normalize string :nfd)
	         string))

(unorm-compare
 (unorm-normalize (concatenate 'string 
                               (string #\U+2000b)
                               "ABc"
                               (string #\U+ac7a))
                  :nfd)
 (concatenate 'string
              (string #\U+2000b)
              "abc"
              (string #\U+ac7a))
 t)

;; collation with ucol-strcoll

;; not very interesting
(let ((names (list "DIABATE" "balke" "Bartok" "N'Doye")))
  (with-ucollator (coll "en_US")
    (sort names #'(lambda (str1 str2)
		    (minusp (ucol-strcoll str1 str2 coll))))))


;;; character name

(u-char-name #xFBF9)
(u-char-name #xD800)

|#

--
"Always first steps."





More information about the Openmcl-devel mailing list