;;;; Unicode Transformation Format (UTF) encodings
;;;;
;;;; In our interpretation, these are distinct from UCS encodings: the
;;;; UCS encodings are a direct encoding of the code point, in 16- and
;;;; 32-bit variants; by contrast, the UTF encodings handle Unicode
;;;; surrogate code points specially.

;;;; This software is part of the SBCL system. See the README file for
;;;; more information.
;;;;
;;;; This software is derived from the CMU CL system, which was
;;;; written at Carnegie Mellon University and released into the
;;;; public domain. The software is in the public domain and is
;;;; provided with absolutely no warranty. See the COPYING and CREDITS
;;;; files for more information.

(in-package "SB!IMPL")


(declaim (inline utf-noncharacter-code-p))
(defun utf-noncharacter-code-p (code)
  (or (<= #xd800 code #xdfff)
      (<= #xfdd0 code #xfdef)
      (= (logand code #xfffe) #xfffe)))

;;; Conversion to UTF-16{LE,BE}
(declaim (inline char->utf-16le))
(defun char->utf-16le (char dest string pos)
  (declare (optimize speed (safety 0))
           (type (array (unsigned-byte 8) (*)) dest))
  (let ((code (char-code char)))
    (if (utf-noncharacter-code-p code)
        (let ((replacement (encoding-error :utf-16le string pos)))
          (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
          (dotimes (i (length replacement))
            (vector-push-extend (aref replacement i) dest)))
        (flet ((add-byte (b)
                 (declare (type (unsigned-byte 8) b))
                 (vector-push-extend b dest)))
          (declare (inline add-byte))
          (cond
            ((< code #x10000)
             (add-byte (ldb (byte 8 0) code))
             (add-byte (ldb (byte 8 8) code)))
            (t
             (let* ((codeoid (- code #x10000))
                    (high (dpb (ldb (byte 10 10) codeoid) (byte 10 0) #xd800))
                    (low (dpb (ldb (byte 10 0) codeoid) (byte 10 0) #xdc00)))
               (add-byte (ldb (byte 8 0) high))
               (add-byte (ldb (byte 8 8) high))
               (add-byte (ldb (byte 8 0) low))
               (add-byte (ldb (byte 8 8) low)))))))))

(declaim (inline char->utf-16be))
(defun char->utf-16be (char dest string pos)
  (declare (optimize speed (safety 0))
           (type (array (unsigned-byte 8) (*)) dest))
    (let ((code (char-code char)))
    (if (utf-noncharacter-code-p code)
        (let ((replacement (encoding-error :utf-16be string pos)))
          (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
          (dotimes (i (length replacement))
            (vector-push-extend (aref replacement i) dest)))
        (flet ((add-byte (b)
                 (declare (type (unsigned-byte 8) b))
                 (vector-push-extend b dest)))
          (declare (inline add-byte))
          (cond
            ((< code #x10000)
             (add-byte (ldb (byte 8 8) code))
             (add-byte (ldb (byte 8 0) code)))
            (t
             (let* ((codeoid (- code #x10000))
                    (high (dpb (ldb (byte 10 10) codeoid) (byte 10 0) #xd800))
                    (low (dpb (ldb (byte 10 0) codeoid) (byte 10 0) #xdc00)))
               (add-byte (ldb (byte 8 8) high))
               (add-byte (ldb (byte 8 0) high))
               (add-byte (ldb (byte 8 8) low))
               (add-byte (ldb (byte 8 0) low)))))))))

(defun string->utf-16le (string sstart send additional-space)
  (declare (optimize speed (safety 0))
           (type simple-string string)
           (type array-range sstart send additional-space))
  (let ((array (make-array (* 2 (+ additional-space (- send sstart)))
                           :element-type '(unsigned-byte 8)
                           :fill-pointer 0 :adjustable t)))
    (loop for i from sstart below send
          do (char->utf-16le (char string i) array string i))
    (dotimes (i (* 2 additional-space))
      (vector-push-extend 0 array))
    (coerce array '(simple-array (unsigned-byte 8) (*)))))

(defun string->utf-16be (string sstart send additional-space)
  (declare (optimize speed (safety 0))
           (type simple-string string)
           (type array-range sstart send additional-space))
  (let ((array (make-array (* 2 (+ additional-space (- send sstart)))
                           :element-type '(unsigned-byte 8)
                           :fill-pointer 0 :adjustable t)))
    (loop for i from sstart below send
          do (char->utf-16be (char string i) array string i))
    (dotimes (i (* 2 additional-space))
      (vector-push-extend 0 array))
    (coerce array '(simple-array (unsigned-byte 8) (*)))))

;; Conversion from UTF-16{LE,BE}
(defmacro define-bytes-per-utf16-character (accessor type)
  (declare (ignore type))
  (let ((name-le (make-od-name 'bytes-per-utf-16le-character accessor))
        (name-be (make-od-name 'bytes-per-utf-16be-character accessor)))
    `(progn
      (defun ,name-le (array pos end)
        (let ((remaining (- end pos)))
          (when (< remaining 2)
            (return-from ,name-le (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos))))
          (let ((low (dpb (,accessor array (+ pos 1)) (byte 8 8) (,accessor array pos))))
            (if (<= #xd800 low #xdbff)
                (if (< remaining 4)
                    (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos))
                    (let ((high (dpb (,accessor array (+ pos 3)) (byte 8 8) (,accessor array (+ pos 2)))))
                      (if (<= #xdc00 high #xdfff)
                          (let ((code (dpb (ldb (byte 10 0) low) (byte 10 10) (ldb (byte 10 0) high))))
                            (if (= (logand code #xfffe) #xfffe)
                                (values 4 (decoding-error array pos (+ pos 4) :utf-16le 'octet-decoding-error pos))
                                (values 4 nil)))
                          (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos)))))
                (if (or (<= #xdc00 low #xdfff)
                        (<= #xfdd0 low #xfdef)
                        (= (logand low #xfffe) #xfffe))
                    (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos))
                    (values 2 nil))))))
      (defun ,name-be (array pos end)
        (let ((remaining (- end pos)))
          (when (< remaining 2)
            (return-from ,name-be (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos))))
          (let ((low (dpb (,accessor array pos) (byte 8 8) (,accessor array (+ pos 1)))))
            (if (<= #xd800 low #xdbff)
                (if (< remaining 4)
                    (values remaining (decoding-error array pos end :utf-16le 'octet-decoding-error pos))
                    (let ((high (dpb (,accessor array (+ pos 2)) (byte 8 8) (,accessor array (+ pos 3)))))
                      (if (<= #xdc00 high #xdfff)
                          (let ((code (dpb (ldb (byte 10 0) low) (byte 10 10) (ldb (byte 10 0) high))))
                            (if (= (logand code #xfffe) #xfffe)
                                (values 4 (decoding-error array pos (+ pos 4) :utf-16le 'octet-decoding-error pos))
                                (values 4 nil)))
                          (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos)))))
                (if (or (<= #xdc00 low #xdfff)
                        (<= #xfdd0 low #xfdef)
                        (= (logand low #xfffe) #xfffe))
                    (values 2 (decoding-error array pos (+ pos 2) :utf-16le 'octet-decoding-error pos))
                    (values 2 nil)))))))))
(instantiate-octets-definition define-bytes-per-utf16-character)

(defmacro define-simple-get-utf16-character (accessor type)
  (let ((name-le (make-od-name 'simple-get-utf-16le-char accessor))
        (name-be (make-od-name 'simple-get-utf-16be-char accessor)))
    `(progn
      (defun ,name-le (array pos bytes)
        (declare (optimize speed (safety 0))
                 (type ,type array)
                 (type array-range pos)
                 (type (integer 1 4) bytes)
                 (ignore bytes))
        ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-16LE that
        ;; reads two bytes at once on some architectures.
        (let ((code ,(if (and (eq accessor 'sap-ref-8)
                              (eq type 'system-area-pointer))
                         '(sap-ref-16le array pos)
                         `(flet ((cref (x)
                                   (,accessor array (the array-range (+ pos x)))))
                            (declare (inline cref))
                            (dpb (cref 1) (byte 8 8) (cref 0))))))
          (if (<= #xd800 code #xdbff)
              (let ((next ,(if (and (eq accessor 'sap-ref-8)
                                    (eq type 'system-area-pointer))
                               '(sap-ref-16le array (+ pos 2))
                               `(flet ((cref (x)
                                         (,accessor array (the array-range (+ pos x)))))
                                  (declare (inline cref))
                                  (dpb (cref 3) (byte 8 8) (cref 2))))))
                (code-char (+ #x10000 (dpb (ldb (byte 10 0) code) (byte 10 10) (ldb (byte 10 0) next)))))
              (code-char code))))
      (defun ,name-be (array pos bytes)
        (declare (optimize speed (safety 0))
                 (type ,type array)
                 (type array-range pos)
                 (type (integer 1 4) bytes)
                 (ignore bytes))
        ;; Use SAP-REF-16BE even if it is not optimized
        (let ((code ,(if (and (eq accessor 'sap-ref-8)
                              (eq type 'system-area-pointer))
                         '(sap-ref-16be array pos)
                         `(flet ((cref (x)
                                   (,accessor array (the array-range (+ pos x)))))
                            (declare (inline cref))
                            (dpb (cref 0) (byte 8 8) (cref 1))))))
          (if (<= #xd800 code #xdbff)
              (let ((next ,(if (and (eq accessor 'sap-ref-8)
                                    (eq type 'system-area-pointer))
                               '(sap-ref-16be array (+ pos 2))
                               `(flet ((cref (x)
                                         (,accessor array (the array-range (+ pos x)))))
                                  (declare (inline cref))
                                  (dpb (cref 2) (byte 8 8) (cref 3))))))
                (code-char (+ #x10000 (dpb (ldb (byte 10 0) code) (byte 10 10) (ldb (byte 10 0) next)))))
              (code-char code)))))))

(instantiate-octets-definition define-simple-get-utf16-character)

(defmacro define-utf-16->string (accessor type)
  (let ((name-le (make-od-name 'utf-16le->string accessor))
        (name-be (make-od-name 'utf-16be->string accessor)))
    `(progn
      (defun ,name-le (array astart aend)
        (declare (optimize speed (safety 0))
                 (type ,type array)
                 (type array-range astart aend))
        (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
          (loop with pos = astart
                while (< pos aend)
                do (multiple-value-bind (bytes invalid)
                       (,(make-od-name 'bytes-per-utf-16le-character accessor) array pos aend)
                     (declare (type (or null string) invalid))
                     (cond
                       ((null invalid)
                        (vector-push-extend
                         (,(make-od-name 'simple-get-utf-16le-char accessor)
                           array pos bytes)
                         string))
                       (t (dotimes (i (length invalid))
                            (vector-push-extend (char invalid i) string))))
                     (incf pos bytes)))
          string))
      (defun ,name-be (array astart aend)
        (declare (optimize speed (safety 0))
                 (type ,type array)
                 (type array-range astart aend))
        (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
          (loop with pos = astart
                while (< pos aend)
                do (multiple-value-bind (bytes invalid)
                       (,(make-od-name 'bytes-per-utf-16be-character accessor) array pos aend)
                     (declare (type (or null string) invalid))
                     (cond
                       ((null invalid)
                        (vector-push-extend
                         (,(make-od-name 'simple-get-utf-16be-char accessor)
                           array pos bytes)
                         string))
                       (t (dotimes (i (length invalid))
                            (vector-push-extend (char invalid i) string))))
                     (incf pos bytes)))
          string)))))

(instantiate-octets-definition define-utf-16->string)

(define-external-format/variable-width (:utf-16le :utf16le) t
  (code-char #xfffd)
  (let ((bits (char-code byte)))
    (if (< bits #x10000) 2 4))
  (cond
    ((< bits #x10000)
     (if (utf-noncharacter-code-p bits)
         (external-format-encoding-error stream bits)
         (setf (sap-ref-16le sap tail) bits)))
    (t (if (= (logand bits #xfffe) #xfffe)
           (external-format-encoding-error stream bits)
           (let* ((new-bits (- bits #x10000))
                  (high (ldb (byte 10 10) new-bits))
                  (low (ldb (byte 10 0) new-bits)))
             (setf (sap-ref-16le sap tail) (dpb high (byte 10 0) #xd800))
             (setf (sap-ref-16le sap (+ tail 2)) (dpb low (byte 10 0) #xdc00))))))
  (2 (if (<= #xd800 (sap-ref-16le sap head) #xdbff) 4 2))
  (let ((bits (sap-ref-16le sap head)))
    (cond
      ((or (<= #xdc00 bits #xdfff)
           (<= #xfdd0 bits #xfdef)
           (= (logand bits #xfffe) #xfffe))
       (return-from decode-break-reason 2))
      ((<= #xd800 bits #xdbff)
       (let ((next (sap-ref-16le sap (+ head 2))))
         (unless (<= #xdc00 next #xdfff)
           (return-from decode-break-reason 2))
         (let ((code (dpb (ldb (byte 10 0) bits) (byte 10 10) (ldb (byte 10 0) next))))
           (if (= (logand code #xfffe) #xfffe)
               (return-from decode-break-reason 4)
               (code-char (+ #x10000 code))))))
      (t (code-char bits))))
  utf-16le->string-aref
  string->utf-16le)

(define-external-format/variable-width (:utf-16be :utf16be) t
  (code-char #xfffd)
  (let ((bits (char-code byte)))
    (if (< bits #x10000) 2 4))
  (cond
    ((< bits #x10000)
     (if (utf-noncharacter-code-p bits)
         (external-format-encoding-error stream bits)
         (setf (sap-ref-16be sap tail) bits)))
    (t (if (= (logand bits #xfffe) #xfffe)
           (external-format-encoding-error stream bits)
           (let* ((new-bits (- bits #x10000))
                  (high (ldb (byte 10 10) new-bits))
                  (low (ldb (byte 10 0) new-bits)))
             (setf (sap-ref-16be sap tail) (dpb high (byte 10 0) #xd800))
             (setf (sap-ref-16be sap (+ tail 2)) (dpb low (byte 10 0) #xdc00))))))
  (2 (if (<= #xd800 (sap-ref-16be sap head) #xdbff) 4 2))
  (let ((bits (sap-ref-16be sap head)))
    (cond
      ((or (<= #xdc00 bits #xdfff)
           (<= #xfdd0 bits #xfdef)
           (= (logand bits #xfffe) #xfffe))
       (return-from decode-break-reason 2))
      ((<= #xd800 bits #xdbff)
       (let ((next (sap-ref-16be sap (+ head 2))))
         (unless (<= #xdc00 next #xdfff)
           (return-from decode-break-reason 2))
         (let ((code (dpb (ldb (byte 10 0) bits) (byte 10 10) (ldb (byte 10 0) next))))
           (if (= (logand code #xfffe) #xfffe)
               (return-from decode-break-reason 4)
               (code-char (+ #x10000 code))))))
      (t (code-char bits))))
  utf-16be->string-aref
  string->utf-16be)

(declaim (inline char->utf-32le))
(defun char->utf-32le (char dest string pos)
  (declare (optimize speed (safety 0))
           (type (array (unsigned-byte 8) (*)) dest))
  (let ((code (char-code char)))
    (if (utf-noncharacter-code-p code)
        (let ((replacement (encoding-error :utf-32le string pos)))
          (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
          (dotimes (i (length replacement))
            (vector-push-extend (aref replacement i) dest)))
        (flet ((add-byte (b)
                 (declare (type (unsigned-byte 8) b))
                 (vector-push-extend b dest)))
          (declare (inline add-byte))
          (add-byte (ldb (byte 8 0) code))
          (add-byte (ldb (byte 8 8) code))
          (add-byte (ldb (byte 8 16) code))
          (add-byte (ldb (byte 8 24) code))))))

(declaim (inline char->utf-32be))
(defun char->utf-32be (char dest string pos)
  (declare (optimize speed (safety 0))
           (type (array (unsigned-byte 8) (*)) dest))
  (let ((code (char-code char)))
    (if (utf-noncharacter-code-p code)
        (let ((replacement (encoding-error :utf-32be string pos)))
          (declare (type (simple-array (unsigned-byte 8) (*)) replacement))
          (dotimes (i (length replacement))
            (vector-push-extend (aref replacement i) dest)))
        (flet ((add-byte (b)
                 (declare (type (unsigned-byte 8) b))
                 (vector-push-extend b dest)))
          (declare (inline add-byte))
          (add-byte (ldb (byte 8 24) code))
          (add-byte (ldb (byte 8 16) code))
          (add-byte (ldb (byte 8 8) code))
          (add-byte (ldb (byte 8 0) code))))))

(defun string->utf-32le (string sstart send additional-space)
  (declare (optimize speed (safety 0))
           (type simple-string string)
           (type array-range sstart send additional-space))
  (let ((array (make-array (* 4 (+ additional-space (- send sstart)))
                           :element-type '(unsigned-byte 8)
                           :fill-pointer 0 :adjustable t)))
    (loop for i from sstart below send
          do (char->utf-32le (char string i) array string i))
    (dotimes (i (* 4 additional-space))
      (vector-push-extend 0 array))
    (coerce array '(simple-array (unsigned-byte 8) (*)))))

(defun string->utf-32be (string sstart send additional-space)
  (declare (optimize speed (safety 0))
           (type simple-string string)
           (type array-range sstart send additional-space))
  (let ((array (make-array (* 4 (+ additional-space (- send sstart)))
                           :element-type '(unsigned-byte 8)
                           :fill-pointer 0 :adjustable t)))
    (loop for i from sstart below send
          do (char->utf-32be (char string i) array string i))
    (dotimes (i (* 4 additional-space))
      (vector-push-extend 0 array))
    (coerce array '(simple-array (unsigned-byte 8) (*)))))

;; Conversion from UTF-32{LE,BE}
(defmacro define-bytes-per-utf32-character (accessor type)
  (declare (ignore type))
  (let ((name-le (make-od-name 'bytes-per-utf-32le-character accessor))
        (name-be (make-od-name 'bytes-per-utf-32be-character accessor)))
    `(progn
      (defun ,name-le (array pos end)
        (declare (ignore array pos end))
        (values 4 nil))
      (defun ,name-be (array pos end)
        (declare (ignore array pos end))
        (values 4 nil)))))
(instantiate-octets-definition define-bytes-per-utf32-character)

(defmacro define-simple-get-utf32-character (accessor type)
  (let ((name-le (make-od-name 'simple-get-utf-32le-char accessor))
        (name-be (make-od-name 'simple-get-utf-32be-char accessor)))
    `(progn
      (defun ,name-le (array pos bytes)
        (declare (optimize speed (safety 0))
                 (type ,type array)
                 (type array-range pos)
                 (type (integer 1 4) bytes))
        ;; Optimization for SYSTEM-AREA-POINTER: use SAP-REF-32LE that
        ;; reads four bytes at once on some architectures.
        (let ((code ,(if (and (eq accessor 'sap-ref-8)
                              (eq type 'system-area-pointer))
                         '(sap-ref-32le array pos)
                         `(flet ((cref (x)
                                   (,accessor array (the array-range (+ pos x)))))
                            (declare (inline cref))
                            (dpb (cref 3) (byte 8 24)
                                 (dpb (cref 2) (byte 8 16)
                                      (dpb (cref 1) (byte 8 8) (cref 0))))))))
          (if (and (< code sb!xc:char-code-limit)
                   (not (utf-noncharacter-code-p code)))
              (code-char code)
              (decoding-error array pos (+ pos bytes) :utf-32le
                              'octet-decoding-error pos))))
      (defun ,name-be (array pos bytes)
        (declare (optimize speed (safety 0))
                 (type ,type array)
                 (type array-range pos)
                 (type (integer 1 4) bytes))
        ;; Use SAP-REF-32BE even if it is not optimized
        (let ((code ,(if (and (eq accessor 'sap-ref-8)
                              (eq type 'system-area-pointer))
                         '(sap-ref-32be array pos)
                         `(flet ((cref (x)
                                   (,accessor array (the array-range (+ pos x)))))
                            (declare (inline cref))
                            (dpb (cref 0) (byte 8 24)
                                 (dpb (cref 1) (byte 8 16)
                                      (dpb (cref 2) (byte 8 8) (cref 3))))))))
          (if (and (< code sb!xc:char-code-limit)
                   (not (utf-noncharacter-code-p code)))
              (code-char code)
              (decoding-error array pos (+ pos bytes) :utf-32be
                              'octet-decoding-error pos)))))))

(instantiate-octets-definition define-simple-get-utf32-character)

(defmacro define-utf-32->string (accessor type)
  (let ((name-le (make-od-name 'utf-32le->string accessor))
        (name-be (make-od-name 'utf-32be->string accessor)))
    `(progn
      (defun ,name-le (array astart aend)
        (declare (optimize speed (safety 0))
                 (type ,type array)
                 (type array-range astart aend))
        (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
          (loop with pos = astart
                while (< pos aend)
                do (multiple-value-bind (bytes invalid)
                       (,(make-od-name 'bytes-per-utf-32le-character accessor) array pos aend)
                     (declare (type (or null string) invalid))
                     (aver (null invalid))
                     (let ((thing (,(make-od-name 'simple-get-utf-32le-char accessor) array pos bytes)))
                       (typecase thing
                         (character (vector-push-extend thing string))
                         (string (dotimes (i (length thing))
                                   (vector-push-extend (char thing i) string)))))
                     (incf pos bytes)))
          string))
      (defun ,name-be (array astart aend)
        (declare (optimize speed (safety 0))
                 (type ,type array)
                 (type array-range astart aend))
        (let ((string (make-array 0 :adjustable t :fill-pointer 0 :element-type 'character)))
          (loop with pos = astart
                while (< pos aend)
                do (multiple-value-bind (bytes invalid)
                       (,(make-od-name 'bytes-per-utf-32be-character accessor) array pos aend)
                     (declare (type (or null string) invalid))
                     (aver (null invalid))
                     (let ((thing (,(make-od-name 'simple-get-utf-32be-char accessor) array pos bytes)))
                       (typecase thing
                         (character (vector-push-extend thing string))
                         (string (dotimes (i (length thing))
                                   (vector-push-extend (char thing i) string)))))
                     (incf pos bytes)))
          string)))))

(instantiate-octets-definition define-utf-32->string)

(define-external-format/variable-width (:utf-32le :utf32le) t
  (code-char #xfffd)
  4
  (if (utf-noncharacter-code-p bits)
      (external-format-encoding-error stream bits)
      (setf (sap-ref-32le sap tail) bits))
  4
  (let ((code (sap-ref-32le sap head)))
    (if (and (< code sb!xc:char-code-limit)
             (not (utf-noncharacter-code-p code)))
        (code-char code)
        (return-from decode-break-reason 4)))
  utf-32le->string-aref
  string->utf-32le)

(define-external-format/variable-width (:utf-32be :utf32be) t
  (code-char #xfffd)
  4
  (if (utf-noncharacter-code-p bits)
      (external-format-encoding-error stream bits)
      (setf (sap-ref-32be sap tail) bits))
  4
  (let ((code (sap-ref-32be sap head)))
    (if (and (< code sb!xc:char-code-limit)
             (not (utf-noncharacter-code-p code)))
        (code-char code)
        (return-from decode-break-reason 4)))
  utf-32be->string-aref
  string->utf-32be)
