[bitc-dev] Code for bitc_vector_string

Sam Mason sam at samason.me.uk
Mon Jul 3 20:27:39 EDT 2006


On Mon, Jul 03, 2006 at 08:30:20PM +0100, David Hopwood wrote:
> Only code points up to 0x10FFFF (and therefore only up to 4-byte UTF-8
> character encodings) are valid; also the code points reserved for UTF-16
> surrogates are not valid in UTF-8.

I'd like to be able to claim innocence and just say that I was copying
what was there already (mainly in libbitc/stdio.c), but that's probably
not a useful way of fixing the code, so I'll try to get both my code and
the existing code doing the right thing.

> Please see the conformance requirements in chapter 3 of
> <http://www.unicode.org/versions/Unicode4.1.0/>, particularly C12a.

I've never looked at the Unicode standard before, so I may well be
reading it wrong, but what I see associated with C12a appears to relate
to the handling of ill-formed "code unit sequences".  D36 of the same
chapter seems to document the valid byte sequences; I would interpret
Table 3-5 and the accompanying text as saying that five- and six-byte
sequences are invalid.


I've reworked my code and the existing code to accept only one- to
four-byte sequences, and added an exception to make sure we only write
out UTF-8 for characters that fit in such sequences.  I should probably
put some better checking in the two decoding routines so they validate
their input more thoroughly, but it's late!

  Sam
-------------- next part --------------
diff -rN -u old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/stdio.c new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/stdio.c
--- old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/stdio.c	2006-07-04 01:22:20.000000000 +0100
+++ new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/stdio.c	2006-07-04 01:22:21.000000000 +0100
@@ -141,29 +141,6 @@
       + (encoded[2]-128)*64
       + (encoded[3]-128);
   }
-  else if (encoded[0] <= 251) {
-    result = fread(&encoded[1], 4, 1, ios->f);
-    if (result != 1)
-      bitc_throw(&val_ExAtEOF);
-    ucs4 = 
-      (encoded[0] - 248)*16777216 
-      + (encoded[1]-128)*262144
-      + (encoded[2]-128)*4096
-      + (encoded[3]-128)*64
-      + (encoded[4]-128);
-  }
-  else if (encoded[0] <= 253) {
-    result = fread(&encoded[1], 5, 1, ios->f);
-    if (result != 1)
-      bitc_throw(&val_ExAtEOF);
-    ucs4 = 
-      (encoded[0] - 252)*1073741824
-      + (encoded[1]-128)*16777216 
-      + (encoded[2]-128)*262144
-      + (encoded[3]-128)*4096
-      + (encoded[4]-128)*64
-      + (encoded[5]-128);
-  }
   else
     bitc_throw(&val_ExNotUTF8);
 
@@ -179,7 +156,7 @@
   ssize_t result;
   fix_stdio_stream(ios);
 
-  bitc_uns8_t encoded[6];
+  bitc_uns8_t encoded[4];
   bitc_uns8_t *utf8 = encoded;
 
   if (ucs4 <= 0x7f) {
@@ -200,21 +177,9 @@
     *utf8++ = 128u + ((ucs4 / 64) % 64);
     *utf8++ = 128u + (ucs4 % 64);
   }
-  else if (ucs4 <= 0x3ffffff) {
-    *utf8++ = 248u + (ucs4 / 16777216);
-    *utf8++ = 128u + ((ucs4 / 262144) % 64);
-    *utf8++ = 128u + ((ucs4 / 4096) % 64);
-    *utf8++ = 128u + ((ucs4 / 64) % 64);
-    *utf8++ = 128u + (ucs4 % 64);
-  }
-  else if (ucs4 <= 0x7fffffff) {
-    *utf8++ = 252u + (ucs4 / 1073741824);
-    *utf8++ = 128u + ((ucs4 / 16777216) % 64);
-    *utf8++ = 128u + ((ucs4 / 262144) % 64);
-    *utf8++ = 128u + ((ucs4 / 4096) % 64);
-    *utf8++ = 128u + ((ucs4 / 64) % 64);
-    *utf8++ = 128u + (ucs4 % 64);
-  }
+  else
+    bitc_throw(&val_ExNotUnicodeChar);
+
 
   result = fwrite(encoded, utf8-encoded, 1, ios->f);
 
diff -rN -u old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/string.c new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/string.c
--- old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/string.c	2006-07-04 01:22:20.000000000 +0100
+++ new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/string.c	2006-07-04 01:22:21.000000000 +0100
@@ -46,21 +46,8 @@
     ucs4 += (*sb++ - 128)*64;
     ucs4 += (*sb++ - 128);
   }
-  else if (*sb <= 251) {
-    ucs4 = (*sb++ - 192)*16777216;
-    ucs4 += (*sb++ - 128)*262144;
-    ucs4 += (*sb++ - 128)*4096;
-    ucs4 += (*sb++ - 128)*64;
-    ucs4 += (*sb++ - 128);
-  }
-  else if (*sb <= 253) {
-    ucs4 = (*sb++ - 192)*1073741824;
-    ucs4 += (*sb++ - 128)*16777216;
-    ucs4 += (*sb++ - 128)*262144;
-    ucs4 += (*sb++ - 128)*4096;
-    ucs4 += (*sb++ - 128)*64;
-    ucs4 += (*sb++ - 128);
-  }
+  else
+    bitc_throw(&val_ExNotUTF8);
 
   if (snext) *snext = (char *)sb;
   return ucs4;
@@ -98,18 +85,54 @@
 }
 DEFCLOSURE(bitc_string_nth);
 
-#if 0
 bitc_string_t *
-DEFUN(bitc_vector_string, arg0_bitc_vector_string vec)
+DEFUN(bitc_vector_string, arg_0_bitc_vector_string vec)
 {
-  size_t len = vec->strlen(s);
+  bitc_word_t   len  = vec->len;
+  bitc_char_t * ucs4 = vec->elem;
+  
+  bitc_word_t utf8len = 0;
+  for (bitc_word_t i = vec->len-1; i >= 0; i--) {
+    if (ucs4[i] <= 0x7f) {
+      utf8len += 1;
+    } else if (ucs4[i] <= 0x7ff) {
+      utf8len += 2;
+    } else if (ucs4[i] <= 0xffff) {
+      utf8len += 3;
+    } else if (ucs4[i] <= 0x1fffff) {
+      utf8len += 4;
+    } else
+      bitc_throw (&val_ExNotUnicodeChar);
+  }
+  char * s = GC_ALLOC(utf8len);
+  char * utf8 = s;
+  for (bitc_word_t i = 0; i < vec->len-1; i++) {
+    if (ucs4[i] <= 0x7f) {
+      *utf8++ = ucs4[i];
+    }
+    else if (ucs4[i] <= 0x7ff) {
+      *utf8++ = 192u + (ucs4[i] / 64);
+      *utf8++ = 128u + (ucs4[i] % 64);
+    }
+    else if (ucs4[i] <= 0xffff) {
+      *utf8++ = 224u + (ucs4[i] / 4096);
+      *utf8++ = 128u + ((ucs4[i] / 64) % 64);
+      *utf8++ = 128u + (ucs4[i] % 64);
+    }
+    else if (ucs4[i] <= 0x1fffff) {
+      *utf8++ = 240u + (ucs4[i] / 262144);
+      *utf8++ = 128u + ((ucs4[i] / 4096) % 64);
+      *utf8++ = 128u + ((ucs4[i] / 64) % 64);
+      *utf8++ = 128u + (ucs4[i] % 64);
+    }
+  }
   bitc_string_t *tmp = (bitc_string_t *) 
     GC_ALLOC_ATOMIC(sizeof(bitc_string_t));
   tmp->length = len;
   tmp->s = s;
   return tmp;
 }
-#endif
+DEFCLOSURE(bitc_vector_string);
 
 #if 0
 bitc_unit_t
diff -rN -u old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/stdio.bitc new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/stdio.bitc
--- old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/stdio.bitc	2006-07-04 01:22:20.000000000 +0100
+++ new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/stdio.bitc	2006-07-04 01:22:21.000000000 +0100
@@ -9,11 +9,9 @@
   ;; a write is attempted on a read-only stream.
   (proclaim NoPermission : exception external ExNoPermission)
   (proclaim AtEOF : exception external ExAtEOF)
-  (proclaim NotUTF8 : exception external ExNotUTF8)
 
   (defexception NoPermission)
   (defexception AtEOF)
-  (defexception NotUTF8)
 
   ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   ;;
diff -rN -u old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/string.bitc new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/string.bitc
--- old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/string.bitc	2006-07-04 01:22:20.000000000 +0100
+++ new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/string.bitc	2006-07-04 01:22:21.000000000 +0100
@@ -2,6 +2,12 @@
   (import lst bitc.list)
   (import vec bitc.vector)
 
+  (proclaim NotUTF8 :        exception external ExNotUTF8)
+  (proclaim NotUnicodeChar : exception external ExNotUnicodeChar)
+
+  (defexception NotUTF8)
+  (defexception NotUnicodeChar)
+
   (proclaim string-length : (fn (string) word) external bitc_string_length)
   (proclaim string-nth :    (fn (string word) char) external bitc_string_nth)
 



More information about the bitc-dev mailing list