[bitc-dev] Code for bitc_vector_string
Sam Mason
sam at samason.me.uk
Mon Jul 3 20:27:39 EDT 2006
On Mon, Jul 03, 2006 at 08:30:20PM +0100, David Hopwood wrote:
> Only code points up to 0x10FFFF (and therefore only up to 4-byte UTF-8
> character encodings) are valid; also the code points reserved for UTF-16
> surrogates are not valid in UTF-8.
I'd like to be able to claim innocence and just say that I was copying
what was there already (mainly in libbitc/stdio.c), but that's probably
not a useful way of fixing the code so I'll try and get both of them
doing the right thing.
> Please see the conformance requirements in chapter 3 of
> <http://www.unicode.org/versions/Unicode4.1.0/>, particularly C12a.
I've never looked at the Unicode standard before so I may well be reading
it wrong; but what I see associated with C12a appears to relate to the
handling of ill-formed "code unit sequences". D36 of the same chapter
seems to document the valid byte sequences; I would interpret table
3-5 and accompanying text as saying that five- and six-byte sequences
are invalid.
I've rehashed my code and the existing code to only accept one- to
four-byte sequences, and added an exception to make sure we only
write out UTF-8 containing these characters. I should probably put
some better checking in the two decoding routines so they validate their
input better, but it's late!
Sam
-------------- next part --------------
diff -rN -u old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/stdio.c new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/stdio.c
--- old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/stdio.c 2006-07-04 01:22:20.000000000 +0100
+++ new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/stdio.c 2006-07-04 01:22:21.000000000 +0100
@@ -141,29 +141,6 @@
+ (encoded[2]-128)*64
+ (encoded[3]-128);
}
- else if (encoded[0] <= 251) {
- result = fread(&encoded[1], 4, 1, ios->f);
- if (result != 1)
- bitc_throw(&val_ExAtEOF);
- ucs4 =
- (encoded[0] - 248)*16777216
- + (encoded[1]-128)*262144
- + (encoded[2]-128)*4096
- + (encoded[3]-128)*64
- + (encoded[4]-128);
- }
- else if (encoded[0] <= 253) {
- result = fread(&encoded[1], 5, 1, ios->f);
- if (result != 1)
- bitc_throw(&val_ExAtEOF);
- ucs4 =
- (encoded[0] - 252)*1073741824
- + (encoded[1]-128)*16777216
- + (encoded[2]-128)*262144
- + (encoded[3]-128)*4096
- + (encoded[4]-128)*64
- + (encoded[5]-128);
- }
else
bitc_throw(&val_ExNotUTF8);
@@ -179,7 +156,7 @@
ssize_t result;
fix_stdio_stream(ios);
- bitc_uns8_t encoded[6];
+ bitc_uns8_t encoded[4];
bitc_uns8_t *utf8 = encoded;
if (ucs4 <= 0x7f) {
@@ -200,21 +177,9 @@
*utf8++ = 128u + ((ucs4 / 64) % 64);
*utf8++ = 128u + (ucs4 % 64);
}
- else if (ucs4 <= 0x3ffffff) {
- *utf8++ = 248u + (ucs4 / 16777216);
- *utf8++ = 128u + ((ucs4 / 262144) % 64);
- *utf8++ = 128u + ((ucs4 / 4096) % 64);
- *utf8++ = 128u + ((ucs4 / 64) % 64);
- *utf8++ = 128u + (ucs4 % 64);
- }
- else if (ucs4 <= 0x7fffffff) {
- *utf8++ = 252u + (ucs4 / 1073741824);
- *utf8++ = 128u + ((ucs4 / 16777216) % 64);
- *utf8++ = 128u + ((ucs4 / 262144) % 64);
- *utf8++ = 128u + ((ucs4 / 4096) % 64);
- *utf8++ = 128u + ((ucs4 / 64) % 64);
- *utf8++ = 128u + (ucs4 % 64);
- }
+ else
+ bitc_throw(&val_ExNotUnicodeChar);
+
result = fwrite(encoded, utf8-encoded, 1, ios->f);
diff -rN -u old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/string.c new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/string.c
--- old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/string.c 2006-07-04 01:22:20.000000000 +0100
+++ new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/libbitc/string.c 2006-07-04 01:22:21.000000000 +0100
@@ -46,21 +46,8 @@
ucs4 += (*sb++ - 128)*64;
ucs4 += (*sb++ - 128);
}
- else if (*sb <= 251) {
- ucs4 = (*sb++ - 192)*16777216;
- ucs4 += (*sb++ - 128)*262144;
- ucs4 += (*sb++ - 128)*4096;
- ucs4 += (*sb++ - 128)*64;
- ucs4 += (*sb++ - 128);
- }
- else if (*sb <= 253) {
- ucs4 = (*sb++ - 192)*1073741824;
- ucs4 += (*sb++ - 128)*16777216;
- ucs4 += (*sb++ - 128)*262144;
- ucs4 += (*sb++ - 128)*4096;
- ucs4 += (*sb++ - 128)*64;
- ucs4 += (*sb++ - 128);
- }
+ else
+ bitc_throw(&val_ExNotUTF8);
if (snext) *snext = (char *)sb;
return ucs4;
@@ -98,18 +85,54 @@
}
DEFCLOSURE(bitc_string_nth);
-#if 0
bitc_string_t *
-DEFUN(bitc_vector_string, arg0_bitc_vector_string vec)
+DEFUN(bitc_vector_string, arg_0_bitc_vector_string vec)
{
- size_t len = vec->strlen(s);
+ bitc_word_t len = vec->len;
+ bitc_char_t * ucs4 = vec->elem;
+
+ bitc_word_t utf8len = 0;
+ for (bitc_word_t i = vec->len-1; i >= 0; i--) {
+ if (ucs4[i] <= 0x7f) {
+ utf8len += 1;
+ } else if (ucs4[i] <= 0x7ff) {
+ utf8len += 2;
+ } else if (ucs4[i] <= 0xffff) {
+ utf8len += 3;
+ } else if (ucs4[i] <= 0x1fffff) {
+ utf8len += 4;
+ } else
+ bitc_throw (&val_ExNotUnicodeChar);
+ }
+ char * s = GC_ALLOC(utf8len);
+ char * utf8 = s;
+ for (bitc_word_t i = 0; i < vec->len-1; i++) {
+ if (ucs4[i] <= 0x7f) {
+ *utf8++ = ucs4[i];
+ }
+ else if (ucs4[i] <= 0x7ff) {
+ *utf8++ = 192u + (ucs4[i] / 64);
+ *utf8++ = 128u + (ucs4[i] % 64);
+ }
+ else if (ucs4[i] <= 0xffff) {
+ *utf8++ = 224u + (ucs4[i] / 4096);
+ *utf8++ = 128u + ((ucs4[i] / 64) % 64);
+ *utf8++ = 128u + (ucs4[i] % 64);
+ }
+ else if (ucs4[i] <= 0x1fffff) {
+ *utf8++ = 240u + (ucs4[i] / 262144);
+ *utf8++ = 128u + ((ucs4[i] / 4096) % 64);
+ *utf8++ = 128u + ((ucs4[i] / 64) % 64);
+ *utf8++ = 128u + (ucs4[i] % 64);
+ }
+ }
bitc_string_t *tmp = (bitc_string_t *)
GC_ALLOC_ATOMIC(sizeof(bitc_string_t));
tmp->length = len;
tmp->s = s;
return tmp;
}
-#endif
+DEFCLOSURE(bitc_vector_string);
#if 0
bitc_unit_t
diff -rN -u old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/stdio.bitc new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/stdio.bitc
--- old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/stdio.bitc 2006-07-04 01:22:20.000000000 +0100
+++ new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/stdio.bitc 2006-07-04 01:22:21.000000000 +0100
@@ -9,11 +9,9 @@
;; a write is attempted on a read-only stream.
(proclaim NoPermission : exception external ExNoPermission)
(proclaim AtEOF : exception external ExAtEOF)
- (proclaim NotUTF8 : exception external ExNotUTF8)
(defexception NoPermission)
(defexception AtEOF)
- (defexception NotUTF8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
diff -rN -u old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/string.bitc new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/string.bitc
--- old-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/string.bitc 2006-07-04 01:22:20.000000000 +0100
+++ new-coyotos/src.coyotos/src/ccs/bitcc-bootstrap/runtime/bitc/string.bitc 2006-07-04 01:22:21.000000000 +0100
@@ -2,6 +2,12 @@
(import lst bitc.list)
(import vec bitc.vector)
+ (proclaim NotUTF8 : exception external ExNotUTF8)
+ (proclaim NotUnicodeChar : exception external ExNotUnicodeChar)
+
+ (defexception NotUTF8)
+ (defexception NotUnicodeChar)
+
(proclaim string-length : (fn (string) word) external bitc_string_length)
(proclaim string-nth : (fn (string word) char) external bitc_string_nth)
More information about the bitc-dev
mailing list