* gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of surrogate characters.

author tromey <tromey@138bc75d-0d04-0410-961f-82ee72b054a4>

Tue, 8 Aug 2000 17:35:32 +0000 (17:35 +0000)

committer tromey <tromey@138bc75d-0d04-0410-961f-82ee72b054a4>

Tue, 8 Aug 2000 17:35:32 +0000 (17:35 +0000)
author tromey <tromey@138bc75d-0d04-0410-961f-82ee72b054a4>
Tue, 8 Aug 2000 17:35:32 +0000 (17:35 +0000)
committer tromey <tromey@138bc75d-0d04-0410-961f-82ee72b054a4>
Tue, 8 Aug 2000 17:35:32 +0000 (17:35 +0000)
diff --git a/libjava/ChangeLog b/libjava/ChangeLog

index d586919..12e4dca 100644 (file)
--- a/libjava/ChangeLog
+++ b/libjava/ChangeLog
@@ -1,3 +1,11 @@
+2000-08-08  Tom Tromey  <tromey@cygnus.com>
+
+       * gnu/gcj/convert/Input_UTF8.java (read): Fixed handling of
+       surrogate characters.
+       * gnu/gcj/convert/Output_UTF8.java (standardUTF8): Default to
+       true.
+       (write): Correct handling of surrogate characters.
+
  2000-08-07  Tom Tromey  <tromey@cygnus.com>
  
         * java/lang/reflect/Method.java (hashCode): Use getName().
diff --git a/libjava/gnu/gcj/convert/Input_UTF8.java b/libjava/gnu/gcj/convert/Input_UTF8.java

index f76f282..433a0d1 100644 (file)
--- a/libjava/gnu/gcj/convert/Input_UTF8.java
+++ b/libjava/gnu/gcj/convert/Input_UTF8.java
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999  Free Software Foundation
+/* Copyright (C) 1999, 2000  Free Software Foundation
  
     This file is part of libgcj.
  
@@ -56,10 +56,11 @@ public class Input_UTF8 extends BytesToUnicode
                         // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400.
                         // The definition lo>=0xDC00 && lo<=0xDFFF implies
                         // that (lo-0xDC00)>>6 is in the range 0..15.
-                       // Hence we can infer (partial-0x400)>>4 == (hi-0xDB00)
-                       // and we can emit the high-surrogate without waiting
-                       // for the final byte:
-                       outbuffer[outpos++] = (char) (0xDA00+(partial>>4));
+                       // Hence we can solve for `hi' and we can emit
+                       // the high-surrogate without waiting for the
+                       // final byte:
+                       outbuffer[outpos++]
+                         = (char) (0xD800 + ((partial - 0x400) >> 4));
  
                         // Now we want to set it up so that when we read
                         // the final byte on the next iteration, we will
diff --git a/libjava/gnu/gcj/convert/Output_UTF8.java b/libjava/gnu/gcj/convert/Output_UTF8.java

index 7fb5910..01f5ce8 100644 (file)
--- a/libjava/gnu/gcj/convert/Output_UTF8.java
+++ b/libjava/gnu/gcj/convert/Output_UTF8.java
@@ -1,4 +1,4 @@
-/* Copyright (C) 1999  Free Software Foundation
+/* Copyright (C) 1999, 2000  Free Software Foundation
  
     This file is part of libgcj.
  
@@ -21,7 +21,7 @@ public class Output_UTF8 extends UnicodeToBytes
    /** True if a surrogate pair should be emitted as a single UTF8 sequence.
     * Otherwise, a surrogate pair is treated as two separate characters.
     * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
-  public boolean standardUTF8;
+  public boolean standardUTF8 = true;
  
    // Saves the previous char if it was a high-surrogate.
    char hi_part;
@@ -60,9 +60,27 @@ public class Output_UTF8 extends UnicodeToBytes
             while (bytes_todo > 0 && avail > 0);
             continue;
           }
+
         char ch = inbuffer[inpos++];
         inlength--;
-       if (ch < 128 && (ch != 0 || standardUTF8))
+
+       if ((hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
+           || (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))
+         {
+           // If the previous character was a high surrogate, and we
+           // don't now have a low surrogate, we print the high
+           // surrogate as an isolated character.  If this character
+           // is a low surrogate and we didn't previously see a high
+           // surrogate, we do the same thing.
+           --inpos;
+           ++inlength;
+           buf[count++] = (byte) (0xE0 | (hi_part >> 12));
+           value = hi_part;
+           hi_part = 0;
+           avail--;
+           bytes_todo = 2;
+         }
+       else if (ch < 128 && (ch != 0 || standardUTF8))
           {
             avail--;
             buf[count++] = (byte) ch;
@@ -78,19 +96,16 @@ public class Output_UTF8 extends UnicodeToBytes
           {
             if (ch <= 0xDBFF)  // High surrogates
               {
-               // The first byte is (0xF0 | value>>18), where value is the
-               // Unicode scalar value of the combine character - which
-               // we may not know yet.  But from substituting:
-               // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
-               // hi==ch, and cancelling we get:
-               buf[count++] = (byte) (0xF0 | ((ch-0xD800) >> 8));
-               avail--;
+               // Just save the high surrogate until the next
+               // character comes along.
                 hi_part = ch;
               }
             else // Low surrogates
               {
                 value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
+               buf[count++] = (byte) (0xF0 | (value >> 18));
                 bytes_todo = 3;
+               hi_part = 0;
               }
           }
         else
author	tromey <tromey@138bc75d-0d04-0410-961f-82ee72b054a4>
	Tue, 8 Aug 2000 17:35:32 +0000 (17:35 +0000)
committer	tromey <tromey@138bc75d-0d04-0410-961f-82ee72b054a4>
	Tue, 8 Aug 2000 17:35:32 +0000 (17:35 +0000)
libjava/ChangeLog		patch \| blob \| history
libjava/gnu/gcj/convert/Input_UTF8.java		patch \| blob \| history
libjava/gnu/gcj/convert/Output_UTF8.java		patch \| blob \| history