-/* Copyright (C) 1999 Free Software Foundation
+/* Copyright (C) 1999, 2000, 2003, 2006 Free Software Foundation
This file is part of libgcj.
/** True if a surrogate pair should be emitted as a single UTF8 sequence.
* Otherwise, a surrogate pair is treated as two separate characters.
* Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
- public boolean standardUTF8;
+ public boolean standardUTF8 = true;
// Saves the previous char if it was a high-surrogate.
char hi_part;
int avail = buf.length - count;
for (;;)
{
- if (avail == 0 || (inlength == 0 && bytes_todo == 0))
+ if (avail == 0 || (inlength == 0 && bytes_todo == 0 && hi_part == 0))
break;
// The algorithm is made more complicated because we want to write
// at least one byte in the output buffer, if there is room for
while (bytes_todo > 0 && avail > 0);
continue;
}
+
+ // Handle a high surrogate at the end of the input stream.
+ if (inlength == 0 && hi_part != 0)
+ {
+ buf[count++] = (byte) (0xE0 | (hi_part >> 12));
+ value = hi_part;
+ hi_part = 0;
+ avail--;
+ bytes_todo = 2;
+ continue;
+ }
+
char ch = inbuffer[inpos++];
inlength--;
- if (ch < 128 && (ch != 0 || standardUTF8))
+
+ if (hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
+ {
+ // If the previous character was a high surrogate, and we
+ // don't now have a low surrogate, we print the high
+ // surrogate as an isolated character.
+ --inpos;
+ ++inlength;
+ buf[count++] = (byte) (0xE0 | (hi_part >> 12));
+ value = hi_part;
+ hi_part = 0;
+ avail--;
+ bytes_todo = 2;
+ }
+ else if (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF)
+ {
+ // If this character is a low surrogate and we didn't
+ // previously see a high surrogate, we do the same thing
+ // as above.
+ buf[count++] = (byte) (0xE0 | (ch >> 12));
+ value = ch;
+ avail--;
+ bytes_todo = 2;
+ }
+ else if (ch < 128 && (ch != 0 || standardUTF8))
{
avail--;
buf[count++] = (byte) ch;
{
if (ch <= 0xDBFF) // High surrogates
{
- // The first byte is (0xF0 | value>>18), where value is the
- // Unicode scalar value of the combine character - which
- // we may not know yet. But from substituting:
- // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000,
- // hi==ch, and cancelling we get:
- buf[count++] = (byte) (0xF0 | ((ch-0xD800) >> 8));
- avail--;
+ // Just save the high surrogate until the next
+ // character comes along.
hi_part = ch;
}
else // Low surrogates
{
value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
+ buf[count++] = (byte) (0xF0 | (value >> 18));
+ avail--;
bytes_todo = 3;
+ hi_part = 0;
}
}
else
}
return inpos - start_pos;
}
+
+ public boolean havePendingBytes() // Reports whether this encoder still holds state that has not been written out.
+ {
+ return bytes_todo > 0 || hi_part != 0; // bytes_todo > 0: trailing bytes of a started sequence remain; hi_part != 0: a high surrogate is saved awaiting the next char.
+ }
+
}