1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- ADA.STRINGS.UTF_ENCODING.WIDE_ENCODING --
9 -- Copyright (C) 2010, Free Software Foundation, Inc. --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 2, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
17 -- for more details. You should have received a copy of the GNU General --
18 -- Public License distributed with GNAT; see file COPYING. If not, write --
19 -- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
20 -- Boston, MA 02110-1301, USA. --
22 -- As a special exception, if other files instantiate generics from this --
23 -- unit, or you link this unit with other files to produce an executable, --
24 -- this unit does not by itself cause the resulting executable to be --
25 -- covered by the GNU General Public License. This exception does not --
26 -- however invalidate any other reasons why the executable file might be --
27 -- covered by the GNU Public License. --
29 -- GNAT was originally developed by the GNAT team at New York University. --
30 -- Extensive contributions were provided by Ada Core Technologies Inc. --
32 ------------------------------------------------------------------------------
34 package body Ada.Strings.UTF_Encoding.Wide_Encoding is
41 -- Version to decode UTF-8/UTF-16BE/UTF-16LE input to Wide_String
45 Input_Scheme : Encoding_Scheme) return Wide_String
48 if Input_Scheme = UTF_8 then
51 return Decode (To_UTF_16 (Item, Input_Scheme));
55 -- Decode UTF-8 input to Wide_String
57 function Decode (Item : UTF_8_String) return Wide_String is
58 Result : Wide_String (1 .. Item'Length);
59 -- Result string (worst case is same length as input)
62 -- Length of result stored so far
70 procedure Get_Continuation;
71 -- Reads a continuation byte of the form 10xxxxxx, shifts R left
72 -- by 6 bits, and or's in the xxxxxx to the low order 6 bits. On
73 -- return Ptr is incremented. Raises exceptioon if continuation
74 -- byte does not exist or is invalid.
76 ----------------------
77 -- Get_Continuation --
78 ----------------------
80 procedure Get_Continuation is
82 if Iptr > Item'Last then
83 Raise_Encoding_Error (Iptr - 1);
86 C := To_Unsigned_8 (Item (Iptr));
89 if C not in 2#10_000000# .. 2#10_111111# then
90 Raise_Encoding_Error (Iptr - 1);
92 R := Shift_Left (R, 6) or Unsigned_16 (C and 2#00_111111#);
97 -- Start of processing for Decode
105 and then Item (Iptr .. Iptr + 2) = BOM_8
111 elsif Item'Length >= 2
112 and then (Item (Iptr .. Iptr + 1) = BOM_16BE
114 Item (Iptr .. Iptr + 1) = BOM_16LE)
116 Raise_Encoding_Error (Iptr);
119 while Iptr <= Item'Last loop
120 C := To_Unsigned_8 (Item (Iptr));
123 -- Codes in the range 16#00# - 16#7F# are represented as
127 R := Unsigned_16 (C);
129 -- No initial code can be of the form 10xxxxxx. Such codes are used
130 -- only for continuations.
132 elsif C <= 2#10_111111# then
133 Raise_Encoding_Error (Iptr - 1);
135 -- Codes in the range 16#80# - 16#7FF# are represented as
138 elsif C <= 2#110_11111# then
139 R := Unsigned_16 (C and 2#000_11111#);
142 -- Codes in the range 16#800# - 16#FFFF# are represented as
143 -- 1110yyyy 10yyyyxx 10xxxxxx
145 elsif C <= 2#1110_1111# then
146 R := Unsigned_16 (C and 2#0000_1111#);
150 -- Codes in the range 16#10000# - 16#10FFFF# are represented as
151 -- 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
153 -- Such codes are out of range for Wide_String output
156 Raise_Encoding_Error (Iptr - 1);
160 Result (Len) := Wide_Character'Val (R);
163 return Result (1 .. Len);
166 -- Decode UTF-16 input to Wide_String
168 function Decode (Item : UTF_16_Wide_String) return Wide_String is
169 Result : Wide_String (1 .. Item'Length);
170 -- Result is same length as input (possibly minus 1 if BOM present)
176 -- Index of next Item element
181 -- Skip UTF-16 BOM at start
185 if Item'Length > 0 and then Item (Iptr) = BOM_16 (1) then
189 -- Loop through input characters
191 while Iptr <= Item'Last loop
192 C := To_Unsigned_16 (Item (Iptr));
195 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD#
196 -- represent their own value.
198 if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
200 Result (Len) := Wide_Character'Val (C);
202 -- Codes in the range 16#D800#..16#DBFF# represent the first of the
203 -- two surrogates used to encode the range 16#01_000#..16#10_FFFF".
204 -- Such codes are out of range for 16-bit output.
206 -- The case of input in the range 16#DC00#..16#DFFF# must never
207 -- occur, since it means we have a second surrogate character with
208 -- no corresponding first surrogate.
210 -- Codes in the range 16#FFFE# .. 16#FFFF# are also invalid since
211 -- they conflict with codes used for BOM values.
213 -- Thus all remaining codes are invalid
216 Raise_Encoding_Error (Iptr - 1);
220 return Result (1 .. Len);
227 -- Encode Wide_String in UTF-8, UTF-16BE or UTF-16LE
231 Output_Scheme : Encoding_Scheme;
232 Output_BOM : Boolean := False) return UTF_String
237 if Output_Scheme = UTF_8 then
238 return Encode (Item, Output_BOM);
240 -- Case of UTF_16LE or UTF_16BE, use UTF-16 intermediary
243 return From_UTF_16 (UTF_16_Wide_String'(Encode (Item)),
244 Output_Scheme, Output_BOM);
248 -- Encode Wide_String in UTF-8
252 Output_BOM : Boolean := False) return UTF_8_String
254 Result : UTF_8_String (1 .. 3 * Item'Length + 3);
255 -- Worst case is three bytes per input byte + space for BOM
258 -- Number of output codes stored in Result
261 -- Single input character
263 procedure Store (C : Unsigned_16);
264 pragma Inline (Store);
265 -- Store one output code, C is in the range 0 .. 255
271 procedure Store (C : Unsigned_16) is
274 Result (Len) := Character'Val (C);
277 -- Start of processing for UTF8_Encode
280 -- Output BOM if required
283 Result (1 .. 3) := BOM_8;
289 -- Loop through characters of input
291 for J in Item'Range loop
292 C := To_Unsigned_16 (Item (J));
294 -- Codes in the range 16#00# - 16#7F# are represented as
300 -- Codes in the range 16#80# - 16#7FF# are represented as
303 elsif C <= 16#7FF# then
304 Store (2#110_00000# or Shift_Right (C, 6));
305 Store (2#10_000000# or (C and 2#00_111111#));
307 -- Codes in the range 16#800# - 16#FFFF# are represented as
308 -- 1110yyyy 10yyyyxx 10xxxxxx
311 Store (2#1110_0000# or Shift_Right (C, 12));
312 Store (2#10_000000# or
313 Shift_Right (C and 2#111111_000000#, 6));
314 Store (2#10_000000# or (C and 2#00_111111#));
318 return Result (1 .. Len);
321 -- Encode Wide_String in UTF-16
325 Output_BOM : Boolean := False) return UTF_16_Wide_String
327 Result : Wide_String (1 .. Item'Length + Boolean'Pos (Output_BOM));
328 -- Output is same length as input + possible BOM
331 -- Length of output string
336 -- Output BOM if required
339 Result (1) := BOM_16 (1);
345 -- Loop through input characters encoding them
347 for Iptr in Item'Range loop
348 C := To_Unsigned_16 (Item (Iptr));
350 -- Codes in the range 16#0000#..16#D7FF# or 16#E000#..16#FFFD# are
353 if C <= 16#D7FF# or else C in 16#E000# .. 16#FFFD# then
355 Result (Len) := Wide_Character'Val (C);
357 -- Codes in tne range 16#D800#..16#DFFF# should never appear in the
358 -- input, since no valid Unicode characters are in this range (which
359 -- would conflict with the UTF-16 surrogate encodings). Similarly
360 -- codes in the range 16#FFFE#..16#FFFF conflict with BOM codes.
361 -- Thus all remaining codes are illegal.
364 Raise_Encoding_Error (Iptr);
371 end Ada.Strings.UTF_Encoding.Wide_Encoding;