1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- G N A T . D E C O D E _ S T R I N G --
9 -- Copyright (C) 2007-2008, AdaCore --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 2, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License --
17 -- for more details. You should have received a copy of the GNU General --
18 -- Public License distributed with GNAT; see file COPYING. If not, write --
19 -- to the Free Software Foundation, 51 Franklin Street, Fifth Floor, --
20 -- Boston, MA 02110-1301, USA. --
22 -- As a special exception, if other files instantiate generics from this --
23 -- unit, or you link this unit with other files to produce an executable, --
24 -- this unit does not by itself cause the resulting executable to be --
25 -- covered by the GNU General Public License. This exception does not --
26 -- however invalidate any other reasons why the executable file might be --
27 -- covered by the GNU Public License. --
29 -- GNAT was originally developed by the GNAT team at New York University. --
30 -- Extensive contributions were provided by Ada Core Technologies Inc. --
32 ------------------------------------------------------------------------------
34 -- This package provides a utility routine for converting from an encoded
35 -- string to a corresponding Wide_String or Wide_Wide_String value.
37 with Interfaces; use Interfaces;
39 with System.WCh_Cnv; use System.WCh_Cnv;
40 with System.WCh_Con; use System.WCh_Con;
42 package body GNAT.Decode_String is
44 -----------------------
45 -- Local Subprograms --
46 -----------------------
49 pragma No_Return (Bad);
50 -- Raise error for bad encoding
53 pragma No_Return (Past_End);
54 -- Raise error for off end of string
62 raise Constraint_Error with
63 "bad encoding or character out of range";
66 ---------------------------
67 -- Decode_Wide_Character --
68 ---------------------------
70 procedure Decode_Wide_Character
73 Result : out Wide_Character)
75 Char : Wide_Wide_Character;
77 Decode_Wide_Wide_Character (Input, Ptr, Char);
79 if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
82 Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
84 end Decode_Wide_Character;
86 ------------------------
87 -- Decode_Wide_String --
88 ------------------------
90 function Decode_Wide_String (S : String) return Wide_String is
91 Result : Wide_String (1 .. S'Length);
94 Decode_Wide_String (S, Result, Length);
95 return Result (1 .. Length);
96 end Decode_Wide_String;
98 procedure Decode_Wide_String
100 Result : out Wide_String;
101 Length : out Natural)
108 while Ptr <= S'Last loop
109 if Length >= Result'Last then
113 Length := Length + 1;
114 Decode_Wide_Character (S, Ptr, Result (Length));
116 end Decode_Wide_String;
118 --------------------------------
119 -- Decode_Wide_Wide_Character --
120 --------------------------------
122 procedure Decode_Wide_Wide_Character
124 Ptr : in out Natural;
125 Result : out Wide_Wide_Character)
129 function In_Char return Character;
130 pragma Inline (In_Char);
131 -- Function to get one input character
137 function In_Char return Character is
139 if Ptr <= Input'Last then
141 return Input (Ptr - 1);
147 -- Start of processing for Decode_Wide_Wide_Character
152 -- Special fast processing for UTF-8 case
154 if Encoding_Method = WCEM_UTF8 then
159 procedure Get_UTF_Byte;
160 pragma Inline (Get_UTF_Byte);
161 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
162 -- Reads a byte, and raises CE if the first two bits are not 10.
163 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
169 procedure Get_UTF_Byte is
171 U := Unsigned_32 (Character'Pos (In_Char));
173 if (U and 2#11000000#) /= 2#10_000000# then
177 W := Shift_Left (W, 6) or (U and 2#00111111#);
180 -- Start of processing for UTF8 case
183 -- Note: for details of UTF8 encoding see RFC 3629
185 U := Unsigned_32 (Character'Pos (C));
187 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
189 if (U and 2#10000000#) = 2#00000000# then
190 Result := Wide_Wide_Character'Val (Character'Pos (C));
192 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
194 elsif (U and 2#11100000#) = 2#110_00000# then
195 W := U and 2#00011111#;
197 Result := Wide_Wide_Character'Val (W);
199 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
201 elsif (U and 2#11110000#) = 2#1110_0000# then
202 W := U and 2#00001111#;
205 Result := Wide_Wide_Character'Val (W);
207 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
209 elsif (U and 2#11111000#) = 2#11110_000# then
210 W := U and 2#00000111#;
216 Result := Wide_Wide_Character'Val (W);
218 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
221 elsif (U and 2#11111100#) = 2#111110_00# then
222 W := U and 2#00000011#;
228 Result := Wide_Wide_Character'Val (W);
230 -- All other cases are invalid, note that this includes:
232 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
233 -- 10xxxxxx 10xxxxxx 10xxxxxx
235 -- since Wide_Wide_Character does not include code values
236 -- greater than 16#03FF_FFFF#.
243 -- All encoding functions other than UTF-8
247 function Char_Sequence_To_UTF is
248 new Char_Sequence_To_UTF_32 (In_Char);
251 -- For brackets, must test for specific case of [ not followed by
252 -- quotation, where we must not call Char_Sequence_To_UTF, but
253 -- instead just return the bracket unchanged.
255 if Encoding_Method = WCEM_Brackets
257 and then (Ptr > Input'Last or else Input (Ptr) /= '"')
261 -- All other cases including [" with Brackets
265 Wide_Wide_Character'Val
266 (Char_Sequence_To_UTF (C, Encoding_Method));
270 end Decode_Wide_Wide_Character;
272 -----------------------------
273 -- Decode_Wide_Wide_String --
274 -----------------------------
276 function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
277 Result : Wide_Wide_String (1 .. S'Length);
280 Decode_Wide_Wide_String (S, Result, Length);
281 return Result (1 .. Length);
282 end Decode_Wide_Wide_String;
284 procedure Decode_Wide_Wide_String
286 Result : out Wide_Wide_String;
287 Length : out Natural)
294 while Ptr <= S'Last loop
295 if Length >= Result'Last then
299 Length := Length + 1;
300 Decode_Wide_Wide_Character (S, Ptr, Result (Length));
302 end Decode_Wide_Wide_String;
304 -------------------------
305 -- Next_Wide_Character --
306 -------------------------
308 procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
310 if Ptr < Input'First then
314 -- Special efficient encoding for UTF-8 case
316 if Encoding_Method = WCEM_UTF8 then
321 pragma Inline (Getc);
322 -- Gets the character at Input (Ptr) and returns code in U as
323 -- Unsigned_32 value. On return Ptr is bumped past the character.
325 procedure Skip_UTF_Byte;
326 pragma Inline (Skip_UTF_Byte);
327 -- Skips past one encoded byte which must be 2#10xxxxxx#
335 if Ptr > Input'Last then
338 U := Unsigned_32 (Character'Pos (Input (Ptr)));
347 procedure Skip_UTF_Byte is
351 if (U and 2#11000000#) /= 2#10_000000# then
356 -- Start of processing for UTF-8 case
359 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
363 if (U and 2#10000000#) = 2#00000000# then
366 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
368 elsif (U and 2#11100000#) = 2#110_00000# then
371 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
373 elsif (U and 2#11110000#) = 2#1110_0000# then
377 -- Any other code is invalid, note that this includes:
379 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
381 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
384 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
385 -- 10xxxxxx 10xxxxxx 10xxxxxx
387 -- since Wide_Character does not allow codes > 16#FFFF#
398 Discard : Wide_Character;
400 Decode_Wide_Character (Input, Ptr, Discard);
403 end Next_Wide_Character;
405 ------------------------------
406 -- Next_Wide_Wide_Character --
407 ------------------------------
409 procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
411 -- Special efficient encoding for UTF-8 case
413 if Encoding_Method = WCEM_UTF8 then
418 pragma Inline (Getc);
419 -- Gets the character at Input (Ptr) and returns code in U as
420 -- Unsigned_32 value. On return Ptr is bumped past the character.
422 procedure Skip_UTF_Byte;
423 pragma Inline (Skip_UTF_Byte);
424 -- Skips past one encoded byte which must be 2#10xxxxxx#
432 if Ptr > Input'Last then
435 U := Unsigned_32 (Character'Pos (Input (Ptr)));
444 procedure Skip_UTF_Byte is
448 if (U and 2#11000000#) /= 2#10_000000# then
453 -- Start of processing for UTF-8 case
456 if Ptr < Input'First then
460 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
464 if (U and 2#10000000#) = 2#00000000# then
467 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
469 elsif (U and 2#11100000#) = 2#110_00000# then
472 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
474 elsif (U and 2#11110000#) = 2#1110_0000# then
478 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
480 elsif (U and 2#11111000#) = 2#11110_000# then
485 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
488 elsif (U and 2#11111100#) = 2#111110_00# then
493 -- Any other code is invalid, note that this includes:
495 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
496 -- 10xxxxxx 10xxxxxx 10xxxxxx
498 -- since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
509 Discard : Wide_Wide_Character;
511 Decode_Wide_Wide_Character (Input, Ptr, Discard);
514 end Next_Wide_Wide_Character;
520 procedure Past_End is
522 raise Constraint_Error with "past end of string";
525 -------------------------
526 -- Prev_Wide_Character --
527 -------------------------
529 procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
531 if Ptr > Input'Last + 1 then
535 -- Special efficient encoding for UTF-8 case
537 if Encoding_Method = WCEM_UTF8 then
542 pragma Inline (Getc);
543 -- Gets the character at Input (Ptr - 1) and returns code in U as
544 -- Unsigned_32 value. On return Ptr is decremented by one.
546 procedure Skip_UTF_Byte;
547 pragma Inline (Skip_UTF_Byte);
548 -- Checks that U is 2#10xxxxxx# and then calls Get
556 if Ptr <= Input'First then
560 U := Unsigned_32 (Character'Pos (Input (Ptr)));
568 procedure Skip_UTF_Byte is
570 if (U and 2#11000000#) = 2#10_000000# then
577 -- Start of processing for UTF-8 case
580 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
584 if (U and 2#10000000#) = 2#00000000# then
587 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
592 if (U and 2#11100000#) = 2#110_00000# then
595 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
600 if (U and 2#11110000#) = 2#1110_0000# then
603 -- Any other code is invalid, note that this includes:
605 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
608 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
612 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
616 -- since Wide_Character does not allow codes > 16#FFFF#
625 -- Special efficient encoding for brackets case
627 elsif Encoding_Method = WCEM_Brackets then
633 -- See if we have "] at end positions
635 if Ptr > Input'First + 1
636 and then Input (Ptr - 1) = ']'
637 and then Input (Ptr - 2) = '"'
641 -- Loop back looking for [" at start
643 while P >= Ptr - 10 loop
644 if P <= Input'First + 1 then
647 elsif Input (P - 1) = '"'
648 and then Input (P - 2) = '['
650 -- Found ["..."], scan forward to check it
654 Next_Wide_Character (Input, P);
656 -- OK if at original pointer, else error
669 -- Falling through loop means more than 8 chars between the
670 -- enclosing brackets (or simply a missing left bracket)
674 -- Here if no bracket sequence present
677 if Ptr = Input'First then
685 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
686 -- go to the start of the string and skip forwards till Ptr matches.
689 Non_UTF_Brackets : declare
690 Discard : Wide_Character;
703 Decode_Wide_Character (Input, PtrS, Discard);
709 elsif PtrS > Ptr then
715 when Constraint_Error =>
717 end Non_UTF_Brackets;
719 end Prev_Wide_Character;
721 ------------------------------
722 -- Prev_Wide_Wide_Character --
723 ------------------------------
725 procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
727 if Ptr > Input'Last + 1 then
731 -- Special efficient encoding for UTF-8 case
733 if Encoding_Method = WCEM_UTF8 then
738 pragma Inline (Getc);
739 -- Gets the character at Input (Ptr - 1) and returns code in U as
740 -- Unsigned_32 value. On return Ptr is decremented by one.
742 procedure Skip_UTF_Byte;
743 pragma Inline (Skip_UTF_Byte);
744 -- Checks that U is 2#10xxxxxx# and then calls Get
752 if Ptr <= Input'First then
756 U := Unsigned_32 (Character'Pos (Input (Ptr)));
764 procedure Skip_UTF_Byte is
766 if (U and 2#11000000#) = 2#10_000000# then
773 -- Start of processing for UTF-8 case
776 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
780 if (U and 2#10000000#) = 2#00000000# then
783 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
788 if (U and 2#11100000#) = 2#110_00000# then
791 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
796 if (U and 2#11110000#) = 2#1110_0000# then
799 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
805 if (U and 2#11111000#) = 2#11110_000# then
808 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
815 if (U and 2#11111100#) = 2#111110_00# then
818 -- Any other code is invalid, note that this includes:
820 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
824 -- since Wide_Wide_Character does not allow codes
825 -- greater than 16#03FF_FFFF#
836 -- Special efficient encoding for brackets case
838 elsif Encoding_Method = WCEM_Brackets then
844 -- See if we have "] at end positions
846 if Ptr > Input'First + 1
847 and then Input (Ptr - 1) = ']'
848 and then Input (Ptr - 2) = '"'
852 -- Loop back looking for [" at start
854 while P >= Ptr - 10 loop
855 if P <= Input'First + 1 then
858 elsif Input (P - 1) = '"'
859 and then Input (P - 2) = '['
861 -- Found ["..."], scan forward to check it
865 Next_Wide_Wide_Character (Input, P);
867 -- OK if at original pointer, else error
880 -- Falling through loop means more than 8 chars between the
881 -- enclosing brackets (or simply a missing left bracket)
885 -- Here if no bracket sequence present
888 if Ptr = Input'First then
896 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
897 -- go to the start of the string and skip forwards till Ptr matches.
900 Non_UTF8_Brackets : declare
901 Discard : Wide_Wide_Character;
914 Decode_Wide_Wide_Character (Input, PtrS, Discard);
920 elsif PtrS > Ptr then
926 when Constraint_Error =>
928 end Non_UTF8_Brackets;
930 end Prev_Wide_Wide_Character;
932 --------------------------
933 -- Validate_Wide_String --
934 --------------------------
936 function Validate_Wide_String (S : String) return Boolean is
941 while Ptr <= S'Last loop
942 Next_Wide_Character (S, Ptr);
948 when Constraint_Error =>
950 end Validate_Wide_String;
952 -------------------------------
953 -- Validate_Wide_Wide_String --
954 -------------------------------
956 function Validate_Wide_Wide_String (S : String) return Boolean is
961 while Ptr <= S'Last loop
962 Next_Wide_Wide_Character (S, Ptr);
968 when Constraint_Error =>
970 end Validate_Wide_Wide_String;
972 end GNAT.Decode_String;