1 ------------------------------------------------------------------------------
3 -- GNAT RUN-TIME COMPONENTS --
5 -- G N A T . D E C O D E _ S T R I N G --
9 -- Copyright (C) 2007-2010, AdaCore --
11 -- GNAT is free software; you can redistribute it and/or modify it under --
12 -- terms of the GNU General Public License as published by the Free Soft- --
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
19 -- additional permissions described in the GCC Runtime Library Exception, --
20 -- version 3.1, as published by the Free Software Foundation. --
22 -- You should have received a copy of the GNU General Public License and --
23 -- a copy of the GCC Runtime Library Exception along with this program; --
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
25 -- <http://www.gnu.org/licenses/>. --
27 -- GNAT was originally developed by the GNAT team at New York University. --
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
30 ------------------------------------------------------------------------------
32 -- This package provides a utility routine for converting from an encoded
33 -- string to a corresponding Wide_String or Wide_Wide_String value.
35 with Interfaces; use Interfaces;
37 with System.WCh_Cnv; use System.WCh_Cnv;
38 with System.WCh_Con; use System.WCh_Con;
40 package body GNAT.Decode_String is
42 -----------------------
43 -- Local Subprograms --
44 -----------------------
47 pragma No_Return (Bad);
48 -- Raise error for bad encoding
51 pragma No_Return (Past_End);
52 -- Raise error for off end of string
60 raise Constraint_Error with
61 "bad encoding or character out of range";
64 ---------------------------
65 -- Decode_Wide_Character --
66 ---------------------------
68 procedure Decode_Wide_Character
71 Result : out Wide_Character)
73 Char : Wide_Wide_Character;
75 Decode_Wide_Wide_Character (Input, Ptr, Char);
77 if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
80 Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
82 end Decode_Wide_Character;
84 ------------------------
85 -- Decode_Wide_String --
86 ------------------------
88 function Decode_Wide_String (S : String) return Wide_String is
89 Result : Wide_String (1 .. S'Length);
92 Decode_Wide_String (S, Result, Length);
93 return Result (1 .. Length);
94 end Decode_Wide_String;
96 procedure Decode_Wide_String
98 Result : out Wide_String;
106 while Ptr <= S'Last loop
107 if Length >= Result'Last then
111 Length := Length + 1;
112 Decode_Wide_Character (S, Ptr, Result (Length));
114 end Decode_Wide_String;
116 --------------------------------
117 -- Decode_Wide_Wide_Character --
118 --------------------------------
120 procedure Decode_Wide_Wide_Character
122 Ptr : in out Natural;
123 Result : out Wide_Wide_Character)
127 function In_Char return Character;
128 pragma Inline (In_Char);
129 -- Function to get one input character
135 function In_Char return Character is
137 if Ptr <= Input'Last then
139 return Input (Ptr - 1);
145 -- Start of processing for Decode_Wide_Wide_Character
150 -- Special fast processing for UTF-8 case
152 if Encoding_Method = WCEM_UTF8 then
157 procedure Get_UTF_Byte;
158 pragma Inline (Get_UTF_Byte);
159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
160 -- Reads a byte, and raises CE if the first two bits are not 10.
161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
167 procedure Get_UTF_Byte is
169 U := Unsigned_32 (Character'Pos (In_Char));
171 if (U and 2#11000000#) /= 2#10_000000# then
175 W := Shift_Left (W, 6) or (U and 2#00111111#);
178 -- Start of processing for UTF8 case
181 -- Note: for details of UTF8 encoding see RFC 3629
183 U := Unsigned_32 (Character'Pos (C));
185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
187 if (U and 2#10000000#) = 2#00000000# then
188 Result := Wide_Wide_Character'Val (Character'Pos (C));
190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
192 elsif (U and 2#11100000#) = 2#110_00000# then
193 W := U and 2#00011111#;
195 Result := Wide_Wide_Character'Val (W);
197 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
199 elsif (U and 2#11110000#) = 2#1110_0000# then
200 W := U and 2#00001111#;
203 Result := Wide_Wide_Character'Val (W);
205 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
207 elsif (U and 2#11111000#) = 2#11110_000# then
208 W := U and 2#00000111#;
214 Result := Wide_Wide_Character'Val (W);
216 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
219 elsif (U and 2#11111100#) = 2#111110_00# then
220 W := U and 2#00000011#;
226 Result := Wide_Wide_Character'Val (W);
228 -- All other cases are invalid, note that this includes:
230 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
231 -- 10xxxxxx 10xxxxxx 10xxxxxx
233 -- since Wide_Wide_Character does not include code values
234 -- greater than 16#03FF_FFFF#.
241 -- All encoding functions other than UTF-8
245 function Char_Sequence_To_UTF is
246 new Char_Sequence_To_UTF_32 (In_Char);
249 -- For brackets, must test for specific case of [ not followed by
250 -- quotation, where we must not call Char_Sequence_To_UTF, but
251 -- instead just return the bracket unchanged.
253 if Encoding_Method = WCEM_Brackets
255 and then (Ptr > Input'Last or else Input (Ptr) /= '"')
259 -- All other cases including [" with Brackets
263 Wide_Wide_Character'Val
264 (Char_Sequence_To_UTF (C, Encoding_Method));
268 end Decode_Wide_Wide_Character;
270 -----------------------------
271 -- Decode_Wide_Wide_String --
272 -----------------------------
274 function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
275 Result : Wide_Wide_String (1 .. S'Length);
278 Decode_Wide_Wide_String (S, Result, Length);
279 return Result (1 .. Length);
280 end Decode_Wide_Wide_String;
282 procedure Decode_Wide_Wide_String
284 Result : out Wide_Wide_String;
285 Length : out Natural)
292 while Ptr <= S'Last loop
293 if Length >= Result'Last then
297 Length := Length + 1;
298 Decode_Wide_Wide_Character (S, Ptr, Result (Length));
300 end Decode_Wide_Wide_String;
302 -------------------------
303 -- Next_Wide_Character --
304 -------------------------
306 procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
308 if Ptr < Input'First then
312 -- Special efficient encoding for UTF-8 case
314 if Encoding_Method = WCEM_UTF8 then
319 pragma Inline (Getc);
320 -- Gets the character at Input (Ptr) and returns code in U as
321 -- Unsigned_32 value. On return Ptr is bumped past the character.
323 procedure Skip_UTF_Byte;
324 pragma Inline (Skip_UTF_Byte);
325 -- Skips past one encoded byte which must be 2#10xxxxxx#
333 if Ptr > Input'Last then
336 U := Unsigned_32 (Character'Pos (Input (Ptr)));
345 procedure Skip_UTF_Byte is
349 if (U and 2#11000000#) /= 2#10_000000# then
354 -- Start of processing for UTF-8 case
357 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
361 if (U and 2#10000000#) = 2#00000000# then
364 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
366 elsif (U and 2#11100000#) = 2#110_00000# then
369 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
371 elsif (U and 2#11110000#) = 2#1110_0000# then
375 -- Any other code is invalid, note that this includes:
377 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
379 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
382 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
383 -- 10xxxxxx 10xxxxxx 10xxxxxx
385 -- since Wide_Character does not allow codes > 16#FFFF#
396 Discard : Wide_Character;
398 Decode_Wide_Character (Input, Ptr, Discard);
401 end Next_Wide_Character;
403 ------------------------------
404 -- Next_Wide_Wide_Character --
405 ------------------------------
407 procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
409 -- Special efficient encoding for UTF-8 case
411 if Encoding_Method = WCEM_UTF8 then
416 pragma Inline (Getc);
417 -- Gets the character at Input (Ptr) and returns code in U as
418 -- Unsigned_32 value. On return Ptr is bumped past the character.
420 procedure Skip_UTF_Byte;
421 pragma Inline (Skip_UTF_Byte);
422 -- Skips past one encoded byte which must be 2#10xxxxxx#
430 if Ptr > Input'Last then
433 U := Unsigned_32 (Character'Pos (Input (Ptr)));
442 procedure Skip_UTF_Byte is
446 if (U and 2#11000000#) /= 2#10_000000# then
451 -- Start of processing for UTF-8 case
454 if Ptr < Input'First then
458 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
462 if (U and 2#10000000#) = 2#00000000# then
465 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
467 elsif (U and 2#11100000#) = 2#110_00000# then
470 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
472 elsif (U and 2#11110000#) = 2#1110_0000# then
476 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
478 elsif (U and 2#11111000#) = 2#11110_000# then
483 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
486 elsif (U and 2#11111100#) = 2#111110_00# then
491 -- Any other code is invalid, note that this includes:
493 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
494 -- 10xxxxxx 10xxxxxx 10xxxxxx
496 -- since Wide_Wide_Character does not allow codes > 16#03FF_FFFF#
507 Discard : Wide_Wide_Character;
509 Decode_Wide_Wide_Character (Input, Ptr, Discard);
512 end Next_Wide_Wide_Character;
518 procedure Past_End is
520 raise Constraint_Error with "past end of string";
523 -------------------------
524 -- Prev_Wide_Character --
525 -------------------------
527 procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
529 if Ptr > Input'Last + 1 then
533 -- Special efficient encoding for UTF-8 case
535 if Encoding_Method = WCEM_UTF8 then
540 pragma Inline (Getc);
541 -- Gets the character at Input (Ptr - 1) and returns code in U as
542 -- Unsigned_32 value. On return Ptr is decremented by one.
544 procedure Skip_UTF_Byte;
545 pragma Inline (Skip_UTF_Byte);
546 -- Checks that U is 2#10xxxxxx# and then calls Get
554 if Ptr <= Input'First then
558 U := Unsigned_32 (Character'Pos (Input (Ptr)));
566 procedure Skip_UTF_Byte is
568 if (U and 2#11000000#) = 2#10_000000# then
575 -- Start of processing for UTF-8 case
578 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
582 if (U and 2#10000000#) = 2#00000000# then
585 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
590 if (U and 2#11100000#) = 2#110_00000# then
593 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
598 if (U and 2#11110000#) = 2#1110_0000# then
601 -- Any other code is invalid, note that this includes:
603 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
606 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
610 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
614 -- since Wide_Character does not allow codes > 16#FFFF#
623 -- Special efficient encoding for brackets case
625 elsif Encoding_Method = WCEM_Brackets then
631 -- See if we have "] at end positions
633 if Ptr > Input'First + 1
634 and then Input (Ptr - 1) = ']'
635 and then Input (Ptr - 2) = '"'
639 -- Loop back looking for [" at start
641 while P >= Ptr - 10 loop
642 if P <= Input'First + 1 then
645 elsif Input (P - 1) = '"'
646 and then Input (P - 2) = '['
648 -- Found ["..."], scan forward to check it
652 Next_Wide_Character (Input, P);
654 -- OK if at original pointer, else error
667 -- Falling through loop means more than 8 chars between the
668 -- enclosing brackets (or simply a missing left bracket)
672 -- Here if no bracket sequence present
675 if Ptr = Input'First then
683 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
684 -- go to the start of the string and skip forwards till Ptr matches.
687 Non_UTF_Brackets : declare
688 Discard : Wide_Character;
701 Decode_Wide_Character (Input, PtrS, Discard);
707 elsif PtrS > Ptr then
713 when Constraint_Error =>
715 end Non_UTF_Brackets;
717 end Prev_Wide_Character;
719 ------------------------------
720 -- Prev_Wide_Wide_Character --
721 ------------------------------
723 procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
725 if Ptr > Input'Last + 1 then
729 -- Special efficient encoding for UTF-8 case
731 if Encoding_Method = WCEM_UTF8 then
736 pragma Inline (Getc);
737 -- Gets the character at Input (Ptr - 1) and returns code in U as
738 -- Unsigned_32 value. On return Ptr is decremented by one.
740 procedure Skip_UTF_Byte;
741 pragma Inline (Skip_UTF_Byte);
742 -- Checks that U is 2#10xxxxxx# and then calls Get
750 if Ptr <= Input'First then
754 U := Unsigned_32 (Character'Pos (Input (Ptr)));
762 procedure Skip_UTF_Byte is
764 if (U and 2#11000000#) = 2#10_000000# then
771 -- Start of processing for UTF-8 case
774 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
778 if (U and 2#10000000#) = 2#00000000# then
781 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
786 if (U and 2#11100000#) = 2#110_00000# then
789 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
794 if (U and 2#11110000#) = 2#1110_0000# then
797 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
803 if (U and 2#11111000#) = 2#11110_000# then
806 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
813 if (U and 2#11111100#) = 2#111110_00# then
816 -- Any other code is invalid, note that this includes:
818 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
822 -- since Wide_Wide_Character does not allow codes
823 -- greater than 16#03FF_FFFF#
834 -- Special efficient encoding for brackets case
836 elsif Encoding_Method = WCEM_Brackets then
842 -- See if we have "] at end positions
844 if Ptr > Input'First + 1
845 and then Input (Ptr - 1) = ']'
846 and then Input (Ptr - 2) = '"'
850 -- Loop back looking for [" at start
852 while P >= Ptr - 10 loop
853 if P <= Input'First + 1 then
856 elsif Input (P - 1) = '"'
857 and then Input (P - 2) = '['
859 -- Found ["..."], scan forward to check it
863 Next_Wide_Wide_Character (Input, P);
865 -- OK if at original pointer, else error
878 -- Falling through loop means more than 8 chars between the
879 -- enclosing brackets (or simply a missing left bracket)
883 -- Here if no bracket sequence present
886 if Ptr = Input'First then
894 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
895 -- go to the start of the string and skip forwards till Ptr matches.
898 Non_UTF8_Brackets : declare
899 Discard : Wide_Wide_Character;
912 Decode_Wide_Wide_Character (Input, PtrS, Discard);
918 elsif PtrS > Ptr then
924 when Constraint_Error =>
926 end Non_UTF8_Brackets;
928 end Prev_Wide_Wide_Character;
930 --------------------------
931 -- Validate_Wide_String --
932 --------------------------
934 function Validate_Wide_String (S : String) return Boolean is
939 while Ptr <= S'Last loop
940 Next_Wide_Character (S, Ptr);
946 when Constraint_Error =>
948 end Validate_Wide_String;
950 -------------------------------
951 -- Validate_Wide_Wide_String --
952 -------------------------------
954 function Validate_Wide_Wide_String (S : String) return Boolean is
959 while Ptr <= S'Last loop
960 Next_Wide_Wide_Character (S, Ptr);
966 when Constraint_Error =>
968 end Validate_Wide_Wide_String;
970 end GNAT.Decode_String;