2011-12-05 Bob Duff <duff@adacore.com>

[pf3gnuchains/gcc-fork.git] / gcc / ada / sinput.ads
diff --git a/gcc/ada/sinput.ads b/gcc/ada/sinput.ads

index c679e24..1d13f6e 100644 (file)
--- a/gcc/ada/sinput.ads
+++ b/gcc/ada/sinput.ads
@@ -6,7 +6,7 @@
  --                                                                          --
  --                                 S p e c                                  --
  --                                                                          --
---          Copyright (C) 1992-2009, Free Software Foundation, Inc.         --
+--          Copyright (C) 1992-2011, Free Software Foundation, Inc.         --
  --                                                                          --
  -- GNAT is free software;  you can  redistribute it  and/or modify it under --
  -- terms of the  GNU General Public License as published  by the Free Soft- --
@@ -43,7 +43,7 @@
  --    described in RM 2.2 (13). Any of the characters FF, LF, CR or VT or any
  --    wide character that is a Line or Paragraph Separator acts as an end of
  --    logical line in this sense, and it is essentially irrelevant whether one
---    or more appears in sequence (since if sequence of such characters is
+--    or more appears in sequence (since if a sequence of such characters is
  --    regarded as separate ends of line, then the intervening logical lines
  --    are null in any case).
  
@@ -451,6 +451,75 @@ package Sinput is
                             Internal_Source'Unrestricted_Access;
     --  Pointer to internal source buffer
  
+   -----------------------------------------
+   -- Handling of Source Line Terminators --
+   -----------------------------------------
+
+   --  In this section we discuss in detail the issue of terminators used to
+   --  terminate source lines. The RM says that one or more format effectors
+   --  (other than horizontal tab) end a source line, and defines the set of
+   --  such format effectors, but does not talk about exactly how they are
+   --  represented in the source program (since in general the RM is not in
+   --  the business of specifying source program formats).
+
+   --  The type Types.Line_Terminator is defined as a subtype of Character
+   --  that includes CR/LF/VT/FF. The most common line enders in practice
+   --  are CR (some MAC systems), LF (Unix systems), and CR/LF (DOS/Windows
+   --  systems). Any of these sequences is recognized as ending a physical
+   --  source line, and if multiple such terminators appear (e.g. LF/LF),
+   --  then we consider we have an extra blank line.
+
+   --  VT and FF are recognized as terminating source lines, but they are
+   --  considered to end a logical line instead of a physical line, so that
+   --  the line numbering ignores such terminators. The use of VT and FF is
+   --  mandated by the standard, and correctly handled in a conforming manner
+   --  by GNAT, but their use is not recommended.
+
+   --  In addition to the set of characters defined by the type in Types, in
+   --  wide character encoding, the codes returning True for a call to
+   --  System.UTF_32.Is_UTF_32_Line_Terminator are also recognized as ending a
+   --  source line. This includes the standard codes defined above in addition
+   --  to NEL (NEXT LINE), LINE SEPARATOR and PARAGRAPH SEPARATOR. Again, as in
+   --  the case of VT and FF, the standard requires we recognize these as line
+   --  terminators, but we consider them to be logical line terminators. The
+   --  only physical line terminators recognized are the standard ones (CR,
+   --  LF, or CR/LF).
+
+   --  However, we do not recognize the NEL (16#85#) character as having the
+   --  significance of an end of line character when operating in normal 8-bit
+   --  Latin-n input mode for the compiler. Instead the rule in this mode is
+   --  that all upper half control codes (16#80# .. 16#9F#) are illegal if they
+   --  occur in program text, and are ignored if they appear in comments.
+
+   --  First, note that this behavior is fully conforming with the standard.
+   --  The standard has nothing whatever to say about source representation
+   --  and implementations are completely free to make their own rules. In
+   --  this case, in 8-bit mode, GNAT decides that the 16#0085# character is
+   --  not a representation of the NEL character, even though it looks like it.
+   --  If you have NEL's in your program, which you expect to be treated as
+   --  end of line characters, you must use a wide character encoding such as
+   --  UTF-8 for this code to be recognized.
+
+   --  Second, an explanation of why we take this slightly surprising choice.
+   --  We have never encountered anyone actually using the NEL character to
+   --  end lines. One user raised the issue as a result of some experiments,
+   --  but no one has ever submitted a program encoded this way, in any of
+   --  the possible encodings. It seems that even when using wide character
+   --  codes extensively, the normal approach is to use standard line enders
+   --  (LF or CR/LF). So the failure to recognize NEL in this mode seems to
+   --  have no practical downside.
+
+   --  Moreover, what we have seen in a significant number of programs from
+   --  multiple sources is the practice of writing all program text in lower
+   --  half (ASCII) form, but using UTF-8 encoded wide characters freely in
+   --  comments, where the comments are terminated by normal line endings
+   --  (LF or CR/LF). The comments do not contain NEL codes, but they can and
+   --  do contain other UTF-8 encoding sequences where one of the bytes is the
+   --  NEL code. Now such programs can of course be compiled in UTF-8 mode,
+   --  but in practice they also compile fine in standard 8-bit mode without
+   --  specifying a character encoding. Since this is common practice, it would
+   --  be a significant upwards incompatibility to recognize NEL in 8-bit mode.
+
     -----------------
     -- Subprograms --
     -----------------
@@ -471,13 +540,9 @@ package Sinput is
     --  ASCII.NUL, with Name_Length indicating the length not including the
     --  terminating Nul.
  
-   function Expr_First_Char (Expr : Node_Id) return Source_Ptr;
-   --  Given a node for a subexpression, returns the source location of the
-   --  first character of the expression.
-
-   function Expr_Last_Char (Expr : Node_Id) return Source_Ptr;
-   --  Given a node for a subexpression, returns the source location of the
-   --  last character of the expression.
+   function Build_Location_String (Loc : Source_Ptr) return String;
+   --  Functional form returning a string, which does not include a terminating
+   --  null character. The contents of Name_Buffer are destroyed.
  
     function Get_Column_Number (P : Source_Ptr) return Column_Number;
     --  The ones-origin column number of the specified Source_Ptr value is
@@ -495,6 +560,11 @@ package Sinput is
     --  reference pragmas have been encountered, the value returned is
     --  the same as the physical line number.
  
+   function Get_Logical_Line_Number_Img
+     (P : Source_Ptr) return String;
+   --  Same as above function, but returns the line number as a string of
+   --  decimal digits, with no leading space. Destroys Name_Buffer.
+
     function Get_Physical_Line_Number
       (P : Source_Ptr) return Physical_Line_Number;
     --  The line number of the specified source position is obtained by
@@ -571,12 +641,12 @@ package Sinput is
     procedure Skip_Line_Terminators
       (P        : in out Source_Ptr;
        Physical : out Boolean);
-   --  On entry, P points to a line terminator that has been encountered, which
-   --  is one of FF,LF,VT,CR or a wide character sequence whose value is in
-   --  category Separator,Line or Separator,Paragraph. P points just past the
-   --  character that was scanned. The purpose of this routine is to
-   --  distinguish physical and logical line endings. A physical line ending is
-   --  one of:
+   --  On entry, P points to a line terminator that has been encountered,
+   --  which is one of FF,LF,VT,CR or a wide character sequence whose value is
+   --  in category Separator,Line or Separator,Paragraph. P points just past
+   --  the character that was scanned. The purpose of this routine is to
+   --  distinguish physical and logical line endings. A physical line ending
+   --  is one of:
     --
     --     CR on its own (MAC System 7)
     --     LF on its own (Unix and unix-like systems)
@@ -603,6 +673,15 @@ package Sinput is
     --  makes sure that the lines table for the current source file has an
     --  appropriate entry for the start of the new physical line.
  
+   procedure Sloc_Range (N : Node_Id; Min, Max : out Source_Ptr);
+   --  Given a node, returns the minimum and maximum source locations of any
+   --  node in the syntactic subtree for the node. This is not quite the same
+   --  as the locations of the first and last token in the node construct
+   --  because parentheses at the outer level do not have a recorded Sloc.
+   --
+   --  Note: if the tree for the node contains no "real" Sloc values, i.e.
+   --  values > No_Location, then both Min and Max are set to Sloc (N).
+
     function Source_Offset (S : Source_Ptr) return Nat;
     --  Returns the zero-origin offset of the given source location from the
     --  start of its corresponding unit. This is used for creating canonical