libjava/java/text/RuleBasedCollator.java

   1 /* RuleBasedCollator.java -- Concrete Collator Class
   2    Copyright (C) 1998, 1999, 2000, 2001, 2003  Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19 02111-1307 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38
  39 package java.text;
  40
  41 import java.util.Enumeration;
  42 import java.util.Hashtable;
  43 import java.util.Vector;
  44
  45 /* Written using "Java Class Libraries", 2nd edition, plus online
  46  * API docs for JDK 1.2 from http://www.javasoft.com.
  47  * Status: Believed complete and correct
  48  */
  49
  50 /**
  51  * This class is a concrete subclass of <code>Collator</code> suitable
  52  * for string collation in a wide variety of languages.  An instance of
  53  * this class is normally returned by the <code>getInstance</code> method
  54  * of <code>Collator</code> with rules predefined for the requested
  55  * locale.  However, an instance of this class can be created manually
  56  * with any desired rules.
  57  * <p>
  58  * Rules take the form of a <code>String</code> with the following syntax
  59  * <ul>
  60  * <li> Modifier: '@'
  61  * <li> Relation: '&lt;' | ';' | ',' | '=' : <text>
  62  * <li> Reset: '&amp;' : <text>
  63  * </ul>
  64  * The modifier character indicates that accents sort backward as is the
  65  * case with French.  The relational operators specify how the text
  66  * argument relates to the previous term.  The relation characters have
  67  * the following meanings:
  68  * <ul>
  69  * <li>'&lt;' - The text argument is greater than the prior term at the primary
  70  * difference level.
  71  * <li>';' - The text argument is greater than the prior term at the secondary
  72  * difference level.
  73  * <li>',' - The text argument is greater than the prior term at the tertiary
  74  * difference level.
  75  * <li>'=' - The text argument is equal to the prior term
  76  * </ul>
  77  * <p>
  78  * As for the text argument itself, this is any sequence of Unicode
  79  * characters not in the following ranges: 0x0009-0x000D, 0x0020-0x002F,
  80  * 0x003A-0x0040, 0x005B-0x0060, and 0x007B-0x007E. If these characters are
  81  * desired, they must be enclosed in single quotes.  If any whitespace is
  82  * encountered, it is ignored.  (For example, "a b" is equal to "ab").
  83  * <p>
  84  * The reset operation inserts the following rule at the point where the
  85  * text argument to it exists in the previously declared rule string.  This
  86  * makes it easy to add new rules to an existing string by simply including
  87  * them in a reset sequence at the end.  Note that the text argument, or
  88  * at least the first character of it, must be present somewhere in the
  89  * previously declared rules in order to be inserted properly.  If this
  90  * is not satisfied, a <code>ParseException</code> will be thrown.
  91  * <p>
  92  * This system of configuring <code>RuleBasedCollator</code> is needlessly
  93  * complex and the people at Taligent who developed it (along with the folks
  94  * at Sun who accepted it into the Java standard library) deserve a slow
  95  * and agonizing death.
  96  * <p>
  97  * Here are a couple of example of rule strings:
  98  * <p>
  99  * "&lt; a &lt; b &lt; c" - This string says that a is greater than b which is
 100  * greater than c, with all differences being primary differences.
 101  * <p>
 102  * "&lt; a,A &lt; b,B &lt; c,C" - This string says that 'A' is greater than 'a' with
 103  * a tertiary strength comparison.  Both 'b' and 'B' are greater than 'a' and
 104  * 'A' during a primary strength comparison.  But 'B' is greater than 'b'
 105  * under a tertiary strength comparison.
 106  * <p>
 107  * "&lt; a &lt; c &amp; a &lt; b " - This sequence is identical in function to the
 108  * "&lt; a &lt; b &lt; c" rule string above.  The '&amp;' reset symbol indicates that
 109  * the rule "&lt; b" is to be inserted after the text argument "a" in the
 110  * previous rule string segment.
 111  * <p>
 112  * "&lt; a &lt; b &amp; y &lt; z" - This is an error.  The character 'y' does not appear
 113  * anywhere in the previous rule string segment so the rule following the
 114  * reset rule cannot be inserted.
 115  * <p>
 116  * For a description of the various comparison strength types, see the
 117  * documentation for the <code>Collator</code> class.
 118  * <p>
 119  * As an additional complication to this already overly complex rule scheme,
 120  * if any characters precede the first rule, these characters are considered
 121  * ignorable.  They will be treated as if they did not exist during
 122  * comparisons.  For example, "- &lt; a &lt; b ..." would make '-' an ignorable
 123  * character such that the strings "high-tech" and "hightech" would
 124  * be considered identical.
 125  * <p>
 126  * A <code>ParseException</code> will be thrown for any of the following
 127  * conditions:
 128  * <ul>
 129  * <li>Unquoted punctuation characters in a text argument.
 130  * <li>A relational or reset operator not followed by a text argument
 131  * <li>A reset operator where the text argument is not present in
 132  * the previous rule string section.
 133  * </ul>
 134  *
 135  * @author Aaron M. Renn <arenn@urbanophile.com>
 136  * @author Tom Tromey <tromey@cygnus.com>
 137  * @date March 25, 1999
 138  */
 139
 140 final class RBCElement
 141 {
 142   String key;
 143   char relation;
 144
 145   RBCElement (String key, char relation)
 146   {
 147     this.key = key;
 148     this.relation = relation;
 149   }
 150 }
 151
 152 public class RuleBasedCollator extends Collator
 153 {
 154   // True if we are using French-style accent ordering.
 155   private boolean frenchAccents;
 156
 157   /**
 158    * This the the original rule string.
 159    */
 160   private String rules;
 161
 162   // This maps strings onto collation values.
 163   private Hashtable map;
 164
 165   // An entry in this hash means that more lookahead is required for
 166   // the prefix string.
 167   private Hashtable prefixes;
 168
 169   public Object clone ()
 170   {
 171     RuleBasedCollator c = (RuleBasedCollator) super.clone ();
 172     c.map = (Hashtable) map.clone ();
 173     c.prefixes = (Hashtable) map.clone ();
 174     return c;
 175   }
 176
 177   // A helper for CollationElementIterator.next().
 178   int ceiNext (CollationElementIterator cei)
 179   {
 180     if (cei.lookahead_set)
 181       {
 182         cei.lookahead_set = false;
 183         return cei.lookahead;
 184       }
 185
 186     int save = cei.index;
 187     int max = cei.text.length();
 188     String s = null;
 189
 190     // It is possible to have a case where `abc' has a mapping, but
 191     // neither `ab' nor `abd' do.  In this case we must treat `abd' as
 192     // nothing special.
 193     boolean found = false;
 194
 195     int i;
 196     for (i = save + 1; i <= max; ++i)
 197       {
 198         s = cei.text.substring(save, i);
 199         if (prefixes.get(s) == null)
 200           break;
 201         found = true;
 202       }
 203     // Assume s != null.
 204
 205     Object obj = map.get(s);
 206     // The special case.
 207     while (found && obj == null && s.length() > 1)
 208       {
 209         --i;
 210         s = cei.text.substring(save, i);
 211         obj = map.get(s);
 212       }
 213
 214     // Update state.
 215     cei.index = i;
 216
 217     if (obj == null)
 218       {
 219         // This idea, and the values, come from JDK.
 220         // assert (s.length() == 1)
 221         cei.lookahead_set = true;
 222         cei.lookahead = s.charAt(0) << 8;
 223         return 0x7fff << 16;
 224       }
 225
 226     return ((Integer) obj).intValue();
 227   }
 228
 229   // A helper for compareTo() that returns the next character that has
 230   // a nonzero ordering at the indicated strength.  This is also used
 231   // in CollationKey.
 232   static final int next (CollationElementIterator iter, int strength)
 233   {
 234     while (true)
 235       {
 236         int os = iter.next();
 237         if (os == CollationElementIterator.NULLORDER)
 238           return os;
 239         int c = 0;
 240         switch (strength)
 241           {
 242           case PRIMARY:
 243             c = os & ~0xffff;
 244             break;
 245           case SECONDARY:
 246             c = os & ~0x00ff;
 247             break;
 248           case TERTIARY:
 249           case IDENTICAL:
 250             c = os;
 251             break;
 252           }
 253         if (c != 0)
 254           return c;
 255       }
 256   }
 257
 258   public int compare (String source, String target)
 259   {
 260     CollationElementIterator cs, ct;
 261
 262     cs = new CollationElementIterator (source, this);
 263     ct = new CollationElementIterator (target, this);
 264
 265     while (true)
 266       {
 267         int os = next (cs, strength);
 268         int ot = next (ct, strength);
 269
 270         if (os == CollationElementIterator.NULLORDER
 271             && ot == CollationElementIterator.NULLORDER)
 272           break;
 273         else if (os == CollationElementIterator.NULLORDER)
 274           {
 275             // Source string is shorter, so return "less than".
 276             return -1;
 277           }
 278         else if (ot == CollationElementIterator.NULLORDER)
 279           {
 280             // Target string is shorter, so return "greater than".
 281             return 1;
 282           }
 283
 284         if (os != ot)
 285           return os - ot;
 286       }
 287
 288     return 0;
 289   }
 290
 291   public boolean equals (Object obj)
 292   {
 293     if (! (obj instanceof RuleBasedCollator) || ! super.equals(obj))
 294       return false;
 295     RuleBasedCollator rbc = (RuleBasedCollator) obj;
 296     // FIXME: this is probably wrong.  Instead we should compare maps
 297     // directly.
 298     return (frenchAccents == rbc.frenchAccents
 299             && rules.equals(rbc.rules));
 300   }
 301
 302   public CollationElementIterator getCollationElementIterator (String source)
 303   {
 304     StringBuffer expand = new StringBuffer (source.length());
 305     int max = source.length();
 306     for (int i = 0; i < max; ++i)
 307       decomposeCharacter (source.charAt(i), expand);
 308     return new CollationElementIterator (expand.toString(), this);
 309   }
 310
 311   public CollationElementIterator getCollationElementIterator (CharacterIterator source)
 312   {
 313     StringBuffer expand = new StringBuffer ();
 314     for (char c = source.first ();
 315          c != CharacterIterator.DONE;
 316          c = source.next ())
 317       decomposeCharacter (c, expand);
 318
 319     return new CollationElementIterator (expand.toString(), this);
 320   }
 321
 322   public CollationKey getCollationKey (String source)
 323   {
 324     return new CollationKey (getCollationElementIterator (source), source,
 325                              strength);
 326   }
 327
 328   public String getRules ()
 329   {
 330     return rules;
 331   }
 332
 333   public int hashCode ()
 334   {
 335     return (frenchAccents ? 1231 : 1237
 336             ^ rules.hashCode()
 337             ^ map.hashCode()
 338             ^ prefixes.hashCode());
 339   }
 340
 341   private final boolean is_special (char c)
 342   {
 343     // Rules from JCL book.
 344     return ((c >= 0x0009 && c <= 0x000d)
 345             || (c >= 0x0020 && c <= 0x002f)
 346             || (c >= 0x003a && c <= 0x0040)
 347             || (c >= 0x005b && c <= 0x0060)
 348             || (c >= 0x007b && c <= 0x007e));
 349   }
 350
 351   private final int text_argument (String rules, int index,
 352                                    StringBuffer result)
 353   {
 354     result.setLength(0);
 355     int len = rules.length();
 356     while (index < len)
 357       {
 358         char c = rules.charAt(index);
 359         if (c == '\'' && index + 2 < len
 360             && rules.charAt(index + 2) == '\''
 361             && is_special (rules.charAt(index + 1)))
 362           index += 2;
 363         else if (is_special (c) || Character.isWhitespace(c))
 364           return index;
 365         result.append(c);
 366         ++index;
 367       }
 368     return index;
 369   }
 370
 371   public RuleBasedCollator (String rules) throws ParseException
 372   {
 373     this.rules = rules;
 374     this.frenchAccents = false;
 375
 376     // We keep each rule in order in a vector.  At the end we traverse
 377     // the vector and compute collation values from it.
 378     int insertion_index = 0;
 379     Vector vec = new Vector ();
 380
 381     StringBuffer argument = new StringBuffer ();
 382
 383     int len = rules.length();
 384     for (int index = 0; index < len; ++index)
 385       {
 386         char c = rules.charAt(index);
 387
 388         // Just skip whitespace.
 389         if (Character.isWhitespace(c))
 390           continue;
 391
 392         // Modifier.
 393         if (c == '@')
 394           {
 395             frenchAccents = true;
 396             continue;
 397           }
 398
 399         // Check for relation or reset operator.
 400         if (! (c == '<' || c == ';' || c == ',' || c == '=' || c == '&'))
 401           throw new ParseException ("invalid character", index);
 402
 403         ++index;
 404         while (index < len)
 405           {
 406             if (! Character.isWhitespace(rules.charAt(index)))
 407               break;
 408             ++index;
 409           }
 410         if (index == len)
 411           throw new ParseException ("missing argument", index);
 412
 413         int save = index;
 414         index = text_argument (rules, index, argument);
 415         if (argument.length() == 0)
 416           throw new ParseException ("invalid character", save);
 417         String arg = argument.toString();
 418         int item_index = vec.indexOf(arg);
 419         if (c != '&')
 420           {
 421             // If the argument already appears in the vector, then we
 422             // must remove it in order to re-order.
 423             if (item_index != -1)
 424               {
 425                 vec.removeElementAt(item_index);
 426                 if (insertion_index >= item_index)
 427                   --insertion_index;
 428               }
 429             RBCElement r = new RBCElement (arg, c);
 430             vec.insertElementAt(r, insertion_index);
 431             ++insertion_index;
 432           }
 433         else
 434           {
 435             // Reset.
 436             if (item_index == -1)
 437               throw
 438                 new ParseException ("argument to reset not previously seen",
 439                                     save);
 440             insertion_index = item_index + 1;
 441           }
 442
 443         // Ugly: in this case the resulting INDEX comes from
 444         // text_argument, which returns the index of the next
 445         // character we should examine.
 446         --index;
 447       }
 448
 449     // Now construct a hash table that maps strings onto their
 450     // collation values.
 451     int primary = 0;
 452     int secondary = 0;
 453     int tertiary = 0;
 454     this.map = new Hashtable ();
 455     this.prefixes = new Hashtable ();
 456     Enumeration e = vec.elements();
 457     while (e.hasMoreElements())
 458       {
 459         RBCElement r = (RBCElement) e.nextElement();
 460         switch (r.relation)
 461           {
 462           case '<':
 463             ++primary;
 464             secondary = 0;
 465             tertiary = 0;
 466             break;
 467           case ';':
 468             ++secondary;
 469             tertiary = 0;
 470             break;
 471           case ',':
 472             ++tertiary;
 473             break;
 474           case '=':
 475             break;
 476           }
 477         // This must match CollationElementIterator.
 478         map.put(r.key, new Integer (primary << 16
 479                                     | secondary << 8 | tertiary));
 480
 481         // Make a map of all lookaheads we might need.
 482         for (int i = r.key.length() - 1; i >= 1; --i)
 483           prefixes.put(r.key.substring(0, i), Boolean.TRUE);
 484       }
 485   }
 486 }