1 /* RuleBasedCollator.java -- Concrete Collator Class
2 Copyright (C) 1998, 1999, 2000, 2001, 2003 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
41 import java.util.Enumeration;
42 import java.util.Hashtable;
43 import java.util.Vector;
45 /* Written using "Java Class Libraries", 2nd edition, plus online
46 * API docs for JDK 1.2 from http://www.javasoft.com.
47 * Status: Believed complete and correct
51 * This class is a concrete subclass of <code>Collator</code> suitable
52 * for string collation in a wide variety of languages. An instance of
53 * this class is normally returned by the <code>getInstance</code> method
54 * of <code>Collator</code> with rules predefined for the requested
55 * locale. However, an instance of this class can be created manually
56 * with any desired rules.
58 * Rules take the form of a <code>String</code> with the following syntax
61 * <li> Relation: '<' | ';' | ',' | '=' : <text>
62 * <li> Reset: '&' : <text>
64 * The modifier character indicates that accents sort backward as is the
65 * case with French. The relational operators specify how the text
66 * argument relates to the previous term. The relation characters have
67 * the following meanings:
69 * <li>'<' - The text argument is greater than the prior term at the primary
71 * <li>';' - The text argument is greater than the prior term at the secondary
73 * <li>',' - The text argument is greater than the prior term at the tertiary
75 * <li>'=' - The text argument is equal to the prior term
78 * As for the text argument itself, this is any sequence of Unicode
79 * characters not in the following ranges: 0x0009-0x000D, 0x0020-0x002F,
80 * 0x003A-0x0040, 0x005B-0x0060, and 0x007B-0x007E. If these characters are
81 * desired, they must be enclosed in single quotes. If any whitespace is
82 * encountered, it is ignored. (For example, "a b" is equal to "ab").
84 * The reset operation inserts the following rule at the point where the
85 * text argument to it exists in the previously declared rule string. This
86 * makes it easy to add new rules to an existing string by simply including
87 * them in a reset sequence at the end. Note that the text argument, or
88 * at least the first character of it, must be present somewhere in the
89 * previously declared rules in order to be inserted properly. If this
90 * is not satisfied, a <code>ParseException</code> will be thrown.
92 * This system of configuring <code>RuleBasedCollator</code> is needlessly
93 * complex and the people at Taligent who developed it (along with the folks
94 * at Sun who accepted it into the Java standard library) deserve a slow
95 * and agonizing death.
97 * Here are a couple of example of rule strings:
99 * "< a < b < c" - This string says that a is greater than b which is
100 * greater than c, with all differences being primary differences.
102 * "< a,A < b,B < c,C" - This string says that 'A' is greater than 'a' with
103 * a tertiary strength comparison. Both 'b' and 'B' are greater than 'a' and
104 * 'A' during a primary strength comparison. But 'B' is greater than 'b'
105 * under a tertiary strength comparison.
107 * "< a < c & a < b " - This sequence is identical in function to the
108 * "< a < b < c" rule string above. The '&' reset symbol indicates that
109 * the rule "< b" is to be inserted after the text argument "a" in the
110 * previous rule string segment.
112 * "< a < b & y < z" - This is an error. The character 'y' does not appear
113 * anywhere in the previous rule string segment so the rule following the
114 * reset rule cannot be inserted.
116 * For a description of the various comparison strength types, see the
117 * documentation for the <code>Collator</code> class.
119 * As an additional complication to this already overly complex rule scheme,
120 * if any characters precede the first rule, these characters are considered
121 * ignorable. They will be treated as if they did not exist during
122 * comparisons. For example, "- < a < b ..." would make '-' an ignorable
123 * character such that the strings "high-tech" and "hightech" would
124 * be considered identical.
126 * A <code>ParseException</code> will be thrown for any of the following
129 * <li>Unquoted punctuation characters in a text argument.
130 * <li>A relational or reset operator not followed by a text argument
131 * <li>A reset operator where the text argument is not present in
132 * the previous rule string section.
135 * @author Aaron M. Renn <arenn@urbanophile.com>
136 * @author Tom Tromey <tromey@cygnus.com>
137 * @date March 25, 1999
140 final class RBCElement
145 RBCElement (String key, char relation)
148 this.relation = relation;
152 public class RuleBasedCollator extends Collator
154 // True if we are using French-style accent ordering.
155 private boolean frenchAccents;
158 * This the the original rule string.
160 private String rules;
162 // This maps strings onto collation values.
163 private Hashtable map;
165 // An entry in this hash means that more lookahead is required for
166 // the prefix string.
167 private Hashtable prefixes;
169 public Object clone ()
171 RuleBasedCollator c = (RuleBasedCollator) super.clone ();
172 c.map = (Hashtable) map.clone ();
173 c.prefixes = (Hashtable) map.clone ();
177 // A helper for CollationElementIterator.next().
178 int ceiNext (CollationElementIterator cei)
180 if (cei.lookahead_set)
182 cei.lookahead_set = false;
183 return cei.lookahead;
186 int save = cei.index;
187 int max = cei.text.length();
190 // It is possible to have a case where `abc' has a mapping, but
191 // neither `ab' nor `abd' do. In this case we must treat `abd' as
193 boolean found = false;
196 for (i = save + 1; i <= max; ++i)
198 s = cei.text.substring(save, i);
199 if (prefixes.get(s) == null)
205 Object obj = map.get(s);
207 while (found && obj == null && s.length() > 1)
210 s = cei.text.substring(save, i);
219 // This idea, and the values, come from JDK.
220 // assert (s.length() == 1)
221 cei.lookahead_set = true;
222 cei.lookahead = s.charAt(0) << 8;
226 return ((Integer) obj).intValue();
229 // A helper for compareTo() that returns the next character that has
230 // a nonzero ordering at the indicated strength. This is also used
232 static final int next (CollationElementIterator iter, int strength)
236 int os = iter.next();
237 if (os == CollationElementIterator.NULLORDER)
258 public int compare (String source, String target)
260 CollationElementIterator cs, ct;
262 cs = new CollationElementIterator (source, this);
263 ct = new CollationElementIterator (target, this);
267 int os = next (cs, strength);
268 int ot = next (ct, strength);
270 if (os == CollationElementIterator.NULLORDER
271 && ot == CollationElementIterator.NULLORDER)
273 else if (os == CollationElementIterator.NULLORDER)
275 // Source string is shorter, so return "less than".
278 else if (ot == CollationElementIterator.NULLORDER)
280 // Target string is shorter, so return "greater than".
291 public boolean equals (Object obj)
293 if (! (obj instanceof RuleBasedCollator) || ! super.equals(obj))
295 RuleBasedCollator rbc = (RuleBasedCollator) obj;
296 // FIXME: this is probably wrong. Instead we should compare maps
298 return (frenchAccents == rbc.frenchAccents
299 && rules.equals(rbc.rules));
302 public CollationElementIterator getCollationElementIterator (String source)
304 StringBuffer expand = new StringBuffer (source.length());
305 int max = source.length();
306 for (int i = 0; i < max; ++i)
307 decomposeCharacter (source.charAt(i), expand);
308 return new CollationElementIterator (expand.toString(), this);
311 public CollationElementIterator getCollationElementIterator (CharacterIterator source)
313 StringBuffer expand = new StringBuffer ();
314 for (char c = source.first ();
315 c != CharacterIterator.DONE;
317 decomposeCharacter (c, expand);
319 return new CollationElementIterator (expand.toString(), this);
322 public CollationKey getCollationKey (String source)
324 return new CollationKey (getCollationElementIterator (source), source,
328 public String getRules ()
333 public int hashCode ()
335 return (frenchAccents ? 1231 : 1237
338 ^ prefixes.hashCode());
341 private final boolean is_special (char c)
343 // Rules from JCL book.
344 return ((c >= 0x0009 && c <= 0x000d)
345 || (c >= 0x0020 && c <= 0x002f)
346 || (c >= 0x003a && c <= 0x0040)
347 || (c >= 0x005b && c <= 0x0060)
348 || (c >= 0x007b && c <= 0x007e));
351 private final int text_argument (String rules, int index,
355 int len = rules.length();
358 char c = rules.charAt(index);
359 if (c == '\'' && index + 2 < len
360 && rules.charAt(index + 2) == '\''
361 && is_special (rules.charAt(index + 1)))
363 else if (is_special (c) || Character.isWhitespace(c))
371 public RuleBasedCollator (String rules) throws ParseException
374 this.frenchAccents = false;
376 // We keep each rule in order in a vector. At the end we traverse
377 // the vector and compute collation values from it.
378 int insertion_index = 0;
379 Vector vec = new Vector ();
381 StringBuffer argument = new StringBuffer ();
383 int len = rules.length();
384 for (int index = 0; index < len; ++index)
386 char c = rules.charAt(index);
388 // Just skip whitespace.
389 if (Character.isWhitespace(c))
395 frenchAccents = true;
399 // Check for relation or reset operator.
400 if (! (c == '<' || c == ';' || c == ',' || c == '=' || c == '&'))
401 throw new ParseException ("invalid character", index);
406 if (! Character.isWhitespace(rules.charAt(index)))
411 throw new ParseException ("missing argument", index);
414 index = text_argument (rules, index, argument);
415 if (argument.length() == 0)
416 throw new ParseException ("invalid character", save);
417 String arg = argument.toString();
418 int item_index = vec.indexOf(arg);
421 // If the argument already appears in the vector, then we
422 // must remove it in order to re-order.
423 if (item_index != -1)
425 vec.removeElementAt(item_index);
426 if (insertion_index >= item_index)
429 RBCElement r = new RBCElement (arg, c);
430 vec.insertElementAt(r, insertion_index);
436 if (item_index == -1)
438 new ParseException ("argument to reset not previously seen",
440 insertion_index = item_index + 1;
443 // Ugly: in this case the resulting INDEX comes from
444 // text_argument, which returns the index of the next
445 // character we should examine.
449 // Now construct a hash table that maps strings onto their
454 this.map = new Hashtable ();
455 this.prefixes = new Hashtable ();
456 Enumeration e = vec.elements();
457 while (e.hasMoreElements())
459 RBCElement r = (RBCElement) e.nextElement();
477 // This must match CollationElementIterator.
478 map.put(r.key, new Integer (primary << 16
479 | secondary << 8 | tertiary));
481 // Make a map of all lookaheads we might need.
482 for (int i = r.key.length() - 1; i >= 1; --i)
483 prefixes.put(r.key.substring(0, i), Boolean.TRUE);