1 // RuleBasedCollator.java - Concrete class for locale-based string compare.
3 /* Copyright (C) 1999 Cygnus Solutions
5 This file is part of libgcj.
7 This software is copyrighted work licensed under the terms of the
8 Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
13 import java.util.Enumeration;
14 import java.util.Hashtable;
15 import java.util.Vector;
18 * @author Tom Tromey <tromey@cygnus.com>
19 * @date March 25, 1999
21 /* Written using "Java Class Libraries", 2nd edition, plus online
22 * API docs for JDK 1.2 from http://www.javasoft.com.
23 * Status: Believed complete and correct
// Constructs a rule element from a key string and a relation character.
// Only the assignment of `relation` is visible in this excerpt; `key` is
// presumably stored the same way (later code reads r.key) -- confirm.
31 RBCElement (String key, char relation)
34 this.relation = relation;
38 public class RuleBasedCollator extends Collator
// Returns a new RuleBasedCollator built from this one by the private
// copy constructor RuleBasedCollator(RuleBasedCollator).
40 public Object clone ()
42 return new RuleBasedCollator (this);
45 // A helper for CollationElementIterator.next().
// Looks up the collation value for the next piece of CEI's text and
// returns it as an int.  The steps visible in this excerpt are:
//   1. If a lookahead value is pending, clear the flag first.
//   2. Grow a substring starting at `save' while it is a known prefix.
//   3. Look the substring up in `map', backing off while nothing matches.
//   4. If nothing matches at all, queue a lookahead value derived from
//      the first character.
// NOTE(review): this method is declared static but reads `prefixes' and
// `map', which are declared as instance fields of this class -- confirm
// how they are reached (the declarations may be on lines not shown).
46 static int ceiNext (CollationElementIterator cei)
48 if (cei.lookahead_set)
// Consume the pending lookahead; presumably its value is returned on a
// line not visible here.
50 cei.lookahead_set = false;
55 int max = cei.text.length();
58 // It is possible to have a case where `abc' has a mapping, but
59 // neither `ab' nor `abd' do. In this case we must treat `abd' as
// `found' records whether at least one prefix matched during the scan.
61 boolean found = false;
// NOTE(review): on the first iteration i == save, so substring(save, i)
// is the empty string -- confirm the loop bounds are intended.
64 for (i = save; i < max; ++i)
66 s = cei.text.substring(save, i);
67 if (prefixes.get(s) == null)
73 Object obj = map.get(s);
// Back off to shorter candidates while the longer one has no mapping.
75 while (found && obj == null && s.length() > 1)
78 s = cei.text.substring(save, i);
87 // This idea, and the values, come from JDK.
88 // assert (s.length() == 1)
// No mapping found: stash the character, shifted into bits 8 and up, as
// the element to hand out on the following call.
89 cei.lookahead_set = true;
90 cei.lookahead = s.charAt(0) << 8;
// Mapped values are stored as Integer objects in `map'.
94 return ((Integer) obj).intValue();
97 // A helper for compareTo() that returns the next character that has
98 // a nonzero ordering at the indicated strength. This is also used
// Reads one element from ITER and tests it against NULLORDER before
// doing anything else.  The primary, secondary and tertiary parts of
// the element are then OR-ed into `c'.  Presumably which parts are
// included depends on STRENGTH; the guarding conditions are not visible
// in this excerpt -- confirm.
100 static final int next (CollationElementIterator iter, int strength)
104 int os = iter.next();
105 if (os == CollationElementIterator.NULLORDER)
111 c |= CollationElementIterator.primaryOrder(os);
114 c |= CollationElementIterator.secondaryOrder(os);
117 c |= CollationElementIterator.tertiaryOrder(os);
// Compares SOURCE and TARGET by walking a CollationElementIterator over
// each and fetching elements with next() at this collator's `strength'.
// Three end-of-input cases are distinguished below; the values returned
// for them are on lines not visible in this excerpt.
// NOTE(review): the iterators are built directly from the raw strings,
// whereas getCollationElementIterator() decomposes characters first --
// confirm the difference is intended.
127 public int compare (String source, String target)
129 CollationElementIterator cs, ct;
131 cs = new CollationElementIterator (source);
132 ct = new CollationElementIterator (target);
136 int os = next (cs, strength);
137 int ot = next (ct, strength);
// Both iterators exhausted at the same time.
139 if (os == CollationElementIterator.NULLORDER
140 && ot == CollationElementIterator.NULLORDER)
// Only SOURCE is exhausted.
142 else if (os == CollationElementIterator.NULLORDER)
// Only TARGET is exhausted.
144 else if (ot == CollationElementIterator.NULLORDER)
// Tests OBJ for equality with this collator.  OBJ must be a
// RuleBasedCollator and must satisfy super.equals(); after that the
// result is true when both `frenchAccents' and `rules' match.
154 public boolean equals (Object obj)
// Guard: wrong type or superclass mismatch (the statement executed in
// that case is not visible in this excerpt).
156 if (! (obj instanceof RuleBasedCollator) || ! super.equals(obj))
158 RuleBasedCollator rbc = (RuleBasedCollator) obj;
159 // FIXME: this is probably wrong. Instead we should compare maps
161 return (frenchAccents == rbc.frenchAccents
162 && rules.equals(rbc.rules));
// Returns a CollationElementIterator over SOURCE.  Each character of
// SOURCE is first passed to decomposeCharacter(), which appends to the
// `expand' buffer; the iterator is created over the buffer's contents.
165 public CollationElementIterator getCollationElementIterator (String source)
// The initial capacity is only a starting size; the buffer may grow if
// decomposition appends more than one character per input character.
167 StringBuffer expand = new StringBuffer (source.length());
168 int max = source.length();
169 for (int i = 0; i < max; ++i)
170 decomposeCharacter (source.charAt(i), expand);
171 return new CollationElementIterator (expand.toString());
// Returns a CollationKey for SOURCE, constructed from an element
// iterator over SOURCE and from SOURCE itself.  Any further constructor
// arguments are on a line not visible in this excerpt.
174 public CollationKey getCollationKey (String source)
176 return new CollationKey (getCollationElementIterator (source), source,
// Returns the rules for this collator as a String.  The body is not
// visible in this excerpt; presumably it returns the `rules' field --
// confirm.
180 public String getRules ()
// Computes a hash code from `frenchAccents' and from the hash codes of
// further fields, of which only `prefixes' is visible in this excerpt.
// NOTE(review): the conditional operator binds more loosely than `^',
// so this parses as  frenchAccents ? 1231 : (1237 ^ ... ^ prefixes.hashCode()).
// When frenchAccents is true the result therefore appears to be the
// constant 1231.  If the intent was to XOR the flag's hash with the
// rest, the conditional needs its own parentheses -- confirm.
185 public int hashCode ()
187 return (frenchAccents ? 1231 : 1237
190 ^ prefixes.hashCode());
// Returns true if C falls in one of five ASCII ranges that the rule
// parser treats as special:
//   U+0009..U+000D  (tab through carriage return)
//   U+0020..U+002F  (space through '/')
//   U+003A..U+0040  (':' through '@')
//   U+005B..U+0060  ('[' through '`')
//   U+007B..U+007E  ('{' through '~')
193 private final boolean is_special (char c)
195 // Rules from JCL book.
196 return ((c >= 0x0009 && c <= 0x000d)
197 || (c >= 0x0020 && c <= 0x002f)
198 || (c >= 0x003a && c <= 0x0040)
199 || (c >= 0x005b && c <= 0x0060)
200 || (c >= 0x007b && c <= 0x007e));
// Scans a text argument out of RULES starting at INDEX and returns an
// int; a comment in the calling constructor says this is the index of
// the next character to examine.  The rest of the parameter list is not
// visible in this excerpt (the caller passes a StringBuffer as the third
// argument).
203 private final int text_argument (String rules, int index,
207 int len = rules.length();
210 char c = rules.charAt(index);
// A special character wrapped in single quotes: quote, special, quote.
// The index + 2 < len test keeps the two charAt calls in bounds.
211 if (c == '\'' && index + 2 < len
212 && rules.charAt(index + 2) == '\''
213 && is_special (rules.charAt(index + 1)))
// An unquoted special character or whitespace; presumably this ends the
// argument -- the action taken is not visible here.
215 else if (is_special (c) || Character.isWhitespace(c))
// Builds a collator from the rule string RULES.
// The rules are parsed into an ordered vector of RBCElement objects,
// and that vector is then turned into two hash tables: `map' (key string
// to packed collation value) and `prefixes' (proper prefixes of keys).
// Throws ParseException, carrying the offending index, for an
// unexpected character or a missing argument.
223 public RuleBasedCollator (String rules) throws ParseException
// French accent ordering is off unless the rules turn it on below.
226 this.frenchAccents = false;
228 // We keep each rule in order in a vector. At the end we traverse
229 // the vector and compute collation values from it.
// Position in `vec' at which the next element will be inserted.
230 int insertion_index = 0;
231 Vector vec = new Vector ();
// Scratch buffer handed to text_argument() for each argument.
233 StringBuffer argument = new StringBuffer ();
235 int len = rules.length();
236 for (int index = 0; index < len; ++index)
238 char c = rules.charAt(index);
240 // Just skip whitespace.
241 if (Character.isWhitespace(c))
// The condition guarding this assignment is not visible in this excerpt.
247 frenchAccents = true;
251 // Check for relation or reset operator.
// '<', ';', ',' and '=' are accepted together with '&'; anything else
// is rejected with the current index.
252 if (! (c == '<' || c == ';' || c == ',' || c == '=' || c == '&'))
253 throw new ParseException ("invalid character", index);
// Presumably part of a loop that skips whitespace between the operator
// and its argument -- the loop header is not visible here.
258 if (! Character.isWhitespace(rules.charAt(index)))
263 throw new ParseException ("missing argument", index);
// text_argument() fills `argument' and returns the updated index.
266 index = text_argument (rules, index, argument);
267 if (argument.length() == 0)
268 throw new ParseException ("invalid character", save);
269 String arg = argument.toString();
// NOTE(review): `vec' holds RBCElement objects (see insertElementAt
// below) while the lookup key is a String.  Whether indexOf can ever
// find a match depends on equals() behaviour that is not visible in
// this excerpt -- confirm.
270 int item_index = vec.indexOf(arg);
273 // If the argument already appears in the vector, then we
274 // must remove it in order to re-order.
275 if (item_index != -1)
277 vec.removeElementAt(item_index);
// Presumably adjusts insertion_index for the removed slot; the
// adjustment itself is on a line not visible here.
278 if (insertion_index >= item_index)
// Record the argument with its operator character as the relation.
281 RBCElement r = new RBCElement (arg, c);
282 vec.insertElementAt(r, insertion_index);
// Reset handling: the argument must already be present in the vector.
288 if (item_index == -1)
// NOTE(review): source line 289 is absent from this excerpt; presumably
// it holds the `throw' keyword for this exception -- confirm.
290 new ParseException ("argument to reset not previously seen",
// Later elements are inserted just after the reset target.
292 insertion_index = item_index + 1;
295 // Ugly: in this case the resulting INDEX comes from
296 // text_argument, which returns the index of the next
297 // character we should examine.
301 // Now construct a hash table that maps strings onto their
306 this.map = new Hashtable ();
307 this.prefixes = new Hashtable ();
308 Enumeration e = vec.elements();
309 while (e.hasMoreElements())
311 RBCElement r = (RBCElement) e.nextElement();
329 // This must match CollationElementIterator.
// Packed layout: primary shifted left 16, secondary shifted left 8,
// tertiary in the low bits.
330 map.put(r.key, new Integer (primary << 16
331 | secondary << 8 | tertiary));
333 // Make a map of all lookaheads we might need.
// Registers every proper, non-empty prefix of the key (lengths
// key.length() - 1 down to 1).
334 for (int i = r.key.length() - 1; i >= 1; --i)
335 prefixes.put(r.key.substring(0, i), Boolean.TRUE);
339 // This is a helper for clone.
// Copies state from OTHER field by field.  `prefixes' is assigned by
// reference, so the new collator refers to the same Hashtable object as
// OTHER.  Assignments for any remaining fields are on lines not visible
// in this excerpt.
340 private RuleBasedCollator (RuleBasedCollator other)
342 frenchAccents = other.frenchAccents;
345 strength = other.strength;
347 prefixes = other.prefixes;
350 // True if we are using French-style accent ordering.
// Set to false at the start of the rule-string constructor and possibly
// switched to true while the rules are parsed.
351 private boolean frenchAccents;
353 // It's easier to just save the rules than to try to recreate them.
354 private String rules;
356 // This maps strings onto collation values.
// Values are Integer objects packed as
// primary << 16 | secondary << 8 | tertiary.
357 private Hashtable map;
358 // An entry in this hash means that more lookahead is required for
359 // the prefix string.
// Keys are the proper, non-empty prefixes of the keys in `map'; every
// value is Boolean.TRUE.
360 private Hashtable prefixes;