1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package norm contains types and functions for normalizing Unicode strings.
10 // A Form denotes a canonical representation of Unicode code points.
11 // The Unicode-defined normalization and equivalence forms are:
13 // NFC Unicode Normalization Form C
14 // NFD Unicode Normalization Form D
15 // NFKC Unicode Normalization Form KC
16 // NFKD Unicode Normalization Form KD
18 // For a Form f, this documentation uses the notation f(x) to mean
19 // the bytes or string x converted to the given form.
20 // A position n in x is called a boundary if conversion to the form can
21 // proceed independently on both sides:
22 // f(x) == append(f(x[0:n]), f(x[n:])...)
24 // References: http://unicode.org/reports/tr15/ and
25 // http://unicode.org/notes/tn5/.
35 // Bytes returns f(b). May return b if f(b) = b.
36 func (f Form) Bytes(b []byte) []byte {
39 n := quickSpan(&rb, 0)
43 out := make([]byte, n, len(b))
45 return doAppend(&rb, out, n)
48 // String returns f(s).
49 func (f Form) String(s string) string {
52 n := quickSpan(&rb, 0)
56 out := make([]byte, n, len(s))
58 return string(doAppend(&rb, out, n))
61 // IsNormal returns true if b == f(b).
62 func (f Form) IsNormal(b []byte) bool {
65 bp := quickSpan(&rb, 0)
70 decomposeSegment(&rb, bp)
74 for i := 0; i < rb.nrune; i++ {
76 if bp+int(info.size) > len(b) {
82 if b[bp] != rb.byte[p] {
89 bp = quickSpan(&rb, bp)
94 // IsNormalString returns true if s == f(s).
95 func (f Form) IsNormalString(s string) bool {
98 bp := quickSpan(&rb, 0)
103 decomposeSegment(&rb, bp)
107 for i := 0; i < rb.nrune; i++ {
109 if bp+int(info.size) > len(s) {
115 if s[bp] != rb.byte[p] {
122 bp = quickSpan(&rb, bp)
127 // patchTail fixes a case where a rune may be incorrectly normalized
128 // if it is followed by illegal continuation bytes. It returns the
129 // patched buffer and whether there were trailing continuation bytes.
130 func patchTail(rb *reorderBuffer, buf []byte) ([]byte, bool) {
131 info, p := lastRuneStart(&rb.f, buf)
132 if p == -1 || info.size == 0 {
135 end := p + int(info.size)
136 extra := len(buf) - end
138 // Potentially allocating memory. However, this only
139 // happens with ill-formed UTF-8.
141 x = append(x, buf[len(buf)-extra:]...)
142 buf = decomposeToLastBoundary(rb, buf[:end])
147 return append(buf, x...), true
152 func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
156 end := quickSpan(rb, i)
157 return rb.src.appendSlice(dst, i, end), end
160 // Append returns f(append(out, b...)).
161 // The buffer out must be nil, empty, or equal to f(out).
162 func (f Form) Append(out []byte, src ...byte) []byte {
166 rb := reorderBuffer{}
168 return doAppend(&rb, out, 0)
171 func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
172 src, n := rb.src, rb.nsrc
173 doMerge := len(out) > 0
174 if q := src.skipNonStarter(p); q > p {
175 // Move leading non-starters to destination.
176 out = src.appendSlice(out, p, q)
177 buf, endsInError := patchTail(rb, out)
180 doMerge = false // no need to merge, ends with illegal UTF-8
182 out = decomposeToLastBoundary(rb, buf) // force decomposition
190 info = fd.info(src, p)
191 if p == 0 && !fd.boundaryBefore(fd, info) {
192 out = decomposeToLastBoundary(rb, out)
195 if info.size == 0 || fd.boundaryBefore(fd, info) {
201 // Append incomplete UTF-8 encoding.
202 return src.appendSlice(out, p, n)
207 out, p = appendQuick(rb, out, p)
210 p = decomposeSegment(rb, p)
215 out, p = appendQuick(rb, out, p)
220 // AppendString returns f(append(out, []byte(s))).
221 // The buffer out must be nil, empty, or equal to f(out).
222 func (f Form) AppendString(out []byte, src string) []byte {
226 rb := reorderBuffer{}
227 rb.initString(f, src)
228 return doAppend(&rb, out, 0)
231 // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
232 // It is not guaranteed to return the largest such n.
233 func (f Form) QuickSpan(b []byte) int {
234 rb := reorderBuffer{}
236 n := quickSpan(&rb, 0)
240 func quickSpan(rb *reorderBuffer, i int) int {
244 src, n := rb.src, rb.nsrc
246 if j := src.skipASCII(i); i != j {
253 info := rb.f.info(src, i)
255 // include incomplete runes
260 if !info.flags.isYesC() {
264 if !info.flags.isYesD() {
272 if nc >= maxCombiningChars {
295 // QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]).
296 // It is not guaranteed to return the largest such n.
297 func (f Form) QuickSpanString(s string) int {
298 rb := reorderBuffer{}
300 return quickSpan(&rb, 0)
303 // FirstBoundary returns the position i of the first boundary in b
304 // or -1 if b contains no boundary.
305 func (f Form) FirstBoundary(b []byte) int {
306 rb := reorderBuffer{}
308 return firstBoundary(&rb)
311 func firstBoundary(rb *reorderBuffer) int {
312 src, nsrc := rb.src, rb.nsrc
313 i := src.skipNonStarter(0)
318 info := fd.info(src, i)
319 for n := 0; info.size != 0 && !fd.boundaryBefore(fd, info); {
321 if n++; n >= maxCombiningChars {
325 if !fd.boundaryAfter(fd, info) {
330 info = fd.info(src, i)
338 // FirstBoundaryInString returns the position i of the first boundary in s
339 // or -1 if s contains no boundary.
340 func (f Form) FirstBoundaryInString(s string) int {
341 rb := reorderBuffer{}
343 return firstBoundary(&rb)
346 // LastBoundary returns the position i of the last boundary in b
347 // or -1 if b contains no boundary.
348 func (f Form) LastBoundary(b []byte) int {
349 return lastBoundary(formTable[f], b)
352 func lastBoundary(fd *formInfo, b []byte) int {
354 info, p := lastRuneStart(fd, b)
358 if info.size == 0 { // ends with incomplete rune
359 if p == 0 { // starts wtih incomplete rune
363 info, p = lastRuneStart(fd, b[:i])
364 if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
368 if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
371 if fd.boundaryAfter(fd, info) {
375 for n := 0; i >= 0 && !fd.boundaryBefore(fd, info); {
376 info, p = lastRuneStart(fd, b[:i])
377 if n++; n >= maxCombiningChars {
380 if p+int(info.size) != i {
381 if p == -1 { // no boundary found
384 return i // boundary after an illegal UTF-8 encoding
391 // decomposeSegment scans the first segment in src into rb.
392 // It returns the number of bytes consumed from src.
393 // TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
394 // when we detect a sequence of 30+ non-starter chars.
395 func decomposeSegment(rb *reorderBuffer, sp int) int {
396 // Force one character to be consumed.
397 info := rb.f.info(rb.src, sp)
401 for rb.insert(rb.src, sp, info) {
406 info = rb.f.info(rb.src, sp)
407 bound := rb.f.boundaryBefore(&rb.f, info)
408 if bound || info.size == 0 {
415 // lastRuneStart returns the runeInfo and position of the last
416 // rune in buf or the zero runeInfo and -1 if no rune was found.
417 func lastRuneStart(fd *formInfo, buf []byte) (runeInfo, int) {
419 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
422 return runeInfo{0, 0, 0, 0}, -1
424 return fd.info(inputBytes(buf), p), p
427 // decomposeToLastBoundary finds an open segment at the end of the buffer
428 // and scans it into rb. Returns the buffer minus the last segment.
429 func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
431 info, i := lastRuneStart(fd, buf)
432 if int(info.size) != len(buf)-i {
433 // illegal trailing continuation bytes
436 if rb.f.boundaryAfter(fd, info) {
439 var add [maxBackRunes]runeInfo // stores runeInfo in reverse order
443 p := len(buf) - int(info.size)
444 for ; p >= 0 && !rb.f.boundaryBefore(fd, info); p -= int(info.size) {
445 info, i = lastRuneStart(fd, buf[:p])
446 if int(info.size) != p-i {
449 // Check that decomposition doesn't result in overflow.
450 if info.flags.hasDecomposition() {
451 dcomp := rb.f.decompose(inputBytes(buf), p-int(info.size))
452 for i := 0; i < len(dcomp); {
453 inf := rb.f.info(inputBytes(dcomp), i)
460 if n > maxBackRunes {
467 for padd--; padd >= 0; padd-- {
469 rb.insert(inputBytes(buf), pp, info)