1
2
3
4
5 package norm
6
7 import (
8 "fmt"
9 "unicode/utf8"
10 )
11
12
13
// MaxSegmentSize is the exported name for maxByteBufferSize, which is also
// the size of the scratch buffer an Iter uses to assemble a segment.
// NOTE(review): presumably this bounds the length of any segment Next can
// return — confirm against the definition of maxByteBufferSize.
const MaxSegmentSize = maxByteBufferSize
15
16
17
// An Iter iterates over a string or byte slice and returns its contents one
// segment at a time, processed according to the Form passed to Init or
// InitString.
type Iter struct {
	rb     reorderBuffer           // holds the input (src, nsrc), the form info (f) and the stream-safe state (ss)
	buf    [maxByteBufferSize]byte // scratch buffer backing segments that cannot be returned as a slice of the input
	info   Properties              // properties of the rune at position p, as looked up through rb.f.info
	next   iterFunc                // step function the next call to Next will run
	asciiF iterFunc                // single-byte fast path: nextASCIIBytes or nextASCIIString, depending on the input kind

	p        int    // current byte position in the input
	multiSeg []byte // not-yet-returned remainder of a multi-segment decomposition
}
28
// iterFunc is the signature of the step functions an Iter switches between
// (stored in Iter.next and Iter.asciiF). A step function returns the next
// segment and may replace Iter.next to change how the following call to Next
// is handled.
type iterFunc func(*Iter) []byte
30
31
32 func (i *Iter) Init(f Form, src []byte) {
33 i.p = 0
34 if len(src) == 0 {
35 i.setDone()
36 i.rb.nsrc = 0
37 return
38 }
39 i.multiSeg = nil
40 i.rb.init(f, src)
41 i.next = i.rb.f.nextMain
42 i.asciiF = nextASCIIBytes
43 i.info = i.rb.f.info(i.rb.src, i.p)
44 i.rb.ss.first(i.info)
45 }
46
47
48 func (i *Iter) InitString(f Form, src string) {
49 i.p = 0
50 if len(src) == 0 {
51 i.setDone()
52 i.rb.nsrc = 0
53 return
54 }
55 i.multiSeg = nil
56 i.rb.initString(f, src)
57 i.next = i.rb.f.nextMain
58 i.asciiF = nextASCIIString
59 i.info = i.rb.f.info(i.rb.src, i.p)
60 i.rb.ss.first(i.info)
61 }
62
63
64
65
66 func (i *Iter) Seek(offset int64, whence int) (int64, error) {
67 var abs int64
68 switch whence {
69 case 0:
70 abs = offset
71 case 1:
72 abs = int64(i.p) + offset
73 case 2:
74 abs = int64(i.rb.nsrc) + offset
75 default:
76 return 0, fmt.Errorf("norm: invalid whence")
77 }
78 if abs < 0 {
79 return 0, fmt.Errorf("norm: negative position")
80 }
81 if int(abs) >= i.rb.nsrc {
82 i.setDone()
83 return int64(i.p), nil
84 }
85 i.p = int(abs)
86 i.multiSeg = nil
87 i.next = i.rb.f.nextMain
88 i.info = i.rb.f.info(i.rb.src, i.p)
89 i.rb.ss.first(i.info)
90 return abs, nil
91 }
92
93
94
95
96
97 func (i *Iter) returnSlice(a, b int) []byte {
98 if i.rb.src.bytes == nil {
99 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
100 }
101 return i.rb.src.bytes[a:b]
102 }
103
104
// Pos returns the current byte position in the input, which is where the
// next call to Next will commence processing.
func (i *Iter) Pos() int {
	return i.p
}
108
109 func (i *Iter) setDone() {
110 i.next = nextDone
111 i.p = i.rb.nsrc
112 }
113
114
115 func (i *Iter) Done() bool {
116 return i.p >= i.rb.nsrc
117 }
118
119
120
121
122
123
124 func (i *Iter) Next() []byte {
125 return i.next(i)
126 }
127
128 func nextASCIIBytes(i *Iter) []byte {
129 p := i.p + 1
130 if p >= i.rb.nsrc {
131 p0 := i.p
132 i.setDone()
133 return i.rb.src.bytes[p0:p]
134 }
135 if i.rb.src.bytes[p] < utf8.RuneSelf {
136 p0 := i.p
137 i.p = p
138 return i.rb.src.bytes[p0:p]
139 }
140 i.info = i.rb.f.info(i.rb.src, i.p)
141 i.next = i.rb.f.nextMain
142 return i.next(i)
143 }
144
145 func nextASCIIString(i *Iter) []byte {
146 p := i.p + 1
147 if p >= i.rb.nsrc {
148 i.buf[0] = i.rb.src.str[i.p]
149 i.setDone()
150 return i.buf[:1]
151 }
152 if i.rb.src.str[p] < utf8.RuneSelf {
153 i.buf[0] = i.rb.src.str[i.p]
154 i.p = p
155 return i.buf[:1]
156 }
157 i.info = i.rb.f.info(i.rb.src, i.p)
158 i.next = i.rb.f.nextMain
159 return i.next(i)
160 }
161
162 func nextHangul(i *Iter) []byte {
163 p := i.p
164 next := p + hangulUTF8Size
165 if next >= i.rb.nsrc {
166 i.setDone()
167 } else if i.rb.src.hangul(next) == 0 {
168 i.rb.ss.next(i.info)
169 i.info = i.rb.f.info(i.rb.src, i.p)
170 i.next = i.rb.f.nextMain
171 return i.next(i)
172 }
173 i.p = next
174 return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
175 }
176
177 func nextDone(i *Iter) []byte {
178 return nil
179 }
180
181
182
183 func nextMulti(i *Iter) []byte {
184 j := 0
185 d := i.multiSeg
186
187 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
188 }
189 for j < len(d) {
190 info := i.rb.f.info(input{bytes: d}, j)
191 if info.BoundaryBefore() {
192 i.multiSeg = d[j:]
193 return d[:j]
194 }
195 j += int(info.size)
196 }
197
198 i.next = i.rb.f.nextMain
199 return i.next(i)
200 }
201
202
203
204 func nextMultiNorm(i *Iter) []byte {
205 j := 0
206 d := i.multiSeg
207 for j < len(d) {
208 info := i.rb.f.info(input{bytes: d}, j)
209 if info.BoundaryBefore() {
210 i.rb.compose()
211 seg := i.buf[:i.rb.flushCopy(i.buf[:])]
212 i.rb.insertUnsafe(input{bytes: d}, j, info)
213 i.multiSeg = d[j+int(info.size):]
214 return seg
215 }
216 i.rb.insertUnsafe(input{bytes: d}, j, info)
217 j += int(info.size)
218 }
219 i.multiSeg = nil
220 i.next = nextComposed
221 return doNormComposed(i)
222 }
223
224
// nextDecomposed is a step function that returns the next segment with its
// runes replaced by their decompositions.
// NOTE(review): presumably this is the nextMain of the decomposing forms
// (NFD/NFKD) — confirm against the form tables.
//
// The segment is returned as a slice of the input when nothing had to be
// rewritten, as the decomposition slice itself when the segment is a single
// decomposed rune, and otherwise as a prefix of i.buf. When a rune's ccc is
// lower than the tccc of the rune before it, the work so far is moved to the
// reorder buffer and doNormDecomposed finishes the segment.
func nextDecomposed(i *Iter) (next []byte) {
	// outp is the number of output bytes accounted for so far. Input bytes
	// that are taken over verbatim are not copied immediately:
	// [inCopyStart, i.p) is the pending input range and outCopyStart is the
	// offset in i.buf where it belongs.
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			// Size 0 or 1: advance by exactly one byte and reset the
			// stream-safe state.
			i.rb.ss = 0
			p := i.p
			i.p++
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Another single-byte value follows: continue on the
				// single-byte fast path.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// The rune has a decomposition d; p is the output length once d
			// has been appended.
			p := outp + len(d)
			if outp > 0 {
				// Output is already pending, so the segment has to be
				// assembled in i.buf: bring over the verbatim input first.
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// Defensive check: if d does not fit in i.buf, return what
				// has been gathered so far.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// Only reached with outp == 0. On first sight of a
				// multi-segment decomposition, let nextMulti return it
				// piece by piece.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// nextMulti handed back its last part in i.multiSeg: treat
				// that part as an ordinary decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				// Continue with zero-valued properties for the end of input.
				i.info = Properties{}
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				// The following rune is handled by nextCGJDecompose on the
				// next call.
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				// The segment ends here. Append d to pending output, or
				// return d directly if it is the whole segment.
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			// The segment continues: append d and restart the pending
			// verbatim range after it.
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				// Combining classes are out of order: reordering needed.
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			// src.hangul returned a non-zero rune: write its decomposition
			// at the start of i.buf.
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				// Another Hangul rune follows: use the Hangul fast path.
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			// No decomposition: the rune is taken over verbatim, provided
			// it still fits within the size of i.buf.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		// Common tail for the branches above that did not return or
		// continue: look at the next rune to decide whether the segment ends.
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			// Combining classes are out of order: reordering needed.
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// The pending range starts at offset 0 of the output: return the
		// input range directly.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		// Flush the pending verbatim input behind what is already in i.buf.
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Move everything decomposed so far into the reorder buffer and let
	// doNormDecomposed complete the segment from there.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}
340
341 func doNormDecomposed(i *Iter) []byte {
342 for {
343 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
344 if i.p += int(i.info.size); i.p >= i.rb.nsrc {
345 i.setDone()
346 break
347 }
348 i.info = i.rb.f.info(i.rb.src, i.p)
349 if i.info.ccc == 0 {
350 break
351 }
352 if s := i.rb.ss.next(i.info); s == ssOverflow {
353 i.next = nextCGJDecompose
354 break
355 }
356 }
357
358 return i.buf[:i.rb.flushCopy(i.buf[:])]
359 }
360
361 func nextCGJDecompose(i *Iter) []byte {
362 i.rb.ss = 0
363 i.rb.insertCGJ()
364 i.next = nextDecomposed
365 i.rb.ss.first(i.info)
366 buf := doNormDecomposed(i)
367 return buf
368 }
369
370
// nextComposed is a step function that returns the next segment in composed
// form.
// NOTE(review): presumably this is the nextMain of the composing forms
// (NFC/NFKC) — confirm against the form tables.
//
// As long as every rune reports isYesC and the combining classes do not
// decrease, the segment is returned as a slice of the input (via
// returnSlice). Otherwise the scan restarts from the beginning of the segment
// and the segment is built through the reorder buffer.
func nextComposed(i *Iter) []byte {
	// outp tracks the length of the segment scanned so far; startp is where
	// the segment begins in the input.
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			// The rune cannot be passed through as is.
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			// Size 0: still advance, one byte at a time.
			sz = 1
		}
		p := outp + sz
		if p > len(i.buf) {
			// The segment would exceed the size of i.buf: end it here.
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// A single-byte value follows: end the segment and continue on
			// the single-byte fast path.
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			// Combining classes are out of order: reordering needed.
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// Reset to the start of the segment and redo it via the reorder buffer.
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		// Seed the reorder buffer with the first rune of the decomposition
		// and let nextMultiNorm walk through the remainder.
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	// NOTE(review): ss.first was already called with the same i.info a few
	// lines up; this second call looks redundant — confirm before removing.
	i.rb.ss.first(i.info)
	// doNormComposed expects the first rune to be in the buffer already.
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}
426
427 func doNormComposed(i *Iter) []byte {
428
429 for {
430 if i.p += int(i.info.size); i.p >= i.rb.nsrc {
431 i.setDone()
432 break
433 }
434 i.info = i.rb.f.info(i.rb.src, i.p)
435 if s := i.rb.ss.next(i.info); s == ssStarter {
436 break
437 } else if s == ssOverflow {
438 i.next = nextCGJCompose
439 break
440 }
441 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
442 }
443 i.rb.compose()
444 seg := i.buf[:i.rb.flushCopy(i.buf[:])]
445 return seg
446 }
447
448 func nextCGJCompose(i *Iter) []byte {
449 i.rb.ss = 0
450 i.rb.insertCGJ()
451 i.next = nextComposed
452
453
454
455 i.rb.ss.first(i.info)
456 i.rb.insertUnsafe(i.rb.src, i.p, i.info)
457 return doNormComposed(i)
458 }
459
View as plain text