parse.go

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package language
     6  
     7  import (
     8  	"bytes"
     9  	"errors"
    10  	"fmt"
    11  	"sort"
    12  
    13  	"golang.org/x/text/internal/tag"
    14  )
    15  
    16  // isAlpha returns true if the byte is not a digit.
    17  // b must be an ASCII letter or digit.
    18  func isAlpha(b byte) bool {
    19  	return b > '9'
    20  }
    21  
    22  // isAlphaNum returns true if the string contains only ASCII letters or digits.
    23  func isAlphaNum(s []byte) bool {
    24  	for _, c := range s {
    25  		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
    26  			return false
    27  		}
    28  	}
    29  	return true
    30  }
    31  
    32  // ErrSyntax is returned by any of the parsing functions when the
    33  // input is not well-formed, according to BCP 47.
    34  // TODO: return the position at which the syntax error occurred?
    35  var ErrSyntax = errors.New("language: tag is not well-formed")
    36  
    37  // ErrDuplicateKey is returned when a tag contains the same key twice with
    38  // different values in the -u section.
    39  var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")
    40  
    41  // ValueError is returned by any of the parsing functions when the
    42  // input is well-formed but the respective subtag is not recognized
    43  // as a valid value.
    44  type ValueError struct {
    45  	v [8]byte
    46  }
    47  
    48  // NewValueError creates a new ValueError.
    49  func NewValueError(tag []byte) ValueError {
    50  	var e ValueError
    51  	copy(e.v[:], tag)
    52  	return e
    53  }
    54  
    55  func (e ValueError) tag() []byte {
    56  	n := bytes.IndexByte(e.v[:], 0)
    57  	if n == -1 {
    58  		n = 8
    59  	}
    60  	return e.v[:n]
    61  }
    62  
    63  // Error implements the error interface.
    64  func (e ValueError) Error() string {
    65  	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
    66  }
    67  
    68  // Subtag returns the subtag for which the error occurred.
    69  func (e ValueError) Subtag() string {
    70  	return string(e.tag())
    71  }
    72  
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
//
// The scanner edits its buffer in place while scanning: init rewrites '_'
// to '-', and malformed tokens are removed by gobble. The positions below
// therefore always refer to the current contents of b.
type scanner struct {
	b     []byte                      // buffer being scanned and rewritten
	bytes [max99thPercentileSize]byte // fixed-size storage makeScannerString uses for inputs that fit
	token []byte                      // current token; reset to nil at the start of every scan
	start int                         // start position of the current token
	end   int                         // end position of the current token
	next  int                         // next point for scan
	err   error                       // error recorded through setError, if any
	done  bool                        // set by scan once the buffer is exhausted
}
    84  
    85  func makeScannerString(s string) scanner {
    86  	scan := scanner{}
    87  	if len(s) <= len(scan.bytes) {
    88  		scan.b = scan.bytes[:copy(scan.bytes[:], s)]
    89  	} else {
    90  		scan.b = []byte(s)
    91  	}
    92  	scan.init()
    93  	return scan
    94  }
    95  
    96  // makeScanner returns a scanner using b as the input buffer.
    97  // b is not copied and may be modified by the scanner routines.
    98  func makeScanner(b []byte) scanner {
    99  	scan := scanner{b: b}
   100  	scan.init()
   101  	return scan
   102  }
   103  
   104  func (s *scanner) init() {
   105  	for i, c := range s.b {
   106  		if c == '_' {
   107  			s.b[i] = '-'
   108  		}
   109  	}
   110  	s.scan()
   111  }
   112  
   113  // restToLower converts the string between start and end to lower case.
   114  func (s *scanner) toLower(start, end int) {
   115  	for i := start; i < end; i++ {
   116  		c := s.b[i]
   117  		if 'A' <= c && c <= 'Z' {
   118  			s.b[i] += 'a' - 'A'
   119  		}
   120  	}
   121  }
   122  
   123  func (s *scanner) setError(e error) {
   124  	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
   125  		s.err = e
   126  	}
   127  }
   128  
   129  // resizeRange shrinks or grows the array at position oldStart such that
   130  // a new string of size newSize can fit between oldStart and oldEnd.
   131  // Sets the scan point to after the resized range.
   132  func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
   133  	s.start = oldStart
   134  	if end := oldStart + newSize; end != oldEnd {
   135  		diff := end - oldEnd
   136  		var b []byte
   137  		if n := len(s.b) + diff; n > cap(s.b) {
   138  			b = make([]byte, n)
   139  			copy(b, s.b[:oldStart])
   140  		} else {
   141  			b = s.b[:n]
   142  		}
   143  		copy(b[end:], s.b[oldEnd:])
   144  		s.b = b
   145  		s.next = end + (s.next - s.end)
   146  		s.end = end
   147  	}
   148  }
   149  
   150  // replace replaces the current token with repl.
   151  func (s *scanner) replace(repl string) {
   152  	s.resizeRange(s.start, s.end, len(repl))
   153  	copy(s.b[s.start:], repl)
   154  }
   155  
   156  // gobble removes the current token from the input.
   157  // Caller must call scan after calling gobble.
   158  func (s *scanner) gobble(e error) {
   159  	s.setError(e)
   160  	if s.start == 0 {
   161  		s.b = s.b[:+copy(s.b, s.b[s.next:])]
   162  		s.end = 0
   163  	} else {
   164  		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
   165  		s.end = s.start - 1
   166  	}
   167  	s.next = s.start
   168  }
   169  
   170  // deleteRange removes the given range from s.b before the current token.
   171  func (s *scanner) deleteRange(start, end int) {
   172  	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
   173  	diff := end - start
   174  	s.next -= diff
   175  	s.start -= diff
   176  	s.end -= diff
   177  }
   178  
   179  // scan parses the next token of a BCP 47 string.  Tokens that are larger
   180  // than 8 characters or include non-alphanumeric characters result in an error
   181  // and are gobbled and removed from the output.
   182  // It returns the end position of the last token consumed.
   183  func (s *scanner) scan() (end int) {
   184  	end = s.end
   185  	s.token = nil
   186  	for s.start = s.next; s.next < len(s.b); {
   187  		i := bytes.IndexByte(s.b[s.next:], '-')
   188  		if i == -1 {
   189  			s.end = len(s.b)
   190  			s.next = len(s.b)
   191  			i = s.end - s.start
   192  		} else {
   193  			s.end = s.next + i
   194  			s.next = s.end + 1
   195  		}
   196  		token := s.b[s.start:s.end]
   197  		if i < 1 || i > 8 || !isAlphaNum(token) {
   198  			s.gobble(ErrSyntax)
   199  			continue
   200  		}
   201  		s.token = token
   202  		return end
   203  	}
   204  	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
   205  		s.setError(ErrSyntax)
   206  		s.b = s.b[:len(s.b)-1]
   207  	}
   208  	s.done = true
   209  	return end
   210  }
   211  
   212  // acceptMinSize parses multiple tokens of the given size or greater.
   213  // It returns the end position of the last token consumed.
   214  func (s *scanner) acceptMinSize(min int) (end int) {
   215  	end = s.end
   216  	s.scan()
   217  	for ; len(s.token) >= min; s.scan() {
   218  		end = s.end
   219  	}
   220  	return end
   221  }
   222  
   223  // Parse parses the given BCP 47 string and returns a valid Tag. If parsing
   224  // failed it returns an error and any part of the tag that could be parsed.
   225  // If parsing succeeded but an unknown value was found, it returns
   226  // ValueError. The Tag returned in this case is just stripped of the unknown
   227  // value. All other values are preserved. It accepts tags in the BCP 47 format
   228  // and extensions to this standard defined in
   229  // https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
   230  func Parse(s string) (t Tag, err error) {
   231  	// TODO: consider supporting old-style locale key-value pairs.
   232  	if s == "" {
   233  		return Und, ErrSyntax
   234  	}
   235  	defer func() {
   236  		if recover() != nil {
   237  			t = Und
   238  			err = ErrSyntax
   239  			return
   240  		}
   241  	}()
   242  	if len(s) <= maxAltTaglen {
   243  		b := [maxAltTaglen]byte{}
   244  		for i, c := range s {
   245  			// Generating invalid UTF-8 is okay as it won't match.
   246  			if 'A' <= c && c <= 'Z' {
   247  				c += 'a' - 'A'
   248  			} else if c == '_' {
   249  				c = '-'
   250  			}
   251  			b[i] = byte(c)
   252  		}
   253  		if t, ok := grandfathered(b); ok {
   254  			return t, nil
   255  		}
   256  	}
   257  	scan := makeScannerString(s)
   258  	return parse(&scan, s)
   259  }
   260  
   261  func parse(scan *scanner, s string) (t Tag, err error) {
   262  	t = Und
   263  	var end int
   264  	if n := len(scan.token); n <= 1 {
   265  		scan.toLower(0, len(scan.b))
   266  		if n == 0 || scan.token[0] != 'x' {
   267  			return t, ErrSyntax
   268  		}
   269  		end = parseExtensions(scan)
   270  	} else if n >= 4 {
   271  		return Und, ErrSyntax
   272  	} else { // the usual case
   273  		t, end = parseTag(scan, true)
   274  		if n := len(scan.token); n == 1 {
   275  			t.pExt = uint16(end)
   276  			end = parseExtensions(scan)
   277  		} else if end < len(scan.b) {
   278  			scan.setError(ErrSyntax)
   279  			scan.b = scan.b[:end]
   280  		}
   281  	}
   282  	if int(t.pVariant) < len(scan.b) {
   283  		if end < len(s) {
   284  			s = s[:end]
   285  		}
   286  		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
   287  			t.str = s
   288  		} else {
   289  			t.str = string(scan.b)
   290  		}
   291  	} else {
   292  		t.pVariant, t.pExt = 0, 0
   293  	}
   294  	return t, scan.err
   295  }
   296  
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
// If doNorm is true, then <lang>-<extlang> will be normalized to <extlang>.
//
// The scanner must be positioned on the language subtag. Subtags are
// rewritten in scan.b while they are parsed, so positions refer to the
// rewritten buffer rather than to the original input. Errors are recorded
// on the scanner via setError/gobble instead of being returned.
func parseTag(scan *scanner, doNorm bool) (t Tag, end int) {
	var e error
	// TODO: set an error if an unknown lang, script or region is encountered.
	t.LangID, e = getLangID(scan.token)
	scan.setError(e)
	// Replace the language token by the string form of the resolved ID.
	scan.replace(t.LangID.String())
	// Remember where the language starts so an extlang can overwrite it.
	langStart := scan.start
	// scan returns the end of the token that was current before the call,
	// i.e. the end of the part of the tag accepted so far.
	end = scan.scan()
	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
		// to a tag of the form <extlang>.
		if doNorm {
			// This e shadows the outer one; it is handed to gobble below.
			lang, e := getLangID(scan.token)
			if lang != 0 {
				t.LangID = lang
				langStr := lang.String()
				// Write the extlang's language over the original language and
				// move the token start right after it, so that gobble removes
				// everything between the new language and the end of the
				// extlang token.
				copy(scan.b[langStart:], langStr)
				scan.b[langStart+len(langStr)] = '-'
				scan.start = langStart + len(langStr) + 1
			}
			// The extlang token is removed in either case.
			scan.gobble(e)
		}
		end = scan.scan()
	}
	// A 4-byte token starting with a letter is treated as a script.
	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
		t.ScriptID, e = getScriptID(script, scan.token)
		if t.ScriptID == 0 {
			// No script ID resolved: drop the token and record e.
			scan.gobble(e)
		}
		end = scan.scan()
	}
	// A token of 2 or 3 bytes at this point is treated as a region.
	if n := len(scan.token); n >= 2 && n <= 3 {
		t.RegionID, e = getRegionID(scan.token)
		if t.RegionID == 0 {
			// No region ID resolved: drop the token and record e.
			scan.gobble(e)
		} else {
			scan.replace(t.RegionID.String())
		}
		end = scan.scan()
	}
	// Everything from the current token onwards is lower-cased in place.
	scan.toLower(scan.start, len(scan.b))
	// NOTE(review): end is narrowed to a byte here; presumably the offset of
	// the variant section always fits — confirm against the Tag definition.
	t.pVariant = byte(end)
	end = parseVariants(scan, end, t)
	t.pExt = uint16(end)
	return t, end
}
   346  
   347  var separator = []byte{'-'}
   348  
// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
//
// end is the end position of the tag parsed so far; the returned value is
// the end position after the accepted variants. Tokens of 4 or more bytes
// that are not found in variantIndex are removed from the buffer and
// recorded as a ValueError. When the variants are found out of order (or
// repeated), they are sorted by their variantIndex value, de-duplicated and
// written back to the buffer. The t parameter is not used by the body.
func parseVariants(scan *scanner, end int, t Tag) int {
	// start is the position of the first candidate variant token.
	start := scan.start
	// Fixed-size backing arrays for the first four variants; append
	// allocates larger storage if more are encountered.
	varIDBuf := [4]uint8{}
	variantBuf := [4][]byte{}
	varID := varIDBuf[:0]
	variant := variantBuf[:0]
	// last is the highest variant index seen while the input is still in
	// strictly increasing order.
	last := -1
	needSort := false
	for ; len(scan.token) >= 4; scan.scan() {
		// TODO: measure the impact of needing this conversion and redesign
		// the data structure if there is an issue.
		v, ok := variantIndex[string(scan.token)]
		if !ok {
			// unknown variant
			// TODO: allow user-defined variants?
			scan.gobble(NewValueError(scan.token))
			continue
		}
		varID = append(varID, v)
		variant = append(variant, scan.token)
		if !needSort {
			if last < int(v) {
				last = int(v)
			} else {
				// Index not strictly greater than the previous one: the
				// variants are out of order or contain a duplicate.
				needSort = true
				// There is no legal combinations of more than 7 variants
				// (and this is by no means a useful sequence).
				// NOTE(review): this limit is only evaluated on the iteration
				// that first sets needSort, not on later iterations.
				const maxVariants = 8
				if len(varID) > maxVariants {
					break
				}
			}
		}
		end = scan.end
	}
	if needSort {
		sort.Sort(variantsSort{varID, variant})
		// Compact the sorted slices in place, keeping the first entry of
		// every run of equal variant indexes.
		k, l := 0, -1
		for i, v := range varID {
			w := int(v)
			if l == w {
				// Remove duplicates.
				continue
			}
			varID[k] = varID[i]
			variant[k] = variant[i]
			k++
			l = w
		}
		// bytes.Join builds the joined variants in a new slice before the
		// buffer region they came from is resized and overwritten.
		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
			// Nothing left: the end moves back to before the separator
			// that preceded the variants.
			end = start - 1
		} else {
			scan.resizeRange(start, end, len(str))
			copy(scan.b[scan.start:], str)
			end = scan.end
		}
	}
	return end
}
   410  
   411  type variantsSort struct {
   412  	i []uint8
   413  	v [][]byte
   414  }
   415  
   416  func (s variantsSort) Len() int {
   417  	return len(s.i)
   418  }
   419  
   420  func (s variantsSort) Swap(i, j int) {
   421  	s.i[i], s.i[j] = s.i[j], s.i[i]
   422  	s.v[i], s.v[j] = s.v[j], s.v[i]
   423  }
   424  
   425  func (s variantsSort) Less(i, j int) bool {
   426  	return s.i[i] < s.i[j]
   427  }
   428  
   429  type bytesSort struct {
   430  	b [][]byte
   431  	n int // first n bytes to compare
   432  }
   433  
   434  func (b bytesSort) Len() int {
   435  	return len(b.b)
   436  }
   437  
   438  func (b bytesSort) Swap(i, j int) {
   439  	b.b[i], b.b[j] = b.b[j], b.b[i]
   440  }
   441  
   442  func (b bytesSort) Less(i, j int) bool {
   443  	for k := 0; k < b.n; k++ {
   444  		if b.b[i][k] == b.b[j][k] {
   445  			continue
   446  		}
   447  		return b.b[i][k] < b.b[j][k]
   448  	}
   449  	return false
   450  }
   451  
   452  // parseExtensions parses and normalizes the extensions in the buffer.
   453  // It returns the last position of scan.b that is part of any extension.
   454  // It also trims scan.b to remove excess parts accordingly.
   455  func parseExtensions(scan *scanner) int {
   456  	start := scan.start
   457  	exts := [][]byte{}
   458  	private := []byte{}
   459  	end := scan.end
   460  	for len(scan.token) == 1 {
   461  		extStart := scan.start
   462  		ext := scan.token[0]
   463  		end = parseExtension(scan)
   464  		extension := scan.b[extStart:end]
   465  		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
   466  			scan.setError(ErrSyntax)
   467  			end = extStart
   468  			continue
   469  		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
   470  			scan.b = scan.b[:end]
   471  			return end
   472  		} else if ext == 'x' {
   473  			private = extension
   474  			break
   475  		}
   476  		exts = append(exts, extension)
   477  	}
   478  	sort.Sort(bytesSort{exts, 1})
   479  	if len(private) > 0 {
   480  		exts = append(exts, private)
   481  	}
   482  	scan.b = scan.b[:start]
   483  	if len(exts) > 0 {
   484  		scan.b = append(scan.b, bytes.Join(exts, separator)...)
   485  	} else if start > 0 {
   486  		// Strip trailing '-'.
   487  		scan.b = scan.b[:start-1]
   488  	}
   489  	return end
   490  }
   491  
// parseExtension parses a single extension and returns the position of
// the extension end.
//
// The scanner must be positioned on the extension's single-letter token.
// For the 'u' extension, attributes and keys are reordered in place in
// scan.b and ErrDuplicateKey is recorded when a key occurs with different
// values. On return the scanner is positioned on the first token that is
// not part of the extension.
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
		// attrStart is the end of the 'u' singleton; attributes, if any,
		// begin one byte later.
		attrStart := end
		scan.scan()
		// Attributes are the tokens longer than 2 bytes that directly
		// follow the singleton.
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			// NOTE(review): this condition holds whenever token >= last,
			// which includes the first attribute (last is empty), so the
			// re-sort path below is taken as soon as any attribute exists.
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				// Only the first 3 bytes of each attribute are compared.
				sort.Sort(bytesSort{attrs, 3})
				// bytes.Join copies the attributes before they are written
				// back over the region they were read from.
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		// Scan key-type sequences. A key is of length 2 and may be followed
		// by 0 or more "type" subtags from 3 to the maximum of 8 letters.
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			end = scan.end
			// Consume the type subtags of this key. The end < scan.end test
			// stops the loop when scan made no forward progress.
			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
				end = scan.end
			}
			// TODO: check key value validity
			if bytes.Compare(key, last) != 1 || scan.err != nil {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				for scan.scan(); len(scan.token) == 2; {
					keyStart := scan.start
					end = scan.end
					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
						end = scan.end
					}
					// Each entry holds a key together with its type subtags.
					keys = append(keys, scan.b[keyStart:end])
				}
				// Stable sort on the 2-byte key keeps the first occurrence
				// of a repeated key in front.
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					// Drop repeated keys, keeping the first occurrence; a
					// repeat with a different value is reported as an error.
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				// Removing duplicates may have shortened the key section;
				// delete the bytes that are no longer needed.
				if e := p + len(reordered); e < end {
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't': // https://www.ietf.org/rfc/rfc6497.txt
		scan.scan()
		// A token of 2 or 3 bytes whose second byte is not a digit starts
		// an embedded language tag, parsed without extlang normalization.
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan, false)
			scan.toLower(start, end)
		}
		// Two-byte tokens whose second byte is a digit are followed by
		// subtags of at least 3 bytes.
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		// Private use: every following token of at least 1 byte belongs to it.
		end = scan.acceptMinSize(1)
	default:
		// Any other singleton: accept the following tokens of 2+ bytes.
		end = scan.acceptMinSize(2)
	}
	return end
}
   579  
   580  // getExtension returns the name, body and end position of the extension.
   581  func getExtension(s string, p int) (end int, ext string) {
   582  	if s[p] == '-' {
   583  		p++
   584  	}
   585  	if s[p] == 'x' {
   586  		return len(s), s[p:]
   587  	}
   588  	end = nextExtension(s, p)
   589  	return end, s[p:end]
   590  }
   591  
   592  // nextExtension finds the next extension within the string, searching
   593  // for the -<char>- pattern from position p.
   594  // In the fast majority of cases, language tags will have at most
   595  // one extension and extensions tend to be small.
   596  func nextExtension(s string, p int) int {
   597  	for n := len(s) - 3; p < n; {
   598  		if s[p] == '-' {
   599  			if s[p+2] == '-' {
   600  				return p
   601  			}
   602  			p += 3
   603  		} else {
   604  			p++
   605  		}
   606  	}
   607  	return len(s)
   608  }
   609
View as plain text