match.go

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package language
     6  
     7  import (
     8  	"errors"
     9  	"strings"
    10  
    11  	"golang.org/x/text/internal/language"
    12  )
    13  
// A MatchOption configures a Matcher. Options are passed to NewMatcher, which
// applies each of them to the matcher it creates.
type MatchOption func(*matcher)
    16  
    17  // PreferSameScript will, in the absence of a match, result in the first
    18  // preferred tag with the same script as a supported tag to match this supported
    19  // tag. The default is currently true, but this may change in the future.
    20  func PreferSameScript(preferSame bool) MatchOption {
    21  	return func(m *matcher) { m.preferSameScript = preferSame }
    22  }
    23  
    24  // TODO(v1.0.0): consider making Matcher a concrete type, instead of interface.
    25  // There doesn't seem to be too much need for multiple types.
    26  // Making it a concrete type allows MatchStrings to be a method, which will
    27  // improve its discoverability.
    28  
    29  // MatchStrings parses and matches the given strings until one of them matches
    30  // the language in the Matcher. A string may be an Accept-Language header as
    31  // handled by ParseAcceptLanguage. The default language is returned if no
    32  // other language matched.
    33  func MatchStrings(m Matcher, lang ...string) (tag Tag, index int) {
    34  	for _, accept := range lang {
    35  		desired, _, err := ParseAcceptLanguage(accept)
    36  		if err != nil {
    37  			continue
    38  		}
    39  		if tag, index, conf := m.Match(desired...); conf != No {
    40  			return tag, index
    41  		}
    42  	}
    43  	tag, index, _ = m.Match()
    44  	return
    45  }
    46  
// Matcher is the interface that wraps the Match method.
//
// Match returns the best match for any of the given tags, along with
// a unique index associated with the returned tag and a confidence
// score.
//
// NewMatcher returns the implementation defined in this file.
type Matcher interface {
	Match(t ...Tag) (tag Tag, index int, c Confidence)
}
    55  
    56  // Comprehends reports the confidence score for a speaker of a given language
    57  // to being able to comprehend the written form of an alternative language.
    58  func Comprehends(speaker, alternative Tag) Confidence {
    59  	_, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
    60  	return c
    61  }
    62  
// NewMatcher returns a Matcher that matches an ordered list of preferred tags
// against a list of supported tags based on written intelligibility, closeness
// of dialect, equivalence of subtags and various other rules. It is initialized
// with the list of supported tags. The first element is used as the default
// value in case no match is found.
//
// Its Match method matches the first of the given Tags to reach a certain
// confidence threshold. The tags passed to Match should therefore be specified
// in order of preference. Extensions are ignored for matching.
//
// The index returned by the Match method corresponds to the index of the
// matched tag in t. The tag returned by Match is the matched tag augmented
// with the extensions, such as the Unicode extension ('u'), of the
// corresponding preferred tag. This allows user locale options to be passed
// transparently.
func NewMatcher(t []Tag, options ...MatchOption) Matcher {
	return newMatcher(t, options)
}
    80  
    81  func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
    82  	var tt language.Tag
    83  	match, w, c := m.getBest(want...)
    84  	if match != nil {
    85  		tt, index = match.tag, match.index
    86  	} else {
    87  		// TODO: this should be an option
    88  		tt = m.default_.tag
    89  		if m.preferSameScript {
    90  		outer:
    91  			for _, w := range want {
    92  				script, _ := w.Script()
    93  				if script.scriptID == 0 {
    94  					// Don't do anything if there is no script, such as with
    95  					// private subtags.
    96  					continue
    97  				}
    98  				for i, h := range m.supported {
    99  					if script.scriptID == h.maxScript {
   100  						tt, index = h.tag, i
   101  						break outer
   102  					}
   103  				}
   104  			}
   105  		}
   106  		// TODO: select first language tag based on script.
   107  	}
   108  	if w.RegionID != tt.RegionID && w.RegionID != 0 {
   109  		if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
   110  			tt.RegionID = w.RegionID
   111  			tt.RemakeString()
   112  		} else if r := w.RegionID.String(); len(r) == 2 {
   113  			// TODO: also filter macro and deprecated.
   114  			tt, _ = tt.SetTypeForKey("rg", strings.ToLower(r)+"zzzz")
   115  		}
   116  	}
   117  	// Copy options from the user-provided tag into the result tag. This is hard
   118  	// to do after the fact, so we do it here.
   119  	// TODO: add in alternative variants to -u-va-.
   120  	// TODO: add preferred region to -u-rg-.
   121  	if e := w.Extensions(); len(e) > 0 {
   122  		b := language.Builder{}
   123  		b.SetTag(tt)
   124  		for _, e := range e {
   125  			b.AddExt(e)
   126  		}
   127  		tt = b.Make()
   128  	}
   129  	return makeTag(tt), index, c
   130  }
   131  
// ErrMissingLikelyTagsData indicates no information was available
// to compute likely values of missing tags.
//
// NOTE(review): no code visible in this file returns this error; presumably
// it is kept as part of the exported API. Confirm before removing.
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
   135  
   136  // func (t *Tag) setTagsFrom(id Tag) {
   137  // 	t.LangID = id.LangID
   138  // 	t.ScriptID = id.ScriptID
   139  // 	t.RegionID = id.RegionID
   140  // }
   141  
   142  // Tag Matching
   143  // CLDR defines an algorithm for finding the best match between two sets of language
   144  // tags. The basic algorithm defines how to score a possible match and then find
   145  // the match with the best score
   146  // (see https://www.unicode.org/reports/tr35/#LanguageMatching).
   147  // Using scoring has several disadvantages. The scoring obfuscates the importance of
   148  // the various factors considered, making the algorithm harder to understand. Using
   149  // scoring also requires the full score to be computed for each pair of tags.
   150  //
   151  // We will use a different algorithm which aims to have the following properties:
   152  // - clarity on the precedence of the various selection factors, and
   153  // - improved performance by allowing early termination of a comparison.
   154  //
   155  // Matching algorithm (overview)
   156  // Input:
   157  //   - supported: a set of supported tags
   158  //   - default:   the default tag to return in case there is no match
   159  //   - desired:   list of desired tags, ordered by preference, starting with
   160  //                the most-preferred.
   161  //
   162  // Algorithm:
   163  //   1) Set the best match to the lowest confidence level
   164  //   2) For each tag in "desired":
   165  //     a) For each tag in "supported":
   166  //        1) compute the match between the two tags.
   167  //        2) if the match is better than the previous best match, replace it
   168  //           with the new match. (see next section)
//     b) if the current best match is Exact and pin is true, the result will be
//        frozen to the language found thus far, although better matches may
//        still be found for the same language.
   172  //   3) If the best match so far is below a certain threshold, return "default".
   173  //
   174  // Ranking:
   175  // We use two phases to determine whether one pair of tags are a better match
   176  // than another pair of tags. First, we determine a rough confidence level. If the
   177  // levels are different, the one with the highest confidence wins.
   178  // Second, if the rough confidence levels are identical, we use a set of tie-breaker
   179  // rules.
   180  //
   181  // The confidence level of matching a pair of tags is determined by finding the
   182  // lowest confidence level of any matches of the corresponding subtags (the
   183  // result is deemed as good as its weakest link).
   184  // We define the following levels:
   185  //   Exact    - An exact match of a subtag, before adding likely subtags.
   186  //   MaxExact - An exact match of a subtag, after adding likely subtags.
   187  //              [See Note 2].
   188  //   High     - High level of mutual intelligibility between different subtag
   189  //              variants.
   190  //   Low      - Low level of mutual intelligibility between different subtag
   191  //              variants.
   192  //   No       - No mutual intelligibility.
   193  //
   194  // The following levels can occur for each type of subtag:
   195  //   Base:    Exact, MaxExact, High, Low, No
   196  //   Script:  Exact, MaxExact [see Note 3], Low, No
   197  //   Region:  Exact, MaxExact, High
   198  //   Variant: Exact, High
   199  //   Private: Exact, No
   200  //
   201  // Any result with a confidence level of Low or higher is deemed a possible match.
   202  // Once a desired tag matches any of the supported tags with a level of MaxExact
   203  // or higher, the next desired tag is not considered (see Step 2.b).
   204  // Note that CLDR provides languageMatching data that defines close equivalence
   205  // classes for base languages, scripts and regions.
   206  //
   207  // Tie-breaking
   208  // If we get the same confidence level for two matches, we apply a sequence of
   209  // tie-breaking rules. The first that succeeds defines the result. The rules are
   210  // applied in the following order.
   211  //   1) Original language was defined and was identical.
   212  //   2) Original region was defined and was identical.
   213  //   3) Distance between two maximized regions was the smallest.
   214  //   4) Original script was defined and was identical.
//   5) Distance from want tag to have tag using the parent relation [see Note 5].
   216  // If there is still no winner after these rules are applied, the first match
   217  // found wins.
   218  //
   219  // Notes:
   220  // [2] In practice, as matching of Exact is done in a separate phase from
   221  //     matching the other levels, we reuse the Exact level to mean MaxExact in
   222  //     the second phase. As a consequence, we only need the levels defined by
   223  //     the Confidence type. The MaxExact confidence level is mapped to High in
   224  //     the public API.
   225  // [3] We do not differentiate between maximized script values that were derived
   226  //     from suppressScript versus most likely tag data. We determined that in
   227  //     ranking the two, one ranks just after the other. Moreover, the two cannot
   228  //     occur concurrently. As a consequence, they are identical for practical
   229  //     purposes.
   230  // [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
   231  //     the MaxExact level to allow iw vs he to still be a closer match than
   232  //     en-AU vs en-US, for example.
   233  // [5] In CLDR a locale inherits fields that are unspecified for this locale
   234  //     from its parent. Therefore, if a locale is a parent of another locale,
   235  //     it is a strong measure for closeness, especially when no other tie
   236  //     breaker rule applies. One could also argue it is inconsistent, for
   237  //     example, when pt-AO matches pt (which CLDR equates with pt-BR), even
   238  //     though its parent is pt-PT according to the inheritance rules.
   239  //
   240  // Implementation Details:
   241  // There are several performance considerations worth pointing out. Most notably,
   242  // we preprocess as much as possible (within reason) at the time of creation of a
   243  // matcher. This includes:
   244  //   - creating a per-language map, which includes data for the raw base language
   245  //     and its canonicalized variant (if applicable),
   246  //   - expanding entries for the equivalence classes defined in CLDR's
   247  //     languageMatch data.
   248  // The per-language map ensures that typically only a very small number of tags
   249  // need to be considered. The pre-expansion of canonicalized subtags and
   250  // equivalence classes reduces the amount of map lookups that need to be done at
   251  // runtime.
   252  
// matcher keeps a set of supported language tags, indexed by language.
//
// default_ is the entry used by Match when no desired tag matches. supported
// holds one entry per supported tag, in the order passed to newMatcher. index
// maps a base language to the candidate entries to consider for that language;
// besides the languages of the supported tags it contains entries added for
// maximized, equivalent and aliased languages. preferSameScript enables the
// same-script fallback of Match; it is true unless changed by the
// PreferSameScript option.
//
// NOTE(review): passSettings is neither read nor written by the code visible
// in this file.
type matcher struct {
	default_         *haveTag
	supported        []*haveTag
	index            map[language.Language]*matchHeader
	passSettings     bool
	preferSameScript bool
}
   261  
// matchHeader has the lists of tags for exact matches and matches based on
// maximized and canonicalized tags for a given language.
type matchHeader struct {
	// haveTags lists the candidate entries for the language, in order of
	// insertion.
	haveTags []*haveTag
	// original is set once an entry has been added with exact set to true.
	// Headers for which it is false are not used as a source when entries for
	// equivalent languages are derived.
	original bool
}
   268  
// haveTag holds a supported Tag and its maximized script and region. The maximized
// or canonicalized language is not stored as it is not needed during matching.
//
// The order of the fields is significant: makeHaveTag initializes a haveTag
// with an unkeyed composite literal.
type haveTag struct {
	// tag is the supported tag as it was passed to the matcher.
	tag language.Tag

	// index of this tag in the original list of supported tags.
	index int

	// conf is the maximum confidence that can result from matching this haveTag.
	// When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
	conf Confidence

	// Maximized region and script.
	maxRegion language.Region
	maxScript language.Script

	// altScript may be checked as an alternative match to maxScript. If altScript
	// matches, the confidence level for this match is Low. Theoretically there
	// could be multiple alternative scripts. This does not occur in practice.
	altScript language.Script

	// nextMax is the index of the next haveTag with the same maximized tags.
	// The index refers to the haveTags list of the matchHeader holding this
	// entry. A value of 0 marks the end of the chain.
	nextMax uint16
}
   293  
   294  func makeHaveTag(tag language.Tag, index int) (haveTag, language.Language) {
   295  	max := tag
   296  	if tag.LangID != 0 || tag.RegionID != 0 || tag.ScriptID != 0 {
   297  		max, _ = canonicalize(All, max)
   298  		max, _ = max.Maximize()
   299  		max.RemakeString()
   300  	}
   301  	return haveTag{tag, index, Exact, max.RegionID, max.ScriptID, altScript(max.LangID, max.ScriptID), 0}, max.LangID
   302  }
   303  
   304  // altScript returns an alternative script that may match the given script with
   305  // a low confidence.  At the moment, the langMatch data allows for at most one
   306  // script to map to another and we rely on this to keep the code simple.
   307  func altScript(l language.Language, s language.Script) language.Script {
   308  	for _, alt := range matchScript {
   309  		// TODO: also match cases where language is not the same.
   310  		if (language.Language(alt.wantLang) == l || language.Language(alt.haveLang) == l) &&
   311  			language.Script(alt.haveScript) == s {
   312  			return language.Script(alt.wantScript)
   313  		}
   314  	}
   315  	return 0
   316  }
   317  
   318  // addIfNew adds a haveTag to the list of tags only if it is a unique tag.
   319  // Tags that have the same maximized values are linked by index.
   320  func (h *matchHeader) addIfNew(n haveTag, exact bool) {
   321  	h.original = h.original || exact
   322  	// Don't add new exact matches.
   323  	for _, v := range h.haveTags {
   324  		if equalsRest(v.tag, n.tag) {
   325  			return
   326  		}
   327  	}
   328  	// Allow duplicate maximized tags, but create a linked list to allow quickly
   329  	// comparing the equivalents and bail out.
   330  	for i, v := range h.haveTags {
   331  		if v.maxScript == n.maxScript &&
   332  			v.maxRegion == n.maxRegion &&
   333  			v.tag.VariantOrPrivateUseTags() == n.tag.VariantOrPrivateUseTags() {
   334  			for h.haveTags[i].nextMax != 0 {
   335  				i = int(h.haveTags[i].nextMax)
   336  			}
   337  			h.haveTags[i].nextMax = uint16(len(h.haveTags))
   338  			break
   339  		}
   340  	}
   341  	h.haveTags = append(h.haveTags, &n)
   342  }
   343  
   344  // header returns the matchHeader for the given language. It creates one if
   345  // it doesn't already exist.
   346  func (m *matcher) header(l language.Language) *matchHeader {
   347  	if h := m.index[l]; h != nil {
   348  		return h
   349  	}
   350  	h := &matchHeader{}
   351  	m.index[l] = h
   352  	return h
   353  }
   354  
   355  func toConf(d uint8) Confidence {
   356  	if d <= 10 {
   357  		return High
   358  	}
   359  	if d < 30 {
   360  		return Low
   361  	}
   362  	return No
   363  }
   364  
   365  // newMatcher builds an index for the given supported tags and returns it as
   366  // a matcher. It also expands the index by considering various equivalence classes
   367  // for a given tag.
   368  func newMatcher(supported []Tag, options []MatchOption) *matcher {
   369  	m := &matcher{
   370  		index:            make(map[language.Language]*matchHeader),
   371  		preferSameScript: true,
   372  	}
   373  	for _, o := range options {
   374  		o(m)
   375  	}
   376  	if len(supported) == 0 {
   377  		m.default_ = &haveTag{}
   378  		return m
   379  	}
   380  	// Add supported languages to the index. Add exact matches first to give
   381  	// them precedence.
   382  	for i, tag := range supported {
   383  		tt := tag.tag()
   384  		pair, _ := makeHaveTag(tt, i)
   385  		m.header(tt.LangID).addIfNew(pair, true)
   386  		m.supported = append(m.supported, &pair)
   387  	}
   388  	m.default_ = m.header(supported[0].lang()).haveTags[0]
   389  	// Keep these in two different loops to support the case that two equivalent
   390  	// languages are distinguished, such as iw and he.
   391  	for i, tag := range supported {
   392  		tt := tag.tag()
   393  		pair, max := makeHaveTag(tt, i)
   394  		if max != tt.LangID {
   395  			m.header(max).addIfNew(pair, true)
   396  		}
   397  	}
   398  
   399  	// update is used to add indexes in the map for equivalent languages.
   400  	// update will only add entries to original indexes, thus not computing any
   401  	// transitive relations.
   402  	update := func(want, have uint16, conf Confidence) {
   403  		if hh := m.index[language.Language(have)]; hh != nil {
   404  			if !hh.original {
   405  				return
   406  			}
   407  			hw := m.header(language.Language(want))
   408  			for _, ht := range hh.haveTags {
   409  				v := *ht
   410  				if conf < v.conf {
   411  					v.conf = conf
   412  				}
   413  				v.nextMax = 0 // this value needs to be recomputed
   414  				if v.altScript != 0 {
   415  					v.altScript = altScript(language.Language(want), v.maxScript)
   416  				}
   417  				hw.addIfNew(v, conf == Exact && hh.original)
   418  			}
   419  		}
   420  	}
   421  
   422  	// Add entries for languages with mutual intelligibility as defined by CLDR's
   423  	// languageMatch data.
   424  	for _, ml := range matchLang {
   425  		update(ml.want, ml.have, toConf(ml.distance))
   426  		if !ml.oneway {
   427  			update(ml.have, ml.want, toConf(ml.distance))
   428  		}
   429  	}
   430  
   431  	// Add entries for possible canonicalizations. This is an optimization to
   432  	// ensure that only one map lookup needs to be done at runtime per desired tag.
   433  	// First we match deprecated equivalents. If they are perfect equivalents
   434  	// (their canonicalization simply substitutes a different language code, but
   435  	// nothing else), the match confidence is Exact, otherwise it is High.
   436  	for i, lm := range language.AliasMap {
   437  		// If deprecated codes match and there is no fiddling with the script
   438  		// or region, we consider it an exact match.
   439  		conf := Exact
   440  		if language.AliasTypes[i] != language.Macro {
   441  			if !isExactEquivalent(language.Language(lm.From)) {
   442  				conf = High
   443  			}
   444  			update(lm.To, lm.From, conf)
   445  		}
   446  		update(lm.From, lm.To, conf)
   447  	}
   448  	return m
   449  }
   450  
// getBest gets the best matching tag in m for any of the given tags, taking into
// account the order of preference of the given tags.
//
// It returns the matching supported entry, the desired tag that resulted in
// the match and the confidence of the match. The region of the returned desired
// tag may have been replaced by a region obtained through canonicalization.
// If there is no match with a confidence above No, got is nil, c is No and orig
// is the first desired tag, or the zero Tag if want is empty.
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig language.Tag, c Confidence) {
	best := bestMatch{}
	for i, ww := range want {
		w := ww.tag()
		var max language.Tag
		// Check for exact match first.
		h := m.index[w.LangID]
		if w.LangID != 0 {
			if h == nil {
				// Nothing is indexed for this language: try the next desired tag.
				continue
			}
			// Base language is defined.
			max, _ = canonicalize(Legacy|Deprecated|Macro, w)
			// A region that is added through canonicalization is stronger than
			// a maximized region: set it in the original (e.g. mo -> ro-MD).
			if w.RegionID != max.RegionID {
				w.RegionID = max.RegionID
			}
			// TODO: should we do the same for scripts?
			// See test case: en, sr, nl ; sh ; sr
			max, _ = max.Maximize()
		} else {
			// Base language is not defined.
			if h != nil {
				// Return the first entry that is identical in everything but
				// the language as an Exact match.
				for i := range h.haveTags {
					have := h.haveTags[i]
					if equalsRest(have.tag, w) {
						return have, w, Exact
					}
				}
			}
			if w.ScriptID == 0 && w.RegionID == 0 {
				// We skip all tags matching und for approximate matching, including
				// private tags.
				continue
			}
			// Continue with the candidates for the language obtained by
			// maximizing the script and/or region.
			max, _ = w.Maximize()
			if h = m.index[max.LangID]; h == nil {
				continue
			}
		}
		// Only pin the language if it does not occur again further down the
		// list of desired tags.
		pin := true
		for _, t := range want[i+1:] {
			if w.LangID == t.lang() {
				pin = false
				break
			}
		}
		// Check for match based on maximized tag.
		for i := range h.haveTags {
			have := h.haveTags[i]
			best.update(have, w, max.ScriptID, max.RegionID, pin)
			if best.conf == Exact {
				// An Exact match ends the search. Before returning, also
				// consider the remaining entries chained to this one through
				// nextMax, as they may win based on the tie-breaker rules.
				for have.nextMax != 0 {
					have = h.haveTags[have.nextMax]
					best.update(have, w, max.ScriptID, max.RegionID, pin)
				}
				return best.have, best.want, best.conf
			}
		}
	}
	if best.conf <= No {
		// No match: report the first desired tag, if any, as the original.
		if len(want) != 0 {
			return nil, want[0].tag(), No
		}
		return nil, language.Tag{}, No
	}
	return best.have, best.want, best.conf
}
   522  
// bestMatch accumulates the best match so far.
//
// have, want and conf are the supported entry, the desired tag and the
// confidence of the best match. pinnedRegion is the maximized region of the
// desired tag of the best match. If sameRegionGroup is set, candidates for the
// same language are only considered if their maximized region is in the same
// region group as pinnedRegion. If pinLanguage is set, desired tags with
// a language other than that of want are no longer considered. The zero value
// is an empty match that is ready for use.
type bestMatch struct {
	have            *haveTag
	want            language.Tag
	conf            Confidence
	pinnedRegion    language.Region
	pinLanguage     bool
	sameRegionGroup bool
	// Cached results from applying tie-breaking rules.
	origLang     bool
	origReg      bool
	paradigmReg  bool
	regGroupDist uint8
	origScript   bool
}
   538  
// update updates the existing best match if the new pair is considered to be a
// better match. To determine if the given pair is a better match, it first
// computes the rough confidence level. If this surpasses the current match, it
// will replace it and update the tie-breaker rule cache. If there is a tie, it
// proceeds with applying a series of tie-breaker rules. If there is no
// conclusive winner after applying the tie-breaker rules, it leaves the current
// match as the preferred match.
//
// If pin is true and have and tag are a strong match, it will henceforth only
// consider matches for this language. This corresponds to the idea that most
// users have a strong preference for the first defined language. A user can
// still prefer a second language over a dialect of the preferred language by
// explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should
// be false.
//
// The arguments have and tag are the supported entry and the desired tag to
// compare; maxScript and maxRegion are the maximized script and region of the
// desired tag.
func (m *bestMatch) update(have *haveTag, tag language.Tag, maxScript language.Script, maxRegion language.Region, pin bool) {
	// Bail if the maximum attainable confidence is below that of the current best match.
	c := have.conf
	if c < m.conf {
		return
	}
	// Don't change the language once we already have found an exact match.
	if m.pinLanguage && tag.LangID != m.want.LangID {
		return
	}
	// Pin the region group if we are comparing tags for the same language.
	if tag.LangID == m.want.LangID && m.sameRegionGroup {
		_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.LangID)
		if !sameGroup {
			return
		}
	}
	if c == Exact && have.maxScript == maxScript {
		// If there is another language and then another entry of this language,
		// don't pin anything, otherwise pin the language.
		m.pinLanguage = pin
	}
	if equalsRest(have.tag, tag) {
		// The tags are identical in everything but the language: c keeps the
		// maximum confidence of have.
	} else if have.maxScript != maxScript {
		// There is usually very little comprehension between different scripts.
		// In a few cases there may still be Low comprehension. This possibility
		// is pre-computed and stored in have.altScript.
		if Low < m.conf || have.altScript != maxScript {
			return
		}
		c = Low
	} else if have.maxRegion != maxRegion {
		if High < c {
			// There is usually a small difference between languages across regions.
			c = High
		}
	}

	// We store the results of the computations of the tie-breaker rules along
	// with the best match. There is no need to do the checks once we determine
	// we have a winner, but we do still need to do the tie-breaker computations.
	// We use "beaten" to keep track if we still need to do the checks.
	beaten := false // true if the new pair defeats the current one.
	if c != m.conf {
		if c < m.conf {
			return
		}
		beaten = true
	}

	// Tie-breaker rules:
	// We prefer if the pre-maximized language was specified and identical.
	origLang := have.tag.LangID == tag.LangID && tag.LangID != 0
	if !beaten && m.origLang != origLang {
		if m.origLang {
			return
		}
		beaten = true
	}

	// We prefer if the pre-maximized region was specified and identical.
	origReg := have.tag.RegionID == tag.RegionID && tag.RegionID != 0
	if !beaten && m.origReg != origReg {
		if m.origReg {
			return
		}
		beaten = true
	}

	// Next we prefer the pair with the smallest distance between the
	// maximized regions, as computed by regionGroupDist.
	regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.LangID)
	if !beaten && m.regGroupDist != regGroupDist {
		if regGroupDist > m.regGroupDist {
			return
		}
		beaten = true
	}

	// Next we prefer if the maximized region of have is one of the paradigm
	// regions listed for the language.
	paradigmReg := isParadigmLocale(tag.LangID, have.maxRegion)
	if !beaten && m.paradigmReg != paradigmReg {
		if !paradigmReg {
			return
		}
		beaten = true
	}

	// Next we prefer if the pre-maximized script was specified and identical.
	origScript := have.tag.ScriptID == tag.ScriptID && tag.ScriptID != 0
	if !beaten && m.origScript != origScript {
		if m.origScript {
			return
		}
		beaten = true
	}

	// Update m to the newly found best match.
	if beaten {
		m.have = have
		m.want = tag
		m.conf = c
		m.pinnedRegion = maxRegion
		m.sameRegionGroup = sameGroup
		m.origLang = origLang
		m.origReg = origReg
		m.paradigmReg = paradigmReg
		m.origScript = origScript
		m.regGroupDist = regGroupDist
	}
}
   661  
   662  func isParadigmLocale(lang language.Language, r language.Region) bool {
   663  	for _, e := range paradigmLocales {
   664  		if language.Language(e[0]) == lang && (r == language.Region(e[1]) || r == language.Region(e[2])) {
   665  			return true
   666  		}
   667  	}
   668  	return false
   669  }
   670  
   671  // regionGroupDist computes the distance between two regions based on their
   672  // CLDR grouping.
   673  func regionGroupDist(a, b language.Region, script language.Script, lang language.Language) (dist uint8, same bool) {
   674  	const defaultDistance = 4
   675  
   676  	aGroup := uint(regionToGroups[a]) << 1
   677  	bGroup := uint(regionToGroups[b]) << 1
   678  	for _, ri := range matchRegion {
   679  		if language.Language(ri.lang) == lang && (ri.script == 0 || language.Script(ri.script) == script) {
   680  			group := uint(1 << (ri.group &^ 0x80))
   681  			if 0x80&ri.group == 0 {
   682  				if aGroup&bGroup&group != 0 { // Both regions are in the group.
   683  					return ri.distance, ri.distance == defaultDistance
   684  				}
   685  			} else {
   686  				if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
   687  					return ri.distance, ri.distance == defaultDistance
   688  				}
   689  			}
   690  		}
   691  	}
   692  	return defaultDistance, true
   693  }
   694  
   695  // equalsRest compares everything except the language.
   696  func equalsRest(a, b language.Tag) bool {
   697  	// TODO: don't include extensions in this comparison. To do this efficiently,
   698  	// though, we should handle private tags separately.
   699  	return a.ScriptID == b.ScriptID && a.RegionID == b.RegionID && a.VariantOrPrivateUseTags() == b.VariantOrPrivateUseTags()
   700  }
   701  
   702  // isExactEquivalent returns true if canonicalizing the language will not alter
   703  // the script or region of a tag.
   704  func isExactEquivalent(l language.Language) bool {
   705  	for _, o := range notEquivalent {
   706  		if o == l {
   707  			return false
   708  		}
   709  	}
   710  	return true
   711  }
   712  
// notEquivalent lists the languages for which canonicalization may alter the
// script or region. It is filled by init and consulted by isExactEquivalent.
var notEquivalent []language.Language
   714  
   715  func init() {
   716  	// Create a list of all languages for which canonicalization may alter the
   717  	// script or region.
   718  	for _, lm := range language.AliasMap {
   719  		tag := language.Tag{LangID: language.Language(lm.From)}
   720  		if tag, _ = canonicalize(All, tag); tag.ScriptID != 0 || tag.RegionID != 0 {
   721  			notEquivalent = append(notEquivalent, language.Language(lm.From))
   722  		}
   723  	}
   724  	// Maximize undefined regions of paradigm locales.
   725  	for i, v := range paradigmLocales {
   726  		t := language.Tag{LangID: language.Language(v[0])}
   727  		max, _ := t.Maximize()
   728  		if v[1] == 0 {
   729  			paradigmLocales[i][1] = uint16(max.RegionID)
   730  		}
   731  		if v[2] == 0 {
   732  			paradigmLocales[i][2] = uint16(max.RegionID)
   733  		}
   734  	}
   735  }
   736
View as plain text