Source file src/strconv/quote.go

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import (
    10  	"unicode/utf8"
    11  )
    12  
const (
	// lowerhex holds the lowercase hexadecimal digits; indexing it with a
	// value in [0, 15] yields the digit for that value.
	lowerhex = "0123456789abcdef"
	// upperhex holds the uppercase hexadecimal digits; indexing it with a
	// value in [0, 15] yields the digit for that value.
	upperhex = "0123456789ABCDEF"
)
    17  
    18  // contains reports whether the string contains the byte c.
    19  func contains(s string, c byte) bool {
    20  	return index(s, c) != -1
    21  }
    22  
    23  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    24  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    25  }
    26  
    27  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    28  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    29  }
    30  
    31  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    32  	// Often called with big strings, so preallocate. If there's quoting,
    33  	// this is conservative but still helps a lot.
    34  	if cap(buf)-len(buf) < len(s) {
    35  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
    36  		copy(nBuf, buf)
    37  		buf = nBuf
    38  	}
    39  	buf = append(buf, quote)
    40  	for width := 0; len(s) > 0; s = s[width:] {
    41  		r := rune(s[0])
    42  		width = 1
    43  		if r >= utf8.RuneSelf {
    44  			r, width = utf8.DecodeRuneInString(s)
    45  		}
    46  		if width == 1 && r == utf8.RuneError {
    47  			buf = append(buf, `\x`...)
    48  			buf = append(buf, lowerhex[s[0]>>4])
    49  			buf = append(buf, lowerhex[s[0]&0xF])
    50  			continue
    51  		}
    52  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    53  	}
    54  	buf = append(buf, quote)
    55  	return buf
    56  }
    57  
    58  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    59  	buf = append(buf, quote)
    60  	if !utf8.ValidRune(r) {
    61  		r = utf8.RuneError
    62  	}
    63  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    64  	buf = append(buf, quote)
    65  	return buf
    66  }
    67  
    68  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    69  	if r == rune(quote) || r == '\\' { // always backslashed
    70  		buf = append(buf, '\\')
    71  		buf = append(buf, byte(r))
    72  		return buf
    73  	}
    74  	if ASCIIonly {
    75  		if r < utf8.RuneSelf && IsPrint(r) {
    76  			buf = append(buf, byte(r))
    77  			return buf
    78  		}
    79  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    80  		return utf8.AppendRune(buf, r)
    81  	}
    82  	switch r {
    83  	case '\a':
    84  		buf = append(buf, `\a`...)
    85  	case '\b':
    86  		buf = append(buf, `\b`...)
    87  	case '\f':
    88  		buf = append(buf, `\f`...)
    89  	case '\n':
    90  		buf = append(buf, `\n`...)
    91  	case '\r':
    92  		buf = append(buf, `\r`...)
    93  	case '\t':
    94  		buf = append(buf, `\t`...)
    95  	case '\v':
    96  		buf = append(buf, `\v`...)
    97  	default:
    98  		switch {
    99  		case r < ' ' || r == 0x7f:
   100  			buf = append(buf, `\x`...)
   101  			buf = append(buf, lowerhex[byte(r)>>4])
   102  			buf = append(buf, lowerhex[byte(r)&0xF])
   103  		case !utf8.ValidRune(r):
   104  			r = 0xFFFD
   105  			fallthrough
   106  		case r < 0x10000:
   107  			buf = append(buf, `\u`...)
   108  			for s := 12; s >= 0; s -= 4 {
   109  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   110  			}
   111  		default:
   112  			buf = append(buf, `\U`...)
   113  			for s := 28; s >= 0; s -= 4 {
   114  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   115  			}
   116  		}
   117  	}
   118  	return buf
   119  }
   120  
   121  // Quote returns a double-quoted Go string literal representing s. The
   122  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   123  // control characters and non-printable characters as defined by
   124  // [IsPrint].
   125  func Quote(s string) string {
   126  	return quoteWith(s, '"', false, false)
   127  }
   128  
   129  // AppendQuote appends a double-quoted Go string literal representing s,
   130  // as generated by [Quote], to dst and returns the extended buffer.
   131  func AppendQuote(dst []byte, s string) []byte {
   132  	return appendQuotedWith(dst, s, '"', false, false)
   133  }
   134  
   135  // QuoteToASCII returns a double-quoted Go string literal representing s.
   136  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   137  // non-ASCII characters and non-printable characters as defined by [IsPrint].
   138  func QuoteToASCII(s string) string {
   139  	return quoteWith(s, '"', true, false)
   140  }
   141  
   142  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   143  // as generated by [QuoteToASCII], to dst and returns the extended buffer.
   144  func AppendQuoteToASCII(dst []byte, s string) []byte {
   145  	return appendQuotedWith(dst, s, '"', true, false)
   146  }
   147  
   148  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   149  // The returned string leaves Unicode graphic characters, as defined by
   150  // [IsGraphic], unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
   151  // for non-graphic characters.
   152  func QuoteToGraphic(s string) string {
   153  	return quoteWith(s, '"', false, true)
   154  }
   155  
   156  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   157  // as generated by [QuoteToGraphic], to dst and returns the extended buffer.
   158  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   159  	return appendQuotedWith(dst, s, '"', false, true)
   160  }
   161  
   162  // QuoteRune returns a single-quoted Go character literal representing the
   163  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   164  // for control characters and non-printable characters as defined by [IsPrint].
   165  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   166  // replacement character U+FFFD.
   167  func QuoteRune(r rune) string {
   168  	return quoteRuneWith(r, '\'', false, false)
   169  }
   170  
   171  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   172  // as generated by [QuoteRune], to dst and returns the extended buffer.
   173  func AppendQuoteRune(dst []byte, r rune) []byte {
   174  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   175  }
   176  
   177  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   178  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   179  // \u0100) for non-ASCII characters and non-printable characters as defined
   180  // by [IsPrint].
   181  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   182  // replacement character U+FFFD.
   183  func QuoteRuneToASCII(r rune) string {
   184  	return quoteRuneWith(r, '\'', true, false)
   185  }
   186  
   187  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   188  // as generated by [QuoteRuneToASCII], to dst and returns the extended buffer.
   189  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   190  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   191  }
   192  
   193  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   194  // the rune. If the rune is not a Unicode graphic character,
   195  // as defined by [IsGraphic], the returned string will use a Go escape sequence
   196  // (\t, \n, \xFF, \u0100).
   197  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   198  // replacement character U+FFFD.
   199  func QuoteRuneToGraphic(r rune) string {
   200  	return quoteRuneWith(r, '\'', false, true)
   201  }
   202  
   203  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   204  // as generated by [QuoteRuneToGraphic], to dst and returns the extended buffer.
   205  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   206  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   207  }
   208  
   209  // CanBackquote reports whether the string s can be represented
   210  // unchanged as a single-line backquoted string without control
   211  // characters other than tab.
   212  func CanBackquote(s string) bool {
   213  	for len(s) > 0 {
   214  		r, wid := utf8.DecodeRuneInString(s)
   215  		s = s[wid:]
   216  		if wid > 1 {
   217  			if r == '\ufeff' {
   218  				return false // BOMs are invisible and should not be quoted.
   219  			}
   220  			continue // All other multibyte runes are correctly encoded and assumed printable.
   221  		}
   222  		if r == utf8.RuneError {
   223  			return false
   224  		}
   225  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   226  			return false
   227  		}
   228  	}
   229  	return true
   230  }
   231  
   232  func unhex(b byte) (v rune, ok bool) {
   233  	c := rune(b)
   234  	switch {
   235  	case '0' <= c && c <= '9':
   236  		return c - '0', true
   237  	case 'a' <= c && c <= 'f':
   238  		return c - 'a' + 10, true
   239  	case 'A' <= c && c <= 'F':
   240  		return c - 'A' + 10, true
   241  	}
   242  	return
   243  }
   244  
   245  // UnquoteChar decodes the first character or byte in the escaped string
   246  // or character literal represented by the string s.
   247  // It returns four values:
   248  //
   249  //  1. value, the decoded Unicode code point or byte value;
   250  //  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   251  //  3. tail, the remainder of the string after the character; and
   252  //  4. an error that will be nil if the character is syntactically valid.
   253  //
   254  // The second argument, quote, specifies the type of literal being parsed
   255  // and therefore which escaped quote character is permitted.
   256  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   257  // If set to a double quote, it permits \" and disallows unescaped ".
   258  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   259  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   260  	// easy cases
   261  	if len(s) == 0 {
   262  		err = ErrSyntax
   263  		return
   264  	}
   265  	switch c := s[0]; {
   266  	case c == quote && (quote == '\'' || quote == '"'):
   267  		err = ErrSyntax
   268  		return
   269  	case c >= utf8.RuneSelf:
   270  		r, size := utf8.DecodeRuneInString(s)
   271  		return r, true, s[size:], nil
   272  	case c != '\\':
   273  		return rune(s[0]), false, s[1:], nil
   274  	}
   275  
   276  	// hard case: c is backslash
   277  	if len(s) <= 1 {
   278  		err = ErrSyntax
   279  		return
   280  	}
   281  	c := s[1]
   282  	s = s[2:]
   283  
   284  	switch c {
   285  	case 'a':
   286  		value = '\a'
   287  	case 'b':
   288  		value = '\b'
   289  	case 'f':
   290  		value = '\f'
   291  	case 'n':
   292  		value = '\n'
   293  	case 'r':
   294  		value = '\r'
   295  	case 't':
   296  		value = '\t'
   297  	case 'v':
   298  		value = '\v'
   299  	case 'x', 'u', 'U':
   300  		n := 0
   301  		switch c {
   302  		case 'x':
   303  			n = 2
   304  		case 'u':
   305  			n = 4
   306  		case 'U':
   307  			n = 8
   308  		}
   309  		var v rune
   310  		if len(s) < n {
   311  			err = ErrSyntax
   312  			return
   313  		}
   314  		for j := 0; j < n; j++ {
   315  			x, ok := unhex(s[j])
   316  			if !ok {
   317  				err = ErrSyntax
   318  				return
   319  			}
   320  			v = v<<4 | x
   321  		}
   322  		s = s[n:]
   323  		if c == 'x' {
   324  			// single-byte string, possibly not UTF-8
   325  			value = v
   326  			break
   327  		}
   328  		if !utf8.ValidRune(v) {
   329  			err = ErrSyntax
   330  			return
   331  		}
   332  		value = v
   333  		multibyte = true
   334  	case '0', '1', '2', '3', '4', '5', '6', '7':
   335  		v := rune(c) - '0'
   336  		if len(s) < 2 {
   337  			err = ErrSyntax
   338  			return
   339  		}
   340  		for j := 0; j < 2; j++ { // one digit already; two more
   341  			x := rune(s[j]) - '0'
   342  			if x < 0 || x > 7 {
   343  				err = ErrSyntax
   344  				return
   345  			}
   346  			v = (v << 3) | x
   347  		}
   348  		s = s[2:]
   349  		if v > 255 {
   350  			err = ErrSyntax
   351  			return
   352  		}
   353  		value = v
   354  	case '\\':
   355  		value = '\\'
   356  	case '\'', '"':
   357  		if c != quote {
   358  			err = ErrSyntax
   359  			return
   360  		}
   361  		value = rune(c)
   362  	default:
   363  		err = ErrSyntax
   364  		return
   365  	}
   366  	tail = s
   367  	return
   368  }
   369  
   370  // QuotedPrefix returns the quoted string (as understood by [Unquote]) at the prefix of s.
   371  // If s does not start with a valid quoted string, QuotedPrefix returns an error.
   372  func QuotedPrefix(s string) (string, error) {
   373  	out, _, err := unquote(s, false)
   374  	return out, err
   375  }
   376  
   377  // Unquote interprets s as a single-quoted, double-quoted,
   378  // or backquoted Go string literal, returning the string value
   379  // that s quotes.  (If s is single-quoted, it would be a Go
   380  // character literal; Unquote returns the corresponding
   381  // one-character string. For '' Unquote returns the empty string.)
   382  func Unquote(s string) (string, error) {
   383  	out, rem, err := unquote(s, true)
   384  	if len(rem) > 0 {
   385  		return "", ErrSyntax
   386  	}
   387  	return out, err
   388  }
   389  
// unquote parses a quoted string at the start of the input,
// returning the parsed prefix, the remaining suffix, and any parse errors.
// If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
//
// The quote form is taken from the first byte of in and must be a backquote,
// a double quote, or a single quote. On failure the result is
// ("", in, ErrSyntax), with the complete original input returned as rem.
func unquote(in string, unescape bool) (out, rem string, err error) {
	// Determine the quote form and optimistically find the terminating quote.
	if len(in) < 2 {
		// Too short to hold both an opening and a closing quote.
		return "", in, ErrSyntax
	}
	quote := in[0]
	end := index(in[1:], quote)
	if end < 0 {
		return "", in, ErrSyntax
	}
	end += 2 // position after terminating quote; may be wrong if escape sequences are present

	switch quote {
	case '`':
		// Raw strings have no escapes, so the first closing backquote found
		// above is always the real terminator.
		switch {
		case !unescape:
			out = in[:end] // include quotes
		case !contains(in[:end], '\r'):
			out = in[len("`") : end-len("`")] // exclude quotes
		default:
			// Carriage return characters ('\r') inside raw string literals
			// are discarded from the raw string value.
			// At least one '\r' is present here, hence the smaller capacity.
			buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
			for i := len("`"); i < end-len("`"); i++ {
				if in[i] != '\r' {
					buf = append(buf, in[i])
				}
			}
			out = string(buf)
		}
		// NOTE: Prior implementations did not verify that raw strings consist
		// of valid UTF-8 characters and we continue to not verify it as such.
		// The Go specification does not explicitly require valid UTF-8,
		// but only mentions that it is implicitly valid for Go source code
		// (which must be valid UTF-8).
		return out, in[end:], nil
	case '"', '\'':
		// Handle quoted strings without any escape sequences.
		// With no backslash before the first quote byte, that quote is the
		// real terminator, so the content can be validated and sliced
		// directly without building a buffer.
		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
			var valid bool
			switch quote {
			case '"':
				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
			case '\'':
				// The content must be at most one rune that spans everything
				// between the quotes, and not an invalid byte (a RuneError of
				// width 1). Empty content (n == 0) is accepted.
				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
			}
			if valid {
				out = in[:end]
				if unescape {
					out = out[1 : end-1] // exclude quotes
				}
				return out, in[end:], nil
			}
		}

		// Handle quoted strings with escape sequences.
		// Input that failed validation above also lands here and is
		// re-examined one character at a time.
		var buf []byte
		in0 := in   // original input, needed for error returns and the verbatim prefix
		in = in[1:] // skip starting quote
		if unescape {
			buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
		}
		for len(in) > 0 && in[0] != quote {
			// Process the next character,
			// rejecting any unescaped newline characters which are invalid.
			// Note that rem and err here shadow the named results.
			r, multibyte, rem, err := UnquoteChar(in, quote)
			if in[0] == '\n' || err != nil {
				return "", in0, ErrSyntax
			}
			in = rem

			// Append the character if unescaping the input.
			if unescape {
				// Single-byte values (ASCII, or raw bytes from \x and octal
				// escapes) are appended as-is; everything else is UTF-8 encoded.
				if r < utf8.RuneSelf || !multibyte {
					buf = append(buf, byte(r))
				} else {
					buf = utf8.AppendRune(buf, r)
				}
			}

			// Single quoted strings must be a single character.
			if quote == '\'' {
				break
			}
		}

		// Verify that the string ends with a terminating quote.
		if !(len(in) > 0 && in[0] == quote) {
			return "", in0, ErrSyntax
		}
		in = in[1:] // skip terminating quote

		if unescape {
			return string(buf), in, nil
		}
		// Verbatim mode: everything consumed from the original input,
		// including both quotes.
		return in0[:len(in0)-len(in)], in, nil
	default:
		return "", in, ErrSyntax
	}
}
   495  
   496  // bsearch is semantically the same as [slices.BinarySearch] (without NaN checks)
   497  // We copied this function because we can not import "slices" here.
   498  func bsearch[S ~[]E, E ~uint16 | ~uint32](s S, v E) (int, bool) {
   499  	n := len(s)
   500  	i, j := 0, n
   501  	for i < j {
   502  		h := i + (j-i)>>1
   503  		if s[h] < v {
   504  			i = h + 1
   505  		} else {
   506  			j = h
   507  		}
   508  	}
   509  	return i, i < n && s[i] == v
   510  }
   511  
   512  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   513  // to give the same answer. It allows this package not to depend on unicode,
   514  // and therefore not pull in all the Unicode tables. If the linker were better
   515  // at tossing unused tables, we could get rid of this implementation.
   516  // That would be nice.
   517  
   518  // IsPrint reports whether the rune is defined as printable by Go, with
   519  // the same definition as [unicode.IsPrint]: letters, numbers, punctuation,
   520  // symbols and ASCII space.
   521  func IsPrint(r rune) bool {
   522  	// Fast check for Latin-1
   523  	if r <= 0xFF {
   524  		if 0x20 <= r && r <= 0x7E {
   525  			// All the ASCII is printable from space through DEL-1.
   526  			return true
   527  		}
   528  		if 0xA1 <= r && r <= 0xFF {
   529  			// Similarly for ¡ through ÿ...
   530  			return r != 0xAD // ...except for the bizarre soft hyphen.
   531  		}
   532  		return false
   533  	}
   534  
   535  	// Same algorithm, either on uint16 or uint32 value.
   536  	// First, find first i such that isPrint[i] >= x.
   537  	// This is the index of either the start or end of a pair that might span x.
   538  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   539  	// If we find x in a range, make sure x is not in isNotPrint list.
   540  
   541  	if 0 <= r && r < 1<<16 {
   542  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   543  		i, _ := bsearch(isPrint, rr)
   544  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   545  			return false
   546  		}
   547  		_, found := bsearch(isNotPrint, rr)
   548  		return !found
   549  	}
   550  
   551  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   552  	i, _ := bsearch(isPrint, rr)
   553  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   554  		return false
   555  	}
   556  	if r >= 0x20000 {
   557  		return true
   558  	}
   559  	r -= 0x10000
   560  	_, found := bsearch(isNotPrint, uint16(r))
   561  	return !found
   562  }
   563  
   564  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   565  // characters include letters, marks, numbers, punctuation, symbols, and
   566  // spaces, from categories L, M, N, P, S, and Zs.
   567  func IsGraphic(r rune) bool {
   568  	if IsPrint(r) {
   569  		return true
   570  	}
   571  	return isInGraphicList(r)
   572  }
   573  
   574  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   575  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   576  // Should be called only if IsPrint fails.
   577  func isInGraphicList(r rune) bool {
   578  	// We know r must fit in 16 bits - see makeisprint.go.
   579  	if r > 0xFFFF {
   580  		return false
   581  	}
   582  	_, found := bsearch(isGraphic, uint16(r))
   583  	return found
   584  }
   585  

View as plain text