Source file src/archive/zip/zip64_sparse_test.go

     1  // Copyright 2026 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package zip
     6  
     7  import (
     8  	"cmp"
     9  	"compress/gzip"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"os"
    14  	"slices"
    15  )
    16  
// A sparseFile represents an archive as a sequence of non-zero byte spans
// (the LFH headers, the Central Directory, the EOCD records, and any
// non-zero compressed bodies) plus a total length. Bytes outside any span
// are implicitly zero. This is the storage format used for goldens under
// testdata/zip64/ (suffix .zsparse) and the in-memory shape produced by
// the writer-reproduction harness.
//
// On-disk layout (all little-endian):
//
//	uint64 size
//	uint32 numSpans
//	for each span:
//		uint64 offset
//		uint32 dataLen
//		dataLen bytes
//
// Spans are sorted by offset and non-overlapping.
//
// readSparseFile expects this layout to be wrapped in gzip; readSparse and
// writeSparse operate on the unwrapped layout.
type sparseFile struct {
	Size  int64        // total length of the conceptual file in bytes, gaps included
	Spans []sparseSpan // stored byte runs; every byte outside them reads as zero
}
    38  
// A sparseSpan is one contiguous run of stored bytes within a [sparseFile].
// Data occupies the byte range [Offset, Offset+len(Data)) of the conceptual
// file.
type sparseSpan struct {
	Offset int64  // position of Data[0] in the conceptual file
	Data   []byte // the stored bytes
}
    43  
    44  // ReadAt implements [io.ReaderAt] by serving the underlying spans and
    45  // synthesizing zero bytes for any gap inside [0, Size).
    46  func (f *sparseFile) ReadAt(p []byte, off int64) (int, error) {
    47  	if off < 0 {
    48  		return 0, errors.New("sparseFile: negative offset")
    49  	}
    50  	if off >= f.Size {
    51  		return 0, io.EOF
    52  	}
    53  	end := min(off+int64(len(p)), f.Size)
    54  	n := int(end - off)
    55  	clear(p[:n])
    56  	for _, s := range f.Spans {
    57  		sEnd := s.Offset + int64(len(s.Data))
    58  		if sEnd <= off || s.Offset >= end {
    59  			continue
    60  		}
    61  		from := max(s.Offset, off)
    62  		to := min(sEnd, end)
    63  		copy(p[from-off:to-off], s.Data[from-s.Offset:to-s.Offset])
    64  	}
    65  	if n < len(p) {
    66  		return n, io.EOF
    67  	}
    68  	return n, nil
    69  }
    70  
    71  // materializeTail returns the last keep bytes of the conceptual file as a
    72  // plain byte slice, suitable for [parseCD].
    73  func (f *sparseFile) materializeTail(keep int64) (data []byte, baseOff uint64) {
    74  	if keep > f.Size {
    75  		keep = f.Size
    76  	}
    77  	base := f.Size - keep
    78  	buf := make([]byte, keep)
    79  	f.ReadAt(buf, base)
    80  	return buf, uint64(base)
    81  }
    82  
// sparseChunk is the granularity, in bytes, at which scanSparse and
// sparseBuffer.Write test data for being all zero.
const sparseChunk = 4096
    84  
    85  // scanSparse stream-reads r and builds a sparseFile, treating any contiguous
    86  // run of zero bytes (rounded to sparseChunk boundaries) as a gap. Adjacent
    87  // non-zero chunks are coalesced into one span.
    88  func scanSparse(r io.Reader) (*sparseFile, error) {
    89  	f := &sparseFile{}
    90  	var cur *sparseSpan
    91  	buf := make([]byte, sparseChunk)
    92  	for {
    93  		n, err := io.ReadFull(r, buf)
    94  		if n > 0 {
    95  			chunk := buf[:n]
    96  			if isAllZero(chunk) {
    97  				if cur != nil {
    98  					f.Spans = append(f.Spans, *cur)
    99  					cur = nil
   100  				}
   101  			} else {
   102  				if cur == nil {
   103  					cur = &sparseSpan{Offset: f.Size}
   104  				}
   105  				cur.Data = append(cur.Data, chunk...)
   106  			}
   107  			f.Size += int64(n)
   108  		}
   109  		if err != nil {
   110  			if err == io.EOF || err == io.ErrUnexpectedEOF {
   111  				break
   112  			}
   113  			return nil, err
   114  		}
   115  	}
   116  	if cur != nil {
   117  		f.Spans = append(f.Spans, *cur)
   118  	}
   119  	return f, nil
   120  }
   121  
   122  // writeSparse serializes f to w in the on-disk format described on
   123  // [sparseFile].
   124  func writeSparse(w io.Writer, f *sparseFile) error {
   125  	var hdr [12]byte
   126  	le.PutUint64(hdr[:8], uint64(f.Size))
   127  	le.PutUint32(hdr[8:12], uint32(len(f.Spans)))
   128  	if _, err := w.Write(hdr[:]); err != nil {
   129  		return err
   130  	}
   131  	for _, s := range f.Spans {
   132  		var b [12]byte
   133  		le.PutUint64(b[:8], uint64(s.Offset))
   134  		le.PutUint32(b[8:12], uint32(len(s.Data)))
   135  		if _, err := w.Write(b[:]); err != nil {
   136  			return err
   137  		}
   138  		if _, err := w.Write(s.Data); err != nil {
   139  			return err
   140  		}
   141  	}
   142  	return nil
   143  }
   144  
   145  // readSparse parses the on-disk format from r.
   146  func readSparse(r io.Reader) (*sparseFile, error) {
   147  	var hdr [12]byte
   148  	if _, err := io.ReadFull(r, hdr[:]); err != nil {
   149  		return nil, err
   150  	}
   151  	f := &sparseFile{
   152  		Size: int64(le.Uint64(hdr[:8])),
   153  	}
   154  	n := le.Uint32(hdr[8:12])
   155  	if n > 1<<20 {
   156  		return nil, fmt.Errorf("sparseFile: implausible span count %d", n)
   157  	}
   158  	f.Spans = make([]sparseSpan, n)
   159  	for i := range f.Spans {
   160  		var b [12]byte
   161  		if _, err := io.ReadFull(r, b[:]); err != nil {
   162  			return nil, err
   163  		}
   164  		f.Spans[i].Offset = int64(le.Uint64(b[:8]))
   165  		sz := le.Uint32(b[8:12])
   166  		f.Spans[i].Data = make([]byte, sz)
   167  		if _, err := io.ReadFull(r, f.Spans[i].Data); err != nil {
   168  			return nil, err
   169  		}
   170  	}
   171  	if !slices.IsSortedFunc(f.Spans, func(a, b sparseSpan) int {
   172  		return cmp.Compare(a.Offset, b.Offset)
   173  	}) {
   174  		return nil, errors.New("sparseFile: spans not sorted")
   175  	}
   176  	return f, nil
   177  }
   178  
   179  // readSparseFile reads a sparse file from path. The file is expected to be
   180  // gzip-compressed; the outer gzip wrap shrinks goldens that contain non-zero
   181  // compressed bodies (e.g., the deflate-zeros entries) by 100x because
   182  // deflate-of-zeros is highly repetitive. Small Store goldens benefit too:
   183  // gzip's header overhead is ~30 bytes, well under the bytes saved on a 4 KB
   184  // sparse representation.
   185  func readSparseFile(path string) (*sparseFile, error) {
   186  	f, err := os.Open(path)
   187  	if err != nil {
   188  		return nil, err
   189  	}
   190  	defer f.Close()
   191  	zr, err := gzip.NewReader(f)
   192  	if err != nil {
   193  		return nil, err
   194  	}
   195  	defer zr.Close()
   196  	return readSparse(zr)
   197  }
   198  
   199  // isAllZero reports whether every byte in b is 0.
   200  func isAllZero(b []byte) bool {
   201  	for _, c := range b {
   202  		if c != 0 {
   203  			return false
   204  		}
   205  	}
   206  	return true
   207  }
   208  
// sparseBuffer accumulates writes into a [sparseFile], dropping any chunk
// of at most sparseChunk bytes that is all-zero. This makes capturing the
// result of pushing multi-GiB streams of zeros through the writer almost
// free — the only bytes that end up retained are the LFHs, the Central
// Directory, the EOCD records, and any non-zero compressed body.
//
// Chunks are cut from the start of each Write call, so a short all-zero
// write also ends the span in progress.
type sparseBuffer struct {
	f   sparseFile  // file built so far; f.Size counts every byte written, zero or not
	cur *sparseSpan // element of f.Spans currently being extended; nil after a zero chunk or before any data
}
   218  
   219  func (t *sparseBuffer) Write(p []byte) (int, error) {
   220  	n := len(p)
   221  	for len(p) > 0 {
   222  		k := len(p)
   223  		if k > sparseChunk {
   224  			k = sparseChunk
   225  		}
   226  		chunk := p[:k]
   227  		if isAllZero(chunk) {
   228  			t.cur = nil
   229  		} else {
   230  			if t.cur == nil {
   231  				t.f.Spans = append(t.f.Spans, sparseSpan{Offset: t.f.Size})
   232  				t.cur = &t.f.Spans[len(t.f.Spans)-1]
   233  			}
   234  			t.cur.Data = append(t.cur.Data, chunk...)
   235  		}
   236  		t.f.Size += int64(k)
   237  		p = p[k:]
   238  	}
   239  	return n, nil
   240  }
   241  

View as plain text