// Copyright 2026 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package zip import ( "cmp" "compress/gzip" "errors" "fmt" "io" "os" "slices" ) // A sparseFile represents an archive as a sequence of non-zero byte spans // (the LFH headers, the Central Directory, the EOCD records, and any // non-zero compressed bodies) plus a total length. Bytes outside any span // are implicitly zero. This is the storage format used for goldens under // testdata/zip64/ (suffix .zsparse) and the in-memory shape produced by // the writer-reproduction harness. // // On-disk layout (all little-endian): // // uint64 size // uint32 numSpans // for each span: // uint64 offset // uint32 dataLen // dataLen bytes // // Spans are sorted by offset and non-overlapping. type sparseFile struct { Size int64 Spans []sparseSpan } type sparseSpan struct { Offset int64 Data []byte } // ReadAt implements [io.ReaderAt] by serving the underlying spans and // synthesizing zero bytes for any gap inside [0, Size). func (f *sparseFile) ReadAt(p []byte, off int64) (int, error) { if off < 0 { return 0, errors.New("sparseFile: negative offset") } if off >= f.Size { return 0, io.EOF } end := min(off+int64(len(p)), f.Size) n := int(end - off) clear(p[:n]) for _, s := range f.Spans { sEnd := s.Offset + int64(len(s.Data)) if sEnd <= off || s.Offset >= end { continue } from := max(s.Offset, off) to := min(sEnd, end) copy(p[from-off:to-off], s.Data[from-s.Offset:to-s.Offset]) } if n < len(p) { return n, io.EOF } return n, nil } // materializeTail returns the last keep bytes of the conceptual file as a // plain byte slice, suitable for [parseCD]. func (f *sparseFile) materializeTail(keep int64) (data []byte, baseOff uint64) { if keep > f.Size { keep = f.Size } base := f.Size - keep buf := make([]byte, keep) f.ReadAt(buf, base) return buf, uint64(base) } const sparseChunk = 4096 // scanSparse stream-reads r and builds a sparseFile, treating any contiguous // run of zero bytes (rounded to sparseChunk boundaries) as a gap. Adjacent // non-zero chunks are coalesced into one span. func scanSparse(r io.Reader) (*sparseFile, error) { f := &sparseFile{} var cur *sparseSpan buf := make([]byte, sparseChunk) for { n, err := io.ReadFull(r, buf) if n > 0 { chunk := buf[:n] if isAllZero(chunk) { if cur != nil { f.Spans = append(f.Spans, *cur) cur = nil } } else { if cur == nil { cur = &sparseSpan{Offset: f.Size} } cur.Data = append(cur.Data, chunk...) } f.Size += int64(n) } if err != nil { if err == io.EOF || err == io.ErrUnexpectedEOF { break } return nil, err } } if cur != nil { f.Spans = append(f.Spans, *cur) } return f, nil } // writeSparse serializes f to w in the on-disk format described on // [sparseFile]. func writeSparse(w io.Writer, f *sparseFile) error { var hdr [12]byte le.PutUint64(hdr[:8], uint64(f.Size)) le.PutUint32(hdr[8:12], uint32(len(f.Spans))) if _, err := w.Write(hdr[:]); err != nil { return err } for _, s := range f.Spans { var b [12]byte le.PutUint64(b[:8], uint64(s.Offset)) le.PutUint32(b[8:12], uint32(len(s.Data))) if _, err := w.Write(b[:]); err != nil { return err } if _, err := w.Write(s.Data); err != nil { return err } } return nil } // readSparse parses the on-disk format from r. func readSparse(r io.Reader) (*sparseFile, error) { var hdr [12]byte if _, err := io.ReadFull(r, hdr[:]); err != nil { return nil, err } f := &sparseFile{ Size: int64(le.Uint64(hdr[:8])), } n := le.Uint32(hdr[8:12]) if n > 1<<20 { return nil, fmt.Errorf("sparseFile: implausible span count %d", n) } f.Spans = make([]sparseSpan, n) for i := range f.Spans { var b [12]byte if _, err := io.ReadFull(r, b[:]); err != nil { return nil, err } f.Spans[i].Offset = int64(le.Uint64(b[:8])) sz := le.Uint32(b[8:12]) f.Spans[i].Data = make([]byte, sz) if _, err := io.ReadFull(r, f.Spans[i].Data); err != nil { return nil, err } } if !slices.IsSortedFunc(f.Spans, func(a, b sparseSpan) int { return cmp.Compare(a.Offset, b.Offset) }) { return nil, errors.New("sparseFile: spans not sorted") } return f, nil } // readSparseFile reads a sparse file from path. The file is expected to be // gzip-compressed; the outer gzip wrap shrinks goldens that contain non-zero // compressed bodies (e.g., the deflate-zeros entries) by 100x because // deflate-of-zeros is highly repetitive. Small Store goldens benefit too: // gzip's header overhead is ~30 bytes, well under the bytes saved on a 4 KB // sparse representation. func readSparseFile(path string) (*sparseFile, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() zr, err := gzip.NewReader(f) if err != nil { return nil, err } defer zr.Close() return readSparse(zr) } // isAllZero reports whether every byte in b is 0. func isAllZero(b []byte) bool { for _, c := range b { if c != 0 { return false } } return true } // sparseBuffer accumulates writes into a [sparseFile], dropping any // chunkSize-byte chunk that is all-zero. This makes capturing the result // of pushing multi-GiB streams of zeros through the writer almost free — // the only bytes that end up retained are the LFHs, the Central // Directory, the EOCD records, and any non-zero compressed body. type sparseBuffer struct { f sparseFile cur *sparseSpan } func (t *sparseBuffer) Write(p []byte) (int, error) { n := len(p) for len(p) > 0 { k := len(p) if k > sparseChunk { k = sparseChunk } chunk := p[:k] if isAllZero(chunk) { t.cur = nil } else { if t.cur == nil { t.f.Spans = append(t.f.Spans, sparseSpan{Offset: t.f.Size}) t.cur = &t.f.Spans[len(t.f.Spans)-1] } t.cur.Data = append(t.cur.Data, chunk...) } t.f.Size += int64(k) p = p[k:] } return n, nil }