libgo/go/compress/bzip2/bzip2.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package bzip2 implements bzip2 decompression.
   6 package bzip2
   7
   8 import "io"
   9
  10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
  11 // of guessing: http://en.wikipedia.org/wiki/Bzip2
  12 // The source code to pyflate was useful for debugging:
  13 // http://www.paul.sladen.org/projects/pyflate
  14
  15 // A StructuralError is returned when the bzip2 data is found to be
  16 // syntactically invalid.
  17 type StructuralError string
  18
  19 func (s StructuralError) Error() string {
  20         return "bzip2 data invalid: " + string(s)
  21 }
  22
  23 // A reader decompresses bzip2 compressed data.
  24 type reader struct {
  25         br        bitReader
  26         setupDone bool // true if we have parsed the bzip2 header.
  27         blockSize int  // blockSize in bytes, i.e. 900 * 1024.
  28         eof       bool
  29         buf       []byte    // stores Burrows-Wheeler transformed data.
  30         c         [256]uint // the `C' array for the inverse BWT.
  31         tt        []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
  32         tPos      uint32    // Index of the next output byte in tt.
  33
  34         preRLE      []uint32 // contains the RLE data still to be processed.
  35         preRLEUsed  int      // number of entries of preRLE used.
  36         lastByte    int      // the last byte value seen.
  37         byteRepeats uint     // the number of repeats of lastByte seen.
  38         repeats     uint     // the number of copies of lastByte to output.
  39 }
  40
  41 // NewReader returns an io.Reader which decompresses bzip2 data from r.
  42 func NewReader(r io.Reader) io.Reader {
  43         bz2 := new(reader)
  44         bz2.br = newBitReader(r)
  45         return bz2
  46 }
  47
  48 const bzip2FileMagic = 0x425a // "BZ"
  49 const bzip2BlockMagic = 0x314159265359
  50 const bzip2FinalMagic = 0x177245385090
  51
  52 // setup parses the bzip2 header.
  53 func (bz2 *reader) setup() error {
  54         br := &bz2.br
  55
  56         magic := br.ReadBits(16)
  57         if magic != bzip2FileMagic {
  58                 return StructuralError("bad magic value")
  59         }
  60
  61         t := br.ReadBits(8)
  62         if t != 'h' {
  63                 return StructuralError("non-Huffman entropy encoding")
  64         }
  65
  66         level := br.ReadBits(8)
  67         if level < '1' || level > '9' {
  68                 return StructuralError("invalid compression level")
  69         }
  70
  71         bz2.blockSize = 100 * 1024 * (int(level) - '0')
  72         bz2.tt = make([]uint32, bz2.blockSize)
  73         return nil
  74 }
  75
  76 func (bz2 *reader) Read(buf []byte) (n int, err error) {
  77         if bz2.eof {
  78                 return 0, io.EOF
  79         }
  80
  81         if !bz2.setupDone {
  82                 err = bz2.setup()
  83                 brErr := bz2.br.Error()
  84                 if brErr != nil {
  85                         err = brErr
  86                 }
  87                 if err != nil {
  88                         return 0, err
  89                 }
  90                 bz2.setupDone = true
  91         }
  92
  93         n, err = bz2.read(buf)
  94         brErr := bz2.br.Error()
  95         if brErr != nil {
  96                 err = brErr
  97         }
  98         return
  99 }
 100
 101 func (bz2 *reader) read(buf []byte) (n int, err error) {
 102         // bzip2 is a block based compressor, except that it has a run-length
 103         // preprocessing step. The block based nature means that we can
 104         // preallocate fixed-size buffers and reuse them. However, the RLE
 105         // preprocessing would require allocating huge buffers to store the
 106         // maximum expansion. Thus we process blocks all at once, except for
 107         // the RLE which we decompress as required.
 108
 109         for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
 110                 // We have RLE data pending.
 111
 112                 // The run-length encoding works like this:
 113                 // Any sequence of four equal bytes is followed by a length
 114                 // byte which contains the number of repeats of that byte to
 115                 // include. (The number of repeats can be zero.) Because we are
 116                 // decompressing on-demand our state is kept in the reader
 117                 // object.
 118
 119                 if bz2.repeats > 0 {
 120                         buf[n] = byte(bz2.lastByte)
 121                         n++
 122                         bz2.repeats--
 123                         if bz2.repeats == 0 {
 124                                 bz2.lastByte = -1
 125                         }
 126                         continue
 127                 }
 128
 129                 bz2.tPos = bz2.preRLE[bz2.tPos]
 130                 b := byte(bz2.tPos)
 131                 bz2.tPos >>= 8
 132                 bz2.preRLEUsed++
 133
 134                 if bz2.byteRepeats == 3 {
 135                         bz2.repeats = uint(b)
 136                         bz2.byteRepeats = 0
 137                         continue
 138                 }
 139
 140                 if bz2.lastByte == int(b) {
 141                         bz2.byteRepeats++
 142                 } else {
 143                         bz2.byteRepeats = 0
 144                 }
 145                 bz2.lastByte = int(b)
 146
 147                 buf[n] = b
 148                 n++
 149         }
 150
 151         if n > 0 {
 152                 return
 153         }
 154
 155         // No RLE data is pending so we need to read a block.
 156
 157         br := &bz2.br
 158         magic := br.ReadBits64(48)
 159         if magic == bzip2FinalMagic {
 160                 br.ReadBits64(32) // ignored CRC
 161                 bz2.eof = true
 162                 return 0, io.EOF
 163         } else if magic != bzip2BlockMagic {
 164                 return 0, StructuralError("bad magic value found")
 165         }
 166
 167         err = bz2.readBlock()
 168         if err != nil {
 169                 return 0, err
 170         }
 171
 172         return bz2.read(buf)
 173 }
 174
 175 // readBlock reads a bzip2 block. The magic number should already have been consumed.
 176 func (bz2 *reader) readBlock() (err error) {
 177         br := &bz2.br
 178         br.ReadBits64(32) // skip checksum. TODO: check it if we can figure out what it is.
 179         randomized := br.ReadBits(1)
 180         if randomized != 0 {
 181                 return StructuralError("deprecated randomized files")
 182         }
 183         origPtr := uint(br.ReadBits(24))
 184
 185         // If not every byte value is used in the block (i.e., it's text) then
 186         // the symbol set is reduced. The symbols used are stored as a
 187         // two-level, 16x16 bitmap.
 188         symbolRangeUsedBitmap := br.ReadBits(16)
 189         symbolPresent := make([]bool, 256)
 190         numSymbols := 0
 191         for symRange := uint(0); symRange < 16; symRange++ {
 192                 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
 193                         bits := br.ReadBits(16)
 194                         for symbol := uint(0); symbol < 16; symbol++ {
 195                                 if bits&(1<<(15-symbol)) != 0 {
 196                                         symbolPresent[16*symRange+symbol] = true
 197                                         numSymbols++
 198                                 }
 199                         }
 200                 }
 201         }
 202
 203         // A block uses between two and six different Huffman trees.
 204         numHuffmanTrees := br.ReadBits(3)
 205         if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
 206                 return StructuralError("invalid number of Huffman trees")
 207         }
 208
 209         // The Huffman tree can switch every 50 symbols so there's a list of
 210         // tree indexes telling us which tree to use for each 50 symbol block.
 211         numSelectors := br.ReadBits(15)
 212         treeIndexes := make([]uint8, numSelectors)
 213
 214         // The tree indexes are move-to-front transformed and stored as unary
 215         // numbers.
 216         mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
 217         for i := range treeIndexes {
 218                 c := 0
 219                 for {
 220                         inc := br.ReadBits(1)
 221                         if inc == 0 {
 222                                 break
 223                         }
 224                         c++
 225                 }
 226                 if c >= numHuffmanTrees {
 227                         return StructuralError("tree index too large")
 228                 }
 229                 treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c))
 230         }
 231
 232         // The list of symbols for the move-to-front transform is taken from
 233         // the previously decoded symbol bitmap.
 234         symbols := make([]byte, numSymbols)
 235         nextSymbol := 0
 236         for i := 0; i < 256; i++ {
 237                 if symbolPresent[i] {
 238                         symbols[nextSymbol] = byte(i)
 239                         nextSymbol++
 240                 }
 241         }
 242         mtf := newMTFDecoder(symbols)
 243
 244         numSymbols += 2 // to account for RUNA and RUNB symbols
 245         huffmanTrees := make([]huffmanTree, numHuffmanTrees)
 246
 247         // Now we decode the arrays of code-lengths for each tree.
 248         lengths := make([]uint8, numSymbols)
 249         for i := 0; i < numHuffmanTrees; i++ {
 250                 // The code lengths are delta encoded from a 5-bit base value.
 251                 length := br.ReadBits(5)
 252                 for j := 0; j < numSymbols; j++ {
 253                         for {
 254                                 if !br.ReadBit() {
 255                                         break
 256                                 }
 257                                 if br.ReadBit() {
 258                                         length--
 259                                 } else {
 260                                         length++
 261                                 }
 262                         }
 263                         if length < 0 || length > 20 {
 264                                 return StructuralError("Huffman length out of range")
 265                         }
 266                         lengths[j] = uint8(length)
 267                 }
 268                 huffmanTrees[i], err = newHuffmanTree(lengths)
 269                 if err != nil {
 270                         return err
 271                 }
 272         }
 273
 274         selectorIndex := 1 // the next tree index to use
 275         currentHuffmanTree := huffmanTrees[treeIndexes[0]]
 276         bufIndex := 0 // indexes bz2.buf, the output buffer.
 277         // The output of the move-to-front transform is run-length encoded and
 278         // we merge the decoding into the Huffman parsing loop. These two
 279         // variables accumulate the repeat count. See the Wikipedia page for
 280         // details.
 281         repeat := 0
 282         repeat_power := 0
 283
 284         // The `C' array (used by the inverse BWT) needs to be zero initialized.
 285         for i := range bz2.c {
 286                 bz2.c[i] = 0
 287         }
 288
 289         decoded := 0 // counts the number of symbols decoded by the current tree.
 290         for {
 291                 if decoded == 50 {
 292                         currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
 293                         selectorIndex++
 294                         decoded = 0
 295                 }
 296
 297                 v := currentHuffmanTree.Decode(br)
 298                 decoded++
 299
 300                 if v < 2 {
 301                         // This is either the RUNA or RUNB symbol.
 302                         if repeat == 0 {
 303                                 repeat_power = 1
 304                         }
 305                         repeat += repeat_power << v
 306                         repeat_power <<= 1
 307
 308                         // This limit of 2 million comes from the bzip2 source
 309                         // code. It prevents repeat from overflowing.
 310                         if repeat > 2*1024*1024 {
 311                                 return StructuralError("repeat count too large")
 312                         }
 313                         continue
 314                 }
 315
 316                 if repeat > 0 {
 317                         // We have decoded a complete run-length so we need to
 318                         // replicate the last output symbol.
 319                         for i := 0; i < repeat; i++ {
 320                                 b := byte(mtf.First())
 321                                 bz2.tt[bufIndex] = uint32(b)
 322                                 bz2.c[b]++
 323                                 bufIndex++
 324                         }
 325                         repeat = 0
 326                 }
 327
 328                 if int(v) == numSymbols-1 {
 329                         // This is the EOF symbol. Because it's always at the
 330                         // end of the move-to-front list, and never gets moved
 331                         // to the front, it has this unique value.
 332                         break
 333                 }
 334
 335                 // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
 336                 // one would expect |v-2| to be passed to the MTF decoder.
 337                 // However, the front of the MTF list is never referenced as 0,
 338                 // it's always referenced with a run-length of 1. Thus 0
 339                 // doesn't need to be encoded and we have |v-1| in the next
 340                 // line.
 341                 b := byte(mtf.Decode(int(v - 1)))
 342                 bz2.tt[bufIndex] = uint32(b)
 343                 bz2.c[b]++
 344                 bufIndex++
 345         }
 346
 347         if origPtr >= uint(bufIndex) {
 348                 return StructuralError("origPtr out of bounds")
 349         }
 350
 351         // We have completed the entropy decoding. Now we can perform the
 352         // inverse BWT and setup the RLE buffer.
 353         bz2.preRLE = bz2.tt[:bufIndex]
 354         bz2.preRLEUsed = 0
 355         bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
 356         bz2.lastByte = -1
 357         bz2.byteRepeats = 0
 358         bz2.repeats = 0
 359
 360         return nil
 361 }
 362
 363 // inverseBWT implements the inverse Burrows-Wheeler transform as described in
 364 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
 365 // In that document, origPtr is called `I' and c is the `C' array after the
 366 // first pass over the data. It's an argument here because we merge the first
 367 // pass with the Huffman decoding.
 368 //
 369 // This also implements the `single array' method from the bzip2 source code
 370 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
 371 // index of the next byte in the top 24-bits. The index of the first byte is
 372 // returned.
 373 func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
 374         sum := uint(0)
 375         for i := 0; i < 256; i++ {
 376                 sum += c[i]
 377                 c[i] = sum - c[i]
 378         }
 379
 380         for i := range tt {
 381                 b := tt[i] & 0xff
 382                 tt[c[b]] |= uint32(i) << 8
 383                 c[b]++
 384         }
 385
 386         return tt[origPtr] >> 8
 387 }