001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.snappy; 020 021import java.io.IOException; 022import java.io.InputStream; 023 024import org.apache.commons.compress.compressors.CompressorInputStream; 025import org.apache.commons.compress.utils.IOUtils; 026 027/** 028 * CompressorInputStream for the raw Snappy format. 029 * 030 * <p>This implementation uses an internal buffer in order to handle 031 * the back-references that are at the heart of the LZ77 algorithm. 032 * The size of the buffer must be at least as big as the biggest 033 * offset used in the compressed stream. The current version of the 034 * Snappy algorithm as defined by Google works on 32k blocks and 035 * doesn't contain offsets bigger than 32k which is the default block 036 * size used by this class.</p> 037 * 038 * @see <a href="http://code.google.com/p/snappy/source/browse/trunk/format_description.txt">Snappy compressed format description</a> 039 * @since 1.7 040 */ 041public class SnappyCompressorInputStream extends CompressorInputStream { 042 043 /** Mask used to determine the type of "tag" is being processed */ 044 private static final int TAG_MASK = 0x03; 045 046 /** Default block size */ 047 public static final int DEFAULT_BLOCK_SIZE = 32768; 048 049 /** Buffer to write decompressed bytes to for back-references */ 050 private final byte[] decompressBuf; 051 052 /** One behind the index of the last byte in the buffer that was written */ 053 private int writeIndex; 054 055 /** Index of the next byte to be read. */ 056 private int readIndex; 057 058 /** The actual block size specified */ 059 private final int blockSize; 060 061 /** The underlying stream to read compressed data from */ 062 private final InputStream in; 063 064 /** The size of the uncompressed data */ 065 private final int size; 066 067 /** Number of uncompressed bytes still to be read. */ 068 private int uncompressedBytesRemaining; 069 070 // used in no-arg read method 071 private final byte[] oneByte = new byte[1]; 072 073 private boolean endReached = false; 074 075 /** 076 * Constructor using the default buffer size of 32k. 077 * 078 * @param is 079 * An InputStream to read compressed data from 080 * 081 * @throws IOException if reading fails 082 */ 083 public SnappyCompressorInputStream(final InputStream is) throws IOException { 084 this(is, DEFAULT_BLOCK_SIZE); 085 } 086 087 /** 088 * Constructor using a configurable buffer size. 089 * 090 * @param is 091 * An InputStream to read compressed data from 092 * @param blockSize 093 * The block size used in compression 094 * 095 * @throws IOException if reading fails 096 */ 097 public SnappyCompressorInputStream(final InputStream is, final int blockSize) 098 throws IOException { 099 this.in = is; 100 this.blockSize = blockSize; 101 this.decompressBuf = new byte[blockSize * 3]; 102 this.writeIndex = readIndex = 0; 103 uncompressedBytesRemaining = size = (int) readSize(); 104 } 105 106 /** {@inheritDoc} */ 107 @Override 108 public int read() throws IOException { 109 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 110 } 111 112 /** {@inheritDoc} */ 113 @Override 114 public void close() throws IOException { 115 in.close(); 116 } 117 118 /** {@inheritDoc} */ 119 @Override 120 public int available() { 121 return writeIndex - readIndex; 122 } 123 124 /** 125 * {@inheritDoc} 126 */ 127 @Override 128 public int read(final byte[] b, final int off, final int len) throws IOException { 129 if (endReached) { 130 return -1; 131 } 132 final int avail = available(); 133 if (len > avail) { 134 fill(len - avail); 135 } 136 137 final int readable = Math.min(len, available()); 138 if (readable == 0 && len > 0) { 139 return -1; 140 } 141 System.arraycopy(decompressBuf, readIndex, b, off, readable); 142 readIndex += readable; 143 if (readIndex > blockSize) { 144 slideBuffer(); 145 } 146 return readable; 147 } 148 149 /** 150 * Try to fill the buffer with enough bytes to satisfy the current 151 * read request. 152 * 153 * @param len the number of uncompressed bytes to read 154 */ 155 private void fill(final int len) throws IOException { 156 if (uncompressedBytesRemaining == 0) { 157 endReached = true; 158 } 159 int readNow = Math.min(len, uncompressedBytesRemaining); 160 161 while (readNow > 0) { 162 final int b = readOneByte(); 163 int length = 0; 164 long offset = 0; 165 166 switch (b & TAG_MASK) { 167 168 case 0x00: 169 170 length = readLiteralLength(b); 171 172 if (expandLiteral(length)) { 173 return; 174 } 175 break; 176 177 case 0x01: 178 179 /* 180 * These elements can encode lengths between [4..11] bytes and 181 * offsets between [0..2047] bytes. (len-4) occupies three bits 182 * and is stored in bits [2..4] of the tag byte. The offset 183 * occupies 11 bits, of which the upper three are stored in the 184 * upper three bits ([5..7]) of the tag byte, and the lower 185 * eight are stored in a byte following the tag byte. 186 */ 187 188 length = 4 + ((b >> 2) & 0x07); 189 offset = (b & 0xE0) << 3; 190 offset |= readOneByte(); 191 192 if (expandCopy(offset, length)) { 193 return; 194 } 195 break; 196 197 case 0x02: 198 199 /* 200 * These elements can encode lengths between [1..64] and offsets 201 * from [0..65535]. (len-1) occupies six bits and is stored in 202 * the upper six bits ([2..7]) of the tag byte. The offset is 203 * stored as a little-endian 16-bit integer in the two bytes 204 * following the tag byte. 205 */ 206 207 length = (b >> 2) + 1; 208 209 offset = readOneByte(); 210 offset |= readOneByte() << 8; 211 212 if (expandCopy(offset, length)) { 213 return; 214 } 215 break; 216 217 case 0x03: 218 219 /* 220 * These are like the copies with 2-byte offsets (see previous 221 * subsection), except that the offset is stored as a 32-bit 222 * integer instead of a 16-bit integer (and thus will occupy 223 * four bytes). 224 */ 225 226 length = (b >> 2) + 1; 227 228 offset = readOneByte(); 229 offset |= readOneByte() << 8; 230 offset |= readOneByte() << 16; 231 offset |= ((long) readOneByte()) << 24; 232 233 if (expandCopy(offset, length)) { 234 return; 235 } 236 break; 237 } 238 239 readNow -= length; 240 uncompressedBytesRemaining -= length; 241 } 242 } 243 244 /** 245 * Slide buffer. 246 * 247 * <p>Move all bytes of the buffer after the first block down to 248 * the beginning of the buffer.</p> 249 */ 250 private void slideBuffer() { 251 System.arraycopy(decompressBuf, blockSize, decompressBuf, 0, 252 blockSize * 2); 253 writeIndex -= blockSize; 254 readIndex -= blockSize; 255 } 256 257 258 /* 259 * For literals up to and including 60 bytes in length, the 260 * upper six bits of the tag byte contain (len-1). The literal 261 * follows immediately thereafter in the bytestream. - For 262 * longer literals, the (len-1) value is stored after the tag 263 * byte, little-endian. The upper six bits of the tag byte 264 * describe how many bytes are used for the length; 60, 61, 62 265 * or 63 for 1-4 bytes, respectively. The literal itself follows 266 * after the length. 267 */ 268 private int readLiteralLength(final int b) throws IOException { 269 int length; 270 switch (b >> 2) { 271 case 60: 272 length = readOneByte(); 273 break; 274 case 61: 275 length = readOneByte(); 276 length |= readOneByte() << 8; 277 break; 278 case 62: 279 length = readOneByte(); 280 length |= readOneByte() << 8; 281 length |= readOneByte() << 16; 282 break; 283 case 63: 284 length = readOneByte(); 285 length |= readOneByte() << 8; 286 length |= readOneByte() << 16; 287 length |= (((long) readOneByte()) << 24); 288 break; 289 default: 290 length = b >> 2; 291 break; 292 } 293 294 return length + 1; 295 } 296 297 /** 298 * Literals are uncompressed data stored directly in the byte stream. 299 * 300 * @param length 301 * The number of bytes to read from the underlying stream 302 * 303 * @throws IOException 304 * If the first byte cannot be read for any reason other than 305 * end of file, or if the input stream has been closed, or if 306 * some other I/O error occurs. 307 * @return True if the decompressed data should be flushed 308 */ 309 private boolean expandLiteral(final int length) throws IOException { 310 final int bytesRead = IOUtils.readFully(in, decompressBuf, writeIndex, length); 311 count(bytesRead); 312 if (length != bytesRead) { 313 throw new IOException("Premature end of stream"); 314 } 315 316 writeIndex += length; 317 return writeIndex >= 2 * this.blockSize; 318 } 319 320 /** 321 * Copies are references back into previous decompressed data, telling the 322 * decompressor to reuse data it has previously decoded. They encode two 323 * values: The offset, saying how many bytes back from the current position 324 * to read, and the length, how many bytes to copy. Offsets of zero can be 325 * encoded, but are not legal; similarly, it is possible to encode 326 * backreferences that would go past the end of the block (offset > current 327 * decompressed position), which is also nonsensical and thus not allowed. 328 * 329 * @param off 330 * The offset from the backward from the end of expanded stream 331 * @param length 332 * The number of bytes to copy 333 * 334 * @throws IOException 335 * An the offset expands past the front of the decompression 336 * buffer 337 * @return True if the decompressed data should be flushed 338 */ 339 private boolean expandCopy(final long off, final int length) throws IOException { 340 if (off > blockSize) { 341 throw new IOException("Offset is larger than block size"); 342 } 343 final int offset = (int) off; 344 345 if (offset == 1) { 346 final byte lastChar = decompressBuf[writeIndex - 1]; 347 for (int i = 0; i < length; i++) { 348 decompressBuf[writeIndex++] = lastChar; 349 } 350 } else if (length < offset) { 351 System.arraycopy(decompressBuf, writeIndex - offset, 352 decompressBuf, writeIndex, length); 353 writeIndex += length; 354 } else { 355 int fullRotations = length / offset; 356 final int pad = length - (offset * fullRotations); 357 358 while (fullRotations-- != 0) { 359 System.arraycopy(decompressBuf, writeIndex - offset, 360 decompressBuf, writeIndex, offset); 361 writeIndex += offset; 362 } 363 364 if (pad > 0) { 365 System.arraycopy(decompressBuf, writeIndex - offset, 366 decompressBuf, writeIndex, pad); 367 368 writeIndex += pad; 369 } 370 } 371 return writeIndex >= 2 * this.blockSize; 372 } 373 374 /** 375 * This helper method reads the next byte of data from the input stream. The 376 * value byte is returned as an <code>int</code> in the range <code>0</code> 377 * to <code>255</code>. If no byte is available because the end of the 378 * stream has been reached, an Exception is thrown. 379 * 380 * @return The next byte of data 381 * @throws IOException 382 * EOF is reached or error reading the stream 383 */ 384 private int readOneByte() throws IOException { 385 final int b = in.read(); 386 if (b == -1) { 387 throw new IOException("Premature end of stream"); 388 } 389 count(1); 390 return b & 0xFF; 391 } 392 393 /** 394 * The stream starts with the uncompressed length (up to a maximum of 2^32 - 395 * 1), stored as a little-endian varint. Varints consist of a series of 396 * bytes, where the lower 7 bits are data and the upper bit is set iff there 397 * are more bytes to be read. In other words, an uncompressed length of 64 398 * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE) 399 * would be stored as 0xFE 0xFF 0x7F. 400 * 401 * @return The size of the uncompressed data 402 * 403 * @throws IOException 404 * Could not read a byte 405 */ 406 private long readSize() throws IOException { 407 int index = 0; 408 long sz = 0; 409 int b = 0; 410 411 do { 412 b = readOneByte(); 413 sz |= (b & 0x7f) << (index++ * 7); 414 } while (0 != (b & 0x80)); 415 return sz; 416 } 417 418 /** 419 * Get the uncompressed size of the stream 420 * 421 * @return the uncompressed size 422 */ 423 public int getSize() { 424 return size; 425 } 426 427}