Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.snappy;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.commons.compress.compressors.CompressorInputStream;
025import org.apache.commons.compress.utils.IOUtils;
026
027/**
028 * CompressorInputStream for the raw Snappy format.
029 *
030 * <p>This implementation uses an internal buffer in order to handle
031 * the back-references that are at the heart of the LZ77 algorithm.
032 * The size of the buffer must be at least as big as the biggest
033 * offset used in the compressed stream.  The current version of the
034 * Snappy algorithm as defined by Google works on 32k blocks and
035 * doesn't contain offsets bigger than 32k which is the default block
036 * size used by this class.</p>
037 *
038 * @see <a href="http://code.google.com/p/snappy/source/browse/trunk/format_description.txt">Snappy compressed format description</a>
039 * @since 1.7
040 */
041public class SnappyCompressorInputStream extends CompressorInputStream {
042
043    /** Mask used to determine the type of "tag" is being processed */
044    private static final int TAG_MASK = 0x03;
045
046    /** Default block size */
047    public static final int DEFAULT_BLOCK_SIZE = 32768;
048
049    /** Buffer to write decompressed bytes to for back-references */
050    private final byte[] decompressBuf;
051
052    /** One behind the index of the last byte in the buffer that was written */
053    private int writeIndex;
054
055    /** Index of the next byte to be read. */
056    private int readIndex;
057
058    /** The actual block size specified */
059    private final int blockSize;
060
061    /** The underlying stream to read compressed data from */
062    private final InputStream in;
063
064    /** The size of the uncompressed data */
065    private final int size;
066
067    /** Number of uncompressed bytes still to be read. */
068    private int uncompressedBytesRemaining;
069
070    // used in no-arg read method
071    private final byte[] oneByte = new byte[1];
072
073    private boolean endReached = false;
074
075    /**
076     * Constructor using the default buffer size of 32k.
077     * 
078     * @param is
079     *            An InputStream to read compressed data from
080     * 
081     * @throws IOException if reading fails
082     */
083    public SnappyCompressorInputStream(final InputStream is) throws IOException {
084        this(is, DEFAULT_BLOCK_SIZE);
085    }
086
087    /**
088     * Constructor using a configurable buffer size.
089     * 
090     * @param is
091     *            An InputStream to read compressed data from
092     * @param blockSize
093     *            The block size used in compression
094     * 
095     * @throws IOException if reading fails
096     */
097    public SnappyCompressorInputStream(final InputStream is, final int blockSize)
098            throws IOException {
099        this.in = is;
100        this.blockSize = blockSize;
101        this.decompressBuf = new byte[blockSize * 3];
102        this.writeIndex = readIndex = 0;
103        uncompressedBytesRemaining = size = (int) readSize();
104    }
105
106    /** {@inheritDoc} */
107    @Override
108    public int read() throws IOException {
109        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
110    }
111
112    /** {@inheritDoc} */
113    @Override
114    public void close() throws IOException {
115        in.close();
116    }
117
118    /** {@inheritDoc} */
119    @Override
120    public int available() {
121        return writeIndex - readIndex;
122    }
123
124    /**
125     * {@inheritDoc}
126     */
127    @Override
128    public int read(final byte[] b, final int off, final int len) throws IOException {
129        if (endReached) {
130            return -1;
131        }
132        final int avail = available();
133        if (len > avail) {
134            fill(len - avail);
135        }
136
137        final int readable = Math.min(len, available());
138        if (readable == 0 && len > 0) {
139            return -1;
140        }
141        System.arraycopy(decompressBuf, readIndex, b, off, readable);
142        readIndex += readable;
143        if (readIndex > blockSize) {
144            slideBuffer();
145        }
146        return readable;
147    }
148
149    /**
150     * Try to fill the buffer with enough bytes to satisfy the current
151     * read request.
152     *
153     * @param len the number of uncompressed bytes to read
154     */
155    private void fill(final int len) throws IOException {
156        if (uncompressedBytesRemaining == 0) {
157            endReached = true;
158        }
159        int readNow = Math.min(len, uncompressedBytesRemaining);
160
161        while (readNow > 0) {
162            final int b = readOneByte();
163            int length = 0;
164            long offset = 0;
165
166            switch (b & TAG_MASK) {
167
168            case 0x00:
169
170                length = readLiteralLength(b);
171
172                if (expandLiteral(length)) {
173                    return;
174                }
175                break;
176
177            case 0x01:
178
179                /*
180                 * These elements can encode lengths between [4..11] bytes and
181                 * offsets between [0..2047] bytes. (len-4) occupies three bits
182                 * and is stored in bits [2..4] of the tag byte. The offset
183                 * occupies 11 bits, of which the upper three are stored in the
184                 * upper three bits ([5..7]) of the tag byte, and the lower
185                 * eight are stored in a byte following the tag byte.
186                 */
187
188                length = 4 + ((b >> 2) & 0x07);
189                offset = (b & 0xE0) << 3;
190                offset |= readOneByte();
191
192                if (expandCopy(offset, length)) {
193                    return;
194                }
195                break;
196
197            case 0x02:
198
199                /*
200                 * These elements can encode lengths between [1..64] and offsets
201                 * from [0..65535]. (len-1) occupies six bits and is stored in
202                 * the upper six bits ([2..7]) of the tag byte. The offset is
203                 * stored as a little-endian 16-bit integer in the two bytes
204                 * following the tag byte.
205                 */
206
207                length = (b >> 2) + 1;
208
209                offset = readOneByte();
210                offset |= readOneByte() << 8;
211
212                if (expandCopy(offset, length)) {
213                    return;
214                }
215                break;
216
217            case 0x03:
218
219                /*
220                 * These are like the copies with 2-byte offsets (see previous
221                 * subsection), except that the offset is stored as a 32-bit
222                 * integer instead of a 16-bit integer (and thus will occupy
223                 * four bytes).
224                 */
225
226                length = (b >> 2) + 1;
227
228                offset = readOneByte();
229                offset |= readOneByte() << 8;
230                offset |= readOneByte() << 16;
231                offset |= ((long) readOneByte()) << 24;
232
233                if (expandCopy(offset, length)) {
234                    return;
235                }
236                break;
237            }
238
239            readNow -= length;
240            uncompressedBytesRemaining -= length;
241        }
242    }
243
244    /**
245     * Slide buffer.
246     *
247     * <p>Move all bytes of the buffer after the first block down to
248     * the beginning of the buffer.</p>
249     */
250    private void slideBuffer() {
251        System.arraycopy(decompressBuf, blockSize, decompressBuf, 0,
252                         blockSize * 2);
253        writeIndex -= blockSize;
254        readIndex -= blockSize;
255    }
256
257
258    /*
259     * For literals up to and including 60 bytes in length, the
260     * upper six bits of the tag byte contain (len-1). The literal
261     * follows immediately thereafter in the bytestream. - For
262     * longer literals, the (len-1) value is stored after the tag
263     * byte, little-endian. The upper six bits of the tag byte
264     * describe how many bytes are used for the length; 60, 61, 62
265     * or 63 for 1-4 bytes, respectively. The literal itself follows
266     * after the length.
267     */
268    private int readLiteralLength(final int b) throws IOException {
269        int length;
270        switch (b >> 2) {
271        case 60:
272            length = readOneByte();
273            break;
274        case 61:
275            length = readOneByte();
276            length |= readOneByte() << 8;
277            break;
278        case 62:
279            length = readOneByte();
280            length |= readOneByte() << 8;
281            length |= readOneByte() << 16;
282            break;
283        case 63:
284            length = readOneByte();
285            length |= readOneByte() << 8;
286            length |= readOneByte() << 16;
287            length |= (((long) readOneByte()) << 24);
288            break;
289        default:
290            length = b >> 2;
291            break;
292        }
293
294        return length + 1;
295    }
296
297    /**
298     * Literals are uncompressed data stored directly in the byte stream.
299     * 
300     * @param length
301     *            The number of bytes to read from the underlying stream
302     * 
303     * @throws IOException
304     *             If the first byte cannot be read for any reason other than
305     *             end of file, or if the input stream has been closed, or if
306     *             some other I/O error occurs.
307     * @return True if the decompressed data should be flushed
308     */
309    private boolean expandLiteral(final int length) throws IOException {
310        final int bytesRead = IOUtils.readFully(in, decompressBuf, writeIndex, length);
311        count(bytesRead);
312        if (length != bytesRead) {
313            throw new IOException("Premature end of stream");
314        }
315
316        writeIndex += length;
317        return writeIndex >= 2 * this.blockSize;
318    }
319
320    /**
321     * Copies are references back into previous decompressed data, telling the
322     * decompressor to reuse data it has previously decoded. They encode two
323     * values: The offset, saying how many bytes back from the current position
324     * to read, and the length, how many bytes to copy. Offsets of zero can be
325     * encoded, but are not legal; similarly, it is possible to encode
326     * backreferences that would go past the end of the block (offset > current
327     * decompressed position), which is also nonsensical and thus not allowed.
328     * 
329     * @param off
330     *            The offset from the backward from the end of expanded stream
331     * @param length
332     *            The number of bytes to copy
333     * 
334     * @throws IOException
335     *             An the offset expands past the front of the decompression
336     *             buffer
337     * @return True if the decompressed data should be flushed
338     */
339    private boolean expandCopy(final long off, final int length) throws IOException {
340        if (off > blockSize) {
341            throw new IOException("Offset is larger than block size");
342        }
343        final int offset = (int) off;
344
345        if (offset == 1) {
346            final byte lastChar = decompressBuf[writeIndex - 1];
347            for (int i = 0; i < length; i++) {
348                decompressBuf[writeIndex++] = lastChar;
349            }
350        } else if (length < offset) {
351            System.arraycopy(decompressBuf, writeIndex - offset,
352                    decompressBuf, writeIndex, length);
353            writeIndex += length;
354        } else {
355            int fullRotations = length / offset;
356            final int pad = length - (offset * fullRotations);
357
358            while (fullRotations-- != 0) {
359                System.arraycopy(decompressBuf, writeIndex - offset,
360                        decompressBuf, writeIndex, offset);
361                writeIndex += offset;
362            }
363
364            if (pad > 0) {
365                System.arraycopy(decompressBuf, writeIndex - offset,
366                        decompressBuf, writeIndex, pad);
367
368                writeIndex += pad;
369            }
370        }
371        return writeIndex >= 2 * this.blockSize;
372    }
373
374    /**
375     * This helper method reads the next byte of data from the input stream. The
376     * value byte is returned as an <code>int</code> in the range <code>0</code>
377     * to <code>255</code>. If no byte is available because the end of the
378     * stream has been reached, an Exception is thrown.
379     * 
380     * @return The next byte of data
381     * @throws IOException
382     *             EOF is reached or error reading the stream
383     */
384    private int readOneByte() throws IOException {
385        final int b = in.read();
386        if (b == -1) {
387            throw new IOException("Premature end of stream");
388        }
389        count(1);
390        return b & 0xFF;
391    }
392
393    /**
394     * The stream starts with the uncompressed length (up to a maximum of 2^32 -
395     * 1), stored as a little-endian varint. Varints consist of a series of
396     * bytes, where the lower 7 bits are data and the upper bit is set iff there
397     * are more bytes to be read. In other words, an uncompressed length of 64
398     * would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
399     * would be stored as 0xFE 0xFF 0x7F.
400     * 
401     * @return The size of the uncompressed data
402     * 
403     * @throws IOException
404     *             Could not read a byte
405     */
406    private long readSize() throws IOException {
407        int index = 0;
408        long sz = 0;
409        int b = 0;
410
411        do {
412            b = readOneByte();
413            sz |= (b & 0x7f) << (index++ * 7);
414        } while (0 != (b & 0x80));
415        return sz;
416    }
417
418    /**
419     * Get the uncompressed size of the stream
420     * 
421     * @return the uncompressed size
422     */
423    public int getSize() {
424        return size;
425    }
426
427}