Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.commons.compress.compressors.gzip;
020
021import java.io.ByteArrayOutputStream;
022import java.io.IOException;
023import java.io.EOFException;
024import java.io.InputStream;
025import java.io.DataInputStream;
026import java.io.BufferedInputStream;
027import java.util.zip.DataFormatException;
028import java.util.zip.Deflater;
029import java.util.zip.Inflater;
030import java.util.zip.CRC32;
031
032import org.apache.commons.compress.compressors.CompressorInputStream;
033import org.apache.commons.compress.utils.CharsetNames;
034
035/**
036 * Input stream that decompresses .gz files.
037 * This supports decompressing concatenated .gz files which is important
038 * when decompressing standalone .gz files.
039 * <p>
040 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
041 * files: it stops after the first member and silently ignores the rest.
042 * It doesn't leave the read position to point to the beginning of the next
043 * member, which makes it difficult workaround the lack of concatenation
044 * support.
045 * <p>
046 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
047 * container format decoder. The actual decompression is done with
048 * {@link java.util.zip.Inflater}.
049 */
050public class GzipCompressorInputStream extends CompressorInputStream {
051    // Header flags
052    // private static final int FTEXT = 0x01; // Uninteresting for us
053    private static final int FHCRC = 0x02;
054    private static final int FEXTRA = 0x04;
055    private static final int FNAME = 0x08;
056    private static final int FCOMMENT = 0x10;
057    private static final int FRESERVED = 0xE0;
058
059    // Compressed input stream, possibly wrapped in a BufferedInputStream
060    private final InputStream in;
061
062    // True if decompressing multimember streams.
063    private final boolean decompressConcatenated;
064
065    // Buffer to hold the input data
066    private final byte[] buf = new byte[8192];
067
068    // Amount of data in buf.
069    private int bufUsed = 0;
070
071    // Decompressor
072    private Inflater inf = new Inflater(true);
073
074    // CRC32 from uncompressed data
075    private final CRC32 crc = new CRC32();
076
077    // True once everything has been decompressed
078    private boolean endReached = false;
079
080    // used in no-arg read method
081    private final byte[] oneByte = new byte[1];
082
083    private final GzipParameters parameters = new GzipParameters();
084
085    /**
086     * Constructs a new input stream that decompresses gzip-compressed data
087     * from the specified input stream.
088     * <p>
089     * This is equivalent to
090     * <code>GzipCompressorInputStream(inputStream, false)</code> and thus
091     * will not decompress concatenated .gz files.
092     *
093     * @param inputStream  the InputStream from which this object should
094     *                     be created of
095     *
096     * @throws IOException if the stream could not be created
097     */
098    public GzipCompressorInputStream(final InputStream inputStream)
099            throws IOException {
100        this(inputStream, false);
101    }
102
103    /**
104     * Constructs a new input stream that decompresses gzip-compressed data
105     * from the specified input stream.
106     * <p>
107     * If <code>decompressConcatenated</code> is {@code false}:
108     * This decompressor might read more input than it will actually use.
109     * If <code>inputStream</code> supports <code>mark</code> and
110     * <code>reset</code>, then the input position will be adjusted
111     * so that it is right after the last byte of the compressed stream.
112     * If <code>mark</code> isn't supported, the input position will be
113     * undefined.
114     *
115     * @param inputStream  the InputStream from which this object should
116     *                     be created of
117     * @param decompressConcatenated
118     *                     if true, decompress until the end of the input;
119     *                     if false, stop after the first .gz member
120     *
121     * @throws IOException if the stream could not be created
122     */
123    public GzipCompressorInputStream(final InputStream inputStream,
124                                     final boolean decompressConcatenated)
125            throws IOException {
126        // Mark support is strictly needed for concatenated files only,
127        // but it's simpler if it is always available.
128        if (inputStream.markSupported()) {
129            in = inputStream;
130        } else {
131            in = new BufferedInputStream(inputStream);
132        }
133
134        this.decompressConcatenated = decompressConcatenated;
135        init(true);
136    }
137
138    /**
139     * Provides the stream's meta data - may change with each stream
140     * when decompressing concatenated streams.
141     * @return the stream's meta data
142     * @since 1.8
143     */
144    public GzipParameters getMetaData() {
145        return parameters;
146    }
147
148    private boolean init(final boolean isFirstMember) throws IOException {
149        assert isFirstMember || decompressConcatenated;
150
151        // Check the magic bytes without a possibility of EOFException.
152        final int magic0 = in.read();
153        final int magic1 = in.read();
154
155        // If end of input was reached after decompressing at least
156        // one .gz member, we have reached the end of the file successfully.
157        if (magic0 == -1 && !isFirstMember) {
158            return false;
159        }
160
161        if (magic0 != 31 || magic1 != 139) {
162            throw new IOException(isFirstMember
163                                  ? "Input is not in the .gz format"
164                                  : "Garbage after a valid .gz stream");
165        }
166
167        // Parsing the rest of the header may throw EOFException.
168        final DataInputStream inData = new DataInputStream(in);
169        final int method = inData.readUnsignedByte();
170        if (method != Deflater.DEFLATED) {
171            throw new IOException("Unsupported compression method "
172                                  + method + " in the .gz header");
173        }
174
175        final int flg = inData.readUnsignedByte();
176        if ((flg & FRESERVED) != 0) {
177            throw new IOException(
178                    "Reserved flags are set in the .gz header");
179        }
180
181        parameters.setModificationTime(readLittleEndianInt(inData) * 1000);
182        switch (inData.readUnsignedByte()) { // extra flags
183        case 2:
184            parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
185            break;
186        case 4:
187            parameters.setCompressionLevel(Deflater.BEST_SPEED);
188            break;
189        default:
190            // ignored for now
191            break;
192        }
193        parameters.setOperatingSystem(inData.readUnsignedByte());
194
195        // Extra field, ignored
196        if ((flg & FEXTRA) != 0) {
197            int xlen = inData.readUnsignedByte();
198            xlen |= inData.readUnsignedByte() << 8;
199
200            // This isn't as efficient as calling in.skip would be,
201            // but it's lazier to handle unexpected end of input this way.
202            // Most files don't have an extra field anyway.
203            while (xlen-- > 0) {
204                inData.readUnsignedByte();
205            }
206        }
207
208        // Original file name
209        if ((flg & FNAME) != 0) {
210            parameters.setFilename(new String(readToNull(inData),
211                                              CharsetNames.ISO_8859_1));
212        }
213
214        // Comment
215        if ((flg & FCOMMENT) != 0) {
216            parameters.setComment(new String(readToNull(inData),
217                                             CharsetNames.ISO_8859_1));
218        }
219
220        // Header "CRC16" which is actually a truncated CRC32 (which isn't
221        // as good as real CRC16). I don't know if any encoder implementation
222        // sets this, so it's not worth trying to verify it. GNU gzip 1.4
223        // doesn't support this field, but zlib seems to be able to at least
224        // skip over it.
225        if ((flg & FHCRC) != 0) {
226            inData.readShort();
227        }
228
229        // Reset
230        inf.reset();
231        crc.reset();
232
233        return true;
234    }
235
236    private byte[] readToNull(final DataInputStream inData) throws IOException {
237        final ByteArrayOutputStream bos = new ByteArrayOutputStream();
238        int b = 0;
239        while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD
240            bos.write(b);
241        }
242        return bos.toByteArray();
243    }
244
245    private long readLittleEndianInt(final DataInputStream inData) throws IOException {
246        return inData.readUnsignedByte()
247            | (inData.readUnsignedByte() << 8)
248            | (inData.readUnsignedByte() << 16)
249            | (((long) inData.readUnsignedByte()) << 24);
250    }
251
252    @Override
253    public int read() throws IOException {
254        return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
255    }
256
257    /**
258     * {@inheritDoc}
259     *
260     * @since 1.1
261     */
262    @Override
263    public int read(final byte[] b, int off, int len) throws IOException {
264        if (endReached) {
265            return -1;
266        }
267
268        int size = 0;
269
270        while (len > 0) {
271            if (inf.needsInput()) {
272                // Remember the current position because we may need to
273                // rewind after reading too much input.
274                in.mark(buf.length);
275
276                bufUsed = in.read(buf);
277                if (bufUsed == -1) {
278                    throw new EOFException();
279                }
280
281                inf.setInput(buf, 0, bufUsed);
282            }
283
284            int ret;
285            try {
286                ret = inf.inflate(b, off, len);
287            } catch (final DataFormatException e) {
288                throw new IOException("Gzip-compressed data is corrupt");
289            }
290
291            crc.update(b, off, ret);
292            off += ret;
293            len -= ret;
294            size += ret;
295            count(ret);
296
297            if (inf.finished()) {
298                // We may have read too many bytes. Rewind the read
299                // position to match the actual amount used.
300                //
301                // NOTE: The "if" is there just in case. Since we used
302                // in.mark earler, it should always skip enough.
303                in.reset();
304
305                final int skipAmount = bufUsed - inf.getRemaining();
306                if (in.skip(skipAmount) != skipAmount) {
307                    throw new IOException();
308                }
309
310                bufUsed = 0;
311
312                final DataInputStream inData = new DataInputStream(in);
313
314                // CRC32
315                final long crcStored = readLittleEndianInt(inData);
316
317                if (crcStored != crc.getValue()) {
318                    throw new IOException("Gzip-compressed data is corrupt "
319                                          + "(CRC32 error)");
320                }
321
322                // Uncompressed size modulo 2^32 (ISIZE in the spec)
323                final long isize = readLittleEndianInt(inData);
324
325                if (isize != (inf.getBytesWritten() & 0xffffffffl)) {
326                    throw new IOException("Gzip-compressed data is corrupt"
327                                          + "(uncompressed size mismatch)");
328                }
329
330                // See if this is the end of the file.
331                if (!decompressConcatenated || !init(false)) {
332                    inf.end();
333                    inf = null;
334                    endReached = true;
335                    return size == 0 ? -1 : size;
336                }
337            }
338        }
339
340        return size;
341    }
342
343    /**
344     * Checks if the signature matches what is expected for a .gz file.
345     *
346     * @param signature the bytes to check
347     * @param length    the number of bytes to check
348     * @return          true if this is a .gz stream, false otherwise
349     *
350     * @since 1.1
351     */
352    public static boolean matches(final byte[] signature, final int length) {
353
354        if (length < 2) {
355            return false;
356        }
357
358        if (signature[0] != 31) {
359            return false;
360        }
361
362        if (signature[1] != -117) {
363            return false;
364        }
365
366        return true;
367    }
368
369    /**
370     * Closes the input stream (unless it is System.in).
371     *
372     * @since 1.2
373     */
374    @Override
375    public void close() throws IOException {
376        if (inf != null) {
377            inf.end();
378            inf = null;
379        }
380
381        if (this.in != System.in) {
382            this.in.close();
383        }
384    }
385}