001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.gzip; 020 021import java.io.ByteArrayOutputStream; 022import java.io.IOException; 023import java.io.EOFException; 024import java.io.InputStream; 025import java.io.DataInputStream; 026import java.io.BufferedInputStream; 027import java.util.zip.DataFormatException; 028import java.util.zip.Deflater; 029import java.util.zip.Inflater; 030import java.util.zip.CRC32; 031 032import org.apache.commons.compress.compressors.CompressorInputStream; 033import org.apache.commons.compress.utils.CharsetNames; 034 035/** 036 * Input stream that decompresses .gz files. 037 * This supports decompressing concatenated .gz files which is important 038 * when decompressing standalone .gz files. 039 * <p> 040 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz 041 * files: it stops after the first member and silently ignores the rest. 042 * It doesn't leave the read position to point to the beginning of the next 043 * member, which makes it difficult workaround the lack of concatenation 044 * support. 045 * <p> 046 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz 047 * container format decoder. The actual decompression is done with 048 * {@link java.util.zip.Inflater}. 049 */ 050public class GzipCompressorInputStream extends CompressorInputStream { 051 // Header flags 052 // private static final int FTEXT = 0x01; // Uninteresting for us 053 private static final int FHCRC = 0x02; 054 private static final int FEXTRA = 0x04; 055 private static final int FNAME = 0x08; 056 private static final int FCOMMENT = 0x10; 057 private static final int FRESERVED = 0xE0; 058 059 // Compressed input stream, possibly wrapped in a BufferedInputStream 060 private final InputStream in; 061 062 // True if decompressing multimember streams. 063 private final boolean decompressConcatenated; 064 065 // Buffer to hold the input data 066 private final byte[] buf = new byte[8192]; 067 068 // Amount of data in buf. 069 private int bufUsed = 0; 070 071 // Decompressor 072 private Inflater inf = new Inflater(true); 073 074 // CRC32 from uncompressed data 075 private final CRC32 crc = new CRC32(); 076 077 // True once everything has been decompressed 078 private boolean endReached = false; 079 080 // used in no-arg read method 081 private final byte[] oneByte = new byte[1]; 082 083 private final GzipParameters parameters = new GzipParameters(); 084 085 /** 086 * Constructs a new input stream that decompresses gzip-compressed data 087 * from the specified input stream. 088 * <p> 089 * This is equivalent to 090 * <code>GzipCompressorInputStream(inputStream, false)</code> and thus 091 * will not decompress concatenated .gz files. 092 * 093 * @param inputStream the InputStream from which this object should 094 * be created of 095 * 096 * @throws IOException if the stream could not be created 097 */ 098 public GzipCompressorInputStream(final InputStream inputStream) 099 throws IOException { 100 this(inputStream, false); 101 } 102 103 /** 104 * Constructs a new input stream that decompresses gzip-compressed data 105 * from the specified input stream. 106 * <p> 107 * If <code>decompressConcatenated</code> is {@code false}: 108 * This decompressor might read more input than it will actually use. 109 * If <code>inputStream</code> supports <code>mark</code> and 110 * <code>reset</code>, then the input position will be adjusted 111 * so that it is right after the last byte of the compressed stream. 112 * If <code>mark</code> isn't supported, the input position will be 113 * undefined. 114 * 115 * @param inputStream the InputStream from which this object should 116 * be created of 117 * @param decompressConcatenated 118 * if true, decompress until the end of the input; 119 * if false, stop after the first .gz member 120 * 121 * @throws IOException if the stream could not be created 122 */ 123 public GzipCompressorInputStream(final InputStream inputStream, 124 final boolean decompressConcatenated) 125 throws IOException { 126 // Mark support is strictly needed for concatenated files only, 127 // but it's simpler if it is always available. 128 if (inputStream.markSupported()) { 129 in = inputStream; 130 } else { 131 in = new BufferedInputStream(inputStream); 132 } 133 134 this.decompressConcatenated = decompressConcatenated; 135 init(true); 136 } 137 138 /** 139 * Provides the stream's meta data - may change with each stream 140 * when decompressing concatenated streams. 141 * @return the stream's meta data 142 * @since 1.8 143 */ 144 public GzipParameters getMetaData() { 145 return parameters; 146 } 147 148 private boolean init(final boolean isFirstMember) throws IOException { 149 assert isFirstMember || decompressConcatenated; 150 151 // Check the magic bytes without a possibility of EOFException. 152 final int magic0 = in.read(); 153 final int magic1 = in.read(); 154 155 // If end of input was reached after decompressing at least 156 // one .gz member, we have reached the end of the file successfully. 157 if (magic0 == -1 && !isFirstMember) { 158 return false; 159 } 160 161 if (magic0 != 31 || magic1 != 139) { 162 throw new IOException(isFirstMember 163 ? "Input is not in the .gz format" 164 : "Garbage after a valid .gz stream"); 165 } 166 167 // Parsing the rest of the header may throw EOFException. 168 final DataInputStream inData = new DataInputStream(in); 169 final int method = inData.readUnsignedByte(); 170 if (method != Deflater.DEFLATED) { 171 throw new IOException("Unsupported compression method " 172 + method + " in the .gz header"); 173 } 174 175 final int flg = inData.readUnsignedByte(); 176 if ((flg & FRESERVED) != 0) { 177 throw new IOException( 178 "Reserved flags are set in the .gz header"); 179 } 180 181 parameters.setModificationTime(readLittleEndianInt(inData) * 1000); 182 switch (inData.readUnsignedByte()) { // extra flags 183 case 2: 184 parameters.setCompressionLevel(Deflater.BEST_COMPRESSION); 185 break; 186 case 4: 187 parameters.setCompressionLevel(Deflater.BEST_SPEED); 188 break; 189 default: 190 // ignored for now 191 break; 192 } 193 parameters.setOperatingSystem(inData.readUnsignedByte()); 194 195 // Extra field, ignored 196 if ((flg & FEXTRA) != 0) { 197 int xlen = inData.readUnsignedByte(); 198 xlen |= inData.readUnsignedByte() << 8; 199 200 // This isn't as efficient as calling in.skip would be, 201 // but it's lazier to handle unexpected end of input this way. 202 // Most files don't have an extra field anyway. 203 while (xlen-- > 0) { 204 inData.readUnsignedByte(); 205 } 206 } 207 208 // Original file name 209 if ((flg & FNAME) != 0) { 210 parameters.setFilename(new String(readToNull(inData), 211 CharsetNames.ISO_8859_1)); 212 } 213 214 // Comment 215 if ((flg & FCOMMENT) != 0) { 216 parameters.setComment(new String(readToNull(inData), 217 CharsetNames.ISO_8859_1)); 218 } 219 220 // Header "CRC16" which is actually a truncated CRC32 (which isn't 221 // as good as real CRC16). I don't know if any encoder implementation 222 // sets this, so it's not worth trying to verify it. GNU gzip 1.4 223 // doesn't support this field, but zlib seems to be able to at least 224 // skip over it. 225 if ((flg & FHCRC) != 0) { 226 inData.readShort(); 227 } 228 229 // Reset 230 inf.reset(); 231 crc.reset(); 232 233 return true; 234 } 235 236 private byte[] readToNull(final DataInputStream inData) throws IOException { 237 final ByteArrayOutputStream bos = new ByteArrayOutputStream(); 238 int b = 0; 239 while ((b = inData.readUnsignedByte()) != 0x00) { // NOPMD 240 bos.write(b); 241 } 242 return bos.toByteArray(); 243 } 244 245 private long readLittleEndianInt(final DataInputStream inData) throws IOException { 246 return inData.readUnsignedByte() 247 | (inData.readUnsignedByte() << 8) 248 | (inData.readUnsignedByte() << 16) 249 | (((long) inData.readUnsignedByte()) << 24); 250 } 251 252 @Override 253 public int read() throws IOException { 254 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 255 } 256 257 /** 258 * {@inheritDoc} 259 * 260 * @since 1.1 261 */ 262 @Override 263 public int read(final byte[] b, int off, int len) throws IOException { 264 if (endReached) { 265 return -1; 266 } 267 268 int size = 0; 269 270 while (len > 0) { 271 if (inf.needsInput()) { 272 // Remember the current position because we may need to 273 // rewind after reading too much input. 274 in.mark(buf.length); 275 276 bufUsed = in.read(buf); 277 if (bufUsed == -1) { 278 throw new EOFException(); 279 } 280 281 inf.setInput(buf, 0, bufUsed); 282 } 283 284 int ret; 285 try { 286 ret = inf.inflate(b, off, len); 287 } catch (final DataFormatException e) { 288 throw new IOException("Gzip-compressed data is corrupt"); 289 } 290 291 crc.update(b, off, ret); 292 off += ret; 293 len -= ret; 294 size += ret; 295 count(ret); 296 297 if (inf.finished()) { 298 // We may have read too many bytes. Rewind the read 299 // position to match the actual amount used. 300 // 301 // NOTE: The "if" is there just in case. Since we used 302 // in.mark earler, it should always skip enough. 303 in.reset(); 304 305 final int skipAmount = bufUsed - inf.getRemaining(); 306 if (in.skip(skipAmount) != skipAmount) { 307 throw new IOException(); 308 } 309 310 bufUsed = 0; 311 312 final DataInputStream inData = new DataInputStream(in); 313 314 // CRC32 315 final long crcStored = readLittleEndianInt(inData); 316 317 if (crcStored != crc.getValue()) { 318 throw new IOException("Gzip-compressed data is corrupt " 319 + "(CRC32 error)"); 320 } 321 322 // Uncompressed size modulo 2^32 (ISIZE in the spec) 323 final long isize = readLittleEndianInt(inData); 324 325 if (isize != (inf.getBytesWritten() & 0xffffffffl)) { 326 throw new IOException("Gzip-compressed data is corrupt" 327 + "(uncompressed size mismatch)"); 328 } 329 330 // See if this is the end of the file. 331 if (!decompressConcatenated || !init(false)) { 332 inf.end(); 333 inf = null; 334 endReached = true; 335 return size == 0 ? -1 : size; 336 } 337 } 338 } 339 340 return size; 341 } 342 343 /** 344 * Checks if the signature matches what is expected for a .gz file. 345 * 346 * @param signature the bytes to check 347 * @param length the number of bytes to check 348 * @return true if this is a .gz stream, false otherwise 349 * 350 * @since 1.1 351 */ 352 public static boolean matches(final byte[] signature, final int length) { 353 354 if (length < 2) { 355 return false; 356 } 357 358 if (signature[0] != 31) { 359 return false; 360 } 361 362 if (signature[1] != -117) { 363 return false; 364 } 365 366 return true; 367 } 368 369 /** 370 * Closes the input stream (unless it is System.in). 371 * 372 * @since 1.2 373 */ 374 @Override 375 public void close() throws IOException { 376 if (inf != null) { 377 inf.end(); 378 inf = null; 379 } 380 381 if (this.in != System.in) { 382 this.in.close(); 383 } 384 } 385}