/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

/*
 * This package is based on the work done by Timothy Gerard Endres
 * (time@ice.com) to whom the Ant project is very grateful for his great code.
 */

package org.apache.commons.compress.archivers.tar;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipEncoding;
import org.apache.commons.compress.archivers.zip.ZipEncodingHelper;
import org.apache.commons.compress.utils.ArchiveUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.IOUtils;

/**
 * The TarArchiveInputStream reads a UNIX tar archive as an InputStream.
 * Methods are provided to position the stream at each successive entry
 * in the archive, and then to read each entry as a normal input stream
 * using read().
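 *
 * <p>Typical usage, as a minimal sketch (obtaining and closing the
 * underlying {@code inputStream} is left to the caller):</p>
 * <pre>{@code
 * TarArchiveInputStream tarIn = new TarArchiveInputStream(inputStream);
 * TarArchiveEntry entry;
 * while ((entry = tarIn.getNextTarEntry()) != null) {
 *     // entry.getName() and entry.getSize() describe the entry;
 *     // tarIn.read(...) then yields the entry's data until EOF.
 * }
 * tarIn.close();
 * }</pre>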
 *
 * @NotThreadSafe
 */
public class TarArchiveInputStream extends ArchiveInputStream {

    private static final int SMALL_BUFFER_SIZE = 256;

    private final byte[] SMALL_BUF = new byte[SMALL_BUFFER_SIZE];

    /** The size of the TAR header (one record) */
    private final int recordSize;

    /** The size of a block */
    private final int blockSize;

    /** True if file has hit EOF */
    private boolean hasHitEOF;

    /** Size of the current entry */
    private long entrySize;

    /** Number of bytes read from the current entry so far */
    private long entryOffset;

    /** An input stream to read from */
    private final InputStream is;

    /** The meta-data about the current entry */
    private TarArchiveEntry currEntry;

    /** The encoding of the file */
    private final ZipEncoding zipEncoding;

    // the provided encoding (for unit tests)
    final String encoding;

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     */
    public TarArchiveInputStream(InputStream is) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, String encoding) {
        this(is, TarConstants.DEFAULT_BLKSIZE, TarConstants.DEFAULT_RCDSIZE,
             encoding);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize,
                                 String encoding) {
        this(is, blockSize, TarConstants.DEFAULT_RCDSIZE, encoding);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
        this(is, blockSize, recordSize, null);
    }

    /**
     * Constructor for TarArchiveInputStream.
     * @param is the input stream to use
     * @param blockSize the block size to use
     * @param recordSize the record size to use
     * @param encoding name of the encoding to use for file names
     * @since 1.4
     */
    public TarArchiveInputStream(InputStream is, int blockSize, int recordSize,
                                 String encoding) {
        this.is = is;
        this.hasHitEOF = false;
        this.encoding = encoding;
        this.zipEncoding = ZipEncodingHelper.getZipEncoding(encoding);
        this.recordSize = recordSize;
        this.blockSize = blockSize;
    }

    /**
     * Closes this stream by closing the underlying InputStream.
     * @throws IOException on error
     */
    @Override
    public void close() throws IOException {
        is.close();
    }

    /**
     * Get the record size being used by this stream.
     *
     * @return the record size in bytes
     */
    public int getRecordSize() {
        return recordSize;
    }

    /**
     * Get the available data that can be read from the current
     * entry in the archive. This does not indicate how much data
     * is left in the entire archive, only in the current entry.
     * This value is determined from the entry's size header field
     * and the amount of data already read from the current entry.
     * Integer.MAX_VALUE is returned in case more than Integer.MAX_VALUE
     * bytes are left in the current entry in the archive.
     *
     * @return The number of available bytes for the current entry.
     * @throws IOException declared for signature compatibility
     */
    @Override
    public int available() throws IOException {
        if (entrySize - entryOffset > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
        return (int) (entrySize - entryOffset);
    }

    /**
     * Skips over and discards <code>n</code> bytes of data from this input
     * stream. The <code>skip</code> method may, for a variety of reasons, end
     * up skipping over some smaller number of bytes, possibly <code>0</code>;
     * reaching end of file or end of entry before <code>n</code> bytes have
     * been skipped are only two possibilities. The actual number of bytes
     * skipped is returned. If <code>n</code> is negative, no bytes are
     * skipped.
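     *
     * <p>For example, to drain the remainder of the current entry (a
     * minimal sketch, where {@code tarIn} is this stream; note that
     * {@link #getNextTarEntry()} already does this internally):</p>
     * <pre>{@code
     * // skip() never crosses the current entry's boundary, so a huge
     * // request simply consumes whatever is left of the entry
     * IOUtils.skip(tarIn, Long.MAX_VALUE);
     * }</pre>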
     *
     * @param n
     *            the number of bytes to be skipped.
     * @return the actual number of bytes skipped.
     * @exception IOException
     *                if some other I/O error occurs.
     */
    @Override
    public long skip(final long n) throws IOException {
        if (n <= 0) {
            return 0;
        }

        final long available = entrySize - entryOffset;
        final long skipped = is.skip(Math.min(n, available));
        count(skipped);
        entryOffset += skipped;
        return skipped;
    }

    /**
     * Since we do not support marking just yet, we return false.
     *
     * @return False.
     */
    @Override
    public boolean markSupported() {
        return false;
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     *
     * @param markLimit The limit to mark.
     */
    @Override
    public void mark(int markLimit) {
    }

    /**
     * Since we do not support marking just yet, we do nothing.
     */
    @Override
    public synchronized void reset() {
    }

    /**
     * Get the next entry in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, place the input stream at the header of the
     * next entry, read the header, and instantiate a new
     * TarArchiveEntry from the header bytes, returning that entry.
     * If there are no more entries in the archive, null will
     * be returned to indicate that the end of the archive has
     * been reached.
     *
     * @return The next TarArchiveEntry in the archive, or null.
     * @throws IOException on error
     */
    public TarArchiveEntry getNextTarEntry() throws IOException {
        if (hasHitEOF) {
            return null;
        }

        if (currEntry != null) {
            /* Skip will only go to the end of the current entry */
            IOUtils.skip(this, Long.MAX_VALUE);

            /* skip to the end of the last record */
            skipRecordPadding();
        }

        byte[] headerBuf = getRecord();

        if (headerBuf == null) {
            /* hit EOF */
            currEntry = null;
            return null;
        }

        try {
            currEntry = new TarArchiveEntry(headerBuf, zipEncoding);
        } catch (IllegalArgumentException e) {
            IOException ioe = new IOException("Error detected parsing the header");
            ioe.initCause(e);
            throw ioe;
        }

        entryOffset = 0;
        entrySize = currEntry.getSize();

        if (currEntry.isGNULongLinkEntry()) {
            byte[] longLinkData = getLongNameData();
            if (longLinkData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long link entry name not followed by
                // entry
                return null;
            }
            currEntry.setLinkName(zipEncoding.decode(longLinkData));
        }

        if (currEntry.isGNULongNameEntry()) {
            byte[] longNameData = getLongNameData();
            if (longNameData == null) {
                // Bugzilla: 40334
                // Malformed tar file - long entry name not followed by
                // entry
                return null;
            }
            currEntry.setName(zipEncoding.decode(longNameData));
        }

        if (currEntry.isPaxHeader()) { // Process Pax headers
            paxHeaders();
        }

        if (currEntry.isGNUSparse()) { // Process sparse files
            readGNUSparse();
        }

        // If the size of the next element in the archive has changed
        // due to a new size being reported in the posix header
        // information, we update entrySize here so that it contains
        // the correct value.
        entrySize = currEntry.getSize();

        return currEntry;
    }

    /**
     * The last record block should be written at the full size, so skip any
     * additional space used to fill out the record after the entry's data.
     */
    private void skipRecordPadding() throws IOException {
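        // Example: with entrySize = 1000 and recordSize = 512 the data
        // occupies two records (1024 bytes), so 24 bytes of padding are
        // skipped here to reach the next record boundary.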
        if (this.entrySize > 0 && this.entrySize % this.recordSize != 0) {
            long numRecords = (this.entrySize / this.recordSize) + 1;
            long padding = (numRecords * this.recordSize) - this.entrySize;
            long skipped = IOUtils.skip(is, padding);
            count(skipped);
        }
    }

    /**
     * Get the next entry in this tar archive as longname data.
     *
     * @return The next entry in the archive as longname data, or null.
     * @throws IOException on error
     */
    protected byte[] getLongNameData() throws IOException {
        // read in the name
        ByteArrayOutputStream longName = new ByteArrayOutputStream();
        int length = 0;
        while ((length = read(SMALL_BUF)) >= 0) {
            longName.write(SMALL_BUF, 0, length);
        }
        getNextEntry();
        if (currEntry == null) {
            // Bugzilla: 40334
            // Malformed tar file - long entry name not followed by entry
            return null;
        }
        byte[] longNameData = longName.toByteArray();
        // remove trailing null terminator(s)
        length = longNameData.length;
        while (length > 0 && longNameData[length - 1] == 0) {
            --length;
        }
        if (length != longNameData.length) {
            byte[] l = new byte[length];
            System.arraycopy(longNameData, 0, l, 0, length);
            longNameData = l;
        }
        return longNameData;
    }

    /**
     * Get the next record in this tar archive. This will skip
     * over any remaining data in the current entry, if there
     * is one, and place the input stream at the header of the
     * next entry.
     *
     * <p>If there are no more entries in the archive, null will be
     * returned to indicate that the end of the archive has been
     * reached. At the same time the {@code hasHitEOF} marker will be
     * set to true.</p>
     *
     * @return The next header in the archive, or null.
     * @throws IOException on error
     */
    private byte[] getRecord() throws IOException {
        byte[] headerBuf = readRecord();
        hasHitEOF = isEOFRecord(headerBuf);
        if (hasHitEOF && headerBuf != null) {
            tryToConsumeSecondEOFRecord();
            consumeRemainderOfLastBlock();
            headerBuf = null;
        }
        return headerBuf;
    }

    /**
     * Determine if an archive record indicates End of Archive. End of
     * archive is indicated by a record that consists entirely of null bytes.
     *
     * @param record The record data to check.
     * @return true if the record data is an End of Archive
     */
    protected boolean isEOFRecord(byte[] record) {
        return record == null || ArchiveUtils.isArrayZero(record, recordSize);
    }

    /**
     * Read a record from the input stream and return the data.
     *
     * @return The record data or null if EOF has been hit.
     * @throws IOException on error
     */
    protected byte[] readRecord() throws IOException {

        byte[] record = new byte[recordSize];

        int readNow = IOUtils.readFully(is, record);
        count(readNow);
        if (readNow != recordSize) {
            return null;
        }

        return record;
    }

    private void paxHeaders() throws IOException {
        Map<String, String> headers = parsePaxHeaders(this);
        getNextEntry(); // Get the actual file entry
        applyPaxHeadersToCurrentEntry(headers);
    }

    Map<String, String> parsePaxHeaders(InputStream i) throws IOException {
        Map<String, String> headers = new HashMap<String, String>();
        // Format is "length keyword=value\n"
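        // The length is the total number of bytes in the record, counting
        // the length digits themselves, the separating space and the
        // trailing newline. For example, this record is 30 bytes long:
        //   "30 mtime=1321711775.972059463\n"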
        while (true) { // get length
            int ch;
            int len = 0;
            int read = 0;
            while ((ch = i.read()) != -1) {
                read++;
                if (ch == ' ') { // End of length string
                    // Get keyword
                    ByteArrayOutputStream coll = new ByteArrayOutputStream();
                    while ((ch = i.read()) != -1) {
                        read++;
                        if (ch == '=') { // end of keyword
                            String keyword = coll.toString(CharsetNames.UTF_8);
                            // Get rest of entry
                            final int restLen = len - read;
                            byte[] rest = new byte[restLen];
                            int got = IOUtils.readFully(i, rest);
                            if (got != restLen) {
                                throw new IOException("Failed to read "
                                                      + "Pax header. Expected "
                                                      + restLen
                                                      + " bytes, read "
                                                      + got);
                            }
                            // Drop trailing NL
                            String value = new String(rest, 0,
                                                      restLen - 1, CharsetNames.UTF_8);
                            headers.put(keyword, value);
                            break;
                        }
                        coll.write((byte) ch);
                    }
                    break; // Processed single header
                }
                len *= 10;
                len += ch - '0';
            }
            if (ch == -1) { // EOF
                break;
            }
        }
        return headers;
    }

    private void applyPaxHeadersToCurrentEntry(Map<String, String> headers) {
        /*
         * The following headers are defined for Pax.
         * atime, ctime, charset: cannot use these without changing TarArchiveEntry fields
         * mtime
         * comment
         * gid, gname
         * linkpath
         * size
         * uid, uname
         * SCHILY.devminor, SCHILY.devmajor: don't have setters/getters for those
         */
        for (Entry<String, String> ent : headers.entrySet()) {
            String key = ent.getKey();
            String val = ent.getValue();
            if ("path".equals(key)) {
                currEntry.setName(val);
            } else if ("linkpath".equals(key)) {
                currEntry.setLinkName(val);
            } else if ("gid".equals(key)) {
                currEntry.setGroupId(Long.parseLong(val));
            } else if ("gname".equals(key)) {
                currEntry.setGroupName(val);
            } else if ("uid".equals(key)) {
                currEntry.setUserId(Long.parseLong(val));
            } else if ("uname".equals(key)) {
                currEntry.setUserName(val);
            } else if ("size".equals(key)) {
                currEntry.setSize(Long.parseLong(val));
            } else if ("mtime".equals(key)) {
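                // pax stores mtime as seconds since the epoch with an
                // optional fractional part; TarArchiveEntry keeps
                // milliseconds, hence the conversion below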
                currEntry.setModTime((long) (Double.parseDouble(val) * 1000));
            } else if ("SCHILY.devminor".equals(key)) {
                currEntry.setDevMinor(Integer.parseInt(val));
            } else if ("SCHILY.devmajor".equals(key)) {
                currEntry.setDevMajor(Integer.parseInt(val));
            }
        }
    }

    /**
     * Reads the sparse headers of the current entry, consuming any
     * additional sparse entries that follow it.
     *
     * @throws IOException on error
     *
     * @todo Sparse files are not really processed yet.
     */
    private void readGNUSparse() throws IOException {
        /* we do not really process sparse files yet
        sparses = new ArrayList();
        sparses.addAll(currEntry.getSparses());
        */
        if (currEntry.isExtended()) {
            TarArchiveSparseEntry entry;
            do {
                byte[] headerBuf = getRecord();
                if (headerBuf == null) {
                    currEntry = null;
                    break;
                }
                entry = new TarArchiveSparseEntry(headerBuf);
                /* we do not really process sparse files yet
                sparses.addAll(entry.getSparses());
                */
            } while (entry.isExtended());
        }
    }

    /**
     * Returns the next Archive Entry in this Stream.
     *
     * @return the next entry,
     *         or {@code null} if there are no more entries
     * @throws IOException if the next entry could not be read
     */
    @Override
    public ArchiveEntry getNextEntry() throws IOException {
        return getNextTarEntry();
    }

    /**
     * Tries to read the next record, rewinding the stream if it is not an EOF record.
     *
     * <p>This is meant to protect against cases where a tar
     * implementation has written only one EOF record when two are
     * expected. Actually this won't help since a non-conforming
     * implementation likely won't fill full blocks consisting of - by
     * default - twenty records either, so we probably have already read
     * beyond the archive anyway.</p>
     */
    private void tryToConsumeSecondEOFRecord() throws IOException {
        boolean shouldReset = true;
        boolean marked = is.markSupported();
        if (marked) {
            is.mark(recordSize);
        }
        try {
            shouldReset = !isEOFRecord(readRecord());
        } finally {
            if (shouldReset && marked) {
                pushedBackBytes(recordSize);
                is.reset();
            }
        }
    }

    /**
     * Reads bytes from the current tar archive entry.
     *
     * This method is aware of the boundaries of the current
     * entry in the archive and will deal with them as if they
     * were this stream's start and EOF.
     *
     * @param buf The buffer into which to place bytes read.
     * @param offset The offset at which to place bytes read.
     * @param numToRead The number of bytes to read.
     * @return The number of bytes read, or -1 at EOF.
     * @throws IOException on error
     */
    @Override
    public int read(byte[] buf, int offset, int numToRead) throws IOException {
        int totalRead = 0;

        if (hasHitEOF || entryOffset >= entrySize) {
            return -1;
        }

        if (currEntry == null) {
            throw new IllegalStateException("No current tar entry");
        }

        numToRead = Math.min(numToRead, available());

        totalRead = is.read(buf, offset, numToRead);

        if (totalRead == -1) {
            if (numToRead > 0) {
                throw new IOException("Truncated TAR archive");
            }
            hasHitEOF = true;
        } else {
            count(totalRead);
            entryOffset += totalRead;
        }

        return totalRead;
    }

    /**
     * Whether this class is able to read the given entry.
     *
     * <p>May return false if the given entry is a sparse file.</p>
     */
    @Override
    public boolean canReadEntryData(ArchiveEntry ae) {
        if (ae instanceof TarArchiveEntry) {
            TarArchiveEntry te = (TarArchiveEntry) ae;
            return !te.isGNUSparse();
        }
        return false;
    }

    /**
     * Get the current TAR Archive Entry that this input stream is processing.
     *
     * @return The current Archive Entry
     */
    public TarArchiveEntry getCurrentEntry() {
        return currEntry;
    }

    protected final void setCurrentEntry(TarArchiveEntry e) {
        currEntry = e;
    }

    protected final boolean isAtEOF() {
        return hasHitEOF;
    }

    protected final void setAtEOF(boolean b) {
        hasHitEOF = b;
    }

    /**
     * This method is invoked once the end of the archive is hit; it
     * tries to consume the remaining bytes under the assumption that
     * the tool creating this archive has padded the last block.
     */
    private void consumeRemainderOfLastBlock() throws IOException {
        long bytesReadOfLastBlock = getBytesRead() % blockSize;
        if (bytesReadOfLastBlock > 0) {
            long skipped = IOUtils.skip(is, blockSize - bytesReadOfLastBlock);
            count(skipped);
        }
    }

    /**
     * Checks if the signature matches what is expected for a tar file.
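     *
     * <p>A minimal sketch of probing an InputStream {@code in} (an
     * assumption of this example; note it consumes the bytes it reads):</p>
     * <pre>{@code
     * byte[] buf = new byte[512];
     * int read = IOUtils.readFully(in, buf);
     * boolean looksLikeTar = TarArchiveInputStream.matches(buf, read);
     * }</pre>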
     *
     * @param signature
     *            the bytes to check
     * @param length
     *            the number of bytes to check
     * @return true, if this stream is a tar archive stream, false otherwise
     */
    public static boolean matches(byte[] signature, int length) {
        if (length < TarConstants.VERSION_OFFSET + TarConstants.VERSIONLEN) {
            return false;
        }

        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ) {
            return true;
        }
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            (
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
                 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
             ||
             ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
                 signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
             )
            ) {
            return true;
        }
        // COMPRESS-107 - recognise Ant tar files
        if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
                signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
            &&
            ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
                signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
            ) {
            return true;
        }
        return false;
    }

}