disassemble.go 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. package asm
  2. import (
  3. "io"
  4. "github.com/vbatts/tar-split/archive/tar"
  5. "github.com/vbatts/tar-split/tar/storage"
  6. )
// NewInputTarStream wraps the Reader stream of a tar archive and provides a
// Reader stream of the same.
//
// In the middle it will pack the segments and file metadata to storage.Packer
// `p`.
//
// The storage.FilePutter is where payload of files in the stream are
// stashed. If this stashing is not needed, you can provide a nil
// storage.FilePutter. Since the checksumming is still needed, a default
// of NewDiscardFilePutter will be used internally.
  17. func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) {
  18. // What to do here... folks will want their own access to the Reader that is
  19. // their tar archive stream, but we'll need that same stream to use our
  20. // forked 'archive/tar'.
  21. // Perhaps do an io.TeeReader that hands back an io.Reader for them to read
  22. // from, and we'll MITM the stream to store metadata.
  23. // We'll need a storage.FilePutter too ...
  24. // Another concern, whether to do any storage.FilePutter operations, such that we
  25. // don't extract any amount of the archive. But then again, we're not making
  26. // files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter.
  27. // Perhaps we have a DiscardFilePutter that is a bit bucket.
  28. // we'll return the pipe reader, since TeeReader does not buffer and will
  29. // only read what the outputRdr Read's. Since Tar archives have padding on
  30. // the end, we want to be the one reading the padding, even if the user's
  31. // `archive/tar` doesn't care.
  32. pR, pW := io.Pipe()
  33. outputRdr := io.TeeReader(r, pW)
  34. // we need a putter that will generate the crc64 sums of file payloads
  35. if fp == nil {
  36. fp = storage.NewDiscardFilePutter()
  37. }
  38. go func() {
  39. tr := tar.NewReader(outputRdr)
  40. tr.RawAccounting = true
  41. for {
  42. hdr, err := tr.Next()
  43. if err != nil {
  44. if err != io.EOF {
  45. pW.CloseWithError(err)
  46. return
  47. }
  48. // even when an EOF is reached, there is often 1024 null bytes on
  49. // the end of an archive. Collect them too.
  50. if b := tr.RawBytes(); len(b) > 0 {
  51. _, err := p.AddEntry(storage.Entry{
  52. Type: storage.SegmentType,
  53. Payload: b,
  54. })
  55. if err != nil {
  56. pW.CloseWithError(err)
  57. return
  58. }
  59. }
  60. break // not return. We need the end of the reader.
  61. }
  62. if hdr == nil {
  63. break // not return. We need the end of the reader.
  64. }
  65. if b := tr.RawBytes(); len(b) > 0 {
  66. _, err := p.AddEntry(storage.Entry{
  67. Type: storage.SegmentType,
  68. Payload: b,
  69. })
  70. if err != nil {
  71. pW.CloseWithError(err)
  72. return
  73. }
  74. }
  75. var csum []byte
  76. if hdr.Size > 0 {
  77. var err error
  78. _, csum, err = fp.Put(hdr.Name, tr)
  79. if err != nil {
  80. pW.CloseWithError(err)
  81. return
  82. }
  83. }
  84. entry := storage.Entry{
  85. Type: storage.FileType,
  86. Size: hdr.Size,
  87. Payload: csum,
  88. }
  89. // For proper marshalling of non-utf8 characters
  90. entry.SetName(hdr.Name)
  91. // File entries added, regardless of size
  92. _, err = p.AddEntry(entry)
  93. if err != nil {
  94. pW.CloseWithError(err)
  95. return
  96. }
  97. if b := tr.RawBytes(); len(b) > 0 {
  98. _, err = p.AddEntry(storage.Entry{
  99. Type: storage.SegmentType,
  100. Payload: b,
  101. })
  102. if err != nil {
  103. pW.CloseWithError(err)
  104. return
  105. }
  106. }
  107. }
  108. // It is allowable, and not uncommon that there is further padding on
  109. // the end of an archive, apart from the expected 1024 null bytes. We
  110. // do this in chunks rather than in one go to avoid cases where a
  111. // maliciously crafted tar file tries to trick us into reading many GBs
  112. // into memory.
  113. const paddingChunkSize = 1024 * 1024
  114. var paddingChunk [paddingChunkSize]byte
  115. for {
  116. var isEOF bool
  117. n, err := outputRdr.Read(paddingChunk[:])
  118. if err != nil {
  119. if err != io.EOF {
  120. pW.CloseWithError(err)
  121. return
  122. }
  123. isEOF = true
  124. }
  125. _, err = p.AddEntry(storage.Entry{
  126. Type: storage.SegmentType,
  127. Payload: paddingChunk[:n],
  128. })
  129. if err != nil {
  130. pW.CloseWithError(err)
  131. return
  132. }
  133. if isEOF {
  134. break
  135. }
  136. }
  137. pW.Close()
  138. }()
  139. return pR, nil
  140. }