tarsum.go 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295
  1. // Package tarsum provides algorithms to perform checksum calculation on
  2. // filesystem layers.
  3. //
  4. // The transportation of filesystems, regarding Docker, is done with tar(1)
  5. // archives. There are a variety of tar serialization formats [2], and a key
  6. // concern here is ensuring a repeatable checksum given a set of inputs from a
  7. // generic tar archive. Types of transportation include distribution to and from a
  8. // registry endpoint, saving and loading through commands or Docker daemon APIs,
  9. // transferring the build context from client to Docker daemon, and committing the
  10. // filesystem of a container to become an image.
  11. //
// As tar archives are used for transit, but not preserved in many situations, the
// focus of the algorithm is to ensure the integrity of the preserved filesystem,
// while maintaining a deterministic accountability. This includes neither
// constraining the ordering or manipulation of the files during the creation or
// unpacking of the archive, nor including additional metadata state about the
// file system attributes.
  18. package tarsum
  19. import (
  20. "archive/tar"
  21. "bytes"
  22. "compress/gzip"
  23. "crypto"
  24. "crypto/sha256"
  25. "encoding/hex"
  26. "errors"
  27. "fmt"
  28. "hash"
  29. "io"
  30. "path"
  31. "strings"
  32. )
  33. const (
  34. buf8K = 8 * 1024
  35. buf16K = 16 * 1024
  36. buf32K = 32 * 1024
  37. )
  38. // NewTarSum creates a new interface for calculating a fixed time checksum of a
  39. // tar archive.
  40. //
  41. // This is used for calculating checksums of layers of an image, in some cases
  42. // including the byte payload of the image's json metadata as well, and for
  43. // calculating the checksums for buildcache.
  44. func NewTarSum(r io.Reader, dc bool, v Version) (TarSum, error) {
  45. return NewTarSumHash(r, dc, v, DefaultTHash)
  46. }
  47. // NewTarSumHash creates a new TarSum, providing a THash to use rather than
  48. // the DefaultTHash.
  49. func NewTarSumHash(r io.Reader, dc bool, v Version, tHash THash) (TarSum, error) {
  50. headerSelector, err := getTarHeaderSelector(v)
  51. if err != nil {
  52. return nil, err
  53. }
  54. ts := &tarSum{Reader: r, DisableCompression: dc, tarSumVersion: v, headerSelector: headerSelector, tHash: tHash}
  55. err = ts.initTarSum()
  56. return ts, err
  57. }
  58. // NewTarSumForLabel creates a new TarSum using the provided TarSum version+hash label.
  59. func NewTarSumForLabel(r io.Reader, disableCompression bool, label string) (TarSum, error) {
  60. parts := strings.SplitN(label, "+", 2)
  61. if len(parts) != 2 {
  62. return nil, errors.New("tarsum label string should be of the form: {tarsum_version}+{hash_name}")
  63. }
  64. versionName, hashName := parts[0], parts[1]
  65. version, ok := tarSumVersionsByName[versionName]
  66. if !ok {
  67. return nil, fmt.Errorf("unknown TarSum version name: %q", versionName)
  68. }
  69. hashConfig, ok := standardHashConfigs[hashName]
  70. if !ok {
  71. return nil, fmt.Errorf("unknown TarSum hash name: %q", hashName)
  72. }
  73. tHash := NewTHash(hashConfig.name, hashConfig.hash.New)
  74. return NewTarSumHash(r, disableCompression, version, tHash)
  75. }
  76. // TarSum is the generic interface for calculating fixed time
  77. // checksums of a tar archive.
  78. type TarSum interface {
  79. io.Reader
  80. GetSums() FileInfoSums
  81. Sum([]byte) string
  82. Version() Version
  83. Hash() THash
  84. }
  85. // tarSum struct is the structure for a Version0 checksum calculation.
  86. type tarSum struct {
  87. io.Reader
  88. tarR *tar.Reader
  89. tarW *tar.Writer
  90. writer writeCloseFlusher
  91. bufTar *bytes.Buffer
  92. bufWriter *bytes.Buffer
  93. bufData []byte
  94. h hash.Hash
  95. tHash THash
  96. sums FileInfoSums
  97. fileCounter int64
  98. currentFile string
  99. finished bool
  100. first bool
  101. DisableCompression bool // false by default. When false, the output gzip compressed.
  102. tarSumVersion Version // this field is not exported so it can not be mutated during use
  103. headerSelector tarHeaderSelector // handles selecting and ordering headers for files in the archive
  104. }
  105. func (ts tarSum) Hash() THash {
  106. return ts.tHash
  107. }
  108. func (ts tarSum) Version() Version {
  109. return ts.tarSumVersion
  110. }
  111. // THash provides a hash.Hash type generator and its name.
  112. type THash interface {
  113. Hash() hash.Hash
  114. Name() string
  115. }
  116. // NewTHash is a convenience method for creating a THash.
  117. func NewTHash(name string, h func() hash.Hash) THash {
  118. return simpleTHash{n: name, h: h}
  119. }
  120. type tHashConfig struct {
  121. name string
  122. hash crypto.Hash
  123. }
  124. var (
  125. // NOTE: DO NOT include MD5 or SHA1, which are considered insecure.
  126. standardHashConfigs = map[string]tHashConfig{
  127. "sha256": {name: "sha256", hash: crypto.SHA256},
  128. "sha512": {name: "sha512", hash: crypto.SHA512},
  129. }
  130. )
  131. // DefaultTHash is default TarSum hashing algorithm - "sha256".
  132. var DefaultTHash = NewTHash("sha256", sha256.New)
  133. type simpleTHash struct {
  134. n string
  135. h func() hash.Hash
  136. }
  137. func (sth simpleTHash) Name() string { return sth.n }
  138. func (sth simpleTHash) Hash() hash.Hash { return sth.h() }
  139. func (ts *tarSum) encodeHeader(h *tar.Header) error {
  140. for _, elem := range ts.headerSelector.selectHeaders(h) {
  141. if _, err := ts.h.Write([]byte(elem[0] + elem[1])); err != nil {
  142. return err
  143. }
  144. }
  145. return nil
  146. }
  147. func (ts *tarSum) initTarSum() error {
  148. ts.bufTar = bytes.NewBuffer([]byte{})
  149. ts.bufWriter = bytes.NewBuffer([]byte{})
  150. ts.tarR = tar.NewReader(ts.Reader)
  151. ts.tarW = tar.NewWriter(ts.bufTar)
  152. if !ts.DisableCompression {
  153. ts.writer = gzip.NewWriter(ts.bufWriter)
  154. } else {
  155. ts.writer = &nopCloseFlusher{Writer: ts.bufWriter}
  156. }
  157. if ts.tHash == nil {
  158. ts.tHash = DefaultTHash
  159. }
  160. ts.h = ts.tHash.Hash()
  161. ts.h.Reset()
  162. ts.first = true
  163. ts.sums = FileInfoSums{}
  164. return nil
  165. }
  166. func (ts *tarSum) Read(buf []byte) (int, error) {
  167. if ts.finished {
  168. return ts.bufWriter.Read(buf)
  169. }
  170. if len(ts.bufData) < len(buf) {
  171. switch {
  172. case len(buf) <= buf8K:
  173. ts.bufData = make([]byte, buf8K)
  174. case len(buf) <= buf16K:
  175. ts.bufData = make([]byte, buf16K)
  176. case len(buf) <= buf32K:
  177. ts.bufData = make([]byte, buf32K)
  178. default:
  179. ts.bufData = make([]byte, len(buf))
  180. }
  181. }
  182. buf2 := ts.bufData[:len(buf)]
  183. n, err := ts.tarR.Read(buf2)
  184. if err != nil {
  185. if err == io.EOF {
  186. if _, err := ts.h.Write(buf2[:n]); err != nil {
  187. return 0, err
  188. }
  189. if !ts.first {
  190. ts.sums = append(ts.sums, fileInfoSum{name: ts.currentFile, sum: hex.EncodeToString(ts.h.Sum(nil)), pos: ts.fileCounter})
  191. ts.fileCounter++
  192. ts.h.Reset()
  193. } else {
  194. ts.first = false
  195. }
  196. currentHeader, err := ts.tarR.Next()
  197. if err != nil {
  198. if err == io.EOF {
  199. if err := ts.tarW.Close(); err != nil {
  200. return 0, err
  201. }
  202. if _, err := io.Copy(ts.writer, ts.bufTar); err != nil {
  203. return 0, err
  204. }
  205. if err := ts.writer.Close(); err != nil {
  206. return 0, err
  207. }
  208. ts.finished = true
  209. return n, nil
  210. }
  211. return n, err
  212. }
  213. ts.currentFile = path.Clean(currentHeader.Name)
  214. if err := ts.encodeHeader(currentHeader); err != nil {
  215. return 0, err
  216. }
  217. if err := ts.tarW.WriteHeader(currentHeader); err != nil {
  218. return 0, err
  219. }
  220. if _, err := ts.tarW.Write(buf2[:n]); err != nil {
  221. return 0, err
  222. }
  223. ts.tarW.Flush()
  224. if _, err := io.Copy(ts.writer, ts.bufTar); err != nil {
  225. return 0, err
  226. }
  227. ts.writer.Flush()
  228. return ts.bufWriter.Read(buf)
  229. }
  230. return n, err
  231. }
  232. // Filling the hash buffer
  233. if _, err = ts.h.Write(buf2[:n]); err != nil {
  234. return 0, err
  235. }
  236. // Filling the tar writer
  237. if _, err = ts.tarW.Write(buf2[:n]); err != nil {
  238. return 0, err
  239. }
  240. ts.tarW.Flush()
  241. // Filling the output writer
  242. if _, err = io.Copy(ts.writer, ts.bufTar); err != nil {
  243. return 0, err
  244. }
  245. ts.writer.Flush()
  246. return ts.bufWriter.Read(buf)
  247. }
  248. func (ts *tarSum) Sum(extra []byte) string {
  249. ts.sums.SortBySums()
  250. h := ts.tHash.Hash()
  251. if extra != nil {
  252. h.Write(extra)
  253. }
  254. for _, fis := range ts.sums {
  255. h.Write([]byte(fis.Sum()))
  256. }
  257. checksum := ts.Version().String() + "+" + ts.tHash.Name() + ":" + hex.EncodeToString(h.Sum(nil))
  258. return checksum
  259. }
  260. func (ts *tarSum) GetSums() FileInfoSums {
  261. return ts.sums
  262. }