compact.go 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263
  1. package compactext4
  2. import (
  3. "bufio"
  4. "bytes"
  5. "encoding/binary"
  6. "errors"
  7. "fmt"
  8. "io"
  9. "path"
  10. "sort"
  11. "strings"
  12. "time"
  13. "github.com/Microsoft/hcsshim/ext4/internal/format"
  14. )
// Writer writes a compact ext4 file system.
type Writer struct {
	f                    io.ReadWriteSeeker // underlying image; seeked and read directly, written through bw
	bw                   *bufio.Writer      // buffered writer layered over f
	inodes               []*inode           // inode number n is stored at index n-1
	curName              string             // name of the file currently receiving data (used in error messages)
	curInode             *inode             // inode currently receiving data, or nil when none is in progress
	pos                  int64              // current byte offset in the image
	dataWritten, dataMax int64              // bytes written so far / bytes expected for curInode
	err                  error              // sticky error recorded by the low-level write, zero and seek helpers
	initialized          bool               // checked by finishInode, which calls init while this is false
	supportInlineData    bool               // when set, sufficiently small regular files are stored inside the inode
	maxDiskSize          int64              // writes that would advance pos beyond this fail
	gdBlocks             uint32             // NOTE(review): presumably the group descriptor block count; not used in this part of the file
}
// Mode flags for Linux files. These mirror the values declared in the
// internal format package.
const (
	S_IXOTH  = format.S_IXOTH
	S_IWOTH  = format.S_IWOTH
	S_IROTH  = format.S_IROTH
	S_IXGRP  = format.S_IXGRP
	S_IWGRP  = format.S_IWGRP
	S_IRGRP  = format.S_IRGRP
	S_IXUSR  = format.S_IXUSR
	S_IWUSR  = format.S_IWUSR
	S_IRUSR  = format.S_IRUSR
	S_ISVTX  = format.S_ISVTX
	S_ISGID  = format.S_ISGID
	S_ISUID  = format.S_ISUID
	S_IFIFO  = format.S_IFIFO
	S_IFCHR  = format.S_IFCHR
	S_IFDIR  = format.S_IFDIR
	S_IFBLK  = format.S_IFBLK
	S_IFREG  = format.S_IFREG
	S_IFLNK  = format.S_IFLNK
	S_IFSOCK = format.S_IFSOCK
	// TypeMask selects the file type bits of a mode; the S_IF* values above
	// are compared against Mode & TypeMask.
	TypeMask = format.TypeMask
)
// inode is the in-memory description of one file system object. It is
// populated by makeInode and serialized by writeInodeTable.
type inode struct {
	Size                        int64
	Atime, Ctime, Mtime, Crtime uint64             // timestamps as produced by timeToFsTime
	Number                      format.InodeNumber // 1-based; the inode lives at Writer.inodes[Number-1]
	Mode                        uint16
	Uid, Gid                    uint32
	LinkCount                   uint32
	XattrBlock                  uint32 // block holding out-of-inode xattrs, or 0 when there is none
	BlockCount                  uint32
	Devmajor, Devminor          uint32
	Flags                       format.InodeFlag
	Data                        []byte    // inline file data, a short symlink target, or the serialized extent root
	XattrInline                 []byte    // xattr area that follows the fixed inode fields; nil when unused
	Children                    directory // allocated by makeInode for directories only
}
  68. func (node *inode) FileType() uint16 {
  69. return node.Mode & format.TypeMask
  70. }
  71. func (node *inode) IsDir() bool {
  72. return node.FileType() == S_IFDIR
  73. }
// A File represents a file to be added to an ext4 file system.
type File struct {
	Linkname                    string // symlink target; only consulted when Mode is S_IFLNK
	Size                        int64  // number of data bytes that will be written for a regular file
	Mode                        uint16 // type and permission bits; a zero type is treated as S_IFREG
	Uid, Gid                    uint32
	Atime, Ctime, Mtime, Crtime time.Time
	Devmajor, Devminor          uint32
	Xattrs                      map[string][]byte // extended attributes keyed by full name (e.g. "user.foo")
}
// Layout parameters and size limits used when laying out the file system.
const (
	inodeFirst        = 11
	inodeLostAndFound = inodeFirst

	blockSize               = 4096
	blocksPerGroup          = blockSize * 8
	inodeSize               = 256
	maxInodesPerGroup       = blockSize * 8 // Limited by the inode bitmap
	inodesPerGroupIncrement = blockSize / inodeSize

	defaultMaxDiskSize = 16 * 1024 * 1024 * 1024        // 16GB
	maxMaxDiskSize     = 16 * 1024 * 1024 * 1024 * 1024 // 16TB

	groupDescriptorSize      = 32 // Use the small group descriptor
	groupsPerDescriptorBlock = blockSize / groupDescriptorSize

	maxFileSize        = 128 * 1024 * 1024 * 1024 // 128GB file size maximum for now
	smallSymlinkSize   = 59                       // max symlink size that goes directly in the inode
	maxBlocksPerExtent = 0x8000                   // maximum number of blocks in an extent

	inodeDataSize  = 60
	inodeUsedSize  = 152 // fields through CrtimeExtra
	inodeExtraSize = inodeSize - inodeUsedSize // bytes left in each inode for inline xattrs

	xattrInodeOverhead      = 4 + 4                       // magic number + empty next entry value
	xattrBlockOverhead      = 32 + 4                      // header + empty next entry value
	inlineDataXattrOverhead = xattrInodeOverhead + 16 + 4 // entry + "data"
	// inlineDataSize is the largest regular file that can be stored inline.
	inlineDataSize = inodeDataSize + inodeExtraSize - inlineDataXattrOverhead
)
  107. type exceededMaxSizeError struct {
  108. Size int64
  109. }
  110. func (err exceededMaxSizeError) Error() string {
  111. return fmt.Sprintf("disk exceeded maximum size of %d bytes", err.Size)
  112. }
// directoryEntrySize is the encoded size of format.DirectoryEntry as reported
// by binary.Size.
var directoryEntrySize = binary.Size(format.DirectoryEntry{})

// extraIsize is the number of in-use inode bytes beyond the first 128.
var extraIsize = uint16(inodeUsedSize - 128)

// directory maps a child name to the child's inode.
type directory map[string]*inode
  116. func splitFirst(p string) (string, string) {
  117. n := strings.IndexByte(p, '/')
  118. if n >= 0 {
  119. return p[:n], p[n+1:]
  120. }
  121. return p, ""
  122. }
  123. func (w *Writer) findPath(root *inode, p string) *inode {
  124. inode := root
  125. for inode != nil && len(p) != 0 {
  126. name, rest := splitFirst(p)
  127. p = rest
  128. inode = inode.Children[name]
  129. }
  130. return inode
  131. }
  132. func timeToFsTime(t time.Time) uint64 {
  133. if t.IsZero() {
  134. return 0
  135. }
  136. s := t.Unix()
  137. if s < -0x80000000 {
  138. return 0x80000000
  139. }
  140. if s > 0x37fffffff {
  141. return 0x37fffffff
  142. }
  143. return uint64(s) | uint64(t.Nanosecond())<<34
  144. }
  145. func fsTimeToTime(t uint64) time.Time {
  146. if t == 0 {
  147. return time.Time{}
  148. }
  149. s := int64(t & 0x3ffffffff)
  150. if s > 0x7fffffff && s < 0x100000000 {
  151. s = int64(int32(uint32(s)))
  152. }
  153. return time.Unix(s, int64(t>>34))
  154. }
  155. func (w *Writer) getInode(i format.InodeNumber) *inode {
  156. if i == 0 || int(i) > len(w.inodes) {
  157. return nil
  158. }
  159. return w.inodes[i-1]
  160. }
  161. var xattrPrefixes = []struct {
  162. Index uint8
  163. Prefix string
  164. }{
  165. {2, "system.posix_acl_access"},
  166. {3, "system.posix_acl_default"},
  167. {8, "system.richacl"},
  168. {7, "system."},
  169. {1, "user."},
  170. {4, "trusted."},
  171. {6, "security."},
  172. }
  173. func compressXattrName(name string) (uint8, string) {
  174. for _, p := range xattrPrefixes {
  175. if strings.HasPrefix(name, p.Prefix) {
  176. return p.Index, name[len(p.Prefix):]
  177. }
  178. }
  179. return 0, name
  180. }
  181. func decompressXattrName(index uint8, name string) string {
  182. for _, p := range xattrPrefixes {
  183. if index == p.Index {
  184. return p.Prefix + name
  185. }
  186. }
  187. return name
  188. }
  189. func hashXattrEntry(name string, value []byte) uint32 {
  190. var hash uint32
  191. for i := 0; i < len(name); i++ {
  192. hash = (hash << 5) ^ (hash >> 27) ^ uint32(name[i])
  193. }
  194. for i := 0; i+3 < len(value); i += 4 {
  195. hash = (hash << 16) ^ (hash >> 16) ^ binary.LittleEndian.Uint32(value[i:i+4])
  196. }
  197. if len(value)%4 != 0 {
  198. var last [4]byte
  199. copy(last[:], value[len(value)&^3:])
  200. hash = (hash << 16) ^ (hash >> 16) ^ binary.LittleEndian.Uint32(last[:])
  201. }
  202. return hash
  203. }
  204. type xattr struct {
  205. Name string
  206. Index uint8
  207. Value []byte
  208. }
  209. func (x *xattr) EntryLen() int {
  210. return (len(x.Name)+3)&^3 + 16
  211. }
  212. func (x *xattr) ValueLen() int {
  213. return (len(x.Value) + 3) &^ 3
  214. }
  215. type xattrState struct {
  216. inode, block []xattr
  217. inodeLeft, blockLeft int
  218. }
  219. func (s *xattrState) init() {
  220. s.inodeLeft = inodeExtraSize - xattrInodeOverhead
  221. s.blockLeft = blockSize - xattrBlockOverhead
  222. }
  223. func (s *xattrState) addXattr(name string, value []byte) bool {
  224. index, name := compressXattrName(name)
  225. x := xattr{
  226. Index: index,
  227. Name: name,
  228. Value: value,
  229. }
  230. length := x.EntryLen() + x.ValueLen()
  231. if s.inodeLeft >= length {
  232. s.inode = append(s.inode, x)
  233. s.inodeLeft -= length
  234. } else if s.blockLeft >= length {
  235. s.block = append(s.block, x)
  236. s.blockLeft -= length
  237. } else {
  238. return false
  239. }
  240. return true
  241. }
  242. func putXattrs(xattrs []xattr, b []byte, offsetDelta uint16) {
  243. offset := uint16(len(b)) + offsetDelta
  244. eb := b
  245. db := b
  246. for _, xattr := range xattrs {
  247. vl := xattr.ValueLen()
  248. offset -= uint16(vl)
  249. eb[0] = uint8(len(xattr.Name))
  250. eb[1] = xattr.Index
  251. binary.LittleEndian.PutUint16(eb[2:], offset)
  252. binary.LittleEndian.PutUint32(eb[8:], uint32(len(xattr.Value)))
  253. binary.LittleEndian.PutUint32(eb[12:], hashXattrEntry(xattr.Name, xattr.Value))
  254. copy(eb[16:], xattr.Name)
  255. eb = eb[xattr.EntryLen():]
  256. copy(db[len(db)-vl:], xattr.Value)
  257. db = db[:len(db)-vl]
  258. }
  259. }
  260. func getXattrs(b []byte, xattrs map[string][]byte, offsetDelta uint16) {
  261. eb := b
  262. for len(eb) != 0 {
  263. nameLen := eb[0]
  264. if nameLen == 0 {
  265. break
  266. }
  267. index := eb[1]
  268. offset := binary.LittleEndian.Uint16(eb[2:]) - offsetDelta
  269. valueLen := binary.LittleEndian.Uint32(eb[8:])
  270. attr := xattr{
  271. Index: index,
  272. Name: string(eb[16 : 16+nameLen]),
  273. Value: b[offset : uint32(offset)+valueLen],
  274. }
  275. xattrs[decompressXattrName(index, attr.Name)] = attr.Value
  276. eb = eb[attr.EntryLen():]
  277. }
  278. }
  279. func (w *Writer) writeXattrs(inode *inode, state *xattrState) error {
  280. // Write the inline attributes.
  281. if len(state.inode) != 0 {
  282. inode.XattrInline = make([]byte, inodeExtraSize)
  283. binary.LittleEndian.PutUint32(inode.XattrInline[0:], format.XAttrHeaderMagic) // Magic
  284. putXattrs(state.inode, inode.XattrInline[4:], 0)
  285. }
  286. // Write the block attributes. If there was previously an xattr block, then
  287. // rewrite it even if it is now empty.
  288. if len(state.block) != 0 || inode.XattrBlock != 0 {
  289. sort.Slice(state.block, func(i, j int) bool {
  290. return state.block[i].Index < state.block[j].Index ||
  291. len(state.block[i].Name) < len(state.block[j].Name) ||
  292. state.block[i].Name < state.block[j].Name
  293. })
  294. var b [blockSize]byte
  295. binary.LittleEndian.PutUint32(b[0:], format.XAttrHeaderMagic) // Magic
  296. binary.LittleEndian.PutUint32(b[4:], 1) // ReferenceCount
  297. binary.LittleEndian.PutUint32(b[8:], 1) // Blocks
  298. putXattrs(state.block, b[32:], 32)
  299. orig := w.block()
  300. if inode.XattrBlock == 0 {
  301. inode.XattrBlock = orig
  302. inode.BlockCount++
  303. } else {
  304. // Reuse the original block.
  305. w.seekBlock(inode.XattrBlock)
  306. defer w.seekBlock(orig)
  307. }
  308. if _, err := w.write(b[:]); err != nil {
  309. return err
  310. }
  311. }
  312. return nil
  313. }
  314. func (w *Writer) write(b []byte) (int, error) {
  315. if w.err != nil {
  316. return 0, w.err
  317. }
  318. if w.pos+int64(len(b)) > w.maxDiskSize {
  319. w.err = exceededMaxSizeError{w.maxDiskSize}
  320. return 0, w.err
  321. }
  322. n, err := w.bw.Write(b)
  323. w.pos += int64(n)
  324. w.err = err
  325. return n, err
  326. }
  327. func (w *Writer) zero(n int64) (int64, error) {
  328. if w.err != nil {
  329. return 0, w.err
  330. }
  331. if w.pos+int64(n) > w.maxDiskSize {
  332. w.err = exceededMaxSizeError{w.maxDiskSize}
  333. return 0, w.err
  334. }
  335. n, err := io.CopyN(w.bw, zero, n)
  336. w.pos += n
  337. w.err = err
  338. return n, err
  339. }
// makeInode fills in an inode from f. When node is nil a new inode is
// allocated with the next free inode number and appended to w.inodes;
// otherwise node is re-initialized in place, which is refused if it already
// has extent-mapped data.
//
// A mode with no type bits is treated as a regular file. Extended attributes
// are written immediately via writeXattrs, and the target of a symlink that
// is too long to store in the inode is written as file data before returning.
// Data for regular files is not written here.
func (w *Writer) makeInode(f *File, node *inode) (*inode, error) {
	mode := f.Mode
	if mode&format.TypeMask == 0 {
		mode |= format.S_IFREG
	}
	typ := mode & format.TypeMask
	ino := format.InodeNumber(len(w.inodes) + 1)
	if node == nil {
		node = &inode{
			Number: ino,
		}
		if typ == S_IFDIR {
			node.Children = make(directory)
			node.LinkCount = 1 // A directory is linked to itself.
		}
	} else if node.Flags&format.InodeFlagExtents != 0 {
		// Since we cannot deallocate or reuse blocks, don't allow updates that
		// would invalidate data that has already been written.
		return nil, errors.New("cannot overwrite file with non-inline data")
	}
	node.Mode = mode
	node.Uid = f.Uid
	node.Gid = f.Gid
	node.Flags = format.InodeFlagHugeFile
	node.Atime = timeToFsTime(f.Atime)
	node.Ctime = timeToFsTime(f.Ctime)
	node.Mtime = timeToFsTime(f.Mtime)
	node.Crtime = timeToFsTime(f.Crtime)
	node.Devmajor = f.Devmajor
	node.Devminor = f.Devminor
	node.Data = nil
	node.XattrInline = nil

	var xstate xattrState
	xstate.init()

	var size int64
	switch typ {
	case format.S_IFREG:
		size = f.Size
		if f.Size > maxFileSize {
			return nil, fmt.Errorf("file too big: %d > %d", f.Size, int64(maxFileSize))
		}
		if f.Size <= inlineDataSize && w.supportInlineData {
			node.Data = make([]byte, f.Size)
			// Bytes beyond inodeDataSize spill into the "system.data" xattr.
			extra := 0
			if f.Size > inodeDataSize {
				extra = int(f.Size - inodeDataSize)
			}
			// Add a dummy entry for now. It reserves the space; the real
			// contents are filled in by writeInodeTable.
			if !xstate.addXattr("system.data", node.Data[:extra]) {
				panic("not enough room for inline data")
			}
			node.Flags |= format.InodeFlagInlineData
		}
	case format.S_IFLNK:
		node.Mode |= 0777 // Symlinks should appear as ugw rwx
		size = int64(len(f.Linkname))
		if size <= smallSymlinkSize {
			// Special case: small symlinks go directly in Block without setting
			// an inline data flag.
			node.Data = make([]byte, len(f.Linkname))
			copy(node.Data, f.Linkname)
		}
	case format.S_IFDIR, format.S_IFIFO, format.S_IFSOCK, format.S_IFCHR, format.S_IFBLK:
		// Nothing type-specific to record here.
	default:
		return nil, fmt.Errorf("invalid mode %o", mode)
	}

	// Accumulate the extended attributes.
	if len(f.Xattrs) != 0 {
		// Sort the xattrs to avoid non-determinism in map iteration.
		var xattrs []string
		for name := range f.Xattrs {
			xattrs = append(xattrs, name)
		}
		sort.Strings(xattrs)
		for _, name := range xattrs {
			if !xstate.addXattr(name, f.Xattrs[name]) {
				return nil, fmt.Errorf("could not fit xattr %s", name)
			}
		}
	}

	if err := w.writeXattrs(node, &xstate); err != nil {
		return nil, err
	}

	node.Size = size
	if typ == format.S_IFLNK && size > smallSymlinkSize {
		// Write the link name as data.
		w.startInode("", node, size)
		if _, err := w.Write([]byte(f.Linkname)); err != nil {
			return nil, err
		}
		if err := w.finishInode(); err != nil {
			return nil, err
		}
	}

	// Only a freshly allocated inode has a number beyond the current table.
	if int(node.Number-1) >= len(w.inodes) {
		w.inodes = append(w.inodes, node)
	}
	return node, nil
}
  439. func (w *Writer) root() *inode {
  440. return w.getInode(format.InodeRoot)
  441. }
  442. func (w *Writer) lookup(name string, mustExist bool) (*inode, *inode, string, error) {
  443. root := w.root()
  444. cleanname := path.Clean("/" + name)[1:]
  445. if len(cleanname) == 0 {
  446. return root, root, "", nil
  447. }
  448. dirname, childname := path.Split(cleanname)
  449. if len(childname) == 0 || len(childname) > 0xff {
  450. return nil, nil, "", fmt.Errorf("%s: invalid name", name)
  451. }
  452. dir := w.findPath(root, dirname)
  453. if dir == nil || !dir.IsDir() {
  454. return nil, nil, "", fmt.Errorf("%s: path not found", name)
  455. }
  456. child := dir.Children[childname]
  457. if child == nil && mustExist {
  458. return nil, nil, "", fmt.Errorf("%s: file not found", name)
  459. }
  460. return dir, child, childname, nil
  461. }
  462. // Create adds a file to the file system.
  463. func (w *Writer) Create(name string, f *File) error {
  464. if err := w.finishInode(); err != nil {
  465. return err
  466. }
  467. dir, existing, childname, err := w.lookup(name, false)
  468. if err != nil {
  469. return err
  470. }
  471. var reuse *inode
  472. if existing != nil {
  473. if existing.IsDir() {
  474. if f.Mode&TypeMask != S_IFDIR {
  475. return fmt.Errorf("%s: cannot replace a directory with a file", name)
  476. }
  477. reuse = existing
  478. } else if f.Mode&TypeMask == S_IFDIR {
  479. return fmt.Errorf("%s: cannot replace a file with a directory", name)
  480. } else if existing.LinkCount < 2 {
  481. reuse = existing
  482. }
  483. } else {
  484. if f.Mode&TypeMask == S_IFDIR && dir.LinkCount >= format.MaxLinks {
  485. return fmt.Errorf("%s: exceeded parent directory maximum link count", name)
  486. }
  487. }
  488. child, err := w.makeInode(f, reuse)
  489. if err != nil {
  490. return fmt.Errorf("%s: %s", name, err)
  491. }
  492. if existing != child {
  493. if existing != nil {
  494. existing.LinkCount--
  495. }
  496. dir.Children[childname] = child
  497. child.LinkCount++
  498. if child.IsDir() {
  499. dir.LinkCount++
  500. }
  501. }
  502. if child.Mode&format.TypeMask == format.S_IFREG {
  503. w.startInode(name, child, f.Size)
  504. }
  505. return nil
  506. }
  507. // Link adds a hard link to the file system.
  508. func (w *Writer) Link(oldname, newname string) error {
  509. if err := w.finishInode(); err != nil {
  510. return err
  511. }
  512. newdir, existing, newchildname, err := w.lookup(newname, false)
  513. if err != nil {
  514. return err
  515. }
  516. if existing != nil && (existing.IsDir() || existing.LinkCount < 2) {
  517. return fmt.Errorf("%s: cannot orphan existing file or directory", newname)
  518. }
  519. _, oldfile, _, err := w.lookup(oldname, true)
  520. if err != nil {
  521. return err
  522. }
  523. switch oldfile.Mode & format.TypeMask {
  524. case format.S_IFDIR, format.S_IFLNK:
  525. return fmt.Errorf("%s: link target cannot be a directory or symlink: %s", newname, oldname)
  526. }
  527. if existing != oldfile && oldfile.LinkCount >= format.MaxLinks {
  528. return fmt.Errorf("%s: link target would exceed maximum link count: %s", newname, oldname)
  529. }
  530. if existing != nil {
  531. existing.LinkCount--
  532. }
  533. oldfile.LinkCount++
  534. newdir.Children[newchildname] = oldfile
  535. return nil
  536. }
  537. // Stat returns information about a file that has been written.
  538. func (w *Writer) Stat(name string) (*File, error) {
  539. if err := w.finishInode(); err != nil {
  540. return nil, err
  541. }
  542. _, node, _, err := w.lookup(name, true)
  543. if err != nil {
  544. return nil, err
  545. }
  546. f := &File{
  547. Size: node.Size,
  548. Mode: node.Mode,
  549. Uid: node.Uid,
  550. Gid: node.Gid,
  551. Atime: fsTimeToTime(node.Atime),
  552. Ctime: fsTimeToTime(node.Ctime),
  553. Mtime: fsTimeToTime(node.Mtime),
  554. Crtime: fsTimeToTime(node.Crtime),
  555. Devmajor: node.Devmajor,
  556. Devminor: node.Devminor,
  557. }
  558. f.Xattrs = make(map[string][]byte)
  559. if node.XattrBlock != 0 || len(node.XattrInline) != 0 {
  560. if node.XattrBlock != 0 {
  561. orig := w.block()
  562. w.seekBlock(node.XattrBlock)
  563. if w.err != nil {
  564. return nil, w.err
  565. }
  566. var b [blockSize]byte
  567. _, err := w.f.Read(b[:])
  568. w.seekBlock(orig)
  569. if err != nil {
  570. return nil, err
  571. }
  572. getXattrs(b[32:], f.Xattrs, 32)
  573. }
  574. if len(node.XattrInline) != 0 {
  575. getXattrs(node.XattrInline[4:], f.Xattrs, 0)
  576. delete(f.Xattrs, "system.data")
  577. }
  578. }
  579. if node.FileType() == S_IFLNK {
  580. if node.Size > smallSymlinkSize {
  581. return nil, fmt.Errorf("%s: cannot retrieve link information", name)
  582. }
  583. f.Linkname = string(node.Data)
  584. }
  585. return f, nil
  586. }
  587. func (w *Writer) Write(b []byte) (int, error) {
  588. if len(b) == 0 {
  589. return 0, nil
  590. }
  591. if w.dataWritten+int64(len(b)) > w.dataMax {
  592. return 0, fmt.Errorf("%s: wrote too much: %d > %d", w.curName, w.dataWritten+int64(len(b)), w.dataMax)
  593. }
  594. if w.curInode.Flags&format.InodeFlagInlineData != 0 {
  595. copy(w.curInode.Data[w.dataWritten:], b)
  596. w.dataWritten += int64(len(b))
  597. return len(b), nil
  598. }
  599. n, err := w.write(b)
  600. w.dataWritten += int64(n)
  601. return n, err
  602. }
  603. func (w *Writer) startInode(name string, inode *inode, size int64) {
  604. if w.curInode != nil {
  605. panic("inode already in progress")
  606. }
  607. w.curName = name
  608. w.curInode = inode
  609. w.dataWritten = 0
  610. w.dataMax = size
  611. }
  612. func (w *Writer) block() uint32 {
  613. return uint32(w.pos / blockSize)
  614. }
  615. func (w *Writer) seekBlock(block uint32) {
  616. w.pos = int64(block) * blockSize
  617. if w.err != nil {
  618. return
  619. }
  620. w.err = w.bw.Flush()
  621. if w.err != nil {
  622. return
  623. }
  624. _, w.err = w.f.Seek(w.pos, io.SeekStart)
  625. }
  626. func (w *Writer) nextBlock() {
  627. if w.pos%blockSize != 0 {
  628. // Simplify callers; w.err is updated on failure.
  629. w.zero(blockSize - w.pos%blockSize)
  630. }
  631. }
  632. func fillExtents(hdr *format.ExtentHeader, extents []format.ExtentLeafNode, startBlock, offset, inodeSize uint32) {
  633. *hdr = format.ExtentHeader{
  634. Magic: format.ExtentHeaderMagic,
  635. Entries: uint16(len(extents)),
  636. Max: uint16(cap(extents)),
  637. Depth: 0,
  638. }
  639. for i := range extents {
  640. block := offset + uint32(i)*maxBlocksPerExtent
  641. length := inodeSize - block
  642. if length > maxBlocksPerExtent {
  643. length = maxBlocksPerExtent
  644. }
  645. start := startBlock + block
  646. extents[i] = format.ExtentLeafNode{
  647. Block: block,
  648. Length: uint16(length),
  649. StartLow: start,
  650. }
  651. }
  652. }
  653. func (w *Writer) writeExtents(inode *inode) error {
  654. start := w.pos - w.dataWritten
  655. if start%blockSize != 0 {
  656. panic("unaligned")
  657. }
  658. w.nextBlock()
  659. startBlock := uint32(start / blockSize)
  660. blocks := w.block() - startBlock
  661. usedBlocks := blocks
  662. const extentNodeSize = 12
  663. const extentsPerBlock = blockSize/extentNodeSize - 1
  664. extents := (blocks + maxBlocksPerExtent - 1) / maxBlocksPerExtent
  665. var b bytes.Buffer
  666. if extents == 0 {
  667. // Nothing to do.
  668. } else if extents <= 4 {
  669. var root struct {
  670. hdr format.ExtentHeader
  671. extents [4]format.ExtentLeafNode
  672. }
  673. fillExtents(&root.hdr, root.extents[:extents], startBlock, 0, blocks)
  674. binary.Write(&b, binary.LittleEndian, root)
  675. } else if extents <= 4*extentsPerBlock {
  676. const extentsPerBlock = blockSize/extentNodeSize - 1
  677. extentBlocks := extents/extentsPerBlock + 1
  678. usedBlocks += extentBlocks
  679. var b2 bytes.Buffer
  680. var root struct {
  681. hdr format.ExtentHeader
  682. nodes [4]format.ExtentIndexNode
  683. }
  684. root.hdr = format.ExtentHeader{
  685. Magic: format.ExtentHeaderMagic,
  686. Entries: uint16(extentBlocks),
  687. Max: 4,
  688. Depth: 1,
  689. }
  690. for i := uint32(0); i < extentBlocks; i++ {
  691. root.nodes[i] = format.ExtentIndexNode{
  692. Block: i * extentsPerBlock * maxBlocksPerExtent,
  693. LeafLow: w.block(),
  694. }
  695. extentsInBlock := extents - i*extentBlocks
  696. if extentsInBlock > extentsPerBlock {
  697. extentsInBlock = extentsPerBlock
  698. }
  699. var node struct {
  700. hdr format.ExtentHeader
  701. extents [extentsPerBlock]format.ExtentLeafNode
  702. _ [blockSize - (extentsPerBlock+1)*extentNodeSize]byte
  703. }
  704. offset := i * extentsPerBlock * maxBlocksPerExtent
  705. fillExtents(&node.hdr, node.extents[:extentsInBlock], startBlock+offset, offset, blocks)
  706. binary.Write(&b2, binary.LittleEndian, node)
  707. if _, err := w.write(b2.Next(blockSize)); err != nil {
  708. return err
  709. }
  710. }
  711. binary.Write(&b, binary.LittleEndian, root)
  712. } else {
  713. panic("file too big")
  714. }
  715. inode.Data = b.Bytes()
  716. inode.Flags |= format.InodeFlagExtents
  717. inode.BlockCount += usedBlocks
  718. return w.err
  719. }
  720. func (w *Writer) finishInode() error {
  721. if !w.initialized {
  722. if err := w.init(); err != nil {
  723. return err
  724. }
  725. }
  726. if w.curInode == nil {
  727. return nil
  728. }
  729. if w.dataWritten != w.dataMax {
  730. return fmt.Errorf("did not write the right amount: %d != %d", w.dataWritten, w.dataMax)
  731. }
  732. if w.dataMax != 0 && w.curInode.Flags&format.InodeFlagInlineData == 0 {
  733. if err := w.writeExtents(w.curInode); err != nil {
  734. return err
  735. }
  736. }
  737. w.dataWritten = 0
  738. w.dataMax = 0
  739. w.curInode = nil
  740. return w.err
  741. }
  742. func modeToFileType(mode uint16) format.FileType {
  743. switch mode & format.TypeMask {
  744. default:
  745. return format.FileTypeUnknown
  746. case format.S_IFREG:
  747. return format.FileTypeRegular
  748. case format.S_IFDIR:
  749. return format.FileTypeDirectory
  750. case format.S_IFCHR:
  751. return format.FileTypeCharacter
  752. case format.S_IFBLK:
  753. return format.FileTypeBlock
  754. case format.S_IFIFO:
  755. return format.FileTypeFIFO
  756. case format.S_IFSOCK:
  757. return format.FileTypeSocket
  758. case format.S_IFLNK:
  759. return format.FileTypeSymbolicLink
  760. }
  761. }
  762. type constReader byte
  763. var zero = constReader(0)
  764. func (r constReader) Read(b []byte) (int, error) {
  765. for i := range b {
  766. b[i] = byte(r)
  767. }
  768. return len(b), nil
  769. }
  770. func (w *Writer) writeDirectory(dir, parent *inode) error {
  771. if err := w.finishInode(); err != nil {
  772. return err
  773. }
  774. // The size of the directory is not known yet.
  775. w.startInode("", dir, 0x7fffffffffffffff)
  776. left := blockSize
  777. finishBlock := func() error {
  778. if left > 0 {
  779. e := format.DirectoryEntry{
  780. RecordLength: uint16(left),
  781. }
  782. err := binary.Write(w, binary.LittleEndian, e)
  783. if err != nil {
  784. return err
  785. }
  786. left -= directoryEntrySize
  787. if left < 4 {
  788. panic("not enough space for trailing entry")
  789. }
  790. _, err = io.CopyN(w, zero, int64(left))
  791. if err != nil {
  792. return err
  793. }
  794. }
  795. left = blockSize
  796. return nil
  797. }
  798. writeEntry := func(ino format.InodeNumber, name string) error {
  799. rlb := directoryEntrySize + len(name)
  800. rl := (rlb + 3) & ^3
  801. if left < rl+12 {
  802. if err := finishBlock(); err != nil {
  803. return err
  804. }
  805. }
  806. e := format.DirectoryEntry{
  807. Inode: ino,
  808. RecordLength: uint16(rl),
  809. NameLength: uint8(len(name)),
  810. FileType: modeToFileType(w.getInode(ino).Mode),
  811. }
  812. err := binary.Write(w, binary.LittleEndian, e)
  813. if err != nil {
  814. return err
  815. }
  816. _, err = w.Write([]byte(name))
  817. if err != nil {
  818. return err
  819. }
  820. var zero [4]byte
  821. _, err = w.Write(zero[:rl-rlb])
  822. if err != nil {
  823. return err
  824. }
  825. left -= rl
  826. return nil
  827. }
  828. if err := writeEntry(dir.Number, "."); err != nil {
  829. return err
  830. }
  831. if err := writeEntry(parent.Number, ".."); err != nil {
  832. return err
  833. }
  834. // Follow e2fsck's convention and sort the children by inode number.
  835. var children []string
  836. for name := range dir.Children {
  837. children = append(children, name)
  838. }
  839. sort.Slice(children, func(i, j int) bool {
  840. return dir.Children[children[i]].Number < dir.Children[children[j]].Number
  841. })
  842. for _, name := range children {
  843. child := dir.Children[name]
  844. if err := writeEntry(child.Number, name); err != nil {
  845. return err
  846. }
  847. }
  848. if err := finishBlock(); err != nil {
  849. return err
  850. }
  851. w.curInode.Size = w.dataWritten
  852. w.dataMax = w.dataWritten
  853. return nil
  854. }
  855. func (w *Writer) writeDirectoryRecursive(dir, parent *inode) error {
  856. if err := w.writeDirectory(dir, parent); err != nil {
  857. return err
  858. }
  859. for _, child := range dir.Children {
  860. if child.IsDir() {
  861. if err := w.writeDirectoryRecursive(child, dir); err != nil {
  862. return err
  863. }
  864. }
  865. }
  866. return nil
  867. }
// writeInodeTable serializes every inode in w.inodes, in order, as
// inodeSize-byte records at the current position, then zero-fills the
// remainder of the table up to tableSize bytes. A nil slot in w.inodes is
// written as an all-zero record.
func (w *Writer) writeInodeTable(tableSize uint32) error {
	var b bytes.Buffer
	for _, inode := range w.inodes {
		if inode != nil {
			binode := format.Inode{
				Mode:          inode.Mode,
				Uid:           uint16(inode.Uid & 0xffff),
				Gid:           uint16(inode.Gid & 0xffff),
				SizeLow:       uint32(inode.Size & 0xffffffff),
				SizeHigh:      uint32(inode.Size >> 32),
				LinksCount:    uint16(inode.LinkCount),
				BlocksLow:     inode.BlockCount,
				Flags:         inode.Flags,
				XattrBlockLow: inode.XattrBlock,
				UidHigh:       uint16(inode.Uid >> 16),
				GidHigh:       uint16(inode.Gid >> 16),
				ExtraIsize:    uint16(inodeUsedSize - 128),
				Atime:         uint32(inode.Atime),
				AtimeExtra:    uint32(inode.Atime >> 32),
				Ctime:         uint32(inode.Ctime),
				CtimeExtra:    uint32(inode.Ctime >> 32),
				Mtime:         uint32(inode.Mtime),
				MtimeExtra:    uint32(inode.Mtime >> 32),
				Crtime:        uint32(inode.Crtime),
				CrtimeExtra:   uint32(inode.Crtime >> 32),
			}

			switch inode.Mode & format.TypeMask {
			case format.S_IFDIR, format.S_IFREG, format.S_IFLNK:
				// inode.Data holds the extent root, inline file data, or a
				// short symlink target; as much as fits goes into Block.
				n := copy(binode.Block[:], inode.Data)
				if n < len(inode.Data) {
					// Rewrite the first xattr with the data.
					xattr := [1]xattr{{
						Name:  "data",
						Index: 7, // "system."
						Value: inode.Data[n:],
					}}
					putXattrs(xattr[:], inode.XattrInline[4:], 0)
				}
			case format.S_IFBLK, format.S_IFCHR:
				// Pack the device numbers into a single 32-bit value.
				dev := inode.Devminor&0xff | inode.Devmajor<<8 | (inode.Devminor&0xffffff00)<<12
				binary.LittleEndian.PutUint32(binode.Block[4:], dev)
			}

			// Keep only the fixed fields, then append the inline xattr area
			// and pad the record with zeros.
			binary.Write(&b, binary.LittleEndian, binode)
			b.Truncate(inodeUsedSize)
			n, _ := b.Write(inode.XattrInline)
			io.CopyN(&b, zero, int64(inodeExtraSize-n))
		} else {
			io.CopyN(&b, zero, inodeSize)
		}
		if _, err := w.write(b.Next(inodeSize)); err != nil {
			return err
		}
	}
	// Zero the part of the table that has no inodes.
	rest := tableSize - uint32(len(w.inodes)*inodeSize)
	if _, err := w.zero(int64(rest)); err != nil {
		return err
	}
	return nil
}
  927. // NewWriter returns a Writer that writes an ext4 file system to the provided
  928. // WriteSeeker.
  929. func NewWriter(f io.ReadWriteSeeker, opts ...Option) *Writer {
  930. w := &Writer{
  931. f: f,
  932. bw: bufio.NewWriterSize(f, 65536*8),
  933. maxDiskSize: defaultMaxDiskSize,
  934. }
  935. for _, opt := range opts {
  936. opt(w)
  937. }
  938. return w
  939. }
  940. // An Option provides extra options to NewWriter.
  941. type Option func(*Writer)
  942. // InlineData instructs the Writer to write small files into the inode
  943. // structures directly. This creates smaller images but currently is not
  944. // compatible with DAX.
  945. func InlineData(w *Writer) {
  946. w.supportInlineData = true
  947. }
  948. // MaximumDiskSize instructs the writer to reserve enough metadata space for the
  949. // specified disk size. If not provided, then 16GB is the default.
  950. func MaximumDiskSize(size int64) Option {
  951. return func(w *Writer) {
  952. if size < 0 || size > maxMaxDiskSize {
  953. w.maxDiskSize = maxMaxDiskSize
  954. } else if size == 0 {
  955. w.maxDiskSize = defaultMaxDiskSize
  956. } else {
  957. w.maxDiskSize = (size + blockSize - 1) &^ (blockSize - 1)
  958. }
  959. }
  960. }
  961. func (w *Writer) init() error {
  962. // Skip the defective block inode.
  963. w.inodes = make([]*inode, 1, 32)
  964. // Create the root directory.
  965. root, _ := w.makeInode(&File{
  966. Mode: format.S_IFDIR | 0755,
  967. }, nil)
  968. root.LinkCount++ // The root is linked to itself.
  969. // Skip until the first non-reserved inode.
  970. w.inodes = append(w.inodes, make([]*inode, inodeFirst-len(w.inodes)-1)...)
  971. maxBlocks := (w.maxDiskSize-1)/blockSize + 1
  972. maxGroups := (maxBlocks-1)/blocksPerGroup + 1
  973. w.gdBlocks = uint32((maxGroups-1)/groupsPerDescriptorBlock + 1)
  974. // Skip past the superblock and block descriptor table.
  975. w.seekBlock(1 + w.gdBlocks)
  976. w.initialized = true
  977. // The lost+found directory is required to exist for e2fsck to pass.
  978. if err := w.Create("lost+found", &File{Mode: format.S_IFDIR | 0700}); err != nil {
  979. return err
  980. }
  981. return w.err
  982. }
  983. func groupCount(blocks uint32, inodes uint32, inodesPerGroup uint32) uint32 {
  984. inodeBlocksPerGroup := inodesPerGroup * inodeSize / blockSize
  985. dataBlocksPerGroup := blocksPerGroup - inodeBlocksPerGroup - 2 // save room for the bitmaps
  986. // Increase the block count to ensure there are enough groups for all the
  987. // inodes.
  988. minBlocks := (inodes-1)/inodesPerGroup*dataBlocksPerGroup + 1
  989. if blocks < minBlocks {
  990. blocks = minBlocks
  991. }
  992. return (blocks + dataBlocksPerGroup - 1) / dataBlocksPerGroup
  993. }
  994. func bestGroupCount(blocks uint32, inodes uint32) (groups uint32, inodesPerGroup uint32) {
  995. groups = 0xffffffff
  996. for ipg := uint32(inodesPerGroupIncrement); ipg <= maxInodesPerGroup; ipg += inodesPerGroupIncrement {
  997. g := groupCount(blocks, inodes, ipg)
  998. if g < groups {
  999. groups = g
  1000. inodesPerGroup = ipg
  1001. }
  1002. }
  1003. return
  1004. }
  1005. func (w *Writer) Close() error {
  1006. if err := w.finishInode(); err != nil {
  1007. return err
  1008. }
  1009. root := w.root()
  1010. if err := w.writeDirectoryRecursive(root, root); err != nil {
  1011. return err
  1012. }
  1013. // Finish the last inode (probably a directory).
  1014. if err := w.finishInode(); err != nil {
  1015. return err
  1016. }
  1017. // Write the inode table
  1018. inodeTableOffset := w.block()
  1019. groups, inodesPerGroup := bestGroupCount(inodeTableOffset, uint32(len(w.inodes)))
  1020. err := w.writeInodeTable(groups * inodesPerGroup * inodeSize)
  1021. if err != nil {
  1022. return err
  1023. }
  1024. // Write the bitmaps.
  1025. bitmapOffset := w.block()
  1026. bitmapSize := groups * 2
  1027. validDataSize := bitmapOffset + bitmapSize
  1028. diskSize := validDataSize
  1029. minSize := (groups-1)*blocksPerGroup + 1
  1030. if diskSize < minSize {
  1031. diskSize = minSize
  1032. }
  1033. usedGdBlocks := (groups-1)/groupDescriptorSize + 1
  1034. if usedGdBlocks > w.gdBlocks {
  1035. return exceededMaxSizeError{w.maxDiskSize}
  1036. }
  1037. gds := make([]format.GroupDescriptor, w.gdBlocks*groupsPerDescriptorBlock)
  1038. inodeTableSizePerGroup := inodesPerGroup * inodeSize / blockSize
  1039. var totalUsedBlocks, totalUsedInodes uint32
  1040. for g := uint32(0); g < groups; g++ {
  1041. var b [blockSize * 2]byte
  1042. var dirCount, usedInodeCount, usedBlockCount uint16
  1043. // Block bitmap
  1044. if (g+1)*blocksPerGroup <= validDataSize {
  1045. // This group is fully allocated.
  1046. for j := range b[:blockSize] {
  1047. b[j] = 0xff
  1048. }
  1049. usedBlockCount = blocksPerGroup
  1050. } else if g*blocksPerGroup < validDataSize {
  1051. for j := uint32(0); j < validDataSize-g*blocksPerGroup; j++ {
  1052. b[j/8] |= 1 << (j % 8)
  1053. usedBlockCount++
  1054. }
  1055. }
  1056. if g == 0 {
  1057. // Unused group descriptor blocks should be cleared.
  1058. for j := 1 + usedGdBlocks; j < 1+w.gdBlocks; j++ {
  1059. b[j/8] &^= 1 << (j % 8)
  1060. usedBlockCount--
  1061. }
  1062. }
  1063. if g == groups-1 && diskSize%blocksPerGroup != 0 {
  1064. // Blocks that aren't present in the disk should be marked as
  1065. // allocated.
  1066. for j := diskSize % blocksPerGroup; j < blocksPerGroup; j++ {
  1067. b[j/8] |= 1 << (j % 8)
  1068. usedBlockCount++
  1069. }
  1070. }
  1071. // Inode bitmap
  1072. for j := uint32(0); j < inodesPerGroup; j++ {
  1073. ino := format.InodeNumber(1 + g*inodesPerGroup + j)
  1074. inode := w.getInode(ino)
  1075. if ino < inodeFirst || inode != nil {
  1076. b[blockSize+j/8] |= 1 << (j % 8)
  1077. usedInodeCount++
  1078. }
  1079. if inode != nil && inode.Mode&format.TypeMask == format.S_IFDIR {
  1080. dirCount++
  1081. }
  1082. }
  1083. _, err := w.write(b[:])
  1084. if err != nil {
  1085. return err
  1086. }
  1087. gds[g] = format.GroupDescriptor{
  1088. BlockBitmapLow: bitmapOffset + 2*g,
  1089. InodeBitmapLow: bitmapOffset + 2*g + 1,
  1090. InodeTableLow: inodeTableOffset + g*inodeTableSizePerGroup,
  1091. UsedDirsCountLow: dirCount,
  1092. FreeInodesCountLow: uint16(inodesPerGroup) - usedInodeCount,
  1093. FreeBlocksCountLow: blocksPerGroup - usedBlockCount,
  1094. }
  1095. totalUsedBlocks += uint32(usedBlockCount)
  1096. totalUsedInodes += uint32(usedInodeCount)
  1097. }
  1098. // Zero up to the disk size.
  1099. _, err = w.zero(int64(diskSize-bitmapOffset-bitmapSize) * blockSize)
  1100. if err != nil {
  1101. return err
  1102. }
  1103. // Write the block descriptors
  1104. w.seekBlock(1)
  1105. if w.err != nil {
  1106. return w.err
  1107. }
  1108. err = binary.Write(w.bw, binary.LittleEndian, gds)
  1109. if err != nil {
  1110. return err
  1111. }
  1112. // Write the super block
  1113. var blk [blockSize]byte
  1114. b := bytes.NewBuffer(blk[:1024])
  1115. sb := &format.SuperBlock{
  1116. InodesCount: inodesPerGroup * groups,
  1117. BlocksCountLow: diskSize,
  1118. FreeBlocksCountLow: blocksPerGroup*groups - totalUsedBlocks,
  1119. FreeInodesCount: inodesPerGroup*groups - totalUsedInodes,
  1120. FirstDataBlock: 0,
  1121. LogBlockSize: 2, // 2^(10 + 2)
  1122. LogClusterSize: 2,
  1123. BlocksPerGroup: blocksPerGroup,
  1124. ClustersPerGroup: blocksPerGroup,
  1125. InodesPerGroup: inodesPerGroup,
  1126. Magic: format.SuperBlockMagic,
  1127. State: 1, // cleanly unmounted
  1128. Errors: 1, // continue on error?
  1129. CreatorOS: 0, // Linux
  1130. RevisionLevel: 1, // dynamic inode sizes
  1131. FirstInode: inodeFirst,
  1132. LpfInode: inodeLostAndFound,
  1133. InodeSize: inodeSize,
  1134. FeatureCompat: format.CompatSparseSuper2 | format.CompatExtAttr,
  1135. FeatureIncompat: format.IncompatFiletype | format.IncompatExtents | format.IncompatFlexBg,
  1136. FeatureRoCompat: format.RoCompatLargeFile | format.RoCompatHugeFile | format.RoCompatExtraIsize | format.RoCompatReadonly,
  1137. MinExtraIsize: extraIsize,
  1138. WantExtraIsize: extraIsize,
  1139. LogGroupsPerFlex: 31,
  1140. }
  1141. if w.supportInlineData {
  1142. sb.FeatureIncompat |= format.IncompatInlineData
  1143. }
  1144. binary.Write(b, binary.LittleEndian, sb)
  1145. w.seekBlock(0)
  1146. if _, err := w.write(blk[:]); err != nil {
  1147. return err
  1148. }
  1149. w.seekBlock(diskSize)
  1150. return w.err
  1151. }