utfbom.go 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
// Package utfbom detects a Unicode byte order mark (BOM) at the start of a
// stream and removes it. It wraps an io.Reader in a Reader that also
// implements io.Reader and skips the BOM automatically when one is present.
  4. package utfbom
  5. import (
  6. "errors"
  7. "io"
  8. )
  9. // Encoding is type alias for detected UTF encoding.
  10. type Encoding int
  11. // Constants to identify detected UTF encodings.
  12. const (
  13. // Unknown encoding, returned when no BOM was detected
  14. Unknown Encoding = iota
  15. // UTF8, BOM bytes: EF BB BF
  16. UTF8
  17. // UTF-16, big-endian, BOM bytes: FE FF
  18. UTF16BigEndian
  19. // UTF-16, little-endian, BOM bytes: FF FE
  20. UTF16LittleEndian
  21. // UTF-32, big-endian, BOM bytes: 00 00 FE FF
  22. UTF32BigEndian
  23. // UTF-32, little-endian, BOM bytes: FF FE 00 00
  24. UTF32LittleEndian
  25. )
  26. // String returns a user-friendly string representation of the encoding. Satisfies fmt.Stringer interface.
  27. func (e Encoding) String() string {
  28. switch e {
  29. case UTF8:
  30. return "UTF8"
  31. case UTF16BigEndian:
  32. return "UTF16BigEndian"
  33. case UTF16LittleEndian:
  34. return "UTF16LittleEndian"
  35. case UTF32BigEndian:
  36. return "UTF32BigEndian"
  37. case UTF32LittleEndian:
  38. return "UTF32LittleEndian"
  39. default:
  40. return "Unknown"
  41. }
  42. }
// maxConsecutiveEmptyReads is the number of consecutive reads returning zero
// bytes that readBOM tolerates before it gives up with io.ErrNoProgress.
const maxConsecutiveEmptyReads = 100
  44. // Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
  45. // It also returns the encoding detected by the BOM.
  46. // If the detected encoding is not needed, you can call the SkipOnly function.
  47. func Skip(rd io.Reader) (*Reader, Encoding) {
  48. // Is it already a Reader?
  49. b, ok := rd.(*Reader)
  50. if ok {
  51. return b, Unknown
  52. }
  53. enc, left, err := detectUtf(rd)
  54. return &Reader{
  55. rd: rd,
  56. buf: left,
  57. err: err,
  58. }, enc
  59. }
  60. // SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
  61. func SkipOnly(rd io.Reader) *Reader {
  62. r, _ := Skip(rd)
  63. return r
  64. }
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
//
// Skip fills in all three fields: it probes the start of the stream, drops
// the BOM if one is found, and keeps whatever else it read (plus any error)
// so that Read can hand it out afterwards.
type Reader struct {
	rd  io.Reader // reader provided by the client; used directly once buf is nil and err is cleared
	buf []byte    // bytes read during BOM detection that are not part of the BOM; nil once drained
	err error     // error returned while detecting the BOM; reported once by Read after buf is drained
}
  72. // Read is an implementation of io.Reader interface.
  73. // The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary.
  74. func (r *Reader) Read(p []byte) (n int, err error) {
  75. if len(p) == 0 {
  76. return 0, nil
  77. }
  78. if r.buf == nil {
  79. if r.err != nil {
  80. return 0, r.readErr()
  81. }
  82. return r.rd.Read(p)
  83. }
  84. // copy as much as we can
  85. n = copy(p, r.buf)
  86. r.buf = nilIfEmpty(r.buf[n:])
  87. return n, nil
  88. }
  89. func (r *Reader) readErr() error {
  90. err := r.err
  91. r.err = nil
  92. return err
  93. }
// errNegativeRead is the value readBOM panics with when the wrapped reader
// reports a negative byte count from Read.
var errNegativeRead = errors.New("utfbom: reader returned negative count from Read")
  95. func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
  96. buf, err = readBOM(rd)
  97. if len(buf) >= 4 {
  98. if isUTF32BigEndianBOM4(buf) {
  99. return UTF32BigEndian, nilIfEmpty(buf[4:]), err
  100. }
  101. if isUTF32LittleEndianBOM4(buf) {
  102. return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
  103. }
  104. }
  105. if len(buf) > 2 && isUTF8BOM3(buf) {
  106. return UTF8, nilIfEmpty(buf[3:]), err
  107. }
  108. if (err != nil && err != io.EOF) || (len(buf) < 2) {
  109. return Unknown, nilIfEmpty(buf), err
  110. }
  111. if isUTF16BigEndianBOM2(buf) {
  112. return UTF16BigEndian, nilIfEmpty(buf[2:]), err
  113. }
  114. if isUTF16LittleEndianBOM2(buf) {
  115. return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
  116. }
  117. return Unknown, nilIfEmpty(buf), err
  118. }
  119. func readBOM(rd io.Reader) (buf []byte, err error) {
  120. const maxBOMSize = 4
  121. var bom [maxBOMSize]byte // used to read BOM
  122. // read as many bytes as possible
  123. for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
  124. if n, err = rd.Read(bom[len(buf):]); n < 0 {
  125. panic(errNegativeRead)
  126. }
  127. if n > 0 {
  128. nEmpty = 0
  129. } else {
  130. nEmpty++
  131. if nEmpty >= maxConsecutiveEmptyReads {
  132. err = io.ErrNoProgress
  133. }
  134. }
  135. }
  136. return
  137. }
  138. func isUTF32BigEndianBOM4(buf []byte) bool {
  139. return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF
  140. }
  141. func isUTF32LittleEndianBOM4(buf []byte) bool {
  142. return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00
  143. }
  144. func isUTF8BOM3(buf []byte) bool {
  145. return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF
  146. }
  147. func isUTF16BigEndianBOM2(buf []byte) bool {
  148. return buf[0] == 0xFE && buf[1] == 0xFF
  149. }
  150. func isUTF16LittleEndianBOM2(buf []byte) bool {
  151. return buf[0] == 0xFF && buf[1] == 0xFE
  152. }
  153. func nilIfEmpty(buf []byte) (res []byte) {
  154. if len(buf) > 0 {
  155. res = buf
  156. }
  157. return
  158. }