ocr.go 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. package model
  2. import (
  3. "io"
  4. "os"
  5. "path/filepath"
  6. "runtime/debug"
  7. "strings"
  8. "time"
  9. "github.com/88250/gulu"
  10. "github.com/dustin/go-humanize"
  11. "github.com/siyuan-note/logging"
  12. "github.com/siyuan-note/siyuan/kernel/cache"
  13. "github.com/siyuan-note/siyuan/kernel/task"
  14. "github.com/siyuan-note/siyuan/kernel/util"
  15. )
  16. func OCRAssetsJob() {
  17. if !util.TesseractEnabled {
  18. return
  19. }
  20. task.AppendTaskWithTimeout(task.OCRImage, 7*time.Second, autoOCRAssets)
  21. }
  22. func autoOCRAssets() {
  23. defer logging.Recover()
  24. assetsPath := util.GetDataAssetsAbsPath()
  25. assets := getUnOCRAssetsAbsPaths()
  26. if 0 < len(assets) {
  27. for i, assetAbsPath := range assets {
  28. text := util.Tesseract(assetAbsPath)
  29. p := strings.TrimPrefix(assetAbsPath, assetsPath)
  30. p = "assets" + filepath.ToSlash(p)
  31. util.AssetsTextsLock.Lock()
  32. util.AssetsTexts[p] = text
  33. util.AssetsTextsLock.Unlock()
  34. util.AssetsTextsChanged = true
  35. if 4 <= i { // 一次任务中最多处理 4 张图片,防止卡顿
  36. break
  37. }
  38. }
  39. }
  40. cleanNotExistAssetsTexts()
  41. }
  42. func cleanNotExistAssetsTexts() {
  43. util.AssetsTextsLock.Lock()
  44. defer util.AssetsTextsLock.Unlock()
  45. assetsPath := util.GetDataAssetsAbsPath()
  46. var toRemoves []string
  47. for asset, _ := range util.AssetsTexts {
  48. assetAbsPath := strings.TrimPrefix(asset, "assets")
  49. assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
  50. if !gulu.File.IsExist(assetAbsPath) {
  51. toRemoves = append(toRemoves, asset)
  52. }
  53. }
  54. for _, asset := range toRemoves {
  55. delete(util.AssetsTexts, asset)
  56. util.AssetsTextsChanged = true
  57. }
  58. return
  59. }
  60. func getUnOCRAssetsAbsPaths() (ret []string) {
  61. var assetsPaths []string
  62. assets := cache.GetAssets()
  63. for _, asset := range assets {
  64. if !util.IsTesseractExtractable(asset.Path) {
  65. continue
  66. }
  67. assetsPaths = append(assetsPaths, asset.Path)
  68. }
  69. assetsPath := util.GetDataAssetsAbsPath()
  70. assetsTextsTmp := util.AssetsTexts
  71. for _, assetPath := range assetsPaths {
  72. if _, ok := assetsTextsTmp[assetPath]; ok {
  73. continue
  74. }
  75. absPath := filepath.Join(assetsPath, strings.TrimPrefix(assetPath, "assets"))
  76. ret = append(ret, absPath)
  77. }
  78. return
  79. }
  80. func FlushAssetsTextsJob() {
  81. SaveAssetsTexts()
  82. }
  83. func LoadAssetsTexts() {
  84. assetsPath := util.GetDataAssetsAbsPath()
  85. assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
  86. if !gulu.File.IsExist(assetsTextsPath) {
  87. return
  88. }
  89. start := time.Now()
  90. var err error
  91. fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644)
  92. if nil != err {
  93. logging.LogErrorf("open assets texts failed: %s", err)
  94. return
  95. }
  96. defer fh.Close()
  97. data, err := io.ReadAll(fh)
  98. if nil != err {
  99. logging.LogErrorf("read assets texts failed: %s", err)
  100. return
  101. }
  102. util.AssetsTextsLock.Lock()
  103. if err = gulu.JSON.UnmarshalJSON(data, &util.AssetsTexts); nil != err {
  104. logging.LogErrorf("unmarshal assets texts failed: %s", err)
  105. if err = os.RemoveAll(assetsTextsPath); nil != err {
  106. logging.LogErrorf("removed corrupted assets texts failed: %s", err)
  107. }
  108. return
  109. }
  110. util.AssetsTextsLock.Unlock()
  111. debug.FreeOSMemory()
  112. if elapsed := time.Since(start).Seconds(); 2 < elapsed {
  113. logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
  114. }
  115. return
  116. }
  117. func SaveAssetsTexts() {
  118. if !util.AssetsTextsChanged {
  119. return
  120. }
  121. start := time.Now()
  122. util.AssetsTextsLock.Lock()
  123. data, err := gulu.JSON.MarshalIndentJSON(util.AssetsTexts, "", " ")
  124. if nil != err {
  125. logging.LogErrorf("marshal assets texts failed: %s", err)
  126. return
  127. }
  128. util.AssetsTextsLock.Unlock()
  129. assetsPath := util.GetDataAssetsAbsPath()
  130. assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
  131. if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
  132. logging.LogErrorf("write assets texts failed: %s", err)
  133. return
  134. }
  135. debug.FreeOSMemory()
  136. if elapsed := time.Since(start).Seconds(); 2 < elapsed {
  137. logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
  138. }
  139. util.AssetsTextsChanged = false
  140. }