ocr.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. // SiYuan - Build Your Eternal Digital Garden
  2. // Copyright (c) 2020-present, b3log.org
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU Affero General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU Affero General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU Affero General Public License
  15. // along with this program. If not, see <https://www.gnu.org/licenses/>.
  16. package util
  17. import (
  18. "bytes"
  19. "context"
  20. "io"
  21. "os"
  22. "os/exec"
  23. "path/filepath"
  24. "regexp"
  25. "runtime"
  26. "runtime/debug"
  27. "strings"
  28. "sync"
  29. "time"
  30. "github.com/88250/gulu"
  31. "github.com/dustin/go-humanize"
  32. "github.com/panjf2000/ants/v2"
  33. "github.com/siyuan-note/logging"
  34. )
  35. var (
  36. tesseractEnabled bool
  37. tesseractLangs []string
  38. assetsTexts = map[string]string{}
  39. assetsTextsLock = sync.Mutex{}
  40. assetsTextsChanged = false
  41. )
  42. func GetAssetText(asset string) string {
  43. assetsTextsLock.Lock()
  44. ret, ok := assetsTexts[asset]
  45. assetsTextsLock.Unlock()
  46. if ok {
  47. return ret
  48. }
  49. assetsPath := GetDataAssetsAbsPath()
  50. assetAbsPath := strings.TrimPrefix(asset, "assets")
  51. assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
  52. ret = Tesseract(assetAbsPath)
  53. assetsTextsLock.Lock()
  54. assetsTexts[asset] = ret
  55. assetsTextsLock.Unlock()
  56. return ret
  57. }
  58. func Tesseract(imgAbsPath string) string {
  59. if ContainerStd != Container || !tesseractEnabled {
  60. return ""
  61. }
  62. info, err := os.Stat(imgAbsPath)
  63. if nil != err {
  64. return ""
  65. }
  66. defer logging.Recover()
  67. ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
  68. defer cancel()
  69. now := time.Now()
  70. cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+"))
  71. gulu.CmdAttr(cmd)
  72. output, err := cmd.CombinedOutput()
  73. if ctx.Err() == context.DeadlineExceeded {
  74. logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
  75. return ""
  76. }
  77. if nil != err {
  78. logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
  79. return ""
  80. }
  81. ret := string(output)
  82. ret = strings.ReplaceAll(ret, "\r", "")
  83. ret = strings.ReplaceAll(ret, "\n", "")
  84. ret = strings.ReplaceAll(ret, "\t", " ")
  85. reg := regexp.MustCompile("\\s{2,}")
  86. ret = reg.ReplaceAllString(ret, " ")
  87. logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds())
  88. return ret
  89. }
  90. func AutoOCRAssets() {
  91. if !tesseractEnabled {
  92. return
  93. }
  94. for {
  95. autoOCRAssets()
  96. time.Sleep(7 * time.Second)
  97. }
  98. }
  99. func autoOCRAssets() {
  100. defer logging.Recover()
  101. assetsPath := GetDataAssetsAbsPath()
  102. assets := getUnOCRAssetsAbsPaths()
  103. poolSize := runtime.NumCPU()
  104. if 4 < poolSize {
  105. poolSize = 4
  106. }
  107. waitGroup := &sync.WaitGroup{}
  108. p, _ := ants.NewPoolWithFunc(poolSize, func(arg interface{}) {
  109. defer waitGroup.Done()
  110. assetAbsPath := arg.(string)
  111. text := Tesseract(assetAbsPath)
  112. p := strings.TrimPrefix(assetAbsPath, assetsPath)
  113. p = "assets" + filepath.ToSlash(p)
  114. assetsTextsLock.Lock()
  115. assetsTexts[p] = text
  116. assetsTextsLock.Unlock()
  117. assetsTextsChanged = true
  118. })
  119. for _, assetAbsPath := range assets {
  120. waitGroup.Add(1)
  121. p.Invoke(assetAbsPath)
  122. }
  123. waitGroup.Wait()
  124. p.Release()
  125. cleanNotFoundAssetsTexts()
  126. }
  127. func cleanNotFoundAssetsTexts() {
  128. tmp := assetsTexts
  129. assetsPath := GetDataAssetsAbsPath()
  130. var toRemoves []string
  131. for asset, _ := range tmp {
  132. assetAbsPath := strings.TrimPrefix(asset, "assets")
  133. assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
  134. if !gulu.File.IsExist(assetAbsPath) {
  135. toRemoves = append(toRemoves, asset)
  136. }
  137. }
  138. assetsTextsLock.Lock()
  139. for _, asset := range toRemoves {
  140. delete(assetsTexts, asset)
  141. assetsTextsChanged = true
  142. }
  143. assetsTextsLock.Unlock()
  144. return
  145. }
  146. func getUnOCRAssetsAbsPaths() (ret []string) {
  147. assetsPath := GetDataAssetsAbsPath()
  148. var assetsPaths []string
  149. filepath.Walk(assetsPath, func(path string, info os.FileInfo, err error) error {
  150. name := info.Name()
  151. if info.IsDir() {
  152. if strings.HasPrefix(name, ".") {
  153. return filepath.SkipDir
  154. }
  155. return nil
  156. }
  157. lowerName := strings.ToLower(name)
  158. if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") {
  159. return nil
  160. }
  161. assetsPaths = append(assetsPaths, path)
  162. return nil
  163. })
  164. assetsTextsTmp := assetsTexts
  165. for _, absPath := range assetsPaths {
  166. p := strings.TrimPrefix(absPath, assetsPath)
  167. p = "assets" + filepath.ToSlash(p)
  168. if _, ok := assetsTextsTmp[p]; ok {
  169. continue
  170. }
  171. ret = append(ret, absPath)
  172. }
  173. return
  174. }
  175. func AutoFlushAssetsTexts() {
  176. for {
  177. SaveAssetsTexts()
  178. time.Sleep(7 * time.Second)
  179. }
  180. }
  181. func LoadAssetsTexts() {
  182. assetsPath := GetDataAssetsAbsPath()
  183. assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
  184. if !gulu.File.IsExist(assetsTextsPath) {
  185. return
  186. }
  187. start := time.Now()
  188. var err error
  189. fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644)
  190. if nil != err {
  191. logging.LogErrorf("open assets texts failed: %s", err)
  192. return
  193. }
  194. defer fh.Close()
  195. data, err := io.ReadAll(fh)
  196. if nil != err {
  197. logging.LogErrorf("read assets texts failed: %s", err)
  198. return
  199. }
  200. assetsTextsLock.Lock()
  201. if err = gulu.JSON.UnmarshalJSON(data, &assetsTexts); nil != err {
  202. logging.LogErrorf("unmarshal assets texts failed: %s", err)
  203. if err = os.RemoveAll(assetsTextsPath); nil != err {
  204. logging.LogErrorf("removed corrupted assets texts failed: %s", err)
  205. }
  206. return
  207. }
  208. assetsTextsLock.Unlock()
  209. debug.FreeOSMemory()
  210. if elapsed := time.Since(start).Seconds(); 2 < elapsed {
  211. logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
  212. }
  213. return
  214. }
  215. func SaveAssetsTexts() {
  216. if !assetsTextsChanged {
  217. return
  218. }
  219. start := time.Now()
  220. assetsTextsLock.Lock()
  221. data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", " ")
  222. if nil != err {
  223. logging.LogErrorf("marshal assets texts failed: %s", err)
  224. return
  225. }
  226. assetsTextsLock.Unlock()
  227. assetsPath := GetDataAssetsAbsPath()
  228. assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
  229. if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
  230. logging.LogErrorf("write assets texts failed: %s", err)
  231. return
  232. }
  233. debug.FreeOSMemory()
  234. if elapsed := time.Since(start).Seconds(); 2 < elapsed {
  235. logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
  236. }
  237. assetsTextsChanged = false
  238. }
  239. func initTesseract() {
  240. ver := getTesseractVer()
  241. if "" == ver {
  242. return
  243. }
  244. tesseractLangs = getTesseractLangs()
  245. if 1 > len(tesseractLangs) {
  246. logging.LogWarnf("no tesseract langs found")
  247. tesseractEnabled = false
  248. return
  249. }
  250. logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", ver, strings.Join(tesseractLangs, "+"))
  251. }
  252. func getTesseractVer() (ret string) {
  253. if ContainerStd != Container {
  254. return
  255. }
  256. cmd := exec.Command("tesseract", "--version")
  257. gulu.CmdAttr(cmd)
  258. data, err := cmd.CombinedOutput()
  259. if nil == err && strings.HasPrefix(string(data), "tesseract ") {
  260. parts := bytes.Split(data, []byte("\n"))
  261. if 0 < len(parts) {
  262. ret = strings.TrimPrefix(string(parts[0]), "tesseract ")
  263. ret = strings.TrimSpace(ret)
  264. tesseractEnabled = true
  265. }
  266. return
  267. }
  268. return
  269. }
  270. func getTesseractLangs() (ret []string) {
  271. if !tesseractEnabled {
  272. return nil
  273. }
  274. cmd := exec.Command("tesseract", "--list-langs")
  275. gulu.CmdAttr(cmd)
  276. data, err := cmd.CombinedOutput()
  277. if nil != err {
  278. return nil
  279. }
  280. parts := bytes.Split(data, []byte("\n"))
  281. if 0 < len(parts) {
  282. parts = parts[1:]
  283. }
  284. for _, part := range parts {
  285. part = bytes.TrimSpace(part)
  286. if 0 == len(part) {
  287. continue
  288. }
  289. ret = append(ret, string(part))
  290. }
  291. return
  292. }