瀏覽代碼

Merge remote-tracking branch 'origin/dev' into dev

Vanessa 2 年之前
父節點
當前提交
ff91a58747
共有 6 個文件被更改,包括 186 次插入和 48 次删除
  1. 3 0
      kernel/main.go
  2. 3 0
      kernel/mobile/kernel.go
  3. 1 0
      kernel/model/conf.go
  4. 2 0
      kernel/treenode/blocktree.go
  5. 1 2
      kernel/treenode/node.go
  6. 176 46
      kernel/util/ocr.go

+ 3 - 0
kernel/main.go

@@ -40,6 +40,7 @@ func main() {
 	model.BootSyncData()
 	model.InitBoxes()
 	model.InitFlashcards()
+	util.LoadAssetsTexts()
 
 	go model.AutoGenerateDocHistory()
 	go model.AutoSync()
@@ -52,6 +53,8 @@ func main() {
 	go treenode.AutoFlushBlockTree()
 	go cache.LoadAssets()
 	go model.AutoFixIndex()
+	go util.AutoOCRAssets()
+	go util.AutoFlushAssetsTexts()
 	go model.HookDesktopUIProc()
 	model.WatchAssets()
 	model.HandleSignal()

+ 3 - 0
kernel/mobile/kernel.go

@@ -54,6 +54,7 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang
 		model.BootSyncData()
 		model.InitBoxes()
 		model.InitFlashcards()
+		util.LoadAssetsTexts()
 
 		go model.AutoGenerateDocHistory()
 		go model.AutoSync()
@@ -66,6 +67,8 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang
 		go treenode.AutoFlushBlockTree()
 		go cache.LoadAssets()
 		go model.AutoFixIndex()
+		go util.AutoOCRAssets()
+		go util.AutoFlushAssetsTexts()
 	}()
 }
 

+ 1 - 0
kernel/model/conf.go

@@ -428,6 +428,7 @@ func Close(force bool, execInstallPkg int) (exitCode int) {
 	Conf.Close()
 	sql.CloseDatabase()
 	treenode.SaveBlockTree(false)
+	util.SaveAssetsTexts()
 	clearWorkspaceTemp()
 	clearPortJSON()
 	util.UnlockWorkspace()

+ 2 - 0
kernel/treenode/blocktree.go

@@ -386,6 +386,7 @@ func InitBlockTree(force bool) {
 	}
 	blockTreesLock.Unlock()
 	debug.FreeOSMemory()
+
 	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
 		logging.LogWarnf("read block tree [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), util.BlockTreePath, elapsed)
 	}
@@ -414,6 +415,7 @@ func SaveBlockTree(force bool) {
 		return
 	}
 	debug.FreeOSMemory()
+
 	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
 		logging.LogWarnf("save block tree [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), util.BlockTreePath, elapsed)
 	}

+ 1 - 2
kernel/treenode/node.go

@@ -18,7 +18,6 @@ package treenode
 
 import (
 	"bytes"
-	"path/filepath"
 	"strings"
 	"sync"
 
@@ -114,7 +113,7 @@ func NodeStaticContent(node *ast.Node, excludeTypes []string) string {
 				destNode := n.Parent.ChildByType(ast.NodeLinkDest)
 				if nil != destNode {
 					// 桌面端支持搜索图片 OCR 文本 https://github.com/siyuan-note/siyuan/issues/3470
-					if text := util2.Tesseract(filepath.Join(util2.DataDir, destNode.TokensStr())); "" != text {
+					if text := util2.GetAssetText(destNode.TokensStr()); "" != text {
 						buf.WriteByte(' ')
 						buf.WriteString(text)
 					}

+ 176 - 46
kernel/util/ocr.go

@@ -19,52 +19,34 @@ package util
 import (
 	"bytes"
 	"context"
+	"io"
 	"os"
 	"os/exec"
+	"path/filepath"
+	"regexp"
+	"runtime/debug"
 	"strings"
+	"sync"
 	"time"
 
 	"github.com/88250/gulu"
-	"github.com/dgraph-io/ristretto"
+	"github.com/dustin/go-humanize"
 	"github.com/siyuan-note/logging"
 )
 
 var (
-	tesseractEnabled bool
-	tesseractErrCnt  int
+	tesseractEnabled   bool
+	assetsTexts        = map[string]string{}
+	assetsTextsLock    = sync.Mutex{}
+	assetsTextsChanged = false
 )
 
-func initTesseract() {
-	ver := getTesseractVer()
-	if "" == ver {
-		return
-	}
-
-	logging.LogInfof("tesseract-ocr enabled [ver=%s]", ver)
-}
-
-func getTesseractVer() (ret string) {
-	cmd := exec.Command("tesseract", "--version")
-	gulu.CmdAttr(cmd)
-	data, err := cmd.CombinedOutput()
-	if nil == err && strings.HasPrefix(string(data), "tesseract v") {
-		parts := bytes.Split(data, []byte("\n"))
-		if 0 < len(parts) {
-			ret = strings.TrimPrefix(string(parts[0]), "tesseract ")
-			ret = strings.TrimSpace(ret)
-			tesseractEnabled = true
-		}
-		return
-	}
-	return
+// GetAssetText returns the OCR text recorded in assetsTexts under the given
+// key, or an empty string when no entry exists for it. The lookup is done
+// while holding assetsTextsLock.
+func GetAssetText(assets string) string {
+	assetsTextsLock.Lock()
+	text := assetsTexts[assets]
+	assetsTextsLock.Unlock()
+	return text
 }
 
-var ocrResultCache, _ = ristretto.NewCache(&ristretto.Config{
-	NumCounters: 100000,
-	MaxCost:     1000 * 1000 * 64,
-	BufferItems: 64,
-})
-
 func Tesseract(imgAbsPath string) string {
 	if ContainerStd != Container || !tesseractEnabled {
 		return ""
@@ -75,14 +57,9 @@ func Tesseract(imgAbsPath string) string {
 		return ""
 	}
 
-	cached, ok := ocrResultCache.Get(imgAbsPath)
-	if ok {
-		return cached.(string)
-	}
-
 	defer logging.Recover()
 
-	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
+	ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
 	defer cancel()
 
 	now := time.Now()
@@ -91,25 +68,178 @@ func Tesseract(imgAbsPath string) string {
 	output, err := cmd.CombinedOutput()
 	if ctx.Err() == context.DeadlineExceeded {
 		logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
-		tesseractErrCnt++
+		assetsTexts[imgAbsPath] = ""
+		assetsTextsChanged = true
 		return ""
 	}
 
 	if nil != err {
 		logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
-		tesseractErrCnt++
+		assetsTexts[imgAbsPath] = ""
+		assetsTextsChanged = true
 		return ""
 	}
 
-	if 16 < tesseractErrCnt {
-		tesseractEnabled = false
-		logging.LogWarnf("disable tesseract-ocr caused by too many errors")
-	}
-
 	ret := string(output)
 	ret = strings.ReplaceAll(ret, "\r", "")
 	ret = strings.ReplaceAll(ret, "\n", "")
+	ret = strings.ReplaceAll(ret, "\t", " ")
+	reg := regexp.MustCompile("\\s{2,}")
+	ret = reg.ReplaceAllString(ret, " ")
 	logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds())
-	ocrResultCache.Set(imgAbsPath, ret, info.Size())
+	assetsTexts[imgAbsPath] = ret
+	assetsTextsChanged = true
 	return ret
 }
+
+// AutoOCRAssets loops forever, running Tesseract over every image returned by
+// getUnOCRAssetsAbsPaths and then sleeping for seven seconds. It returns
+// immediately when tesseract is not enabled.
+//
+// Each result is recorded in assetsTexts under the "assets/..." relative key,
+// which is the key form getUnOCRAssetsAbsPaths checks when deciding whether an
+// image still needs OCR. Tesseract itself records results under the absolute
+// path only, so without this step every image would be processed again on
+// each pass.
+// NOTE(review): GetAssetText callers presumably pass the same relative form
+// (a link destination such as "assets/foo.png") — confirm against callers.
+func AutoOCRAssets() {
+	if !tesseractEnabled {
+		return
+	}
+
+	for {
+		assetsPath := GetDataAssetsAbsPath()
+		for _, absPath := range getUnOCRAssetsAbsPaths() {
+			text := Tesseract(absPath)
+
+			// Same key derivation as in getUnOCRAssetsAbsPaths.
+			key := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsPath))
+			assetsTextsLock.Lock()
+			assetsTexts[key] = text
+			assetsTextsChanged = true
+			assetsTextsLock.Unlock()
+		}
+		time.Sleep(7 * time.Second)
+	}
+}
+
+// getUnOCRAssetsAbsPaths walks the data assets directory and returns the
+// absolute paths of .png, .jpg and .jpeg files (matched case-insensitively)
+// that have no entry in assetsTexts under their "assets/..." relative key.
+// Directories whose name starts with "." are not descended into.
+func getUnOCRAssetsAbsPaths() (ret []string) {
+	assetsPath := GetDataAssetsAbsPath()
+	var assetsPaths []string
+	filepath.Walk(assetsPath, func(path string, info os.FileInfo, err error) error {
+		if nil != err || nil == info {
+			// Walk passes a nil info together with the error when it cannot
+			// stat an entry (for example a missing assets directory); skip
+			// the entry instead of dereferencing info.
+			return nil
+		}
+
+		name := info.Name()
+		if info.IsDir() {
+			if strings.HasPrefix(name, ".") {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+
+		lowerName := strings.ToLower(name)
+		if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") {
+			return nil
+		}
+
+		assetsPaths = append(assetsPaths, path)
+		return nil
+	})
+
+	// assetsTexts is guarded by assetsTextsLock elsewhere in this file; hold
+	// it for the lookups here as well.
+	assetsTextsLock.Lock()
+	defer assetsTextsLock.Unlock()
+	for _, absPath := range assetsPaths {
+		p := strings.TrimPrefix(absPath, assetsPath)
+		p = "assets" + filepath.ToSlash(p)
+		if _, ok := assetsTexts[p]; ok {
+			continue
+		}
+		ret = append(ret, absPath)
+	}
+	return
+}
+
+func AutoFlushAssetsTexts() {
+	for {
+		SaveAssetsTexts()
+		time.Sleep(7 * time.Second)
+	}
+}
+
+func LoadAssetsTexts() {
+	assetsPath := GetDataAssetsAbsPath()
+	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
+	if !gulu.File.IsExist(assetsTextsPath) {
+		return
+	}
+
+	start := time.Now()
+	var err error
+	fh, err := os.OpenFile(assetsTextsPath, os.O_RDWR, 0644)
+	if nil != err {
+		logging.LogErrorf("open assets texts failed: %s", err)
+		return
+	}
+	defer fh.Close()
+
+	data, err := io.ReadAll(fh)
+	if nil != err {
+		logging.LogErrorf("read assets texts failed: %s", err)
+		return
+	}
+
+	assetsTextsLock.Lock()
+	if err = gulu.JSON.UnmarshalJSON(data, &assetsTexts); nil != err {
+		logging.LogErrorf("unmarshal assets texts failed: %s", err)
+		if err = os.RemoveAll(assetsTextsPath); nil != err {
+			logging.LogErrorf("removed corrupted assets texts failed: %s", err)
+		}
+		return
+	}
+	assetsTextsLock.Unlock()
+	debug.FreeOSMemory()
+
+	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
+		logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
+	}
+	return
+}
+
+// SaveAssetsTexts writes assetsTexts as indented JSON to ocr-texts.json in the
+// data assets directory. It does nothing when assetsTextsChanged is false. A
+// save that takes longer than two seconds is logged as a warning.
+func SaveAssetsTexts() {
+	start := time.Now()
+
+	assetsTextsLock.Lock()
+	if !assetsTextsChanged {
+		assetsTextsLock.Unlock()
+		return
+	}
+	data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", "  ")
+	if nil == err {
+		// Clear the flag while the lock is still held: a change recorded under
+		// the lock after this snapshot sets it again and is picked up by the
+		// next save instead of being discarded.
+		assetsTextsChanged = false
+	}
+	assetsTextsLock.Unlock()
+	if nil != err {
+		logging.LogErrorf("marshal assets texts failed: %s", err)
+		return
+	}
+
+	assetsPath := GetDataAssetsAbsPath()
+	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
+	if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
+		logging.LogErrorf("write assets texts failed: %s", err)
+		// The snapshot never reached disk; mark the data as changed again so a
+		// later save retries.
+		assetsTextsLock.Lock()
+		assetsTextsChanged = true
+		assetsTextsLock.Unlock()
+		return
+	}
+	debug.FreeOSMemory()
+
+	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
+		logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
+	}
+}
+
+// initTesseract asks getTesseractVer for the tesseract version and, when a
+// non-empty version comes back, logs that tesseract-ocr is enabled.
+func initTesseract() {
+	if ver := getTesseractVer(); "" != ver {
+		logging.LogInfof("tesseract-ocr enabled [ver=%s]", ver)
+	}
+}
+
+// getTesseractVer runs "tesseract --version" and returns the version taken
+// from the first output line (the "tesseract " prefix removed, surrounding
+// whitespace trimmed). On success it also sets tesseractEnabled to true.
+//
+// It returns an empty string, leaving tesseractEnabled untouched, when
+// Container is not ContainerStd, when the command fails, or when the output
+// does not start with "tesseract v".
+func getTesseractVer() (ret string) {
+	if ContainerStd != Container {
+		return
+	}
+
+	cmd := exec.Command("tesseract", "--version")
+	gulu.CmdAttr(cmd)
+	output, err := cmd.CombinedOutput()
+	if nil != err || !strings.HasPrefix(string(output), "tesseract v") {
+		return
+	}
+
+	lines := bytes.Split(output, []byte("\n"))
+	if 0 == len(lines) {
+		return
+	}
+
+	firstLine := string(lines[0])
+	ret = strings.TrimSpace(strings.TrimPrefix(firstLine, "tesseract "))
+	tesseractEnabled = true
+	return
+}