Browse Source

:art: OCR no longer blocks document loading https://github.com/siyuan-note/siyuan/issues/9230

Daniel 1 year ago
parent
commit
283917a9a8

+ 8 - 0
kernel/cache/asset.go

@@ -51,6 +51,14 @@ func RemoveAsset(path string) {
 	delete(assetsCache, path)
 }
 
+// ExistAsset reports whether path is currently a key of assetsCache.
+// The lookup runs while holding assetsLock.
+func ExistAsset(path string) (ret bool) {
+	assetsLock.Lock()
+	defer assetsLock.Unlock()
+
+	_, found := assetsCache[path]
+	ret = found
+	return
+}
+
 func LoadAssets() {
 	defer logging.Recover()
 

+ 2 - 2
kernel/job/cron.go

@@ -41,8 +41,8 @@ func StartCron() {
 	go every(10*time.Minute, model.FixIndexJob)
 	go every(10*time.Minute, model.IndexEmbedBlockJob)
 	go every(10*time.Minute, model.CacheVirtualBlockRefJob)
-	go every(12*time.Second, model.OCRAssetsJob)
-	go every(12*time.Second, model.FlushAssetsTextsJob)
+	go every(30*time.Second, model.OCRAssetsJob)
+	go every(30*time.Second, model.FlushAssetsTextsJob)
 	go every(30*time.Second, model.HookDesktopUIProcJob)
 }
 

+ 2 - 2
kernel/model/ocr.go

@@ -21,7 +21,7 @@ func OCRAssetsJob() {
 		return
 	}
 
-	task.AppendTaskWithTimeout(task.OCRImage, 7*time.Second, autoOCRAssets)
+	task.AppendTaskWithTimeout(task.OCRImage, 30*time.Second, autoOCRAssets)
 }
 
 func autoOCRAssets() {
@@ -40,7 +40,7 @@ func autoOCRAssets() {
 			if "" != text {
 				util.AssetsTextsChanged = true
 			}
-			if 4 <= i { // 一次任务中最多处理 4 张图片,防止卡顿
+			if 7 <= i { // 一次任务中最多处理 7 张图片,防止长时间占用系统资源
 				break
 			}
 		}

+ 41 - 0
kernel/sql/block.go

@@ -20,6 +20,9 @@ import (
 	"database/sql"
 
 	"github.com/siyuan-note/siyuan/kernel/cache"
+	"github.com/siyuan-note/siyuan/kernel/filesys"
+	"github.com/siyuan-note/siyuan/kernel/treenode"
+	"github.com/siyuan-note/siyuan/kernel/util"
 )
 
 type Block struct {
@@ -88,3 +91,41 @@ func updateBlockContent(tx *sql.Tx, block *Block) (err error) {
 	putBlockCache(block)
 	return
 }
+
+// indexNode recomputes the static content of the block identified by id and
+// writes it to the content column of blocks, blocks_fts and, when
+// caseSensitive is false, blocks_fts_case_insensitive.
+//
+// It returns nil without touching the database when the block tree entry, the
+// tree returned by filesys.LoadTree, or the node inside that tree is missing;
+// the error returned by LoadTree is discarded and only a nil tree is checked.
+// When a statement fails, tx is rolled back and that error is returned.
+func indexNode(tx *sql.Tx, id string) (err error) {
+	blockTree := treenode.GetBlockTree(id)
+	if nil == blockTree {
+		return
+	}
+
+	tree, _ := filesys.LoadTree(blockTree.BoxID, blockTree.Path, util.NewLute())
+	if nil == tree {
+		return
+	}
+
+	target := treenode.GetNodeInTree(tree, id)
+	if nil == target {
+		return
+	}
+
+	content := treenode.NodeStaticContent(target, nil, true, indexAssetPath)
+
+	// Statements run in this order; the case-insensitive table is only
+	// updated when case sensitivity is off.
+	stmts := []string{
+		"UPDATE blocks SET content = ? WHERE id = ?",
+		"UPDATE blocks_fts SET content = ? WHERE id = ?",
+	}
+	if !caseSensitive {
+		stmts = append(stmts, "UPDATE blocks_fts_case_insensitive SET content = ? WHERE id = ?")
+	}
+
+	for _, stmt := range stmts {
+		if err = execStmtTx(tx, stmt, content, id); nil != err {
+			tx.Rollback()
+			return
+		}
+	}
+	return
+}

+ 15 - 0
kernel/sql/database.go

@@ -798,9 +798,18 @@ func buildBlockFromNode(n *ast.Node, tree *parse.Tree) (block *Block, attributes
 		length = utf8.RuneCountInString(fcontent)
 	} else if n.IsContainerBlock() {
 		markdown = treenode.ExportNodeStdMd(n, luteEngine)
+
+		if !treenode.IsNodeOCRed(n) {
+			IndexNodeQueue(n.ID)
+		}
 		content = treenode.NodeStaticContent(n, nil, true, indexAssetPath)
 		fc := treenode.FirstLeafBlock(n)
+
+		if !treenode.IsNodeOCRed(fc) {
+			IndexNodeQueue(fc.ID)
+		}
 		fcontent = treenode.NodeStaticContent(fc, nil, true, false)
+
 		parentID = n.Parent.ID
 		// 将标题块作为父节点
 		if h := heading(n); nil != h {
@@ -809,7 +818,13 @@ func buildBlockFromNode(n *ast.Node, tree *parse.Tree) (block *Block, attributes
 		length = utf8.RuneCountInString(fcontent)
 	} else {
 		markdown = treenode.ExportNodeStdMd(n, luteEngine)
+
+		if !treenode.IsNodeOCRed(n) {
+			IndexNodeQueue(n.ID)
+		}
+
 		content = treenode.NodeStaticContent(n, nil, true, indexAssetPath)
+
 		parentID = n.Parent.ID
 		// 将标题块作为父节点
 		if h := heading(n); nil != h {

+ 17 - 0
kernel/sql/queue.go

@@ -51,6 +51,7 @@ type dbQueueOperation struct {
 	box                           string      // delete_box/delete_box_refs/index
 	renameTree                    *parse.Tree // rename/rename_sub_tree
 	block                         *Block      // update_block_content
+	id                            string      // index_node
 	removeAssetHashes             []string    // delete_assets
 }
 
@@ -191,6 +192,8 @@ func execOp(op *dbQueueOperation, tx *sql.Tx, context map[string]interface{}) (e
 		err = updateBlockContent(tx, op.block)
 	case "delete_assets":
 		err = deleteAssetsByHashes(tx, op.removeAssetHashes)
+	case "index_node":
+		err = indexNode(tx, op.id)
 	default:
 		msg := fmt.Sprintf("unknown operation [%s]", op.action)
 		logging.LogErrorf(msg)
@@ -199,6 +202,20 @@ func execOp(op *dbQueueOperation, tx *sql.Tx, context map[string]interface{}) (e
 	return
 }
 
+// IndexNodeQueue schedules an "index_node" operation for the block id.
+//
+// While holding dbQueueLock it scans operationQueue: if an "index_node"
+// operation for the same id is already queued, that slot is overwritten with
+// the new operation (refreshing inQueueTime); otherwise the new operation is
+// appended to the end of the queue.
+func IndexNodeQueue(id string) {
+	dbQueueLock.Lock()
+	defer dbQueueLock.Unlock()
+
+	pending := &dbQueueOperation{action: "index_node", id: id, inQueueTime: time.Now()}
+	for idx := range operationQueue {
+		queued := operationQueue[idx]
+		if "index_node" == queued.action && id == queued.id {
+			operationQueue[idx] = pending
+			return
+		}
+	}
+	operationQueue = append(operationQueue, pending)
+}
+
 func BatchRemoveAssetsQueue(hashes []string) {
 	if 1 > len(hashes) {
 		return

+ 26 - 0
kernel/treenode/node.go

@@ -137,6 +137,32 @@ func ExportNodeStdMd(node *ast.Node, luteEngine *lute.Lute) string {
 	return markdown
 }
 
+// IsNodeOCRed reports whether every cached image asset under node already has
+// an OCR text entry.
+//
+// The subtree is walked on entry events only. For each image node that has a
+// link destination child, the destination string is checked: destinations that
+// cache.ExistAsset does not know are ignored, and the first known asset for
+// which util.ExistsAssetText reports no entry sets the result to false and
+// stops the walk. A subtree without such an image yields true.
+//
+// A nil node yields true (there is nothing to OCR), matching the nil guard
+// NodeStaticContent applies to its node argument, so a caller holding a
+// possibly-missing node never lets the walker read n.Type on it.
+func IsNodeOCRed(node *ast.Node) (ret bool) {
+	if nil == node {
+		return true
+	}
+
+	ret = true
+	ast.Walk(node, func(n *ast.Node, entering bool) ast.WalkStatus {
+		if !entering || ast.NodeImage != n.Type {
+			return ast.WalkContinue
+		}
+
+		linkDest := n.ChildByType(ast.NodeLinkDest)
+		if nil == linkDest {
+			return ast.WalkContinue
+		}
+
+		linkDestStr := linkDest.TokensStr()
+		// Only assets present in the asset cache are considered; the cache
+		// check runs first so unknown destinations never hit the text lookup.
+		if cache.ExistAsset(linkDestStr) && !util.ExistsAssetText(linkDestStr) {
+			ret = false
+			return ast.WalkStop
+		}
+		return ast.WalkContinue
+	})
+	return
+}
+
 func NodeStaticContent(node *ast.Node, excludeTypes []string, includeTextMarkATitleURL, includeAssetPath bool) string {
 	if nil == node {
 		return ""

+ 12 - 7
kernel/util/tesseract.go

@@ -52,27 +52,32 @@ func SetAssetText(asset, text string) {
 	AssetsTextsChanged = true
 }
 
-func GetAssetText(asset string, force bool) string {
+// ExistsAssetText reports whether AssetsTexts holds an entry for asset,
+// regardless of whether the stored text is empty. The lookup runs while
+// holding AssetsTextsLock.
+func ExistsAssetText(asset string) (ret bool) {
+	AssetsTextsLock.Lock()
+	defer AssetsTextsLock.Unlock()
+
+	_, found := AssetsTexts[asset]
+	return found
+}
+
+func GetAssetText(asset string, force bool) (ret string) {
 	if !force {
 		AssetsTextsLock.Lock()
-		ret, ok := AssetsTexts[asset]
+		ret = AssetsTexts[asset]
 		AssetsTextsLock.Unlock()
-		if ok {
-			return ret
-		}
+		return
 	}
 
 	assetsPath := GetDataAssetsAbsPath()
 	assetAbsPath := strings.TrimPrefix(asset, "assets")
 	assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
-	ret := Tesseract(assetAbsPath)
+	ret = Tesseract(assetAbsPath)
 	AssetsTextsLock.Lock()
 	AssetsTexts[asset] = ret
 	AssetsTextsLock.Unlock()
 	if "" != ret {
 		AssetsTextsChanged = true
 	}
-	return ret
+	return
 }
 
 func IsTesseractExtractable(p string) bool {