Merge remote-tracking branch 'origin/dev' into dev

This commit is contained in:
Vanessa 2023-01-16 22:43:53 +08:00
commit 3d20b5c834
9 changed files with 356 additions and 349 deletions

View file

@ -30,8 +30,8 @@
"leftRightLayout": "左右佈局",
"topBottomLayout": "上下佈局",
"keyword": "關鍵字",
"searchMethod":"搜索方式",
"regex":"正則表達式",
"searchMethod": "搜索方式",
"regex": "正則表達式",
"keywordsLimit": "關鍵字數量限制",
"exportAsImage": "導出為圖片",
"exportBySiYuan": "由思源筆記導出",

View file

@ -40,7 +40,7 @@ func main() {
model.BootSyncData()
model.InitBoxes()
model.InitFlashcards()
util.LoadAssetsTexts()
model.LoadAssetsTexts()
go model.AutoGenerateDocHistory()
go model.AutoSync()
@ -53,8 +53,8 @@ func main() {
go treenode.AutoFlushBlockTree()
go cache.LoadAssets()
go model.AutoFixIndex()
go util.AutoOCRAssets()
go util.AutoFlushAssetsTexts()
go model.AutoOCRAssets()
go model.AutoFlushAssetsTexts()
go model.HookDesktopUIProc()
model.WatchAssets()
model.HandleSignal()

View file

@ -54,7 +54,7 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang
model.BootSyncData()
model.InitBoxes()
model.InitFlashcards()
util.LoadAssetsTexts()
model.LoadAssetsTexts()
go model.AutoGenerateDocHistory()
go model.AutoSync()
@ -67,8 +67,8 @@ func StartKernel(container, appDir, workspaceBaseDir, timezoneID, localIPs, lang
go treenode.AutoFlushBlockTree()
go cache.LoadAssets()
go model.AutoFixIndex()
go util.AutoOCRAssets()
go util.AutoFlushAssetsTexts()
go model.AutoOCRAssets()
go model.AutoFlushAssetsTexts()
}()
}

View file

@ -197,7 +197,6 @@ func NetImg2LocalAssets(rootID string) (err error) {
if err = writeJSONQueue(tree); nil != err {
return
}
sql.WaitForWritingDatabase()
util.PushUpdateMsg(msgId, fmt.Sprintf(Conf.Language(120), files), 5000)
} else {
util.PushUpdateMsg(msgId, Conf.Language(121), 3000)

View file

@ -428,7 +428,7 @@ func Close(force bool, execInstallPkg int) (exitCode int) {
Conf.Close()
sql.CloseDatabase()
treenode.SaveBlockTree(false)
util.SaveAssetsTexts()
SaveAssetsTexts()
clearWorkspaceTemp()
clearPortJSON()
util.UnlockWorkspace()

184
kernel/model/ocr.go Normal file
View file

@ -0,0 +1,184 @@
package model
import (
"github.com/dustin/go-humanize"
"io"
"os"
"path/filepath"
"runtime"
"runtime/debug"
"strings"
"sync"
"time"
"github.com/88250/gulu"
"github.com/panjf2000/ants/v2"
"github.com/siyuan-note/logging"
"github.com/siyuan-note/siyuan/kernel/cache"
"github.com/siyuan-note/siyuan/kernel/util"
)
// AutoOCRAssets drives the background OCR loop for image assets.
//
// It returns immediately when util.TesseractEnabled is false. Otherwise it
// never returns: it runs one autoOCRAssets pass, sleeps for 7 seconds and
// repeats. The enabled flag is only inspected once, on entry.
func AutoOCRAssets() {
	const interval = 7 * time.Second

	if enabled := util.TesseractEnabled; !enabled {
		return
	}

	for {
		autoOCRAssets()
		time.Sleep(interval)
	}
}
// autoOCRAssets performs a single OCR pass.
//
// It collects the image assets that have no entry in util.AssetsTexts yet,
// runs util.Tesseract on them with at most min(NumCPU, 4) concurrent workers,
// stores every result under its "assets/..." key and finally removes entries
// whose files no longer exist. Panics are caught by logging.Recover.
func autoOCRAssets() {
	defer logging.Recover()

	assetsPath := util.GetDataAssetsAbsPath()
	assets := getUnOCRAssetsAbsPaths()

	// Only spin up a worker pool when there is actually something to recognize;
	// this function is invoked every few seconds.
	if 0 < len(assets) {
		poolSize := runtime.NumCPU()
		if 4 < poolSize {
			poolSize = 4
		}

		waitGroup := &sync.WaitGroup{}
		pool, err := ants.NewPoolWithFunc(poolSize, func(arg interface{}) {
			defer waitGroup.Done()

			assetAbsPath := arg.(string)
			text := util.Tesseract(assetAbsPath)
			// Convert the absolute path back into the "assets/..." form used as map key.
			key := "assets" + filepath.ToSlash(strings.TrimPrefix(assetAbsPath, assetsPath))

			// The dirty flag is updated together with the map so both stay consistent.
			util.AssetsTextsLock.Lock()
			util.AssetsTexts[key] = text
			util.AssetsTextsChanged = true
			util.AssetsTextsLock.Unlock()
		})
		if nil != err {
			logging.LogErrorf("create OCR worker pool failed: %s", err)
		} else {
			for _, assetAbsPath := range assets {
				waitGroup.Add(1)
				if invokeErr := pool.Invoke(assetAbsPath); nil != invokeErr {
					// The worker never ran, so balance the counter here;
					// otherwise Wait below would block forever.
					waitGroup.Done()
					logging.LogErrorf("submit OCR task [%s] failed: %s", assetAbsPath, invokeErr)
				}
			}
			waitGroup.Wait()
			pool.Release()
		}
	}

	cleanNotFoundAssetsTexts()
}
// cleanNotFoundAssetsTexts removes entries from util.AssetsTexts whose asset
// file no longer exists under the data assets directory, and marks the texts
// as changed when at least one entry was removed.
func cleanNotFoundAssetsTexts() {
	// Assigning the map to a local variable does not copy it, so the keys are
	// snapshotted under the lock; ranging over the shared map while another
	// goroutine writes to it would crash the process.
	util.AssetsTextsLock.Lock()
	assets := make([]string, 0, len(util.AssetsTexts))
	for asset := range util.AssetsTexts {
		assets = append(assets, asset)
	}
	util.AssetsTextsLock.Unlock()

	// File system checks are done without holding the lock.
	assetsPath := util.GetDataAssetsAbsPath()
	var toRemoves []string
	for _, asset := range assets {
		assetAbsPath := filepath.Join(assetsPath, strings.TrimPrefix(asset, "assets"))
		if !gulu.File.IsExist(assetAbsPath) {
			toRemoves = append(toRemoves, asset)
		}
	}
	if 1 > len(toRemoves) {
		return
	}

	util.AssetsTextsLock.Lock()
	for _, asset := range toRemoves {
		delete(util.AssetsTexts, asset)
	}
	util.AssetsTextsChanged = true
	util.AssetsTextsLock.Unlock()
}
// getUnOCRAssetsAbsPaths returns the absolute paths of all cached assets that
// are PNG/JPG/JPEG images (matched case-insensitively by suffix) and that do
// not have an entry in util.AssetsTexts yet.
func getUnOCRAssetsAbsPaths() (ret []string) {
	// Gather the candidate image paths first so the texts lock is held only
	// for the map lookups below.
	var imagePaths []string
	for _, asset := range cache.GetAssets() {
		lowerName := strings.ToLower(asset.Path)
		if strings.HasSuffix(lowerName, ".png") || strings.HasSuffix(lowerName, ".jpg") || strings.HasSuffix(lowerName, ".jpeg") {
			imagePaths = append(imagePaths, asset.Path)
		}
	}

	assetsPath := util.GetDataAssetsAbsPath()

	// The shared map is written by OCR workers and GetAssetText, so it must be
	// read under the lock.
	util.AssetsTextsLock.Lock()
	defer util.AssetsTextsLock.Unlock()
	for _, assetPath := range imagePaths {
		if _, ok := util.AssetsTexts[assetPath]; ok {
			continue
		}
		ret = append(ret, filepath.Join(assetsPath, strings.TrimPrefix(assetPath, "assets")))
	}
	return
}
// AutoFlushAssetsTexts never returns: it calls SaveAssetsTexts, sleeps for
// 7 seconds and repeats.
func AutoFlushAssetsTexts() {
	const interval = 7 * time.Second

	for ; ; time.Sleep(interval) {
		SaveAssetsTexts()
	}
}
// LoadAssetsTexts reads "ocr-texts.json" from the data assets directory and
// merges its entries into util.AssetsTexts.
//
// A missing file is not an error. A file that cannot be decoded is logged and
// removed, leaving util.AssetsTexts untouched. A warning is logged when
// loading takes longer than 2 seconds.
func LoadAssetsTexts() {
	assetsPath := util.GetDataAssetsAbsPath()
	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
	if !gulu.File.IsExist(assetsTextsPath) {
		return
	}

	start := time.Now()
	fh, err := os.Open(assetsTextsPath)
	if nil != err {
		logging.LogErrorf("open assets texts failed: %s", err)
		return
	}
	data, err := io.ReadAll(fh)
	// Close right away: a corrupted file is removed below and removing a file
	// that is still open fails on Windows.
	fh.Close()
	if nil != err {
		logging.LogErrorf("read assets texts failed: %s", err)
		return
	}

	// Decode into a private map so a decoding failure can neither leave the
	// shared map half-populated nor leave the lock held.
	loaded := map[string]string{}
	if err = gulu.JSON.UnmarshalJSON(data, &loaded); nil != err {
		logging.LogErrorf("unmarshal assets texts failed: %s", err)
		if err = os.RemoveAll(assetsTextsPath); nil != err {
			logging.LogErrorf("removed corrupted assets texts failed: %s", err)
		}
		return
	}

	util.AssetsTextsLock.Lock()
	if nil == util.AssetsTexts {
		util.AssetsTexts = map[string]string{}
	}
	for asset, text := range loaded {
		util.AssetsTexts[asset] = text
	}
	util.AssetsTextsLock.Unlock()

	debug.FreeOSMemory()

	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
		logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
	}
}
// SaveAssetsTexts writes util.AssetsTexts to "ocr-texts.json" in the data
// assets directory when util.AssetsTextsChanged is set.
//
// The dirty flag is cleared at the moment the snapshot is taken, so changes
// made while the file is being written are picked up by the next call. If the
// write fails the flag is set again so the save is retried. A warning is
// logged when saving takes longer than 2 seconds.
func SaveAssetsTexts() {
	start := time.Now()

	util.AssetsTextsLock.Lock()
	if !util.AssetsTextsChanged {
		util.AssetsTextsLock.Unlock()
		return
	}
	data, err := gulu.JSON.MarshalIndentJSON(util.AssetsTexts, "", " ")
	if nil == err {
		util.AssetsTextsChanged = false
	}
	// Unlock on both the success and the failure path; returning with the lock
	// held would block every later access to the texts.
	util.AssetsTextsLock.Unlock()
	if nil != err {
		logging.LogErrorf("marshal assets texts failed: %s", err)
		return
	}

	assetsPath := util.GetDataAssetsAbsPath()
	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
	if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
		logging.LogErrorf("write assets texts failed: %s", err)

		util.AssetsTextsLock.Lock()
		util.AssetsTextsChanged = true
		util.AssetsTextsLock.Unlock()
		return
	}

	debug.FreeOSMemory()

	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
		logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
	}
}

View file

@ -18,6 +18,7 @@ package treenode
import (
"bytes"
util2 "github.com/siyuan-note/siyuan/kernel/util"
"strings"
"sync"
@ -31,7 +32,6 @@ import (
"github.com/88250/lute/render"
"github.com/88250/lute/util"
"github.com/siyuan-note/logging"
util2 "github.com/siyuan-note/siyuan/kernel/util"
)
func GetBlockRef(n *ast.Node) (blockRefID, blockRefText, blockRefSubtype string) {

View file

@ -1,338 +0,0 @@
// SiYuan - Build Your Eternal Digital Garden
// Copyright (c) 2020-present, b3log.org
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package util
import (
"bytes"
"context"
"io"
"os"
"os/exec"
"path/filepath"
"regexp"
"runtime"
"runtime/debug"
"strings"
"sync"
"time"
"github.com/88250/gulu"
"github.com/dustin/go-humanize"
"github.com/panjf2000/ants/v2"
"github.com/siyuan-note/logging"
)
var (
	// tesseractEnabled is switched on by getTesseractVer when the tesseract
	// binary reports a version, and switched off again by initTesseract when
	// no languages are found.
	tesseractEnabled bool

	// tesseractLangs holds the language names passed to tesseract via "-l".
	tesseractLangs []string

	// assetsTexts maps an asset path ("assets/...") to its OCR text.
	assetsTexts = map[string]string{}

	// assetsTextsLock guards assetsTexts.
	assetsTextsLock sync.Mutex

	// assetsTextsChanged marks assetsTexts as needing to be saved.
	assetsTextsChanged bool
)
// GetAssetText returns the OCR text of the given asset ("assets/..." path).
//
// A cached entry in assetsTexts is returned as is. Otherwise the asset is
// recognized with Tesseract, the result is cached and, when it is not empty,
// the texts are marked as changed so that the new entry gets saved.
func GetAssetText(asset string) string {
	assetsTextsLock.Lock()
	ret, ok := assetsTexts[asset]
	assetsTextsLock.Unlock()
	if ok {
		return ret
	}

	// OCR can take seconds, so it runs without holding the lock.
	assetAbsPath := filepath.Join(GetDataAssetsAbsPath(), strings.TrimPrefix(asset, "assets"))
	ret = Tesseract(assetAbsPath)

	assetsTextsLock.Lock()
	assetsTexts[asset] = ret
	if "" != ret {
		// Without the dirty flag the freshly recognized text would only live
		// in memory until some other change triggers a save.
		assetsTextsChanged = true
	}
	assetsTextsLock.Unlock()
	return ret
}
// ocrSpacesRegexp matches runs of two or more whitespace characters. It is
// compiled once instead of on every Tesseract call.
var ocrSpacesRegexp = regexp.MustCompile("\\s{2,}")

// Tesseract runs the tesseract command line tool on the image at imgAbsPath
// and returns the recognized text collapsed onto a single line.
//
// It returns "" when the container is not ContainerStd, when tesseract is not
// enabled, when the file cannot be stat'ed, when the command fails, or when it
// does not finish within 7 seconds.
func Tesseract(imgAbsPath string) string {
	if ContainerStd != Container || !tesseractEnabled {
		return ""
	}

	info, err := os.Stat(imgAbsPath)
	if nil != err {
		return ""
	}

	defer logging.Recover()

	ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
	defer cancel()

	now := time.Now()
	cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+"))
	gulu.CmdAttr(cmd)
	output, err := cmd.CombinedOutput()
	// Check the deadline first so a killed process is reported as a timeout
	// rather than as a generic failure.
	if ctx.Err() == context.DeadlineExceeded {
		logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
		return ""
	}
	if nil != err {
		logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
		return ""
	}

	// Drop line breaks, turn tabs into spaces and squeeze whitespace runs.
	ret := string(output)
	ret = strings.ReplaceAll(ret, "\r", "")
	ret = strings.ReplaceAll(ret, "\n", "")
	ret = strings.ReplaceAll(ret, "\t", " ")
	ret = ocrSpacesRegexp.ReplaceAllString(ret, " ")
	logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds())
	return ret
}
// AutoOCRAssets drives the background OCR loop for image assets.
//
// It returns immediately when tesseractEnabled is false. Otherwise it never
// returns: it runs one autoOCRAssets pass, sleeps for 7 seconds and repeats.
// The enabled flag is only inspected once, on entry.
func AutoOCRAssets() {
	const interval = 7 * time.Second

	if enabled := tesseractEnabled; !enabled {
		return
	}

	for {
		autoOCRAssets()
		time.Sleep(interval)
	}
}
// autoOCRAssets performs a single OCR pass.
//
// It collects the image assets that have no entry in assetsTexts yet, runs
// Tesseract on them with at most min(NumCPU, 4) concurrent workers, stores
// every result under its "assets/..." key and finally removes entries whose
// files no longer exist. Panics are caught by logging.Recover.
func autoOCRAssets() {
	defer logging.Recover()

	assetsPath := GetDataAssetsAbsPath()
	assets := getUnOCRAssetsAbsPaths()

	// Only spin up a worker pool when there is actually something to recognize;
	// this function is invoked every few seconds.
	if 0 < len(assets) {
		poolSize := runtime.NumCPU()
		if 4 < poolSize {
			poolSize = 4
		}

		waitGroup := &sync.WaitGroup{}
		pool, err := ants.NewPoolWithFunc(poolSize, func(arg interface{}) {
			defer waitGroup.Done()

			assetAbsPath := arg.(string)
			text := Tesseract(assetAbsPath)
			// Convert the absolute path back into the "assets/..." form used as map key.
			key := "assets" + filepath.ToSlash(strings.TrimPrefix(assetAbsPath, assetsPath))

			// The dirty flag is updated together with the map so both stay consistent.
			assetsTextsLock.Lock()
			assetsTexts[key] = text
			assetsTextsChanged = true
			assetsTextsLock.Unlock()
		})
		if nil != err {
			logging.LogErrorf("create OCR worker pool failed: %s", err)
		} else {
			for _, assetAbsPath := range assets {
				waitGroup.Add(1)
				if invokeErr := pool.Invoke(assetAbsPath); nil != invokeErr {
					// The worker never ran, so balance the counter here;
					// otherwise Wait below would block forever.
					waitGroup.Done()
					logging.LogErrorf("submit OCR task [%s] failed: %s", assetAbsPath, invokeErr)
				}
			}
			waitGroup.Wait()
			pool.Release()
		}
	}

	cleanNotFoundAssetsTexts()
}
func cleanNotFoundAssetsTexts() {
tmp := assetsTexts
assetsPath := GetDataAssetsAbsPath()
var toRemoves []string
for asset, _ := range tmp {
assetAbsPath := strings.TrimPrefix(asset, "assets")
assetAbsPath = filepath.Join(assetsPath, assetAbsPath)
if !gulu.File.IsExist(assetAbsPath) {
toRemoves = append(toRemoves, asset)
}
}
assetsTextsLock.Lock()
for _, asset := range toRemoves {
delete(assetsTexts, asset)
assetsTextsChanged = true
}
assetsTextsLock.Unlock()
return
}
// getUnOCRAssetsAbsPaths walks the data assets directory and returns the
// absolute paths of all PNG/JPG/JPEG files (matched case-insensitively by
// suffix) that do not have an entry in assetsTexts yet. Directories whose name
// starts with "." are skipped entirely.
func getUnOCRAssetsAbsPaths() (ret []string) {
	assetsPath := GetDataAssetsAbsPath()
	var assetsPaths []string
	filepath.Walk(assetsPath, func(path string, info os.FileInfo, err error) error {
		if nil != err || nil == info {
			// Walk reports unreadable entries with a non-nil err and possibly a
			// nil info; skip them instead of dereferencing info.
			return nil
		}

		name := info.Name()
		if info.IsDir() {
			if strings.HasPrefix(name, ".") {
				return filepath.SkipDir
			}
			return nil
		}

		lowerName := strings.ToLower(name)
		if !strings.HasSuffix(lowerName, ".png") && !strings.HasSuffix(lowerName, ".jpg") && !strings.HasSuffix(lowerName, ".jpeg") {
			return nil
		}

		assetsPaths = append(assetsPaths, path)
		return nil
	})

	// The shared map is written by OCR workers and GetAssetText, so it must be
	// read under the lock.
	assetsTextsLock.Lock()
	defer assetsTextsLock.Unlock()
	for _, absPath := range assetsPaths {
		key := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsPath))
		if _, ok := assetsTexts[key]; ok {
			continue
		}
		ret = append(ret, absPath)
	}
	return
}
// AutoFlushAssetsTexts never returns: it calls SaveAssetsTexts, sleeps for
// 7 seconds and repeats.
func AutoFlushAssetsTexts() {
	const interval = 7 * time.Second

	for ; ; time.Sleep(interval) {
		SaveAssetsTexts()
	}
}
// LoadAssetsTexts reads "ocr-texts.json" from the data assets directory and
// merges its entries into assetsTexts.
//
// A missing file is not an error. A file that cannot be decoded is logged and
// removed, leaving assetsTexts untouched. A warning is logged when loading
// takes longer than 2 seconds.
func LoadAssetsTexts() {
	assetsPath := GetDataAssetsAbsPath()
	assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
	if !gulu.File.IsExist(assetsTextsPath) {
		return
	}

	start := time.Now()
	fh, err := os.Open(assetsTextsPath)
	if nil != err {
		logging.LogErrorf("open assets texts failed: %s", err)
		return
	}
	data, err := io.ReadAll(fh)
	// Close right away: a corrupted file is removed below and removing a file
	// that is still open fails on Windows.
	fh.Close()
	if nil != err {
		logging.LogErrorf("read assets texts failed: %s", err)
		return
	}

	// Decode into a private map so a decoding failure can neither leave the
	// shared map half-populated nor leave the lock held.
	loaded := map[string]string{}
	if err = gulu.JSON.UnmarshalJSON(data, &loaded); nil != err {
		logging.LogErrorf("unmarshal assets texts failed: %s", err)
		if err = os.RemoveAll(assetsTextsPath); nil != err {
			logging.LogErrorf("removed corrupted assets texts failed: %s", err)
		}
		return
	}

	assetsTextsLock.Lock()
	if nil == assetsTexts {
		assetsTexts = map[string]string{}
	}
	for asset, text := range loaded {
		assetsTexts[asset] = text
	}
	assetsTextsLock.Unlock()

	debug.FreeOSMemory()

	if elapsed := time.Since(start).Seconds(); 2 < elapsed {
		logging.LogWarnf("read assets texts [%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
	}
}
func SaveAssetsTexts() {
if !assetsTextsChanged {
return
}
start := time.Now()
assetsTextsLock.Lock()
data, err := gulu.JSON.MarshalIndentJSON(assetsTexts, "", " ")
if nil != err {
logging.LogErrorf("marshal assets texts failed: %s", err)
return
}
assetsTextsLock.Unlock()
assetsPath := GetDataAssetsAbsPath()
assetsTextsPath := filepath.Join(assetsPath, "ocr-texts.json")
if err = gulu.File.WriteFileSafer(assetsTextsPath, data, 0644); nil != err {
logging.LogErrorf("write assets texts failed: %s", err)
return
}
debug.FreeOSMemory()
if elapsed := time.Since(start).Seconds(); 2 < elapsed {
logging.LogWarnf("save assets texts [size=%s] to [%s], elapsed [%.2fs]", humanize.Bytes(uint64(len(data))), assetsTextsPath, elapsed)
}
assetsTextsChanged = false
}
// initTesseract probes the tesseract installation. When no version can be
// determined it does nothing. Otherwise it stores the available languages in
// tesseractLangs; if there are none, it logs a warning and clears
// tesseractEnabled, else it logs the detected version and languages.
func initTesseract() {
	version := getTesseractVer()
	if version == "" {
		return
	}

	tesseractLangs = getTesseractLangs()
	if len(tesseractLangs) > 0 {
		logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", version, strings.Join(tesseractLangs, "+"))
		return
	}

	logging.LogWarnf("no tesseract langs found")
	tesseractEnabled = false
}
// getTesseractVer runs "tesseract --version" and returns the version taken
// from the first output line (the text after the "tesseract " prefix,
// trimmed). On success it also sets tesseractEnabled to true.
//
// It returns "" when the container is not ContainerStd, when the command
// fails, or when the output does not start with "tesseract ".
func getTesseractVer() (ret string) {
	if ContainerStd != Container {
		return
	}

	versionCmd := exec.Command("tesseract", "--version")
	gulu.CmdAttr(versionCmd)
	output, err := versionCmd.CombinedOutput()
	if nil != err || !strings.HasPrefix(string(output), "tesseract ") {
		return
	}

	// Only the first line carries the version.
	firstLine := output
	if idx := bytes.IndexByte(output, '\n'); 0 <= idx {
		firstLine = output[:idx]
	}
	ret = strings.TrimSpace(strings.TrimPrefix(string(firstLine), "tesseract "))
	tesseractEnabled = true
	return
}
// getTesseractLangs runs "tesseract --list-langs" and returns every non-blank
// output line except the first one, trimmed. It returns nil when tesseract is
// not enabled or the command fails.
func getTesseractLangs() (ret []string) {
	if !tesseractEnabled {
		return nil
	}

	listCmd := exec.Command("tesseract", "--list-langs")
	gulu.CmdAttr(listCmd)
	output, err := listCmd.CombinedOutput()
	if nil != err {
		return nil
	}

	lines := bytes.Split(output, []byte("\n"))
	// Index 0 is skipped: the first line is not treated as a language.
	for i := 1; i < len(lines); i++ {
		lang := bytes.TrimSpace(lines[i])
		if 0 < len(lang) {
			ret = append(ret, string(lang))
		}
	}
	return
}

162
kernel/util/tesseract.go Normal file
View file

@ -0,0 +1,162 @@
// SiYuan - Build Your Eternal Digital Garden
// Copyright (c) 2020-present, b3log.org
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package util
import (
"bytes"
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"sync"
"time"
"github.com/88250/gulu"
"github.com/siyuan-note/logging"
)
var (
	// TesseractEnabled is switched on by getTesseractVer when the tesseract
	// binary reports a version, and switched off again by initTesseract when
	// no languages are found.
	TesseractEnabled bool

	// AssetsTexts maps an asset path ("assets/...") to its OCR text.
	AssetsTexts = map[string]string{}

	// AssetsTextsLock guards AssetsTexts.
	AssetsTextsLock sync.Mutex

	// AssetsTextsChanged marks AssetsTexts as needing to be saved.
	AssetsTextsChanged bool

	// tesseractLangs holds the language names passed to tesseract via "-l".
	tesseractLangs []string
)
// GetAssetText returns the OCR text of the given asset ("assets/..." path).
//
// A cached entry in AssetsTexts is returned as is. Otherwise the asset is
// recognized with Tesseract, the result is cached and, when it is not empty,
// the texts are marked as changed so that the new entry gets saved.
func GetAssetText(asset string) string {
	AssetsTextsLock.Lock()
	ret, ok := AssetsTexts[asset]
	AssetsTextsLock.Unlock()
	if ok {
		return ret
	}

	// OCR can take seconds, so it runs without holding the lock.
	assetAbsPath := filepath.Join(GetDataAssetsAbsPath(), strings.TrimPrefix(asset, "assets"))
	ret = Tesseract(assetAbsPath)

	AssetsTextsLock.Lock()
	AssetsTexts[asset] = ret
	if "" != ret {
		// Without the dirty flag the freshly recognized text would only live
		// in memory until some other change triggers a save.
		AssetsTextsChanged = true
	}
	AssetsTextsLock.Unlock()
	return ret
}
// tesseractSpacesRegexp matches runs of two or more whitespace characters. It
// is compiled once instead of on every Tesseract call.
var tesseractSpacesRegexp = regexp.MustCompile("\\s{2,}")

// Tesseract runs the tesseract command line tool on the image at imgAbsPath,
// pushes the result to the status bar and returns the recognized text
// collapsed onto a single line.
//
// It returns "" when the container is not ContainerStd, when tesseract is not
// enabled, when the file cannot be stat'ed, when the command fails, or when it
// does not finish within 7 seconds.
func Tesseract(imgAbsPath string) string {
	if ContainerStd != Container || !TesseractEnabled {
		return ""
	}

	info, err := os.Stat(imgAbsPath)
	if nil != err {
		return ""
	}

	defer logging.Recover()

	ctx, cancel := context.WithTimeout(context.Background(), 7*time.Second)
	defer cancel()

	now := time.Now()
	cmd := exec.CommandContext(ctx, "tesseract", "-c", "debug_file=/dev/null", imgAbsPath, "stdout", "-l", strings.Join(tesseractLangs, "+"))
	gulu.CmdAttr(cmd)
	output, err := cmd.CombinedOutput()
	// Check the deadline first so a killed process is reported as a timeout
	// rather than as a generic failure.
	if ctx.Err() == context.DeadlineExceeded {
		logging.LogWarnf("tesseract [path=%s, size=%d] timeout", imgAbsPath, info.Size())
		return ""
	}
	if nil != err {
		logging.LogWarnf("tesseract [path=%s, size=%d] failed: %s", imgAbsPath, info.Size(), err)
		return ""
	}

	// Drop line breaks, turn tabs into spaces and squeeze whitespace runs.
	ret := string(output)
	ret = strings.ReplaceAll(ret, "\r", "")
	ret = strings.ReplaceAll(ret, "\n", "")
	ret = strings.ReplaceAll(ret, "\t", " ")
	ret = tesseractSpacesRegexp.ReplaceAllString(ret, " ")
	logging.LogInfof("tesseract [path=%s, size=%d, text=%s, elapsed=%dms]", imgAbsPath, info.Size(), ret, time.Since(now).Milliseconds())

	msg := fmt.Sprintf("OCR [%s] [%s]", info.Name(), ret)
	PushStatusBar(msg)
	return ret
}
// initTesseract probes the tesseract installation. When no version can be
// determined it does nothing. Otherwise it stores the available languages in
// tesseractLangs; if there are none, it logs a warning and clears
// TesseractEnabled, else it logs the detected version and languages.
func initTesseract() {
	version := getTesseractVer()
	if version == "" {
		return
	}

	tesseractLangs = getTesseractLangs()
	if len(tesseractLangs) > 0 {
		logging.LogInfof("tesseract-ocr enabled [ver=%s, langs=%s]", version, strings.Join(tesseractLangs, "+"))
		return
	}

	logging.LogWarnf("no tesseract langs found")
	TesseractEnabled = false
}
// getTesseractVer runs "tesseract --version" and returns the version taken
// from the first output line (the text after the "tesseract " prefix,
// trimmed). On success it also sets TesseractEnabled to true.
//
// It returns "" when the container is not ContainerStd, when the command
// fails, or when the output does not start with "tesseract ".
func getTesseractVer() (ret string) {
	if ContainerStd != Container {
		return
	}

	versionCmd := exec.Command("tesseract", "--version")
	gulu.CmdAttr(versionCmd)
	output, err := versionCmd.CombinedOutput()
	if nil != err || !strings.HasPrefix(string(output), "tesseract ") {
		return
	}

	// Only the first line carries the version.
	firstLine := output
	if idx := bytes.IndexByte(output, '\n'); 0 <= idx {
		firstLine = output[:idx]
	}
	ret = strings.TrimSpace(strings.TrimPrefix(string(firstLine), "tesseract "))
	TesseractEnabled = true
	return
}
// getTesseractLangs runs "tesseract --list-langs" and returns every non-blank
// output line except the first one, trimmed. It returns nil when tesseract is
// not enabled or the command fails.
func getTesseractLangs() (ret []string) {
	if !TesseractEnabled {
		return nil
	}

	listCmd := exec.Command("tesseract", "--list-langs")
	gulu.CmdAttr(listCmd)
	output, err := listCmd.CombinedOutput()
	if nil != err {
		return nil
	}

	lines := bytes.Split(output, []byte("\n"))
	// Index 0 is skipped: the first line is not treated as a language.
	for i := 1; i < len(lines); i++ {
		lang := bytes.TrimSpace(lines[i])
		if 0 < len(lang) {
			ret = append(ret, string(lang))
		}
	}
	return
}