Support searching PDF asset content (#8985)
* feat(asset): add PDF parser * test: changed test function name to match parser * asset: add separator * asset: add pdf factory product * test: remove println --------- Co-authored-by: Heiko Besemann <heiko.besemann@qbeyond.de>
This commit is contained in:
parent
ec31ac0741
commit
19a295e157
5 changed files with 125 additions and 0 deletions
|
@ -64,6 +64,11 @@ require (
|
|||
golang.org/x/text v0.12.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/jolestar/go-commons-pool/v2 v2.1.2 // indirect
|
||||
github.com/tetratelabs/wazero v1.3.1 // indirect
|
||||
)
|
||||
|
||||
require (
|
||||
dmitri.shuralyov.com/font/woff2 v0.0.0-20180220214647-957792cbbdab // indirect
|
||||
github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198 // indirect
|
||||
|
@ -111,6 +116,7 @@ require (
|
|||
github.com/juju/errors v1.0.0 // indirect
|
||||
github.com/klauspost/compress v1.16.7 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.2.5 // indirect
|
||||
github.com/klippa-app/go-pdfium v1.6.0
|
||||
github.com/leodido/go-urn v1.2.4 // indirect
|
||||
github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect
|
||||
github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a // indirect
|
||||
|
|
|
@ -173,6 +173,7 @@ github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA=
|
|||
github.com/fatih/set v0.2.1/go.mod h1:+RKtMCH+favT2+3YecHGxcc0b4KyVWA1QWWJUs4E0CI=
|
||||
github.com/flopp/go-findfont v0.1.0 h1:lPn0BymDUtJo+ZkV01VS3661HL6F4qFlkhcJN55u6mU=
|
||||
github.com/flopp/go-findfont v0.1.0/go.mod h1:wKKxRDjD024Rh7VMwoU90i6ikQRCr+JTHB5n4Ejkqvw=
|
||||
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
|
||||
github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY=
|
||||
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
|
||||
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
|
||||
|
@ -354,6 +355,8 @@ github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9Y
|
|||
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
|
||||
github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
|
||||
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
|
||||
github.com/jolestar/go-commons-pool/v2 v2.1.2 h1:E+XGo58F23t7HtZiC/W6jzO2Ux2IccSH/yx4nD+J1CM=
|
||||
github.com/jolestar/go-commons-pool/v2 v2.1.2/go.mod h1:r4NYccrkS5UqP1YQI1COyTZ9UjPJAAGTUxzcsK1kqhY=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
|
||||
|
@ -373,6 +376,8 @@ github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgo
|
|||
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
|
||||
github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg=
|
||||
github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
|
||||
github.com/klippa-app/go-pdfium v1.6.0 h1:swz+bKYsrRSuPrczot2cE/FoR/1h13R8CjBOv2RcDm4=
|
||||
github.com/klippa-app/go-pdfium v1.6.0/go.mod h1:Lh8U8bQ+Idxz3e89+0u59j64YTPaO3G5JbvRImVqIio=
|
||||
github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
|
||||
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||
github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
|
||||
|
@ -538,6 +543,8 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU
|
|||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/studio-b12/gowebdav v0.9.0 h1:1j1sc9gQnNxbXXM4M/CebPOX4aXYtr7MojAVcN4dHjU=
|
||||
github.com/studio-b12/gowebdav v0.9.0/go.mod h1:bHA7t77X/QFExdeAnDzK6vKM34kEZAcE1OX4MfiwjkE=
|
||||
github.com/tetratelabs/wazero v1.3.1 h1:rnb9FgOEQRLLR8tgoD1mfjNjMhFeWRUk+a4b4j/GpUM=
|
||||
github.com/tetratelabs/wazero v1.3.1/go.mod h1:wYx2gNRg8/WihJfSDxA1TIL8H+GkfLYm+bIfbblu9VQ=
|
||||
github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM=
|
||||
github.com/tklauser/go-sysconf v0.3.11/go.mod h1:GqXfhXY3kiPa0nAXPDIQIWzJbMCB7AmcWpGR8lSZfqI=
|
||||
github.com/tklauser/numcpus v0.6.0/go.mod h1:FEZLMke0lhOUG6w2JadTzp0a+Nl8PF/GFkQ5UVIcaL4=
|
||||
|
|
|
@ -24,11 +24,14 @@ import (
|
|||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"code.sajari.com/docconv"
|
||||
"github.com/88250/gulu"
|
||||
"github.com/88250/lute/ast"
|
||||
"github.com/dustin/go-humanize"
|
||||
"github.com/klippa-app/go-pdfium/requests"
|
||||
"github.com/klippa-app/go-pdfium/webassembly"
|
||||
"github.com/siyuan-note/eventbus"
|
||||
"github.com/siyuan-note/filelock"
|
||||
"github.com/siyuan-note/logging"
|
||||
|
@ -416,6 +419,7 @@ func NewAssetsSearcher() *AssetsSearcher {
|
|||
".docx": &DocxAssetParser{},
|
||||
".pptx": &PptxAssetParser{},
|
||||
".xlsx": &XlsxAssetParser{},
|
||||
".pdf": &PdfAssetParser{},
|
||||
},
|
||||
|
||||
lock: &sync.Mutex{},
|
||||
|
@ -604,3 +608,98 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
|||
}
|
||||
return
|
||||
}
|
||||
|
||||
// PdfAssetParser is the asset parser for ".pdf" files. It carries no state;
// all of the work happens in its Parse method.
type PdfAssetParser struct{}
|
||||
|
||||
// Parse will parse a PDF document using PDFium webassembly module
|
||||
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
|
||||
if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
|
||||
return
|
||||
}
|
||||
|
||||
if !gulu.File.IsExist(absPath) {
|
||||
return
|
||||
}
|
||||
|
||||
tmp := copyTempAsset(absPath)
|
||||
if "" == tmp {
|
||||
return
|
||||
}
|
||||
defer os.RemoveAll(tmp)
|
||||
|
||||
f, err := os.Open(tmp)
|
||||
if nil != err {
|
||||
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
stat, err := f.Stat()
|
||||
if nil != err {
|
||||
logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
|
||||
// initialize pdfium with one worker
|
||||
pool, err := webassembly.Init(webassembly.Config{
|
||||
MinIdle: 1,
|
||||
MaxIdle: 1,
|
||||
MaxTotal: 1,
|
||||
})
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
defer pool.Close()
|
||||
|
||||
instance, err := pool.GetInstance(time.Second * 30)
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
defer instance.Close()
|
||||
|
||||
// get number of pages inside PDF document
|
||||
doc, err := instance.OpenDocument(&requests.OpenDocument{
|
||||
FileReader: f,
|
||||
FileReaderSize: stat.Size(),
|
||||
})
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
|
||||
Document: doc.Document,
|
||||
})
|
||||
|
||||
pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
// loop through pages and get content
|
||||
content := ""
|
||||
for page := 0; page < pageCount.PageCount; page++ {
|
||||
req := &requests.GetPageText{
|
||||
Page: requests.Page{
|
||||
ByIndex: &requests.PageByIndex{
|
||||
Document: doc.Document,
|
||||
Index: page,
|
||||
},
|
||||
},
|
||||
}
|
||||
pt, err := instance.GetPageText(req)
|
||||
if err != nil {
|
||||
logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
|
||||
return
|
||||
}
|
||||
content += " " + normalizeAssetContent(pt.Text)
|
||||
}
|
||||
|
||||
ret = &AssetParseResult{
|
||||
Content: content,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
|
13
kernel/model/asset_content_test.go
Normal file
13
kernel/model/asset_content_test.go
Normal file
|
@ -0,0 +1,13 @@
|
|||
package model
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPDFParser(t *testing.T) {
|
||||
p := &PdfAssetParser{}
|
||||
res := p.Parse("../../testdata/parsertest.pdf")
|
||||
if res == nil || res.Content == "" {
|
||||
t.Fatalf("empty or nil PDF content result")
|
||||
}
|
||||
}
|
BIN
testdata/parsertest.pdf
vendored
Normal file
BIN
testdata/parsertest.pdf
vendored
Normal file
Binary file not shown.
Loading…
Add table
Reference in a new issue