Support searching PDF asset content (#8985)

* feat(asset): add PDF parser

* test: changed test function name to match parser

* asset: add separator

* asset: add pdf factory product

* test: remove println

---------

Co-authored-by: Heiko Besemann <heiko.besemann@qbeyond.de>
This commit is contained in:
nekrondev 2023-08-17 05:52:59 +02:00 committed by GitHub
parent ec31ac0741
commit 19a295e157
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 125 additions and 0 deletions

View file

@ -64,6 +64,11 @@ require (
golang.org/x/text v0.12.0
)
require (
github.com/jolestar/go-commons-pool/v2 v2.1.2 // indirect
github.com/tetratelabs/wazero v1.3.1 // indirect
)
require (
dmitri.shuralyov.com/font/woff2 v0.0.0-20180220214647-957792cbbdab // indirect
github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198 // indirect
@ -111,6 +116,7 @@ require (
github.com/juju/errors v1.0.0 // indirect
github.com/klauspost/compress v1.16.7 // indirect
github.com/klauspost/cpuid/v2 v2.2.5 // indirect
github.com/klippa-app/go-pdfium v1.6.0
github.com/leodido/go-urn v1.2.4 // indirect
github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 // indirect
github.com/lufia/plan9stats v0.0.0-20230326075908-cb1d2100619a // indirect

View file

@ -173,6 +173,7 @@ github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA=
github.com/fatih/set v0.2.1/go.mod h1:+RKtMCH+favT2+3YecHGxcc0b4KyVWA1QWWJUs4E0CI=
github.com/flopp/go-findfont v0.1.0 h1:lPn0BymDUtJo+ZkV01VS3661HL6F4qFlkhcJN55u6mU=
github.com/flopp/go-findfont v0.1.0/go.mod h1:wKKxRDjD024Rh7VMwoU90i6ikQRCr+JTHB5n4Ejkqvw=
github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g=
github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=
github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbSClcnxKAGw=
@ -354,6 +355,8 @@ github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9Y
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
github.com/jolestar/go-commons-pool/v2 v2.1.2 h1:E+XGo58F23t7HtZiC/W6jzO2Ux2IccSH/yx4nD+J1CM=
github.com/jolestar/go-commons-pool/v2 v2.1.2/go.mod h1:r4NYccrkS5UqP1YQI1COyTZ9UjPJAAGTUxzcsK1kqhY=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU=
@ -373,6 +376,8 @@ github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgo
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.5 h1:0E5MSMDEoAulmXNFquVs//DdoomxaoTY1kUhbc/qbZg=
github.com/klauspost/cpuid/v2 v2.2.5/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
github.com/klippa-app/go-pdfium v1.6.0 h1:swz+bKYsrRSuPrczot2cE/FoR/1h13R8CjBOv2RcDm4=
github.com/klippa-app/go-pdfium v1.6.0/go.mod h1:Lh8U8bQ+Idxz3e89+0u59j64YTPaO3G5JbvRImVqIio=
github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
@ -538,6 +543,8 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/studio-b12/gowebdav v0.9.0 h1:1j1sc9gQnNxbXXM4M/CebPOX4aXYtr7MojAVcN4dHjU=
github.com/studio-b12/gowebdav v0.9.0/go.mod h1:bHA7t77X/QFExdeAnDzK6vKM34kEZAcE1OX4MfiwjkE=
github.com/tetratelabs/wazero v1.3.1 h1:rnb9FgOEQRLLR8tgoD1mfjNjMhFeWRUk+a4b4j/GpUM=
github.com/tetratelabs/wazero v1.3.1/go.mod h1:wYx2gNRg8/WihJfSDxA1TIL8H+GkfLYm+bIfbblu9VQ=
github.com/tklauser/go-sysconf v0.3.11 h1:89WgdJhk5SNwJfu+GKyYveZ4IaJ7xAkecBo+KdJV0CM=
github.com/tklauser/go-sysconf v0.3.11/go.mod h1:GqXfhXY3kiPa0nAXPDIQIWzJbMCB7AmcWpGR8lSZfqI=
github.com/tklauser/numcpus v0.6.0/go.mod h1:FEZLMke0lhOUG6w2JadTzp0a+Nl8PF/GFkQ5UVIcaL4=

View file

@ -24,11 +24,14 @@ import (
"strconv"
"strings"
"sync"
"time"
"code.sajari.com/docconv"
"github.com/88250/gulu"
"github.com/88250/lute/ast"
"github.com/dustin/go-humanize"
"github.com/klippa-app/go-pdfium/requests"
"github.com/klippa-app/go-pdfium/webassembly"
"github.com/siyuan-note/eventbus"
"github.com/siyuan-note/filelock"
"github.com/siyuan-note/logging"
@ -416,6 +419,7 @@ func NewAssetsSearcher() *AssetsSearcher {
".docx": &DocxAssetParser{},
".pptx": &PptxAssetParser{},
".xlsx": &XlsxAssetParser{},
".pdf": &PdfAssetParser{},
},
lock: &sync.Mutex{},
@ -604,3 +608,98 @@ func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
}
return
}
// PdfAssetParser is the asset parser used for ".pdf" files. It carries no
// state; all work happens in its Parse method.
type PdfAssetParser struct{}
// Parse extracts the text of the PDF document at absPath using the PDFium
// WebAssembly module.
//
// The asset is first copied with copyTempAsset and the copy is what gets
// opened and parsed; the copy is removed when Parse returns. The text of each
// page is passed through normalizeAssetContent and appended to the result,
// with every page preceded by a single space as separator.
//
// Parse returns nil when absPath does not end in ".pdf" (case-insensitive),
// when the file does not exist, or when any step fails; failures are logged
// rather than returned.
func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
	if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
		return
	}

	if !gulu.File.IsExist(absPath) {
		return
	}

	tmp := copyTempAsset(absPath)
	if "" == tmp {
		return
	}
	defer os.RemoveAll(tmp)

	f, err := os.Open(tmp)
	if nil != err {
		logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
		return
	}
	defer f.Close()

	// The file size is required by PDFium when reading through a FileReader.
	stat, err := f.Stat()
	if nil != err {
		logging.LogErrorf("stat [%s] failed: [%s]", tmp, err)
		return
	}

	// Initialize PDFium with a single worker.
	// NOTE(review): a new pool is created on every call; if this turns out to
	// be expensive, consider sharing one pool across calls.
	pool, err := webassembly.Init(webassembly.Config{
		MinIdle:  1,
		MaxIdle:  1,
		MaxTotal: 1,
	})
	if nil != err {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	defer pool.Close()

	// Wait up to 30 seconds for the worker instance to become available.
	instance, err := pool.GetInstance(time.Second * 30)
	if nil != err {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	defer instance.Close()

	doc, err := instance.OpenDocument(&requests.OpenDocument{
		FileReader:     f,
		FileReaderSize: stat.Size(),
	})
	if nil != err {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}
	// Deferred calls run in reverse order, so the document is closed before
	// the instance, the pool and the underlying file.
	defer instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
		Document: doc.Document,
	})

	// Get the number of pages inside the PDF document.
	pageCount, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
	if nil != err {
		logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
		return
	}

	// Loop through the pages and collect their text. A strings.Builder avoids
	// re-copying the accumulated content for every page.
	var content strings.Builder
	for page := 0; page < pageCount.PageCount; page++ {
		pt, err := instance.GetPageText(&requests.GetPageText{
			Page: requests.Page{
				ByIndex: &requests.PageByIndex{
					Document: doc.Document,
					Index:    page,
				},
			},
		})
		if nil != err {
			logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
			return
		}
		content.WriteByte(' ')
		content.WriteString(normalizeAssetContent(pt.Text))
	}

	ret = &AssetParseResult{
		Content: content.String(),
	}
	return
}

View file

@ -0,0 +1,13 @@
package model
import (
"testing"
)
// TestPDFParser runs PdfAssetParser.Parse against the sample PDF under
// testdata and fails if no result or no text content comes back.
func TestPDFParser(t *testing.T) {
	const samplePath = "../../testdata/parsertest.pdf"

	parser := new(PdfAssetParser)
	result := parser.Parse(samplePath)
	if nil == result {
		t.Fatalf("empty or nil PDF content result")
	}
	if "" == result.Content {
		t.Fatalf("empty or nil PDF content result")
	}
}

BIN
testdata/parsertest.pdf vendored Normal file

Binary file not shown.