asset_content.go 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939
  1. // SiYuan - Refactor your thinking
  2. // Copyright (c) 2020-present, b3log.org
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU Affero General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU Affero General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU Affero General Public License
  15. // along with this program. If not, see <https://www.gnu.org/licenses/>.
  16. package model
  17. import (
  18. "bytes"
  19. "io/fs"
  20. "os"
  21. "path/filepath"
  22. "runtime"
  23. "strconv"
  24. "strings"
  25. "sync"
  26. "time"
  27. "unicode/utf8"
  28. "code.sajari.com/docconv"
  29. "github.com/88250/epub"
  30. "github.com/88250/go-humanize"
  31. "github.com/88250/gulu"
  32. "github.com/88250/lute/ast"
  33. "github.com/klippa-app/go-pdfium"
  34. "github.com/klippa-app/go-pdfium/requests"
  35. "github.com/klippa-app/go-pdfium/webassembly"
  36. "github.com/siyuan-note/eventbus"
  37. "github.com/siyuan-note/filelock"
  38. "github.com/siyuan-note/logging"
  39. "github.com/siyuan-note/siyuan/kernel/search"
  40. "github.com/siyuan-note/siyuan/kernel/sql"
  41. "github.com/siyuan-note/siyuan/kernel/task"
  42. "github.com/siyuan-note/siyuan/kernel/util"
  43. "github.com/xuri/excelize/v2"
  44. )
  45. type AssetContent struct {
  46. ID string `json:"id"`
  47. Name string `json:"name"`
  48. Ext string `json:"ext"`
  49. Path string `json:"path"`
  50. Size int64 `json:"size"`
  51. HSize string `json:"hSize"`
  52. Updated int64 `json:"updated"`
  53. Content string `json:"content"`
  54. }
  55. func GetAssetContent(id, query string, queryMethod int) (ret *AssetContent) {
  56. if "" != query && (0 == queryMethod || 1 == queryMethod) {
  57. if 0 == queryMethod {
  58. query = stringQuery(query)
  59. }
  60. }
  61. table := "asset_contents_fts_case_insensitive"
  62. filter := " id = '" + id + "'"
  63. if "" != query {
  64. filter += " AND `" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
  65. }
  66. projections := "id, name, ext, path, size, updated, " +
  67. "highlight(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "') AS content"
  68. stmt := "SELECT " + projections + " FROM " + table + " WHERE " + filter
  69. assetContents := sql.SelectAssetContentsRawStmt(stmt, 1, 1)
  70. results := fromSQLAssetContents(&assetContents, 36)
  71. if 1 > len(results) {
  72. return
  73. }
  74. ret = results[0]
  75. ret.Content = strings.ReplaceAll(ret.Content, "\n", "<br>")
  76. return
  77. }
  78. // FullTextSearchAssetContent 搜索资源文件内容。
  79. //
  80. // method:0:关键字,1:查询语法,2:SQL,3:正则表达式
  81. // orderBy: 0:按相关度降序,1:按相关度升序,2:按更新时间升序,3:按更新时间降序
  82. func FullTextSearchAssetContent(query string, types map[string]bool, method, orderBy, page, pageSize int) (ret []*AssetContent, matchedAssetCount, pageCount int) {
  83. query = strings.TrimSpace(query)
  84. beforeLen := 36
  85. orderByClause := buildAssetContentOrderBy(orderBy)
  86. switch method {
  87. case 1: // 查询语法
  88. filter := buildAssetContentTypeFilter(types)
  89. ret, matchedAssetCount = fullTextSearchAssetContentByQuerySyntax(query, filter, orderByClause, beforeLen, page, pageSize)
  90. case 2: // SQL
  91. ret, matchedAssetCount = searchAssetContentBySQL(query, beforeLen, page, pageSize)
  92. case 3: // 正则表达式
  93. typeFilter := buildAssetContentTypeFilter(types)
  94. ret, matchedAssetCount = fullTextSearchAssetContentByRegexp(query, typeFilter, orderByClause, beforeLen, page, pageSize)
  95. default: // 关键字
  96. filter := buildAssetContentTypeFilter(types)
  97. ret, matchedAssetCount = fullTextSearchAssetContentByKeyword(query, filter, orderByClause, beforeLen, page, pageSize)
  98. }
  99. pageCount = (matchedAssetCount + pageSize - 1) / pageSize
  100. if 1 > len(ret) {
  101. ret = []*AssetContent{}
  102. }
  103. return
  104. }
  105. func fullTextSearchAssetContentByQuerySyntax(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  106. query = filterQueryInvisibleChars(query)
  107. return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize)
  108. }
  109. func fullTextSearchAssetContentByKeyword(query, typeFilter string, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  110. query = filterQueryInvisibleChars(query)
  111. query = stringQuery(query)
  112. return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize)
  113. }
  114. func fullTextSearchAssetContentByRegexp(exp, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  115. exp = filterQueryInvisibleChars(exp)
  116. fieldFilter := assetContentFieldRegexp(exp)
  117. stmt := "SELECT * FROM `asset_contents_fts_case_insensitive` WHERE " + fieldFilter + " AND ext IN " + typeFilter
  118. stmt += " " + orderBy
  119. stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)
  120. assetContents := sql.SelectAssetContentsRawStmtNoParse(stmt, Conf.Search.Limit)
  121. ret = fromSQLAssetContents(&assetContents, beforeLen)
  122. if 1 > len(ret) {
  123. ret = []*AssetContent{}
  124. }
  125. matchedAssetCount = fullTextSearchAssetContentCountByRegexp(exp, typeFilter)
  126. return
  127. }
  128. func assetContentFieldRegexp(exp string) string {
  129. buf := bytes.Buffer{}
  130. buf.WriteString("(name REGEXP '")
  131. buf.WriteString(exp)
  132. buf.WriteString("' OR content REGEXP '")
  133. buf.WriteString(exp)
  134. buf.WriteString("')")
  135. return buf.String()
  136. }
  137. func fullTextSearchAssetContentCountByRegexp(exp, typeFilter string) (matchedAssetCount int) {
  138. table := "asset_contents_fts_case_insensitive"
  139. fieldFilter := assetContentFieldRegexp(exp)
  140. stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE " + fieldFilter + " AND ext IN " + typeFilter
  141. result, _ := sql.QueryAssetContentNoLimit(stmt)
  142. if 1 > len(result) {
  143. return
  144. }
  145. matchedAssetCount = int(result[0]["assets"].(int64))
  146. return
  147. }
  148. func fullTextSearchAssetContentByFTS(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  149. table := "asset_contents_fts_case_insensitive"
  150. projections := "id, name, ext, path, size, updated, " +
  151. "snippet(" + table + ", 6, '" + search.SearchMarkLeft + "', '" + search.SearchMarkRight + "', '...', 64) AS content"
  152. stmt := "SELECT " + projections + " FROM " + table + " WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
  153. stmt += ") AND ext IN " + typeFilter
  154. stmt += " " + orderBy
  155. stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)
  156. assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
  157. ret = fromSQLAssetContents(&assetContents, beforeLen)
  158. if 1 > len(ret) {
  159. ret = []*AssetContent{}
  160. }
  161. matchedAssetCount = fullTextSearchAssetContentCount(query, typeFilter)
  162. return
  163. }
  164. func searchAssetContentBySQL(stmt string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  165. stmt = filterQueryInvisibleChars(stmt)
  166. stmt = strings.TrimSpace(stmt)
  167. assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
  168. ret = fromSQLAssetContents(&assetContents, beforeLen)
  169. if 1 > len(ret) {
  170. ret = []*AssetContent{}
  171. return
  172. }
  173. stmt = strings.ToLower(stmt)
  174. stmt = strings.ReplaceAll(stmt, "select * ", "select COUNT(path) AS `assets` ")
  175. stmt = removeLimitClause(stmt)
  176. result, _ := sql.QueryAssetContentNoLimit(stmt)
  177. if 1 > len(ret) {
  178. return
  179. }
  180. matchedAssetCount = int(result[0]["assets"].(int64))
  181. return
  182. }
  183. func fullTextSearchAssetContentCount(query, typeFilter string) (matchedAssetCount int) {
  184. query = filterQueryInvisibleChars(query)
  185. table := "asset_contents_fts_case_insensitive"
  186. stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
  187. stmt += ") AND ext IN " + typeFilter
  188. result, _ := sql.QueryAssetContentNoLimit(stmt)
  189. if 1 > len(result) {
  190. return
  191. }
  192. matchedAssetCount = int(result[0]["assets"].(int64))
  193. return
  194. }
  195. func fromSQLAssetContents(assetContents *[]*sql.AssetContent, beforeLen int) (ret []*AssetContent) {
  196. ret = []*AssetContent{}
  197. for _, assetContent := range *assetContents {
  198. ret = append(ret, fromSQLAssetContent(assetContent, beforeLen))
  199. }
  200. return
  201. }
  202. func fromSQLAssetContent(assetContent *sql.AssetContent, beforeLen int) *AssetContent {
  203. content := util.EscapeHTML(assetContent.Content)
  204. if strings.Contains(content, search.SearchMarkLeft) {
  205. content = strings.ReplaceAll(content, search.SearchMarkLeft, "<mark>")
  206. content = strings.ReplaceAll(content, search.SearchMarkRight, "</mark>")
  207. }
  208. return &AssetContent{
  209. ID: assetContent.ID,
  210. Name: assetContent.Name,
  211. Ext: assetContent.Ext,
  212. Path: assetContent.Path,
  213. Size: assetContent.Size,
  214. HSize: humanize.BytesCustomCeil(uint64(assetContent.Size), 2),
  215. Updated: assetContent.Updated,
  216. Content: content,
  217. }
  218. }
  219. func buildAssetContentColumnFilter() string {
  220. return "{name content}"
  221. }
  222. func buildAssetContentTypeFilter(types map[string]bool) string {
  223. if 0 == len(types) {
  224. return ""
  225. }
  226. var buf bytes.Buffer
  227. buf.WriteString("(")
  228. for k, enabled := range types {
  229. if !enabled {
  230. continue
  231. }
  232. buf.WriteString("'")
  233. buf.WriteString(k)
  234. buf.WriteString("',")
  235. }
  236. if 1 == buf.Len() {
  237. buf.WriteString(")")
  238. return buf.String()
  239. }
  240. buf.Truncate(buf.Len() - 1)
  241. buf.WriteString(")")
  242. return buf.String()
  243. }
  244. func buildAssetContentOrderBy(orderBy int) string {
  245. switch orderBy {
  246. case 0:
  247. return "ORDER BY rank DESC"
  248. case 1:
  249. return "ORDER BY rank ASC"
  250. case 2:
  251. return "ORDER BY updated ASC"
  252. case 3:
  253. return "ORDER BY updated DESC"
  254. default:
  255. return "ORDER BY rank DESC"
  256. }
  257. }
  258. var assetContentSearcher = NewAssetsSearcher()
  259. func RemoveIndexAssetContent(absPath string) {
  260. defer logging.Recover()
  261. assetsDir := util.GetDataAssetsAbsPath()
  262. p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
  263. sql.DeleteAssetContentsByPathQueue(p)
  264. }
  265. func IndexAssetContent(absPath string) {
  266. defer logging.Recover()
  267. ext := filepath.Ext(absPath)
  268. parser := assetContentSearcher.GetParser(ext)
  269. if nil == parser {
  270. return
  271. }
  272. result := parser.Parse(absPath)
  273. if nil == result {
  274. return
  275. }
  276. info, err := os.Stat(absPath)
  277. if nil != err {
  278. logging.LogErrorf("stat [%s] failed: %s", absPath, err)
  279. return
  280. }
  281. assetsDir := util.GetDataAssetsAbsPath()
  282. p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
  283. assetContents := []*sql.AssetContent{
  284. {
  285. ID: ast.NewNodeID(),
  286. Name: util.RemoveID(filepath.Base(p)),
  287. Ext: ext,
  288. Path: p,
  289. Size: info.Size(),
  290. Updated: info.ModTime().Unix(),
  291. Content: result.Content,
  292. },
  293. }
  294. sql.DeleteAssetContentsByPathQueue(p)
  295. sql.IndexAssetContentsQueue(assetContents)
  296. }
  297. func ReindexAssetContent() {
  298. task.AppendTask(task.AssetContentDatabaseIndexFull, fullReindexAssetContent)
  299. return
  300. }
  301. func fullReindexAssetContent() {
  302. util.PushMsg(Conf.Language(216), 7*1000)
  303. sql.InitAssetContentDatabase(true)
  304. assetContentSearcher.FullIndex()
  305. return
  306. }
  307. func init() {
  308. subscribeSQLAssetContentEvents()
  309. }
  310. func subscribeSQLAssetContentEvents() {
  311. eventbus.Subscribe(util.EvtSQLAssetContentRebuild, func() {
  312. ReindexAssetContent()
  313. })
  314. }
  315. var (
  316. AssetsSearchEnabled = true
  317. )
  318. type AssetsSearcher struct {
  319. parsers map[string]AssetParser
  320. lock *sync.Mutex
  321. }
  322. func (searcher *AssetsSearcher) GetParser(ext string) AssetParser {
  323. searcher.lock.Lock()
  324. defer searcher.lock.Unlock()
  325. return searcher.parsers[strings.ToLower(ext)]
  326. }
  327. func (searcher *AssetsSearcher) FullIndex() {
  328. defer logging.Recover()
  329. assetsDir := util.GetDataAssetsAbsPath()
  330. if !gulu.File.IsDir(assetsDir) {
  331. return
  332. }
  333. var results []*AssetParseResult
  334. filelock.Walk(assetsDir, func(absPath string, info fs.FileInfo, err error) error {
  335. if nil != err {
  336. logging.LogErrorf("walk dir [%s] failed: %s", absPath, err)
  337. return err
  338. }
  339. if info.IsDir() {
  340. return nil
  341. }
  342. ext := filepath.Ext(absPath)
  343. parser := searcher.GetParser(ext)
  344. if nil == parser {
  345. return nil
  346. }
  347. logging.LogInfof("parsing asset content [%s]", absPath)
  348. result := parser.Parse(absPath)
  349. if nil == result {
  350. return nil
  351. }
  352. result.Path = "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
  353. result.Size = info.Size()
  354. result.Updated = info.ModTime().Unix()
  355. results = append(results, result)
  356. return nil
  357. })
  358. var assetContents []*sql.AssetContent
  359. for _, result := range results {
  360. assetContents = append(assetContents, &sql.AssetContent{
  361. ID: ast.NewNodeID(),
  362. Name: util.RemoveID(filepath.Base(result.Path)),
  363. Ext: strings.ToLower(filepath.Ext(result.Path)),
  364. Path: result.Path,
  365. Size: result.Size,
  366. Updated: result.Updated,
  367. Content: result.Content,
  368. })
  369. }
  370. sql.IndexAssetContentsQueue(assetContents)
  371. }
  372. func NewAssetsSearcher() *AssetsSearcher {
  373. txtAssetParser := &TxtAssetParser{}
  374. return &AssetsSearcher{
  375. parsers: map[string]AssetParser{
  376. ".txt": txtAssetParser,
  377. ".md": txtAssetParser,
  378. ".markdown": txtAssetParser,
  379. ".json": txtAssetParser,
  380. ".log": txtAssetParser,
  381. ".sql": txtAssetParser,
  382. ".html": txtAssetParser,
  383. ".xml": txtAssetParser,
  384. ".java": txtAssetParser,
  385. ".h": txtAssetParser,
  386. ".c": txtAssetParser,
  387. ".cpp": txtAssetParser,
  388. ".go": txtAssetParser,
  389. ".rs": txtAssetParser,
  390. ".swift": txtAssetParser,
  391. ".kt": txtAssetParser,
  392. ".py": txtAssetParser,
  393. ".php": txtAssetParser,
  394. ".js": txtAssetParser,
  395. ".css": txtAssetParser,
  396. ".ts": txtAssetParser,
  397. ".sh": txtAssetParser,
  398. ".bat": txtAssetParser,
  399. ".cmd": txtAssetParser,
  400. ".ini": txtAssetParser,
  401. ".yaml": txtAssetParser,
  402. ".rst": txtAssetParser,
  403. ".adoc": txtAssetParser,
  404. ".textile": txtAssetParser,
  405. ".opml": txtAssetParser,
  406. ".org": txtAssetParser,
  407. ".wiki": txtAssetParser,
  408. ".docx": &DocxAssetParser{},
  409. ".pptx": &PptxAssetParser{},
  410. ".xlsx": &XlsxAssetParser{},
  411. ".pdf": &PdfAssetParser{},
  412. ".epub": &EpubAssetParser{},
  413. },
  414. lock: &sync.Mutex{},
  415. }
  416. }
  417. const (
  418. TxtAssetContentMaxSize = 1024 * 1024 * 4
  419. PDFAssetContentMaxPage = 1024
  420. )
  421. var (
  422. PDFAssetContentMaxSize uint64 = 1024 * 1024 * 128
  423. )
  424. type AssetParseResult struct {
  425. Path string
  426. Size int64
  427. Updated int64
  428. Content string
  429. }
  430. type AssetParser interface {
  431. Parse(absPath string) *AssetParseResult
  432. }
  433. type TxtAssetParser struct {
  434. }
  435. func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  436. info, err := os.Stat(absPath)
  437. if nil != err {
  438. logging.LogErrorf("stat file [%s] failed: %s", absPath, err)
  439. return
  440. }
  441. if TxtAssetContentMaxSize < info.Size() {
  442. logging.LogWarnf("text asset [%s] is too large [%s]", absPath, humanize.BytesCustomCeil(uint64(info.Size()), 2))
  443. return
  444. }
  445. tmp := copyTempAsset(absPath)
  446. if "" == tmp {
  447. return
  448. }
  449. defer os.RemoveAll(tmp)
  450. data, err := os.ReadFile(tmp)
  451. if nil != err {
  452. logging.LogErrorf("read file [%s] failed: %s", absPath, err)
  453. return
  454. }
  455. if !utf8.Valid(data) {
  456. // Non-UTF-8 encoded text files are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9052
  457. logging.LogWarnf("text asset [%s] is not UTF-8 encoded", absPath)
  458. return
  459. }
  460. content := string(data)
  461. ret = &AssetParseResult{
  462. Content: content,
  463. }
  464. return
  465. }
  466. func normalizeNonTxtAssetContent(content string) (ret string) {
  467. ret = strings.Join(strings.Fields(content), " ")
  468. return
  469. }
  470. func copyTempAsset(absPath string) (ret string) {
  471. dir := filepath.Join(util.TempDir, "convert", "asset_content")
  472. if err := os.MkdirAll(dir, 0755); nil != err {
  473. logging.LogErrorf("mkdir [%s] failed: [%s]", dir, err)
  474. return
  475. }
  476. baseName := filepath.Base(absPath)
  477. if strings.HasPrefix(baseName, "~") {
  478. return
  479. }
  480. filelock.Lock(absPath)
  481. defer filelock.Unlock(absPath)
  482. ext := filepath.Ext(absPath)
  483. ret = filepath.Join(dir, gulu.Rand.String(7)+ext)
  484. if err := gulu.File.Copy(absPath, ret); nil != err {
  485. logging.LogErrorf("copy [src=%s, dest=%s] failed: %s", absPath, ret, err)
  486. return
  487. }
  488. return
  489. }
  490. type DocxAssetParser struct {
  491. }
  492. func (parser *DocxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  493. if !strings.HasSuffix(strings.ToLower(absPath), ".docx") {
  494. return
  495. }
  496. if !gulu.File.IsExist(absPath) {
  497. return
  498. }
  499. tmp := copyTempAsset(absPath)
  500. if "" == tmp {
  501. return
  502. }
  503. defer os.RemoveAll(tmp)
  504. f, err := os.Open(tmp)
  505. if nil != err {
  506. logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
  507. return
  508. }
  509. defer f.Close()
  510. data, _, err := docconv.ConvertDocx(f)
  511. if nil != err {
  512. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  513. return
  514. }
  515. var content = normalizeNonTxtAssetContent(data)
  516. ret = &AssetParseResult{
  517. Content: content,
  518. }
  519. return
  520. }
  521. type PptxAssetParser struct {
  522. }
  523. func (parser *PptxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  524. if !strings.HasSuffix(strings.ToLower(absPath), ".pptx") {
  525. return
  526. }
  527. if !gulu.File.IsExist(absPath) {
  528. return
  529. }
  530. tmp := copyTempAsset(absPath)
  531. if "" == tmp {
  532. return
  533. }
  534. defer os.RemoveAll(tmp)
  535. f, err := os.Open(tmp)
  536. if nil != err {
  537. logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
  538. return
  539. }
  540. defer f.Close()
  541. data, _, err := docconv.ConvertPptx(f)
  542. if nil != err {
  543. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  544. return
  545. }
  546. var content = normalizeNonTxtAssetContent(data)
  547. ret = &AssetParseResult{
  548. Content: content,
  549. }
  550. return
  551. }
  552. type XlsxAssetParser struct {
  553. }
  554. func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  555. if !strings.HasSuffix(strings.ToLower(absPath), ".xlsx") {
  556. return
  557. }
  558. if !gulu.File.IsExist(absPath) {
  559. return
  560. }
  561. tmp := copyTempAsset(absPath)
  562. if "" == tmp {
  563. return
  564. }
  565. defer os.RemoveAll(tmp)
  566. x, err := excelize.OpenFile(tmp)
  567. if nil != err {
  568. logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
  569. return
  570. }
  571. defer x.Close()
  572. buf := bytes.Buffer{}
  573. sheetMap := x.GetSheetMap()
  574. for _, sheetName := range sheetMap {
  575. rows, getErr := x.GetRows(sheetName)
  576. if nil != getErr {
  577. logging.LogErrorf("get rows from sheet [%s] failed: [%s]", sheetName, getErr)
  578. return
  579. }
  580. for _, row := range rows {
  581. for _, colCell := range row {
  582. buf.WriteString(colCell + " ")
  583. }
  584. }
  585. }
  586. var content = normalizeNonTxtAssetContent(buf.String())
  587. ret = &AssetParseResult{
  588. Content: content,
  589. }
  590. return
  591. }
  592. // PdfAssetParser parser factory product
  593. type PdfAssetParser struct {
  594. }
  595. // pdfPage struct defines a worker job for text extraction
  596. type pdfPage struct {
  597. pageNo int // page number for text extraction
  598. data *[]byte // pointer to PDF document data
  599. }
  600. // pdfTextResult struct defines the extracted PDF text result
  601. type pdfTextResult struct {
  602. pageNo int // page number of PDF document
  603. text string // text of converted page
  604. err error // processing error
  605. }
  606. // getTextPageWorker will extract the text from a given PDF page and return its result
  607. func (parser *PdfAssetParser) getTextPageWorker(id int, instance pdfium.Pdfium, page <-chan *pdfPage, result chan<- *pdfTextResult) {
  608. defer instance.Close()
  609. for pd := range page {
  610. doc, err := instance.OpenDocument(&requests.OpenDocument{
  611. File: pd.data,
  612. })
  613. if nil != err {
  614. instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
  615. Document: doc.Document,
  616. })
  617. result <- &pdfTextResult{
  618. pageNo: pd.pageNo,
  619. err: err,
  620. }
  621. continue
  622. }
  623. req := &requests.GetPageText{
  624. Page: requests.Page{
  625. ByIndex: &requests.PageByIndex{
  626. Document: doc.Document,
  627. Index: pd.pageNo,
  628. },
  629. },
  630. }
  631. res, err := instance.GetPageText(req)
  632. if nil != err {
  633. instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
  634. Document: doc.Document,
  635. })
  636. result <- &pdfTextResult{
  637. pageNo: pd.pageNo,
  638. err: err,
  639. }
  640. continue
  641. }
  642. instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
  643. Document: doc.Document,
  644. })
  645. result <- &pdfTextResult{
  646. pageNo: pd.pageNo,
  647. text: res.Text,
  648. err: nil,
  649. }
  650. }
  651. }
  652. // Parse will parse a PDF document using PDFium webassembly module using a worker pool
  653. func (parser *PdfAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  654. if util.ContainerIOS == util.Container || util.ContainerAndroid == util.Container {
  655. // PDF asset content searching is not supported on mobile platforms
  656. return
  657. }
  658. now := time.Now()
  659. if !strings.HasSuffix(strings.ToLower(absPath), ".pdf") {
  660. return
  661. }
  662. if !gulu.File.IsExist(absPath) {
  663. return
  664. }
  665. tmp := copyTempAsset(absPath)
  666. if "" == tmp {
  667. return
  668. }
  669. defer os.RemoveAll(tmp)
  670. // PDF blob will be processed in-memory making sharing of PDF document data across worker goroutines possible
  671. pdfData, err := os.ReadFile(tmp)
  672. if nil != err {
  673. logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
  674. return
  675. }
  676. // initialize go-pdfium with number of available cores
  677. // we fire up the complete worker pool for maximum performance
  678. cores := runtime.NumCPU()
  679. if 4 < cores {
  680. cores = 4 // Limit memory usage
  681. }
  682. pool, err := webassembly.Init(webassembly.Config{
  683. MinIdle: cores,
  684. MaxIdle: cores,
  685. MaxTotal: cores,
  686. })
  687. if nil != err {
  688. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  689. return
  690. }
  691. defer pool.Close()
  692. // first get the number of PDF pages to convert into text
  693. instance, err := pool.GetInstance(time.Second * 30)
  694. if nil != err {
  695. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  696. return
  697. }
  698. doc, err := instance.OpenDocument(&requests.OpenDocument{
  699. File: &pdfData,
  700. })
  701. if nil != err {
  702. instance.Close()
  703. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  704. return
  705. }
  706. pc, err := instance.FPDF_GetPageCount(&requests.FPDF_GetPageCount{Document: doc.Document})
  707. if nil != err {
  708. instance.FPDF_CloseDocument(&requests.FPDF_CloseDocument{
  709. Document: doc.Document,
  710. })
  711. instance.Close()
  712. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  713. return
  714. }
  715. instance.Close()
  716. if PDFAssetContentMaxPage < pc.PageCount {
  717. // PDF files longer than 1024 pages are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9053
  718. logging.LogWarnf("ignore large PDF asset [%s] with [%d] pages", absPath, pc.PageCount)
  719. return
  720. }
  721. if maxSizeVal := os.Getenv("SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE"); "" != maxSizeVal {
  722. if maxSize, parseErr := strconv.ParseUint(maxSizeVal, 10, 64); nil == parseErr {
  723. if maxSize != PDFAssetContentMaxSize {
  724. PDFAssetContentMaxSize = maxSize
  725. logging.LogInfof("set PDF asset content index max size to [%s]", humanize.BytesCustomCeil(maxSize, 2))
  726. }
  727. } else {
  728. logging.LogWarnf("invalid env [SIYUAN_PDF_ASSET_CONTENT_INDEX_MAX_SIZE]: [%s], parsing failed: %s", maxSizeVal, parseErr)
  729. }
  730. }
  731. if PDFAssetContentMaxSize < uint64(len(pdfData)) {
  732. // PDF files larger than 128MB are not included in asset file content searching https://github.com/siyuan-note/siyuan/issues/9500
  733. logging.LogWarnf("ignore large PDF asset [%s] with [%s]", absPath, humanize.BytesCustomCeil(uint64(len(pdfData)), 2))
  734. return
  735. }
  736. // next setup worker pool for processing PDF pages
  737. pages := make(chan *pdfPage, pc.PageCount)
  738. results := make(chan *pdfTextResult, pc.PageCount)
  739. for i := 0; i < cores; i++ {
  740. inst, err := pool.GetInstance(time.Second * 30)
  741. if nil != err {
  742. close(pages)
  743. close(results)
  744. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  745. return
  746. }
  747. go parser.getTextPageWorker(i, inst, pages, results)
  748. }
  749. // now split pages and let them process by worker pool
  750. for p := 0; p < pc.PageCount; p++ {
  751. pages <- &pdfPage{
  752. pageNo: p,
  753. data: &pdfData,
  754. }
  755. }
  756. close(pages)
  757. // finally fetch the PDF page text results
  758. // Note: some workers will process pages faster than other workers depending on the page contents
  759. // the order of returned PDF text pages is random and must be sorted using the pageNo index
  760. pageText := make([]string, pc.PageCount)
  761. for p := 0; p < pc.PageCount; p++ {
  762. res := <-results
  763. pageText[res.pageNo] = res.text
  764. if nil != res.err {
  765. logging.LogErrorf("convert [%s] of page %d failed: [%s]", tmp, res.pageNo, err)
  766. }
  767. }
  768. close(results)
  769. if 128 < pc.PageCount {
  770. logging.LogInfof("convert [%s] PDF with [%d] pages using [%d] workers took [%s]", absPath, pc.PageCount, cores, time.Since(now))
  771. }
  772. // loop through ordered PDF text pages and join content for asset parse DB result
  773. contentBuilder := bytes.Buffer{}
  774. for _, pt := range pageText {
  775. contentBuilder.WriteString(" " + normalizeNonTxtAssetContent(pt))
  776. }
  777. ret = &AssetParseResult{
  778. Content: contentBuilder.String(),
  779. }
  780. return
  781. }
  782. type EpubAssetParser struct {
  783. }
  784. func (parser *EpubAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  785. if !strings.HasSuffix(strings.ToLower(absPath), ".epub") {
  786. return
  787. }
  788. if !gulu.File.IsExist(absPath) {
  789. return
  790. }
  791. tmp := copyTempAsset(absPath)
  792. if "" == tmp {
  793. return
  794. }
  795. defer os.RemoveAll(tmp)
  796. f, err := os.Open(tmp)
  797. if nil != err {
  798. logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
  799. return
  800. }
  801. defer f.Close()
  802. buf := bytes.Buffer{}
  803. if err = epub.ToTxt(tmp, &buf); nil != err {
  804. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  805. return
  806. }
  807. content := normalizeNonTxtAssetContent(buf.String())
  808. ret = &AssetParseResult{
  809. Content: content,
  810. }
  811. return
  812. }