asset_content.go 15 KB


  1. // SiYuan - Refactor your thinking
  2. // Copyright (c) 2020-present, b3log.org
  3. //
  4. // This program is free software: you can redistribute it and/or modify
  5. // it under the terms of the GNU Affero General Public License as published by
  6. // the Free Software Foundation, either version 3 of the License, or
  7. // (at your option) any later version.
  8. //
  9. // This program is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU Affero General Public License for more details.
  13. //
  14. // You should have received a copy of the GNU Affero General Public License
  15. // along with this program. If not, see <https://www.gnu.org/licenses/>.
  16. package model
  17. import (
  18. "bytes"
  19. "io/fs"
  20. "os"
  21. "path/filepath"
  22. "strconv"
  23. "strings"
  24. "sync"
  25. "code.sajari.com/docconv"
  26. "github.com/88250/gulu"
  27. "github.com/88250/lute/ast"
  28. "github.com/dustin/go-humanize"
  29. "github.com/siyuan-note/eventbus"
  30. "github.com/siyuan-note/filelock"
  31. "github.com/siyuan-note/logging"
  32. "github.com/siyuan-note/siyuan/kernel/sql"
  33. "github.com/siyuan-note/siyuan/kernel/task"
  34. "github.com/siyuan-note/siyuan/kernel/util"
  35. "github.com/xuri/excelize/v2"
  36. )
  37. type AssetContent struct {
  38. ID string `json:"id"`
  39. Name string `json:"name"`
  40. Ext string `json:"ext"`
  41. Path string `json:"path"`
  42. Size int64 `json:"size"`
  43. HSize string `json:"hSize"`
  44. Updated int64 `json:"updated"`
  45. Content string `json:"content"`
  46. }
  47. func GetAssetContent(id, query string, queryMethod int) (ret *AssetContent) {
  48. if "" != query && (0 == queryMethod || 1 == queryMethod) {
  49. if 0 == queryMethod {
  50. query = stringQuery(query)
  51. }
  52. }
  53. table := "asset_contents_fts_case_insensitive"
  54. filter := " id = '" + id + "'"
  55. if "" != query {
  56. filter += " AND `" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
  57. }
  58. projections := "id, name, ext, path, size, updated, " +
  59. "highlight(" + table + ", 6, '<mark>', '</mark>') AS content"
  60. stmt := "SELECT " + projections + " FROM " + table + " WHERE " + filter
  61. assetContents := sql.SelectAssetContentsRawStmt(stmt, 1, 1)
  62. results := fromSQLAssetContents(&assetContents, 36)
  63. if 1 > len(results) {
  64. return
  65. }
  66. ret = results[0]
  67. return
  68. }
  69. // FullTextSearchAssetContent 搜索资源文件内容。
  70. //
  71. // method:0:关键字,1:查询语法,2:SQL,3:正则表达式
  72. // orderBy: 0:相关度(默认),1:按更新时间升序,2:按更新时间降序
  73. func FullTextSearchAssetContent(query string, types map[string]bool, method, orderBy, page, pageSize int) (ret []*AssetContent, matchedAssetCount, pageCount int) {
  74. query = strings.TrimSpace(query)
  75. beforeLen := 36
  76. orderByClause := buildAssetContentOrderBy(orderBy)
  77. switch method {
  78. case 1: // 查询语法
  79. filter := buildAssetContentTypeFilter(types)
  80. ret, matchedAssetCount = fullTextSearchAssetContentByQuerySyntax(query, filter, orderByClause, beforeLen, page, pageSize)
  81. case 2: // SQL
  82. ret, matchedAssetCount = searchAssetContentBySQL(query, beforeLen, page, pageSize)
  83. case 3: // 正则表达式
  84. typeFilter := buildAssetContentTypeFilter(types)
  85. ret, matchedAssetCount = fullTextSearchAssetContentByRegexp(query, typeFilter, orderByClause, beforeLen, page, pageSize)
  86. default: // 关键字
  87. filter := buildAssetContentTypeFilter(types)
  88. ret, matchedAssetCount = fullTextSearchAssetContentByKeyword(query, filter, orderByClause, beforeLen, page, pageSize)
  89. }
  90. pageCount = (matchedAssetCount + pageSize - 1) / pageSize
  91. if 1 > len(ret) {
  92. ret = []*AssetContent{}
  93. }
  94. return
  95. }
  96. func fullTextSearchAssetContentByQuerySyntax(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  97. query = gulu.Str.RemoveInvisible(query)
  98. return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize)
  99. }
  100. func fullTextSearchAssetContentByKeyword(query, typeFilter string, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  101. query = gulu.Str.RemoveInvisible(query)
  102. query = stringQuery(query)
  103. return fullTextSearchAssetContentByFTS(query, typeFilter, orderBy, beforeLen, page, pageSize)
  104. }
  105. func fullTextSearchAssetContentByRegexp(exp, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  106. exp = gulu.Str.RemoveInvisible(exp)
  107. fieldFilter := assetContentFieldRegexp(exp)
  108. stmt := "SELECT * FROM `asset_contents_fts_case_insensitive` WHERE " + fieldFilter + " AND ext IN " + typeFilter
  109. stmt += " " + orderBy
  110. stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)
  111. assetContents := sql.SelectAssetContentsRawStmtNoParse(stmt, Conf.Search.Limit)
  112. ret = fromSQLAssetContents(&assetContents, beforeLen)
  113. if 1 > len(ret) {
  114. ret = []*AssetContent{}
  115. }
  116. matchedAssetCount = fullTextSearchAssetContentCountByRegexp(exp, typeFilter)
  117. return
  118. }
  119. func assetContentFieldRegexp(exp string) string {
  120. buf := bytes.Buffer{}
  121. buf.WriteString("(name REGEXP '")
  122. buf.WriteString(exp)
  123. buf.WriteString("' OR content REGEXP '")
  124. buf.WriteString(exp)
  125. buf.WriteString("')")
  126. return buf.String()
  127. }
  128. func fullTextSearchAssetContentCountByRegexp(exp, typeFilter string) (matchedAssetCount int) {
  129. table := "asset_contents_fts_case_insensitive"
  130. fieldFilter := fieldRegexp(exp)
  131. stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE " + fieldFilter + " AND ext IN " + typeFilter
  132. result, _ := sql.QueryAssetContentNoLimit(stmt)
  133. if 1 > len(result) {
  134. return
  135. }
  136. matchedAssetCount = int(result[0]["assets"].(int64))
  137. return
  138. }
  139. func fullTextSearchAssetContentByFTS(query, typeFilter, orderBy string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  140. table := "asset_contents_fts_case_insensitive"
  141. projections := "id, name, ext, path, size, updated, " +
  142. "snippet(" + table + ", 6, '<mark>', '</mark>', '...', 64) AS content"
  143. stmt := "SELECT " + projections + " FROM " + table + " WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
  144. stmt += ") AND ext IN " + typeFilter
  145. stmt += " " + orderBy
  146. stmt += " LIMIT " + strconv.Itoa(pageSize) + " OFFSET " + strconv.Itoa((page-1)*pageSize)
  147. assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
  148. ret = fromSQLAssetContents(&assetContents, beforeLen)
  149. if 1 > len(ret) {
  150. ret = []*AssetContent{}
  151. }
  152. matchedAssetCount = fullTextSearchAssetContentCount(query, typeFilter)
  153. return
  154. }
  155. func searchAssetContentBySQL(stmt string, beforeLen, page, pageSize int) (ret []*AssetContent, matchedAssetCount int) {
  156. stmt = gulu.Str.RemoveInvisible(stmt)
  157. stmt = strings.TrimSpace(stmt)
  158. assetContents := sql.SelectAssetContentsRawStmt(stmt, page, pageSize)
  159. ret = fromSQLAssetContents(&assetContents, beforeLen)
  160. if 1 > len(ret) {
  161. ret = []*AssetContent{}
  162. return
  163. }
  164. stmt = strings.ToLower(stmt)
  165. stmt = strings.ReplaceAll(stmt, "select * ", "select COUNT(path) AS `assets` ")
  166. stmt = removeLimitClause(stmt)
  167. result, _ := sql.QueryAssetContentNoLimit(stmt)
  168. if 1 > len(ret) {
  169. return
  170. }
  171. matchedAssetCount = int(result[0]["assets"].(int64))
  172. return
  173. }
  174. func fullTextSearchAssetContentCount(query, typeFilter string) (matchedAssetCount int) {
  175. query = gulu.Str.RemoveInvisible(query)
  176. table := "asset_contents_fts_case_insensitive"
  177. stmt := "SELECT COUNT(path) AS `assets` FROM `" + table + "` WHERE (`" + table + "` MATCH '" + buildAssetContentColumnFilter() + ":(" + query + ")'"
  178. stmt += ") AND ext IN " + typeFilter
  179. result, _ := sql.QueryAssetContentNoLimit(stmt)
  180. if 1 > len(result) {
  181. return
  182. }
  183. matchedAssetCount = int(result[0]["assets"].(int64))
  184. return
  185. }
  186. func fromSQLAssetContents(assetContents *[]*sql.AssetContent, beforeLen int) (ret []*AssetContent) {
  187. ret = []*AssetContent{}
  188. for _, assetContent := range *assetContents {
  189. ret = append(ret, fromSQLAssetContent(assetContent, beforeLen))
  190. }
  191. return
  192. }
  193. func fromSQLAssetContent(assetContent *sql.AssetContent, beforeLen int) *AssetContent {
  194. return &AssetContent{
  195. ID: assetContent.ID,
  196. Name: assetContent.Name,
  197. Ext: assetContent.Ext,
  198. Path: assetContent.Path,
  199. Size: assetContent.Size,
  200. HSize: humanize.Bytes(uint64(assetContent.Size)),
  201. Updated: assetContent.Updated,
  202. Content: assetContent.Content,
  203. }
  204. }
  205. func buildAssetContentColumnFilter() string {
  206. return "{name content}"
  207. }
  208. func buildAssetContentTypeFilter(types map[string]bool) string {
  209. if 0 == len(types) {
  210. return ""
  211. }
  212. var buf bytes.Buffer
  213. buf.WriteString("(")
  214. for k, _ := range types {
  215. buf.WriteString("'")
  216. buf.WriteString(k)
  217. buf.WriteString("',")
  218. }
  219. buf.Truncate(buf.Len() - 1)
  220. buf.WriteString(")")
  221. return buf.String()
  222. }
  223. func buildAssetContentOrderBy(orderBy int) string {
  224. switch orderBy {
  225. case 0:
  226. return "ORDER BY rank DESC"
  227. case 1:
  228. return "ORDER BY updated ASC"
  229. case 2:
  230. return "ORDER BY updated DESC"
  231. default:
  232. return "ORDER BY rank DESC"
  233. }
  234. }
// assetContentSearcher is the package-wide searcher whose parser table is used
// by IndexAssetContent and whose FullIndex is run by fullReindexAssetContent.
var assetContentSearcher = NewAssetsSearcher()
  236. func IndexAssetContent(absPath string) {
  237. assetsDir := util.GetDataAssetsAbsPath()
  238. ext := strings.ToLower(filepath.Ext(absPath))
  239. parser, found := assetContentSearcher.Parsers[ext]
  240. if !found {
  241. return
  242. }
  243. result := parser.Parse(absPath)
  244. if nil == result {
  245. return
  246. }
  247. info, err := os.Stat(absPath)
  248. if nil != err {
  249. logging.LogErrorf("stat [%s] failed: %s", absPath, err)
  250. return
  251. }
  252. p := "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
  253. assetContents := []*sql.AssetContent{
  254. {
  255. ID: ast.NewNodeID(),
  256. Name: util.RemoveID(filepath.Base(p)),
  257. Ext: ext,
  258. Path: p,
  259. Size: info.Size(),
  260. Updated: info.ModTime().Unix(),
  261. Content: result.Content,
  262. },
  263. }
  264. sql.DeleteAssetContentsByPathQueue(p)
  265. sql.IndexAssetContentsQueue(assetContents)
  266. }
  267. func ReindexAssetContent() {
  268. task.AppendTask(task.AssetContentDatabaseIndexFull, fullReindexAssetContent)
  269. return
  270. }
  271. func fullReindexAssetContent() {
  272. util.PushMsg(Conf.Language(216), 7*1000)
  273. sql.InitAssetContentDatabase(true)
  274. assetContentSearcher.FullIndex()
  275. return
  276. }
// init registers the asset content event subscriptions when the package is
// loaded.
func init() {
	subscribeSQLAssetContentEvents()
}
  280. func subscribeSQLAssetContentEvents() {
  281. eventbus.Subscribe(util.EvtSQLAssetContentRebuild, func() {
  282. ReindexAssetContent()
  283. })
  284. }
  285. var (
  286. AssetsSearchEnabled = true
  287. )
  288. type AssetsSearcher struct {
  289. Parsers map[string]AssetParser
  290. lock *sync.Mutex
  291. }
  292. func (searcher *AssetsSearcher) FullIndex() {
  293. assetsDir := util.GetDataAssetsAbsPath()
  294. if !gulu.File.IsDir(assetsDir) {
  295. return
  296. }
  297. var results []*AssetParseResult
  298. filepath.Walk(assetsDir, func(absPath string, info fs.FileInfo, err error) error {
  299. if nil != err {
  300. logging.LogErrorf("walk dir [%s] failed: %s", absPath, err)
  301. return err
  302. }
  303. if info.IsDir() {
  304. return nil
  305. }
  306. ext := strings.ToLower(filepath.Ext(absPath))
  307. parser, found := searcher.Parsers[ext]
  308. if !found {
  309. return nil
  310. }
  311. result := parser.Parse(absPath)
  312. if nil == result {
  313. return nil
  314. }
  315. result.Path = "assets" + filepath.ToSlash(strings.TrimPrefix(absPath, assetsDir))
  316. result.Size = info.Size()
  317. result.Updated = info.ModTime().Unix()
  318. results = append(results, result)
  319. return nil
  320. })
  321. var assetContents []*sql.AssetContent
  322. for _, result := range results {
  323. assetContents = append(assetContents, &sql.AssetContent{
  324. ID: ast.NewNodeID(),
  325. Name: util.RemoveID(filepath.Base(result.Path)),
  326. Ext: strings.ToLower(filepath.Ext(result.Path)),
  327. Path: result.Path,
  328. Size: result.Size,
  329. Updated: result.Updated,
  330. Content: result.Content,
  331. })
  332. }
  333. sql.IndexAssetContentsQueue(assetContents)
  334. }
  335. func NewAssetsSearcher() *AssetsSearcher {
  336. return &AssetsSearcher{
  337. Parsers: map[string]AssetParser{
  338. ".txt": &TxtAssetParser{},
  339. ".md": &TxtAssetParser{},
  340. ".markdown": &TxtAssetParser{},
  341. ".docx": &DocxAssetParser{},
  342. ".pptx": &PptxAssetParser{},
  343. ".xlsx": &XlsxAssetParser{},
  344. },
  345. lock: &sync.Mutex{},
  346. }
  347. }
  348. type AssetParseResult struct {
  349. Path string
  350. Size int64
  351. Updated int64
  352. Content string
  353. }
  354. type AssetParser interface {
  355. Parse(absPath string) *AssetParseResult
  356. }
  357. type TxtAssetParser struct {
  358. }
  359. func (parser *TxtAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  360. if !strings.HasSuffix(strings.ToLower(absPath), ".txt") {
  361. return
  362. }
  363. data, err := filelock.ReadFile(absPath)
  364. if nil != err {
  365. logging.LogErrorf("read file [%s] failed: %s", absPath, err)
  366. return
  367. }
  368. content := normalizeAssetContent(string(data))
  369. ret = &AssetParseResult{
  370. Content: content,
  371. }
  372. return
  373. }
  374. func normalizeAssetContent(content string) (ret string) {
  375. ret = strings.Join(strings.Fields(content), " ")
  376. return
  377. }
  378. func copyTempAsset(absPath string) (ret string) {
  379. dir := filepath.Join(util.TempDir, "convert", "asset_content")
  380. if err := os.MkdirAll(dir, 0755); nil != err {
  381. logging.LogErrorf("mkdir [%s] failed: [%s]", dir, err)
  382. return
  383. }
  384. ret = filepath.Join(dir, gulu.Rand.String(7)+".docx")
  385. if err := filelock.Copy(absPath, ret); nil != err {
  386. logging.LogErrorf("copy [%s] to [%s] failed: [%s]", absPath, ret, err)
  387. return
  388. }
  389. return
  390. }
  391. type DocxAssetParser struct {
  392. }
  393. func (parser *DocxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  394. if !strings.HasSuffix(strings.ToLower(absPath), ".docx") {
  395. return
  396. }
  397. if !gulu.File.IsExist(absPath) {
  398. return
  399. }
  400. tmp := copyTempAsset(absPath)
  401. if "" == tmp {
  402. return
  403. }
  404. defer os.RemoveAll(tmp)
  405. f, err := os.Open(tmp)
  406. if nil != err {
  407. logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
  408. return
  409. }
  410. defer f.Close()
  411. data, _, err := docconv.ConvertDocx(f)
  412. if nil != err {
  413. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  414. return
  415. }
  416. var content = normalizeAssetContent(data)
  417. ret = &AssetParseResult{
  418. Content: content,
  419. }
  420. return
  421. }
  422. type PptxAssetParser struct {
  423. }
  424. func (parser *PptxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  425. if !strings.HasSuffix(strings.ToLower(absPath), ".pptx") {
  426. return
  427. }
  428. if !gulu.File.IsExist(absPath) {
  429. return
  430. }
  431. tmp := copyTempAsset(absPath)
  432. if "" == tmp {
  433. return
  434. }
  435. defer os.RemoveAll(tmp)
  436. f, err := os.Open(tmp)
  437. if nil != err {
  438. logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
  439. return
  440. }
  441. defer f.Close()
  442. data, _, err := docconv.ConvertPptx(f)
  443. if nil != err {
  444. logging.LogErrorf("convert [%s] failed: [%s]", tmp, err)
  445. return
  446. }
  447. var content = normalizeAssetContent(data)
  448. ret = &AssetParseResult{
  449. Content: content,
  450. }
  451. return
  452. }
  453. type XlsxAssetParser struct {
  454. }
  455. func (parser *XlsxAssetParser) Parse(absPath string) (ret *AssetParseResult) {
  456. if !strings.HasSuffix(strings.ToLower(absPath), ".xlsx") {
  457. return
  458. }
  459. if !gulu.File.IsExist(absPath) {
  460. return
  461. }
  462. tmp := copyTempAsset(absPath)
  463. if "" == tmp {
  464. return
  465. }
  466. defer os.RemoveAll(tmp)
  467. x, err := excelize.OpenFile(tmp)
  468. if nil != err {
  469. logging.LogErrorf("open [%s] failed: [%s]", tmp, err)
  470. return
  471. }
  472. defer x.Close()
  473. buf := bytes.Buffer{}
  474. sheetMap := x.GetSheetMap()
  475. for _, sheetName := range sheetMap {
  476. rows, getErr := x.GetRows(sheetName)
  477. if nil != getErr {
  478. logging.LogErrorf("get rows from sheet [%s] failed: [%s]", sheetName, getErr)
  479. return
  480. }
  481. for _, row := range rows {
  482. for _, colCell := range row {
  483. buf.WriteString(colCell + " ")
  484. }
  485. }
  486. }
  487. var content = normalizeAssetContent(buf.String())
  488. ret = &AssetParseResult{
  489. Content: content,
  490. }
  491. return
  492. }