// rss.go (3.7 KB)

  1. package feed
import (
	"context"
	"fmt"
	"html"
	"log/slog"
	"regexp"
	"sort"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/mmcdole/gofeed"
)
  13. type RSSFeedItem struct {
  14. ChannelName string
  15. ChannelURL string
  16. Title string
  17. Link string
  18. ImageURL string
  19. Categories []string
  20. Description string
  21. PublishedAt time.Time
  22. }
  23. // doesn't cover all cases but works the vast majority of the time
  24. var htmlTagsWithAttributesPattern = regexp.MustCompile(`<\/?[a-zA-Z0-9-]+ *(?:[a-zA-Z-]+=(?:"|').*?(?:"|') ?)* *\/?>`)
  25. var sequentialWhitespacePattern = regexp.MustCompile(`\s+`)
  26. func sanitizeFeedDescription(description string) string {
  27. if description == "" {
  28. return ""
  29. }
  30. description = strings.ReplaceAll(description, "\n", " ")
  31. description = htmlTagsWithAttributesPattern.ReplaceAllString(description, "")
  32. description = sequentialWhitespacePattern.ReplaceAllString(description, " ")
  33. description = strings.TrimSpace(description)
  34. description = html.UnescapeString(description)
  35. return description
  36. }
  37. type RSSFeedRequest struct {
  38. Url string `yaml:"url"`
  39. Title string `yaml:"title"`
  40. HideCategories bool `yaml:"hide-categories"`
  41. HideDescription bool `yaml:"hide-description"`
  42. }
  43. type RSSFeedItems []RSSFeedItem
  44. func (f RSSFeedItems) SortByNewest() RSSFeedItems {
  45. sort.Slice(f, func(i, j int) bool {
  46. return f[i].PublishedAt.After(f[j].PublishedAt)
  47. })
  48. return f
  49. }
  50. var feedParser = gofeed.NewParser()
  51. func getItemsFromRSSFeedTask(request RSSFeedRequest) ([]RSSFeedItem, error) {
  52. ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
  53. defer cancel()
  54. feed, err := feedParser.ParseURLWithContext(request.Url, ctx)
  55. if err != nil {
  56. return nil, err
  57. }
  58. items := make(RSSFeedItems, 0, len(feed.Items))
  59. for i := range feed.Items {
  60. item := feed.Items[i]
  61. rssItem := RSSFeedItem{
  62. ChannelURL: feed.Link,
  63. Title: item.Title,
  64. Link: item.Link,
  65. }
  66. if !request.HideDescription && item.Description != "" {
  67. description, _ := limitStringLength(item.Description, 1000)
  68. description = sanitizeFeedDescription(description)
  69. description, limited := limitStringLength(description, 200)
  70. if limited {
  71. description += "…"
  72. }
  73. rssItem.Description = description
  74. }
  75. if !request.HideCategories {
  76. var categories = make([]string, 0, 6)
  77. for _, category := range item.Categories {
  78. if len(categories) == 6 {
  79. break
  80. }
  81. if len(category) == 0 || len(category) > 30 {
  82. continue
  83. }
  84. categories = append(categories, category)
  85. }
  86. rssItem.Categories = categories
  87. }
  88. if request.Title != "" {
  89. rssItem.ChannelName = request.Title
  90. } else {
  91. rssItem.ChannelName = feed.Title
  92. }
  93. if item.Image != nil {
  94. rssItem.ImageURL = item.Image.URL
  95. } else if feed.Image != nil {
  96. rssItem.ImageURL = feed.Image.URL
  97. }
  98. if item.PublishedParsed != nil {
  99. rssItem.PublishedAt = *item.PublishedParsed
  100. } else {
  101. rssItem.PublishedAt = time.Now()
  102. }
  103. items = append(items, rssItem)
  104. }
  105. return items, nil
  106. }
  107. func GetItemsFromRSSFeeds(requests []RSSFeedRequest) (RSSFeedItems, error) {
  108. job := newJob(getItemsFromRSSFeedTask, requests).withWorkers(10)
  109. feeds, errs, err := workerPoolDo(job)
  110. if err != nil {
  111. return nil, fmt.Errorf("%w: %v", ErrNoContent, err)
  112. }
  113. failed := 0
  114. entries := make(RSSFeedItems, 0, len(feeds)*10)
  115. for i := range feeds {
  116. if errs[i] != nil {
  117. failed++
  118. slog.Error("failed to get rss feed", "error", errs[i], "url", requests[i].Url)
  119. continue
  120. }
  121. entries = append(entries, feeds[i]...)
  122. }
  123. if len(entries) == 0 {
  124. return nil, ErrNoContent
  125. }
  126. entries.SortByNewest()
  127. if failed > 0 {
  128. return entries, fmt.Errorf("%w: missing %d RSS feeds", ErrPartialContent, failed)
  129. }
  130. return entries, nil
  131. }