rss.go 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. package feed
  2. import (
  3. "context"
  4. "fmt"
  5. "html"
  6. "log/slog"
  7. "net/url"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. "time"
  12. "github.com/mmcdole/gofeed"
  13. )
  14. type RSSFeedItem struct {
  15. ChannelName string
  16. ChannelURL string
  17. Title string
  18. Link string
  19. ImageURL string
  20. Categories []string
  21. Description string
  22. PublishedAt time.Time
  23. }
  24. // doesn't cover all cases but works the vast majority of the time
  25. var htmlTagsWithAttributesPattern = regexp.MustCompile(`<\/?[a-zA-Z0-9-]+ *(?:[a-zA-Z-]+=(?:"|').*?(?:"|') ?)* *\/?>`)
  26. var sequentialWhitespacePattern = regexp.MustCompile(`\s+`)
  27. func sanitizeFeedDescription(description string) string {
  28. if description == "" {
  29. return ""
  30. }
  31. description = strings.ReplaceAll(description, "\n", " ")
  32. description = htmlTagsWithAttributesPattern.ReplaceAllString(description, "")
  33. description = sequentialWhitespacePattern.ReplaceAllString(description, " ")
  34. description = strings.TrimSpace(description)
  35. description = html.UnescapeString(description)
  36. return description
  37. }
  38. type RSSFeedRequest struct {
  39. Url string `yaml:"url"`
  40. Title string `yaml:"title"`
  41. HideCategories bool `yaml:"hide-categories"`
  42. HideDescription bool `yaml:"hide-description"`
  43. ItemLinkPrefix string `yaml:"item-link-prefix"`
  44. }
  45. type RSSFeedItems []RSSFeedItem
  46. func (f RSSFeedItems) SortByNewest() RSSFeedItems {
  47. sort.Slice(f, func(i, j int) bool {
  48. return f[i].PublishedAt.After(f[j].PublishedAt)
  49. })
  50. return f
  51. }
  52. var feedParser = gofeed.NewParser()
  53. func getItemsFromRSSFeedTask(request RSSFeedRequest) ([]RSSFeedItem, error) {
  54. ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
  55. defer cancel()
  56. feed, err := feedParser.ParseURLWithContext(request.Url, ctx)
  57. if err != nil {
  58. return nil, err
  59. }
  60. items := make(RSSFeedItems, 0, len(feed.Items))
  61. for i := range feed.Items {
  62. item := feed.Items[i]
  63. rssItem := RSSFeedItem{
  64. ChannelURL: feed.Link,
  65. Title: item.Title,
  66. }
  67. if request.ItemLinkPrefix != "" {
  68. rssItem.Link = request.ItemLinkPrefix + item.Link
  69. } else if strings.HasPrefix(item.Link, "http://") || strings.HasPrefix(item.Link, "https://") {
  70. rssItem.Link = item.Link
  71. } else {
  72. parsedUrl, err := url.Parse(feed.Link)
  73. if err != nil {
  74. parsedUrl, err = url.Parse(request.Url)
  75. }
  76. if err == nil {
  77. var link string
  78. if item.Link[0] == '/' {
  79. link = item.Link
  80. } else {
  81. link = "/" + item.Link
  82. }
  83. rssItem.Link = parsedUrl.Scheme + "://" + parsedUrl.Host + link
  84. }
  85. }
  86. if !request.HideDescription && item.Description != "" {
  87. description, _ := limitStringLength(item.Description, 1000)
  88. description = sanitizeFeedDescription(description)
  89. description, limited := limitStringLength(description, 200)
  90. if limited {
  91. description += "…"
  92. }
  93. rssItem.Description = description
  94. }
  95. if !request.HideCategories {
  96. var categories = make([]string, 0, 6)
  97. for _, category := range item.Categories {
  98. if len(categories) == 6 {
  99. break
  100. }
  101. if len(category) == 0 || len(category) > 30 {
  102. continue
  103. }
  104. categories = append(categories, category)
  105. }
  106. rssItem.Categories = categories
  107. }
  108. if request.Title != "" {
  109. rssItem.ChannelName = request.Title
  110. } else {
  111. rssItem.ChannelName = feed.Title
  112. }
  113. if item.Image != nil {
  114. rssItem.ImageURL = item.Image.URL
  115. } else if feed.Image != nil {
  116. rssItem.ImageURL = feed.Image.URL
  117. }
  118. if item.PublishedParsed != nil {
  119. rssItem.PublishedAt = *item.PublishedParsed
  120. } else {
  121. rssItem.PublishedAt = time.Now()
  122. }
  123. items = append(items, rssItem)
  124. }
  125. return items, nil
  126. }
  127. func GetItemsFromRSSFeeds(requests []RSSFeedRequest) (RSSFeedItems, error) {
  128. job := newJob(getItemsFromRSSFeedTask, requests).withWorkers(10)
  129. feeds, errs, err := workerPoolDo(job)
  130. if err != nil {
  131. return nil, fmt.Errorf("%w: %v", ErrNoContent, err)
  132. }
  133. failed := 0
  134. entries := make(RSSFeedItems, 0, len(feeds)*10)
  135. for i := range feeds {
  136. if errs[i] != nil {
  137. failed++
  138. slog.Error("failed to get rss feed", "error", errs[i], "url", requests[i].Url)
  139. continue
  140. }
  141. entries = append(entries, feeds[i]...)
  142. }
  143. if len(entries) == 0 {
  144. return nil, ErrNoContent
  145. }
  146. entries.SortByNewest()
  147. if failed > 0 {
  148. return entries, fmt.Errorf("%w: missing %d RSS feeds", ErrPartialContent, failed)
  149. }
  150. return entries, nil
  151. }