rss.go 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. package feed
  2. import (
  3. "context"
  4. "fmt"
  5. "html"
  6. "log/slog"
  7. "net/url"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. "time"
  12. "github.com/mmcdole/gofeed"
  13. gofeedext "github.com/mmcdole/gofeed/extensions"
  14. )
// RSSFeedItem is a single feed entry in the form produced by
// getItemsFromRSSFeedTask.
type RSSFeedItem struct {
	// ChannelName is the title configured on the request, or the feed's own
	// title when the request does not set one.
	ChannelName string
	// ChannelURL is the link reported by the feed itself.
	ChannelURL string
	// Title is the entry's title as reported by the feed.
	Title string
	// Link is the entry's URL. When the request supplies an item link prefix it
	// is prepended to the entry's link; otherwise links that do not start with
	// "http://" or "https://" are made absolute using the feed's scheme and host.
	Link string
	// ImageURL is the entry's image, falling back to a thumbnail found in the
	// entry's "media" extensions and then to the feed's image. It is empty
	// when none of these exist.
	ImageURL string
	// Categories holds at most 6 of the entry's categories; empty categories
	// and those longer than 30 bytes are skipped. It is nil when the request
	// hides categories.
	Categories []string
	// Description is the entry's description after sanitizeFeedDescription and
	// shortening, with "…" appended when it was cut. It is empty when the
	// request hides descriptions or the entry has none.
	Description string
	// PublishedAt is the entry's parsed publication time, or the time the
	// entry was processed when the feed does not provide one.
	PublishedAt time.Time
}
  25. // doesn't cover all cases but works the vast majority of the time
  26. var htmlTagsWithAttributesPattern = regexp.MustCompile(`<\/?[a-zA-Z0-9-]+ *(?:[a-zA-Z-]+=(?:"|').*?(?:"|') ?)* *\/?>`)
  27. var sequentialWhitespacePattern = regexp.MustCompile(`\s+`)
  28. func sanitizeFeedDescription(description string) string {
  29. if description == "" {
  30. return ""
  31. }
  32. description = strings.ReplaceAll(description, "\n", " ")
  33. description = htmlTagsWithAttributesPattern.ReplaceAllString(description, "")
  34. description = sequentialWhitespacePattern.ReplaceAllString(description, " ")
  35. description = strings.TrimSpace(description)
  36. description = html.UnescapeString(description)
  37. return description
  38. }
// RSSFeedRequest describes one feed to fetch and how its entries should be
// presented. Fields are populated from YAML using the keys in the struct tags.
type RSSFeedRequest struct {
	// Url is the address the feed is fetched from.
	Url string `yaml:"url"`
	// Title, when non-empty, is used as the channel name of every item in
	// place of the feed's own title.
	Title string `yaml:"title"`
	// HideCategories leaves the categories of every item unset.
	HideCategories bool `yaml:"hide-categories"`
	// HideDescription leaves the description of every item empty.
	HideDescription bool `yaml:"hide-description"`
	// ItemLinkPrefix, when non-empty, is prepended verbatim to every item's
	// link instead of resolving the link against the feed's URL.
	ItemLinkPrefix string `yaml:"item-link-prefix"`
}
  46. type RSSFeedItems []RSSFeedItem
  47. func (f RSSFeedItems) SortByNewest() RSSFeedItems {
  48. sort.Slice(f, func(i, j int) bool {
  49. return f[i].PublishedAt.After(f[j].PublishedAt)
  50. })
  51. return f
  52. }
// feedParser is the package-level gofeed parser that getItemsFromRSSFeedTask
// uses to fetch and parse feeds.
// NOTE(review): the same parser is used by every task started through the
// worker pool in GetItemsFromRSSFeeds — presumably it is safe for concurrent
// use; confirm against the gofeed documentation.
var feedParser = gofeed.NewParser()
  54. func getItemsFromRSSFeedTask(request RSSFeedRequest) ([]RSSFeedItem, error) {
  55. ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
  56. defer cancel()
  57. feed, err := feedParser.ParseURLWithContext(request.Url, ctx)
  58. if err != nil {
  59. return nil, err
  60. }
  61. items := make(RSSFeedItems, 0, len(feed.Items))
  62. for i := range feed.Items {
  63. item := feed.Items[i]
  64. rssItem := RSSFeedItem{
  65. ChannelURL: feed.Link,
  66. Title: item.Title,
  67. }
  68. if request.ItemLinkPrefix != "" {
  69. rssItem.Link = request.ItemLinkPrefix + item.Link
  70. } else if strings.HasPrefix(item.Link, "http://") || strings.HasPrefix(item.Link, "https://") {
  71. rssItem.Link = item.Link
  72. } else {
  73. parsedUrl, err := url.Parse(feed.Link)
  74. if err != nil {
  75. parsedUrl, err = url.Parse(request.Url)
  76. }
  77. if err == nil {
  78. var link string
  79. if item.Link[0] == '/' {
  80. link = item.Link
  81. } else {
  82. link = "/" + item.Link
  83. }
  84. rssItem.Link = parsedUrl.Scheme + "://" + parsedUrl.Host + link
  85. }
  86. }
  87. if !request.HideDescription && item.Description != "" {
  88. description, _ := limitStringLength(item.Description, 1000)
  89. description = sanitizeFeedDescription(description)
  90. description, limited := limitStringLength(description, 200)
  91. if limited {
  92. description += "…"
  93. }
  94. rssItem.Description = description
  95. }
  96. if !request.HideCategories {
  97. var categories = make([]string, 0, 6)
  98. for _, category := range item.Categories {
  99. if len(categories) == 6 {
  100. break
  101. }
  102. if len(category) == 0 || len(category) > 30 {
  103. continue
  104. }
  105. categories = append(categories, category)
  106. }
  107. rssItem.Categories = categories
  108. }
  109. if request.Title != "" {
  110. rssItem.ChannelName = request.Title
  111. } else {
  112. rssItem.ChannelName = feed.Title
  113. }
  114. if item.Image != nil {
  115. rssItem.ImageURL = item.Image.URL
  116. } else if url := findThumbnailInItemExtensions(item); url != "" {
  117. rssItem.ImageURL = url
  118. } else if feed.Image != nil {
  119. rssItem.ImageURL = feed.Image.URL
  120. }
  121. if item.PublishedParsed != nil {
  122. rssItem.PublishedAt = *item.PublishedParsed
  123. } else {
  124. rssItem.PublishedAt = time.Now()
  125. }
  126. items = append(items, rssItem)
  127. }
  128. return items, nil
  129. }
  130. func recursiveFindThumbnailInExtensions(extensions map[string][]gofeedext.Extension) string {
  131. for _, exts := range extensions {
  132. for _, ext := range exts {
  133. if ext.Name == "thumbnail" || ext.Name == "image" {
  134. if url, ok := ext.Attrs["url"]; ok {
  135. return url
  136. }
  137. }
  138. if ext.Children != nil {
  139. if url := recursiveFindThumbnailInExtensions(ext.Children); url != "" {
  140. return url
  141. }
  142. }
  143. }
  144. }
  145. return ""
  146. }
  147. func findThumbnailInItemExtensions(item *gofeed.Item) string {
  148. media, ok := item.Extensions["media"]
  149. if !ok {
  150. return ""
  151. }
  152. return recursiveFindThumbnailInExtensions(media)
  153. }
  154. func GetItemsFromRSSFeeds(requests []RSSFeedRequest) (RSSFeedItems, error) {
  155. job := newJob(getItemsFromRSSFeedTask, requests).withWorkers(10)
  156. feeds, errs, err := workerPoolDo(job)
  157. if err != nil {
  158. return nil, fmt.Errorf("%w: %v", ErrNoContent, err)
  159. }
  160. failed := 0
  161. entries := make(RSSFeedItems, 0, len(feeds)*10)
  162. for i := range feeds {
  163. if errs[i] != nil {
  164. failed++
  165. slog.Error("failed to get rss feed", "error", errs[i], "url", requests[i].Url)
  166. continue
  167. }
  168. entries = append(entries, feeds[i]...)
  169. }
  170. if len(entries) == 0 {
  171. return nil, ErrNoContent
  172. }
  173. entries.SortByNewest()
  174. if failed > 0 {
  175. return entries, fmt.Errorf("%w: missing %d RSS feeds", ErrPartialContent, failed)
  176. }
  177. return entries, nil
  178. }