123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- package feed
- import (
- "context"
- "fmt"
- "html"
- "log/slog"
- "regexp"
- "sort"
- "strings"
- "time"
- "github.com/mmcdole/gofeed"
- )
// RSSFeedItem is a single feed entry in the flattened form produced by the
// fetch code in this file.
type RSSFeedItem struct {
	// ChannelName is the display name of the feed the item came from.
	ChannelName string

	// ChannelURL is the link advertised by the feed itself.
	ChannelURL string

	// Title is the item's headline as given by the feed.
	Title string

	// Link is the URL of the item.
	Link string

	// ImageURL is the item's image, or the feed's image when the item has none.
	ImageURL string

	// Categories holds the item's category labels; it is left nil when the
	// request asked for categories to be hidden.
	Categories []string

	// Description is the sanitized, length-limited summary; it is empty when
	// the request asked for descriptions to be hidden.
	Description string

	// PublishedAt is the item's publication time. The fetch code substitutes
	// another timestamp when the feed does not provide one.
	PublishedAt time.Time
}
var (
	// htmlTagsWithAttributesPattern matches opening, closing and self-closing
	// HTML tags, optionally carrying quoted attributes. It doesn't cover all
	// cases but works the vast majority of the time.
	htmlTagsWithAttributesPattern = regexp.MustCompile(`<\/?[a-zA-Z0-9-]+ *(?:[a-zA-Z-]+=(?:"|').*?(?:"|') ?)* *\/?>`)

	// sequentialWhitespacePattern matches any run of (ASCII) whitespace.
	sequentialWhitespacePattern = regexp.MustCompile(`\s+`)
)

// sanitizeFeedDescription turns an HTML feed description into a single line of
// plain text: tags are removed, HTML entities are decoded, runs of whitespace
// are collapsed to one space and surrounding whitespace is trimmed.
//
// An empty input yields an empty string.
func sanitizeFeedDescription(description string) string {
	if description == "" {
		return ""
	}

	// Newlines go first: `.` and ` *` in the tag pattern do not match "\n",
	// so a tag broken across lines would otherwise survive.
	description = strings.ReplaceAll(description, "\n", " ")
	description = htmlTagsWithAttributesPattern.ReplaceAllString(description, "")

	// Decode entities only after the tags are gone, so escaped markup such as
	// "&lt;b&gt;" is kept as literal text, but before normalizing whitespace,
	// so whitespace produced by decoding (e.g. "&#10;" or a trailing "&nbsp;")
	// is collapsed and trimmed like any other.
	description = html.UnescapeString(description)
	description = sequentialWhitespacePattern.ReplaceAllString(description, " ")

	return strings.TrimSpace(description)
}
// RSSFeedRequest describes one feed to fetch together with its per-feed
// display options. The struct tags give the YAML key of each field.
type RSSFeedRequest struct {
	// Url is the address the feed is fetched from.
	Url string `yaml:"url"`

	// Title, when non-empty, replaces the feed's own title as the channel name
	// of every item.
	Title string `yaml:"title"`

	// HideCategories skips collecting item categories.
	HideCategories bool `yaml:"hide-categories"`

	// HideDescription skips building item descriptions.
	HideDescription bool `yaml:"hide-description"`
}
- type RSSFeedItems []RSSFeedItem
- func (f RSSFeedItems) SortByNewest() RSSFeedItems {
- sort.Slice(f, func(i, j int) bool {
- return f[i].PublishedAt.After(f[j].PublishedAt)
- })
- return f
- }
- var feedParser = gofeed.NewParser()
- func getItemsFromRSSFeedTask(request RSSFeedRequest) ([]RSSFeedItem, error) {
- ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
- defer cancel()
- feed, err := feedParser.ParseURLWithContext(request.Url, ctx)
- if err != nil {
- return nil, err
- }
- items := make(RSSFeedItems, 0, len(feed.Items))
- for i := range feed.Items {
- item := feed.Items[i]
- rssItem := RSSFeedItem{
- ChannelURL: feed.Link,
- Title: item.Title,
- Link: item.Link,
- }
- if !request.HideDescription && item.Description != "" {
- description, _ := limitStringLength(item.Description, 1000)
- description = sanitizeFeedDescription(description)
- description, limited := limitStringLength(description, 200)
- if limited {
- description += "…"
- }
- rssItem.Description = description
- }
- if !request.HideCategories {
- var categories = make([]string, 0, 6)
- for _, category := range item.Categories {
- if len(categories) == 6 {
- break
- }
- if len(category) == 0 || len(category) > 30 {
- continue
- }
- categories = append(categories, category)
- }
- rssItem.Categories = categories
- }
- if request.Title != "" {
- rssItem.ChannelName = request.Title
- } else {
- rssItem.ChannelName = feed.Title
- }
- if item.Image != nil {
- rssItem.ImageURL = item.Image.URL
- } else if feed.Image != nil {
- rssItem.ImageURL = feed.Image.URL
- }
- if item.PublishedParsed != nil {
- rssItem.PublishedAt = *item.PublishedParsed
- } else {
- rssItem.PublishedAt = time.Now()
- }
- items = append(items, rssItem)
- }
- return items, nil
- }
- func GetItemsFromRSSFeeds(requests []RSSFeedRequest) (RSSFeedItems, error) {
- job := newJob(getItemsFromRSSFeedTask, requests).withWorkers(10)
- feeds, errs, err := workerPoolDo(job)
- if err != nil {
- return nil, fmt.Errorf("%w: %v", ErrNoContent, err)
- }
- failed := 0
- entries := make(RSSFeedItems, 0, len(feeds)*10)
- for i := range feeds {
- if errs[i] != nil {
- failed++
- slog.Error("failed to get rss feed", "error", errs[i], "url", requests[i].Url)
- continue
- }
- entries = append(entries, feeds[i]...)
- }
- if len(entries) == 0 {
- return nil, ErrNoContent
- }
- entries.SortByNewest()
- if failed > 0 {
- return entries, fmt.Errorf("%w: missing %d RSS feeds", ErrPartialContent, failed)
- }
- return entries, nil
- }
|