// index.ts (4.0 KB)
  1. import { Page } from 'playwright'
  2. import Turndown from 'turndown'
  3. import { LanguageModelV1 } from '@ai-sdk/provider'
  4. import { LlamaModel } from 'node-llama-cpp'
  5. import { z } from 'zod'
  6. import {
  7. generateLlamaCompletions,
  8. generateAISDKCompletions,
  9. streamAISDKCompletions,
  10. } from './models.js'
  11. import cleanup from './cleanup.js'
  /**
   * Options controlling how a page is turned into model input.
   *
   * This is a union discriminated by `format`:
   * - built-in formats (`html`, `text`, `markdown`, `cleanup`) need no extra fields,
   *   and `format` itself may be omitted;
   * - `image` takes an optional `fullPage` flag that is passed to the screenshot call;
   * - `custom` requires a `formatFunction` that produces the content from the page.
   */
  export type ScraperLoadOptions =
    | { format?: 'html' | 'text' | 'markdown' | 'cleanup' }
    | { format: 'image'; fullPage?: boolean }
    | {
        format: 'custom';
        formatFunction: (page: Page) => string | Promise<string>;
      };
  /**
   * Outcome of pre-processing a page.
   *
   * - `url`: the address reported by the page at pre-processing time.
   * - `content`: the page representation produced for the selected format.
   * - `format`: the format value taken from the load options.
   */
  export type ScraperLoadResult = {
    url: string;
    content: string;
    format: ScraperLoadOptions['format'];
  };
  /**
   * Optional model settings that are forwarded, unchanged, to the completion
   * helpers. All fields may be omitted.
   *
   * NOTE(review): how each field is interpreted is decided by the helpers in
   * `./models.js`, which are not visible here — confirm there before relying on
   * specific semantics.
   */
  export type ScraperLLMOptions = {
    prompt?: string;
    temperature?: number;
    maxTokens?: number;
    topP?: number;
  };
  /** Everything accepted by `run`/`stream`: page-loading options combined with model options. */
  export type ScraperRunOptions = ScraperLoadOptions & ScraperLLMOptions;
  /**
   * Extracts schema-shaped data from a Playwright page with a language model.
   *
   * A page is first converted into a string representation (HTML, markdown,
   * readable text, cleaned-up HTML, a base64 screenshot, or caller-defined
   * content) and then handed to the completion helper matching the client type.
   */
  export default class LLMScraper {
    /**
     * @param client - Model used for completions: an AI SDK `LanguageModelV1`
     *   or a local `LlamaModel`.
     */
    constructor(private client: LanguageModelV1 | LlamaModel) {}

    /**
     * Converts the page into the representation selected by `options.format`.
     *
     * A missing `format` is treated as `'html'`, so an options object that only
     * carries model settings still yields content.
     *
     * @param page - Playwright page to read from.
     * @param options - Load options; defaults to the `'html'` format.
     * @returns The page URL, the generated content and the resolved format.
     * @throws Error if the format is not supported, if `formatFunction` is
     *   missing in custom mode, or if no readable text could be extracted.
     */
    private async preprocess(
      page: Page,
      options: ScraperLoadOptions = { format: 'html' }
    ): Promise<ScraperLoadResult> {
      const url = page.url()
      let content: string

      switch (options.format) {
        // `format` is optional on the options type; fall back to raw HTML.
        case undefined:
        case 'html':
          content = await page.content()
          break

        case 'markdown': {
          const body = await page.innerHTML('body')
          content = new Turndown().turndown(body)
          break
        }

        case 'text': {
          // Runs inside the browser: loads Readability from a CDN and parses
          // the live document.
          const readable = await page.evaluate(async () => {
            const readability = await import(
              // @ts-ignore
              'https://cdn.skypack.dev/@mozilla/readability'
            )
            return new readability.Readability(document).parse()
          })
          // parse() yields null when no article content can be extracted.
          if (!readable) {
            throw new Error('Unable to extract readable text from the page')
          }
          content = `Page Title: ${readable.title}\n${readable.textContent}`
          break
        }

        case 'cleanup':
          // The cleanup script mutates the page before its HTML is read back.
          await page.evaluate(cleanup)
          content = await page.content()
          break

        case 'image': {
          const image = await page.screenshot({ fullPage: options.fullPage })
          content = image.toString('base64')
          break
        }

        case 'custom':
          // Guard for untyped (plain JavaScript) callers.
          if (typeof options.formatFunction !== 'function') {
            throw new Error('formatFunction must be provided in custom mode')
          }
          content = await options.formatFunction(page)
          break

        default: {
          // Only reachable when a caller bypasses the type system.
          const unsupported: never = options
          throw new Error(
            `Unsupported format: ${String(
              (unsupported as { format?: unknown }).format
            )}`
          )
        }
      }

      return {
        url,
        content,
        format: options.format ?? 'html',
      }
    }

    /**
     * Generates a completion for an already pre-processed page, dispatching to
     * the llama helper for local models and to the AI SDK helper otherwise.
     *
     * @param page - Result of `preprocess`.
     * @param schema - Zod schema handed to the completion helper.
     * @param options - Run options forwarded to the completion helper.
     */
    private async generateCompletions<T extends z.ZodSchema<any>>(
      page: ScraperLoadResult,
      schema: T,
      options: ScraperRunOptions
    ) {
      if (this.client instanceof LlamaModel) {
        return generateLlamaCompletions<T>(this.client, page, schema, options)
      }
      return generateAISDKCompletions<T>(this.client, page, schema, options)
    }

    /**
     * Streams a completion for an already pre-processed page using the AI SDK.
     *
     * @param page - Result of `preprocess`.
     * @param schema - Zod schema handed to the streaming helper.
     * @param options - Run options forwarded to the streaming helper.
     * @throws Error when the client is a local `LlamaModel`, which has no
     *   streaming support here.
     */
    private async streamCompletions<T extends z.ZodSchema<any>>(
      page: ScraperLoadResult,
      schema: T,
      options: ScraperRunOptions
    ) {
      if (this.client instanceof LlamaModel) {
        throw new Error('Streaming not supported for local models yet')
      }
      return streamAISDKCompletions<T>(this.client, page, schema, options)
    }

    /**
     * Pre-processes the page and generates a completion.
     *
     * @param page - Playwright page to scrape.
     * @param schema - Zod schema handed to the completion helper.
     * @param options - Load and model options; may be omitted.
     */
    async run<T extends z.ZodSchema<any>>(
      page: Page,
      schema: T,
      options: ScraperRunOptions = {}
    ) {
      const preprocessed = await this.preprocess(page, options)
      return this.generateCompletions<T>(preprocessed, schema, options)
    }

    /**
     * Pre-processes the page and streams a completion.
     *
     * @param page - Playwright page to scrape.
     * @param schema - Zod schema handed to the streaming helper.
     * @param options - Load and model options; may be omitted.
     */
    async stream<T extends z.ZodSchema<any>>(
      page: Page,
      schema: T,
      options: ScraperRunOptions = {}
    ) {
      const preprocessed = await this.preprocess(page, options)
      return this.streamCompletions<T>(preprocessed, schema, options)
    }
  }