
Merge pull request #48 from mishushakov/v1.6

v1.6
Mish Ushakov, 2 months ago
Parent
Commit 813b4fa194
14 changed files with 1168 additions and 769 deletions
  1. README.md (+70 -13)
  2. examples/codegen.ts (+17 -11)
  3. examples/local.ts (+0 -34)
  4. examples/ollama.ts (+1 -1)
  5. examples/streaming.ts (+5 -12)
  6. examples/toolUse.ts (+47 -0)
  7. package-lock.json (+693 -471)
  8. package.json (+13 -13)
  9. src/index.ts (+34 -156)
  10. src/models.ts (+39 -58)
  11. src/preprocess.ts (+80 -0)
  12. tests/index.ts (+38 -0)
  13. tests/scraper.test.ts (+123 -0)
  14. vitest.config.ts (+8 -0)

+ 70 - 13
README.md

@@ -5,21 +5,23 @@
 LLM Scraper is a TypeScript library that allows you to extract structured data from **any** webpage using LLMs.
 
 > [!IMPORTANT]
-> [Code-generation](#code-generation) is now supported in LLM Scraper.
+> LLM Scraper was updated to version 1.6.
+> The new version comes with Vercel AI SDK 4 support, JSON Schema, better type-safety, improved code generation and updated examples.
 
 > [!TIP]
 > Under the hood, it uses function calling to convert pages to structured data. You can find more about this approach [here](https://til.simonwillison.net/gpt3/openai-python-functions-data-extraction).
 
 ### Features
 
-- Supports **Local (Ollama, GGUF)**, OpenAI, Vercel AI SDK Providers
-- Schemas defined with Zod
+- Supports GPT, Sonnet, Gemini, Llama, Qwen model series
+- Schemas defined with Zod or JSON Schema
 - Full type-safety with TypeScript
 - Based on Playwright framework
 - Streaming objects
-- **NEW** [Code-generation](#code-generation)
+- [Code-generation](#code-generation)
 - Supports 4 formatting modes:
-  - `html` for loading raw HTML
+  - `html` for loading pre-processed HTML
+  - `raw_html` for loading raw HTML (no processing)
   - `markdown` for loading markdown
   - `text` for loading extracted text (using [Readability.js](https://github.com/mozilla/readability))
   - `image` for loading a screenshot (multi-modal only)
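As a quick illustration of the reworked modes, the format is chosen per call through the options object (a minimal sketch, assuming the scraper, page, and schema setup shown in the examples below):

```ts
// Sketch: pick how the page is serialized before it is sent to the LLM.
// 'html' now means cleaned-up HTML; 'raw_html' is the old unprocessed behavior.
const { data } = await scraper.run(page, schema, {
  format: 'markdown', // or 'html' | 'raw_html' | 'text' | 'image'
})
```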
@@ -50,6 +52,30 @@ LLM Scraper is a TypeScript library that allows you to extract structured data f
    const llm = openai.chat('gpt-4o')
    ```
 
+   **Anthropic**
+
+   ```
+   npm i @ai-sdk/anthropic
+   ```
+
+   ```js
+   import { anthropic } from '@ai-sdk/anthropic'
+
+   const llm = anthropic('claude-3-5-sonnet-20240620')
+   ```
+
+   **Google**
+
+   ```
+   npm i @ai-sdk/google
+   ```
+
+   ```js
+   import { google } from '@ai-sdk/google'
+
+   const llm = google('gemini-1.5-flash')
+   ```
+
    **Groq**
 
    ```
@@ -78,14 +104,6 @@ LLM Scraper is a TypeScript library that allows you to extract structured data f
    const llm = ollama('llama3')
    ```
 
-   **GGUF**
-
-   ```js
-   import { LlamaModel } from 'node-llama-cpp'
-
-   const llm = new LlamaModel({ modelPath: 'model.gguf' })
-   ```
-
 3. Create a new scraper instance provided with the llm:
 
    ```js
@@ -144,6 +162,45 @@ await page.close()
 await browser.close()
 ```
 
+Output
+
+```js
+[
+  {
+    title: "Palette lighting tricks on the Nintendo 64",
+    points: 105,
+    by: "ibobev",
+    commentsURL: "https://news.ycombinator.com/item?id=44014587",
+  },
+  {
+    title: "Push Ifs Up and Fors Down",
+    points: 187,
+    by: "goranmoomin",
+    commentsURL: "https://news.ycombinator.com/item?id=44013157",
+  },
+  {
+    title: "JavaScript's New Superpower: Explicit Resource Management",
+    points: 225,
+    by: "olalonde",
+    commentsURL: "https://news.ycombinator.com/item?id=44012227",
+  },
+  {
+    title: "\"We would be less confidential than Google\" Proton threatens to quit Switzerland",
+    points: 65,
+    by: "taubek",
+    commentsURL: "https://news.ycombinator.com/item?id=44014808",
+  },
+  {
+    title: "OBNC – Oberon-07 Compiler",
+    points: 37,
+    by: "AlexeyBrin",
+    commentsURL: "https://news.ycombinator.com/item?id=44013671",
+  }
+]
+```
+
+More examples can be found in the [examples](./examples) folder.
+
 ## Streaming
 
 Replace your `run` function with `stream` to get a partial object stream (Vercel AI SDK only).
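A minimal sketch of the `stream` variant, assuming the same `page` and `schema` as the run example above:

```ts
// Sketch: `stream` yields partial objects as the LLM fills in the schema.
const { stream } = await scraper.stream(page, schema, {
  format: 'html',
})

for await (const partial of stream) {
  console.log(partial) // progressively completed object
}
```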

+ 17 - 11
examples/codegen.ts

@@ -1,34 +1,40 @@
 import { chromium } from 'playwright'
 import { z } from 'zod'
-import { anthropic } from '@ai-sdk/anthropic'
+import { openai } from '@ai-sdk/openai'
 import LLMScraper from './../src'
 
 // Launch a browser instance
 const browser = await chromium.launch()
 
 // Initialize LLM provider
-const llm = anthropic('claude-3-5-sonnet-20240620')
+const llm = openai('gpt-4o-mini')
 
 // Create a new LLMScraper
 const scraper = new LLMScraper(llm)
 
 // Open new page
 const page = await browser.newPage()
-await page.goto('https://www.bbc.com')
+await page.goto('https://news.ycombinator.com')
 
 // Define schema to extract contents into
 const schema = z.object({
-  news: z.array(
-    z.object({
-      title: z.string(),
-      description: z.string(),
-      url: z.string(),
-    })
-  ),
+  top: z
+    .array(
+      z.object({
+        title: z.string(),
+        points: z.number(),
+        by: z.string(),
+        commentsURL: z.string(),
+      })
+    )
+    .length(5)
+    .describe('Top 5 stories on Hacker News'),
 })
 
 // Generate code and run it on the page
-const { code } = await scraper.generate(page, schema)
+const { code } = await scraper.generate(page, schema, {
+  format: 'raw_html',
+})
 console.log('code', code)
 
 const result = await page.evaluate(code)

+ 0 - 34
examples/local.ts

@@ -1,34 +0,0 @@
-import { chromium } from 'playwright'
-import { LlamaModel } from 'node-llama-cpp'
-import { z } from 'zod'
-import LLMScraper from './../src'
-
-// Launch a browser instance
-const browser = await chromium.launch()
-
-const modelPath =
-  '/Users/mish/jan/models/tinyllama-1.1b/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf'
-
-const llm = new LlamaModel({ modelPath })
-
-// Initialize a new LLMScraper with local model
-const scraper = new LLMScraper(llm)
-
-// Open the page
-const page = await browser.newPage()
-await page.goto('https://example.com')
-
-// Define schema to extract contents into
-const schema = z.object({
-  h1: z.string().describe('The main heading of the page'),
-})
-
-// Run the scraper
-const { data } = await scraper.run(page, schema, {
-  format: 'text',
-})
-
-console.log(data)
-
-await page.close()
-await browser.close()

+ 1 - 1
examples/ollama.ts

@@ -7,7 +7,7 @@ import LLMScraper from './../src'
 const browser = await chromium.launch()
 
 // Initialize LLM provider
-const llm = ollama('llama3')
+const llm = ollama('gemma3:1b')
 
 // Initialize a new LLMScraper with local model
 const scraper = new LLMScraper(llm)

+ 5 - 12
examples/streaming.ts

@@ -18,17 +18,10 @@ await page.goto('https://news.ycombinator.com')
 
 // Define schema to extract contents into
 const schema = z.object({
-  top: z
-    .array(
-      z.object({
-        title: z.string(),
-        points: z.number(),
-        by: z.string(),
-        commentsURL: z.string(),
-      })
-    )
-    .length(5)
-    .describe('Top 5 stories on Hacker News'),
+  title: z.string(),
+  points: z.number(),
+  by: z.string(),
+  commentsURL: z.string(),
 })
 
 // Run the scraper in streaming mode
@@ -38,7 +31,7 @@ const { stream } = await scraper.stream(page, schema, {
 
 // Stream the result from LLM
 for await (const data of stream) {
-  console.log(data.top)
+  console.log(data)
 }
 
 await page.close()
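The schema is now a single story rather than a wrapper object; judging by the streaming array test added in tests/scraper.test.ts below, it is meant to be paired with the new `output: 'array'` option. A sketch of the call (the options hunk itself is not shown in this diff):

```ts
// Sketch (assumption: mirrors the streaming array test in tests/scraper.test.ts).
const { stream } = await scraper.stream(page, schema, {
  format: 'raw_html',
  output: 'array', // stream an array of story objects instead of a single object
})
```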

+ 47 - 0
examples/toolUse.ts

@@ -0,0 +1,47 @@
+import { openai } from '@ai-sdk/openai'
+import { generateText, jsonSchema as toJSONSChema, tool } from 'ai'
+import { chromium } from 'playwright'
+import { z } from 'zod'
+import LLMScraper from './../src'
+
+const model = openai('gpt-4o-mini')
+const scraper = new LLMScraper(model)
+
+const { text } = await generateText({
+  model,
+  tools: {
+    scrapeWebsite: tool({
+      description: 'Scrape a website with a given schema and URL',
+      parameters: z.object({
+        url: z.string(),
+        jsonSchema: z.string(),
+      }),
+      execute: async ({ url, jsonSchema }) => {
+        console.log('scraping website', url)
+        console.log('with schema', jsonSchema)
+
+        // Launch a browser instance
+        const browser = await chromium.launch()
+
+        // Open new page
+        const page = await browser.newPage()
+        await page.goto(url)
+
+        // Parse jsonSchema
+        const schema = toJSONSChema(JSON.parse(jsonSchema))
+
+        // Run the scraper
+        const result = await scraper.run(page, schema)
+        await page.close()
+        await browser.close()
+
+        // Feed the result back to the model
+        return result.data
+      },
+    }),
+  },
+  maxSteps: 2,
+  prompt: 'List top stories from HackerNews frontpage and summarize them',
+})
+
+console.log(text)

File diff suppressed because it is too large
+ 693 - 471
package-lock.json


+ 13 - 13
package.json

@@ -1,12 +1,12 @@
 {
   "type": "module",
   "name": "llm-scraper",
-  "version": "1.5.1",
+  "version": "1.6.0",
   "description": "Turn any webpage intro structured data using LLMs",
   "main": "dist/index.js",
   "scripts": {
     "build": "tsc -p tsconfig.json",
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "test": "vitest run"
   },
   "repository": {
     "type": "git",
@@ -26,19 +26,19 @@
   },
   "homepage": "https://github.com/mishushakov/llm-scraper#readme",
   "dependencies": {
-    "ai": "^3.1.12",
-    "node-llama-cpp": "^2.8.9",
-    "turndown": "^7.1.3",
-    "zod-to-json-schema": "^3.22.5"
+    "@ai-sdk/provider": "^1.1.3",
+    "ai": "^4.3.15",
+    "turndown": "^7.2.0",
+    "zod-to-json-schema": "^3.24.5"
   },
   "devDependencies": {
-    "@ai-sdk/anthropic": "^0.0.30",
-    "@ai-sdk/openai": "^0.0.2",
-    "@types/node": "^20.12.7",
+    "@ai-sdk/openai": "^1.3.22",
+    "@types/node": "^22.15.18",
     "@types/react": "^18.2.79",
-    "ollama-ai-provider": "^0.10.0",
-    "playwright": "^1.43.1",
-    "typescript": "^5.4.5",
-    "zod": "^3.22.5"
+    "ollama-ai-provider": "^1.2.0",
+    "playwright": "^1.52.0",
+    "typescript": "^5.8.3",
+    "vitest": "^3.1.3",
+    "zod": "^3.24.4"
   }
 }

+ 34 - 156
src/index.ts

@@ -1,191 +1,69 @@
-import { Page } from 'playwright'
-import Turndown from 'turndown'
+import { type Page } from 'playwright'
 import { LanguageModelV1 } from '@ai-sdk/provider'
-import { LlamaModel } from 'node-llama-cpp'
 import { z } from 'zod'
+import { Schema } from 'ai'
+import { preprocess, PreProcessOptions } from './preprocess.js'
 import {
-  generateLlamaCompletions,
   generateAISDKCompletions,
   streamAISDKCompletions,
   generateAISDKCode,
 } from './models.js'
 
-import cleanup from './cleanup.js'
-
-export type ScraperLoadOptions =
-  | {
-      format?: 'html' | 'text' | 'markdown' | 'cleanup'
-    }
-  | {
-      format: 'custom'
-      formatFunction: (page: Page) => Promise<string> | string
-    }
-  | {
-      format: 'image'
-      fullPage?: boolean
-    }
-
-export type ScraperLoadResult = {
-  url: string
-  content: string
-  format: ScraperLoadOptions['format']
-}
-
 export type ScraperLLMOptions = {
   prompt?: string
   temperature?: number
   maxTokens?: number
   topP?: number
-  mode?: 'auto' | 'json' | 'tool' | 'grammar'
+  mode?: 'auto' | 'json' | 'tool'
+  output?: 'array'
 }
 
-export type ScraperRunOptions = ScraperLLMOptions & ScraperLoadOptions
+export type ScraperGenerateOptions = Omit<
+  ScraperLLMOptions,
+  'output' | 'mode'
+> & {
+  format?: 'html' | 'raw_html'
+}
+
+export type ScraperRunOptions = ScraperLLMOptions & PreProcessOptions
 
 export default class LLMScraper {
-  constructor(private client: LanguageModelV1 | LlamaModel) {
+  constructor(private client: LanguageModelV1) {
     this.client = client
   }
 
-  // Pre-process a page
-  private async preprocess(
-    page: Page,
-    options: ScraperLoadOptions = { format: 'html' }
-  ): Promise<ScraperLoadResult> {
-    const url = page.url()
-    let content
-
-    if (options.format === 'html') {
-      content = await page.content()
-    }
-
-    if (options.format === 'markdown') {
-      const body = await page.innerHTML('body')
-      content = new Turndown().turndown(body)
-    }
-
-    if (options.format === 'text') {
-      const readable = await page.evaluate(async () => {
-        const readability = await import(
-          // @ts-ignore
-          'https://cdn.skypack.dev/@mozilla/readability'
-        )
-
-        return new readability.Readability(document).parse()
-      })
-
-      content = `Page Title: ${readable.title}\n${readable.textContent}`
-    }
-
-    if (options.format === 'cleanup') {
-      await page.evaluate(cleanup)
-      content = await page.content()
-    }
-
-    if (options.format === 'image') {
-      const image = await page.screenshot({ fullPage: options.fullPage })
-      content = image.toString('base64')
-    }
-
-    if (options.format === 'custom') {
-      if (
-        !options.formatFunction ||
-        typeof options.formatFunction !== 'function'
-      ) {
-        throw new Error('customPreprocessor must be provided in custom mode')
-      }
-
-      content = await options.formatFunction(page)
-    }
-
-    return {
-      url,
-      content,
-      format: options.format,
-    }
-  }
-
-  // Generate completion using AI SDK
-  private async generateCompletions<T extends z.ZodSchema<any>>(
-    page: ScraperLoadResult,
-    schema: T,
-    options?: ScraperRunOptions
-  ) {
-    switch (this.client.constructor) {
-      default:
-        return generateAISDKCompletions<T>(
-          this.client as LanguageModelV1,
-          page,
-          schema,
-          options
-        )
-      case LlamaModel:
-        return generateLlamaCompletions<T>(this.client, page, schema, options)
-    }
-  }
-
-  // Stream completions using AI SDK
-  private async streamCompletions<T extends z.ZodSchema<any>>(
-    page: ScraperLoadResult,
-    schema: T,
-    options?: ScraperRunOptions
-  ) {
-    switch (this.client.constructor) {
-      default:
-        return streamAISDKCompletions<T>(
-          this.client as LanguageModelV1,
-          page,
-          schema,
-          options
-        )
-      case LlamaModel:
-        throw new Error('Streaming not supported with GGUF models')
-    }
-  }
-
-  private async generateCode<T extends z.ZodSchema<any>>(
-    page: ScraperLoadResult,
-    schema: T,
-    options?: ScraperLLMOptions
-  ) {
-    switch (this.client.constructor) {
-      default:
-        return generateAISDKCode<T>(
-          this.client as LanguageModelV1,
-          page,
-          schema,
-          options
-        )
-      case LlamaModel:
-        throw new Error('Code-generation not supported with GGUF models')
-    }
-  }
-
   // Pre-process the page and generate completion
-  async run<T extends z.ZodSchema<any>>(
+  async run<T>(
     page: Page,
-    schema: T,
+    schema: z.Schema<T, z.ZodTypeDef, any> | Schema<T>,
     options?: ScraperRunOptions
   ) {
-    const preprocessed = await this.preprocess(page, options)
-    return this.generateCompletions<T>(preprocessed, schema, options)
+    const preprocessed = await preprocess(page, options)
+    return generateAISDKCompletions<T>(
+      this.client,
+      preprocessed,
+      schema,
+      options
+    )
   }
 
   // Pre-process the page and stream completion
-  async stream<T extends z.ZodSchema<any>>(
+  async stream<T>(
     page: Page,
-    schema: T,
+    schema: z.Schema<T, z.ZodTypeDef, any> | Schema<T>,
     options?: ScraperRunOptions
   ) {
-    const preprocessed = await this.preprocess(page, options)
-    return this.streamCompletions<T>(preprocessed, schema, options)
+    const preprocessed = await preprocess(page, options)
+    return streamAISDKCompletions<T>(this.client, preprocessed, schema, options)
   }
 
   // Pre-process the page and generate code
-  async generate(page, schema: z.ZodSchema<any>, options?: ScraperLLMOptions) {
-    const preprocessed = await this.preprocess(page, {
-      ...options,
-      format: 'cleanup',
-    })
-    return this.generateCode(preprocessed, schema, options)
+  async generate<T>(
+    page: Page,
+    schema: z.Schema<T, z.ZodTypeDef, any> | Schema<T>,
+    options?: ScraperGenerateOptions
+  ) {
+    const preprocessed = await preprocess(page, options)
+    return generateAISDKCode<T>(this.client, preprocessed, schema, options)
   }
 }
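Since `run`, `stream`, and `generate` now accept `z.Schema<T> | Schema<T>`, a plain JSON Schema can be passed through the `jsonSchema` helper from `ai` (a minimal sketch; the test suite below does the same via `zodToJsonSchema`):

```ts
import { jsonSchema } from 'ai'

// Sketch: a hand-written JSON Schema instead of a Zod schema.
const { data } = await scraper.run(
  page,
  jsonSchema<{ h1: string }>({
    type: 'object',
    properties: { h1: { type: 'string' } },
    required: ['h1'],
  })
)
```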

+ 39 - 58
src/models.ts

@@ -1,28 +1,30 @@
 import { LanguageModelV1 } from '@ai-sdk/provider'
-import { generateObject, generateText, streamObject, UserContent } from 'ai'
-import { z } from 'zod'
-import { ScraperLoadResult, ScraperLLMOptions } from './index.js'
 import {
-  LlamaModel,
-  LlamaJsonSchemaGrammar,
-  LlamaContext,
-  LlamaChatSession,
-  GbnfJsonSchema,
-} from 'node-llama-cpp'
+  generateObject,
+  generateText,
+  streamObject,
+  UserContent,
+  Schema,
+} from 'ai'
+import { z } from 'zod'
+import { ScraperLLMOptions, ScraperGenerateOptions } from './index.js'
+import { PreProcessResult } from './preprocess.js'
 import { zodToJsonSchema } from 'zod-to-json-schema'
 
-export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
-  data: z.infer<T>
-  url: string
-}
-
 const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
 
-const defaultCodePrompt = `Provide a scraping function in JavaScript that extracts and formats data according to a schema from the current page.
-The function must be IIFE. No comments or imports. The code you generate will be executed straight away, you shouldn't output anything besides runnable code.`
+const defaultCodePrompt =
+  "Provide a scraping function in JavaScript that extracts and returns data according to a schema from the current page. The function must be IIFE. No comments or imports. No console.log. The code you generate will be executed straight away, you shouldn't output anything besides runnable code."
+
+function stripMarkdownBackticks(text: string) {
+  let trimmed = text.trim()
+  trimmed = trimmed.replace(/^```(?:javascript)?\s*/i, '')
+  trimmed = trimmed.replace(/\s*```$/i, '')
+  return trimmed
+}
 
-function prepareAISDKPage(page: ScraperLoadResult): UserContent {
+function prepareAISDKPage(page: PreProcessResult): UserContent {
   if (page.format === 'image') {
     return [
       {
@@ -35,14 +37,14 @@ function prepareAISDKPage(page: ScraperLoadResult): UserContent {
   return [{ type: 'text', text: page.content }]
 }
 
-export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
+export async function generateAISDKCompletions<T>(
   model: LanguageModelV1,
-  page: ScraperLoadResult,
-  schema: T,
+  page: PreProcessResult,
+  schema: z.Schema<T, z.ZodTypeDef, any> | Schema<T>,
   options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
-  const result = await generateObject<z.infer<T>>({
+  const result = await generateObject<T>({
     model,
     messages: [
       { role: 'system', content: options?.prompt || defaultPrompt },
@@ -53,6 +55,7 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
     maxTokens: options?.maxTokens,
     topP: options?.topP,
     mode: options?.mode,
+    output: options?.output,
   })
 
   return {
@@ -61,23 +64,25 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
   }
 }
 
-export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
+export function streamAISDKCompletions<T>(
   model: LanguageModelV1,
-  page: ScraperLoadResult,
-  schema: T,
+  page: PreProcessResult,
+  schema: z.Schema<T, z.ZodTypeDef, any> | Schema<T>,
   options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
-  const { partialObjectStream } = await streamObject<z.infer<T>>({
+  const { partialObjectStream } = streamObject<T>({
     model,
     messages: [
       { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
+    output: options?.output,
     temperature: options?.temperature,
     maxTokens: options?.maxTokens,
     topP: options?.topP,
+    mode: options?.mode,
   })
 
   return {
@@ -86,13 +91,15 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   }
 }
 
-export async function generateAISDKCode<T extends z.ZodSchema<any>>(
+export async function generateAISDKCode<T>(
   model: LanguageModelV1,
-  page: ScraperLoadResult,
-  schema: T,
-  options?: ScraperLLMOptions
+  page: PreProcessResult,
+  schema: z.Schema<T, z.ZodTypeDef, any> | Schema<T>,
+  options?: ScraperGenerateOptions
 ) {
-  const generatedSchema = zodToJsonSchema(schema)
+  const parsedSchema =
+    schema instanceof z.ZodType ? zodToJsonSchema(schema) : schema
+
   const result = await generateText({
     model,
     messages: [
@@ -100,7 +107,7 @@ export async function generateAISDKCode<T extends z.ZodSchema<any>>(
       {
         role: 'user',
         content: `Website: ${page.url}
-        Schema: ${JSON.stringify(generatedSchema)}
+        Schema: ${JSON.stringify(parsedSchema)}
         Content: ${page.content}`,
       },
     ],
@@ -110,33 +117,7 @@ export async function generateAISDKCode<T extends z.ZodSchema<any>>(
   })
 
   return {
-    code: result.text,
-    url: page.url,
-  }
-}
-
-export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
-  model: LlamaModel,
-  page: ScraperLoadResult,
-  schema: T,
-  options?: ScraperLLMOptions
-): Promise<ScraperCompletionResult<T>> {
-  const generatedSchema = zodToJsonSchema(schema) as GbnfJsonSchema
-  const grammar = new LlamaJsonSchemaGrammar(generatedSchema) as any // any, because it has type inference going wild
-  const context = new LlamaContext({ model })
-  const session = new LlamaChatSession({ context })
-  const pagePrompt = `${options?.prompt || defaultPrompt}\n${page.content}`
-
-  const result = await session.prompt(pagePrompt, {
-    grammar,
-    temperature: options?.temperature,
-    maxTokens: options?.maxTokens,
-    topP: options?.topP,
-  })
-
-  const parsed = grammar.parse(result)
-  return {
-    data: parsed,
+    code: stripMarkdownBackticks(result.text),
     url: page.url,
   }
 }
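`stripMarkdownBackticks` covers models that fence their output despite the "runnable code only" prompt; a sketch of its effect (the helper is module-private, so this is illustrative only):

````ts
// Sketch: fenced model output is reduced to the runnable IIFE.
stripMarkdownBackticks('```javascript\n(() => 42)()\n```')
// => '(() => 42)()'
````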

+ 80 - 0
src/preprocess.ts

@@ -0,0 +1,80 @@
+import { type Page } from 'playwright'
+import Turndown from 'turndown'
+
+import cleanup from './cleanup.js'
+
+export type PreProcessOptions =
+  | {
+      format?: 'html' | 'text' | 'markdown' | 'raw_html'
+    }
+  | {
+      format: 'custom'
+      formatFunction: (page: Page) => Promise<string> | string
+    }
+  | {
+      format: 'image'
+      fullPage?: boolean
+    }
+
+export type PreProcessResult = {
+  url: string
+  content: string
+  format: PreProcessOptions['format']
+}
+
+export async function preprocess(
+  page: Page,
+  options: PreProcessOptions = { format: 'html' }
+): Promise<PreProcessResult> {
+  const url = page.url()
+  let content
+
+  if (options.format === 'raw_html') {
+    content = await page.content()
+  }
+
+  if (options.format === 'markdown') {
+    const body = await page.innerHTML('body')
+    content = new Turndown().turndown(body)
+  }
+
+  if (options.format === 'text') {
+    const readable = await page.evaluate(async () => {
+      const readability = await import(
+        // @ts-ignore
+        'https://cdn.skypack.dev/@mozilla/readability'
+      )
+
+      return new readability.Readability(document).parse()
+    })
+
+    content = `Page Title: ${readable.title}\n${readable.textContent}`
+  }
+
+  if (options.format === 'html') {
+    await page.evaluate(cleanup)
+    content = await page.content()
+  }
+
+  if (options.format === 'image') {
+    const image = await page.screenshot({ fullPage: options.fullPage })
+    content = image.toString('base64')
+  }
+
+  if (options.format === 'custom') {
+    if (
+      !options.formatFunction ||
+      typeof options.formatFunction !== 'function'
+    ) {
+      throw new Error('customPreprocessor must be provided in custom mode')
+    }
+
+    content = await options.formatFunction(page)
+  }
+
+  return {
+    url,
+    content,
+    format: options.format,
+  }
+}
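The `custom` mode delegates serialization entirely to a caller-supplied function; a minimal sketch, where the `#main` selector is a hypothetical example:

```ts
// Sketch: feed the LLM only the part of the page that matters.
const { data } = await scraper.run(page, schema, {
  format: 'custom',
  formatFunction: (page) => page.locator('#main').innerHTML(),
})
```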

+ 38 - 0
tests/index.ts

@@ -0,0 +1,38 @@
+import { test as baseTest, expect, afterAll } from 'vitest'
+import LLMScraper from '../src'
+import { openai } from '@ai-sdk/openai'
+import { chromium, Browser } from 'playwright'
+
+let browser: Browser | null = null
+
+async function getBrowser() {
+  if (!browser) {
+    browser = await chromium.launch()
+  }
+  return browser
+}
+
+afterAll(async () => {
+  if (browser) {
+    await browser.close()
+    browser = null
+  }
+})
+
+export const test = baseTest.extend<{
+  page: Awaited<ReturnType<Browser['newPage']>>
+  scraper: LLMScraper
+}>({
+  page: async ({}, use) => {
+    const browser = await getBrowser()
+    const page = await browser.newPage()
+    await use(page)
+    await page.close()
+  },
+  scraper: async ({}, use) => {
+    const scraper = new LLMScraper(openai('gpt-4o-mini'))
+    await use(scraper)
+  },
+})
+
+export { expect }

+ 123 - 0
tests/scraper.test.ts

@@ -0,0 +1,123 @@
+import { z } from 'zod'
+import { test, expect } from './index'
+import { jsonSchema } from 'ai'
+import { zodToJsonSchema } from 'zod-to-json-schema'
+
+const storySchema = z.object({
+  title: z.string(),
+  points: z.number(),
+  by: z.string(),
+  commentsURL: z.string(),
+})
+
+const schema = z.object({
+  top: z.array(storySchema).length(5).describe('Top 5 stories on Hacker News'),
+})
+
+test('scrapes top 5 stories from Hacker News', async ({ page, scraper }) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const { data } = await scraper.run(page, schema)
+
+  expect(schema.safeParse(data).success).toBe(true)
+})
+
+test('scrapes top 5 stories from Hacker News (image format)', async ({
+  page,
+  scraper,
+}) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const { data } = await scraper.run(page, schema, {
+    format: 'image',
+  })
+
+  expect(schema.safeParse(data).success).toBe(true)
+})
+
+test('scrapes top 5 stories from Hacker News (markdown format)', async ({
+  page,
+  scraper,
+}) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const { data } = await scraper.run(page, schema, {
+    format: 'markdown',
+  })
+
+  expect(schema.safeParse(data).success).toBe(true)
+})
+
+test('scrapes top 5 stories from Hacker News (raw html)', async ({
+  page,
+  scraper,
+}) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const { data } = await scraper.run(page, schema, {
+    format: 'raw_html',
+  })
+
+  expect(schema.safeParse(data).success).toBe(true)
+})
+
+test('scrapes top 5 stories from Hacker News (code generation)', async ({
+  page,
+  scraper,
+}) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const { code } = await scraper.generate(page, schema)
+  const result: z.infer<typeof schema> = await page.evaluate(code)
+
+  expect(schema.safeParse(result).success).toBe(true)
+})
+
+test('scrapes top 5 stories from Hacker News (json schema)', async ({
+  page,
+  scraper,
+}) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const m = jsonSchema<{ top: { title: string }[] }>(zodToJsonSchema(schema))
+  const { data } = await scraper.run(page, m)
+
+  expect(schema.safeParse(data).success).toBe(true)
+})
+
+test('scrapes example.com (streaming)', async ({ page, scraper }) => {
+  await page.goto('https://example.com')
+
+  const { stream } = await scraper.stream(
+    page,
+    z.object({
+      h1: z.string().describe('The main heading of the page'),
+    })
+  )
+
+  let text = ''
+  for await (const item of stream) {
+    text = item.h1 || ''
+  }
+
+  expect(text).toBe('Example Domain')
+})
+
+test('scrapes top stories from Hacker News (streaming, array)', async ({
+  page,
+  scraper,
+}) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const { stream } = await scraper.stream(page, storySchema, {
+    format: 'raw_html',
+    output: 'array',
+  })
+
+  let last: Partial<z.infer<typeof storySchema>>[] = []
+  for await (const item of stream) {
+    last = item as typeof last
+  }
+
+  expect(last).toHaveLength(30)
+})

+ 8 - 0
vitest.config.ts

@@ -0,0 +1,8 @@
+import { defineConfig } from 'vitest/config'
+
+export default defineConfig({
+  test: {
+    include: ['tests/**/*.test.ts'],
+    testTimeout: 30000,
+  },
+})

Some files were not shown because too many files changed in this diff