
Merge pull request #32 from mishushakov/codegen

Codegen
Mish Ushakov, 1 year ago
parent / commit 64ef126ab1
8 files changed, 204 insertions(+), 38 deletions(-)
  1. README.md (+20 -7)
  2. examples/codegen.ts (+41 -0)
  3. examples/streaming.ts (+1 -1)
  4. package-lock.json (+57 -5)
  5. package.json (+2 -1)
  6. src/cleanup.ts (+0 -1)
  7. src/index.ts (+34 -6)
  8. src/models.ts (+49 -17)

+ 20 - 7
README.md

@@ -2,7 +2,7 @@
 
 <img width="1800" alt="Screenshot 2024-04-20 at 23 11 16" src="https://github.com/mishushakov/llm-scraper/assets/10400064/ab00e048-a9ff-43b6-81d5-2e58090e2e65">
 
-LLM Scraper is a TypeScript library that allows you to convert **any** webpages into structured data using LLMs.
+LLM Scraper is a TypeScript library that allows you to extract structured data from **any** webpage using LLMs.
 
 > [!TIP]
 > Under the hood, it uses function calling to convert pages to structured data. You can find more about this approach [here](https://til.simonwillison.net/gpt3/openai-python-functions-data-extraction)
@@ -14,7 +14,8 @@ LLM Scraper is a TypeScript library that allows you to convert **any** webpages
 - Full type-safety with TypeScript
 - Based on Playwright framework
 - Streaming objects
-- Supports 4 input modes:
+- **NEW** Code-generation
+- Supports 4 formatting modes:
   - `html` for loading raw HTML
   - `markdown` for loading markdown
   - `text` for loading extracted text (using [Readability.js](https://github.com/mozilla/readability))
@@ -137,15 +138,13 @@ await page.close()
 await browser.close()
 ```
 
-### Streaming
+## Streaming
 
 Replace your `run` function with `stream` to get a partial object stream (Vercel AI SDK only).
 
 ```ts
-// Run the scraper
-const { stream } = await scraper.stream(page, schema, {
-  format: 'html',
-})
+// Run the scraper in streaming mode
+const { stream } = await scraper.stream(page, schema)
 
 // Stream the result from LLM
 for await (const data of stream) {
@@ -153,6 +152,20 @@ for await (const data of stream) {
 }
 ```
 
+## NEW: Code-generation
+
+Using the `generate` function, you can generate a reusable Playwright script that scrapes the page contents according to a schema.
+
+```ts
+// Generate code and run it on the page
+const { code } = await scraper.generate(page, schema)
+const result = await page.evaluate(code)
+const data = schema.parse(result)
+
+// Show the parsed result
+console.log(data.news)
+```
+
 ## Contributing
 
 As an open-source project, we welcome contributions from the community. If you are experiencing any bugs or want to add some improvements, please feel free to open an issue or pull request.
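The code-generation flow runs model-written code and only then validates the result with `schema.parse`. A dependency-free sketch of why that validation step matters, using a hand-rolled check as a stand-in for zod's `parse` (the shape mirrors the news schema used in this PR's examples):

```typescript
type NewsItem = { title: string; description: string; url: string }

// Minimal stand-in for schema.parse: throws if the evaluated result
// doesn't match the expected { news: NewsItem[] } shape.
function parseNews(result: unknown): { news: NewsItem[] } {
  if (typeof result !== 'object' || result === null) throw new Error('not an object')
  const news = (result as { news?: unknown }).news
  if (!Array.isArray(news)) throw new Error('news must be an array')
  for (const item of news) {
    for (const key of ['title', 'description', 'url'] as const) {
      if (typeof (item as Record<string, unknown>)[key] !== 'string') {
        throw new Error(`news item missing string field: ${key}`)
      }
    }
  }
  return result as { news: NewsItem[] }
}

// A well-formed result passes; a malformed one throws.
const ok = parseNews({ news: [{ title: 't', description: 'd', url: 'u' }] })
console.log(ok.news.length) // 1
```

Because the generated script is untrusted model output, parsing before use is what keeps the TypeScript types honest at runtime.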

+ 41 - 0
examples/codegen.ts

@@ -0,0 +1,41 @@
+import { chromium } from 'playwright'
+import { z } from 'zod'
+import { anthropic } from '@ai-sdk/anthropic'
+import LLMScraper from './../src'
+
+// Launch a browser instance
+const browser = await chromium.launch()
+
+// Initialize LLM provider
+const llm = anthropic('claude-3-5-sonnet-20240620')
+
+// Create a new LLMScraper
+const scraper = new LLMScraper(llm)
+
+// Open new page
+const page = await browser.newPage()
+await page.goto('https://www.bbc.com')
+
+// Define schema to extract contents into
+const schema = z.object({
+  news: z.array(
+    z.object({
+      title: z.string(),
+      description: z.string(),
+      url: z.string(),
+    })
+  ),
+})
+
+// Generate code and run it on the page
+const { code } = await scraper.generate(page, schema)
+console.log('code', code)
+
+const result = await page.evaluate(code)
+const data = schema.parse(result)
+
+// Show the parsed result
+console.log('result', data)
+
+await page.close()
+await browser.close()
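For illustration, here is a hypothetical string of the kind `scraper.generate` might return for the schema above, executed with `eval` as a Node-side stand-in for `page.evaluate` (actual model output will differ; this is not real library output):

```typescript
// Hypothetical model output: a self-calling extract function matching the
// { news: [...] } schema. Illustrative only.
const code = `
  const extract = () => ({
    news: [{ title: 'Example headline', description: 'Demo item', url: '/news/1' }],
  })
  extract()
`

// page.evaluate(code) would run this string in the browser; eval is a local
// stand-in that likewise returns the completion value of the script.
const result = eval(code)
console.log(result.news[0].title) // 'Example headline'
```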

+ 1 - 1
examples/streaming.ts

@@ -31,7 +31,7 @@ const schema = z.object({
     .describe('Top 5 stories on Hacker News'),
 })
 
-// Run the scraper
+// Run the scraper in streaming mode
 const { stream } = await scraper.stream(page, schema, {
   format: 'html',
 })

+ 57 - 5
package-lock.json

@@ -1,12 +1,12 @@
 {
   "name": "llm-scraper",
-  "version": "1.2.0",
+  "version": "1.2.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "llm-scraper",
-      "version": "1.2.0",
+      "version": "1.2.2",
       "license": "MIT",
       "dependencies": {
         "ai": "^3.1.12",
@@ -15,6 +15,7 @@
         "zod-to-json-schema": "^3.22.5"
       },
       "devDependencies": {
+        "@ai-sdk/anthropic": "^0.0.30",
         "@ai-sdk/openai": "^0.0.2",
         "@types/node": "^20.12.7",
         "@types/react": "^18.2.79",
@@ -24,6 +25,57 @@
         "zod": "^3.22.5"
       }
     },
+    "node_modules/@ai-sdk/anthropic": {
+      "version": "0.0.30",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-0.0.30.tgz",
+      "integrity": "sha512-iPJjKtIH8yk2cf5BNXLN6sn6TTghOh8puWothX4pPVBM/OKC4RWVjYTEELwUv2VDPIw918KBg2j/T0RfTgu+bw==",
+      "dev": true,
+      "dependencies": {
+        "@ai-sdk/provider": "0.0.12",
+        "@ai-sdk/provider-utils": "1.0.2"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      }
+    },
+    "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider": {
+      "version": "0.0.12",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-0.0.12.tgz",
+      "integrity": "sha512-oOwPQD8i2Ynpn22cur4sk26FW3mSy6t6/X/K1Ay2yGBKYiSpRyLfObhOrZEGsXDx+3euKy4nEZ193R36NM+tpQ==",
+      "dev": true,
+      "dependencies": {
+        "json-schema": "0.4.0"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider-utils": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-1.0.2.tgz",
+      "integrity": "sha512-57f6O4OFVNEpI8Z8o+K40tIB3YQiTw+VCql/qrAO9Utq7Ti1o6+X9tvm177DlZJL7ft0Rwzvgy48S9YhrEKgmA==",
+      "dev": true,
+      "dependencies": {
+        "@ai-sdk/provider": "0.0.12",
+        "eventsource-parser": "1.1.2",
+        "nanoid": "3.3.6",
+        "secure-json-parse": "2.7.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      },
+      "peerDependenciesMeta": {
+        "zod": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/@ai-sdk/openai": {
       "version": "0.0.2",
       "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-0.0.2.tgz",
@@ -610,9 +662,9 @@
       "dev": true
     },
     "node_modules/@types/react": {
-      "version": "18.3.2",
-      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.2.tgz",
-      "integrity": "sha512-Btgg89dAnqD4vV7R3hlwOxgqobUQKgx3MmrQRi0yYbs/P0ym8XozIAlkqVilPqHQwXs4e9Tf63rrCgl58BcO4w==",
+      "version": "18.3.3",
+      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.3.tgz",
+      "integrity": "sha512-hti/R0pS0q1/xx+TsI73XIqk26eBsISZ2R0wUijXIngRK9R/e7Xw/cXVxQK7R5JjW+SV4zGcn5hXjudkN/pLIw==",
       "dev": true,
       "dependencies": {
         "@types/prop-types": "*",

+ 2 - 1
package.json

@@ -1,7 +1,7 @@
 {
   "type": "module",
   "name": "llm-scraper",
-  "version": "1.2.2",
+  "version": "1.5.0",
   "description": "Turn any webpage intro structured data using LLMs",
   "main": "dist/index.js",
   "scripts": {
@@ -32,6 +32,7 @@
     "zod-to-json-schema": "^3.22.5"
   },
   "devDependencies": {
+    "@ai-sdk/anthropic": "^0.0.30",
     "@ai-sdk/openai": "^0.0.2",
     "@types/node": "^20.12.7",
     "@types/react": "^18.2.79",

+ 0 - 1
src/cleanup.ts

@@ -35,7 +35,6 @@ export default function cleanup() {
   const attributesToRemove = [
     'style',
     'src',
-    'href',
     'alt',
     'title',
     'role',
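The `cleanup` change stops stripping `href`, presumably so link URLs survive into the cleaned markup that the code generator sees. A minimal DOM-free sketch of the attribute-stripping idea (the helper name and plain-object input are illustrative, not the library's actual code):

```typescript
// Attributes the cleanup pass removes; 'href' is no longer listed, so links survive.
const attributesToRemove = ['style', 'src', 'alt', 'title', 'role']

// Strip the listed attributes from a plain attribute map (stand-in for a DOM element).
function stripAttributes(attrs: Record<string, string>): Record<string, string> {
  return Object.fromEntries(
    Object.entries(attrs).filter(([name]) => !attributesToRemove.includes(name))
  )
}

const cleaned = stripAttributes({
  href: 'https://www.bbc.com/news/article',
  style: 'color: red',
  title: 'Article',
})
console.log(cleaned) // href survives; style and title are removed
```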

+ 34 - 6
src/index.ts

@@ -7,6 +7,7 @@ import {
   generateLlamaCompletions,
   generateAISDKCompletions,
   streamAISDKCompletions,
+  generateAISDKCode,
 } from './models.js'
 
 import cleanup from './cleanup.js'
@@ -107,7 +108,7 @@ export default class LLMScraper {
   private async generateCompletions<T extends z.ZodSchema<any>>(
     page: ScraperLoadResult,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     switch (this.client.constructor) {
       default:
@@ -126,7 +127,7 @@ export default class LLMScraper {
   private async streamCompletions<T extends z.ZodSchema<any>>(
     page: ScraperLoadResult,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     switch (this.client.constructor) {
       default:
@@ -137,7 +138,25 @@ export default class LLMScraper {
           options
         )
       case LlamaModel:
-        throw new Error('Streaming not supported for local models yet')
+        throw new Error('Streaming not supported with GGUF models')
+    }
+  }
+
+  private async generateCode<T extends z.ZodSchema<any>>(
+    page: ScraperLoadResult,
+    schema: T,
+    options?: ScraperLLMOptions
+  ) {
+    switch (this.client.constructor) {
+      default:
+        return generateAISDKCode<T>(
+          this.client as LanguageModelV1,
+          page,
+          schema,
+          options
+        )
+      case LlamaModel:
+        throw new Error('Code-generation not supported with GGUF models')
     }
   }
 
@@ -145,19 +164,28 @@ export default class LLMScraper {
   async run<T extends z.ZodSchema<any>>(
     page: Page,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     const preprocessed = await this.preprocess(page, options)
     return this.generateCompletions<T>(preprocessed, schema, options)
   }
 
-  // Pre-process the page and generate completion
+  // Pre-process the page and stream completion
   async stream<T extends z.ZodSchema<any>>(
     page: Page,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     const preprocessed = await this.preprocess(page, options)
     return this.streamCompletions<T>(preprocessed, schema, options)
   }
+
+  // Pre-process the page and generate code
+  async generate(page, schema: z.ZodSchema<any>, options?: ScraperLLMOptions) {
+    const preprocessed = await this.preprocess(page, {
+      ...options,
+      format: 'cleanup',
+    })
+    return this.generateCode(preprocessed, schema, options)
+  }
 }
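The dispatch added in `generateCode` (like its siblings) lists `default:` before `case LlamaModel:`. That ordering is legal in JavaScript/TypeScript: case clauses are tested first, and `default` runs only when none match, regardless of where it appears. A standalone sketch of the pattern (class names are illustrative stand-ins):

```typescript
class LlamaModel {}
class OpenAIChatLanguageModel {} // stand-in for an AI SDK model class

// Route by the client's constructor, with default listed before the case.
function route(client: object): string {
  switch (client.constructor) {
    default:
      return 'ai-sdk' // reached only when no case matches, despite coming first
    case LlamaModel:
      return 'gguf'
  }
}

console.log(route(new LlamaModel())) // 'gguf'
console.log(route(new OpenAIChatLanguageModel())) // 'ai-sdk'
```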

+ 49 - 17
src/models.ts

@@ -1,5 +1,5 @@
 import { LanguageModelV1 } from '@ai-sdk/provider'
-import { generateObject, streamObject, UserContent } from 'ai'
+import { generateObject, generateText, streamObject, UserContent } from 'ai'
 import { z } from 'zod'
 import { ScraperLoadResult, ScraperLLMOptions } from './index.js'
 import {
@@ -19,6 +19,9 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
 
+const defaultCodePrompt = `Provide a scraping function (extract) in JavaScript that extracts and formats data according to a schema from the current page.
+Use const syntax. Call the function. No comments or imports. The code you generate will be executed straight away, you shouldn't output anything besides runnable code.`
+
 function prepareAISDKPage(page: ScraperLoadResult): UserContent {
   if (page.format === 'image') {
     return [
@@ -36,20 +39,20 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
   model: LanguageModelV1,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
   const result = await generateObject<z.infer<T>>({
     model,
     messages: [
-      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
-    mode: options.mode,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
+    mode: options?.mode,
   })
 
   return {
@@ -62,19 +65,19 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   model: LanguageModelV1,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
   const { partialObjectStream } = await streamObject<z.infer<T>>({
     model,
     messages: [
-      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
   })
 
   return {
@@ -83,23 +86,52 @@
   }
 }
 
+export async function generateAISDKCode<T extends z.ZodSchema<any>>(
+  model: LanguageModelV1,
+  page: ScraperLoadResult,
+  schema: T,
+  options?: ScraperLLMOptions
+) {
+  const generatedSchema = zodToJsonSchema(schema)
+  const result = await generateText({
+    model,
+    messages: [
+      { role: 'system', content: options?.prompt || defaultCodePrompt },
+      {
+        role: 'user',
+        content: `Website: ${page.url}
+        Schema: ${JSON.stringify(generatedSchema)}
+        Content: ${page.content}`,
+      },
+    ],
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
+  })
+
+  return {
+    code: result.text,
+    url: page.url,
+  }
+}
+
 export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
   model: LlamaModel,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ): Promise<ScraperCompletionResult<T>> {
   const generatedSchema = zodToJsonSchema(schema) as GbnfJsonSchema
   const grammar = new LlamaJsonSchemaGrammar(generatedSchema) as any // any, because it has type inference going wild
   const context = new LlamaContext({ model })
   const session = new LlamaChatSession({ context })
-  const pagePrompt = `${options.prompt || defaultPrompt}\n${page.content}`
+  const pagePrompt = `${options?.prompt || defaultPrompt}\n${page.content}`
 
   const result = await session.prompt(pagePrompt, {
     grammar,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
   })
 
   const parsed = grammar.parse(result)