Browse Source

Merge pull request #32 from mishushakov/codegen

Codegen
Mish Ushakov 1 year ago
parent
commit
64ef126ab1
8 changed files with 204 additions and 38 deletions
  1. 20 7
      README.md
  2. 41 0
      examples/codegen.ts
  3. 1 1
      examples/streaming.ts
  4. 57 5
      package-lock.json
  5. 2 1
      package.json
  6. 0 1
      src/cleanup.ts
  7. 34 6
      src/index.ts
  8. 49 17
      src/models.ts

+ 20 - 7
README.md

@@ -2,7 +2,7 @@
 
 <img width="1800" alt="Screenshot 2024-04-20 at 23 11 16" src="https://github.com/mishushakov/llm-scraper/assets/10400064/ab00e048-a9ff-43b6-81d5-2e58090e2e65">
 
-LLM Scraper is a TypeScript library that allows you to convert **any** webpages into structured data using LLMs.
+LLM Scraper is a TypeScript library that allows you to extract structured data from **any** webpage using LLMs.
 
 > [!TIP]
 > Under the hood, it uses function calling to convert pages to structured data. You can find more about this approach [here](https://til.simonwillison.net/gpt3/openai-python-functions-data-extraction)
@@ -14,7 +14,8 @@ LLM Scraper is a TypeScript library that allows you to convert **any** webpages
 - Full type-safety with TypeScript
 - Based on Playwright framework
 - Streaming objects
-- Supports 4 input modes:
+- **NEW** Code-generation
+- Supports 4 formatting modes:
   - `html` for loading raw HTML
   - `markdown` for loading markdown
   - `text` for loading extracted text (using [Readability.js](https://github.com/mozilla/readability))
@@ -137,15 +138,13 @@ await page.close()
 await browser.close()
 ```
 
-### Streaming
+## Streaming
 
 Replace your `run` function with `stream` to get a partial object stream (Vercel AI SDK only).
 
 ```ts
-// Run the scraper
-const { stream } = await scraper.stream(page, schema, {
-  format: 'html',
-})
+// Run the scraper in streaming mode
+const { stream } = await scraper.stream(page, schema)
 
 // Stream the result from LLM
 for await (const data of stream) {
@@ -153,6 +152,20 @@ for await (const data of stream) {
 }
 ```
 
+## NEW: Code-generation
+
+Using the `generate` function, you can generate a re-usable Playwright script that scrapes the contents according to a schema.
+
+```ts
+// Generate code and run it on the page
+const { code } = await scraper.generate(page, schema)
+const result = await page.evaluate(code)
+const data = schema.parse(result)
+
+// Show the parsed result
+console.log(data.news)
+```
+
 ## Contributing
 
 As an open-source project, we welcome contributions from the community. If you are experiencing any bugs or want to add some improvements, please feel free to open an issue or pull request.

+ 41 - 0
examples/codegen.ts

@@ -0,0 +1,41 @@
+import { chromium } from 'playwright'
+import { z } from 'zod'
+import { anthropic } from '@ai-sdk/anthropic'
+import LLMScraper from './../src'
+
+// Launch a browser instance
+const browser = await chromium.launch()
+
+// Initialize LLM provider
+const llm = anthropic('claude-3-5-sonnet-20240620')
+
+// Create a new LLMScraper
+const scraper = new LLMScraper(llm)
+
+// Open new page
+const page = await browser.newPage()
+await page.goto('https://www.bbc.com')
+
+// Define schema to extract contents into
+const schema = z.object({
+  news: z.array(
+    z.object({
+      title: z.string(),
+      description: z.string(),
+      url: z.string(),
+    })
+  ),
+})
+
+// Generate code and run it on the page
+const { code } = await scraper.generate(page, schema)
+console.log('code', code)
+
+const result = await page.evaluate(code)
+const data = schema.parse(result)
+
+// Show the parsed result
+console.log('result', data)
+
+await page.close()
+await browser.close()

+ 1 - 1
examples/streaming.ts

@@ -31,7 +31,7 @@ const schema = z.object({
     .describe('Top 5 stories on Hacker News'),
 })
 
-// Run the scraper
+// Run the scraper in streaming mode
 const { stream } = await scraper.stream(page, schema, {
   format: 'html',
 })

+ 57 - 5
package-lock.json

@@ -1,12 +1,12 @@
 {
   "name": "llm-scraper",
-  "version": "1.2.0",
+  "version": "1.2.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "llm-scraper",
-      "version": "1.2.0",
+      "version": "1.2.2",
       "license": "MIT",
       "dependencies": {
         "ai": "^3.1.12",
@@ -15,6 +15,7 @@
         "zod-to-json-schema": "^3.22.5"
       },
       "devDependencies": {
+        "@ai-sdk/anthropic": "^0.0.30",
         "@ai-sdk/openai": "^0.0.2",
         "@types/node": "^20.12.7",
         "@types/react": "^18.2.79",
@@ -24,6 +25,57 @@
         "zod": "^3.22.5"
       }
     },
+    "node_modules/@ai-sdk/anthropic": {
+      "version": "0.0.30",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-0.0.30.tgz",
+      "integrity": "sha512-iPJjKtIH8yk2cf5BNXLN6sn6TTghOh8puWothX4pPVBM/OKC4RWVjYTEELwUv2VDPIw918KBg2j/T0RfTgu+bw==",
+      "dev": true,
+      "dependencies": {
+        "@ai-sdk/provider": "0.0.12",
+        "@ai-sdk/provider-utils": "1.0.2"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      }
+    },
+    "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider": {
+      "version": "0.0.12",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-0.0.12.tgz",
+      "integrity": "sha512-oOwPQD8i2Ynpn22cur4sk26FW3mSy6t6/X/K1Ay2yGBKYiSpRyLfObhOrZEGsXDx+3euKy4nEZ193R36NM+tpQ==",
+      "dev": true,
+      "dependencies": {
+        "json-schema": "0.4.0"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider-utils": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-1.0.2.tgz",
+      "integrity": "sha512-57f6O4OFVNEpI8Z8o+K40tIB3YQiTw+VCql/qrAO9Utq7Ti1o6+X9tvm177DlZJL7ft0Rwzvgy48S9YhrEKgmA==",
+      "dev": true,
+      "dependencies": {
+        "@ai-sdk/provider": "0.0.12",
+        "eventsource-parser": "1.1.2",
+        "nanoid": "3.3.6",
+        "secure-json-parse": "2.7.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      },
+      "peerDependenciesMeta": {
+        "zod": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/@ai-sdk/openai": {
       "version": "0.0.2",
       "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-0.0.2.tgz",
@@ -610,9 +662,9 @@
       "dev": true
     },
     "node_modules/@types/react": {
-      "version": "18.3.2",
-      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.2.tgz",
-      "integrity": "sha512-Btgg89dAnqD4vV7R3hlwOxgqobUQKgx3MmrQRi0yYbs/P0ym8XozIAlkqVilPqHQwXs4e9Tf63rrCgl58BcO4w==",
+      "version": "18.3.3",
+      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.3.tgz",
+      "integrity": "sha512-hti/R0pS0q1/xx+TsI73XIqk26eBsISZ2R0wUijXIngRK9R/e7Xw/cXVxQK7R5JjW+SV4zGcn5hXjudkN/pLIw==",
       "dev": true,
       "dependencies": {
         "@types/prop-types": "*",

+ 2 - 1
package.json

@@ -1,7 +1,7 @@
 {
   "type": "module",
   "name": "llm-scraper",
-  "version": "1.2.2",
+  "version": "1.5.0",
  "description": "Turn any webpage into structured data using LLMs",
   "main": "dist/index.js",
   "scripts": {
@@ -32,6 +32,7 @@
     "zod-to-json-schema": "^3.22.5"
   },
   "devDependencies": {
+    "@ai-sdk/anthropic": "^0.0.30",
     "@ai-sdk/openai": "^0.0.2",
     "@types/node": "^20.12.7",
     "@types/react": "^18.2.79",

+ 0 - 1
src/cleanup.ts

@@ -35,7 +35,6 @@ export default function cleanup() {
   const attributesToRemove = [
     'style',
     'src',
-    'href',
     'alt',
     'title',
     'role',

+ 34 - 6
src/index.ts

@@ -7,6 +7,7 @@ import {
   generateLlamaCompletions,
   generateAISDKCompletions,
   streamAISDKCompletions,
+  generateAISDKCode,
 } from './models.js'
 
 import cleanup from './cleanup.js'
@@ -107,7 +108,7 @@ export default class LLMScraper {
   private async generateCompletions<T extends z.ZodSchema<any>>(
     page: ScraperLoadResult,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     switch (this.client.constructor) {
       default:
@@ -126,7 +127,7 @@ export default class LLMScraper {
   private async streamCompletions<T extends z.ZodSchema<any>>(
     page: ScraperLoadResult,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     switch (this.client.constructor) {
       default:
@@ -137,7 +138,25 @@ export default class LLMScraper {
           options
         )
       case LlamaModel:
-        throw new Error('Streaming not supported for local models yet')
+        throw new Error('Streaming not supported with GGUF models')
+    }
+  }
+
+  private async generateCode<T extends z.ZodSchema<any>>(
+    page: ScraperLoadResult,
+    schema: T,
+    options?: ScraperLLMOptions
+  ) {
+    switch (this.client.constructor) {
+      default:
+        return generateAISDKCode<T>(
+          this.client as LanguageModelV1,
+          page,
+          schema,
+          options
+        )
+      case LlamaModel:
+        throw new Error('Code-generation not supported with GGUF models')
     }
   }
 
@@ -145,19 +164,28 @@ export default class LLMScraper {
   async run<T extends z.ZodSchema<any>>(
     page: Page,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     const preprocessed = await this.preprocess(page, options)
     return this.generateCompletions<T>(preprocessed, schema, options)
   }
 
-  // Pre-process the page and generate completion
+  // Pre-process the page and stream completion
   async stream<T extends z.ZodSchema<any>>(
     page: Page,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     const preprocessed = await this.preprocess(page, options)
     return this.streamCompletions<T>(preprocessed, schema, options)
   }
+
+  // Pre-process the page and generate code
+  async generate(page, schema: z.ZodSchema<any>, options?: ScraperLLMOptions) {
+    const preprocessed = await this.preprocess(page, {
+      ...options,
+      format: 'cleanup',
+    })
+    return this.generateCode(preprocessed, schema, options)
+  }
 }

+ 49 - 17
src/models.ts

@@ -1,5 +1,5 @@
 import { LanguageModelV1 } from '@ai-sdk/provider'
-import { generateObject, streamObject, UserContent } from 'ai'
+import { generateObject, generateText, streamObject, UserContent } from 'ai'
 import { z } from 'zod'
 import { ScraperLoadResult, ScraperLLMOptions } from './index.js'
 import {
@@ -19,6 +19,9 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
 
+const defaultCodePrompt = `Provide a scraping function (extract) in JavaScript that extracts and formats data according to a schema from the current page.
+Use const syntax. Call the function. No comments or imports. The code you generate will be executed straight away, you shouldn't output anything besides runnable code.`
+
 function prepareAISDKPage(page: ScraperLoadResult): UserContent {
   if (page.format === 'image') {
     return [
@@ -36,20 +39,20 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
   model: LanguageModelV1,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
   const result = await generateObject<z.infer<T>>({
     model,
     messages: [
-      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
-    mode: options.mode,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
+    mode: options?.mode,
   })
 
   return {
@@ -62,19 +65,19 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   model: LanguageModelV1,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
   const { partialObjectStream } = await streamObject<z.infer<T>>({
     model,
     messages: [
-      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
   })
 
   return {
@@ -83,23 +86,52 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   }
 }
 
+export async function generateAISDKCode<T extends z.ZodSchema<any>>(
+  model: LanguageModelV1,
+  page: ScraperLoadResult,
+  schema: T,
+  options?: ScraperLLMOptions
+) {
+  const generatedSchema = zodToJsonSchema(schema)
+  const result = await generateText({
+    model,
+    messages: [
+      { role: 'system', content: options?.prompt || defaultCodePrompt },
+      {
+        role: 'user',
+        content: `Website: ${page.url}
+        Schema: ${JSON.stringify(generatedSchema)}
+        Content: ${page.content}`,
+      },
+    ],
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
+  })
+
+  return {
+    code: result.text,
+    url: page.url,
+  }
+}
+
 export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
   model: LlamaModel,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ): Promise<ScraperCompletionResult<T>> {
   const generatedSchema = zodToJsonSchema(schema) as GbnfJsonSchema
   const grammar = new LlamaJsonSchemaGrammar(generatedSchema) as any // any, because it has type inference going wild
   const context = new LlamaContext({ model })
   const session = new LlamaChatSession({ context })
-  const pagePrompt = `${options.prompt || defaultPrompt}\n${page.content}`
+  const pagePrompt = `${options?.prompt || defaultPrompt}\n${page.content}`
 
   const result = await session.prompt(pagePrompt, {
     grammar,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
   })
 
   const parsed = grammar.parse(result)