Browse Source

Merge pull request #32 from mishushakov/codegen

Codegen
Mish Ushakov 1 year ago
parent
commit
64ef126ab1
8 changed files with 204 additions and 38 deletions
  1. 20 7
      README.md
  2. 41 0
      examples/codegen.ts
  3. 1 1
      examples/streaming.ts
  4. 57 5
      package-lock.json
  5. 2 1
      package.json
  6. 0 1
      src/cleanup.ts
  7. 34 6
      src/index.ts
  8. 49 17
      src/models.ts

+ 20 - 7
README.md

@@ -2,7 +2,7 @@
 
 <img width="1800" alt="Screenshot 2024-04-20 at 23 11 16" src="https://github.com/mishushakov/llm-scraper/assets/10400064/ab00e048-a9ff-43b6-81d5-2e58090e2e65">
 
-LLM Scraper is a TypeScript library that allows you to convert **any** webpages into structured data using LLMs.
+LLM Scraper is a TypeScript library that allows you to extract structured data from **any** webpage using LLMs.
 
 > [!TIP]
 > Under the hood, it uses function calling to convert pages to structured data. You can find more about this approach [here](https://til.simonwillison.net/gpt3/openai-python-functions-data-extraction)
@@ -14,7 +14,8 @@ LLM Scraper is a TypeScript library that allows you to convert **any** webpages
 - Full type-safety with TypeScript
 - Based on Playwright framework
 - Streaming objects
-- Supports 4 input modes:
+- **NEW** Code-generation
+- Supports 4 formatting modes:
   - `html` for loading raw HTML
   - `markdown` for loading markdown
   - `text` for loading extracted text (using [Readability.js](https://github.com/mozilla/readability))
@@ -137,15 +138,13 @@ await page.close()
 await browser.close()
 ```
 
-### Streaming
+## Streaming
 
 Replace your `run` function with `stream` to get a partial object stream (Vercel AI SDK only).
 
 ```ts
-// Run the scraper
-const { stream } = await scraper.stream(page, schema, {
-  format: 'html',
-})
+// Run the scraper in streaming mode
+const { stream } = await scraper.stream(page, schema)
 
 // Stream the result from LLM
 for await (const data of stream) {
@@ -153,6 +152,20 @@ for await (const data of stream) {
 }
 ```
 
+## NEW: Code-generation
+
+Using the `generate` function, you can generate a re-usable Playwright script that scrapes the contents according to a schema.
+
+```ts
+// Generate code and run it on the page
+const { code } = await scraper.generate(page, schema)
+const result = await page.evaluate(code)
+const data = schema.parse(result)
+
+// Show the parsed result
+console.log(data.news)
+```
+
 ## Contributing
 
 As an open-source project, we welcome contributions from the community. If you are experiencing any bugs or want to add some improvements, please feel free to open an issue or pull request.

+ 41 - 0
examples/codegen.ts

@@ -0,0 +1,41 @@
+import { chromium } from 'playwright'
+import { z } from 'zod'
+import { anthropic } from '@ai-sdk/anthropic'
+import LLMScraper from './../src'
+
+// Launch a browser instance
+const browser = await chromium.launch()
+
+// Initialize LLM provider
+const llm = anthropic('claude-3-5-sonnet-20240620')
+
+// Create a new LLMScraper
+const scraper = new LLMScraper(llm)
+
+// Open new page
+const page = await browser.newPage()
+await page.goto('https://www.bbc.com')
+
+// Define schema to extract contents into
+const schema = z.object({
+  news: z.array(
+    z.object({
+      title: z.string(),
+      description: z.string(),
+      url: z.string(),
+    })
+  ),
+})
+
+// Generate code and run it on the page
+const { code } = await scraper.generate(page, schema)
+console.log('code', code)
+
+const result = await page.evaluate(code)
+const data = schema.parse(result)
+
+// Show the parsed result
+console.log('result', data)
+
+await page.close()
+await browser.close()

+ 1 - 1
examples/streaming.ts

@@ -31,7 +31,7 @@ const schema = z.object({
     .describe('Top 5 stories on Hacker News'),
 })
 
-// Run the scraper
+// Run the scraper in streaming mode
 const { stream } = await scraper.stream(page, schema, {
   format: 'html',
 })

+ 57 - 5
package-lock.json

@@ -1,12 +1,12 @@
 {
   "name": "llm-scraper",
-  "version": "1.2.0",
+  "version": "1.2.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "llm-scraper",
-      "version": "1.2.0",
+      "version": "1.2.2",
       "license": "MIT",
       "dependencies": {
         "ai": "^3.1.12",
@@ -15,6 +15,7 @@
         "zod-to-json-schema": "^3.22.5"
       },
       "devDependencies": {
+        "@ai-sdk/anthropic": "^0.0.30",
         "@ai-sdk/openai": "^0.0.2",
         "@types/node": "^20.12.7",
         "@types/react": "^18.2.79",
@@ -24,6 +25,57 @@
         "zod": "^3.22.5"
       }
     },
+    "node_modules/@ai-sdk/anthropic": {
+      "version": "0.0.30",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-0.0.30.tgz",
+      "integrity": "sha512-iPJjKtIH8yk2cf5BNXLN6sn6TTghOh8puWothX4pPVBM/OKC4RWVjYTEELwUv2VDPIw918KBg2j/T0RfTgu+bw==",
+      "dev": true,
+      "dependencies": {
+        "@ai-sdk/provider": "0.0.12",
+        "@ai-sdk/provider-utils": "1.0.2"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      }
+    },
+    "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider": {
+      "version": "0.0.12",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-0.0.12.tgz",
+      "integrity": "sha512-oOwPQD8i2Ynpn22cur4sk26FW3mSy6t6/X/K1Ay2yGBKYiSpRyLfObhOrZEGsXDx+3euKy4nEZ193R36NM+tpQ==",
+      "dev": true,
+      "dependencies": {
+        "json-schema": "0.4.0"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider-utils": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-1.0.2.tgz",
+      "integrity": "sha512-57f6O4OFVNEpI8Z8o+K40tIB3YQiTw+VCql/qrAO9Utq7Ti1o6+X9tvm177DlZJL7ft0Rwzvgy48S9YhrEKgmA==",
+      "dev": true,
+      "dependencies": {
+        "@ai-sdk/provider": "0.0.12",
+        "eventsource-parser": "1.1.2",
+        "nanoid": "3.3.6",
+        "secure-json-parse": "2.7.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      },
+      "peerDependenciesMeta": {
+        "zod": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/@ai-sdk/openai": {
       "version": "0.0.2",
       "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-0.0.2.tgz",
@@ -610,9 +662,9 @@
       "dev": true
     },
     "node_modules/@types/react": {
-      "version": "18.3.2",
-      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.2.tgz",
-      "integrity": "sha512-Btgg89dAnqD4vV7R3hlwOxgqobUQKgx3MmrQRi0yYbs/P0ym8XozIAlkqVilPqHQwXs4e9Tf63rrCgl58BcO4w==",
+      "version": "18.3.3",
+      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.3.tgz",
+      "integrity": "sha512-hti/R0pS0q1/xx+TsI73XIqk26eBsISZ2R0wUijXIngRK9R/e7Xw/cXVxQK7R5JjW+SV4zGcn5hXjudkN/pLIw==",
       "dev": true,
       "dependencies": {
         "@types/prop-types": "*",

+ 2 - 1
package.json

@@ -1,7 +1,7 @@
 {
   "type": "module",
   "name": "llm-scraper",
-  "version": "1.2.2",
+  "version": "1.5.0",
  "description": "Turn any webpage into structured data using LLMs",
   "main": "dist/index.js",
   "scripts": {
@@ -32,6 +32,7 @@
     "zod-to-json-schema": "^3.22.5"
   },
   "devDependencies": {
+    "@ai-sdk/anthropic": "^0.0.30",
     "@ai-sdk/openai": "^0.0.2",
     "@types/node": "^20.12.7",
     "@types/react": "^18.2.79",

+ 0 - 1
src/cleanup.ts

@@ -35,7 +35,6 @@ export default function cleanup() {
   const attributesToRemove = [
     'style',
     'src',
-    'href',
     'alt',
     'title',
     'role',

+ 34 - 6
src/index.ts

@@ -7,6 +7,7 @@ import {
   generateLlamaCompletions,
   generateAISDKCompletions,
   streamAISDKCompletions,
+  generateAISDKCode,
 } from './models.js'
 
 import cleanup from './cleanup.js'
@@ -107,7 +108,7 @@ export default class LLMScraper {
   private async generateCompletions<T extends z.ZodSchema<any>>(
     page: ScraperLoadResult,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     switch (this.client.constructor) {
       default:
@@ -126,7 +127,7 @@ export default class LLMScraper {
   private async streamCompletions<T extends z.ZodSchema<any>>(
     page: ScraperLoadResult,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     switch (this.client.constructor) {
       default:
@@ -137,7 +138,25 @@ export default class LLMScraper {
           options
         )
       case LlamaModel:
-        throw new Error('Streaming not supported for local models yet')
+        throw new Error('Streaming not supported with GGUF models')
+    }
+  }
+
+  private async generateCode<T extends z.ZodSchema<any>>(
+    page: ScraperLoadResult,
+    schema: T,
+    options?: ScraperLLMOptions
+  ) {
+    switch (this.client.constructor) {
+      default:
+        return generateAISDKCode<T>(
+          this.client as LanguageModelV1,
+          page,
+          schema,
+          options
+        )
+      case LlamaModel:
+        throw new Error('Code-generation not supported with GGUF models')
     }
   }
 
@@ -145,19 +164,28 @@ export default class LLMScraper {
   async run<T extends z.ZodSchema<any>>(
     page: Page,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     const preprocessed = await this.preprocess(page, options)
     return this.generateCompletions<T>(preprocessed, schema, options)
   }
 
-  // Pre-process the page and generate completion
+  // Pre-process the page and stream completion
   async stream<T extends z.ZodSchema<any>>(
     page: Page,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     const preprocessed = await this.preprocess(page, options)
     return this.streamCompletions<T>(preprocessed, schema, options)
   }
+
+  // Pre-process the page and generate code
+  async generate(page, schema: z.ZodSchema<any>, options?: ScraperLLMOptions) {
+    const preprocessed = await this.preprocess(page, {
+      ...options,
+      format: 'cleanup',
+    })
+    return this.generateCode(preprocessed, schema, options)
+  }
 }

+ 49 - 17
src/models.ts

@@ -1,5 +1,5 @@
 import { LanguageModelV1 } from '@ai-sdk/provider'
-import { generateObject, streamObject, UserContent } from 'ai'
+import { generateObject, generateText, streamObject, UserContent } from 'ai'
 import { z } from 'zod'
 import { ScraperLoadResult, ScraperLLMOptions } from './index.js'
 import {
@@ -19,6 +19,9 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
 
+const defaultCodePrompt = `Provide a scraping function (extract) in JavaScript that extracts and formats data according to a schema from the current page.
+Use const syntax. Call the function. No comments or imports. The code you generate will be executed straight away, you shouldn't output anything besides runnable code.`
+
 function prepareAISDKPage(page: ScraperLoadResult): UserContent {
   if (page.format === 'image') {
     return [
@@ -36,20 +39,20 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
   model: LanguageModelV1,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
   const result = await generateObject<z.infer<T>>({
     model,
     messages: [
-      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
-    mode: options.mode,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
+    mode: options?.mode,
   })
 
   return {
@@ -62,19 +65,19 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   model: LanguageModelV1,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
   const { partialObjectStream } = await streamObject<z.infer<T>>({
     model,
     messages: [
-      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
   })
 
   return {
@@ -83,23 +86,52 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   }
 }
 
+export async function generateAISDKCode<T extends z.ZodSchema<any>>(
+  model: LanguageModelV1,
+  page: ScraperLoadResult,
+  schema: T,
+  options?: ScraperLLMOptions
+) {
+  const generatedSchema = zodToJsonSchema(schema)
+  const result = await generateText({
+    model,
+    messages: [
+      { role: 'system', content: options?.prompt || defaultCodePrompt },
+      {
+        role: 'user',
+        content: `Website: ${page.url}
+        Schema: ${JSON.stringify(generatedSchema)}
+        Content: ${page.content}`,
+      },
+    ],
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
+  })
+
+  return {
+    code: result.text,
+    url: page.url,
+  }
+}
+
 export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
   model: LlamaModel,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ): Promise<ScraperCompletionResult<T>> {
   const generatedSchema = zodToJsonSchema(schema) as GbnfJsonSchema
   const grammar = new LlamaJsonSchemaGrammar(generatedSchema) as any // any, because it has type inference going wild
   const context = new LlamaContext({ model })
   const session = new LlamaChatSession({ context })
-  const pagePrompt = `${options.prompt || defaultPrompt}\n${page.content}`
+  const pagePrompt = `${options?.prompt || defaultPrompt}\n${page.content}`
 
   const result = await session.prompt(pagePrompt, {
     grammar,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
   })
 
   const parsed = grammar.parse(result)