Explorar el Código

added codegen

Mish Ushakov hace 1 año
padre
commit
7246e50f6e
Se han modificado 6 ficheros con 182 adiciones y 29 borrados
  1. 41 0
      examples/codegen.ts
  2. 57 5
      package-lock.json
  3. 1 0
      package.json
  4. 0 1
      src/cleanup.ts
  5. 34 6
      src/index.ts
  6. 49 17
      src/models.ts

+ 41 - 0
examples/codegen.ts

@@ -0,0 +1,41 @@
+import { chromium } from 'playwright'
+import { z } from 'zod'
+import { anthropic } from '@ai-sdk/anthropic'
+import LLMScraper from './../src'
+
+// Launch a browser instance
+const browser = await chromium.launch()
+
+// Initialize LLM provider
+const llm = anthropic('claude-3-5-sonnet-20240620')
+
+// Create a new LLMScraper
+const scraper = new LLMScraper(llm)
+
+// Open new page
+const page = await browser.newPage()
+await page.goto('https://www.bbc.com')
+
+// Define schema to extract contents into
+const schema = z.object({
+  news: z.array(
+    z.object({
+      title: z.string(),
+      description: z.string(),
+      url: z.string(),
+    })
+  ),
+})
+
+// Ask the LLM to generate scraping code for this page and schema
+const { code } = await scraper.generate(page, schema)
+console.log('code', code)
+
+const result = await page.evaluate(code)
+const validated = schema.parse(result)
+
+// Show the schema-validated result of running the generated code in the page
+console.log('result', validated.news)
+
+await page.close()
+await browser.close()

+ 57 - 5
package-lock.json

@@ -1,12 +1,12 @@
 {
   "name": "llm-scraper",
-  "version": "1.2.0",
+  "version": "1.2.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "llm-scraper",
-      "version": "1.2.0",
+      "version": "1.2.2",
       "license": "MIT",
       "dependencies": {
         "ai": "^3.1.12",
@@ -15,6 +15,7 @@
         "zod-to-json-schema": "^3.22.5"
       },
       "devDependencies": {
+        "@ai-sdk/anthropic": "^0.0.30",
         "@ai-sdk/openai": "^0.0.2",
         "@types/node": "^20.12.7",
         "@types/react": "^18.2.79",
@@ -24,6 +25,57 @@
         "zod": "^3.22.5"
       }
     },
+    "node_modules/@ai-sdk/anthropic": {
+      "version": "0.0.30",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/anthropic/-/anthropic-0.0.30.tgz",
+      "integrity": "sha512-iPJjKtIH8yk2cf5BNXLN6sn6TTghOh8puWothX4pPVBM/OKC4RWVjYTEELwUv2VDPIw918KBg2j/T0RfTgu+bw==",
+      "dev": true,
+      "dependencies": {
+        "@ai-sdk/provider": "0.0.12",
+        "@ai-sdk/provider-utils": "1.0.2"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      }
+    },
+    "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider": {
+      "version": "0.0.12",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-0.0.12.tgz",
+      "integrity": "sha512-oOwPQD8i2Ynpn22cur4sk26FW3mSy6t6/X/K1Ay2yGBKYiSpRyLfObhOrZEGsXDx+3euKy4nEZ193R36NM+tpQ==",
+      "dev": true,
+      "dependencies": {
+        "json-schema": "0.4.0"
+      },
+      "engines": {
+        "node": ">=18"
+      }
+    },
+    "node_modules/@ai-sdk/anthropic/node_modules/@ai-sdk/provider-utils": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-1.0.2.tgz",
+      "integrity": "sha512-57f6O4OFVNEpI8Z8o+K40tIB3YQiTw+VCql/qrAO9Utq7Ti1o6+X9tvm177DlZJL7ft0Rwzvgy48S9YhrEKgmA==",
+      "dev": true,
+      "dependencies": {
+        "@ai-sdk/provider": "0.0.12",
+        "eventsource-parser": "1.1.2",
+        "nanoid": "3.3.6",
+        "secure-json-parse": "2.7.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "peerDependencies": {
+        "zod": "^3.0.0"
+      },
+      "peerDependenciesMeta": {
+        "zod": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/@ai-sdk/openai": {
       "version": "0.0.2",
       "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-0.0.2.tgz",
@@ -610,9 +662,9 @@
       "dev": true
     },
     "node_modules/@types/react": {
-      "version": "18.3.2",
-      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.2.tgz",
-      "integrity": "sha512-Btgg89dAnqD4vV7R3hlwOxgqobUQKgx3MmrQRi0yYbs/P0ym8XozIAlkqVilPqHQwXs4e9Tf63rrCgl58BcO4w==",
+      "version": "18.3.3",
+      "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.3.tgz",
+      "integrity": "sha512-hti/R0pS0q1/xx+TsI73XIqk26eBsISZ2R0wUijXIngRK9R/e7Xw/cXVxQK7R5JjW+SV4zGcn5hXjudkN/pLIw==",
       "dev": true,
       "dependencies": {
         "@types/prop-types": "*",

+ 1 - 0
package.json

@@ -32,6 +32,7 @@
     "zod-to-json-schema": "^3.22.5"
   },
   "devDependencies": {
+    "@ai-sdk/anthropic": "^0.0.30",
     "@ai-sdk/openai": "^0.0.2",
     "@types/node": "^20.12.7",
     "@types/react": "^18.2.79",

+ 0 - 1
src/cleanup.ts

@@ -35,7 +35,6 @@ export default function cleanup() {
   const attributesToRemove = [
     'style',
     'src',
-    'href',
     'alt',
     'title',
     'role',

+ 34 - 6
src/index.ts

@@ -7,6 +7,7 @@ import {
   generateLlamaCompletions,
   generateAISDKCompletions,
   streamAISDKCompletions,
+  generateAISDKCode,
 } from './models.js'
 
 import cleanup from './cleanup.js'
@@ -107,7 +108,7 @@ export default class LLMScraper {
   private async generateCompletions<T extends z.ZodSchema<any>>(
     page: ScraperLoadResult,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     switch (this.client.constructor) {
       default:
@@ -126,7 +127,7 @@ export default class LLMScraper {
   private async streamCompletions<T extends z.ZodSchema<any>>(
     page: ScraperLoadResult,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     switch (this.client.constructor) {
       default:
@@ -137,7 +138,25 @@ export default class LLMScraper {
           options
         )
       case LlamaModel:
-        throw new Error('Streaming not supported for local models yet')
+        throw new Error('Streaming not supported with GGUF models')
+    }
+  }
+
+  private async generateCode<T extends z.ZodSchema<any>>(
+    page: ScraperLoadResult,
+    schema: T,
+    options?: ScraperLLMOptions
+  ) {
+    switch (this.client.constructor) {
+      default:
+        return generateAISDKCode<T>(
+          this.client as LanguageModelV1,
+          page,
+          schema,
+          options
+        )
+      case LlamaModel:
+        throw new Error('Code-generation not supported with GGUF models')
     }
   }
 
@@ -145,19 +164,28 @@ export default class LLMScraper {
   async run<T extends z.ZodSchema<any>>(
     page: Page,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     const preprocessed = await this.preprocess(page, options)
     return this.generateCompletions<T>(preprocessed, schema, options)
   }
 
-  // Pre-process the page and generate completion
+  // Pre-process the page and stream completion
   async stream<T extends z.ZodSchema<any>>(
     page: Page,
     schema: T,
-    options: ScraperRunOptions
+    options?: ScraperRunOptions
   ) {
     const preprocessed = await this.preprocess(page, options)
     return this.streamCompletions<T>(preprocessed, schema, options)
   }
+
+  // Pre-process the page and generate code
+  async generate(page, schema: z.ZodSchema<any>, options?: ScraperLLMOptions) {
+    const preprocessed = await this.preprocess(page, {
+      ...options,
+      format: 'cleanup',
+    })
+    return this.generateCode(preprocessed, schema, options)
+  }
 }

+ 49 - 17
src/models.ts

@@ -1,5 +1,5 @@
 import { LanguageModelV1 } from '@ai-sdk/provider'
-import { generateObject, streamObject, UserContent } from 'ai'
+import { generateObject, generateText, streamObject, UserContent } from 'ai'
 import { z } from 'zod'
 import { ScraperLoadResult, ScraperLLMOptions } from './index.js'
 import {
@@ -19,6 +19,9 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
 
+const defaultCodePrompt = `Provide a scraping function (extract) in JavaScript that extracts and formats data according to a schema from the current page.
+Use const syntax. Call the function. No comments or imports. The code you generate will be executed straight away, you shouldn't output anything besides runnable code.`
+
 function prepareAISDKPage(page: ScraperLoadResult): UserContent {
   if (page.format === 'image') {
     return [
@@ -36,20 +39,20 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
   model: LanguageModelV1,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
   const result = await generateObject<z.infer<T>>({
     model,
     messages: [
-      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
-    mode: options.mode,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
+    mode: options?.mode,
   })
 
   return {
@@ -62,19 +65,19 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   model: LanguageModelV1,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ) {
   const content = prepareAISDKPage(page)
   const { partialObjectStream } = await streamObject<z.infer<T>>({
     model,
     messages: [
-      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'system', content: options?.prompt || defaultPrompt },
       { role: 'user', content },
     ],
     schema,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
   })
 
   return {
@@ -83,23 +86,52 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   }
 }
 
+export async function generateAISDKCode<T extends z.ZodSchema<any>>(
+  model: LanguageModelV1,
+  page: ScraperLoadResult,
+  schema: T,
+  options?: ScraperLLMOptions
+) {
+  const generatedSchema = zodToJsonSchema(schema)
+  const result = await generateText({
+    model,
+    messages: [
+      { role: 'system', content: options?.prompt || defaultCodePrompt },
+      {
+        role: 'user',
+        content: `Website: ${page.url}
+        Schema: ${JSON.stringify(generatedSchema)}
+        Content: ${page.content}`,
+      },
+    ],
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
+  })
+
+  return {
+    code: result.text,
+    url: page.url,
+  }
+}
+
 export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
   model: LlamaModel,
   page: ScraperLoadResult,
   schema: T,
-  options: ScraperLLMOptions
+  options?: ScraperLLMOptions
 ): Promise<ScraperCompletionResult<T>> {
   const generatedSchema = zodToJsonSchema(schema) as GbnfJsonSchema
   const grammar = new LlamaJsonSchemaGrammar(generatedSchema) as any // any, because it has type inference going wild
   const context = new LlamaContext({ model })
   const session = new LlamaChatSession({ context })
-  const pagePrompt = `${options.prompt || defaultPrompt}\n${page.content}`
+  const pagePrompt = `${options?.prompt || defaultPrompt}\n${page.content}`
 
   const result = await session.prompt(pagePrompt, {
     grammar,
-    temperature: options.temperature,
-    maxTokens: options.maxTokens,
-    topP: options.topP,
+    temperature: options?.temperature,
+    maxTokens: options?.maxTokens,
+    topP: options?.topP,
   })
 
   const parsed = grammar.parse(result)