Mish Ushakov committed 2 months ago (commit 274f2989f8)
5 changed files with 46 additions and 7 deletions
  1. examples/codegen.ts (+1 -1)
  2. src/index.ts (+3 -3)
  3. src/models.ts (+10 -2)
  4. tests/codegen.test.ts (+31 -0)
  5. tests/streaming.test.ts (+1 -1)

+ 1 - 1
examples/codegen.ts

@@ -7,7 +7,7 @@ import LLMScraper from './../src'
 const browser = await chromium.launch()
 
 // Initialize LLM provider
-const llm = openai('gpt-4o')
+const llm = openai('gpt-4o-mini')
 
 // Create a new LLMScraper
 const scraper = new LLMScraper(llm)

+ 3 - 3
src/index.ts

@@ -13,7 +13,7 @@ import cleanup from './cleanup.js'
 
 export type ScraperLoadOptions =
   | {
-      format?: 'html' | 'text' | 'markdown' | 'cleanup'
+      format?: 'html' | 'text' | 'markdown' | 'raw'
     }
   | {
       format: 'custom'
@@ -76,7 +76,7 @@ export default class LLMScraper {
       content = `Page Title: ${readable.title}\n${readable.textContent}`
     }
 
-    if (options.format === 'cleanup') {
+    if (options.format === 'html') {
       await page.evaluate(cleanup)
       content = await page.content()
     }
@@ -157,8 +157,8 @@ export default class LLMScraper {
     options?: ScraperLLMOptions
   ) {
     const preprocessed = await this.preprocess(page, {
+      format: 'html',
       ...options,
-      format: 'cleanup',
     })
     return this.generateCode(preprocessed, schema, options)
   }
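
The reordered spread in the hunk above changes behaviour: { format: 'html', ...options } treats the cleaned-up HTML pass as a default that the caller can override, while the previous { ...options, format: 'cleanup' } always forced cleanup regardless of the options passed in. A minimal usage sketch, not part of this commit, assuming ScraperLLMOptions carries the same optional format field that the spread into preprocess implies (import paths are likewise assumptions):

import { chromium } from 'playwright'
import { openai } from '@ai-sdk/openai'
import { z } from 'zod'
import LLMScraper from 'llm-scraper'

const browser = await chromium.launch()
const page = await browser.newPage()
await page.goto('https://news.ycombinator.com')

const scraper = new LLMScraper(openai('gpt-4o-mini'))
const schema = z.object({ title: z.string() })

// No format given: falls back to the new 'html' default (cleanup pass + page.content())
const { code } = await scraper.generate(page, schema)

// ...options is spread last, so a caller-supplied format now wins over the default
const { code: fromMarkdown } = await scraper.generate(page, schema, { format: 'markdown' })

await browser.close()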

+ 10 - 2
src/models.ts

@@ -21,6 +21,14 @@ const defaultPrompt =
 const defaultCodePrompt = `Provide a scraping function in JavaScript that extracts and formats data according to a schema from the current page.
 The function must be IIFE. No comments or imports. The code you generate will be executed straight away, you shouldn't output anything besides runnable code.`
 
+function stripMarkdownBackticks(text: string) {
+  const match = text.match(/^```.*\n([\s\S]*?)\n?```\s*$/)
+  if (match) {
+    return match[1]
+  }
+  return text
+}
+
 function prepareAISDKPage(page: ScraperLoadResult): UserContent {
   if (page.format === 'image') {
     return [
@@ -104,7 +112,7 @@ export async function generateAISDKCode<T extends z.ZodSchema<any>>(
         role: 'user',
         content: `Website: ${page.url}
         Schema: ${JSON.stringify(parsedSchema)}
-        Content: ${page.content}`,
+        Content: ${stripMarkdownBackticks(page.content)}`,
       },
     ],
     temperature: options?.temperature,
@@ -113,7 +121,7 @@ export async function generateAISDKCode<T extends z.ZodSchema<any>>(
   })
 
   return {
-    code: result.text,
+    code: stripMarkdownBackticks(result.text),
     url: page.url,
   }
 }
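
stripMarkdownBackticks is there because chat models often wrap their output in a Markdown fence even when told not to, and the generated code is evaluated as-is in the page. The capture group uses [\s\S] because . does not match newlines in a JavaScript regex without the s flag, so a plain (.*) would keep only the first line of a multi-line snippet. A quick illustration, not part of the commit (the sample strings are invented):

const fenced = '```javascript\n(() => {\n  return { ok: true }\n})()\n```'
stripMarkdownBackticks(fenced)        // '(() => {\n  return { ok: true }\n})()'
stripMarkdownBackticks('(() => 1)()') // bare code is returned unchanged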

+ 31 - 0
tests/codegen.test.ts

@@ -0,0 +1,31 @@
+import { expect, test } from './index'
+import { z } from 'zod'
+
+test('scrapes top 3 stories from Hacker News', async ({ page, scraper }) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const schema = z.object({
+    top: z
+      .array(
+        z.object({
+          title: z.string(),
+        })
+      )
+      .length(3)
+      .describe('Top 3 stories on Hacker News'),
+  })
+
+  // Generate scraping code
+  const { code } = await scraper.generate(page, schema)
+  // throw new Error(code) // debug aid, disabled so the assertions below run
+
+  // Evaluate the generated code in the page context
+  const result = await page.evaluate(code)
+
+  // Validate the result
+  const parsed = schema.safeParse(result)
+  expect(parsed.success).toBe(true)
+  if (parsed.success) {
+    expect(parsed.data.top).toHaveLength(3)
+  }
+})

+ 1 - 1
tests/streaming.test.ts

@@ -14,8 +14,8 @@ test('streaming', async ({ page, scraper }) => {
     .describe('Top 5 stories on Hacker News')
 
   const { stream } = await scraper.stream(page, schema, {
-    format: 'html',
     output: 'array',
+    format: 'html',
   })
 
   let last: Partial<z.infer<typeof schema>>[] = []