Bladeren bron

fixes codegen

Mish Ushakov 2 maanden geleden
bovenliggende
commit
72f7889515
2 gewijzigde bestanden met 10 toevoegingen en 12 verwijderingen
  1. +4 -5
      src/index.ts
  2. +6 -7
      src/models.ts

+ 4 - 5
src/index.ts

@@ -21,7 +21,9 @@ export type ScraperLLMOptions = {
 export type ScraperGenerateOptions = Omit<
   ScraperLLMOptions,
   'output' | 'mode'
->
+> & {
+  format?: 'html'| 'raw_html'
+}
 
 export type ScraperRunOptions = ScraperLLMOptions & PreProcessOptions
 
@@ -61,10 +63,7 @@ export default class LLMScraper {
     schema: z.Schema<T, z.ZodTypeDef, any> | Schema<T>,
     options?: ScraperGenerateOptions
   ) {
-    const preprocessed = await preprocess(page, {
-      format: 'raw_html',
-      ...options,
-    })
+    const preprocessed = await preprocess(page, options)
     return generateAISDKCode<T>(this.client, preprocessed, schema, options)
   }
 }

+ 6 - 7
src/models.ts

@@ -15,14 +15,13 @@ const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
 
 const defaultCodePrompt =
-  "Provide a scraping function in JavaScript that extracts and formats data according to a schema from the current page. The function must be IIFE. No comments or imports. The code you generate will be executed straight away, you shouldn't output anything besides runnable code."
+  "Provide a scraping function in JavaScript that extracts and returns data according to a schema from the current page. The function must be IIFE. No comments or imports. No console.log. The code you generate will be executed straight away, you shouldn't output anything besides runnable code."
 
 function stripMarkdownBackticks(text: string) {
-  const match = text.match(/^```(.*)\n(.*)/)
-  if (match) {
-    return match[2]
-  }
-  return text
+  let trimmed = text.trim()
+  trimmed = trimmed.replace(/^```(?:javascript)?\s*/i, '')
+  trimmed = trimmed.replace(/\s*```$/i, '')
+  return trimmed
 }
 
 function prepareAISDKPage(page: PreProcessResult): UserContent {
@@ -109,7 +108,7 @@ export async function generateAISDKCode<T>(
         role: 'user',
         content: `Website: ${page.url}
         Schema: ${JSON.stringify(parsedSchema)}
-        Content: ${stripMarkdownBackticks(page.content)}`,
+        Content: ${page.content}`,
       },
     ],
     temperature: options?.temperature,