|
@@ -1,5 +1,5 @@
|
|
|
import { LanguageModelV1 } from '@ai-sdk/provider'
|
|
|
-import { generateObject, streamObject, UserContent } from 'ai'
|
|
|
+import { generateObject, generateText, streamObject, UserContent } from 'ai'
|
|
|
import { z } from 'zod'
|
|
|
import { ScraperLoadResult, ScraperLLMOptions } from './index.js'
|
|
|
import {
|
|
@@ -19,6 +19,9 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
|
|
|
const defaultPrompt =
|
|
|
'You are a sophisticated web scraper. Extract the contents of the webpage'
|
|
|
|
|
|
+const defaultCodePrompt = `Provide a scraping function (extract) in JavaScript that extracts and formats data according to a schema from the current page.
|
|
|
+Use const syntax. Call the function. No comments or imports. The code you generate will be executed straight away, you shouldn't output anything besides runnable code.`
|
|
|
+
|
|
|
function prepareAISDKPage(page: ScraperLoadResult): UserContent {
|
|
|
if (page.format === 'image') {
|
|
|
return [
|
|
@@ -36,20 +39,20 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
|
|
|
model: LanguageModelV1,
|
|
|
page: ScraperLoadResult,
|
|
|
schema: T,
|
|
|
- options: ScraperLLMOptions
|
|
|
+ options?: ScraperLLMOptions
|
|
|
) {
|
|
|
const content = prepareAISDKPage(page)
|
|
|
const result = await generateObject<z.infer<T>>({
|
|
|
model,
|
|
|
messages: [
|
|
|
- { role: 'system', content: options.prompt || defaultPrompt },
|
|
|
+ { role: 'system', content: options?.prompt || defaultPrompt },
|
|
|
{ role: 'user', content },
|
|
|
],
|
|
|
schema,
|
|
|
- temperature: options.temperature,
|
|
|
- maxTokens: options.maxTokens,
|
|
|
- topP: options.topP,
|
|
|
- mode: options.mode,
|
|
|
+ temperature: options?.temperature,
|
|
|
+ maxTokens: options?.maxTokens,
|
|
|
+ topP: options?.topP,
|
|
|
+ mode: options?.mode,
|
|
|
})
|
|
|
|
|
|
return {
|
|
@@ -62,19 +65,19 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
|
|
|
model: LanguageModelV1,
|
|
|
page: ScraperLoadResult,
|
|
|
schema: T,
|
|
|
- options: ScraperLLMOptions
|
|
|
+ options?: ScraperLLMOptions
|
|
|
) {
|
|
|
const content = prepareAISDKPage(page)
|
|
|
const { partialObjectStream } = await streamObject<z.infer<T>>({
|
|
|
model,
|
|
|
messages: [
|
|
|
- { role: 'system', content: options.prompt || defaultPrompt },
|
|
|
+ { role: 'system', content: options?.prompt || defaultPrompt },
|
|
|
{ role: 'user', content },
|
|
|
],
|
|
|
schema,
|
|
|
- temperature: options.temperature,
|
|
|
- maxTokens: options.maxTokens,
|
|
|
- topP: options.topP,
|
|
|
+ temperature: options?.temperature,
|
|
|
+ maxTokens: options?.maxTokens,
|
|
|
+ topP: options?.topP,
|
|
|
})
|
|
|
|
|
|
return {
|
|
@@ -83,23 +86,52 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+export async function generateAISDKCode<T extends z.ZodSchema<any>>(
|
|
|
+ model: LanguageModelV1,
|
|
|
+ page: ScraperLoadResult,
|
|
|
+ schema: T,
|
|
|
+ options?: ScraperLLMOptions
|
|
|
+) {
|
|
|
+ const generatedSchema = zodToJsonSchema(schema)
|
|
|
+ const result = await generateText({
|
|
|
+ model,
|
|
|
+ messages: [
|
|
|
+ { role: 'system', content: options?.prompt || defaultCodePrompt },
|
|
|
+ {
|
|
|
+ role: 'user',
|
|
|
+ content: `Website: ${page.url}
|
|
|
+ Schema: ${JSON.stringify(generatedSchema)}
|
|
|
+ Content: ${page.content}`,
|
|
|
+ },
|
|
|
+ ],
|
|
|
+ temperature: options?.temperature,
|
|
|
+ maxTokens: options?.maxTokens,
|
|
|
+ topP: options?.topP,
|
|
|
+ })
|
|
|
+
|
|
|
+ return {
|
|
|
+ code: result.text,
|
|
|
+ url: page.url,
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
export async function generateLlamaCompletions<T extends z.ZodSchema<any>>(
|
|
|
model: LlamaModel,
|
|
|
page: ScraperLoadResult,
|
|
|
schema: T,
|
|
|
- options: ScraperLLMOptions
|
|
|
+ options?: ScraperLLMOptions
|
|
|
): Promise<ScraperCompletionResult<T>> {
|
|
|
const generatedSchema = zodToJsonSchema(schema) as GbnfJsonSchema
|
|
|
const grammar = new LlamaJsonSchemaGrammar(generatedSchema) as any // any, because it has type inference going wild
|
|
|
const context = new LlamaContext({ model })
|
|
|
const session = new LlamaChatSession({ context })
|
|
|
- const pagePrompt = `${options.prompt || defaultPrompt}\n${page.content}`
|
|
|
+ const pagePrompt = `${options?.prompt || defaultPrompt}\n${page.content}`
|
|
|
|
|
|
const result = await session.prompt(pagePrompt, {
|
|
|
grammar,
|
|
|
- temperature: options.temperature,
|
|
|
- maxTokens: options.maxTokens,
|
|
|
- topP: options.topP,
|
|
|
+ temperature: options?.temperature,
|
|
|
+ maxTokens: options?.maxTokens,
|
|
|
+ topP: options?.topP,
|
|
|
})
|
|
|
|
|
|
const parsed = grammar.parse(result)
|