Mish Ushakov 2 месяца назад
Родитель
Commit
a4a31513b1
13 измененных файлов с 1112 добавлено и 24 удалено
  1. 1 1
      examples/ollama.ts
  2. 6 12
      examples/streaming.ts
  3. 1001 10
      package-lock.json
  4. 2 1
      package.json
  5. 1 0
      src/index.ts
  6. 2 0
      src/models.ts
  7. 0 0
      tests/codegen.test.ts
  8. 38 0
      tests/index.ts
  9. 0 0
      tests/ollama.test.ts
  10. 27 0
      tests/run.test.ts
  11. 27 0
      tests/streaming.test.ts
  12. 0 0
      tests/toolUse.test.ts
  13. 7 0
      vitest.config.ts

+ 1 - 1
examples/ollama.ts

@@ -7,7 +7,7 @@ import LLMScraper from './../src'
 const browser = await chromium.launch()
 
 // Initialize LLM provider
-const llm = ollama('llama3')
+const llm = ollama('gemma3:1b')
 
 // Initialize a new LLMScraper with local model
 const scraper = new LLMScraper(llm)

+ 6 - 12
examples/streaming.ts

@@ -18,27 +18,21 @@ await page.goto('https://news.ycombinator.com')
 
 // Define schema to extract contents into
 const schema = z.object({
-  top: z
-    .array(
-      z.object({
-        title: z.string(),
-        points: z.number(),
-        by: z.string(),
-        commentsURL: z.string(),
-      })
-    )
-    .length(5)
-    .describe('Top 5 stories on Hacker News'),
+  title: z.string(),
+  points: z.number(),
+  by: z.string(),
+  commentsURL: z.string(),
 })
 
 // Run the scraper in streaming mode
 const { stream } = await scraper.stream(page, schema, {
   format: 'html',
+  output: 'array',
 })
 
 // Stream the result from LLM
 for await (const data of stream) {
-  console.log(data.top)
+  console.log(data)
 }
 
 await page.close()

Разница между файлами не показана из-за его большого размера
+ 1001 - 10
package-lock.json


+ 2 - 1
package.json

@@ -6,7 +6,7 @@
   "main": "dist/index.js",
   "scripts": {
     "build": "tsc -p tsconfig.json",
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "test": "vitest run"
   },
   "repository": {
     "type": "git",
@@ -38,6 +38,7 @@
     "ollama-ai-provider": "^1.2.0",
     "playwright": "^1.52.0",
     "typescript": "^5.8.3",
+    "vitest": "^3.1.3",
     "zod": "^3.24.4"
   }
 }

+ 1 - 0
src/index.ts

@@ -36,6 +36,7 @@ export type ScraperLLMOptions = {
   maxTokens?: number
   topP?: number
   mode?: 'auto' | 'json' | 'tool'
+  output?: 'array'
 }
 
 export type ScraperRunOptions = ScraperLLMOptions & ScraperLoadOptions

+ 2 - 0
src/models.ts

@@ -52,6 +52,7 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
     maxTokens: options?.maxTokens,
     topP: options?.topP,
     mode: options?.mode,
+    output: options?.output,
   })
 
   return {
@@ -74,6 +75,7 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
       { role: 'user', content },
     ],
     schema,
+    output: options?.output,
     temperature: options?.temperature,
     maxTokens: options?.maxTokens,
     topP: options?.topP,

+ 0 - 0
tests/codegen.test.ts


+ 38 - 0
tests/index.ts

@@ -0,0 +1,38 @@
+import { test as baseTest, expect, afterAll } from 'vitest'
+import LLMScraper from '../src'
+import { openai } from '@ai-sdk/openai'
+import { chromium, Browser } from 'playwright'
+
+let browser: Browser | null = null
+
+async function getBrowser() {
+  if (!browser) {
+    browser = await chromium.launch()
+  }
+  return browser
+}
+
+afterAll(async () => {
+  if (browser) {
+    await browser.close()
+    browser = null
+  }
+})
+
+export const test = baseTest.extend<{
+  page: Awaited<ReturnType<Browser['newPage']>>
+  scraper: LLMScraper
+}>({
+  page: async ({}, use) => {
+    const browser = await getBrowser()
+    const page = await browser.newPage()
+    await use(page)
+    await page.close()
+  },
+  scraper: async ({}, use) => {
+    const scraper = new LLMScraper(openai('gpt-4o-mini'))
+    await use(scraper)
+  },
+})
+
+export { expect }

+ 0 - 0
tests/ollama.test.ts


+ 27 - 0
tests/run.test.ts

@@ -0,0 +1,27 @@
+import { expect, test } from './index'
+import { z } from 'zod'
+
+test('scrapes top 3 stories from Hacker News', async ({ page, scraper }) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const schema = z.object({
+    top: z
+      .array(
+        z.object({
+          title: z.string(),
+        })
+      )
+      .length(3)
+      .describe('Top 3 stories on Hacker News'),
+  })
+
+  const { data } = await scraper.run(page, schema, {
+    format: 'html',
+  })
+
+  // check length
+  expect(data.top).toHaveLength(3)
+
+  // check schema
+  expect(schema.safeParse(data).success).toBe(true)
+})

+ 27 - 0
tests/streaming.test.ts

@@ -0,0 +1,27 @@
+import { z } from 'zod'
+import { expect, test } from './index'
+
+test('streaming', async ({ page, scraper }) => {
+  await page.goto('https://news.ycombinator.com')
+
+  const schema = z
+    .object({
+      title: z.string(),
+      points: z.number(),
+      by: z.string(),
+      commentsURL: z.string(),
+    })
+    .describe('Top 5 stories on Hacker News')
+
+  const { stream } = await scraper.stream(page, schema, {
+    format: 'html',
+    output: 'array',
+  })
+
+  let last: Partial<z.infer<typeof schema>>[] = []
+  for await (const item of stream) {
+    last = item
+  }
+
+  expect(last).toHaveLength(5)
+}, 10000)

+ 0 - 0
tests/toolUse.test.ts


+ 7 - 0
vitest.config.ts

@@ -0,0 +1,7 @@
+import { defineConfig } from 'vitest/config'
+
+export default defineConfig({
+  test: {
+    include: ['tests/**/*.test.ts'],
+  },
+})

Некоторые файлы не были показаны из-за большого количества измененных файлов