import { jsonSchema } from 'ai'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'

import { expect, test } from './index'
// Schema for one Hacker News front-page story row.
// NOTE(review): fields mirror what the scraper is asked to extract; `points`
// is numeric, everything else is a plain string (commentsURL is not validated
// as a URL — presumably intentional to keep extraction lenient).
const storySchema = z.object({
  title: z.string(),
  points: z.number(),
  by: z.string(),
  commentsURL: z.string(),
})

// Top-level extraction target shared by most tests below: exactly five
// stories under the `top` key. The .describe() text is sent to the model
// as extraction guidance.
const schema = z.object({
  top: z.array(storySchema).length(5).describe('Top 5 stories on Hacker News'),
})
- test('scrapes top 5 stories from Hacker News', async ({ page, scraper }) => {
- await page.goto('https://news.ycombinator.com')
- const { data } = await scraper.run(page, schema)
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (image format)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { data } = await scraper.run(page, schema, {
- format: 'image',
- })
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (markdown format)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { data } = await scraper.run(page, schema, {
- format: 'markdown',
- })
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (raw html)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { data } = await scraper.run(page, schema, {
- format: 'raw_html',
- })
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (code generation)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { code } = await scraper.generate(page, schema)
- const result: z.infer<typeof schema> = await page.evaluate(code)
- expect(schema.safeParse(result).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (json schema)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const m = jsonSchema<{ top: { title: string }[] }>(zodToJsonSchema(schema))
- const { data } = await scraper.run(page, m)
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes example.com (streaming)', async ({ page, scraper }) => {
- await page.goto('https://example.com')
- const { stream } = await scraper.stream(
- page,
- z.object({
- h1: z.string().describe('The main heading of the page'),
- })
- )
- let text = ''
- for await (const item of stream) {
- text = item.h1 || ''
- }
- expect(text).toBe('Example Domain')
- })
- test('scrapes top stories from Hacker News (streaming, array)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { stream } = await scraper.stream(page, storySchema, {
- format: 'raw_html',
- output: 'array',
- })
- let last: Partial<z.infer<typeof storySchema>>[] = []
- for await (const item of stream) {
- last = item as typeof last
- }
- expect(last).toHaveLength(30)
- })