import { jsonSchema } from 'ai'
import { z } from 'zod'
import { zodToJsonSchema } from 'zod-to-json-schema'

import { expect, test } from './index'
// Schema for one Hacker News front-page story row.
// NOTE(review): fields mirror what the scraper is asked to extract; `points`
// is numeric, everything else is a plain string (commentsURL is not validated
// as a URL — presumably intentional to keep extraction lenient).
const storySchema = z.object({
  title: z.string(),
  points: z.number(),
  by: z.string(),
  commentsURL: z.string(),
})

// Top-level extraction target shared by most tests below: exactly five
// stories under the `top` key. The .describe() text is sent to the model
// as extraction guidance.
const schema = z.object({
  top: z.array(storySchema).length(5).describe('Top 5 stories on Hacker News'),
})
- test('scrapes top 5 stories from Hacker News', async ({ page, scraper }) => {
- await page.goto('https://news.ycombinator.com')
- const { data } = await scraper.run(page, schema)
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (image format)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { data } = await scraper.run(page, schema, {
- format: 'image',
- })
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (markdown format)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { data } = await scraper.run(page, schema, {
- format: 'markdown',
- })
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (raw html)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { data } = await scraper.run(page, schema, {
- format: 'raw_html',
- })
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (code generation)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { code } = await scraper.generate(page, schema)
- const result: z.infer<typeof schema> = await page.evaluate(code)
- expect(schema.safeParse(result).success).toBe(true)
- })
- test('scrapes top 5 stories from Hacker News (json schema)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const m = jsonSchema<{ top: { title: string }[] }>(zodToJsonSchema(schema))
- const { data } = await scraper.run(page, m)
- expect(schema.safeParse(data).success).toBe(true)
- })
- test('scrapes example.com (streaming)', async ({ page, scraper }) => {
- await page.goto('https://example.com')
- const { stream } = await scraper.stream(
- page,
- z.object({
- h1: z.string().describe('The main heading of the page'),
- })
- )
- let text = ''
- for await (const item of stream) {
- text = item.h1 || ''
- }
- expect(text).toBe('Example Domain')
- })
- test('scrapes top stories from Hacker News (streaming, array)', async ({
- page,
- scraper,
- }) => {
- await page.goto('https://news.ycombinator.com')
- const { stream } = await scraper.stream(page, storySchema, {
- format: 'raw_html',
- output: 'array',
- })
- let last: Partial<z.infer<typeof storySchema>>[] = []
- for await (const item of stream) {
- last = item as typeof last
- }
- expect(last).toHaveLength(30)
- })