Browse Source

Merge pull request #31 from mishushakov/cleanup

Added cleanup mode
Mish Ushakov 1 year ago
parent
commit
4924529943
4 changed files with 82 additions and 15 deletions
  1. 1 1
      examples/ollama.ts
  2. 61 0
      src/cleanup.ts
  3. 8 1
      src/index.ts
  4. 12 13
      src/models.ts

+ 1 - 1
examples/ollama.ts

@@ -23,7 +23,7 @@ const schema = z.object({
 
 // Run the scraper
 const { data } = await scraper.run(page, schema, {
-  format: 'text',
+  format: 'html',
 })
 
 console.log(data)

+ 61 - 0
src/cleanup.ts

@@ -0,0 +1,61 @@
+export default function cleanup() { // Strips non-content elements and noisy attributes from the global `document`, in place; returns nothing.
+  const elementsToRemove = [ // Tag names in lowercase; compared by exact match against element.tagName.toLowerCase().
+    'script',
+    'style',
+    'noscript',
+    'iframe',
+    'svg',
+    'img',
+    'audio',
+    'video',
+    'canvas',
+    'map',
+    'source',
+    'dialog',
+    'menu',
+    'menuitem',
+    'track',
+    'object',
+    'embed',
+    'form',
+    'input',
+    'button',
+    'select',
+    'textarea',
+    'label',
+    'option',
+    'optgroup',
+    'aside',
+    'footer',
+    'header',
+    'nav',
+    'head',
+  ]
+
+  const attributesToRemove = [ // Attribute-name PREFIXES (tested with startsWith below), not exact names.
+    'style',
+    'src', // As a prefix this also covers names such as `srcset`.
+    'href',
+    'alt',
+    'title',
+    'role',
+    'aria-',
+    'tabindex',
+    'on', // NOTE(review): matches every attribute name beginning with "on", not only event handlers - confirm intended.
+    'data-',
+  ]
+
+  const elementTree = document.querySelectorAll('*') // Every element in the document, collected once before anything is removed.
+
+  elementTree.forEach((element) => {
+    if (elementsToRemove.includes(element.tagName.toLowerCase())) {
+      element.remove() // No early return: the detached element still goes through the attribute pass below.
+    }
+
+    Array.from(element.attributes).forEach((attr) => { // Iterate over a copy because attributes are removed inside the loop.
+      if (attributesToRemove.some((a) => attr.name.startsWith(a))) {
+        element.removeAttribute(attr.name)
+      }
+    })
+  })
+}

+ 8 - 1
src/index.ts

@@ -9,9 +9,11 @@ import {
   streamAISDKCompletions,
 } from './models.js'
 
+import cleanup from './cleanup.js'
+
 export type ScraperLoadOptions =
   | {
-      format?: 'html' | 'text' | 'markdown'
+      format?: 'html' | 'text' | 'markdown' | 'cleanup'
     }
   | {
       format: 'custom'
@@ -73,6 +75,11 @@ export default class LLMScraper {
       content = `Page Title: ${readable.title}\n${readable.textContent}`
     }
 
+    if (options.format === 'cleanup') {
+      await page.evaluate(cleanup)
+      content = await page.content()
+    }
+
     if (options.format === 'image') {
       const image = await page.screenshot({ fullPage: options.fullPage })
       content = image.toString('base64')

+ 12 - 13
src/models.ts

@@ -19,13 +19,9 @@ export type ScraperCompletionResult<T extends z.ZodSchema<any>> = {
 const defaultPrompt =
   'You are a sophisticated web scraper. Extract the contents of the webpage'
 
-function prepareAISDKPage(
-  prompt: string,
-  page: ScraperLoadResult
-): UserContent {
+function prepareAISDKPage(page: ScraperLoadResult): UserContent {
   if (page.format === 'image') {
     return [
-      { type: 'text', text: prompt },
       {
         type: 'image',
         image: page.content,
@@ -33,10 +29,7 @@ function prepareAISDKPage(
     ]
   }
 
-  return [
-    { type: 'text', text: prompt },
-    { type: 'text', text: page.content },
-  ]
+  return [{ type: 'text', text: page.content }]
 }
 
 export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
@@ -45,10 +38,13 @@ export async function generateAISDKCompletions<T extends z.ZodSchema<any>>(
   schema: T,
   options: ScraperLLMOptions
 ) {
-  const content = prepareAISDKPage(options.prompt || defaultPrompt, page)
+  const content = prepareAISDKPage(page)
   const result = await generateObject<z.infer<T>>({
     model,
-    messages: [{ role: 'user', content }],
+    messages: [
+      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'user', content },
+    ],
     schema,
     temperature: options.temperature,
     maxTokens: options.maxTokens,
@@ -68,10 +64,13 @@ export async function streamAISDKCompletions<T extends z.ZodSchema<any>>(
   schema: T,
   options: ScraperLLMOptions
 ) {
-  const content = prepareAISDKPage(options.prompt || defaultPrompt, page)
+  const content = prepareAISDKPage(page)
   const { partialObjectStream } = await streamObject<z.infer<T>>({
     model,
-    messages: [{ role: 'user', content }],
+    messages: [
+      { role: 'system', content: options.prompt || defaultPrompt },
+      { role: 'user', content },
+    ],
     schema,
     temperature: options.temperature,
     maxTokens: options.maxTokens,