Explorar o código

cleanup (WIP)

Mish Ushakov hai 1 ano
pai
achega
add343ed18
Modificáronse 3 ficheiros con 70 adicións e 2 borrados
  1. 1 1
      examples/ollama.ts
  2. 61 0
      src/cleanup.ts
  3. 8 1
      src/index.ts

+ 1 - 1
examples/ollama.ts

@@ -23,7 +23,7 @@ const schema = z.object({
 
 // Run the scraper
 const { data } = await scraper.run(page, schema, {
-  format: 'text',
+  format: 'cleanup',
 })
 
 console.log(data)

+ 61 - 0
src/cleanup.ts

@@ -0,0 +1,61 @@
+/**
+ * Strips non-content elements and noisy attributes from the page DOM.
+ *
+ * The caller passes this function to `page.evaluate`, so it is meant to run
+ * against the page's global `document`; keep it self-contained (no imports
+ * or outer-scope references). Mutates the DOM in place, returns nothing.
+ */
+export default function cleanup() {
+  // Tags removed together with their whole subtree.
+  const elementsToRemove = new Set([
+    // scripting, styling and document metadata
+    'script', 'style', 'noscript', 'head',
+    // media and embedded content
+    'iframe', 'svg', 'img', 'audio', 'video', 'canvas', 'map', 'source',
+    'track', 'object', 'embed',
+    // interactive widgets and form controls
+    'dialog', 'menu', 'menuitem', 'form', 'input', 'button', 'select',
+    'textarea', 'label', 'option', 'optgroup',
+    // page chrome around the main content
+    'aside', 'footer', 'header', 'nav',
+  ])
+
+  // Attributes removed when the name matches exactly.
+  const attributesToRemove = new Set([
+    'style',
+    'src',
+    'href',
+    'alt',
+    'title',
+    'role',
+    'tabindex',
+  ])
+
+  // Attribute families removed by name prefix: aria-*, data-* and on*
+  // event handlers.
+  const attributePrefixesToRemove = ['aria-', 'data-', 'on']
+
+  // querySelectorAll returns a static list, so removing nodes while
+  // iterating over it is safe.
+  document.querySelectorAll('*').forEach((element) => {
+    if (elementsToRemove.has(element.tagName.toLowerCase())) {
+      element.remove()
+      // The node is detached now; stripping its attributes is wasted work.
+      return
+    }
+
+    // Copy first: `element.attributes` is live and shrinks on removal.
+    Array.from(element.attributes).forEach((attr) => {
+      const name = attr.name.toLowerCase()
+      // Names are compared literally rather than as unanchored regular
+      // expressions, so `on` cannot strip unrelated attributes such as
+      // `content`, `action` or `controls`.
+      const isUnwanted =
+        attributesToRemove.has(name) ||
+        attributePrefixesToRemove.some((prefix) => name.startsWith(prefix))
+      if (isUnwanted) {
+        element.removeAttribute(attr.name)
+      }
+    })
+  })
+}

+ 8 - 1
src/index.ts

@@ -9,9 +9,11 @@ import {
   streamAISDKCompletions,
 } from './models.js'
 
+import cleanup from './cleanup.js'
+
 export type ScraperLoadOptions =
   | {
-      format?: 'html' | 'text' | 'markdown'
+      format?: 'html' | 'text' | 'markdown' | 'cleanup'
     }
   | {
       format: 'custom'
@@ -72,6 +74,11 @@ export default class LLMScraper {
       content = `Page Title: ${readable.title}\n${readable.textContent}`
     }
 
+    if (options.format === 'cleanup') {
+      await page.evaluate(cleanup)
+      content = await page.content()
+    }
+
     if (options.format === 'image') {
       const image = await page.screenshot({ fullPage: options.fullPage })
       content = image.toString('base64')