37 lines
906 B
Python
37 lines
906 B
Python
import json
import os
from typing import Optional

from pydantic import BaseModel, Field
|
|
|
|
|
|
class CrawlUrl(BaseModel):
    """One crawl target: where to start, which prefix to use, how deep to go.

    Attributes:
        base_url: URL handed to the reader's ``load_data`` call as the
            starting point of the crawl.
        prefix: Value forwarded to the site reader as its ``prefix``
            argument (presumably restricts which links are followed --
            confirm against the reader's documentation).
        max_depth: Value forwarded to the site reader as its ``max_depth``
            argument. Must be zero or greater; defaults to 1.
    """

    base_url: str
    prefix: str
    # ge=0 makes validation reject negative depths.
    max_depth: int = Field(1, ge=0)
|
|
|
|
|
|
class WebLoaderConfig(BaseModel):
    """Configuration for loading documents from one or more websites.

    Attributes:
        driver_arguments: Optional list of command-line arguments that are
            added to the browser options before crawling. ``None`` (the
            default) means no extra arguments.
        urls: The crawl targets to process, in order.
    """

    # Declared Optional so the annotation agrees with the ``None`` default and
    # an explicit ``None``/null coming from a config file passes validation
    # instead of being rejected as "not a list".
    driver_arguments: Optional[list[str]] = Field(default=None)
    urls: list[CrawlUrl]
|
|
|
|
|
|
def get_web_documents(config: WebLoaderConfig):
    """Crawl every configured site and return all loaded documents.

    Args:
        config: Browser driver arguments plus the list of crawl targets.

    Returns:
        One flat list containing the results of ``load_data`` for each entry
        of ``config.urls``, concatenated in configuration order.
    """
    # Imported inside the function, so these packages are only required when
    # a crawl is actually run.
    from llama_index.readers.web import WholeSiteReader
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    chrome_options = Options()
    # driver_arguments may be None; treat that the same as "no arguments".
    for flag in config.driver_arguments or []:
        chrome_options.add_argument(flag)

    collected = []
    for target in config.urls:
        # NOTE(review): a fresh Chrome driver is started for every target and
        # is never quit in this function -- confirm the reader shuts it down.
        browser = webdriver.Chrome(options=chrome_options)
        reader = WholeSiteReader(
            prefix=target.prefix,
            max_depth=target.max_depth,
            driver=browser,
        )
        collected += reader.load_data(target.base_url)
    return collected
|