Initial commit from Create Llama
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
import os
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class CrawlUrl(BaseModel):
|
||||
base_url: str
|
||||
prefix: str
|
||||
max_depth: int = Field(default=1, ge=0)
|
||||
|
||||
|
||||
class WebLoaderConfig(BaseModel):
|
||||
driver_arguments: list[str] = Field(default=None)
|
||||
urls: list[CrawlUrl]
|
||||
|
||||
|
||||
def get_web_documents(config: WebLoaderConfig):
|
||||
from llama_index.readers.web import WholeSiteReader
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
options = Options()
|
||||
driver_arguments = config.driver_arguments or []
|
||||
for arg in driver_arguments:
|
||||
options.add_argument(arg)
|
||||
|
||||
docs = []
|
||||
for url in config.urls:
|
||||
scraper = WholeSiteReader(
|
||||
prefix=url.prefix,
|
||||
max_depth=url.max_depth,
|
||||
driver=webdriver.Chrome(options=options),
|
||||
)
|
||||
docs.extend(scraper.load_data(url.base_url))
|
||||
|
||||
return docs
|
||||
Reference in New Issue
Block a user