优化了提示词

This commit is contained in:
chentianrui
2024-08-23 18:35:19 +08:00
parent 7691b22274
commit a200e8adfc
39 changed files with 3083 additions and 21 deletions
@@ -0,0 +1,37 @@
import os
import json
from pydantic import BaseModel, Field
class CrawlUrl(BaseModel):
base_url: str
prefix: str
max_depth: int = Field(default=1, ge=0)
class WebLoaderConfig(BaseModel):
driver_arguments: list[str] = Field(default=None)
urls: list[CrawlUrl] = []
def get_web_documents(config: WebLoaderConfig):
from llama_index.readers.web import WholeSiteReader
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
driver_arguments = config.driver_arguments or []
for arg in driver_arguments:
options.add_argument(arg)
docs = []
urls = config.urls or []
for url in config.urls:
scraper = WholeSiteReader(
prefix=url.prefix,
max_depth=url.max_depth,
driver=webdriver.Chrome(options=options),
)
docs.extend(scraper.load_data(url.base_url))
return docs