38 lines
1.1 KiB
Python
38 lines
1.1 KiB
Python
"""
|
||
===================================
|
||
@Auther:WenZ
|
||
@Company: BooWay
|
||
@project:dify_lab
|
||
===================================
|
||
"""
|
||
import pandas as pd
|
||
|
||
def read_title_column(csv_file: str) -> list:
    """Read the 'title' column of a UTF-8 encoded CSV file.

    This is a best-effort reader: any failure is reported on stdout and an
    empty list is returned instead of propagating the error.

    :param csv_file: Path of the CSV file to read.
    :return: Values of the 'title' column with missing entries removed, or
        an empty list if the file cannot be read or has no 'title' column.
    """
    # Keep the try body limited to the call that actually performs I/O and
    # parsing, so unrelated errors are not attributed to reading the file.
    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
    except Exception as e:  # deliberate best-effort: report and fall back
        print(f"读取文件时发生错误: {e}")
        return []

    # Guard clause instead of raising an exception only to catch it again;
    # the printed message is the same one the raise/except pair produced.
    if 'title' not in df.columns:
        missing_column_msg = "CSV文件中未找到'title'列"
        print(f"读取文件时发生错误: {missing_column_msg}")
        return []

    # Drop missing values before converting the column to a plain list.
    return df['title'].dropna().tolist()
|
||
|
||
# Third-party imports needed only by the indexing steps below.
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Titles to index; read_title_column returns [] when the CSV cannot be read.
titles = read_title_column("info_data.csv")

# Local path handed to HuggingFaceEmbeddings as the model name.
embedding_path = "D:/迅雷下载/模型权重/bge-m3"
embeddings = HuggingFaceEmbeddings(model_name=embedding_path)

# Directory passed to Chroma as its persist directory.
chroma_archived = "chroma_titles"

# Build the Chroma vector store from the titles using the embeddings above.
vectorstore_txt_chroma = Chroma.from_texts(titles, embeddings, persist_directory=chroma_archived)