Files
DM_rewrite_3.31/booway_kg_api/chorma_embedding.py
T
2025-03-31 15:17:47 +08:00

38 lines
1.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
===================================
@Author: WenZ
@Company: BooWay
@Project: dify_lab
===================================
"""
import pandas as pd
def read_title_column(csv_file: str) -> list:
    """Read the ``title`` column of a CSV file and return it as a list.

    The file is decoded as UTF-8 and missing values in the column are
    dropped. This is a best-effort helper: any failure is reported on
    stdout and an empty list is returned instead of raising.

    :param csv_file: Path of the CSV file to read.
    :return: The non-missing values of the ``title`` column, or ``[]`` if
        the file cannot be read or has no ``title`` column.
    """
    error_prefix = "读取文件时发生错误: "

    try:
        df = pd.read_csv(csv_file, encoding='utf-8')
    except Exception as e:  # deliberate best-effort: report and fall back to []
        print(f"{error_prefix}{e}")
        return []

    if 'title' not in df.columns:
        # Report the problem directly instead of raising an exception that
        # would only be caught again inside this same function.
        print(f"{error_prefix}CSV文件中未找到'title'列")
        return []

    # dropna() removes missing entries before converting to a plain list.
    return df['title'].dropna().tolist()
# Third-party pieces used by the embedding / vector-store step below.
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Titles are taken from the 'title' column of info_data.csv (relative path).
titles = read_title_column("info_data.csv")

# Machine-specific local directory; presumably holds the bge-m3 model weights.
embedding_path = "D:/迅雷下载/模型权重/bge-m3"
embeddings = HuggingFaceEmbeddings(model_name=embedding_path)

# Directory handed to Chroma as its persist_directory.
chroma_archived = "chroma_titles"
vectorstore_txt_chroma = Chroma.from_texts(
    texts=titles,
    embedding=embeddings,
    persist_directory=chroma_archived,
)