193 lines
6.8 KiB
Python
193 lines
6.8 KiB
Python
import os
|
|
import json
|
|
import re
|
|
from dotenv import load_dotenv
|
|
from utils.llm import llm
|
|
from utils.prompt import PROMPTS
|
|
|
|
load_dotenv()
|
|
|
|
|
|
class EntityRelationExtractor:
    """Extract entities, relationships and keywords from text with an LLM.

    The prompt template and the default type lists are read from ``PROMPTS``;
    the model is the module-level ``llm`` object, on which ``generate(prompt)``
    is called. The model output is parsed as delimiter-separated records, one
    per line.
    """

    def __init__(self):
        # Model imported from utils.llm.
        self.llm = llm
        # Delimiters substituted into the prompt and used again when parsing.
        self.tuple_delimiter = "|||"
        self.record_delimiter = "\n"
        self.completion_delimiter = ""

    def extract_from_text(self, text, entity_types=None):
        """Extract entities and relationships from a single text.

        Args:
            text: The text to analyse.
            entity_types: Optional iterable of entity type names. When omitted
                or empty, ``PROMPTS["DEFAULT_ENTITY_TYPES"]`` is used.

        Returns:
            A dict with ``entities``, ``relations`` and ``keywords`` lists, or
            ``{"error": message}`` if any step raised.
        """
        try:
            # Honour the caller's types; previously they were always
            # overwritten by the defaults.
            if not entity_types:
                entity_types = PROMPTS["DEFAULT_ENTITY_TYPES"]
            relationship_types = PROMPTS["DEFAULT_RELATIONSHIP_TYPES"]

            # Build the system prompt by filling in the delimiter placeholders.
            prompt = PROMPTS["entity_extraction"]
            substitutions = (
                ("{tuple_delimiter}", self.tuple_delimiter),
                ("{record_delimiter}", self.record_delimiter),
                ("{completion_delimiter}", self.completion_delimiter),
            )
            for placeholder, value in substitutions:
                prompt = prompt.replace(placeholder, value)

            # Append the type lists and the text itself.
            entity_types_str = ", ".join(entity_types)
            relationship_types_str = ", ".join(relationship_types)
            user_message = (
                f"实体类型列表: {entity_types_str}\n\n关系类型列表: {relationship_types_str}\n\n文本内容:\n{text}"
            )

            # Call the LLM.
            full_prompt = f"System: {prompt}\n\nUser: {user_message}"
            response = self.llm.generate(full_prompt)

            return self._parse_extraction_result(response)
        except Exception as e:
            # Public contract: report failures as an error dict, never raise.
            return {"error": str(e)}

    @staticmethod
    def _clean_field(field):
        """Strip surrounding whitespace, then surrounding double quotes."""
        return field.strip().strip('"')

    def _parse_extraction_result(self, result):
        """Parse the model output into entities, relations and keywords.

        Each non-empty line is one record, optionally wrapped in parentheses,
        with fields separated by ``self.tuple_delimiter``. Recognised records:

        * ``entity``: name, type, description
        * ``relationship``: source, target, description, type and an optional
          confidence (defaults to 1.0 when missing or not a number)
        * ``content_keywords``: a comma-separated keyword list

        Unrecognised or too-short records are ignored.

        Args:
            result: Raw model output; ``None`` or an empty string yields an
                empty result.

        Returns:
            ``{"entities": [...], "relations": [...], "keywords": [...]}``.
        """
        entities = []
        relations = []
        keywords = []

        if not result:
            return {"entities": entities, "relations": relations, "keywords": keywords}

        for raw_line in result.strip().split(self.record_delimiter):
            line = raw_line.strip()
            if not line:
                continue

            # Drop the optional surrounding parentheses.
            if line.startswith("(") and line.endswith(")"):
                line = line[1:-1]

            parts = [self._clean_field(part) for part in line.split(self.tuple_delimiter)]

            # A keyword record has only two fields, so the minimum is 2;
            # entity/relationship records check their own lengths below.
            if len(parts) < 2:
                continue

            record_type = parts[0]

            if record_type == "entity":
                if len(parts) >= 4:
                    entities.append(
                        {
                            "name": parts[1],
                            "type": parts[2],
                            "description": parts[3],
                        }
                    )
            elif record_type == "relationship":
                if len(parts) >= 5:
                    confidence = 1.0
                    if len(parts) > 5:
                        try:
                            confidence = float(parts[5])
                        except ValueError:
                            # One malformed score must not discard the
                            # whole extraction.
                            confidence = 1.0
                    relations.append(
                        {
                            "source": parts[1],
                            "target": parts[2],
                            "description": parts[3],
                            "type": parts[4],
                            "confidence": confidence,
                        }
                    )
            elif record_type == "content_keywords":
                stripped = (kw.strip() for kw in parts[1].split(","))
                keywords = [kw for kw in stripped if kw]

        return {"entities": entities, "relations": relations, "keywords": keywords}

    def extract_from_file(self, file_path, entity_types=None):
        """Read a UTF-8 text file and extract entities and relationships.

        Args:
            file_path: Path of the file to read.
            entity_types: Forwarded to ``extract_from_text``.

        Returns:
            The ``extract_from_text`` result, or ``{"error": message}`` if the
            file could not be read.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
        except Exception as e:
            return {"error": f"读取文件时出错: {str(e)}"}
        # The file is closed before the model call.
        return self.extract_from_text(text, entity_types)

    def extract_from_documents(self, documents, entity_types=None):
        """Extract from several texts and merge the results.

        Documents whose extraction returns an error are skipped (best effort).
        Entities are de-duplicated by name, keeping the first occurrence;
        relations are concatenated; keywords are de-duplicated in first-seen
        order.

        Args:
            documents: Iterable of text strings; ``None`` is treated as empty.
            entity_types: Forwarded to ``extract_from_text``.

        Returns:
            ``{"entities": [...], "relations": [...], "keywords": [...]}``.
        """
        all_entities = {}
        all_relations = []
        # A dict is used as an ordered set so the output order is stable.
        all_keywords = {}

        for doc in documents or []:
            result = self.extract_from_text(doc, entity_types)
            if "error" in result:
                continue

            # Merge entities, keeping the first one seen for each name.
            for entity in result.get("entities", []):
                all_entities.setdefault(entity["name"], entity)

            all_relations.extend(result.get("relations", []))
            all_keywords.update(dict.fromkeys(result.get("keywords", [])))

        return {
            "entities": list(all_entities.values()),
            "relations": all_relations,
            "keywords": list(all_keywords),
        }
|
|
|
|
|
|
# Demo / manual test: run the extractor on a sample help article.
if __name__ == "__main__":
    extractor = EntityRelationExtractor()

    # Sample text (a Markdown help page). Kept at column 0 so the model
    # receives it without added indentation.
    test_text = """

(电力建设计价通软件) (计价通)导入或清除电子徽标

# (电力建设计价通软件) (计价通)导入或清除电子徽标

## 使用场景
1.单位对打印的报表有要求,需要插入显示企业电子徽标。

2.报表中有电子徽标,现在想要清除。
## 功能入口
【报表输出】界面——“导入电子徽标”按钮。


## 操作步骤

### 导入电子徽标
1.左侧勾选需要显示徽标的报表,点击“选择徽标”,选中需要导入的电子徽标(可识别bmp,gif,jpg格式文件),点击“打开”即可导入徽标;


### **清除电子徽标**
左侧勾选需要清除徽标的报表,点击“清除徽标”按钮,点击“确定”,即可清除徽标。



"""

    # Extract entities and relationships.
    print("正在提取实体和关系...")
    result = extractor.extract_from_text(test_text)

    # Print the outcome.
    print("\n提取结果:")
    if "error" in result:
        print(f"错误: {result['error']}")
    else:
        print(f"发现 {len(result.get('entities', []))} 个实体和 {len(result.get('relations', []))} 个关系")

        # Entities.
        print("\n实体:")
        for entity in result.get("entities", []):
            print(f"- {entity['name']} (类型: {entity['type']})")
            print(f" 描述: {entity['description']}")

        # Relationships.
        print("\n关系:")
        for relation in result.get("relations", []):
            print(f"- {relation['source']} --[{relation['type']}]--> {relation['target']}")
            print(f" 描述: {relation['description']}")
            print(f" 置信度: {relation['confidence']}")

        # Keywords, only when any were returned.
        if "keywords" in result and result["keywords"]:
            print("\n关键词:")
            print(", ".join(result["keywords"]))
|