更新词库,删除冗余同义词条目,优化意图识别逻辑,增强代码可读性和维护性,同时添加去重功能以处理同义词。更新相关文档以反映最新变化。
This commit is contained in:
@@ -71,16 +71,16 @@ class TermMerger:
|
||||
logging.warning(f"读取{file}失败: {e}")
|
||||
|
||||
# 加载suffix_keywords.json文件
|
||||
suffix_keywords_path = os.path.join(os.path.dirname(os.path.dirname(self.EXTRACTED_NOUNS_DIR)), 'data', 'nouns', 'suffix_keywords.json')
|
||||
if os.path.exists(suffix_keywords_path):
|
||||
try:
|
||||
with open(suffix_keywords_path, 'r', encoding='utf-8') as f:
|
||||
suffix_terms = json.load(f)
|
||||
suffix_terms = [{"name": term["name"].upper(), "synonymous": "", "description": ""} for term in suffix_terms]
|
||||
all_terms.extend(suffix_terms)
|
||||
logging.info(f"加载{suffix_keywords_path},共{len(suffix_terms)}条")
|
||||
except Exception as e:
|
||||
logging.warning(f"读取{suffix_keywords_path}失败: {e}")
|
||||
# suffix_keywords_path = os.path.join(os.path.dirname(os.path.dirname(self.EXTRACTED_NOUNS_DIR)), 'data', 'nouns', 'suffix_keywords.json')
|
||||
# if os.path.exists(suffix_keywords_path):
|
||||
# try:
|
||||
# with open(suffix_keywords_path, 'r', encoding='utf-8') as f:
|
||||
# suffix_terms = json.load(f)
|
||||
# suffix_terms = [{"name": term["name"].upper(), "synonymous": "", "description": ""} for term in suffix_terms]
|
||||
# all_terms.extend(suffix_terms)
|
||||
# logging.info(f"加载{suffix_keywords_path},共{len(suffix_terms)}条")
|
||||
# except Exception as e:
|
||||
# logging.warning(f"读取{suffix_keywords_path}失败: {e}")
|
||||
|
||||
return all_terms
|
||||
|
||||
@@ -154,6 +154,9 @@ class TermMerger:
|
||||
for result in tqdm(executor.map(self.process_term, items_to_process), total=len(items_to_process)):
|
||||
merged_terms.append(result)
|
||||
|
||||
# 4. 去重
|
||||
merged_terms = self.deduplicate_synonymous_name(merged_terms)
|
||||
|
||||
# 5. 保存合并结果
|
||||
os.makedirs(os.path.dirname(self.OUTPUT_PATH), exist_ok=True)
|
||||
with open(self.OUTPUT_PATH, 'w', encoding='utf-8') as f:
|
||||
@@ -162,6 +165,26 @@ class TermMerger:
|
||||
|
||||
return merged_terms
|
||||
|
||||
def deduplicate_synonymous_name(self, terms):
    """Remove duplicate term entries and synonyms that collide with entry names.

    Two passes are made over ``terms``:

    1. Entries are de-duplicated by their ``"name"`` value; the first entry
       seen for a name is kept and later ones are dropped. Input order is
       preserved.
    2. For every kept entry, any synonym equal to the ``"name"`` of a kept
       entry (including the entry's own name) is removed from its
       ``"synonymous"`` field.

    Args:
        terms: Iterable of dicts. Each must have a ``"name"`` key. The
            ``"synonymous"`` value may be a list of synonyms, a single
            string, ``None``, or absent.

    Returns:
        A new list holding the kept entry dicts. The dicts are the same
        objects as in ``terms`` and their ``"synonymous"`` field is
        rewritten in place, always as a list.

    Raises:
        KeyError: If an entry has no ``"name"`` key.
    """
    # Pass 1: keep only the first entry for each name.
    seen_names = set()
    unique_terms = []
    for term in terms:
        name = term["name"]
        if name in seen_names:
            continue
        seen_names.add(name)
        unique_terms.append(term)

    # Pass 2: strip synonyms that are themselves the name of a kept entry.
    # `seen_names` now holds exactly the names of the kept entries.
    for term in unique_terms:
        synonyms = term.get("synonymous") or []
        if isinstance(synonyms, str):
            # A bare string is one synonym; iterating it directly would
            # split it into single characters.
            synonyms = [synonyms]
        term["synonymous"] = [syn for syn in synonyms if syn not in seen_names]

    return unique_terms
|
||||
|
||||
def main():
|
||||
"""主函数,创建TermMerger实例并执行合并"""
|
||||
|
||||
Reference in New Issue
Block a user