Files
DM_rewrite_3.31/utils.py
T
2025-04-03 17:24:18 +08:00

157 lines
4.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
===================================
@Author: WenZ
@Company: BooWay
@Project: booway_dm
===================================
"""
# import spacy
# import zh_core_web_sm, zh_core_web_md, zh_core_web_lg, zh_core_web_trf
#
# # nlp_sm = zh_core_web_sm.load()
# # nlp_md = zh_core_web_md.load()
# # nlp_lg = zh_core_web_lg.load()
# nlp_trf = zh_core_web_trf.load()
#
# polite_words = {"你好", "您好", "请", "请问", "谢谢", "不客气", "麻烦", "打扰", "拜托", "辛苦", "劳驾"}
# 停用词清理
def stop_word_processing(input_str, nlp_stytle, polite_words):
    """Strip stop words, punctuation, whitespace and polite words from text.

    Args:
        input_str: The raw text to clean.
        nlp_stytle: A callable that tokenizes ``input_str`` and returns an
            iterable of tokens exposing ``text``, ``is_stop``, ``is_punct``
            and ``is_space`` (presumably a loaded spaCy pipeline -- confirm
            against the caller).
        polite_words: A container of token texts that must also be dropped.

    Returns:
        The texts of the surviving tokens concatenated without a separator.
    """
    kept = []
    for tok in nlp_stytle(input_str):
        # Drop anything the tokenizer flags as noise.
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        # Drop courtesy phrases supplied by the caller.
        if tok.text in polite_words:
            continue
        kept.append(tok.text)
    return ''.join(kept)
# 后缀名检测
def extract_names_from_json(file_path):
    """Collect the ``name`` value of every entry in a JSON list file.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.

    Returns:
        A list with the ``name`` value of each entry that has a ``name`` key,
        in file order.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    import json

    with open(file_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # The file must hold a list at the top level.
    if not isinstance(payload, list):
        raise ValueError("JSON 文件的格式应为包含对象的列表")

    collected = []
    for entry in payload:
        if "name" in entry:
            collected.append(entry.get("name"))
    return collected
def judge_define_suffix(
    input_str,
    suffix_file_path="../data/booway_knowledge_base/keywords_kg/suffix_keywords.json",
):
    """Tell whether the text mentions any known file suffix.

    The known suffixes are the ``name`` values loaded from the suffix keyword
    JSON file plus a fixed list of extra suffixes. Matching is case-insensitive
    and a suffix may occur anywhere in the text, including inside a longer
    word.

    Args:
        input_str: The text to inspect.
        suffix_file_path: Path of the suffix keyword JSON file. Defaults to
            the location that used to be hard-coded.

    Returns:
        True if at least one known suffix occurs in ``input_str``, else False.

    Raises:
        ValueError: Propagated from ``extract_names_from_json`` when the JSON
            file does not contain a list.
    """
    import re

    suffix_fields = list(extract_names_from_json(suffix_file_path))
    suffix_fields.extend(['gec5', 'bczc2', 'xzwb2', 'BPQ', 'BPY'])

    # Skip null/non-string names (re.escape would raise on them) and empty
    # names (an empty alternative would match every input).
    alternatives = [
        re.escape(field)
        for field in suffix_fields
        if isinstance(field, str) and field
    ]

    # No word boundaries on purpose: a suffix may be glued to other text.
    # An optional leading "." never changes whether a search succeeds, so the
    # pattern is just the alternation of the escaped suffixes.
    pattern = '|'.join(alternatives)
    return re.search(pattern, input_str, re.IGNORECASE) is not None
def match_suffix(input_str):
    """Return the first suffix-like token found in the text.

    A token is a run of ASCII letters optionally followed by a run of digits
    (for example ``gec5`` or ``BPQ``).

    Args:
        input_str: The text to scan.

    Returns:
        The first matching token, or ``'未知'`` when the text has none.
    """
    import re

    # Every match starts with a letter, so no extra "contains a letter"
    # filtering is needed, and only the first match is ever used.
    first = re.search(r'[A-Za-z]+(?:[0-9]+)?', input_str)
    return first.group(0) if first else '未知'
def retrieve_relevant_software(
    suffix_name,
    suffix_file_path="../data/booway_knowledge_base/keywords_kg/suffix_keywords.json",
):
    """Look up the software name recorded for a file suffix.

    The suffix keyword JSON file is scanned in order; the first entry whose
    ``name`` equals ``suffix_name`` (case-insensitively) decides the result.

    Args:
        suffix_name: The suffix to look up.
        suffix_file_path: Path of the suffix keyword JSON file. Defaults to
            the location that used to be hard-coded.

    Returns:
        The ``software_name`` stored in the matching entry's ``description``
        object, or ``0`` when there is no matching entry or the entry carries
        no usable description.
    """
    import json

    with open(suffix_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    wanted = suffix_name.lower()
    for item in data:
        # Tolerate malformed entries instead of crashing on them.
        if not isinstance(item, dict):
            continue
        item_name = item.get('name')
        if not isinstance(item_name, str) or item_name.lower() != wanted:
            continue
        description = item.get('description')
        if isinstance(description, dict):
            return description.get('software_name', 0)
        # First match wins even when it has no structured description.
        return 0
    return 0
def get_keywords(input_str):
    """Return the content of the second 【...】 group in the text.

    Args:
        input_str: The text to scan.

    Returns:
        The text inside the second pair of 【】 brackets (index 1), or None
        when the text has fewer than two bracketed groups.
    """
    import re

    bracketed = re.findall(r'【(.*?)】', input_str)
    return bracketed[1] if len(bracketed) > 1 else None
def get_keywords_v2(input_str):
    """Return the text that follows the first 】 in the input.

    Args:
        input_str: The text to scan.

    Returns:
        Everything after the first 】 up to the end of that line, or None
        when the text contains no 】.
    """
    import re

    hit = re.search(r'】(.*)', input_str)
    if hit is None:
        return None
    return hit.group(1)
def get_keywords_v3(input_str):
    """Return the contents of every 【...】 group except the first.

    Args:
        input_str: The text to scan.

    Returns:
        A list with the contents of the second and later bracketed groups
        (empty when there is exactly one group), or None when the text has no
        bracketed group at all.
    """
    import re

    bracketed = re.findall(r'【(.*?)】', input_str)
    if not bracketed:
        return None
    # Skip the leading group; callers only want what comes after it.
    return bracketed[1:]
def get_keywords_v4(input_str):
    """Return the contents of the first and second 【...】 groups.

    Args:
        input_str: The text to scan.

    Returns:
        A ``(first, second)`` tuple holding the contents of the groups at
        index 0 and 1; a missing group is reported as None.
    """
    import re

    bracketed = re.findall(r'【(.*?)】', input_str)
    # Pad with None so unpacking works for zero or one bracketed group.
    first, second, *_ = bracketed + [None, None]
    return first, second
def normalize_text(text: str, synonym_dict: dict) -> str:
    """Replace every synonym occurring in the text with its main word.

    Args:
        text: The text to normalize.
        synonym_dict: Mapping of main word -> iterable of synonyms. When the
            same synonym is listed under several main words, the last one
            wins.

    Returns:
        The text with all synonym occurrences replaced. Matching is plain
        substring matching (no word boundaries), so a synonym is also
        replaced when it is glued to neighbouring characters.

    Note:
        Replacements are applied one synonym at a time, longest first, so the
        result of an earlier replacement can itself be rewritten by a later,
        shorter synonym.
    """
    # Flatten into a synonym -> main word lookup table.
    flat_synonyms = {}
    for main_word, synonyms in synonym_dict.items():
        for syn in synonyms:
            flat_synonyms[syn] = main_word

    # Longest first so a short synonym cannot break up a longer one
    # (e.g. "下载" vs "下载下来").
    for syn in sorted(flat_synonyms, key=len, reverse=True):
        # An empty synonym would insert the main word between every
        # character of the text.
        if not syn:
            continue
        # Literal replacement: the main word must not be interpreted as a
        # regex replacement template (backslashes, group references).
        text = text.replace(syn, flat_synonyms[syn])
    return text