"""
===================================
@Author: WenZ
@Company: BooWay
@project: booway_dm
===================================

Text helpers: stop-word stripping, file-suffix detection against a JSON
keyword list, bracketed-keyword extraction and synonym normalisation.
"""
import json
import re

# Reference only: how the spaCy pipelines and the polite-word set that
# ``stop_word_processing`` expects were loaded at one point.
# import spacy
# import zh_core_web_sm, zh_core_web_md, zh_core_web_lg, zh_core_web_trf
#
# # nlp_sm = zh_core_web_sm.load()
# # nlp_md = zh_core_web_md.load()
# # nlp_lg = zh_core_web_lg.load()
# nlp_trf = zh_core_web_trf.load()
#
# polite_words = {"你好", "您好", "请", "请问", "谢谢", "不客气", "麻烦", "打扰", "拜托", "辛苦", "劳驾"}

# Default location of the suffix keyword list. The path is relative, so it
# is resolved against the process working directory.
SUFFIX_FILE_PATH = "../data/booway_knowledge_base/keywords_kg/suffix_keywords.json"

# Suffixes that are always recognised in addition to those in the JSON file.
_EXTRA_SUFFIXES = ('gec5', 'bczc2', 'xzwb2', 'BPQ', 'BPY')

# Text enclosed in a pair of full-width brackets (non-greedy).
_BRACKETED_RE = re.compile(r'【(.*?)】')
# Everything after the first closing bracket, up to the end of that line.
_AFTER_BRACKET_RE = re.compile(r'】(.*)')
# A run of ASCII letters optionally followed by a run of digits.
_SUFFIX_TOKEN_RE = re.compile(r'[A-Za-z]+(?:[0-9]+)?')


def stop_word_processing(input_str, nlp_stytle, polite_words):
    """Remove stop words, punctuation, whitespace and polite words.

    Args:
        input_str: Text to clean.
        nlp_stytle: Callable that takes the text and returns an iterable of
            tokens exposing ``text``, ``is_stop``, ``is_punct`` and
            ``is_space`` (presumably a loaded spaCy pipeline -- confirm
            against the caller).
        polite_words: Container of token texts that are dropped as well.

    Returns:
        The texts of the surviving tokens concatenated without a separator.
    """
    doc = nlp_stytle(input_str)
    filtered_tokens = [
        token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
        and token.text not in polite_words
    ]
    return ''.join(filtered_tokens)


def extract_names_from_json(file_path):
    """Return the ``"name"`` value of every object in a JSON list file.

    Args:
        file_path: Path of a UTF-8 JSON file whose top level is a list.

    Returns:
        A new list with the ``"name"`` value of each dict entry that has
        that key, in file order. Entries that are not dicts are skipped.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if not isinstance(data, list):
        raise ValueError("JSON 文件的格式应为包含对象的列表")

    # Non-dict entries are skipped: ``"name" in item`` would be a substring
    # test on a string entry and a TypeError on a number.
    return [
        item.get("name")
        for item in data
        if isinstance(item, dict) and "name" in item
    ]


def judge_define_suffix(input_str, suffix_file_path=SUFFIX_FILE_PATH):
    """Tell whether the text contains any known file suffix.

    The known suffixes are the names listed in the suffix JSON file plus a
    few built-in ones. Matching is case-insensitive and unanchored, so a
    suffix is found even when it is embedded in a longer string.

    Args:
        input_str: Text to inspect.
        suffix_file_path: Path of the suffix keyword JSON file.

    Returns:
        True if at least one known suffix occurs in ``input_str``.

    Raises:
        ValueError: If the JSON file does not contain a list.
    """
    suffix_fields = extract_names_from_json(suffix_file_path)
    suffix_fields.extend(_EXTRA_SUFFIXES)

    # Drop null / non-string names (re.escape would raise on them) and empty
    # names (an empty alternative would match every input).
    candidates = [
        field for field in suffix_fields if isinstance(field, str) and field
    ]
    pattern = '|'.join(re.escape(field) for field in candidates)
    return re.search(pattern, input_str, re.IGNORECASE) is not None


def match_suffix(input_str):
    """Return the first letters(+digits) token found in the text.

    Args:
        input_str: Text to scan.

    Returns:
        The first run of ASCII letters, together with the digits directly
        following it (if any), or ``'未知'`` when the text has no ASCII
        letter.
    """
    found = _SUFFIX_TOKEN_RE.search(input_str)
    return found.group(0) if found else '未知'


def retrieve_relevant_software(suffix_name, suffix_file_path=SUFFIX_FILE_PATH):
    """Look up the software name recorded for a file suffix.

    Args:
        suffix_name: Suffix to look up; compared case-insensitively with
            each entry's ``name``.
        suffix_file_path: Path of the suffix keyword JSON file.

    Returns:
        ``description["software_name"]`` of the first entry whose name
        matches, or ``0`` when no entry matches or the matching entry has
        no such field.
    """
    with open(suffix_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    wanted = suffix_name.lower()
    for item in data:
        if not isinstance(item, dict):
            continue
        # A missing or null name is treated as the empty string.
        name = item.get('name') or ''
        if not isinstance(name, str) or name.lower() != wanted:
            continue
        description = item.get('description')
        if isinstance(description, dict):
            return description.get('software_name', 0)
        return 0
    return 0


def get_keywords(input_str):
    """Return the content of the second ``【...】`` group.

    Args:
        input_str: Text to scan.

    Returns:
        The text inside the second bracket pair, or None when the input
        has fewer than two bracket pairs.
    """
    matches = _BRACKETED_RE.findall(input_str)
    return matches[1] if len(matches) >= 2 else None


def get_keywords_v2(input_str):
    """Return the text following the first ``】``.

    Args:
        input_str: Text to scan.

    Returns:
        Everything after the first ``】`` up to the end of that line
        (``.`` does not match a newline), or None when there is no ``】``.
    """
    match = _AFTER_BRACKET_RE.search(input_str)
    return match.group(1) if match else None


def get_keywords_v3(input_str):
    """Return the contents of every ``【...】`` group except the first.

    Args:
        input_str: Text to scan.

    Returns:
        A list with the contents of the second and later bracket pairs
        (empty when there is exactly one pair), or None when the input has
        no bracket pair at all.
    """
    matches = _BRACKETED_RE.findall(input_str)
    return matches[1:] if matches else None


def normalize_text(text: str, synonym_dict: dict) -> str:
    """Replace every synonym occurring in the text with its canonical word.

    Args:
        text: Text to normalise.
        synonym_dict: Mapping of canonical word -> iterable of synonyms.
            When a synonym is listed under several canonical words, the
            one iterated last wins.

    Returns:
        The text after all replacements. Matching is by plain substring,
        not whole word, so synonyms embedded in longer words are replaced.
        Replacements run one synonym after another, so the output of an
        earlier replacement can be rewritten by a later, shorter synonym.
    """
    # Flatten to synonym -> canonical word.
    flat_synonyms = {}
    for main_word, synonyms in synonym_dict.items():
        for syn in synonyms:
            flat_synonyms[syn] = main_word

    # Longest synonyms first, so a short synonym cannot break up a longer
    # one that contains it.
    for syn in sorted(flat_synonyms, key=len, reverse=True):
        if not syn:
            # An empty synonym would insert the canonical word between
            # every pair of characters.
            continue
        # Literal replacement: re.sub would treat backslashes in the
        # canonical word as template escapes.
        text = text.replace(syn, flat_synonyms[syn])
    return text