diff --git a/utils.py b/utils.py index 6596c79..be2b72a 100644 --- a/utils.py +++ b/utils.py @@ -1,143 +1,156 @@ -""" -=================================== -@Auther:WenZ -@Company: BooWay -@project:booway_dm -=================================== -""" - -# import spacy -# import zh_core_web_sm, zh_core_web_md, zh_core_web_lg, zh_core_web_trf -# -# # nlp_sm = zh_core_web_sm.load() -# # nlp_md = zh_core_web_md.load() -# # nlp_lg = zh_core_web_lg.load() -# nlp_trf = zh_core_web_trf.load() -# -# polite_words = {"你好", "您好", "请", "请问", "谢谢", "不客气", "麻烦", "打扰", "拜托", "辛苦", "劳驾"} - -# 停用词清理 -def stop_word_processing(input_str, nlp_stytle, polite_words): - doc = nlp_stytle(input_str) - - # 去除停用词 - filtered_tokens = [ - token.text for token in doc - if not token.is_stop and not token.is_punct and not token.is_space and token.text not in polite_words] - - return ''.join(filtered_tokens) - - -# 后缀名检测 -def extract_names_from_json(file_path): - import json - - with open(file_path, 'r', encoding='utf-8') as file: - data = json.load(file) - - # 确保数据是一个列表 - if isinstance(data, list): - names = [item.get("name") for item in data if "name" in item] - return names - else: - raise ValueError("JSON 文件的格式应为包含对象的列表") - - -def judge_define_suffix(input_str): - suffix_file_path = "../data/booway_knowledge_base/keywords_kg/suffix_keywords.json" - suffix_fields = extract_names_from_json(suffix_file_path) - suffix_fields.extend(['gec5', 'bczc2', 'xzwb2', 'BPQ', 'BPY']) - - import re - - # 构建正则表达式模式,匹配大小写不敏感且前面可能带有. - # 去掉 \b 以允许字段是其他字符串的一部分 - pattern = r'(?:\.?)(' + '|'.join(re.escape(field) for field in suffix_fields) + r')' - - # 使用 re.IGNORECASE 标志来忽略大小写 - if re.search(pattern, input_str, re.IGNORECASE): - return True - else: - return False - -def match_suffix(input_str): - import re - - # 修改正则表达式,匹配连续字母组合或字母+数字组合 - segments = re.findall(r'[A-Za-z]+(?:[0-9]+)?', input_str) - - # 过滤条件:必须是包含字母的组合,且可以包含字母或字母+数字 - matches = [seg for seg in segments if any(c.isalpha() for c in seg)] - - return matches[0] if matches else '未知' - -def retrieve_relevant_software(suffix_name): - import json - suffix_file_path = "../data/booway_knowledge_base/keywords_kg/suffix_keywords.json" - - with open(suffix_file_path, 'r', encoding='utf-8') as file: - data = json.load(file) - - suffix_name_lower = suffix_name.lower() - - for item in data: - item_name = item.get('name', '').lower() - if item_name == suffix_name_lower: - return item.get('description', {}).get('software_name', 0) - - return 0 - - - -def get_keywords(input_str): - import re - matches = re.findall(r'【(.*?)】', input_str) - - # 获取第二个【】里的内容(索引为1) - if len(matches) >= 2: - second = matches[1] - return second - else: - return None - -def get_keywords_v2(input_str): - import re - match = re.search(r'】(.*)', input_str) - - if match: - output = match.group(1) - return output - else: - return None - -def get_keywords_v3(input_str): - import re - match = re.findall(r'【(.*?)】', input_str) - - if match: - output = match[1:] - return output - else: - return None - - -def normalize_text(text: str, synonym_dict: dict) -> str: - import re - # 构建同义词到主词的映射表(扁平化) - flat_synonyms = {} - for main_word, synonyms in synonym_dict.items(): - for syn in synonyms: - flat_synonyms[syn] = main_word - - # 按长度从大到小排序,避免短词覆盖长词(如“下载”和“下载下来”) - sorted_synonyms = sorted(flat_synonyms.keys(), key=len, reverse=True) - - # 逐个替换 - for syn in sorted_synonyms: - # 用正则确保是整词匹配,但保留灵活性(可处理“...费费率...”这种词连着的情况) - text = re.sub(re.escape(syn), flat_synonyms[syn], text) - - return text - - - +""" +=================================== +@Auther:WenZ +@Company: BooWay +@project:booway_dm +=================================== +""" + +# import spacy +# import zh_core_web_sm, zh_core_web_md, zh_core_web_lg, zh_core_web_trf +# +# # nlp_sm = zh_core_web_sm.load() +# # nlp_md = zh_core_web_md.load() +# # nlp_lg = zh_core_web_lg.load() +# nlp_trf = zh_core_web_trf.load() +# +# polite_words = {"你好", "您好", "请", "请问", "谢谢", "不客气", "麻烦", "打扰", "拜托", "辛苦", "劳驾"} + +# 停用词清理 +def stop_word_processing(input_str, nlp_stytle, polite_words): + doc = nlp_stytle(input_str) + + # 去除停用词 + filtered_tokens = [ + token.text for token in doc + if not token.is_stop and not token.is_punct and not token.is_space and token.text not in polite_words] + + return ''.join(filtered_tokens) + + +# 后缀名检测 +def extract_names_from_json(file_path): + import json + + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + # 确保数据是一个列表 + if isinstance(data, list): + names = [item.get("name") for item in data if "name" in item] + return names + else: + raise ValueError("JSON 文件的格式应为包含对象的列表") + + +def judge_define_suffix(input_str): + suffix_file_path = "../data/booway_knowledge_base/keywords_kg/suffix_keywords.json" + suffix_fields = extract_names_from_json(suffix_file_path) + suffix_fields.extend(['gec5', 'bczc2', 'xzwb2', 'BPQ', 'BPY']) + + import re + + # 构建正则表达式模式,匹配大小写不敏感且前面可能带有. + # 去掉 \b 以允许字段是其他字符串的一部分 + pattern = r'(?:\.?)(' + '|'.join(re.escape(field) for field in suffix_fields) + r')' + + # 使用 re.IGNORECASE 标志来忽略大小写 + if re.search(pattern, input_str, re.IGNORECASE): + return True + else: + return False + +def match_suffix(input_str): + import re + + # 修改正则表达式,匹配连续字母组合或字母+数字组合 + segments = re.findall(r'[A-Za-z]+(?:[0-9]+)?', input_str) + + # 过滤条件:必须是包含字母的组合,且可以包含字母或字母+数字 + matches = [seg for seg in segments if any(c.isalpha() for c in seg)] + + return matches[0] if matches else '未知' + +def retrieve_relevant_software(suffix_name): + import json + suffix_file_path = "../data/booway_knowledge_base/keywords_kg/suffix_keywords.json" + + with open(suffix_file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + + suffix_name_lower = suffix_name.lower() + + for item in data: + item_name = item.get('name', '').lower() + if item_name == suffix_name_lower: + return item.get('description', {}).get('software_name', 0) + + return 0 + + + +def get_keywords(input_str): + import re + matches = re.findall(r'【(.*?)】', input_str) + + # 获取第二个【】里的内容(索引为1) + if len(matches) >= 2: + second = matches[1] + return second + else: + return None + +def get_keywords_v2(input_str): + import re + match = re.search(r'】(.*)', input_str) + + if match: + output = match.group(1) + return output + else: + return None + +def get_keywords_v3(input_str): + import re + match = re.findall(r'【(.*?)】', input_str) + + if match: + output = match[1:] + return output + else: + return None + +def get_keywords_v4(input_str): + import re + matches = re.findall(r'【(.*?)】', input_str) + + # 获取第一个和第二个【】里的内容(索引为0和1) + first = matches[0] if len(matches) >= 1 else None + second = matches[1] if len(matches) >= 2 else None + + return first, second + + + +def normalize_text(text: str, synonym_dict: dict) -> str: + import re + # 构建同义词到主词的映射表(扁平化) + flat_synonyms = {} + for main_word, synonyms in synonym_dict.items(): + for syn in synonyms: + flat_synonyms[syn] = main_word + + # 按长度从大到小排序,避免短词覆盖长词(如“下载”和“下载下来”) + sorted_synonyms = sorted(flat_synonyms.keys(), key=len, reverse=True) + + # 逐个替换 + for syn in sorted_synonyms: + # 用正则确保是整词匹配,但保留灵活性(可处理“...费费率...”这种词连着的情况) + text = re.sub(re.escape(syn), flat_synonyms[syn], text) + + return text + + + + + diff --git a/vector_load.py b/vector_load.py index 2956e8e..28b9809 100644 --- a/vector_load.py +++ b/vector_load.py @@ -1,9 +1,44 @@ import os from langchain_community.vectorstores import FAISS -from langchain_huggingface import HuggingFaceEmbeddings +# from langchain_huggingface import HuggingFaceEmbeddings -embedding_path = "/data/Z_LLM_data/Embed_data/bge-m3" -embeddings = HuggingFaceEmbeddings(model_name=embedding_path) +# embedding_path = "/data/Z/Z_llm_dm/vector_data/bge-m3" +# embeddings = HuggingFaceEmbeddings(model_name=embedding_path) + + +from typing import List +import requests +from langchain.embeddings.base import Embeddings + + +class SiliconFlowEmbeddings(Embeddings): + def __init__(self, api_key: str, model: str = "bge-m3"): + self.api_key = api_key + self.model = model + self.url = "http://10.1.16.39:9995/v1/embeddings" + self.headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + def _embed(self, input: List[str]) -> List[List[float]]: + payload = { + "model": self.model, + "input": input, + "encoding_format": "float" + } + response = requests.post(self.url, json=payload, headers=self.headers) + response.raise_for_status() + data = response.json() + return [item["embedding"] for item in data["data"]] + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + return self._embed(texts) + + def embed_query(self, text: str) -> List[float]: + return self._embed([text])[0] + +embeddings = SiliconFlowEmbeddings(api_key="sk-ftnofbucchwnscojohyxwmfzgaykdxihafnlphohsinftkbr") def Mixed_retrieval(input_path): file_name = os.path.splitext(os.path.basename(input_path))[0] @@ -33,6 +68,8 @@ def Mixed_retrieval(input_path): return retriever_txt_faiss1, retriever_txt_faiss2, retriever_txt_faiss3 + + def interface_search(input_str, retriever_txt_faiss1, retriever_txt_faiss2, retriever_txt_faiss3): index_keyword1 = [] for i in retriever_txt_faiss1.invoke(input_str):