"""Manual smoke tests for knowledge ingestion, FAISS similarity search and reranking.

Run this file directly; the ``__main__`` guard at the bottom selects which
scenario is executed.
"""
import sys
import time

# Make the project root importable when the script is run from its own folder.
sys.path.append('../')

from src.loader.load import loads_path
from src.pgdb.knowledge.similarity import VectorStore_FAISS
from src.config.consts import (
    VEC_DB_DBNAME,
    VEC_DB_HOST,
    VEC_DB_PASSWORD,
    VEC_DB_PORT,
    VEC_DB_USER,
    EMBEEDING_MODEL_PATH,
    FAISS_STORE_PATH,
    SIMILARITY_SHOW_NUMBER,
    KNOWLEDGE_PATH,
    INDEX_NAME
)
from src.loader.callback import BaseCallback
from src.server.rerank import BgeRerank

# Sentence size handed to the loader.
_SENTENCE_SIZE = 512
# Two neighbouring chunks are merged only while their combined length stays
# strictly below three quarters of the sentence size (384.0).
_MERGE_LENGTH_LIMIT = _SENTENCE_SIZE / 4 * 3
# Number of documents passed to the vector store per `_add_documents` call.
_ADD_BATCH_SIZE = 300


class LocalCallBack(BaseCallback):
    """Loader callback that flags sparse chunks and "思考题" sections.

    NOTE(review): the original comment says chunks containing "思考题" are
    ignored by default, so a ``True`` result presumably tells the loader to
    drop the chunk -- confirm against ``BaseCallback``.
    """

    def filter(self, title: str, content: str) -> bool:
        """Return True for empty text, short-lined text, or a "思考题" title.

        Args:
            title: Heading text of the chunk.
            content: Body text of the chunk.

        Returns:
            True when title and content are both empty, when the average
            number of characters per line is below 20, or when the title
            contains "思考题"; False otherwise.
        """
        text_length = len(title + content)
        if text_length == 0:
            return True
        # Non-empty text always yields at least one line, so this is never 0.
        line_count = len(title.splitlines()) + len(content.splitlines())
        return text_length / line_count < 20 or "思考题" in title


def _build_vecstore(reset: bool) -> VectorStore_FAISS:
    """Create a VectorStore_FAISS from the project constants.

    Args:
        reset: Forwarded unchanged as the store's ``reset`` argument.
    """
    return VectorStore_FAISS(
        embedding_model_name=EMBEEDING_MODEL_PATH,
        store_path=FAISS_STORE_PATH,
        index_name=INDEX_NAME,
        info={
            "port": VEC_DB_PORT,
            "host": VEC_DB_HOST,
            "dbname": VEC_DB_DBNAME,
            "username": VEC_DB_USER,
            "password": VEC_DB_PASSWORD,
        },
        show_number=SIMILARITY_SHOW_NUMBER,
        reset=reset,
    )


def _has_layout_metadata(doc) -> bool:
    """Return True when *doc* carries both "font-size" and "page_number"."""
    return "font-size" in doc.metadata and "page_number" in doc.metadata


def _merge_adjacent_docs(docs, max_merged_length):
    """Merge consecutive documents sharing font size and page number.

    The first document always seeds the running chunk. Every later document
    without "font-size" or "page_number" metadata is skipped. A document is
    appended to the running chunk (mutating its ``page_content`` in place)
    when both have the same font size and page number and their combined
    length is strictly below *max_merged_length*; otherwise the running
    chunk is emitted and the document starts a new one.

    Args:
        docs: Iterable of objects exposing ``metadata`` (mapping) and
            ``page_content`` (str).
        max_merged_length: Exclusive upper bound for a merged chunk's length.

    Returns:
        List of merged documents in their original order.
    """
    merged = []
    current = None
    for doc in docs:
        if current is None:
            current = doc
            continue
        if not _has_layout_metadata(doc):
            continue
        # The seed document is never validated, so check its keys here
        # instead of indexing them blindly (which used to raise KeyError).
        mergeable = (
            _has_layout_metadata(current)
            and doc.metadata["font-size"] == current.metadata["font-size"]
            and doc.metadata["page_number"] == current.metadata["page_number"]
            and len(doc.page_content) + len(current.page_content) < max_merged_length
        )
        if mergeable:
            current.page_content += doc.page_content
        else:
            merged.append(current)
            current = doc
    if current is not None:
        merged.append(current)
    return merged


def test_faiss_from_dir():
    """Test ingesting the knowledge files into the stores (pgsql and faiss).

    Loads the documents under KNOWLEDGE_PATH, merges small neighbouring
    chunks, adds them to the vector store in batches and then calls
    ``_save_local()``. Document counts and index totals are printed along
    the way.
    """
    vecstore_faiss = _build_vecstore(reset=True)
    docs = loads_path(
        KNOWLEDGE_PATH,
        mode="paged",
        sentence_size=_SENTENCE_SIZE,
        callbacks=[LocalCallBack()],
    )
    print(len(docs))

    docs = _merge_adjacent_docs(docs, _MERGE_LENGTH_LIMIT)
    print(len(docs))

    print(vecstore_faiss._faiss.index.ntotal)
    for start in range(0, len(docs), _ADD_BATCH_SIZE):
        # Slicing clamps at the end of the list, so no manual bound is needed.
        vecstore_faiss._add_documents(docs[start:start + _ADD_BATCH_SIZE], need_split=True)
        print(vecstore_faiss._faiss.index.ntotal)
    vecstore_faiss._save_local()


def test_faiss_load():
    """Test the query results of the FAISS vector database.

    Opens the store with ``reset=False``, runs one similarity query and
    prints the joined result.
    """
    vecstore_faiss = _build_vecstore(reset=False)
    hits = vecstore_faiss.get_text_similarity("我国什么时候全面开放低空领域")
    print(vecstore_faiss.join_document(hits))


def test_faiss_search():
    """Run one similarity query and print the result and the elapsed time."""
    vecstore_faiss = _build_vecstore(reset=False)
    started = time.perf_counter()
    print(vecstore_faiss.get_text_similarity("化隆县的降雨情况如何"))
    elapsed = time.perf_counter() - started
    print("Time:", elapsed)


def test_reranker():
    """Exercise BgeRerank on raw strings and on Document objects."""
    # Imported lazily so the other scenarios do not require langchain_core.
    from langchain_core.documents import Document

    reranker = BgeRerank(model_name="BAAI/bge-reranker-large")
    docs = ["低空经济是指在 200 米的空域相关业务","我国什么时候全面开放低空领域","今天早上雨很大"]
    docs2 = [Document(page_content=text) for text in docs]
    print(reranker.bge_rerank("我国什么时候全面开放低空领域", docs))
    print(reranker.compress_documents(docs2, "我国什么时候全面开放低空领域"))


if __name__ == "__main__":
    # Swap in one of the other scenarios to run it instead:
    # test_faiss_from_dir()
    # test_faiss_load()
    # test_reranker()
    test_faiss_search()