k_store_test.py 3.44 KB
Newer Older
1
import sys
陈正乐 committed
2
sys.path.append('../')
陈正乐 committed
3
from src.loader.load import loads_path
4 5 6 7 8 9 10 11 12 13 14 15 16 17
from src.pgdb.knowledge.similarity import VectorStore_FAISS
from src.config.consts import (
    VEC_DB_DBNAME,
    VEC_DB_HOST,
    VEC_DB_PASSWORD,
    VEC_DB_PORT,
    VEC_DB_USER,
    EMBEEDING_MODEL_PATH,
    FAISS_STORE_PATH,
    SIMILARITY_SHOW_NUMBER,
    KNOWLEDGE_PATH,
    INDEX_NAME
)
from src.loader.callback import BaseCallback
18
from src.server.rerank import BgeRerank
19

陈正乐 committed
20 21
# NOTE(review): '../' was already appended to sys.path before the project
# imports at the top of this file, so this second append looks redundant;
# confirm before removing it.
sys.path.append("../")

22 23

# By default, content whose title contains "思考题" ("questions for reflection") is ignored.
陈正乐 committed
24
class LocalCallBack(BaseCallback):
    """Callback handed to ``loads_path`` that flags low-value sections.

    NOTE(review): presumably a ``True`` result from ``filter`` makes the
    loader discard the section - confirm against ``BaseCallback``.
    """

    def filter(self, title: str, content: str) -> bool:
        """Return True when a section is empty, short-lined, or a "思考题" one.

        Args:
            title: Heading text of the section.
            content: Body text of the section.

        Returns:
            True if title and content are both empty, if the average number
            of characters per line (title and content lines counted
            together) is below 20, or if the title contains "思考题";
            False otherwise.
        """
        char_total = len(title + content)
        if char_total == 0:
            return True

        # A non-empty string always yields at least one line, so the
        # denominator cannot be zero once the empty case is handled above.
        line_total = len(title.splitlines()) + len(content.splitlines())
        if char_total / line_total < 20:
            return True
        return "思考题" in title

30

31
"""Test ingesting the knowledge documents into the stores (pgsql and faiss)."""
陈正乐 committed
32 33


34 35
def _merge_adjacent_docs(docs, max_merged_len):
    """Concatenate consecutive documents that share font size and page number.

    Args:
        docs: Iterable of objects exposing a ``metadata`` mapping and a
            ``page_content`` string.
        max_merged_len: Two neighbours are merged only while their combined
            ``page_content`` length stays strictly below this value.

    Returns:
        A new list with the merged documents. The retained document objects
        are mutated in place (their ``page_content`` grows).
    """
    merged = []
    pending = None
    for doc in docs:
        if pending is None:
            pending = doc
            continue

        meta = doc.metadata
        if "font-size" not in meta or "page_number" not in meta:
            # Documents without layout metadata are skipped.
            # NOTE(review): this drops their text entirely - confirm that is
            # intended rather than appending them unmerged.
            continue

        prev_meta = pending.metadata
        # The very first document is accepted without a metadata check, so
        # guard the lookups on it: a pending document lacking the keys is
        # simply never merged instead of raising KeyError.
        same_layout = (
            "font-size" in prev_meta
            and "page_number" in prev_meta
            and meta["font-size"] == prev_meta["font-size"]
            and meta["page_number"] == prev_meta["page_number"]
        )
        if same_layout and len(doc.page_content) + len(pending.page_content) < max_merged_len:
            pending.page_content += doc.page_content
        else:
            merged.append(pending)
            pending = doc

    if pending is not None:
        merged.append(pending)
    return merged


def test_faiss_from_dir():
    """Load KNOWLEDGE_PATH, merge small fragments, and add them to the store.

    Constructs ``VectorStore_FAISS`` with ``reset=True``, loads documents
    with ``loads_path`` (filtered by ``LocalCallBack``), merges adjacent
    fragments with matching font size and page number, adds the result in
    batches, and finally calls ``_save_local()``. Document counts and the
    index's ``ntotal`` are printed along the way.
    """
    sentence_size = 512
    batch_size = 300

    vecstore_faiss = VectorStore_FAISS(
        embedding_model_name=EMBEEDING_MODEL_PATH,
        store_path=FAISS_STORE_PATH,
        index_name=INDEX_NAME,
        info={"port": VEC_DB_PORT, "host": VEC_DB_HOST, "dbname": VEC_DB_DBNAME, "username": VEC_DB_USER,
              "password": VEC_DB_PASSWORD},
        show_number=SIMILARITY_SHOW_NUMBER,
        reset=True)
    docs = loads_path(KNOWLEDGE_PATH, mode="paged", sentence_size=sentence_size, callbacks=[LocalCallBack()])
    print(len(docs))

    # Merge only up to three quarters of the sentence size.
    docs = _merge_adjacent_docs(docs, max_merged_len=sentence_size / 4 * 3)
    print(len(docs))

    print(vecstore_faiss._faiss.index.ntotal)
    for start in range(0, len(docs), batch_size):
        # Slicing clamps the end index, so the last batch needs no special case.
        vecstore_faiss._add_documents(docs[start:start + batch_size], need_split=True)
        print(vecstore_faiss._faiss.index.ntotal)
    vecstore_faiss._save_local()

陈正乐 committed
69

70
"""Test querying the faiss vector database."""
陈正乐 committed
71 72


73 74
def test_faiss_load():
    """Open the existing store and print the joined hits for a sample query.

    Constructs ``VectorStore_FAISS`` with ``reset=False``, runs
    ``get_text_similarity`` on a fixed question, and prints the result of
    ``join_document`` on the returned hits.
    """
    db_info = {
        "port": VEC_DB_PORT,
        "host": VEC_DB_HOST,
        "dbname": VEC_DB_DBNAME,
        "username": VEC_DB_USER,
        "password": VEC_DB_PASSWORD,
    }
    store = VectorStore_FAISS(
        embedding_model_name=EMBEEDING_MODEL_PATH,
        store_path=FAISS_STORE_PATH,
        index_name=INDEX_NAME,
        info=db_info,
        show_number=SIMILARITY_SHOW_NUMBER,
        reset=False,
    )

    query = "我国什么时候全面开放低空领域"
    hits = store.get_text_similarity(query)
    print(store.join_document(hits))
83 84


85 86 87 88 89 90 91 92
def test_reranker():
    """Print reranker output for three sample passages against one query.

    The passages are passed to ``bge_rerank`` as plain strings and to
    ``compress_documents`` wrapped in ``Document`` objects.
    """
    from langchain_core.documents import Document

    query = "我国什么时候全面开放低空领域"
    reranker = BgeRerank(model_name="BAAI/bge-reranker-large")
    passages = [
        "低空经济是指在 200 米的空域相关业务",
        "我国什么时候全面开放低空领域",
        "今天早上雨很大",
    ]
    wrapped = []
    for text in passages:
        wrapped.append(Document(page_content=text))

    print(reranker.bge_rerank(query, passages))
    print(reranker.compress_documents(wrapped, query))

93
if __name__ == "__main__":
    # Only the reranker check runs by default; uncomment one of the calls
    # below to run the FAISS ingestion or query check instead.
    # test_faiss_from_dir()
    # test_faiss_load()
    test_reranker()