import sys
sys.path.append('../')
from src.loader.load import loads_path
from src.pgdb.knowledge.similarity import VectorStore_FAISS
from src.config.consts import (
VEC_DB_DBNAME,
VEC_DB_HOST,
VEC_DB_PASSWORD,
VEC_DB_PORT,
VEC_DB_USER,
EMBEEDING_MODEL_PATH,
FAISS_STORE_PATH,
SIMILARITY_SHOW_NUMBER,
KNOWLEDGE_PATH,
INDEX_NAME
)
from src.loader.callback import BaseCallback
sys.path.append("../")
# Sections whose title contains "思考题" (review questions) are ignored by default.
class LocalCallBack(BaseCallback):
    """Loader callback that flags low-value sections of a document."""

    def filter(self, title: str, content: str) -> bool:
        """Return True for sections that should be ignored.

        A section is flagged when any of the following holds:
          * title and content are both empty;
          * the title contains "思考题";
          * the average line length over title and content is below
            20 characters.
        """
        total_chars = len(title) + len(content)
        if total_chars == 0:
            return True
        if "思考题" in title:
            return True
        # Any non-empty string yields at least one line, so this is never 0 here.
        total_lines = len(title.splitlines()) + len(content.splitlines())
        return total_chars / total_lines < 20
"""测试资料入库(pgsql和faiss)"""
def test_faiss_from_dir():
    """Load the documents under KNOWLEDGE_PATH into the FAISS store.

    Steps:
      1. Create a ``VectorStore_FAISS`` with ``reset=True``.
      2. Load documents via ``loads_path`` in ``"paged"`` mode, filtered by
         ``LocalCallBack``.
      3. Merge consecutive fragments that share font size and page number
         while the merged text stays under three quarters of the sentence
         size.
      4. Add the merged documents in batches and call ``_save_local()``.

    Document and index counts are printed along the way.
    """
    sentence_size = 512
    # Merged fragments must stay below 3/4 of the sentence size.
    merge_limit = sentence_size / 4 * 3
    batch_size = 300

    vecstore_faiss = VectorStore_FAISS(
        embedding_model_name=EMBEEDING_MODEL_PATH,
        store_path=FAISS_STORE_PATH,
        index_name=INDEX_NAME,
        info={
            "port": VEC_DB_PORT,
            "host": VEC_DB_HOST,
            "dbname": VEC_DB_DBNAME,
            "username": VEC_DB_USER,
            "password": VEC_DB_PASSWORD,
        },
        show_number=SIMILARITY_SHOW_NUMBER,
        reset=True,
    )
    docs = loads_path(
        KNOWLEDGE_PATH,
        mode="paged",
        sentence_size=sentence_size,
        callbacks=[LocalCallBack()],
    )
    print(len(docs))

    merged_docs = []
    last_doc = None
    for doc in docs:
        if last_doc is None:
            # The first fragment is accepted as-is, without a metadata check.
            last_doc = doc
            continue
        if "font-size" not in doc.metadata or "page_number" not in doc.metadata:
            # Fragments without layout metadata cannot be compared; skip them.
            continue
        # ``.get`` on last_doc: the first fragment may lack these keys, and a
        # plain subscript would raise KeyError instead of just not merging.
        same_font = doc.metadata["font-size"] == last_doc.metadata.get("font-size")
        same_page = doc.metadata["page_number"] == last_doc.metadata.get("page_number")
        fits = len(doc.page_content) + len(last_doc.page_content) < merge_limit
        if same_font and same_page and fits:
            last_doc.page_content += doc.page_content
        else:
            merged_docs.append(last_doc)
            last_doc = doc
    if last_doc is not None:
        # Flush the fragment still being accumulated when the loop ended.
        merged_docs.append(last_doc)
    docs = merged_docs
    print(len(docs))

    print(vecstore_faiss._faiss.index.ntotal)
    for start in range(0, len(docs), batch_size):
        # Slicing clamps the upper bound, so the last batch may be shorter.
        vecstore_faiss._add_documents(docs[start:start + batch_size], need_split=True)
        print(vecstore_faiss._faiss.index.ntotal)
    vecstore_faiss._save_local()
"""测试faiss向量数据库查询结果"""
def test_faiss_load():
    """Query the existing FAISS store with a sample question and print the result.

    The store is opened with ``reset=False``. The question is passed to
    ``get_text_similarity`` and the result is run through ``join_document``
    before being printed.
    """
    connection_info = {
        "port": VEC_DB_PORT,
        "host": VEC_DB_HOST,
        "dbname": VEC_DB_DBNAME,
        "username": VEC_DB_USER,
        "password": VEC_DB_PASSWORD,
    }
    store = VectorStore_FAISS(
        embedding_model_name=EMBEEDING_MODEL_PATH,
        store_path=FAISS_STORE_PATH,
        index_name=INDEX_NAME,
        info=connection_info,
        show_number=SIMILARITY_SHOW_NUMBER,
        reset=False,
    )
    question = "我国什么时候全面开放低空领域"
    matches = store.get_text_similarity(question)
    joined = store.join_document(matches)
    print(joined)
if __name__ == "__main__":
    # Script entry point: only the query test runs by default.
    # Uncomment the next line to rebuild the store from KNOWLEDGE_PATH first.
    # test_faiss_from_dir()
    test_faiss_load()