Commit a14b9e36 by 文靖昊

定时构建索引实现

parent 7891dc93
from src.pgdb.knowledge.similarity import VectorStore_FAISS
from src.loader.load import loads_path
import os
import shutil
def _merge_small_docs(docs, max_merged_len):
    """Merge consecutive fragments that share font size and page number.

    A fragment is appended to the previous one while the combined text stays
    shorter than ``max_merged_len``. Fragments (other than the very first)
    that lack ``font-size`` or ``page_number`` metadata are skipped, which
    mirrors the original filtering.
    """
    merged = []
    last_doc = None
    for doc in docs:
        if last_doc is None:
            last_doc = doc
            continue
        if "font-size" not in doc.metadata or "page_number" not in doc.metadata:
            continue
        # ``.get`` on the previous fragment: the first document is accepted
        # without a metadata check, so it may not carry these keys.
        same_font = doc.metadata["font-size"] == last_doc.metadata.get("font-size")
        same_page = doc.metadata["page_number"] == last_doc.metadata.get("page_number")
        fits = len(doc.page_content) + len(last_doc.page_content) < max_merged_len
        if same_font and same_page and fits:
            last_doc.page_content += doc.page_content
        else:
            merged.append(last_doc)
            last_doc = doc
    if last_doc is not None:
        merged.append(last_doc)
    return merged


def Auto_Task(src_path: str, dest_path: str, faiss_db: VectorStore_FAISS):
    """Index every document found in ``src_path`` and archive the files.

    Intended to be run periodically. Each run loads the documents under
    ``src_path``, merges small adjacent fragments, adds them to ``faiss_db``
    in batches, and finally moves the processed files to ``dest_path``.

    Args:
        src_path: Directory scanned for new files. Created if missing.
        dest_path: Directory that receives the files after indexing.
            Created if missing.
        faiss_db: Vector store that receives the documents through
            ``_add_documents`` and is persisted through ``_save_local``.

    Returns:
        None. Returns early when ``src_path`` contains no entries.
    """
    sentence_size = 512
    batch_size = 300

    # Create the *source* directory when it is missing. Creating the
    # destination here instead would leave ``os.listdir(src_path)`` below
    # raising FileNotFoundError.
    os.makedirs(src_path, exist_ok=True)
    # Check whether the destination directory exists; create it if not.
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)
        print("目标目录不存在,已创建。")

    files = os.listdir(src_path)
    if not files:
        return

    docs = loads_path(src_path, mode="paged", sentence_size=sentence_size, callbacks=[])
    # Merged fragments are kept below three quarters of the sentence size.
    docs = _merge_small_docs(docs, sentence_size / 4 * 3)

    # Slicing clamps at the end of the list, so no manual bound is needed.
    for start in range(0, len(docs), batch_size):
        faiss_db._add_documents(docs[start:start + batch_size], need_split=True)
        # Persist after every batch so completed batches are already saved
        # if a later batch fails.
        faiss_db._save_local()

    # Walk the file list captured before indexing.
    for file_name in files:
        # Build the full path of the source entry.
        source_file = os.path.join(src_path, file_name)
        # Only regular files are moved; sub-directories stay in place.
        if not os.path.isfile(source_file):
            continue
        # Build the destination path.
        destination_file = os.path.join(dest_path, file_name)
        try:
            # Move the file into the destination directory.
            shutil.move(source_file, destination_file)
        except OSError as e:
            # Best effort: report the failure and continue with the rest.
            print(f"移动文件时出错: {source_file} -> {destination_file},错误信息: {e}")
\ No newline at end of file
......@@ -5,9 +5,9 @@ class TaskScheduler:
def __init__(self):
    """Create the background scheduler that will hold the registered jobs."""
    self.scheduler = BackgroundScheduler()
def add_task(self, timedTask, timeMinute, *args):
    """Register ``timedTask`` as a recurring interval job.

    Args:
        timedTask: Callable the scheduler should invoke.
        timeMinute: Interval between runs, in minutes.
        *args: Positional arguments forwarded to ``timedTask`` on each run.
    """
    self.scheduler.add_job(timedTask, 'interval', minutes=timeMinute, args=args)
def start_scheduler(self):
# Add the timed task to the scheduler
......
from src.pgdb.knowledge.similarity import VectorStore_FAISS
from src.scheduler.scheduler import TaskScheduler
from src.scheduler.auto_update_index_task import Auto_Task
import time
from src.config.consts import (
VEC_DB_DBNAME,
VEC_DB_HOST,
VEC_DB_PASSWORD,
VEC_DB_PORT,
VEC_DB_USER,
EMBEEDING_MODEL_PATH,
FAISS_STORE_PATH,
SIMILARITY_SHOW_NUMBER,
KNOWLEDGE_PATH,
INDEX_NAME
)
def auto_task():
    """Manually exercise the scheduled index-building job.

    Builds a ``VectorStore_FAISS`` from the project constants, prints the
    result of a sample similarity query, then schedules ``Auto_Task`` every
    2 minutes on ``../test_dir/src`` -> ``../test_dir/dest``. The same query
    is repeated ten times, 30 seconds apart, so changes in the index can be
    observed while the job runs. The scheduler is always stopped on exit.
    """
    vecstore_faiss = VectorStore_FAISS(
        embedding_model_name=EMBEEDING_MODEL_PATH,
        store_path=FAISS_STORE_PATH,
        index_name=INDEX_NAME,
        info={"port": VEC_DB_PORT, "host": VEC_DB_HOST, "dbname": VEC_DB_DBNAME, "username": VEC_DB_USER,
              "password": VEC_DB_PASSWORD},
        show_number=SIMILARITY_SHOW_NUMBER,
        reset=False)
    # Baseline query result before any scheduled run.
    print(vecstore_faiss.join_document(vecstore_faiss.get_text_similarity("什么是暹罗猫")))

    scheduler = TaskScheduler()
    scheduler.add_task(Auto_Task, 2, "../test_dir/src", "../test_dir/dest", vecstore_faiss)
    scheduler.start_scheduler()
    try:
        for i in range(10):
            print(i)
            print(vecstore_faiss.join_document(vecstore_faiss.get_text_similarity("什么是暹罗猫")))
            time.sleep(30)
    finally:
        # Stop the scheduler even if a query or the sleep is interrupted,
        # so the job does not keep running after this function exits.
        scheduler.stop_scheduler()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    auto_task()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment