auto_update_index_task.py 1.99 KB
Newer Older
文靖昊 committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
from src.pgdb.knowledge.similarity import VectorStore_FAISS
from src.loader.load import loads_path
import os
import shutil




def Auto_Task(src_path: str, dest_path: str, faiss_db: "VectorStore_FAISS"):
    """Index every document found in ``src_path`` and archive the source files.

    The function performs the following steps:

    1. Make sure both ``src_path`` and ``dest_path`` exist (creating them if
       necessary).
    2. Return early when ``src_path`` contains no entries.
    3. Load the directory with ``loads_path`` and merge consecutive fragments
       that share the same ``font-size`` and ``page_number`` metadata, as long
       as the merged text stays below three quarters of the sentence size.
    4. Add the merged documents to ``faiss_db`` in batches and save the index.
    5. Move every regular file from ``src_path`` into ``dest_path``.

    Args:
        src_path: Directory that is scanned for new files to index.
        dest_path: Directory that receives the files once they are indexed.
        faiss_db: Vector store exposing ``_add_documents`` and ``_save_local``.

    Returns:
        None.
    """
    sentence_size = 512
    batch_size = 300
    # Merged fragments must stay below 3/4 of the sentence size (384.0).
    merge_limit = sentence_size / 4 * 3

    # Create the *source* directory when it is missing. The previous code
    # created dest_path here, which left src_path absent and made the
    # os.listdir() call below raise FileNotFoundError.
    if not os.path.exists(src_path):
        os.makedirs(src_path)

    # Check whether the destination directory exists; create it if not.
    if not os.path.exists(dest_path):
        os.makedirs(dest_path)
        print("目标目录不存在,已创建。")

    files = os.listdir(src_path)
    if not files:
        return

    docs = loads_path(src_path, mode="paged", sentence_size=sentence_size, callbacks=[])

    merged_docs = []
    last_doc = None
    for doc in docs:
        if last_doc is None:
            last_doc = doc
            continue
        # Fragments lacking the metadata needed for the merge decision are skipped.
        if "font-size" not in doc.metadata or "page_number" not in doc.metadata:
            continue
        # .get() on last_doc: the very first fragment is accepted without a
        # metadata check, so it may not carry these keys.
        same_font = doc.metadata["font-size"] == last_doc.metadata.get("font-size")
        same_page = doc.metadata["page_number"] == last_doc.metadata.get("page_number")
        fits = len(doc.page_content) + len(last_doc.page_content) < merge_limit
        if same_font and same_page and fits:
            last_doc.page_content += doc.page_content
        else:
            merged_docs.append(last_doc)
            last_doc = doc
    if last_doc is not None:
        merged_docs.append(last_doc)

    # Feed the index in fixed-size batches; slicing clamps at the list end.
    for start in range(0, len(merged_docs), batch_size):
        faiss_db._add_documents(merged_docs[start:start + batch_size], need_split=True)
    faiss_db._save_local()

    # Walk the directory listing taken before indexing.
    for file_name in files:
        # Build the full path of the source entry.
        source_file = os.path.join(src_path, file_name)

        # Only regular files are moved; sub-directories stay in place.
        if not os.path.isfile(source_file):
            continue

        # Build the destination path.
        destination_file = os.path.join(dest_path, file_name)
        try:
            # Move the file into the destination directory.
            shutil.move(source_file, destination_file)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"移动文件时出错: {source_file} -> {destination_file},错误信息: {e}")