Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
L
LAE
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
文靖昊
LAE
Commits
a14b9e36
Commit
a14b9e36
authored
May 15, 2024
by
文靖昊
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
定时构建索引实现
parent
7891dc93
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
102 additions
and
2 deletions
+102
-2
auto_update_index_task.py
src/scheduler/auto_update_index_task.py
+59
-0
scheduler.py
src/scheduler/scheduler.py
+2
-2
auto_task.py
test/auto_task.py
+41
-0
No files found.
src/scheduler/auto_update_index_task.py
0 → 100644
View file @
a14b9e36
from
src.pgdb.knowledge.similarity
import
VectorStore_FAISS
from
src.loader.load
import
loads_path
import
os
import
shutil
def _merge_adjacent_docs(docs, max_length):
    """Merge consecutive documents that share font size and page number.

    A document is appended to the previous one when both carry the same
    ``font-size`` and ``page_number`` metadata and their combined
    ``page_content`` length stays below ``max_length``. Otherwise the previous
    document is emitted and the current one becomes the new merge target.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a mapping) and
            ``page_content`` (a string).
        max_length: Exclusive upper bound on the combined content length.

    Returns:
        A list of the merged documents, in their original order.
    """
    merged = []
    last_doc = None
    for doc in docs:
        if last_doc is None:
            last_doc = doc
            continue

        meta = doc.metadata
        # NOTE(review): documents missing either key are skipped entirely and
        # therefore never indexed. This mirrors the original behaviour;
        # confirm that dropping them is intended.
        if "font-size" not in meta or "page_number" not in meta:
            continue

        last_meta = last_doc.metadata
        # The retained document may itself lack the layout keys (the first
        # document is kept unconditionally), so check before indexing to avoid
        # a KeyError.
        same_block = (
            "font-size" in last_meta
            and "page_number" in last_meta
            and meta["font-size"] == last_meta["font-size"]
            and meta["page_number"] == last_meta["page_number"]
        )
        combined_length = len(doc.page_content) + len(last_doc.page_content)
        if same_block and combined_length < max_length:
            last_doc.page_content += doc.page_content
        else:
            merged.append(last_doc)
            last_doc = doc

    if last_doc is not None:
        merged.append(last_doc)
    return merged


def Auto_Task(src_path: str, dest_path: str, faiss_db: VectorStore_FAISS):
    """Index every file found in ``src_path`` and move it to ``dest_path``.

    Steps:
      1. Make sure both directories exist (creating them when missing).
      2. Return immediately if ``src_path`` is empty.
      3. Load the directory with ``loads_path``, merge small adjacent
         fragments, add them to ``faiss_db`` in batches and save the index.
      4. Move each regular file from ``src_path`` into ``dest_path``.

    Args:
        src_path: Directory scanned for new files to index.
        dest_path: Directory that receives the files once they are indexed.
        faiss_db: Vector store; its ``_add_documents`` and ``_save_local``
            methods are used.

    Returns:
        None.
    """
    sentence_size = 512
    batch_size = 300

    # The original created ``dest_path`` when ``src_path`` was missing, which
    # left ``src_path`` absent and made the ``os.listdir`` below fail.
    os.makedirs(src_path, exist_ok=True)

    # Create the destination directory if it does not exist yet.
    if not os.path.exists(dest_path):
        os.makedirs(dest_path, exist_ok=True)
        print("目标目录不存在,已创建。")

    files = os.listdir(src_path)
    if not files:
        return

    docs = loads_path(src_path, mode="paged", sentence_size=sentence_size, callbacks=[])
    # Fragments are merged while they stay below 3/4 of the sentence size.
    docs = _merge_adjacent_docs(docs, sentence_size / 4 * 3)

    # Slicing past the end of a list is safe, so no explicit upper clamp.
    for start in range(0, len(docs), batch_size):
        faiss_db._add_documents(docs[start:start + batch_size], need_split=True)
    faiss_db._save_local()

    # Move the processed files out of the source directory.
    for file_name in files:
        source_file = os.path.join(src_path, file_name)
        # Only regular files are moved; sub-directories stay in place.
        if not os.path.isfile(source_file):
            continue
        destination_file = os.path.join(dest_path, file_name)
        try:
            shutil.move(source_file, destination_file)
        except OSError as e:
            # Best effort: report and keep going with the remaining files.
            # NOTE(review): a file that fails to move stays in ``src_path`` and
            # will be picked up again by the next run.
            print(f"移动文件时出错: {source_file} -> {destination_file},错误信息: {e}")
\ No newline at end of file
src/scheduler/scheduler.py
View file @
a14b9e36
...
...
@@ -5,9 +5,9 @@ class TaskScheduler:
def
__init__
(
self
):
self
.
scheduler
=
BackgroundScheduler
()
def
add_task
(
self
,
timedTask
,
timeMinute
):
def
add_task
(
self
,
timedTask
,
timeMinute
,
*
args
):
# Define your timed task here
self
.
scheduler
.
add_job
(
timedTask
,
'interval'
,
minutes
=
timeMinute
)
self
.
scheduler
.
add_job
(
timedTask
,
'interval'
,
minutes
=
timeMinute
,
args
=
args
)
def
start_scheduler
(
self
):
# Add the timed task to the scheduler
...
...
test/auto_task.py
0 → 100644
View file @
a14b9e36
from
src.pgdb.knowledge.similarity
import
VectorStore_FAISS
from
src.scheduler.scheduler
import
TaskScheduler
from
src.scheduler.auto_update_index_task
import
Auto_Task
import
time
from
src.config.consts
import
(
VEC_DB_DBNAME
,
VEC_DB_HOST
,
VEC_DB_PASSWORD
,
VEC_DB_PORT
,
VEC_DB_USER
,
EMBEEDING_MODEL_PATH
,
FAISS_STORE_PATH
,
SIMILARITY_SHOW_NUMBER
,
KNOWLEDGE_PATH
,
INDEX_NAME
)
def auto_task():
    """Manually exercise the scheduled index-update task.

    Builds a ``VectorStore_FAISS`` from the configured constants, prints the
    similarity result for a fixed query, registers ``Auto_Task`` with a
    ``TaskScheduler`` at an interval of 2 (passed as ``timeMinute``), then
    prints the same query result ten times, sleeping 30 seconds between
    rounds, so changes to the index can be observed. The scheduler is stopped
    when the loop ends, including when it is interrupted.
    """
    query = "什么是暹罗猫"

    vecstore_faiss = VectorStore_FAISS(
        embedding_model_name=EMBEEDING_MODEL_PATH,
        store_path=FAISS_STORE_PATH,
        index_name=INDEX_NAME,
        info={
            "port": VEC_DB_PORT,
            "host": VEC_DB_HOST,
            "dbname": VEC_DB_DBNAME,
            "username": VEC_DB_USER,
            "password": VEC_DB_PASSWORD,
        },
        show_number=SIMILARITY_SHOW_NUMBER,
        reset=False,
    )
    # Baseline result before any scheduled update has run.
    print(vecstore_faiss.join_document(vecstore_faiss.get_text_similarity(query)))

    scheduler = TaskScheduler()
    scheduler.add_task(Auto_Task, 2, "../test_dir/src", "../test_dir/dest", vecstore_faiss)
    scheduler.start_scheduler()
    try:
        for i in range(10):
            print(i)
            print(vecstore_faiss.join_document(vecstore_faiss.get_text_similarity(query)))
            time.sleep(30)
    finally:
        # Stop the scheduler even if the loop is interrupted or a query fails.
        scheduler.stop_scheduler()
# Run the manual scheduler check only when this file is executed as a script.
if __name__ == "__main__":
    auto_task()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment