LAE / Commits / d1dd3af6

Commit d1dd3af6, authored a year ago by 陈正乐
Use the model to implement text segmentation (调用模型实现文本分割)
Parent: 7891dc93

Showing 4 changed files with 99 additions and 35 deletions (+99, -35)
src/config/consts.py                +5   -0
src/pgdb/knowledge/similarity.py    +15  -10
test/file_load_test.py              +19  -0
test/lk_test.py                     +60  -25
src/config/consts.py
@@ -57,3 +57,8 @@ prompt1 = """'''
'''
请你根据上述已知资料回答下面的问题,问题如下:
{question}"""

# =============================
# NLP_BERT model path configuration
# =============================
NLP_BERT_PATH = 'C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base'
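Editor's note: the committed path only resolves on the author's Windows machine. A minimal sketch of a portable fallback, assuming an NLP_BERT_PATH environment variable that is not part of this commit:

import os

# Hypothetical override: honor an NLP_BERT_PATH env var when set,
# otherwise fall back to the path committed above (assumption, not in this commit).
NLP_BERT_PATH = os.environ.get(
    'NLP_BERT_PATH',
    'C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base')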
src/pgdb/knowledge/similarity.py
import os
import sys
import re
from os import path
import copy
...
@@ -14,7 +13,6 @@ from langchain.embeddings.huggingface import (
)
import math
import faiss
from langchain.vectorstores.utils import DistanceStrategy
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.callbacks.manager import (
    AsyncCallbackManagerForRetrieverRun,
...
@@ -23,9 +21,9 @@ from langchain.callbacks.manager import (
from src.loader import load
from langchain.embeddings.base import Embeddings
from src.pgdb.knowledge.callback import DocumentCallback, DefaultDocumentCallback
import operator
from langchain.vectorstores.utils import DistanceStrategy
import numpy as np
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from src.config.consts import NLP_BERT_PATH

sys.path.append("../")
...
@@ -55,9 +53,6 @@ def get_embding(_path: str) -> Embeddings:
    return EmbeddingFactory(_path).get_embedding()


class RE_FAISS(FAISS):
    # Deduplicate while keeping metadata
    @staticmethod
...
@@ -180,7 +175,8 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
                         index_to_docstore_id={})
    else:
        print("load_local faiss")
        _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name,
                                     embeddings=embeddings, allow_dangerous_deserialization=True)
    if docstore1 and is_pgsql:
        # Update the docstore when it is adjusted from outside
        _faiss.docstore = docstore1
    return _faiss
...
@@ -257,8 +253,14 @@ class VectorStore_FAISS(FAISS):
        if self.doc_callback:
            new_docs = self.doc_callback.before_store(self._faiss.docstore, new_docs)
        if need_split:
            p = pipeline(task=Tasks.document_segmentation, model=NLP_BERT_PATH, model_revision='v1.0.1')
            for doc in new_docs:
                # words_list = re.split(pattern, doc.page_content)
                words_list = p(documents=doc.page_content)['text'].split('\n\t')
                # Drop duplicate segments
                words_list = set(words_list)
                words_list = [str(words) for words in words_list]
...
@@ -269,6 +271,9 @@ class VectorStore_FAISS(FAISS):
                list_of_documents.append(Document(page_content=words, metadata=metadata))
        else:
            list_of_documents = new_docs
        print("====================================================================================")
        print(list_of_documents)
        print("====================================================================================")
        self._faiss.add_documents(list_of_documents)

    def _add_documents_from_dir(self, filepaths=None, load_kwargs=None):
...
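Editor's note on the hunk above: set(words_list) removes duplicate segments but also scrambles their original order, so chunks are stored out of document order. A minimal order-preserving alternative (a sketch, not part of this commit):

def dedup_keep_order(segments):
    # dict preserves insertion order in Python 3.7+, so this keeps the
    # first occurrence of each segment in its original position.
    return list(dict.fromkeys(segments))

print(dedup_keep_order(['a', 'b', 'a', 'c']))  # ['a', 'b', 'c']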
test/file_load_test.py
0 → 100644
# -*- coding: utf-8 -*-
# Test the model's document-segmentation quality
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

p = pipeline(
    task=Tasks.document_segmentation,
    model="C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base",
    model_revision='v1.0.1')

result = p(documents='移动端语音唤醒模型,检测关键词为“小云小云”。模型主体为4层FSMN结构,使用CTC训练准则,参数量750K,适用于移动端设备运行。模型输入为Fbank特征,输出为基于char建模的中文全集token预测,测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式,basetrain过程使用大量内部移动端数据,在此基础上,使用1万条设备端录制安静场景“小云小云”数据进行微调,得到最终面向业务的模型。后续用户可在basetrain模型基础上,使用其他关键词数据进行微调,得到新的语音唤醒模型,但暂时未开放模型finetune功能。')

print(result['text'].split('\n'))
print(type([OutputKeys.TEXT]), len(result[OutputKeys.TEXT]))

# Model download:
# from modelscope import snapshot_download
# model_dir = snapshot_download('iic/nlp_bert_document-segmentation_chinese-base')
\ No newline at end of file
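Editor's note: this test splits the pipeline output on '\n', while similarity.py splits on '\n\t'. If the separator the model emits is uncertain, a hedged sketch that tolerates both and drops empty segments:

import re

text = 'first segment\n\tsecond segment\nthird segment'  # stand-in for result['text']
# Split on a newline optionally followed by a tab, then strip blanks.
segments = [seg.strip() for seg in re.split(r'\n\t?', text) if seg.strip()]
print(segments)  # ['first segment', 'second segment', 'third segment']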
test/lk_test.py
class Solution:
    @staticmethod
    def numDecodings(s: str) -> int:
        length = len(s)
        ans = [0 for i in range(length + 1)]
        ans[0] = 0
        ans[1] = 1
        for i in range(1, length + 1):
            print(i)
            if s[i - 1] + s[i] in ('10', '11', '12', '13', '14', '15', '16', '17', '18',
                                   '19', '20', '21', '22', '23', '24', '25', '26'):
                if s[i] == '0':
                    ans[i] = ans[i - 1] + 1
                else:
                    ans[i] = ans[i - 1] + 2
            else:
                ans[i] = ans[i - 1] + 1
        print(ans)
        return ans[length - 1]


Solution.numDecodings("226")
\ No newline at end of file
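Editor's note: as committed, the loop reads s[i] while i runs up to len(s), so Solution.numDecodings("226") raises IndexError before returning, and the +1/+2 counting does not match the decode-ways recurrence the function name suggests. A standard DP sketch for LeetCode 91 (a replacement sketch, not the author's code):

def num_decodings(s: str) -> int:
    # ans[i] = number of ways to decode the prefix s[:i].
    length = len(s)
    ans = [0] * (length + 1)
    ans[0] = 1
    for i in range(1, length + 1):
        if s[i - 1] != '0':                        # decode one digit (1-9)
            ans[i] += ans[i - 1]
        if i >= 2 and '10' <= s[i - 2:i] <= '26':  # decode two digits (10-26)
            ans[i] += ans[i - 2]
    return ans[length]

print(num_decodings("226"))  # 3: "2 2 6", "22 6", "2 26"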
"""给定三个字符串 s1、s2、s3,请你帮忙验证 s3 是否是由 s1 和 s2 交错 组成的。
两个字符串 s 和 t 交错 的定义与过程如下,其中每个字符串都会被分割成若干 非空
子字符串
:
s = s1 + s2 + ... + sn
t = t1 + t2 + ... + tm
|n - m| <= 1
交错 是 s1 + t1 + s2 + t2 + s3 + t3 + ... 或者 t1 + s1 + t2 + s2 + t3 + s3 + ...
注意:a + b 意味着字符串 a 和 b 连接。
示例 1:
输入:s1 = "aabcc", s2 = "dbbca", s3 = "aadbbcbcac"
输出:true
示例 2:
输入:s1 = "aabcc", s2 = "dbbca", s3 = "aadbbbaccc"
输出:false
示例 3:
输入:s1 = "", s2 = "", s3 = ""
输出:true
提示:
0 <= s1.length, s2.length <= 100
0 <= s3.length <= 200
s1、s2、和 s3 都由小写英文字母组成"""
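Editor's note: the problem statement above (LeetCode 97) has no accompanying implementation in this commit. A standard dynamic-programming sketch (mine, not the author's):

def isInterleave(s1: str, s2: str, s3: str) -> bool:
    # dp[i][j]: can s1[:i] and s2[:j] interleave to form s3[:i + j]?
    n, m = len(s1), len(s2)
    if n + m != len(s3):
        return False
    dp = [[False] * (m + 1) for _ in range(n + 1)]
    dp[0][0] = True
    for i in range(n + 1):
        for j in range(m + 1):
            if i > 0 and dp[i - 1][j] and s1[i - 1] == s3[i + j - 1]:
                dp[i][j] = True
            if j > 0 and dp[i][j - 1] and s2[j - 1] == s3[i + j - 1]:
                dp[i][j] = True
    return dp[n][m]

print(isInterleave("aabcc", "dbbca", "aadbbcbcac"))  # True
print(isInterleave("aabcc", "dbbca", "aadbbbaccc"))  # False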
# Definition for a binary tree node.
class TreeNode(object):
    def __init__(self, val=0, left=None, right=None):
        self.val = val
        self.left = left
        self.right = right


class Solution(object):
    def isValidBST(self, root):
        """
        :type root: TreeNode
        :rtype: bool
        """
        if root is None:
            return True
        if root.left is None and root.right is None:
            return True
        if root.left is None:
            return self.isValidBST(root.right) and root.val < root.right.val
        if root.right is None:
            return self.isValidBST(root.left) and root.val > root.left.val
        return self.isValidBST(root.left) and self.isValidBST(root.right) and root.val > root.left.val and root.val < root.right.val
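Editor's note: the version above only compares each node with its immediate children, so it wrongly accepts trees like [5, 4, 6, null, null, 3, 7], where the 3 deep in the right subtree violates the root. A bounds-passing sketch that enforces the BST property globally (a replacement sketch, not the committed code; SolutionFixed is a hypothetical name):

class SolutionFixed(object):
    def isValidBST(self, root):
        # Every node must lie strictly between the bounds inherited from
        # its ancestors, not just between its direct children.
        def valid(node, low, high):
            if node is None:
                return True
            if not (low < node.val < high):
                return False
            return (valid(node.left, low, node.val) and
                    valid(node.right, node.val, high))
        return valid(root, float('-inf'), float('inf'))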