Commit d1dd3af6 by 陈正乐

Call the model to implement text segmentation

parent 7891dc93
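This commit swaps the regex-based splitter for ModelScope's BERT document-segmentation pipeline. A minimal sketch of the call pattern being introduced (the model path and revision are the ones pinned below; the sample sentence is illustrative):

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build the pipeline once; loading the BERT checkpoint is the expensive step.
p = pipeline(
    task=Tasks.document_segmentation,
    model='C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base',
    model_revision='v1.0.1',
)

# The result dict's 'text' field joins the predicted segments with '\n\t'.
result = p(documents='第一句话。第二句话。第三句话。')
segments = [s for s in result['text'].split('\n\t') if s.strip()]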
@@ -56,4 +56,9 @@ prompt1 = """'''
 {context}
 '''
 请你根据上述已知资料回答下面的问题,问题如下:
-{question}"""
\ No newline at end of file
+{question}"""
+
+# =============================
+# NLP_BERT model path configuration
+# =============================
+NLP_BERT_PATH = 'C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base'
 import os
 import sys
-import re
 from os import path
 import copy
@@ -14,7 +13,6 @@ from langchain.embeddings.huggingface import (
 )
 import math
 import faiss
-from langchain.vectorstores.utils import DistanceStrategy
 from langchain.vectorstores.base import VectorStoreRetriever
 from langchain.callbacks.manager import (
     AsyncCallbackManagerForRetrieverRun,
@@ -23,9 +21,9 @@ from langchain.callbacks.manager import (
 from src.loader import load
 from langchain.embeddings.base import Embeddings
 from src.pgdb.knowledge.callback import DocumentCallback, DefaultDocumentCallback
-import operator
-from langchain.vectorstores.utils import DistanceStrategy
-import numpy as np
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from src.config.consts import NLP_BERT_PATH
 sys.path.append("../")
@@ -55,9 +53,6 @@ def get_embding(_path: str) -> Embeddings:
     return EmbeddingFactory(_path).get_embedding()
 class RE_FAISS(FAISS):
     # Deduplicate while keeping metadata
     @staticmethod
@@ -180,7 +175,8 @@ def getFAISS(embedding_model_name: str, store_path: str, info: dict = None, inde
                             index_to_docstore_id={})
     else:
         print("load_local faiss")
-        _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)
+        _faiss = RE_FAISS.load_local(folder_path=store_path, index_name=index_name, embeddings=embeddings,
+                                     allow_dangerous_deserialization=True)
     if docstore1 and is_pgsql:  # if external parameters changed, update the docstore
         _faiss.docstore = docstore1
     return _faiss
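Side note on the load_local change above: newer langchain releases refuse to unpickle a saved FAISS docstore unless the caller opts in, hence allow_dangerous_deserialization=True. A minimal standalone load sketch (the folder, index name, and embedding model below are placeholders, not this repo's values):

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.faiss import FAISS

embeddings = HuggingFaceEmbeddings(model_name='some-embedding-model')  # placeholder
store = FAISS.load_local(
    folder_path='./faiss_store',           # placeholder path
    index_name='index',                    # placeholder index name
    embeddings=embeddings,
    allow_dangerous_deserialization=True,  # opt in to pickle-based loading
)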
@@ -257,8 +253,14 @@ class VectorStore_FAISS(FAISS):
         if self.doc_callback:
             new_docs = self.doc_callback.before_store(self._faiss.docstore, new_docs)
         if need_split:
+            p = pipeline(
+                task=Tasks.document_segmentation,
+                model=NLP_BERT_PATH,
+                model_revision='v1.0.1'
+            )
             for doc in new_docs:
-                words_list = re.split(pattern, doc.page_content)
+                # words_list = re.split(pattern, doc.page_content)
+                words_list = p(documents=doc.page_content)['text'].split('\n\t')
                 # remove duplicates
                 words_list = set(words_list)
                 words_list = [str(words) for words in words_list]
@@ -269,6 +271,9 @@ class VectorStore_FAISS(FAISS):
                     list_of_documents.append(Document(page_content=words, metadata=metadata))
             else:
                 list_of_documents = new_docs
+        print("====================================================================================")
+        print(list_of_documents)
+        print("====================================================================================")
         self._faiss.add_documents(list_of_documents)
     def _add_documents_from_dir(self, filepaths=None, load_kwargs=None):
...
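One observation on the dedup step in the hunk above: set(words_list) discards the order in which the model produced the segments. If order matters downstream, an order-preserving variant (standard Python, not part of this commit) would be:

# dict.fromkeys keeps first-seen order on Python 3.7+, deduplicating
# the segments without shuffling them.
words_list = list(dict.fromkeys(words_list))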
# -*- coding: utf-8 -*-
# Quick test of the model's segmentation quality
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

p = pipeline(
    task=Tasks.document_segmentation,
    model="C:\\Users\\15663\\AI\\models\\nlp_bert_document-segmentation_chinese-base",
    model_revision='v1.0.1')

# Sample passage kept in Chinese, since the model is a chinese-base checkpoint.
result = p(documents='移动端语音唤醒模型,检测关键词为“小云小云”。模型主体为4层FSMN结构,使用CTC训练准则,参数量750K,适用于移动端设备运行。模型输入为Fbank特征,输出为基于char建模的中文全集token预测,测试工具根据每一帧的预测数据进行后处理得到输入音频的实时检测结果。模型训练采用“basetrain + finetune”的模式,basetrain过程使用大量内部移动端数据,在此基础上,使用1万条设备端录制安静场景“小云小云”数据进行微调,得到最终面向业务的模型。后续用户可在basetrain模型基础上,使用其他关键词数据进行微调,得到新的语音唤醒模型,但暂时未开放模型finetune功能。')
print(result['text'].split('\n'))
print(type(result[OutputKeys.TEXT]), len(result[OutputKeys.TEXT]))

# Model download
# from modelscope import snapshot_download
# model_dir = snapshot_download('iic/nlp_bert_document-segmentation_chinese-base')
\ No newline at end of file
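Note that this test splits the result on '\n' while VectorStore_FAISS above splits on '\n\t'. A tolerant split that accepts either separator (an assumption, in case the joining convention differs across model revisions):

import re

# Split on a newline optionally followed by a tab, then drop empty pieces.
segments = [s.strip() for s in re.split(r'\n\t?', result['text']) if s.strip()]
print(segments)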
class Solution: """给定三个字符串 s1、s2、s3,请你帮忙验证 s3 是否是由 s1 和 s2 交错 组成的。
@staticmethod
def numDecodings(s: str) -> int: 两个字符串 s 和 t 交错 的定义与过程如下,其中每个字符串都会被分割成若干 非空
length = len(s) 子字符串
ans = [0 for i in range(length+1)]
ans[0] = 0
ans[1] = 1 s = s1 + s2 + ... + sn
for i in range(1, length+1): t = t1 + t2 + ... + tm
print(i) |n - m| <= 1
if s[i - 1] + s[i] == '10' or s[i - 1] + s[i] == '11' or s[i - 1] + s[i] == '12' or s[i - 1] + s[ 交错 是 s1 + t1 + s2 + t2 + s3 + t3 + ... 或者 t1 + s1 + t2 + s2 + t3 + s3 + ...
i] == '13' or s[i - 1] + s[i] == '14' or s[i - 1] + s[i] == '15' or s[i - 1] + s[i] == '16' or s[ 注意:a + b 意味着字符串 a 和 b 连接。
i - 1] + s[i] == '17' or s[i - 1] + s[i] == '18' or s[i - 1] + s[i] == '19' or s[i - 1] + s[
i] == '20' or s[i - 1] + s[i] == '21' or s[i - 1] + s[i] == '22' or s[i - 1] + s[i] == '23' or s[
i - 1] + s[i] == '24' or s[i - 1] + s[i] == '25' or s[i - 1] + s[i] == '26':
if s[i] == '0': 示例 1:
ans[i] = ans[i - 1] + 1
else:
ans[i] = ans[i - 1] + 2 输入:s1 = "aabcc", s2 = "dbbca", s3 = "aadbbcbcac"
else: 输出:true
ans[i] = ans[i - 1] + 1 示例 2:
print(ans)
return ans[length-1] 输入:s1 = "aabcc", s2 = "dbbca", s3 = "aadbbbaccc"
输出:false
Solution.numDecodings("226") 示例 3:
\ No newline at end of file
输入:s1 = "", s2 = "", s3 = ""
输出:true
提示:
0 <= s1.length, s2.length <= 100
0 <= s3.length <= 200
s1、s2、和 s3 都由小写英文字母组成"""
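The docstring above states the Interleaving String problem, but the file never implements it. For completeness, a standard DP sketch (textbook approach, not code from this repo):

def is_interleave(s1: str, s2: str, s3: str) -> bool:
    # dp[i][j]: can s3[:i + j] be formed by interleaving s1[:i] and s2[:j]?
    n, m = len(s1), len(s2)
    if n + m != len(s3):
        return False
    dp = [[False] * (m + 1) for _ in range(n + 1)]
    dp[0][0] = True
    for i in range(n + 1):
        for j in range(m + 1):
            if i > 0 and dp[i - 1][j] and s1[i - 1] == s3[i + j - 1]:
                dp[i][j] = True
            if j > 0 and dp[i][j - 1] and s2[j - 1] == s3[i + j - 1]:
                dp[i][j] = True
    return dp[n][m]

print(is_interleave("aabcc", "dbbca", "aadbbcbcac"))  # True
print(is_interleave("aabcc", "dbbca", "aadbbbaccc"))  # False
print(is_interleave("", "", ""))                      # True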
# Definition for a binary tree node.
class TreeNode(object):
    def __init__(self, val=0, left=None, right=None):
        self.val = val
        self.left = left
        self.right = right


class Solution(object):
    def isValidBST(self, root, low=float('-inf'), high=float('inf')):
        """
        :type root: TreeNode
        :rtype: bool
        """
        # An empty subtree is valid.
        if root is None:
            return True
        # Comparing a node only with its immediate children is not enough:
        # every node must lie strictly inside the (low, high) range inherited
        # from all of its ancestors.
        if not (low < root.val < high):
            return False
        return (self.isValidBST(root.left, low, root.val)
                and self.isValidBST(root.right, root.val, high))
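A quick check of the bounds-based fix above, using a tree whose violation sits below the immediate children (the case a child-only comparison misses):

# 5 -> (1, 7 -> (3, 8)): node 3 sits in the right subtree of 5 but is
# smaller than 5, so the tree is not a valid BST.
root = TreeNode(5, TreeNode(1), TreeNode(7, TreeNode(3), TreeNode(8)))
print(Solution().isValidBST(root))  # False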