load.py 14.5 KB
Newer Older
陈正乐 committed
1
import os, copy
2

3
from langchain_community.document_loaders import UnstructuredFileLoader, TextLoader, CSVLoader, UnstructuredPDFLoader, \
陈正乐 committed
4
    UnstructuredWordDocumentLoader, PDFMinerPDFasHTMLLoader
5 6 7 8 9

from .config import SENTENCE_SIZE, ZH_TITLE_ENHANCE
from .chinese_text_splitter import ChineseTextSplitter
from .zh_title_enhance import zh_title_enhance
from langchain.schema import Document
10
from typing import List
11 12 13 14
from src.loader.callback import BaseCallback
import re
from bs4 import BeautifulSoup

陈正乐 committed
15 16

def load(filepath, mode: str = None, sentence_size: int = 0, metadata=None, callbacks=None, **kwargs):
    r"""Load one document, choosing the loader by file extension.

    mode: how the loader splits the document: "single", "elements", "paged"
    sentence_size: when > 0, re-split large documents into chunks of at most this size
    metadata / callbacks: forwarded only to the custom pdf loader
    kwargs: passed through to the underlying langchain loader
    """
    lower = filepath.lower()
    if lower.endswith(".pdf"):
        # loader = UnstructuredPDFLoader(filepath, mode=mode or "elements",**kwargs)
        # Use the custom pdf loader instead (handles sentence_size itself).
        return __pdf_loader(filepath, sentence_size=sentence_size, metadata=metadata, callbacks=callbacks)
    if lower.endswith(".txt"):
        loader = TextLoader(filepath, autodetect_encoding=True, **kwargs)
    elif lower.endswith(".csv"):
        loader = CSVLoader(filepath, **kwargs)
    elif lower.endswith((".docx", ".doc")):
        loader = UnstructuredWordDocumentLoader(filepath, mode=mode or "elements", **kwargs)
    else:
        # ".md" and any unrecognised extension share the generic unstructured loader.
        loader = UnstructuredFileLoader(filepath, mode=mode or "elements", **kwargs)
    docs = loader.load()
    return split(docs, sentence_size) if sentence_size > 0 else docs
陈正乐 committed
40 41 42 43 44 45 46 47


def loads_path(path: str, **kwargs):
    """Load every file found (recursively) under *path* via :func:`loads`."""
    files = get_files_in_directory(path)
    return loads(files, **kwargs)


def loads(filepaths, **kwargs):
    """Load several files and flatten the per-file document lists into one list."""
    merged_kwargs = {"mode": "paged", **kwargs}  # caller kwargs override the default mode
    flattened = []
    for fp in filepaths:
        flattened.extend(load(filepath=fp, **merged_kwargs))
    return flattened

陈正乐 committed
52 53 54 55

def append(documents=None, sentence_size: int = SENTENCE_SIZE):  # keep document-structure info; mind the hash handling
    """Merge consecutive documents whose combined length fits in *sentence_size*.

    Two neighbours are merged only when they also carry the same "next_hash"
    metadata value (missing hash counts as ""). Returns a new list; merged
    Document objects are mutated in place.

    Fix: the original indexed documents[0] unconditionally, raising IndexError
    for an empty (or defaulted None) input; now returns [] instead.
    """
    if not documents:
        return []
    effect_documents = []
    last_doc = documents[0]
    for doc in documents[1:]:
        # dict.get replaces the original "key not in metadata" double lookup
        last_hash = last_doc.metadata.get("next_hash", "")
        doc_hash = doc.metadata.get("next_hash", "")
        if len(last_doc.page_content) + len(doc.page_content) <= sentence_size and last_hash == doc_hash:
            last_doc.page_content = last_doc.page_content + doc.page_content
        else:
            effect_documents.append(last_doc)
            last_doc = doc
    effect_documents.append(last_doc)
    return effect_documents

陈正乐 committed
70 71 72 73

def split(documents=None, sentence_size: int = SENTENCE_SIZE):  # keep document-structure info; mind the hash handling
    """Split any document longer than *sentence_size* into smaller Documents.

    Split points are sentence ends ("。") and line breaks. Each chunk inherits
    a deep copy of the parent's metadata, and every flushed chunk records the
    text of the chunk that follows it under metadata["next_doc"]. Documents
    already short enough are passed through unchanged.
    """
    if documents is None:
        documents = []
    effect_documents = []
    for doc in documents:
        if len(doc.page_content) > sentence_size:
            words_list = re.split(r'·-·', doc.page_content.replace("。", "。·-·").replace("\n", "\n·-·"))  # insert separator tokens, then split on them
            document = Document(page_content="", metadata=copy.deepcopy(doc.metadata))
            first = True
            for word in words_list:
                if len(document.page_content) + len(word) < sentence_size:
                    # still room: keep accumulating into the current chunk
                    document.page_content += word
                else:
                    # flush the accumulated chunk only if it has visible characters
                    if len(document.page_content.replace(" ", "").replace("\n", "")) > 0:
                        if first:
                            first = False
                        else:
                            # link the previously flushed chunk to this one
                            effect_documents[-1].metadata["next_doc"] = document.page_content
                        effect_documents.append(document)
                    # start a fresh chunk beginning with the overflowing piece
                    document = Document(page_content=word, metadata=copy.deepcopy(doc.metadata))
            # flush the trailing chunk, same visible-characters rule as above
            if len(document.page_content.replace(" ", "").replace("\n", "")) > 0:
                if first:
                    pass
                else:
                    effect_documents[-1].metadata["next_doc"] = document.page_content
                effect_documents.append(document)
        else:
            effect_documents.append(doc)
    return effect_documents

陈正乐 committed
102 103 104

def load_file(filepath, sentence_size=SENTENCE_SIZE, using_zh_title_enhance=ZH_TITLE_ENHANCE, mode: str = None,
              **kwargs):
    """Load one file with a per-extension loader/splitter combination.

    Optionally runs Chinese title enhancement on the result, then dumps a
    debugging trace next to the source file via write_check_file.
    """
    print("load_file", filepath)
    suffix = filepath.lower()
    if suffix.endswith(".md"):
        # markdown: element mode, no sentence-level splitting
        docs = UnstructuredFileLoader(filepath, mode=mode or "elements", **kwargs).load()
    elif suffix.endswith(".txt"):
        docs = TextLoader(filepath, autodetect_encoding=True, **kwargs).load_and_split(
            ChineseTextSplitter(pdf=False, sentence_size=sentence_size))
    elif suffix.endswith(".csv"):
        docs = CSVLoader(filepath, **kwargs).load()
    elif suffix.endswith(".pdf"):
        docs = UnstructuredPDFLoader(filepath, mode=mode or "elements", **kwargs).load_and_split(
            ChineseTextSplitter(pdf=True, sentence_size=sentence_size))
    elif suffix.endswith(".docx"):
        docs = UnstructuredWordDocumentLoader(filepath, mode=mode or "elements", **kwargs).load_and_split(
            ChineseTextSplitter(pdf=False, sentence_size=sentence_size))
    else:
        docs = UnstructuredFileLoader(filepath, mode=mode or "elements", **kwargs).load_and_split(
            text_splitter=ChineseTextSplitter(pdf=False, sentence_size=sentence_size))
    if using_zh_title_enhance:
        docs = zh_title_enhance(docs)
    write_check_file(filepath, docs)
    return docs

陈正乐 committed
133

134 135 136 137 138 139 140 141 142 143 144 145
def write_check_file(filepath, docs):
    """Append a human-readable dump of *docs* to <dir(filepath)>/tmp_files/load_file.txt.

    Serves as a debugging trace of what each load produced. Creates the
    tmp_files folder when missing.
    """
    folder_path = os.path.join(os.path.dirname(filepath), "tmp_files")
    # exist_ok avoids the exists()/makedirs() race of the original
    os.makedirs(folder_path, exist_ok=True)
    fp = os.path.join(folder_path, 'load_file.txt')
    with open(fp, 'a+', encoding='utf-8') as fout:
        fout.write("filepath=%s,len=%s" % (filepath, len(docs)))
        fout.write('\n')
        for i in docs:
            fout.write(str(i))
            fout.write('\n')
        # (removed the redundant fout.close(): the with-block closes the file)
陈正乐 committed
146 147


148 149 150 151 152 153 154 155
def get_files_in_directory(directory):
    """Return the paths of all files under *directory*, walking it recursively."""
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
    ]

陈正乐 committed
156 157 158

# 自定义pdf load部分
def __checkV(strings: str):
    """Heuristic filter: reject text made of many short lines.

    Returns False when the text has more than 3 lines while averaging fewer
    than 15 non-space characters per line (likely a table/layout fragment);
    otherwise True.
    """
    line_count = len(strings.splitlines())
    if line_count <= 3:
        return True
    avg_chars = len(strings.replace(" ", "")) / line_count
    return avg_chars >= 15
陈正乐 committed
163 164 165 166 167 168 169 170 171 172 173 174


def __isTitle(strings: str):
    """A title is a single non-empty line that ends with a newline."""
    if not strings.endswith("\n"):
        return False
    return len(strings.splitlines()) == 1 and len(strings) > 0


def __appendPara(strings: str):
    """Join wrapped lines into paragraphs, keeping sentence-final line breaks.

    Line breaks preceded by ".", "。", "?" or "?" are protected with marker
    tokens, every other newline is removed, then the markers are restored.

    Fix: the original restored "^-^" (。) BEFORE the question-mark markers, so
    "?^-^"/"?^-^" were corrupted into "?。\n"/"?。\n" and the final restores
    never matched. The longer markers must be restored first.
    """
    return strings.replace(".\n", "^_^").replace("。\n", "^-^").replace("?\n", "?^-^").replace("?\n", "?^-^").replace(
        "\n", "").replace("^_^", ".\n").replace("?^-^", "?\n").replace("?^-^", "?\n").replace("^-^", "。\n")


def __check_fs_ff(line_ff_fs_s, fs, ff):  # if this line has text in the same font/size as the previous line, return those; otherwise default to the longest run's font and size
    """Pick the representative (font-size, font-family) for one rendered line.

    line_ff_fs_s: list of tuples (size_matches, family_matches, text_len),
        the first two being regex findall results from a span's style attribute.
    fs / ff: the previous line's font-size and font-family; when that exact
        pair also appears in this line, it wins so that runs stay together.
    Returns (int font_size, font_family or None).
    """
    re_fs = line_ff_fs_s[-1][0][-1]  # fallback: last span's last size match
    re_ff = line_ff_fs_s[-1][1][-1] if line_ff_fs_s[-1][1] else None
    max_len = 0
    for ff_fs in line_ff_fs_s:  # find the font and size of the longest text run
        # NOTE(review): assumes every tuple has at least one size match; an
        # empty match list would make max() raise - confirm callers guarantee it
        c_max = max(list(map(int, ff_fs[0])))
        if max_len < ff_fs[2] or (max_len == ff_fs[2] and c_max > int(re_fs)):
            max_len = ff_fs[2]
            re_fs = c_max
            re_ff = ff_fs[1][-1] if ff_fs[1] else None
    if fs:
        # prefer continuity with the previous line's (fs, ff) when present here
        for ff_fs in line_ff_fs_s:
            if str(fs) in ff_fs[0] and ff in ff_fs[1]:
                re_fs = fs
                re_ff = ff
                break
    return int(re_fs), re_ff

192

陈正乐 committed
193 194
def append_document(snippets1: List[Document], title: str, content: str, callbacks, font_size, page_num, metadate,
                    need_append: bool = False):
    """Append a snippet built from title + content, or merge title into the last one.

    Any BaseCallback in *callbacks* whose filter(title, content) returns truthy
    vetoes the append entirely. With need_append and a non-empty list, *title*
    is concatenated onto the last snippet instead of creating a new Document.
    """
    for cb in callbacks or ():
        if isinstance(cb, BaseCallback) and cb.filter(title, content):
            return
    if need_append and snippets1:
        previous = snippets1.pop()
        merged = Document(page_content=previous.page_content + title, metadata=previous.metadata)
        snippets1.append(merged)
    else:
        # caller-supplied metadate entries override the two defaults
        doc_metadata = {"font-size": font_size, "page_number": page_num, **metadate}
        snippets1.append(Document(page_content=title + content, metadata=doc_metadata))

208 209 210 211 212

'''
    提取pdf文档,按标题和内容进行分割,文档的页码按标题所在页码为准
    分割后的文本按sentence_size值再次分割,分割的文本的页码均属于父文本的页码
'''
陈正乐 committed
213 214 215


def __pdf_loader(filepath: str, sentence_size: int = 0, metadata=None, callbacks=None):
    """Extract a pdf by rendering it to HTML and splitting on title/content.

    Each snippet's page number is the page its title appeared on; when
    sentence_size > 0 the snippets are re-split and the children keep the
    parent's page number. Works off PDFMinerPDFasHTMLLoader output parsed
    with BeautifulSoup, grouping consecutive divs by font size/family.
    """
    if not filepath.lower().endswith(".pdf"):
        raise ValueError("file is not pdf document")
    loader = PDFMinerPDFasHTMLLoader(filepath)
    documents = loader.load()
    soup = BeautifulSoup(documents[0].page_content, 'html.parser')
    content = soup.find_all('div')
    cur_fs = None  # current text font-size
    last_fs = None  # previous text font-size
    cur_ff = None  # current text font-family
    cur_text = ''
    fs_increasing = False  # next line's font grew: treat as a title, split here
    last_text = ''
    last_page_num = 1  # previous page number; with page_split decides the current text's page
    page_num = 1  # initial page number
    page_change = False  # a page boundary was just crossed
    page_split = False  # whether a text split already happened on this page
    last_is_title = False  # whether the previous text was a title
    snippets: List[Document] = []

    filename = os.path.basename(filepath)
    if metadata:
        metadata.update({'source': filepath, 'filename': filename, 'filetype': 'application/pdf'})
    else:
        metadata = {'source': filepath, 'filename': filename, 'filetype': 'application/pdf'}
    for c in content:
        divs = c.get('style')
        if re.match(r"^(Page|page)", c.text):  # detect the current page's page number
            match = re.match(r"^(page|Page)\s+(\d+)", c.text)
            if match:
                if page_split:  # a split happened: advance; else keep the text's starting page
                    last_page_num = page_num
                # NOTE(review): group(2) is a str, so page_num becomes a string
                # from here on while it started as int 1 - confirm downstream
                page_num = match.group(2)
                if len(last_text) + len(cur_text) == 0:  # page turned with empty buffers: previous page := current
                    last_page_num = page_num
                page_change = True
                page_split = False
            continue
        # skip invisible divs (writing-mode False), pure digits, and "第 N 页" footers
        if re.findall('writing-mode:(.*?);', divs) == ['False'] or re.match(r'^[0-9\s\n]+$', c.text) or re.match(
                r"^第\s+\d+\s+页$", c.text):
            continue
        if len(c.text.replace("\n", "").replace(" ", "")) <= 1:  # drop lines with at most one effective character
            continue
        sps = c.find_all('span')
        if not sps:
            continue
        line_ff_fs_s = []  # spans with more than one effective character
        line_ff_fs_s2 = []  # spans with exactly one effective character
        for sp in sps:  # a line may contain several differently styled spans
            sp_len = len(sp.text.replace("\n", "").replace(" ", ""))
            if sp_len > 0:
                st = sp.get('style')
                if st:
                    ff_fs = (re.findall('font-size:(\d+)px', st), re.findall('font-family:(.*?);', st),
                             len(sp.text.replace("\n", "").replace(" ", "")))
                    if sp_len == 1:  # separate out single-character spans
                        line_ff_fs_s2.append(ff_fs)
                    else:
                        line_ff_fs_s.append(ff_fs)

        if len(line_ff_fs_s) == 0:  # no multi-character spans: fall back to single-character ones
            if len(line_ff_fs_s2) > 0:
                line_ff_fs_s = line_ff_fs_s2
            else:
                if len(c.text) > 0:
                    page_change = False
                continue
        fs, ff = __check_fs_ff(line_ff_fs_s, cur_fs, cur_ff)
        if not cur_ff:
            cur_ff = ff
        if not cur_fs:
            cur_fs = fs

        if abs(fs - cur_fs) <= 1 and ff == cur_ff:  # neither family nor size changed (1px tolerance)
            cur_text += c.text
            cur_fs = fs
            page_change = False
            if len(cur_text.splitlines()) > 3:  # after several lines the fs_increasing flag lapses
                fs_increasing = False
        else:
            # page turned and font shrank: very likely a header line, skip
            # c.text (may occasionally drop a real line of text)
            if page_change and cur_fs > fs + 1:
                page_change = False
                continue
            if last_is_title:  # the previous text was a title
                if __isTitle(cur_text) or fs_increasing:  # consecutive titles, or flagged by a size increase
                    last_text = last_text + cur_text
                    last_is_title = True
                    fs_increasing = False
                else:
                    append_document(snippets, last_text, __appendPara(cur_text), callbacks, cur_fs,
                                    page_num if page_split else last_page_num, metadata)
                    page_split = True
                    last_text = ''
                    last_is_title = False
                    fs_increasing = int(fs) > int(cur_fs)  # font grew
            else:
                if len(last_text) > 0 and __checkV(last_text):  # filter out low-density text
                    # merge text spanning two pages, or with very few lines,
                    # into the previous snippet
                    append_document(snippets, __appendPara(last_text), "", callbacks, last_fs,
                                    page_num if page_split else last_page_num, metadata,
                                    need_append=len(last_text.splitlines()) <= 2 or page_change)
                    page_split = True
                last_text = cur_text
                last_is_title = __isTitle(last_text) or fs_increasing
                fs_increasing = int(fs) > int(cur_fs)
            if page_split:
                last_page_num = page_num
            last_fs = cur_fs
            cur_fs = fs
            cur_ff = ff
            cur_text = c.text
            page_change = False
    # flush whatever remains in the buffers as the final snippet
    append_document(snippets, last_text, __appendPara(cur_text), callbacks, cur_fs,
                    page_num if page_split else last_page_num, metadata)
    if sentence_size > 0:
        return split(snippets, sentence_size)
    return snippets