File size: 3,546 Bytes
91975ca
 
0c277f0
 
 
 
 
 
 
 
 
91975ca
0c277f0
91975ca
 
 
0c277f0
 
 
 
 
 
91975ca
c2c2862
 
 
 
 
91975ca
 
c2c2862
 
 
 
 
91975ca
 
 
0c277f0
91975ca
0c277f0
91975ca
0c277f0
91975ca
 
 
 
c2c2862
 
 
91975ca
c2c2862
 
 
91975ca
c2c2862
 
 
 
0c277f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91975ca
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import logging
import os
from typing import List, Literal, Optional

import streamlit as st
from annotated_text import annotation
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, TfidfRetriever
from haystack.nodes import (
    DocxToTextConverter,
    PDFToTextConverter,
    PreProcessor,
    TextConverter,
)
from haystack.pipelines import ExtractiveQAPipeline
from haystack.schema import Answer
from haystack.schema import Document
from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
from markdown import markdown
from PIL import Image

# Set once at import time, before any tokenizer is created.
# NOTE(review): presumably silences the Hugging Face tokenizers
# fork/parallelism warning -- confirm this is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"



#def load_and_write_data(document_store):
#    doc_dir = './article_txt_got'
#    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
#    document_store.write_documents(docs)

#pipeline = start_haystack()
def load_document(
    file_path: str,
    file_name: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> "List[Document]":
    """
    Convert a single .pdf, .txt or .docx file into a Haystack Document.

    The converter is chosen from the extension of ``file_name`` (matched
    case-insensitively), while the content is read from ``file_path``.
    Only the extracted text is kept; the file name is stored as the
    ``name`` entry of the document's metadata. No fallback extractor
    (e.g. pdfplumber) is applied here if the Haystack conversion fails.

    Args:
        file_path: Location of the file to convert.
        file_name: Name of the file; decides which converter is used and
            is stored in the returned document's metadata.
        encoding: Passed through unchanged to the converter.
        id_hash_keys: Passed through unchanged to the converter and to
            the ``Document`` constructor.

    Returns:
        A list holding exactly one ``Document``.

    Raises:
        ValueError: If ``file_name`` does not end in .pdf, .txt or .docx.
    """
    lowered_name = file_name.lower()

    # Pick the converter up front so an unsupported file fails with a
    # clear error instead of an unbound ``converter`` further down.
    if lowered_name.endswith(".pdf"):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif lowered_name.endswith(".txt"):
        converter = TextConverter()
    elif lowered_name.endswith(".docx"):
        converter = DocxToTextConverter()
    else:
        raise ValueError(
            f"Unsupported file type for {file_name!r}; "
            "expected a .pdf, .txt or .docx file"
        )

    logging.getLogger(__name__).info("Converting %s", file_name)

    # The converter's result is indexed with [0]: exactly one converted
    # document is consumed per file.
    converted = converter.convert(
        file_path=file_path,
        meta=None,
        encoding=encoding,
        id_hash_keys=id_hash_keys,
    )[0]

    # Re-wrap the text so the only metadata carried forward is the name.
    return [
        Document(
            content=converted.content,
            meta={"name": file_name},
            id_hash_keys=id_hash_keys,
        )
    ]
    
 def preprocessing(document, 
                  split_by: Literal["sentence", "word"] = 'sentence',
                  split_length:int = 3):

    """
    takes in haystack document object and splits it into synthetically generated paragraphs and applies simple cleaning.
    Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and 
    list that contains all text joined together.
    """    
    if split_by == 'sentence':
      split_respect_sentence_boundary = False
      split_overlap=0
    else:
      split_respect_sentence_boundary = True
      split_overlap= 20
    
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by=split_by,
        split_length=split_length,
        split_respect_sentence_boundary= split_respect_sentence_boundary,
        split_overlap=split_overlap
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    print("\n your document has been splitted to", len(docs_processed), "paragraphs")
    # logger.info("document has been splitted to {}".format(len(docs_processed)))
    
    # create dataframe of text and list of all text
    #df = pd.DataFrame(docs_processed)
    #all_text = " ".join(df.content.to_list())
    #par_list = df.content.to_list()

    return docs_processed #, df, all_text, par_list