import logging
import os
from typing import List, Literal, Optional

import streamlit as st
from annotated_text import annotation
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import (
    DocxToTextConverter,
    FARMReader,
    PDFToTextConverter,
    PreProcessor,
    TextConverter,
    TfidfRetriever,
)
from haystack.pipelines import ExtractiveQAPipeline
from haystack.schema import Answer, Document
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http
from markdown import markdown
from PIL import Image

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# def load_and_write_data(document_store):
#     doc_dir = './article_txt_got'
#     docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
#     document_store.write_documents(docs)

# pipeline = start_haystack()


def load_document(
    file_path: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    """
    Takes docx, txt, and pdf files as input and extracts the text as well as the
    filename as metadata. Image-based PDFs are not handled in this notebook.
    Returns a list of haystack.schema.Document objects.
    """
    file_name = os.path.basename(file_path)
    if file_name.endswith(".pdf"):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    elif file_name.endswith(".txt"):
        converter = TextConverter()
    elif file_name.endswith(".docx"):
        converter = DocxToTextConverter()
    else:
        # guard against unsupported extensions; otherwise `converter` would be unbound
        raise ValueError("Unsupported file type: '{}'".format(file_name))

    documents = []
    # logger.info("Converting {}".format(file_name))
    print("Converting '{}'".format(file_name))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter each return a list
    # containing a single Document
    document = converter.convert(
        file_path=file_path, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
    )[0]
    text = document.content
    # create the Haystack Document from the extracted 'content', passing the
    # filename along as metadata
    documents.append(Document(content=text, meta={"name": file_name}, id_hash_keys=id_hash_keys))
    return documents


def preprocessing(
    document: List[Document],
    split_by: Literal["sentence", "word"] = "sentence",
    split_length: int = 3,
) -> List[Document]:
    """
    Takes a list of Haystack Document objects, splits each into synthetically
    generated paragraphs, and applies simple cleaning.
    Returns a cleaned list of Haystack Document objects, one paragraph per object.
    (The commented-out lines below would additionally build a pandas DataFrame
    and a list of all text joined together.)
    """
    # PreProcessor only allows split_respect_sentence_boundary=True when splitting by word
    if split_by == "sentence":
        split_respect_sentence_boundary = False
        split_overlap = 0
    else:
        split_respect_sentence_boundary = True
        split_overlap = 20

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by=split_by,
        split_length=split_length,
        split_respect_sentence_boundary=split_respect_sentence_boundary,
        split_overlap=split_overlap,
    )

    # accumulate results instead of overwriting them on every iteration
    docs_processed = []
    for doc in document:
        docs_processed.extend(preprocessor.process([doc]))
    for item in docs_processed:
        # `basic` is a text-cleaning helper expected to be defined elsewhere in the app
        item.content = basic(item.content)

    print("\nyour document has been split into", len(docs_processed), "paragraphs")
    # logger.info("document has been split into {}".format(len(docs_processed)))

    # create dataframe of text and list of all text
    # df = pd.DataFrame(docs_processed)
    # all_text = " ".join(df.content.to_list())
    # par_list = df.content.to_list()

    return docs_processed  # , df, all_text, par_list
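

# ------------------------------------------------------------------------------
# Usage sketch: chains the two helpers above into the extractive QA pipeline that
# the imports suggest. This is a minimal sketch under assumptions: 'example.pdf'
# is a hypothetical local file, and `basic` (the cleaning helper called in
# preprocessing) must be defined elsewhere in the app.
if __name__ == "__main__":
    docs = load_document("example.pdf")  # hypothetical input file
    paragraphs = preprocessing(docs, split_by="word", split_length=100)

    # index the paragraphs, then wire a TF-IDF retriever to a FARM reader
    document_store = InMemoryDocumentStore()
    document_store.write_documents(paragraphs)
    retriever = TfidfRetriever(document_store=document_store)
    reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
    pipeline = ExtractiveQAPipeline(reader, retriever)

    prediction = pipeline.run(
        query="What is this document about?",
        params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}},
    )
    for answer in prediction["answers"]:
        print(answer.answer, "| score:", answer.score)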