import re
import logging

from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import (
    FARMReader,
    TransformersReader,
    BM25Retriever,
    TextConverter,
    PDFToTextConverter,
    PreProcessor,
)
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import launch_es, print_answers, convert_files_to_docs, fetch_archive_from_http

from Reader import PdfReader, ExtractedText
launch_es()  # Start a local Elasticsearch instance (requires Docker)
# Install the latest release of Haystack in your own environment:
#! pip install farm-haystack

# Or install the latest main branch of Haystack:
# !pip install --upgrade pip
# !pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]

# For Colab / Linux-based machines:
# !wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz
# !tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin

# For macOS machines:
# !wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-mac-4.03.tar.gz
# !tar -xvf xpdf-tools-mac-4.03.tar.gz && sudo cp xpdf-tools-mac-4.03/bin64/pdftotext /usr/local/bin

# Run this script from the root of the project.
# In Colab / non-Docker environments: start Elasticsearch from source
# ! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
# ! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
# ! chown -R daemon:daemon elasticsearch-7.9.2
# import os
# from subprocess import Popen, PIPE, STDOUT
# es_server = Popen(
#     ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # run as daemon
# )
# # wait until ES has started
# ! sleep 30
logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)
class Connection:
    def __init__(self, host="localhost", port=9200, username="", password="", index="document"):
        """
        host: Elasticsearch host. If no host is provided, the default host "localhost" is used.
        port: Elasticsearch port. If no port is provided, the default port 9200 is used.
        username: Elasticsearch username. If no username is provided, no username is used.
        password: Elasticsearch password. If no password is provided, no password is used.
        index: Elasticsearch index. If no index is provided, the default index "document" is used.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.index = index

    def get_connection(self):
        document_store = ElasticsearchDocumentStore(
            host=self.host,
            port=self.port,
            username=self.username,
            password=self.password,
            index=self.index,
        )
        return document_store
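
# A minimal usage sketch for Connection (an assumption for illustration: it
# presumes Elasticsearch is reachable on localhost:9200 with no authentication):
#
#   document_store = Connection().get_connection()
#   print(document_store.get_document_count())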
class QAHaystack:
    def __init__(self, filename):
        self.filename = filename

    def preprocessing(self, data):
        """
        Preprocess the raw text: lowercase it and collapse runs of spaces and
        tabs, keeping newlines so the text can later be split into paragraphs.
        """
        # Haystack's converters can also extract text from .txt and .pdf files.
        # The results below were never used further, so they are kept only as a
        # commented reference:
        # converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"])
        # doc_txt = converter.convert(file_path=ExtractedText(self.filename, 'data.txt').save(4, 6), meta=None)[0]
        # converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"])
        # doc_pdf = converter.convert(file_path="data/tutorial8/manibook.pdf", meta=None)[0]
        preprocess_text = data.lower()  # lowercase
        preprocess_text = re.sub(r'[ \t]+', ' ', preprocess_text)  # collapse extra spaces, keep newlines
        return preprocess_text
    def convert_to_document(self, data):
        """
        Write the preprocessed data to a text file. This is required since the
        haystack library expects the data in a file so that it can then be
        converted to a document.
        """
        data = self.preprocessing(data)
        with open(self.filename, 'w') as f:
            f.write(data)

        # Read the data back from the text file and split it into paragraphs.
        with open(self.filename, 'r') as f:
            data = f.read()
        data = data.split("\n")

        """
        DocumentStores expect Documents in dictionary form, as below. They are
        loaded using DocumentStore.write_documents():
        dicts = [
            {
                'content': DOCUMENT_TEXT_HERE,
                'meta': {'name': DOCUMENT_NAME, ...}
            }, ...
        ]
        (Optionally, you can add further key-value pairs here; they will be
        indexed as fields in Elasticsearch and can be used later for filtering
        or shown in the responses of the Pipeline.)
        """
        data_json = [
            {
                'content': paragraph,
                'meta': {
                    'name': self.filename
                }
            }
            for paragraph in data
        ]
        document_store = Connection().get_connection()
        document_store.write_documents(data_json)
        return document_store
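
# A minimal usage sketch for QAHaystack on its own ('notes.txt' and the text
# are hypothetical examples; assumes Elasticsearch is running locally):
#
#   store = QAHaystack("notes.txt").convert_to_document("Some raw text.\nAnother paragraph.")
#   print(store.get_document_count())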
class Pipeline:
    def __init__(self, filename, retriever=BM25Retriever, reader=FARMReader):
        self.reader = reader
        self.retriever = retriever
        self.filename = filename

    def get_prediction(self, data, query):
        """
        Retrievers help narrow down the scope for the Reader to smaller units
        of text where a given question could be answered. They use simple but
        fast algorithms. Here we use Elasticsearch's default BM25 algorithm;
        the other retrievers are worth checking out as well.
        """
        retriever = self.retriever(document_store=QAHaystack(self.filename).convert_to_document(data))

        """
        Readers scan the texts returned by retrievers in detail and extract the
        k best answers. They are based on powerful but slower deep learning
        models. Haystack currently supports Readers based on the FARM and
        Transformers frameworks.
        """
        reader = self.reader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

        """
        With a Haystack Pipeline you can stick building blocks together into a
        search pipeline. Under the hood, Pipelines are Directed Acyclic Graphs
        (DAGs) that you can easily customize for your own use cases. To speed
        things up, Haystack also comes with a few predefined Pipelines. One of
        them is the ExtractiveQAPipeline, which combines a Retriever and a
        Reader to answer our questions.
        """
        pipe = ExtractiveQAPipeline(reader, retriever)

        # Run the pipeline to get a prediction for the query.
        prediction = pipe.run(query=query, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}})
        return prediction
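

if __name__ == "__main__":
    # A minimal end-to-end sketch. 'sample.txt', the raw text, and the question
    # are hypothetical examples; this assumes Elasticsearch is running locally
    # (see launch_es above) and that the roberta model can be downloaded.
    raw_text = "Paris is the capital of France.\nIt is known for the Eiffel Tower."
    prediction = Pipeline("sample.txt").get_prediction(raw_text, query="What is the capital of France?")
    print_answers(prediction, details="minimum")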