Spaces:
Runtime error
Runtime error
Update scripts/process.py
Browse files- scripts/process.py +20 -18
scripts/process.py
CHANGED
|
@@ -20,20 +20,21 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
|
|
| 20 |
# document_store.write_documents(docs)
|
| 21 |
|
| 22 |
#pipeline = start_haystack()
|
| 23 |
-
|
| 24 |
def load_document(
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
| 29 |
|
| 30 |
"""
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
Returns a list of type haystack.schema.Document
|
| 35 |
"""
|
| 36 |
-
file_name = str.split(file_path,'/')[-1]
|
| 37 |
|
| 38 |
if file_name.endswith('.pdf'):
|
| 39 |
converter = PDFToTextConverter(remove_numeric_tables=True)
|
|
@@ -44,17 +45,18 @@ def load_document(
|
|
| 44 |
|
| 45 |
|
| 46 |
documents = []
|
| 47 |
-
|
| 48 |
-
#
|
| 49 |
-
|
| 50 |
-
# PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
|
| 51 |
document = converter.convert(
|
| 52 |
-
file_path=file_path, meta=None,
|
| 53 |
-
|
|
|
|
| 54 |
text = document.content
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
return documents
|
| 59 |
|
| 60 |
def preprocessing(document,
|
|
|
|
| 20 |
# document_store.write_documents(docs)
|
| 21 |
|
| 22 |
#pipeline = start_haystack()
|
|
|
|
| 23 |
def load_document(
|
| 24 |
+
file_path: str,
|
| 25 |
+
file_name,
|
| 26 |
+
encoding: Optional[str] = None,
|
| 27 |
+
id_hash_keys: Optional[List[str]] = None,
|
| 28 |
+
) -> List[Document]:
|
| 29 |
|
| 30 |
"""
|
| 31 |
+
takes docx, txt and pdf files as input and \
|
| 32 |
+
extracts text as well as the filename as metadata. \
|
| 33 |
+
Since haystack does not take care of all pdf files, \
|
| 34 |
+
pdfplumber is attached to the pipeline in case the pdf \
|
| 35 |
+
extraction fails via Haystack.
|
| 36 |
Returns a list of type haystack.schema.Document
|
| 37 |
"""
|
|
|
|
| 38 |
|
| 39 |
if file_name.endswith('.pdf'):
|
| 40 |
converter = PDFToTextConverter(remove_numeric_tables=True)
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
documents = []
|
| 48 |
+
logger.info("Converting {}".format(file_name))
|
| 49 |
+
# PDFToTextConverter, TextConverter, and DocxToTextConverter
|
| 50 |
+
# return a list containing a single Document
|
|
|
|
| 51 |
document = converter.convert(
|
| 52 |
+
file_path=file_path, meta=None,
|
| 53 |
+
encoding=encoding, id_hash_keys=id_hash_keys
|
| 54 |
+
)[0]
|
| 55 |
text = document.content
|
| 56 |
+
documents.append(Document(content=text,
|
| 57 |
+
meta={"name": file_name},
|
| 58 |
+
id_hash_keys=id_hash_keys))
|
| 59 |
+
|
| 60 |
return documents
|
| 61 |
|
| 62 |
def preprocessing(document,
|