Spaces:
Runtime error
Runtime error
Commit
·
a5ee254
1
Parent(s):
970a7e9
Update app.py
Browse files
app.py
CHANGED
@@ -350,47 +350,90 @@ def load_single_example_text(
|
|
350 |
return text
|
351 |
|
352 |
|
353 |
-
def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
"""
|
355 |
-
|
356 |
|
357 |
-
:param
|
358 |
:param int max_pages: the maximum number of pages to load from a PDF
|
359 |
:param bool lower: whether to lowercase the text
|
360 |
-
:return str: the text of the
|
361 |
"""
|
362 |
global ocr_model
|
363 |
logger = logging.getLogger(__name__)
|
364 |
-
#
|
365 |
-
if isinstance(file_obj, list):
|
366 |
-
file_obj = file_obj[0]
|
367 |
-
file_path = Path(file_obj.name)
|
368 |
-
try:
|
369 |
-
logger.info(f"Loading file:\t{file_path}")
|
370 |
-
if file_path.suffix in [".txt", ".md"]:
|
371 |
-
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
372 |
-
raw_text = f.read()
|
373 |
-
text = clean(raw_text, lower=lower)
|
374 |
-
elif file_path.suffix == ".pdf":
|
375 |
-
logger.info(f"loading a PDF file: {file_path.name}")
|
376 |
-
max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
|
377 |
-
logger.info(f"max_pages is: {max_pages}. Starting conversion...")
|
378 |
-
conversion_stats = convert_PDF_to_Text(
|
379 |
-
file_path,
|
380 |
-
ocr_model=ocr_model,
|
381 |
-
max_pages=max_pages,
|
382 |
-
)
|
383 |
-
text = conversion_stats["converted_text"]
|
384 |
-
else:
|
385 |
-
logger.error(f"Unknown file type:\t{file_path.suffix}")
|
386 |
-
text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."
|
387 |
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
|
|
|
393 |
|
|
|
|
|
|
|
|
|
|
|
394 |
def parse_args():
|
395 |
"""arguments for the command line interface"""
|
396 |
parser = argparse.ArgumentParser(
|
|
|
350 |
return text
|
351 |
|
352 |
|
353 |
+
# def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
|
354 |
+
# """
|
355 |
+
# load_uploaded_file - loads a file uploaded by the user
|
356 |
+
|
357 |
+
# :param file_obj (POTENTIALLY list): Gradio file object inside a list
|
358 |
+
# :param int max_pages: the maximum number of pages to load from a PDF
|
359 |
+
# :param bool lower: whether to lowercase the text
|
360 |
+
# :return str: the text of the file
|
361 |
+
# """
|
362 |
+
# global ocr_model
|
363 |
+
# logger = logging.getLogger(__name__)
|
364 |
+
# # check if mysterious file object is a list
|
365 |
+
# if isinstance(file_obj, list):
|
366 |
+
# file_obj = file_obj[0]
|
367 |
+
# file_path = Path(file_obj.name)
|
368 |
+
# try:
|
369 |
+
# logger.info(f"Loading file:\t{file_path}")
|
370 |
+
# if file_path.suffix in [".txt", ".md"]:
|
371 |
+
# with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
372 |
+
# raw_text = f.read()
|
373 |
+
# text = clean(raw_text, lower=lower)
|
374 |
+
# elif file_path.suffix == ".pdf":
|
375 |
+
# logger.info(f"loading a PDF file: {file_path.name}")
|
376 |
+
# max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
|
377 |
+
# logger.info(f"max_pages is: {max_pages}. Starting conversion...")
|
378 |
+
# conversion_stats = convert_PDF_to_Text(
|
379 |
+
# file_path,
|
380 |
+
# ocr_model=ocr_model,
|
381 |
+
# max_pages=max_pages,
|
382 |
+
# )
|
383 |
+
# text = conversion_stats["converted_text"]
|
384 |
+
# else:
|
385 |
+
# logger.error(f"Unknown file type:\t{file_path.suffix}")
|
386 |
+
# text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."
|
387 |
+
|
388 |
+
# return text
|
389 |
+
# except Exception as e:
|
390 |
+
# logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
|
391 |
+
# return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
|
392 |
+
|
393 |
+
|
394 |
+
def load_uploaded_files(file_objs, max_pages: int = 20, lower: bool = False) -> str:
    """
    load_uploaded_files - loads one or more files uploaded by the user and concatenates their contents

    :param file_objs: list of Gradio file objects (anything exposing the file path as ``.name``)
        or plain path strings; a single object is wrapped into a list, ``None`` means "no files"
    :param int max_pages: the maximum number of pages to load from a PDF
        (the ``APP_OCR_MAX_PAGES`` environment variable takes precedence when set)
    :param bool lower: whether to lowercase the text (passed to ``clean`` for TXT/MD files)
    :return str: the text of all the files joined by newlines ("" when no files are given),
        or an error message if any file could not be read
    """
    global ocr_model
    logger = logging.getLogger(__name__)

    # normalize the input so the loop below always sees a sequence of files
    if file_objs is None:
        file_objs = []
    elif not isinstance(file_objs, (list, tuple)):
        file_objs = [file_objs]

    texts = []  # one entry per file, joined once at the end
    file_path = None  # kept outside the try so the error log can name the failing file
    try:
        for file_obj in file_objs:
            # file objects carry their path in `.name`; plain strings are used as the path itself
            file_path = Path(getattr(file_obj, "name", file_obj))
            logger.info(f"Loading file:\t{file_path}")

            # compare case-insensitively so ".PDF" / ".TXT" uploads are accepted too
            suffix = file_path.suffix.lower()
            if suffix in (".txt", ".md"):
                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    raw_text = f.read()
                text = clean(raw_text, lower=lower)
            elif suffix == ".pdf":
                logger.info(f"loading a PDF file: {file_path.name}")
                # use a separate local so the caller-supplied parameter is not rebound
                page_limit = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
                logger.info(f"max_pages is: {page_limit}. Starting conversion...")
                conversion_stats = convert_PDF_to_Text(
                    file_path,
                    ocr_model=ocr_model,
                    max_pages=page_limit,
                )
                text = conversion_stats["converted_text"]
            else:
                logger.error(f"Unknown file type:\t{file_path.suffix}")
                text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."

            texts.append(text)

        # a newline between files keeps the last word of one file from fusing
        # with the first word of the next
        return "\n".join(texts)
    except Exception as e:
        # boundary handler: the UI expects a string back, so report instead of raising
        logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
        return "Error: Could not read one or more files. Make sure they are PDF, TXT, or MD files."
|
436 |
+
|
437 |
def parse_args():
|
438 |
"""arguments for the command line interface"""
|
439 |
parser = argparse.ArgumentParser(
|