MeetJivani committed on
Commit
a5ee254
·
1 Parent(s): 970a7e9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -31
app.py CHANGED
@@ -350,47 +350,90 @@ def load_single_example_text(
350
  return text
351
 
352
 
353
def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
    """
    load_uploaded_file - loads a file uploaded by the user

    Text files (.txt, .md) are read as UTF-8 (undecodable bytes are ignored)
    and passed through ``clean``. PDF files are converted with
    ``convert_PDF_to_Text`` using the module-level ``ocr_model``. The suffix
    check is case-insensitive. Problems are reported through the returned
    string instead of being raised.

    :param file_obj (POTENTIALLY list): Gradio file object (anything exposing the
        file path as ``.name``), possibly inside a list; only the first list item is used
    :param int max_pages: the maximum number of pages to load from a PDF
        (the APP_OCR_MAX_PAGES environment variable takes precedence when set)
    :param bool lower: whether to lowercase the text (applied to .txt/.md files only)
    :return str: the text of the file, or an error message
    """
    global ocr_model
    logger = logging.getLogger(__name__)
    # check if mysterious file object is a list; an empty list means nothing was uploaded
    if isinstance(file_obj, list):
        file_obj = file_obj[0] if file_obj else None
    if file_obj is None:
        # previously this crashed with IndexError/AttributeError before reaching the try block
        logger.error("No file was provided")
        return "ERROR - no file provided. PDF, TXT, and MD are supported."
    file_path = Path(file_obj.name)
    try:
        logger.info(f"Loading file:\t{file_path}")
        # compare case-insensitively so that e.g. "REPORT.PDF" is accepted
        suffix = file_path.suffix.lower()
        if suffix in (".txt", ".md"):
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                raw_text = f.read()
            text = clean(raw_text, lower=lower)
        elif suffix == ".pdf":
            logger.info(f"loading a PDF file: {file_path.name}")
            max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
            logger.info(f"max_pages is: {max_pages}. Starting conversion...")
            conversion_stats = convert_PDF_to_Text(
                file_path,
                ocr_model=ocr_model,
                max_pages=max_pages,
            )
            text = conversion_stats["converted_text"]
        else:
            logger.error(f"Unknown file type:\t{file_path.suffix}")
            text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."

        return text
    except Exception as e:
        # boundary handler: the UI expects a string, so report the failure instead of raising
        logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
        return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
 
393
 
 
 
 
 
 
394
  def parse_args():
395
  """arguments for the command line interface"""
396
  parser = argparse.ArgumentParser(
 
350
  return text
351
 
352
 
353
+ # def load_uploaded_file(file_obj, max_pages: int = 20, lower: bool = False) -> str:
354
+ # """
355
+ # load_uploaded_file - loads a file uploaded by the user
356
+
357
+ # :param file_obj (POTENTIALLY list): Gradio file object inside a list
358
+ # :param int max_pages: the maximum number of pages to load from a PDF
359
+ # :param bool lower: whether to lowercase the text
360
+ # :return str: the text of the file
361
+ # """
362
+ # global ocr_model
363
+ # logger = logging.getLogger(__name__)
364
+ # # check if mysterious file object is a list
365
+ # if isinstance(file_obj, list):
366
+ # file_obj = file_obj[0]
367
+ # file_path = Path(file_obj.name)
368
+ # try:
369
+ # logger.info(f"Loading file:\t{file_path}")
370
+ # if file_path.suffix in [".txt", ".md"]:
371
+ # with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
372
+ # raw_text = f.read()
373
+ # text = clean(raw_text, lower=lower)
374
+ # elif file_path.suffix == ".pdf":
375
+ # logger.info(f"loading a PDF file: {file_path.name}")
376
+ # max_pages = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
377
+ # logger.info(f"max_pages is: {max_pages}. Starting conversion...")
378
+ # conversion_stats = convert_PDF_to_Text(
379
+ # file_path,
380
+ # ocr_model=ocr_model,
381
+ # max_pages=max_pages,
382
+ # )
383
+ # text = conversion_stats["converted_text"]
384
+ # else:
385
+ # logger.error(f"Unknown file type:\t{file_path.suffix}")
386
+ # text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."
387
+
388
+ # return text
389
+ # except Exception as e:
390
+ # logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
391
+ # return f"Error: Could not read file {file_path.name}. Make sure it is a PDF, TXT, or MD file."
392
+
393
+
394
def load_uploaded_files(file_objs, max_pages: int = 20, lower: bool = False) -> str:
    """
    load_uploaded_files - loads multiple files uploaded by the user and concatenates their contents

    Text files (.txt, .md) are read as UTF-8 (undecodable bytes are ignored)
    and passed through ``clean``. PDF files are converted with
    ``convert_PDF_to_Text`` using the module-level ``ocr_model``. The suffix
    check is case-insensitive. The per-file texts are joined with a blank line
    so that the end of one document does not run into the start of the next.
    If any file raises while loading, the whole call returns one generic
    error message.

    :param file_objs (list): List of Gradio file objects (anything exposing the
        file path as ``.name``); a single object or None is also accepted
    :param int max_pages: the maximum number of pages to load from a PDF
        (the APP_OCR_MAX_PAGES environment variable takes precedence when set)
    :param bool lower: whether to lowercase the text (applied to .txt/.md files only)
    :return str: the concatenated text of all the files, or an error message
    """
    global ocr_model
    logger = logging.getLogger(__name__)

    # normalise the input: nothing uploaded -> no files, lone object -> one-item list
    if file_objs is None:
        file_objs = []
    elif not isinstance(file_objs, (list, tuple)):
        file_objs = [file_objs]

    texts = []  # one entry per file, joined once at the end
    file_path = None  # kept outside the try so the error log can name the failing file
    try:
        for file_obj in file_objs:
            file_path = Path(file_obj.name)
            logger.info(f"Loading file:\t{file_path}")

            # compare case-insensitively so that e.g. "REPORT.PDF" is accepted
            suffix = file_path.suffix.lower()
            if suffix in (".txt", ".md"):
                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    raw_text = f.read()
                text = clean(raw_text, lower=lower)
            elif suffix == ".pdf":
                logger.info(f"loading a PDF file: {file_path.name}")
                # use a separate local so the max_pages parameter is not overwritten mid-loop
                page_limit = int(os.environ.get("APP_OCR_MAX_PAGES", max_pages))
                logger.info(f"max_pages is: {page_limit}. Starting conversion...")
                conversion_stats = convert_PDF_to_Text(
                    file_path,
                    ocr_model=ocr_model,
                    max_pages=page_limit,
                )
                text = conversion_stats["converted_text"]
            else:
                logger.error(f"Unknown file type:\t{file_path.suffix}")
                text = "ERROR - check file - unknown file type. PDF, TXT, and MD are supported."

            texts.append(text)

        return "\n\n".join(texts)
    except Exception as e:
        # boundary handler: the UI expects a string, so report the failure instead of raising
        logger.error(f"Trying to load file:\t{file_path},\nerror:\t{e}")
        return "Error: Could not read one or more files. Make sure they are PDF, TXT, or MD files."
436
+
437
  def parse_args():
438
  """arguments for the command line interface"""
439
  parser = argparse.ArgumentParser(