AdrienB134 committed on
Commit
3133ca1
·
verified ·
1 Parent(s): 9d1233a

Update rag_demo/preprocessing/pdf_conversion.py

Browse files
rag_demo/preprocessing/pdf_conversion.py CHANGED
@@ -3,7 +3,7 @@ from llama_index.core import SimpleDirectoryReader
3
  from uuid import uuid4
4
  from .base import Document
5
  from loguru import logger
6
-
7
  from dotenv import load_dotenv
8
 
9
  load_dotenv()
@@ -11,23 +11,27 @@ load_dotenv()
11
 
12
  # set up parser
13
  parser = LlamaParse(
14
- api_key="llx-TN6YSXvZdpG0qhJ7rVx9QFg5Zq298RXr7Id7XzXb5Wr4Rnpt",
15
  result_type="markdown", # "markdown" and "text" are available
16
  )
17
 
18
 
19
  def convert_pdf_to_text(filepaths: list[str]) -> Document:
20
- file_extractor = {".pdf": parser}
21
- # use SimpleDirectoryReader to parse our file
22
-
23
- documents = SimpleDirectoryReader(
24
- input_files=filepaths, file_extractor=file_extractor
25
- ).load_data()
26
-
27
- logger.info("Converted 1 documents")
28
-
29
- return Document(
30
- document_id=uuid4(),
31
- text=" ".join(document.text for document in documents),
32
- metadata={"filename": filepaths[0].split("/")[-1]},
33
- )
 
 
 
 
 
3
  from uuid import uuid4
4
  from .base import Document
5
  from loguru import logger
6
+ import os
7
  from dotenv import load_dotenv
8
 
9
  load_dotenv()
 
11
 
12
  # set up parser
13
  parser = LlamaParse(
14
+ api_key=os.getenv("LLAMA_PARSE_API_KEY"),
15
  result_type="markdown", # "markdown" and "text" are available
16
  )
17
 
18
 
19
  def convert_pdf_to_text(filepaths: list[str]) -> Document:
20
+ try:
21
+ file_extractor = {".pdf": parser}
22
+ # use SimpleDirectoryReader to parse our file
23
+
24
+ documents = SimpleDirectoryReader(
25
+ input_files=filepaths, file_extractor=file_extractor
26
+ ).load_data()
27
+
28
+ logger.info("Converted 1 documents")
29
+
30
+ return Document(
31
+ document_id=uuid4(),
32
+ text=" ".join(document.text for document in documents),
33
+ metadata={"filename": filepaths[0].split("/")[-1]},
34
+ )
35
+ except Exception as e:
36
+ logger.error(f"Error converting PDF to text: {e}")
37
+ raise e