Commit: with app and other files

Files changed:
- .gitattributes +0 -0
- .github/workflows/sync_to_huggingface_space.yml +1 -1
- .gitignore +1 -1
- .python-version +1 -0
- README.md +37 -2
- app.py +194 -0
- htmlTemplates.py +43 -0
- vector_loader.py +265 -0
.gitattributes
ADDED (file added with no content changes)
.github/workflows/sync_to_huggingface_space.yml
CHANGED
@@ -17,4 +17,4 @@ jobs:
       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_SPACES_TOKEN }}
-        run: git push
+        run: git push https://ravi259:[email protected]/spaces/ravi259/baserag_hf main
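The fix points the push at the Space's own git remote, authenticating inline with the HF_TOKEN secret (the https://user:TOKEN@host/... form); a bare git push only targets the checked-out GitHub remote, so the Space never received updates. This matches the pattern used in Hugging Face's own GitHub-Actions sync guide.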
.gitignore
CHANGED
@@ -1,4 +1,4 @@
 data/
 vectorstore/
 .env
-
+*.ipynb
.python-version
ADDED
@@ -0,0 +1 @@
+3.10
README.md
CHANGED
@@ -1,2 +1,37 @@
-#
-
+# LLM Evaluation using Ragas and LangChain
+
+Ragas is a framework that helps you evaluate enterprise Retrieval-Augmented Generation (RAG) pipelines.
+Ragas is easy to adopt because evaluating a RAG pipeline requires no additional data: the questions, answers, and retrieved context already produced by the pipeline are what Ragas scores.
+
+Ragas provides the metrics listed below (see https://docs.ragas.io/en/latest/concepts/metrics/index.html):
+
+* Faithfulness
+* Answer relevancy
+* Context recall
+* Context precision
+* Context relevancy
+* Context entity recall
+
+We use the LangChain framework to implement the RAG pipeline, along with the functions/chains provided by LangChain.
+
+## Purpose
+
+Evaluation of a RAG approach using LangChain and OpenAI.
+
+## Features
+
+## Usage
+
+## Sample Output
+
+## Future Enhancements
+
+## Contributing
+
+Contributions are welcome! If you have any ideas, suggestions, or bug fixes, please submit a pull request or open an issue in the GitHub repository.
+
+## License
+
+This project is licensed under the MIT License.
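For a picture of how the metrics above are actually invoked, here is a minimal, illustrative Ragas evaluation sketch (not part of this commit). It uses the same four metrics that app.py below imports; the sample rows are made up, and the expected column names ("contexts", "ground_truth") vary between Ragas versions.

# Illustrative only: a minimal Ragas evaluation over one hand-written sample.
# Column names such as "contexts"/"ground_truth" differ across Ragas versions.
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

eval_data = Dataset.from_dict({
    "question": ["What is the speed limit in a New Jersey school zone?"],
    "answer": ["25 mph unless otherwise posted."],
    "contexts": [["The speed limit in school zones is 25 mph."]],
    "ground_truth": ["25 mph"],
})

# Each metric scores the question/answer/context triples; the result is a
# mapping of metric name to score.
scores = evaluate(
    eval_data,
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision],
)
print(scores)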
app.py
ADDED
@@ -0,0 +1,194 @@
import os

import streamlit as st  # Web app framework
from PyPDF2 import PdfReader  # To read the PDF

from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain

# Ragas evaluation imports
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

from dotenv import load_dotenv
from htmlTemplates import bot_template, user_template, css

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

DB_FAISS_PATH = "../Ragas-LangChain-Evaluation/vectorstore/db_faiss/"


def load_knowledge_base():
    """Load the persisted FAISS index with OpenAI embeddings."""
    embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
    db = FAISS.load_local(
        DB_FAISS_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
        index_name="njmvc_Index",
    )
    return db


def load_prompt():
    """Build the chat prompt that turns retrieved context into a quiz question."""
    template = """You are helping students pass the NJMVC Knowledge Test. Provide a single multiple-choice question with 4 options to choose from.
    Use the context to provide the question and answer choices.
    context = {context}
    question = {question}
    If the answer is not in the PDF, answer "I do not know what you are asking about."
    """
    return ChatPromptTemplate.from_template(template)


def load_llm():
    """Load the OpenAI chat model."""
    return ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, api_key=OPENAI_API_KEY)


knowledge_base = load_knowledge_base()
prompt = load_prompt()
llm = load_llm()


def get_conversation_chain(vectorstore, llm):
    """Build a conversation chain with buffer memory (not called in this revision)."""
    return ConversationChain(
        llm=llm,
        verbose=True,
        memory=ConversationBufferMemory(),
    )


def format_docs(docs):
    """Join retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def get_pdf_text(pdf_files):
    """Concatenate the text of every page of the uploaded PDFs."""
    text = ""
    for pdf_file in pdf_files:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            text += page.extract_text()
    return text


def get_chunk_text(text):
    """Split raw text into overlapping chunks for embedding."""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(text)


def handle_user_input(question):
    """Render the running chat history for a new question (not called in this revision)."""
    response = st.session_state.conversation({'question': question})
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write(user_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        else:
            st.write(bot_template.replace("{{MSG}}", message.content), unsafe_allow_html=True)


def main():
    st.set_page_config(page_title='NJMVC Knowledge Test with RAGAS', page_icon=':cars:')
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header('NJMVC Knowledge Test with RAGAS :cars:')
    question = st.text_input("Input the topic you want to test your knowledge on: ")

    if question:
        with st.spinner("Get ready..."):
            # Load the persisted index and pull the documents most related to the test
            db = FAISS.load_local(
                folder_path=DB_FAISS_PATH,
                embeddings=OpenAIEmbeddings(api_key=OPENAI_API_KEY),
                allow_dangerous_deserialization=True,
                index_name="njmvc_Index",
            )
            search_docs = db.similarity_search("what is the NJMVC driving test")

            # Build a small in-memory index over just the retrieved documents
            similar_embeddings = FAISS.from_documents(
                documents=search_docs,
                embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY),
            )

            # Chain the retriever, prompt, LLM, and output parser together
            retriever = similar_embeddings.as_retriever()
            rag_chain = (
                {"context": retriever | format_docs, "question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )

            response = rag_chain.invoke(question)
            st.write(response)


if __name__ == '__main__':
    main()
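Assuming the FAISS index has already been built by vector_loader.py and OPENAI_API_KEY is present in .env, the app starts locally with Streamlit's CLI:

streamlit run app.py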
htmlTemplates.py
ADDED
@@ -0,0 +1,43 @@
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
    width: 20%;
}
.chat-message .avatar img {
    max-width: 78px;
    max-height: 78px;
    border-radius: 50%;
    object-fit: cover;
}
.chat-message .message {
    width: 80%;
    padding: 0 1.5rem;
    color: #fff;
}
</style>
'''

bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''
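These templates are consumed by app.py: the css string is injected once with st.write(css, unsafe_allow_html=True), and each message is rendered by substituting the {{MSG}} placeholder via str.replace, as in handle_user_input above.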
vector_loader.py
ADDED
@@ -0,0 +1,265 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

import PyPDF2  # To read the PDF
import pdfplumber  # To extract text from tables in the PDF
from PIL import Image
import pytesseract  # To perform OCR on extracted images
from pdf2image import convert_from_path  # To extract images from the PDF

# To analyze the PDF layout and extract text
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar, LTFigure

import os
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')


def extract_table(pdf_path, page_num, table_num):
    """Extract one table from one page of the PDF."""
    pdf = pdfplumber.open(pdf_path)
    table_page = pdf.pages[page_num]
    table = table_page.extract_tables()[table_num]
    return table


def table_converter(table):
    """Convert an extracted table into a pipe-delimited string."""
    table_string = ''
    for row in table:
        # Replace line breaks inside wrapped cells; stringify None cells
        cleaned_row = [
            item.replace('\n', ' ') if item is not None and '\n' in item
            else 'None' if item is None
            else item
            for item in row
        ]
        table_string += '|' + '|'.join(cleaned_row) + '|' + '\n'
    # Remove the trailing line break
    return table_string[:-1]


def is_element_inside_any_table(element, page, tables):
    """Check whether a layout element falls inside any table on the page."""
    x0, y0up, x1, y1up = element.bbox
    # Flip the y coordinates because pdfminer counts from the bottom of the page
    y0 = page.bbox[3] - y1up
    y1 = page.bbox[3] - y0up
    for table in tables:
        tx0, ty0, tx1, ty1 = table.bbox
        if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
            return True
    return False


def find_table_for_element(element, page, tables):
    """Return the index of the table that contains the element, or None."""
    x0, y0up, x1, y1up = element.bbox
    y0 = page.bbox[3] - y1up
    y1 = page.bbox[3] - y0up
    for i, table in enumerate(tables):
        tx0, ty0, tx1, ty1 = table.bbox
        if tx0 <= x0 <= x1 <= tx1 and ty0 <= y0 <= y1 <= ty1:
            return i
    return None


def text_extraction(element):
    """Extract the text of a text-line element together with its formats."""
    line_text = element.get_text()
    # Collect one format entry per character (font name/size collection is disabled)
    line_formats = []
    for text_line in element:
        if isinstance(text_line, LTTextContainer):
            for character in text_line:
                if isinstance(character, LTChar):
                    # line_formats.append(character.fontname)
                    # line_formats.append(character.size)
                    line_formats.append("")
    # Keep only the unique formats seen on the line
    format_per_line = list(set(line_formats))
    return (line_text, format_per_line)


def crop_image(element, pageObj):
    """Crop an image element out of the page and save it as a one-page PDF."""
    [image_left, image_top, image_right, image_bottom] = [element.x0, element.y0, element.x1, element.y1]
    # Crop the page using coordinates (left, bottom, right, top)
    pageObj.mediabox.lower_left = (image_left, image_bottom)
    pageObj.mediabox.upper_right = (image_right, image_top)
    cropped_pdf_writer = PyPDF2.PdfWriter()
    cropped_pdf_writer.add_page(pageObj)
    with open('cropped_image.pdf', 'wb') as cropped_pdf_file:
        cropped_pdf_writer.write(cropped_pdf_file)


def convert_to_images(input_file):
    """Render the first page of a PDF to a PNG file."""
    images = convert_from_path(input_file)
    image = images[0]
    image.save('PDF_image.png', 'PNG')


def image_to_text(image_path):
    """Run OCR over an image and return the extracted text."""
    img = Image.open(image_path)
    return pytesseract.image_to_string(img)


def read_file_get_prompts(file_name):
    """Walk every page of the PDF and collect text, table, and image content."""
    if file_name is None:
        return ""

    pdf_path = file_name
    pdf_reader = PyPDF2.PdfReader(file_name)

    # Content collected per page
    text_per_page = {}
    # Boolean flag for image detection
    image_flag = False

    number_of_pages = len(list(extract_pages(file_name)))

    for pagenum, page in enumerate(extract_pages(file_name)):
        # Initialize the variables needed for the text extraction from the page
        pageObj = pdf_reader.pages[pagenum]
        page_text = []
        line_format = []
        text_from_images = []
        text_from_tables = []
        page_content = []
        # Index of the table currently being consumed (-1 means no tables)
        table_in_page = -1

        # Find the tables on the examined page
        pdf = pdfplumber.open(pdf_path)
        page_tables = pdf.pages[pagenum]
        tables = page_tables.find_tables()
        if len(tables) != 0:
            table_in_page = 0

        # Extract every table on the page as a structured string
        for table_num in range(len(tables)):
            table = extract_table(pdf_path, pagenum, table_num)
            table_string = table_converter(table)
            text_from_tables.append(table_string)

        # Sort the layout elements top-to-bottom as they appear on the page
        page_elements = [(element.y1, element) for element in page._objs]
        page_elements.sort(key=lambda a: a[0], reverse=True)

        for i, component in enumerate(page_elements):
            element = component[1]

            # If the element belongs to a table, emit the table string once
            if table_in_page != -1:
                if is_element_inside_any_table(element, page, tables):
                    table_found = find_table_for_element(element, page, tables)
                    if table_found == table_in_page and table_found is not None:
                        page_content.append(text_from_tables[table_in_page])
                        table_in_page += 1
                    # Skip this element: its content was already captured with the table
                    continue

            if not is_element_inside_any_table(element, page, tables):
                # Plain text element
                if isinstance(element, LTTextContainer):
                    (line_text, format_per_line) = text_extraction(element)
                    page_text.append(line_text)
                    line_format.append(format_per_line)
                    page_content.append(line_text)

                # Image element: crop it, render it, and (optionally) OCR it
                if isinstance(element, LTFigure):
                    crop_image(element, pageObj)
                    convert_to_images('cropped_image.pdf')
                    # OCR is disabled for now because it raised errors with some images
                    # image_text = image_to_text('PDF_image.png')
                    image_text = ""
                    text_from_images.append(image_text)
                    page_content.append(image_text)
                    image_flag = True

        # Store the collected content under a per-page key
        dctkey = 'Page_' + str(pagenum)
        print(dctkey)
        text_per_page[dctkey] = [page_text, text_from_images, text_from_tables, page_content]

    # Flatten everything that was collected into a single string
    result = " "
    for t in range(number_of_pages):
        page_key = 'Page_' + str(t)
        for q in range(len(text_per_page[page_key])):
            result = result + f"{''.join(map(str, text_per_page[page_key][q]))}"

    return result


def save_to_vector_store(text):
    """Chunk the text, embed it, and persist a FAISS index locally."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    # create_documents expects a list of texts, not a bare string
    docs = text_splitter.create_documents([text])
    vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings(api_key=OPENAI_API_KEY))
    vectorstore.save_local(DB_FAISS_PATH, index_name="njmvc_Index")


if __name__ == "__main__":
    DB_FAISS_PATH = 'vectorstore/db_faiss'
    file_name = "./data/drivermanual-2-very-small.pdf"
    text = read_file_get_prompts(file_name)
    save_to_vector_store(text)
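To build the index, run the loader directly. Note that pdf2image depends on the Poppler utilities and pytesseract on the Tesseract binary, both of which must be installed as system packages rather than via pip:

python vector_loader.py

This writes the FAISS index to vectorstore/db_faiss under the index name njmvc_Index; app.py reloads it with FAISS.load_local, though its hard-coded path resolves relative to the parent directory, so the two paths must be kept in sync.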