Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTempla
|
|
4 |
import os
|
5 |
import nltk
|
6 |
import io
|
|
|
7 |
nltk.download("punkt")
|
8 |
|
9 |
st.title(':blue[Langchain:] A Rag System on “Leave No Context Behind” Paper')
|
@@ -46,6 +47,13 @@ from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
|
46 |
from langchain_community.vectorstores import Chroma
|
47 |
from langchain_core.runnables import RunnablePassthrough
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
|
51 |
uploaded_file = st.file_uploader("Choose a pdf file",type = "pdf")
|
@@ -53,10 +61,11 @@ uploaded_file = st.file_uploader("Choose a pdf file",type = "pdf")
|
|
53 |
if uploaded_file is not None:
|
54 |
|
55 |
pdf_file = io.BytesIO(uploaded_file.read())
|
56 |
-
|
57 |
-
|
|
|
58 |
text_splitter = NLTKTextSplitter(chunk_size = 500,chunk_overlap = 100)
|
59 |
-
chunks = test_splitter.split_documents(
|
60 |
|
61 |
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
|
62 |
|
|
|
4 |
import os
|
5 |
import nltk
|
6 |
import io
|
7 |
+
import fitz
|
8 |
nltk.download("punkt")
|
9 |
|
10 |
st.title(':blue[Langchain:] A Rag System on “Leave No Context Behind” Paper')
|
|
|
47 |
from langchain_community.vectorstores import Chroma
|
48 |
from langchain_core.runnables import RunnablePassthrough
|
49 |
|
50 |
+
def extract_text_from_pdf(pdf_file):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_file: In-memory PDF content handed to ``fitz.open`` as its
            ``stream`` argument (the caller in this file passes an
            ``io.BytesIO`` built from the uploaded file).

    Returns:
        One string holding each page's ``get_text()`` output, in page order.
    """
    # The context manager closes the document even if a page fails to parse;
    # the original never closed it.
    with fitz.open(stream=pdf_file, filetype="pdf") as document:
        # Iterate pages directly and join once instead of growing a string
        # with += on every page.
        return "".join(page.get_text() for page in document)
|
57 |
|
58 |
|
59 |
uploaded_file = st.file_uploader("Choose a pdf file",type = "pdf")
|
|
|
61 |
if uploaded_file is not None:
|
62 |
|
63 |
pdf_file = io.BytesIO(uploaded_file.read())
|
64 |
+
text = extract_text_from_pdf(pdf_file)
|
65 |
+
#pdf_loader = PDFMinerLoader(pdf_file)
|
66 |
+
#dat_nik = pdf_loader.load()
|
67 |
text_splitter = NLTKTextSplitter(chunk_size = 500,chunk_overlap = 100)
|
68 |
+
# Use the splitter bound above as `text_splitter` (`test_splitter` was an
# undefined name). `create_documents` accepts raw strings and returns
# Document chunks, whereas `split_documents` expects Document objects.
chunks = text_splitter.create_documents([text])
|
69 |
|
70 |
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
|
71 |
|