dammy committed on
Commit
b12ea1f
·
1 Parent(s): f05dba6

update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -7
app.py CHANGED
@@ -1,13 +1,19 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
 
 
3
 
4
  def extract_text(pdf_file):
5
- doc = fitz.open(pdf_file.name)
6
- text = ""
7
- for page_num in range(doc.page_count):
8
- page = doc[page_num]
9
- text += page.get_text()
10
- return text
 
 
 
 
11
 
12
  iface = gr.Interface(
13
  fn=extract_text,
 
1
  import gradio as gr
2
+ from langchain.document_loaders import PDFMinerLoader, PyMuPDFLoader
3
+ from langchain.text_splitter import CharacterTextSplitter
4
+
5
 
6
  def extract_text(pdf_file):
7
+ # Load a document
8
+ loader = PDFMinerLoader("cereal.pdf")
9
+ doc = loader.load()
10
+
11
+ text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
12
+ texts = text_splitter.split_documents(doc)
13
+
14
+ texts = [i.page_content for i in texts]
15
+
16
+ return texts[0]
17
 
18
  iface = gr.Interface(
19
  fn=extract_text,