DrishtiSharma committed on
Commit 33dd4ca · verified · 1 Parent(s): 97baa24

Update app.py

Files changed (1)
  1. app.py +0 -97
app.py CHANGED
@@ -47,85 +47,6 @@ if "processed_chunks" not in st.session_state:
47
  if "vector_store" not in st.session_state:
48
  st.session_state.vector_store = None
49
 
50
-
51
- # ----------------- Text Cleaning Functions -----------------
52
- def clean_extracted_text(text):
53
- """
54
- Cleans extracted PDF text by removing excessive line breaks, fixing spacing issues, and resolving OCR artifacts.
55
- """
56
- text = re.sub(r'\n+', '\n', text) # Remove excessive newlines
57
- text = re.sub(r'\s{2,}', ' ', text) # Remove extra spaces
58
- text = re.sub(r'(\w)-\n(\w)', r'\1\2', text) # Fix hyphenated words split by a newline
59
- return text.strip()
60
-
61
- def extract_title_manually(text):
62
- """
63
- Attempts to find the title by checking the first few lines.
64
- - Titles are usually long enough (more than 5 words).
65
- - Ignores common header text like "Abstract", "Introduction".
66
- """
67
- lines = text.split("\n")
68
- ignore_keywords = ["abstract", "introduction", "keywords", "contents", "table", "figure"]
69
-
70
- for line in lines[:5]: # Check only the first 5 lines
71
- clean_line = line.strip()
72
- if len(clean_line.split()) > 5 and not any(word.lower() in clean_line.lower() for word in ignore_keywords):
73
- return clean_line # Return first valid title
74
- return "Unknown"
75
-
76
- # ----------------- Metadata Extraction -----------------
77
- def extract_metadata_llm(pdf_path):
78
- """Extracts metadata using LLM for better accuracy."""
79
-
80
- with pdfplumber.open(pdf_path) as pdf:
81
- if not pdf.pages:
82
- return {"Title": "Unknown", "Author": "Unknown", "Emails": "No emails found", "Affiliations": "No affiliations found"}
83
-
84
- # Extract text from the first page
85
- first_page_text = pdf.pages[0].extract_text()
86
- if not first_page_text:
87
- return {"Title": "Unknown", "Author": "Unknown", "Emails": "No emails found", "Affiliations": "No affiliations found"}
88
-
89
- cleaned_text = first_page_text.strip()
90
-
91
- # Define a structured prompt for the LLM
92
- metadata_prompt = PromptTemplate(
93
- input_variables=["text"],
94
- template="""
95
- Extract the following metadata from the research paper's first page:
96
- - Title
97
- - Authors (comma-separated)
98
- - Emails (comma-separated)
99
- - Affiliations
100
-
101
- Ensure the output is in **valid JSON format** with keys: "Title", "Author", "Emails", "Affiliations".
102
-
103
- Here is the text:
104
- {text}
105
-
106
- Provide the JSON output only, no extra text.
107
- """
108
- )
109
-
110
- # Run the LLM Metadata Extraction
111
- metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
112
-
113
- try:
114
- metadata_response = metadata_chain.invoke({"text": cleaned_text})
115
-
116
- # Convert the LLM response into a dictionary
117
- metadata_dict = json.loads(metadata_response["metadata"])
118
-
119
- except Exception as e:
120
- metadata_dict = {
121
- "Title": "Unknown",
122
- "Author": "Unknown",
123
- "Emails": "No emails found",
124
- "Affiliations": "No affiliations found"
125
- }
126
-
127
- return metadata_dict
128
-
129
  # ----------------- Step 1: Choose PDF Source -----------------
130
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
131
 
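For reference, the deleted `clean_extracted_text` helper is self-contained and easy to exercise in isolation. A minimal sketch, with the regexes copied from the removed lines and a hypothetical OCR-style input:

```python
# Sketch of the text cleaner deleted in this commit; the sample input is hypothetical.
import re

def clean_extracted_text(text):
    text = re.sub(r'\n+', '\n', text)             # collapse runs of newlines
    text = re.sub(r'\s{2,}', ' ', text)           # squeeze repeated whitespace
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)  # rejoin words hyphenated across a line break
    return text.strip()

raw = "Seman-\ntic  Chunking for\n\n\nRetrieval"
print(clean_extracted_text(raw))
# Semantic Chunking for
# Retrieval
```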
@@ -164,34 +85,16 @@ if not st.session_state.pdf_loaded and "pdf_path" in st.session_state:
     with st.spinner("🔄 Processing document... Please wait."):
         loader = PDFPlumberLoader(st.session_state.pdf_path)
         docs = loader.load()
-        st.json(docs[0].metadata)
-
-        # Extract metadata
-        metadata = extract_metadata_llm(st.session_state.pdf_path)
-
-        # Display extracted metadata
-        if isinstance(metadata, dict):
-            st.subheader("📄 Extracted Document Metadata")
-            st.write(f"**Title:** {metadata.get('Title', 'Unknown')}")
-            st.write(f"**Author:** {metadata.get('Author', 'Unknown')}")
-            st.write(f"**Emails:** {metadata.get('Emails', 'No emails found')}")
-            st.write(f"**Affiliations:** {metadata.get('Affiliations', 'No affiliations found')}")
-        else:
-            st.error("Metadata extraction failed.")
 
         # Embedding Model
         model_name = "nomic-ai/modernbert-embed-base"
         embedding_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={"device": "cpu"}, encode_kwargs={'normalize_embeddings': False})
 
-        # Convert metadata into a retrievable chunk
-        metadata_doc = {"page_content": metadata, "metadata": {"source": "metadata"}}
-
 
         # Prevent unnecessary re-chunking
         if not st.session_state.chunked:
             text_splitter = SemanticChunker(embedding_model)
             document_chunks = text_splitter.split_documents(docs)
-            document_chunks.insert(0, metadata_doc)  # Insert metadata as a retrievable document
             st.session_state.processed_chunks = document_chunks
             st.session_state.chunked = True
 
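For completeness, the embedding and semantic-chunking step retained by this commit can be run standalone. A minimal sketch, under the assumption that the app's imports come from `langchain_huggingface` and `langchain_experimental` (the import lines are not shown in this diff):

```python
# Standalone sketch of the retained embedding + semantic-chunking step.
# Import paths are assumptions; the diff does not show the app's imports.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker

embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/modernbert-embed-base",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": False},
)
splitter = SemanticChunker(embedding_model)
chunks = splitter.create_documents(["...full document text here..."])
print(len(chunks))  # number of semantically coherent chunks produced
```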