DrishtiSharma committed on
Commit
c0bbae3
·
verified ·
1 Parent(s): b9040c0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -16
app.py CHANGED
@@ -48,34 +48,68 @@ if "vector_store" not in st.session_state:
48
  st.session_state.vector_store = None
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # ----------------- Metadata Extraction -----------------
52
  def extract_metadata_llm(pdf_path):
53
- """Extracts metadata using LLM instead of regex and logs progress in Streamlit UI."""
54
 
55
  with pdfplumber.open(pdf_path) as pdf:
56
  first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
57
 
 
 
 
 
 
 
58
  # Streamlit Debugging: Show extracted text
59
- st.subheader("πŸ“„ Extracted First Page Text for Metadata")
60
- st.text_area("First Page Text:", first_page_text, height=200)
61
 
62
  # Define metadata prompt
63
  metadata_prompt = PromptTemplate(
64
- input_variables=["text"],
65
  template="""
66
- Given the following first page of a research paper, extract metadata **strictly in JSON format**.
67
- - If no data is found for a field, return `"Unknown"` instead.
68
- - Ensure the output is valid JSON (do not include markdown syntax).
69
-
 
 
 
70
  Example output:
71
- {
72
  "Title": "Example Paper Title",
73
  "Author": "John Doe, Jane Smith",
74
75
  "Affiliations": "School of AI, University of Example"
76
- }
77
-
78
- Now, extract the metadata from this document:
79
  {text}
80
  """
81
  )
@@ -85,10 +119,10 @@ def extract_metadata_llm(pdf_path):
85
 
86
  # Debugging: Log the LLM input
87
  st.subheader("πŸ” LLM Input for Metadata Extraction")
88
- st.json({"text": first_page_text})
89
 
90
  try:
91
- metadata_response = metadata_chain.invoke({"text": first_page_text})
92
 
93
  # Debugging: Log raw LLM response
94
  st.subheader("πŸ” Raw LLM Response")
@@ -103,7 +137,7 @@ def extract_metadata_llm(pdf_path):
103
  metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
104
  except json.JSONDecodeError:
105
  metadata_dict = {
106
- "Title": "Unknown",
107
  "Author": "Unknown",
108
  "Emails": "No emails found",
109
  "Affiliations": "No affiliations found"
@@ -112,7 +146,7 @@ def extract_metadata_llm(pdf_path):
112
  except Exception as e:
113
  st.error(f"❌ LLM Metadata Extraction Failed: {e}")
114
  metadata_dict = {
115
- "Title": "Unknown",
116
  "Author": "Unknown",
117
  "Emails": "No emails found",
118
  "Affiliations": "No affiliations found"
@@ -128,6 +162,8 @@ def extract_metadata_llm(pdf_path):
128
  st.json(metadata_dict)
129
 
130
  return metadata_dict
 
 
131
 
132
 
133
  # ----------------- Step 1: Choose PDF Source -----------------
 
48
  st.session_state.vector_store = None
49
 
50
 
51
+ # ----------------- Text Cleaning Functions -----------------
52
def clean_extracted_text(text):
    """
    Clean text extracted from a PDF page while preserving its line structure.

    Steps, in order:
      1. Re-join words that were hyphenated across a line break
         ("hyphen-\\n  ated" -> "hyphenated").
      2. Collapse every run of line breaks (and the horizontal whitespace
         around them) into a single newline.
      3. Collapse runs of spaces/tabs inside a line into a single space.

    Args:
        text: Raw extracted text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text with leading/trailing whitespace removed, or ``""``
        when there is nothing to clean.
    """
    if not text:
        return ""

    # De-hyphenate first: once whitespace has been normalised, a break such as
    # "-\n   next" would no longer match and the word would stay split.
    text = re.sub(r'(\w)-[^\S\n]*\n\s*(\w)', r'\1\2', text)

    # Collapse blank lines and trim whitespace on either side of a line break.
    # This must never turn a newline into a space: callers split on "\n" to
    # inspect individual lines.
    text = re.sub(r'[^\S\n]*\n\s*', '\n', text)

    # Collapse remaining runs of non-newline whitespace within a line.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)

    return text.strip()
60
+
61
def extract_title_manually(text):
    """
    Heuristically pick a paper title from the first few lines of text.

    A line qualifies as a title when it has more than 5 words and does not
    contain a section/header keyword ("Abstract", "Introduction", "Keywords",
    "Contents", "Table", "Figure") as a whole word.

    Args:
        text: Newline-separated text. ``None`` or an empty string is accepted.

    Returns:
        The first qualifying line (stripped), or ``"Unknown"`` if none of the
        first five non-empty lines qualifies.
    """
    if not text:
        return "Unknown"

    ignore_keywords = ["abstract", "introduction", "keywords", "contents", "table", "figure"]
    # Match keywords as whole words (with an optional plural "s") so that
    # titles containing e.g. "Stable", "Portable" or "Abstraction" are not
    # rejected by an accidental substring hit.
    ignore_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(word) for word in ignore_keywords) + r")s?\b",
        re.IGNORECASE,
    )

    # Blank lines carry no information and must not use up the 5-line window.
    candidates = [line.strip() for line in text.split("\n") if line.strip()]

    for candidate in candidates[:5]:  # Check only the first 5 non-empty lines
        if len(candidate.split()) > 5 and not ignore_pattern.search(candidate):
            return candidate  # First line that looks like a title
    return "Unknown"
75
+
76
  # ----------------- Metadata Extraction -----------------
77
  def extract_metadata_llm(pdf_path):
78
+ """Extracts metadata using LLM with improved title detection and JSON handling."""
79
 
80
  with pdfplumber.open(pdf_path) as pdf:
81
  first_page_text = pdf.pages[0].extract_text() or "No text found." if pdf.pages else "No text found."
82
 
83
+ # Apply text cleaning
84
+ cleaned_text = clean_extracted_text(first_page_text)
85
+
86
+ # Attempt manual title extraction before LLM
87
+ pre_extracted_title = extract_title_manually(cleaned_text)
88
+
89
  # Streamlit Debugging: Show extracted text
90
+ st.subheader("πŸ“„ Extracted First Page Text (Cleaned)")
91
+ st.text_area("Cleaned Text:", cleaned_text, height=200)
92
 
93
  # Define metadata prompt
94
  metadata_prompt = PromptTemplate(
95
+ input_variables=["text", "pre_title"],
96
  template="""
97
+ Given the first page of a research paper, extract metadata **strictly in JSON format**.
98
+ - The title is typically in the first few lines and is often in a larger font or bold.
99
+ - If a phrase like "Short Paper:" appears, the actual title follows.
100
+ - If no clear title is found, use the pre-extracted title: "{pre_title}".
101
+ - If a field is missing, return `"Unknown"`.
102
+ - Ensure the JSON format is **valid**.
103
+
104
  Example output:
105
+ {{
106
  "Title": "Example Paper Title",
107
  "Author": "John Doe, Jane Smith",
108
109
  "Affiliations": "School of AI, University of Example"
110
+ }}
111
+
112
+ Now, extract metadata from this document:
113
  {text}
114
  """
115
  )
 
119
 
120
  # Debugging: Log the LLM input
121
  st.subheader("πŸ” LLM Input for Metadata Extraction")
122
+ st.json({"text": cleaned_text, "pre_title": pre_extracted_title})
123
 
124
  try:
125
+ metadata_response = metadata_chain.invoke({"text": cleaned_text, "pre_title": pre_extracted_title})
126
 
127
  # Debugging: Log raw LLM response
128
  st.subheader("πŸ” Raw LLM Response")
 
137
  metadata_dict = json.loads(metadata_response["metadata"].strip("```json\n").strip("\n```"))
138
  except json.JSONDecodeError:
139
  metadata_dict = {
140
+ "Title": pre_extracted_title, # Use pre-extracted title as fallback
141
  "Author": "Unknown",
142
  "Emails": "No emails found",
143
  "Affiliations": "No affiliations found"
 
146
  except Exception as e:
147
  st.error(f"❌ LLM Metadata Extraction Failed: {e}")
148
  metadata_dict = {
149
+ "Title": pre_extracted_title, # Use pre-extracted title
150
  "Author": "Unknown",
151
  "Emails": "No emails found",
152
  "Affiliations": "No affiliations found"
 
162
  st.json(metadata_dict)
163
 
164
  return metadata_dict
165
+
166
+
167
 
168
 
169
  # ----------------- Step 1: Choose PDF Source -----------------