DrishtiSharma committed on
Commit
fd8e822
·
verified ·
1 Parent(s): 264abd1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -54
app.py CHANGED
@@ -74,70 +74,58 @@ def extract_title_manually(text):
74
  return "Unknown"
75
 
76
  # ----------------- Metadata Extraction -----------------
77
- def extract_metadata(pdf_path):
78
- """Extracts Title, Authors, Emails, and Affiliations from the first page of a PDF with improved accuracy."""
79
-
80
  with pdfplumber.open(pdf_path) as pdf:
81
  if not pdf.pages:
82
- return {
83
- "Title": "Unknown",
84
- "Author": "Unknown",
85
- "Emails": "No emails found",
86
- "Affiliations": "No affiliations found"
87
- }
88
 
89
  # Extract text from the first page
90
  first_page_text = pdf.pages[0].extract_text()
91
  if not first_page_text:
92
- return {
93
- "Title": "Unknown",
94
- "Author": "Unknown",
95
- "Emails": "No emails found",
96
- "Affiliations": "No affiliations found"
97
- }
98
 
99
  cleaned_text = first_page_text.strip()
100
- lines = cleaned_text.split("\n")
101
-
102
- # ---- Extract Title ----
103
- title = "Unknown"
104
- for line in lines[:5]: # Only check the first few lines
105
- clean_line = line.strip()
106
- if 6 < len(clean_line.split()) < 20 and not clean_line.lower().startswith(("abstract", "keywords", "introduction")):
107
- title = clean_line
108
- break
109
-
110
- # ---- Extract Authors ----
111
- author_candidates = []
112
- name_pattern = re.compile(r"\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)+\b") # Firstname Lastname format
113
- for line in lines:
114
- if "@" in line: # Stop when reaching emails
115
- break
116
- matches = name_pattern.findall(line)
117
- if matches and len(matches) < 5: # Avoid false positives
118
- author_candidates.extend(matches)
119
-
120
- authors = ", ".join(author_candidates) if author_candidates else "Unknown"
121
-
122
- # ---- Extract Emails ----
123
- email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
124
- emails = ", ".join(email_pattern.findall(cleaned_text)) or "No emails found"
125
-
126
- # ---- Extract Affiliations ----
127
- affiliations = "Unknown"
128
- for i, line in enumerate(lines):
129
- if "@" in line: # Look for affiliations after email section
130
- if i + 1 < len(lines):
131
- affiliations = lines[i + 1].strip()
132
- break
133
-
134
- return {
135
- "Title": title,
136
- "Author": authors,
137
- "Emails": emails,
138
- "Affiliations": affiliations
139
  }
140
 
 
 
141
  # ----------------- Step 1: Choose PDF Source -----------------
142
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
143
 
 
74
  return "Unknown"
75
 
76
  # ----------------- Metadata Extraction -----------------
77
+ def extract_metadata_llm(pdf_path):
78
+ """Extracts metadata using LLM for better accuracy."""
79
+
80
  with pdfplumber.open(pdf_path) as pdf:
81
  if not pdf.pages:
82
+ return {"Title": "Unknown", "Author": "Unknown", "Emails": "No emails found", "Affiliations": "No affiliations found"}
 
 
 
 
 
83
 
84
  # Extract text from the first page
85
  first_page_text = pdf.pages[0].extract_text()
86
  if not first_page_text:
87
+ return {"Title": "Unknown", "Author": "Unknown", "Emails": "No emails found", "Affiliations": "No affiliations found"}
 
 
 
 
 
88
 
89
  cleaned_text = first_page_text.strip()
90
+
91
+ # Define a structured prompt for the LLM
92
+ metadata_prompt = PromptTemplate(
93
+ input_variables=["text"],
94
+ template="""
95
+ Extract the following metadata from the research paper's first page:
96
+ - Title
97
+ - Authors (comma-separated)
98
+ - Emails (comma-separated)
99
+ - Affiliations
100
+
101
+ Ensure the output is in **valid JSON format** with keys: "Title", "Author", "Emails", "Affiliations".
102
+
103
+ Here is the text:
104
+ {text}
105
+
106
+ Provide the JSON output only, no extra text.
107
+ """
108
+ )
109
+
110
+ # Run the LLM Metadata Extraction
111
+ metadata_chain = LLMChain(llm=llm, prompt=metadata_prompt, output_key="metadata")
112
+
113
+ try:
114
+ metadata_response = metadata_chain.invoke({"text": cleaned_text})
115
+
116
+ # Convert the LLM response into a dictionary
117
+ metadata_dict = json.loads(metadata_response["metadata"])
118
+
119
+ except Exception as e:
120
+ metadata_dict = {
121
+ "Title": "Unknown",
122
+ "Author": "Unknown",
123
+ "Emails": "No emails found",
124
+ "Affiliations": "No affiliations found"
 
 
 
 
125
  }
126
 
127
+ return metadata_dict
128
+
129
  # ----------------- Step 1: Choose PDF Source -----------------
130
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
131