DrishtiSharma committed on
Commit
aeca549
·
verified ·
1 Parent(s): dba1813

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -36
app.py CHANGED
@@ -49,43 +49,32 @@ if "vector_store" not in st.session_state:
49
  st.session_state.vector_store = None
50
 
51
  # ----------------- Improved Metadata Extraction -----------------
52
- def extract_metadata(pdf_path):
53
- """Extracts title, author, emails, and affiliations from the first page if metadata is missing."""
54
  with pdfplumber.open(pdf_path) as pdf:
55
- metadata = pdf.metadata or {}
56
- first_page_text = pdf.pages[0].extract_text() if pdf.pages else ""
57
-
58
- # Normalize and split first-page text into lines
59
- lines = first_page_text.split("\n") if first_page_text else []
60
-
61
- # Extract title (Try multiple strategies)
62
- title = metadata.get("Title", "").strip()
63
- if not title:
64
- for line in lines[:5]: # Check the first few lines
65
- if len(line.strip()) > 5 and not line.isdigit(): # Avoid numbers (page numbers)
66
- title = line.strip()
67
- break
68
- title = title if title else "Untitled Document"
69
-
70
- # Extract author
71
- author = metadata.get("Author", "").strip()
72
- if not author:
73
- for line in lines:
74
- author_match = re.search(r"(?i)(?:by|authors?)[:\-]?\s*([\w\s,]+)", line)
75
- if author_match:
76
- author = author_match.group(1).strip()
77
- break
78
- author = author if author else "Unknown Author"
79
-
80
- # Extract emails
81
- emails = set(re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", first_page_text))
82
- email_str = ", ".join(emails) if emails else "No emails found"
83
-
84
- # Extract affiliations
85
- affiliations = set(re.findall(r"(?:Department|Faculty|Institute|University|College|School)\s+[\w\s]+", first_page_text))
86
- affiliation_str = ", ".join(affiliations) if affiliations else "No affiliations found"
87
-
88
- return title, author, email_str, affiliation_str
89
 
90
  # ----------------- Step 1: Choose PDF Source -----------------
91
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
 
49
  st.session_state.vector_store = None
50
 
51
  # ----------------- Improved Metadata Extraction -----------------
52
+ def extract_metadata_llm(pdf_path):
53
+ """Extracts metadata using LLM instead of regex."""
54
  with pdfplumber.open(pdf_path) as pdf:
55
+ first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
56
+
57
+ # LLM prompt for extracting metadata
58
+ metadata_prompt = PromptTemplate(
59
+ input_variables=["text"],
60
+ template="""
61
+ Given the following first page of a research paper, extract:
62
+ - The title of the paper
63
+ - The authors' names
64
+ - Any email addresses present
65
+ - The affiliations of the authors
66
+
67
+ Ensure accurate extraction.
68
+
69
+ First page content:
70
+ {text}
71
+ """
72
+ )
73
+
74
+ metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
75
+ metadata_response = metadata_chain.invoke({"text": first_page_text})
76
+
77
+ return metadata_response["metadata"]
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # ----------------- Step 1: Choose PDF Source -----------------
80
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)