DrishtiSharma committed on
Commit
8b24191
·
verified ·
1 Parent(s): 8ec9856

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -8
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import streamlit as st
2
  import os
 
3
  import requests
4
  import pdfplumber
5
  import chromadb
@@ -48,7 +49,7 @@ if "processed_chunks" not in st.session_state:
48
  if "vector_store" not in st.session_state:
49
  st.session_state.vector_store = None
50
 
51
- # ----------------- Improved Metadata Extraction -----------------
52
  def extract_metadata_llm(pdf_path):
53
  """Extracts metadata using LLM instead of regex."""
54
  with pdfplumber.open(pdf_path) as pdf:
@@ -58,12 +59,13 @@ def extract_metadata_llm(pdf_path):
58
  metadata_prompt = PromptTemplate(
59
  input_variables=["text"],
60
  template="""
61
- Given the following first page of a research paper, extract:
62
- - The title of the paper
63
- - The authors' names
64
- - Any email addresses present
65
- - The affiliations of the authors
66
-
 
67
  Ensure accurate extraction.
68
 
69
  First page content:
@@ -74,7 +76,19 @@ def extract_metadata_llm(pdf_path):
74
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
75
  metadata_response = metadata_chain.invoke({"text": first_page_text})
76
 
77
- return metadata_response["metadata"]
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  # ----------------- Step 1: Choose PDF Source -----------------
80
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
 
1
  import streamlit as st
2
  import os
3
+ import json
4
  import requests
5
  import pdfplumber
6
  import chromadb
 
49
  if "vector_store" not in st.session_state:
50
  st.session_state.vector_store = None
51
 
52
+ # ----------------- Metadata Extraction -----------------
53
  def extract_metadata_llm(pdf_path):
54
  """Extracts metadata using LLM instead of regex."""
55
  with pdfplumber.open(pdf_path) as pdf:
 
59
  metadata_prompt = PromptTemplate(
60
  input_variables=["text"],
61
  template="""
62
+ Given the following first page of a research paper, extract metadata in JSON format with these fields:
63
+ {
64
+ "Title": "Paper Title",
65
+ "Author": "Author Name(s)",
66
+ "Emails": "List of Emails",
67
+ "Affiliations": "Author Affiliation(s)"
68
+ }
69
  Ensure accurate extraction.
70
 
71
  First page content:
 
76
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
77
  metadata_response = metadata_chain.invoke({"text": first_page_text})
78
 
79
+ try:
80
+ # Ensure response is a valid JSON string and convert it to a dictionary
81
+ metadata_dict = json.loads(metadata_response["metadata"])
82
+ except json.JSONDecodeError:
83
+ metadata_dict = {
84
+ "Title": "Unknown",
85
+ "Author": "Unknown",
86
+ "Emails": "No emails found",
87
+ "Affiliations": "No affiliations found"
88
+ }
89
+
90
+ return metadata_dict
91
+
92
 
93
  # ----------------- Step 1: Choose PDF Source -----------------
94
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)