DrishtiSharma committed on
Commit
f8ed37f
·
verified ·
1 Parent(s): 97424be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -20
app.py CHANGED
@@ -54,30 +54,16 @@ def extract_metadata_llm(pdf_path):
54
  with pdfplumber.open(pdf_path) as pdf:
55
  first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
56
 
57
- # LLM prompt for extracting metadata
58
- metadata_prompt = PromptTemplate(
59
- input_variables=["text"],
60
- template="""
61
- Given the following first page of a research paper, extract metadata in JSON format with these fields:
62
- {
63
- "Title": "Paper Title",
64
- "Author": "Author Name(s)",
65
- "Emails": "List of Emails",
66
- "Affiliations": "Author Affiliation(s)"
67
- }
68
- Ensure accurate extraction.
69
-
70
- First page content:
71
- {text}
72
- """
73
- )
74
-
75
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
76
  metadata_response = metadata_chain.invoke({"text": first_page_text})
77
 
 
 
 
 
78
  try:
79
- # Ensure response is a valid JSON string and convert it to a dictionary
80
- metadata_dict = json.loads(metadata_response["metadata"])
81
  except json.JSONDecodeError:
82
  metadata_dict = {
83
  "Title": "Unknown",
@@ -89,6 +75,7 @@ def extract_metadata_llm(pdf_path):
89
  return metadata_dict
90
 
91
 
 
92
  # ----------------- Step 1: Choose PDF Source -----------------
93
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
94
 
 
54
  with pdfplumber.open(pdf_path) as pdf:
55
  first_page_text = pdf.pages[0].extract_text() if pdf.pages else "No text found."
56
 
57
+ # Run LLM Metadata Extraction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  metadata_chain = LLMChain(llm=llm_judge, prompt=metadata_prompt, output_key="metadata")
59
  metadata_response = metadata_chain.invoke({"text": first_page_text})
60
 
61
+ # Ensure LLM Output is a Proper JSON String
62
+ json_match = re.search(r"```json\n(.*?)\n```", metadata_response["metadata"], re.DOTALL)
63
+ json_text = json_match.group(1) if json_match else metadata_response["metadata"]
64
+
65
  try:
66
+ metadata_dict = json.loads(json_text)
 
67
  except json.JSONDecodeError:
68
  metadata_dict = {
69
  "Title": "Unknown",
 
75
  return metadata_dict
76
 
77
 
78
+
79
  # ----------------- Step 1: Choose PDF Source -----------------
80
  pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)
81